001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.namenode; 019 020import static org.apache.hadoop.util.Time.now; 021 022import java.io.DataInput; 023import java.io.DataInputStream; 024import java.io.DataOutputStream; 025import java.io.File; 026import java.io.FileInputStream; 027import java.io.FileNotFoundException; 028import java.io.FileOutputStream; 029import java.io.IOException; 030import java.security.DigestInputStream; 031import java.security.DigestOutputStream; 032import java.security.MessageDigest; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.Collection; 036import java.util.HashMap; 037import java.util.List; 038import java.util.Map; 039import java.util.TreeMap; 040 041import org.apache.commons.logging.Log; 042import org.apache.hadoop.classification.InterfaceAudience; 043import org.apache.hadoop.classification.InterfaceStability; 044import org.apache.hadoop.conf.Configuration; 045import org.apache.hadoop.fs.FileSystem; 046import org.apache.hadoop.fs.Path; 047import org.apache.hadoop.fs.PathIsNotDirectoryException; 048import org.apache.hadoop.fs.UnresolvedLinkException; 049import 
org.apache.hadoop.fs.permission.PermissionStatus; 050import org.apache.hadoop.hdfs.DFSUtil; 051import org.apache.hadoop.hdfs.protocol.HdfsConstants; 052import org.apache.hadoop.hdfs.protocol.LayoutFlags; 053import org.apache.hadoop.hdfs.protocol.LayoutVersion; 054import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature; 055import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; 056import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; 057import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; 058import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; 059import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; 060import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature; 061import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList; 062import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable; 063import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot; 064import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat; 065import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap; 066import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase; 067import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress; 068import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter; 069import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step; 070import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType; 071import org.apache.hadoop.hdfs.util.ReadOnlyList; 072import org.apache.hadoop.io.IOUtils; 073import org.apache.hadoop.io.MD5Hash; 074import org.apache.hadoop.io.Text; 075import org.apache.hadoop.util.StringUtils; 076 077import com.google.common.base.Preconditions; 078import com.google.common.annotations.VisibleForTesting; 079 080/** 081 * Contains inner classes for reading or writing the on-disk format for 
082 * FSImages. 083 * 084 * In particular, the format of the FSImage looks like: 085 * <pre> 086 * FSImage { 087 * layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long, 088 * namesystemGenerationStampV1: long, namesystemGenerationStampV2: long, 089 * generationStampAtBlockIdSwitch:long, lastAllocatedBlockId: 090 * long transactionID: long, snapshotCounter: int, numberOfSnapshots: int, 091 * numOfSnapshottableDirs: int, 092 * {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed) 093 * } 094 * 095 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) { 096 * INodeInfo of root, numberOfChildren of root: int 097 * [list of INodeInfo of root's children], 098 * [list of INodeDirectoryInfo of root's directory children] 099 * } 100 * 101 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){ 102 * [list of INodeInfo of INodes in topological order] 103 * } 104 * 105 * INodeInfo { 106 * { 107 * localName: short + byte[] 108 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported 109 * or 110 * { 111 * fullPath: byte[] 112 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported 113 * replicationFactor: short, modificationTime: long, 114 * accessTime: long, preferredBlockSize: long, 115 * numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink), 116 * { 117 * nsQuota: long, dsQuota: long, 118 * { 119 * isINodeSnapshottable: byte, 120 * isINodeWithSnapshot: byte (if isINodeSnapshottable is false) 121 * } (when {@link Feature#SNAPSHOT} is supported), 122 * fsPermission: short, PermissionStatus 123 * } for INodeDirectory 124 * or 125 * { 126 * symlinkString, fsPermission: short, PermissionStatus 127 * } for INodeSymlink 128 * or 129 * { 130 * [list of BlockInfo] 131 * [list of FileDiff] 132 * { 133 * isINodeFileUnderConstructionSnapshot: byte, 134 * {clientName: short + byte[], clientMachine: short + byte[]} (when 135 * isINodeFileUnderConstructionSnapshot 
 * is true),
 * } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
 * fsPermission: short, PermissionStatus
 * } for INodeFile
 * }
 *
 * INodeDirectoryInfo {
 *   fullPath of the directory: short + byte[],
 *   numberOfChildren: int, [list of INodeInfo of children INode],
 *   {
 *     numberOfSnapshots: int,
 *     [list of Snapshot] (when NumberOfSnapshots is positive),
 *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive),
 *     number of children that are directories,
 *     [list of INodeDirectoryInfo of the directory children] (includes
 *     snapshot copies of deleted sub-directories)
 *   } (when {@link Feature#SNAPSHOT} is supported),
 * }
 *
 * Snapshot {
 *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
 *   the name of the snapshot)
 * }
 *
 * DirectoryDiff {
 *   full path of the root of the associated Snapshot: short + byte[],
 *   childrenSize: int,
 *   isSnapshotRoot: byte,
 *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
 *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
 * }
 *
 * Diff {
 *   createdListSize: int, [Local name of INode in created list],
 *   deletedListSize: int, [INode in deleted list: INodeInfo]
 * }
 *
 * FileDiff {
 *   full path of the root of the associated Snapshot: short + byte[],
 *   fileSize: long,
 *   snapshotINodeIsNotNull: byte,
 *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
 * }
 * </pre>
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSImageFormat {
  private static final Log LOG = FSImage.LOG;

  // Static-only class: only hosts the loader inner classes, never instantiated.
  private FSImageFormat() {}

  /** Common read-side interface of the legacy and protobuf image loaders. */
  interface AbstractLoader {
    MD5Hash getLoadedImageMd5();
    long getLoadedImageTxId();
  }

  /**
   * Chooses between the protobuf-based loader and the legacy {@link Loader}
   * by sniffing the magic header at the start of the image file, then
   * delegates all queries to the chosen implementation.
   */
  static class LoaderDelegator implements AbstractLoader {
    /** The concrete loader chosen in {@link #load(File)}; null until then. */
    private AbstractLoader impl;
    private final Configuration conf;
    private final FSNamesystem fsn;

    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
      this.conf = conf;
      this.fsn = fsn;
    }

    @Override
    public MD5Hash getLoadedImageMd5() {
      return impl.getLoadedImageMd5();
    }

    @Override
    public long getLoadedImageTxId() {
      return impl.getLoadedImageTxId();
    }

    /**
     * Load the image file, selecting the loader by its magic header.
     * May be called at most once per delegator instance.
     */
    public void load(File file) throws IOException {
      Preconditions.checkState(impl == null, "Image already loaded!");

      FileInputStream is = null;
      try {
        is = new FileInputStream(file);
        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
        IOUtils.readFully(is, magic, 0, magic.length);
        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
          // Magic header present: new-style protobuf image.
          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
              conf, fsn);
          impl = loader;
          loader.load(file);
        } else {
          // No magic header: legacy writable-serialization image.
          Loader loader = new Loader(conf, fsn);
          impl = loader;
          loader.load(file);
        }

      } finally {
        IOUtils.cleanup(LOG, is);
      }
    }
  }

  /**
   * Construct a loader class to load the image. It chooses the loader based on
   * the layout version.
   */
  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
    return new LoaderDelegator(conf, fsn);
  }

  /**
   * A one-shot class responsible for loading an image. The load() function
   * should be called once, after which the getter methods may be used to retrieve
   * information about the image that was loaded, if loading was successful.
   */
  public static class Loader implements AbstractLoader {
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader.
     */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;

    // Snapshot-id -> Snapshot map; populated in load() only when the image
    // layout supports snapshots, otherwise stays null.
    private Map<Integer, Snapshot> snapshotMap = null;
    private final ReferenceMap referenceMap = new ReferenceMap();

    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }

    /**
     * Return the MD5 checksum of the image that has been loaded.
     * @throws IllegalStateException if load() has not yet been called.
     */
    @Override
    public MD5Hash getLoadedImageMd5() {
      checkLoaded();
      return imgDigest;
    }

    @Override
    public long getLoadedImageTxId() {
      checkLoaded();
      return imgTxId;
    }

    /**
     * Throw IllegalStateException if load() has not yet been called.
     */
    private void checkLoaded() {
      if (!loaded) {
        throw new IllegalStateException("Image not yet loaded!");
      }
    }

    /**
     * Throw IllegalStateException if load() has already been called.
     */
    private void checkNotLoaded() {
      if (loaded) {
        throw new IllegalStateException("Image already loaded!");
      }
    }

    /**
     * Load the legacy-format image file. The stream is read strictly in the
     * on-disk order documented on this class; field presence is gated on the
     * image's layout version. One-shot: throws if called twice.
     */
    public void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // Wrap the file in a DigestInputStream so the MD5 of the raw (possibly
      // compressed) bytes is accumulated as a side effect of reading.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
          new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile,
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        // (value is discarded here; only consumed to advance the stream)
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);

        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
        long maxSequentialBlockId = in.readLong();
        namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
      } else {
        long startingGenStamp = namesystem.upgradeGenerationStampToV2();
        // This is an upgrade.
        LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                 "for new blocks set to " + startingGenStamp);
      }

      // read the transaction ID of the last edit represented by
      // this image
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
        imgTxId = in.readLong();
      } else {
        imgTxId = 0;
      }

      // read the last allocated inode id in the fsimage
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
        long lastInodeId = in.readLong();
        namesystem.resetLastInodeId(lastInodeId);
        if (LOG.isDebugEnabled()) {
          LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
        }
      } else {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Old layout version doesn't have inode id."
              + " Will assign new id for each inode.");
        }
      }

      if (supportSnapshot) {
        snapshotMap = namesystem.getSnapshotManager().read(in, this);
      }

      // read compression related info
      FSImageCompression compression;
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
        compression = FSImageCompression.readCompressionHeader(conf, in);
      } else {
        compression = FSImageCompression.createNoopCompression();
      }
      // From this point on, read through the (possibly decompressing) wrapper;
      // it still sits on top of fin, so the MD5 digest keeps accumulating.
      in = compression.unwrapInputStream(fin);

      LOG.info("Loading image file " + curFile + " using " + compression);

      // load all inodes
      LOG.info("Number of files = " + numFiles);
      prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
      Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
      // Pick the inode-tree reader matching how names were serialized.
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
        if (supportSnapshot) {
          loadLocalNameINodesWithSnapshot(numFiles, in, counter);
        } else {
          loadLocalNameINodes(numFiles, in, counter);
        }
      } else {
        loadFullNameINodes(numFiles, in, counter);
      }

      loadFilesUnderConstruction(in, supportSnapshot, counter);
      prog.endStep(Phase.LOADING_FSIMAGE, step);
      // Now that the step is finished, set counter equal to total to adjust
      // for possible under-counting due to reference inodes.
      prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

      loadSecretManagerState(in);

      loadCacheManagerState(in);

      // make sure to read to the end of file
      boolean eof = (in.read() == -1);
      assert eof : "Should have reached the end of image file " + curFile;
    } finally {
      in.close();
    }

    imgDigest = new MD5Hash(digester.digest());
    loaded = true;

    LOG.info("Image file " + curFile + " of size " + curFile.length() +
        " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
  }

    /** Update the root node's attributes */
    private void updateRootAttr(INodeWithAdditionalFields root) {
      final Quota.Counts q = root.getQuotaCounts();
      final long nsQuota = q.get(Quota.NAMESPACE);
      final long dsQuota = q.get(Quota.DISKSPACE);
      FSDirectory fsDir = namesystem.dir;
      // -1 means "no quota set" for both namespace and diskspace quotas.
      if (nsQuota != -1 || dsQuota != -1) {
        fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
      }
      fsDir.rootDir.cloneModificationTime(root);
      fsDir.rootDir.clonePermissionStatus(root);
    }

    /**
     * Load fsimage files when 1) only local names are stored,
     * and 2) snapshot is supported.
     *
     * @param numFiles number of files expected to be read
     * @param in Image input stream
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
        Counter counter) throws IOException {
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());

      // load root
      loadRoot(in, counter);
      // load rest of the nodes recursively
      loadDirectoryWithSnapshot(in, counter);
    }

    /**
     * load fsimage files assuming only local names are stored. Used when
     * snapshots are not supported by the layout version.
     *
     * @param numFiles number of files expected to be read
     * @param in image input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException
     */
    private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      assert NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
      assert numFiles > 0;

      // load root
      loadRoot(in, counter);
      // have loaded the first file (the root)
      numFiles--;

      // load rest of the nodes directory by directory
      while (numFiles > 0) {
        numFiles -= loadDirectory(in, counter);
      }
      // numFiles went negative: a directory reported more children than the
      // header said remained.
      if (numFiles != 0) {
        throw new IOException("Read unexpect number of files: " + -numFiles);
      }
    }

    /**
     * Load information about root, and use the information to update the root
     * directory of NameSystem.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadRoot(DataInput in, Counter counter)
        throws IOException {
      // load root
      // root is serialized with a zero-length (empty) local name
      if (in.readShort() != 0) {
        throw new IOException("First node is not root");
      }
      final INodeDirectory root = loadINode(null, false, in, counter)
          .asDirectory();
      // update the root's attributes
      updateRootAttr(root);
    }

    /** Load children nodes for the parent directory. */
    private int loadChildren(INodeDirectory parent, DataInput in,
        Counter counter) throws IOException {
      int numChildren = in.readInt();
      for (int i = 0; i < numChildren; i++) {
        // load single inode
        INode newNode = loadINodeWithLocalName(false, in, true, counter);
        addToParent(parent, newNode);
      }
      return numChildren;
    }

    /**
     * Load a directory when snapshot is supported.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
        throws IOException {
      // Step 1. Identify the parent INode
      long inodeId = in.readLong();
      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
          .asDirectory();

      // Check if the whole subtree has been saved (for reference nodes)
      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
      if (!toLoadSubtree) {
        return;
      }

      // Step 2. Load snapshots if parent is snapshottable
      // numSnapshots < 0 marks a non-snapshottable directory.
      int numSnapshots = in.readInt();
      if (numSnapshots >= 0) {
        final INodeDirectorySnapshottable snapshottableParent
            = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
        // load snapshots and snapshotQuota
        SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
            numSnapshots, in, this);
        if (snapshottableParent.getSnapshotQuota() > 0) {
          // add the directory to the snapshottable directory list in
          // SnapshotManager. Note that we only add root when its snapshot quota
          // is positive.
          this.namesystem.getSnapshotManager().addSnapshottable(
              snapshottableParent);
        }
      }

      // Step 3. Load children nodes under parent
      loadChildren(parent, in, counter);

      // Step 4. load Directory Diff List
      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);

      // Recursively load sub-directories, including snapshot copies of deleted
      // directories
      int numSubTree = in.readInt();
      for (int i = 0; i < numSubTree; i++) {
        loadDirectoryWithSnapshot(in, counter);
      }
    }

    /**
     * Load all children of a directory
     *
     * @param in input to load from
     * @param counter Counter to increment for namenode startup progress
     * @return number of child inodes read
     * @throws IOException
     */
    private int loadDirectory(DataInput in, Counter counter) throws IOException {
      String parentPath = FSImageSerialization.readString(in);
      // Rename .snapshot paths if we're doing an upgrade
      parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
      final INodeDirectory parent = INodeDirectory.valueOf(
          namesystem.dir.getNode(parentPath, true), parentPath);
      return loadChildren(parent, in, counter);
    }

    /**
     * load fsimage files assuming full path names are stored
     *
     * @param numFiles total number of files to load
     * @param in data input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException if any error occurs
     */
    private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      byte[][] pathComponents;
      byte[][] parentPath = {{}};
      FSDirectory fsDir = namesystem.dir;
      INodeDirectory parentINode = fsDir.rootDir;
      for (long i = 0; i < numFiles; i++) {
        pathComponents = FSImageSerialization.readPathComponents(in);
        // Rename any reserved components (e.g. ".snapshot") during an upgrade.
        for (int j=0; j < pathComponents.length; j++) {
          byte[] newComponent = renameReservedComponentOnUpgrade
              (pathComponents[j], getLayoutVersion());
          if (!Arrays.equals(newComponent, pathComponents[j])) {
            String oldPath = DFSUtil.byteArray2PathString(pathComponents);
            pathComponents[j] = newComponent;
            String newPath = DFSUtil.byteArray2PathString(pathComponents);
            LOG.info("Renaming reserved path " + oldPath + " to " + newPath);
          }
        }
        final INode newNode = loadINode(
            pathComponents[pathComponents.length-1], false, in, counter);

        if (isRoot(pathComponents)) { // it is the root
          // update the root's attributes
          updateRootAttr(newNode.asDirectory());
          continue;
        }

        namesystem.dir.addToInodeMap(newNode);
        // check if the new inode belongs to the same parent
        // (inodes are written in path order, so the parent lookup is cached)
        if(!isParent(pathComponents, parentPath)) {
          parentINode = getParentINodeDirectory(pathComponents);
          parentPath = getParent(pathComponents);
        }

        // add new inode
        addToParent(parentINode, newNode);
      }
    }

    private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
        ) throws FileNotFoundException, PathIsNotDirectoryException,
        UnresolvedLinkException {
      if (pathComponents.length < 2) { // root
        return null;
      }
      // Gets the parent INode
      final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
          pathComponents);
      return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
    }

    /**
     * Add the child node to parent and, if child is a file, update block map.
     * This method is only used for image loading so that synchronization,
     * modification time update and space count update are not needed.
     */
    private void addToParent(INodeDirectory parent, INode child) {
      FSDirectory fsDir = namesystem.dir;
      if (parent == fsDir.rootDir) {
        // Components directly under root may collide with reserved names
        // after an upgrade; rename them if necessary.
        child.setLocalName(renameReservedRootComponentOnUpgrade(
            child.getLocalNameBytes(), getLayoutVersion()));
      }
      // NOTE: This does not update space counts for parents
      if (!parent.addChild(child)) {
        return;
      }
      namesystem.dir.cacheName(child);

      if (child.isFile()) {
        updateBlocksMap(child.asFile());
      }
    }

    /** Register each block of the file with the BlockManager's block map. */
    public void updateBlocksMap(INodeFile file) {
      // Add file->block mapping
      final BlockInfo[] blocks = file.getBlocks();
      if (blocks != null) {
        final BlockManager bm = namesystem.getBlockManager();
        for (int i = 0; i < blocks.length; i++) {
          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
        }
      }
    }

    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
    public FSDirectory getFSDirectoryInLoading() {
      return namesystem.dir;
    }

    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
        boolean updateINodeMap) throws IOException {
      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
    }

    public INode loadINodeWithLocalName(boolean isSnapshotINode,
        DataInput in, boolean updateINodeMap, Counter counter)
        throws IOException {
      byte[] localName = FSImageSerialization.readLocalName(in);
      localName =
          renameReservedComponentOnUpgrade(localName, getLayoutVersion());
      INode inode = loadINode(localName, isSnapshotINode, in, counter);
      if (updateINodeMap) {
        namesystem.dir.addToInodeMap(inode);
      }
      return inode;
    }

    /**
     * load an inode from fsimage except for its name
     *
     * @param in data input stream from which image is read
     * @param counter Counter to increment for namenode startup progress
     * @return an inode
     */
    @SuppressWarnings("deprecation")
    INode loadINode(final byte[] localName, boolean isSnapshotINode,
        DataInput in, Counter counter) throws IOException {
      final int imgVersion = getLayoutVersion();
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
        namesystem.getFSDirectory().verifyINodeName(localName);
      }

      long inodeId = NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
          : namesystem.allocateNewInodeId();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long modificationTime = in.readLong();
      long atime = 0;
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
        atime = in.readLong();
      }
      final long blockSize = in.readLong();
      // numBlocks doubles as a type tag: >= 0 file, -1 directory,
      // -2 symlink, -3 reference (see class javadoc).
      final int numBlocks = in.readInt();

      if (numBlocks >= 0) {
        // file

        // read blocks
        BlockInfo[] blocks = new BlockInfo[numBlocks];
        for (int j = 0; j < numBlocks; j++) {
          blocks[j] = new BlockInfo(replication);
          blocks[j].readFields(in);
        }

        String clientName = "";
        String clientMachine = "";
        boolean underConstruction = false;
        FileDiffList fileDiffs = null;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
          // read diffs
          fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);

          if (isSnapshotINode) {
            underConstruction = in.readBoolean();
            if (underConstruction) {
              clientName = FSImageSerialization.readString(in);
              clientMachine = FSImageSerialization.readString(in);
              // convert the last block to BlockUC
              if (blocks.length > 0) {
                BlockInfo lastBlk = blocks[blocks.length - 1];
                blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
                    lastBlk, replication);
              }
            }
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        // return
        if (counter != null) {
          counter.increment();
        }
        final INodeFile file = new INodeFile(inodeId, localName, permissions,
            modificationTime, atime, blocks, replication, blockSize);
        if (underConstruction) {
          file.toUnderConstruction(clientName, clientMachine);
        }
        return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
      } else if (numBlocks == -1) {
        //directory

        //read quotas
        final long nsQuota = in.readLong();
        long dsQuota = -1L;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
          dsQuota = in.readLong();
        }

        //read snapshot info
        boolean snapshottable = false;
        boolean withSnapshot = false;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
          snapshottable = in.readBoolean();
          if (!snapshottable) {
            withSnapshot = in.readBoolean();
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        //return
        if (counter != null) {
          counter.increment();
        }
        final INodeDirectory dir = new INodeDirectory(inodeId, localName,
            permissions, modificationTime);
        if (nsQuota >= 0 || dsQuota >= 0) {
          dir.addDirectoryWithQuotaFeature(nsQuota, dsQuota);
        }
        if (withSnapshot) {
          dir.addSnapshotFeature(null);
        }
        return snapshottable ? new INodeDirectorySnapshottable(dir) : dir;
      } else if (numBlocks == -2) {
        //symlink
        if (!FileSystem.areSymlinksEnabled()) {
          throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
        }

        final String symlink = Text.readString(in);
        final PermissionStatus permissions = PermissionStatus.read(in);
        if (counter != null) {
          counter.increment();
        }
        return new INodeSymlink(inodeId, localName, permissions,
            modificationTime, atime, symlink);
      } else if (numBlocks == -3) {
        //reference
        // Intentionally do not increment counter, because it is too difficult at
        // this point to assess whether or not this is a reference that counts
        // toward quota.

        final boolean isWithName = in.readBoolean();
        // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
        int snapshotId = in.readInt();

        final INodeReference.WithCount withCount
            = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);

        if (isWithName) {
          return new INodeReference.WithName(null, withCount, localName,
              snapshotId);
        } else {
          final INodeReference ref = new INodeReference.DstReference(null,
              withCount, snapshotId);
          return ref;
        }
      }

      throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
    }

    /** Load {@link INodeFileAttributes}.
     */
    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      if (!NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        // Older layouts store a full inode for the snapshot copy.
        return loadINodeWithLocalName(true, in, false).asFile();
      }

      final byte[] name = FSImageSerialization.readLocalName(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      final long modificationTime = in.readLong();
      final long accessTime = in.readLong();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long preferredBlockSize = in.readLong();

      return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
          accessTime, replication, preferredBlockSize, null);
    }

    /** Load {@link INodeDirectoryAttributes} for a snapshot copy. */
    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
        throws IOException {
      final int layoutVersion = getLayoutVersion();

      if (!NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
        return loadINodeWithLocalName(true, in, false).asDirectory();
      }

      final byte[] name = FSImageSerialization.readLocalName(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      final long modificationTime = in.readLong();

      //read quotas
      final long nsQuota = in.readLong();
      final long dsQuota = in.readLong();

      // -1 for both quotas means "no quota": use the lighter SnapshotCopy.
      return nsQuota == -1L && dsQuota == -1L ? new INodeDirectoryAttributes.SnapshotCopy(
          name, permissions, null, modificationTime, null)
          : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
              null, modificationTime, nsQuota, dsQuota, null);
    }

    /**
     * Load the under-construction file section and re-open leases for files
     * that are not inside a snapshot.
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
            namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) &&
            NameNodeLayoutVersion.supports(
                LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
943 oldnode = namesystem.dir.getInode(cons.getId()).asFile(); 944 inSnapshot = true; 945 } else { 946 path = renameReservedPathsOnUpgrade(path, getLayoutVersion()); 947 final INodesInPath iip = fsDir.getLastINodeInPath(path); 948 oldnode = INodeFile.valueOf(iip.getINode(0), path); 949 } 950 951 FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature(); 952 oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine()); 953 if (oldnode.numBlocks() > 0) { 954 BlockInfo ucBlock = cons.getLastBlock(); 955 // we do not replace the inode, just replace the last block of oldnode 956 BlockInfo info = namesystem.getBlockManager().addBlockCollection( 957 ucBlock, oldnode); 958 oldnode.setBlock(oldnode.numBlocks() - 1, info); 959 } 960 961 if (!inSnapshot) { 962 namesystem.leaseManager.addLease(cons 963 .getFileUnderConstructionFeature().getClientName(), path); 964 } 965 } 966 } 967 968 private void loadSecretManagerState(DataInput in) 969 throws IOException { 970 int imgVersion = getLayoutVersion(); 971 972 if (!NameNodeLayoutVersion.supports( 973 LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) { 974 //SecretManagerState is not available. 975 //This must not happen if security is turned on. 
976 return; 977 } 978 namesystem.loadSecretManagerStateCompat(in); 979 } 980 981 private void loadCacheManagerState(DataInput in) throws IOException { 982 int imgVersion = getLayoutVersion(); 983 if (!NameNodeLayoutVersion.supports( 984 LayoutVersion.Feature.CACHING, imgVersion)) { 985 return; 986 } 987 namesystem.getCacheManager().loadStateCompat(in); 988 } 989 990 private int getLayoutVersion() { 991 return namesystem.getFSImage().getStorage().getLayoutVersion(); 992 } 993 994 private boolean isRoot(byte[][] path) { 995 return path.length == 1 && 996 path[0] == null; 997 } 998 999 private boolean isParent(byte[][] path, byte[][] parent) { 1000 if (path == null || parent == null) 1001 return false; 1002 if (parent.length == 0 || path.length != parent.length + 1) 1003 return false; 1004 boolean isParent = true; 1005 for (int i = 0; i < parent.length; i++) { 1006 isParent = isParent && Arrays.equals(path[i], parent[i]); 1007 } 1008 return isParent; 1009 } 1010 1011 /** 1012 * Return string representing the parent of the given path. 1013 */ 1014 String getParent(String path) { 1015 return path.substring(0, path.lastIndexOf(Path.SEPARATOR)); 1016 } 1017 1018 byte[][] getParent(byte[][] path) { 1019 byte[][] result = new byte[path.length - 1][]; 1020 for (int i = 0; i < result.length; i++) { 1021 result[i] = new byte[path[i].length]; 1022 System.arraycopy(path[i], 0, result[i], 0, path[i].length); 1023 } 1024 return result; 1025 } 1026 1027 public Snapshot getSnapshot(DataInput in) throws IOException { 1028 return snapshotMap.get(in.readInt()); 1029 } 1030 } 1031 1032 @VisibleForTesting 1033 public static final TreeMap<String, String> renameReservedMap = 1034 new TreeMap<String, String>(); 1035 1036 /** 1037 * Use the default key-value pairs that will be used to determine how to 1038 * rename reserved paths on upgrade. 
*/
  @VisibleForTesting
  public static void useDefaultRenameReservedPairs() {
    renameReservedMap.clear();
    // Default rename target: "<component>.<layout version>.UPGRADE_RENAMED".
    for (String key: HdfsConstants.RESERVED_PATH_COMPONENTS) {
      renameReservedMap.put(
          key,
          key + "." + HdfsConstants.NAMENODE_LAYOUT_VERSION + "."
              + "UPGRADE_RENAMED");
    }
  }

  /**
   * Set the key-value pairs that will be used to determine how to rename
   * reserved paths on upgrade.
   *
   * @param renameReserved comma-separated list of "/key=value" overrides,
   *        parsed by {@link #setRenameReservedMapInternal(String)}
   */
  @VisibleForTesting
  public static void setRenameReservedPairs(String renameReserved) {
    // Clear and set the default values
    useDefaultRenameReservedPairs();
    // Overwrite with provided values
    setRenameReservedMapInternal(renameReserved);
  }

  /**
   * Parse user-supplied rename overrides and install them into
   * {@link #renameReservedMap}. Each pair must name a known reserved path
   * component and a valid replacement component name.
   *
   * @throws IllegalArgumentException if a pair cannot be parsed, names an
   *         unknown reserved path, or has an invalid replacement name
   */
  private static void setRenameReservedMapInternal(String renameReserved) {
    Collection<String> pairs =
        StringUtils.getTrimmedStringCollection(renameReserved);
    for (String p : pairs) {
      String[] pair = StringUtils.split(p, '/', '=');
      Preconditions.checkArgument(pair.length == 2,
          "Could not parse key-value pair " + p);
      String key = pair[0];
      String value = pair[1];
      Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
          "Unknown reserved path " + key);
      Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
          "Invalid rename path for " + key + ": " + value);
      LOG.info("Will rename reserved path " + key + " to " + value);
      renameReservedMap.put(key, value);
    }
  }

  /**
   * When upgrading from an old version, the filesystem could contain paths
   * that are now reserved in the new version (e.g. .snapshot). This renames
   * these new reserved paths to a user-specified value to avoid collisions
   * with the reserved name.
   *
   * @param path Old path potentially containing a reserved path
   * @param layoutVersion layout version of the image being upgraded
   * @return New path with reserved path components renamed to user value
   */
  static String renameReservedPathsOnUpgrade(String path,
      final int layoutVersion) {
    final String oldPath = path;
    // If any known LVs aren't supported, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Only need to worry about the root directory
      if (components.length > 1) {
        // components[0] is the empty root component; [1] is the first real one
        components[1] = DFSUtil.bytes2String(
            renameReservedRootComponentOnUpgrade(
                DFSUtil.string2Bytes(components[1]),
                layoutVersion));
        path = DFSUtil.strings2PathString(components);
      }
    }
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Special case the root path
      if (components.length == 0) {
        return path;
      }
      // .snapshot may appear at any depth, so every component is checked
      for (int i=0; i<components.length; i++) {
        components[i] = DFSUtil.bytes2String(
            renameReservedComponentOnUpgrade(
                DFSUtil.string2Bytes(components[i]),
                layoutVersion));
      }
      path = DFSUtil.strings2PathString(components);
    }

    if (!path.equals(oldPath)) {
      LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
          + path);
    }
    return path;
  }

  // Error shown when a reserved path is found in the image but no rename
  // mapping was configured for it; directs the operator to -renameReserved.
  private final static String RESERVED_ERROR_MSG =
      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
      + " this version of HDFS. Please rollback and delete or rename"
      + " this path, or upgrade with the "
      + StartupOption.RENAMERESERVED.getName()
      + " [key-value pairs]"
      + " option to automatically rename these paths during upgrade.";

  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for a
   * single byte array path component.
   */
  private static byte[] renameReservedComponentOnUpgrade(byte[] component,
      final int layoutVersion) {
    // If the LV doesn't support snapshots, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      if (Arrays.equals(component, HdfsConstants.DOT_SNAPSHOT_DIR_BYTES)) {
        // A rename mapping for .snapshot must exist, otherwise the upgrade
        // cannot proceed safely.
        Preconditions.checkArgument(
            renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR),
            RESERVED_ERROR_MSG);
        component =
            DFSUtil.string2Bytes(renameReservedMap
                .get(HdfsConstants.DOT_SNAPSHOT_DIR));
      }
    }
    return component;
  }

  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for the
   * single byte array component directly under the root (handles the
   * /.reserved prefix).
   */
  private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
      final int layoutVersion) {
    // If the LV doesn't support inode IDs, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
      if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
        // A rename mapping for .reserved must exist, otherwise the upgrade
        // cannot proceed safely.
        Preconditions.checkArgument(
            renameReservedMap.containsKey(FSDirectory.DOT_RESERVED_STRING),
            RESERVED_ERROR_MSG);
        final String renameString = renameReservedMap
            .get(FSDirectory.DOT_RESERVED_STRING);
        component =
            DFSUtil.string2Bytes(renameString);
        LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
            + " to " + renameString);
      }
    }
    return component;
  }

  /**
   * A one-shot class responsible for writing an image file.
* The write() function should be called once, after which the getter
   * functions may be used to retrieve information about the file that was written.
   *
   * This is replaced by the PB-based FSImage. The class is to maintain
   * compatibility for the external fsimage tool.
   */
  @Deprecated
  static class Saver {
    // Layout version written into the legacy (pre-protobuf) image header.
    private static final int LAYOUT_VERSION = -51;
    // Namespace snapshot/cancellation context supplied by the caller.
    private final SaveNamespaceContext context;
    /** Set to true once an image has been written */
    private boolean saved = false;

    /** The MD5 checksum of the file that was written */
    private MD5Hash savedDigest;
    // Tracks reference inodes so each shared subtree is serialized only once.
    private final ReferenceMap referenceMap = new ReferenceMap();

    // Under-construction files found only inside snapshots, keyed by inode
    // id; written out in the under-construction section (see HDFS-5428).
    private final Map<Long, INodeFile> snapshotUCMap =
        new HashMap<Long, INodeFile>();

    /** @throws IllegalStateException if the instance has not yet saved an image */
    private void checkSaved() {
      if (!saved) {
        throw new IllegalStateException("FSImageSaver has not saved an image");
      }
    }

    /** @throws IllegalStateException if the instance has already saved an image */
    private void checkNotSaved() {
      if (saved) {
        throw new IllegalStateException("FSImageSaver has already saved an image");
      }
    }


    Saver(SaveNamespaceContext context) {
      this.context = context;
    }

    /**
     * Return the MD5 checksum of the image file that was saved.
*/
    MD5Hash getSavedDigest() {
      checkSaved();
      return savedDigest;
    }

    /**
     * Write the entire namespace image to {@code newFile} in the legacy
     * binary format, computing its MD5 digest as a side effect. May only be
     * called once per Saver instance.
     *
     * The header fields below are written in a fixed order that defines the
     * on-disk format — do not reorder.
     *
     * @param newFile destination image file
     * @param compression compression to apply to the body (header is
     *        uncompressed)
     * @throws IOException on write failure or if the save is cancelled
     */
    void save(File newFile, FSImageCompression compression) throws IOException {
      checkNotSaved();

      final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
      final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
      // Total inode count, taken from the root's namespace quota usage.
      final long numINodes = rootDir.getDirectoryWithQuotaFeature()
          .getSpaceConsumed().get(Quota.NAMESPACE);
      String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
      Step step = new Step(StepType.INODES, sdPath);
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
      prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
      Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
      long startTime = now();
      //
      // Write out data
      //
      MessageDigest digester = MD5Hash.getDigester();
      FileOutputStream fout = new FileOutputStream(newFile);
      DigestOutputStream fos = new DigestOutputStream(fout, digester);
      DataOutputStream out = new DataOutputStream(fos);
      try {
        out.writeInt(LAYOUT_VERSION);
        LayoutFlags.write(out);
        // We use the non-locked version of getNamespaceInfo here since
        // the coordinating thread of saveNamespace already has read-locked
        // the namespace for us. If we attempt to take another readlock
        // from the actual saver thread, there's a potential of a
        // fairness-related deadlock. See the comments on HDFS-2223.
        out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
            .getNamespaceID());
        out.writeLong(numINodes);
        out.writeLong(sourceNamesystem.getGenerationStampV1());
        out.writeLong(sourceNamesystem.getGenerationStampV2());
        out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
        out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
        out.writeLong(context.getTxId());
        out.writeLong(sourceNamesystem.getLastInodeId());


        sourceNamesystem.getSnapshotManager().write(out);

        // write compression info and set up compressed stream
        out = compression.writeHeaderAndWrapStream(fos);
        LOG.info("Saving image file " + newFile +
                 " using " + compression);

        // save the root
        saveINode2Image(rootDir, out, false, referenceMap, counter);
        // save the rest of the nodes
        saveImage(rootDir, out, true, false, counter);
        prog.endStep(Phase.SAVING_CHECKPOINT, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
        // save files under construction
        // TODO: for HDFS-5428, since we cannot break the compatibility of
        // fsimage, we store part of the under-construction files that are only
        // in snapshots in this "under-construction-file" section. As a
        // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
        // paths, so that when loading fsimage we do not put them into the lease
        // map. In the future, we can remove this hack when we can bump the
        // layout version.
        sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);

        context.checkCancelled();
        sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
        context.checkCancelled();
        sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
        context.checkCancelled();
        out.flush();
        context.checkCancelled();
        // Force file data (not just the stream buffers) to disk before
        // declaring the image saved.
        fout.getChannel().force(true);
      } finally {
        out.close();
      }

      saved = true;
      // set md5 of the saved image
      savedDigest = new MD5Hash(digester.digest());

      LOG.info("Image file " + newFile + " of size " + newFile.length() +
          " bytes saved in " + (now() - startTime)/1000 + " seconds.");
    }

    /**
     * Save children INodes.
     * @param children The list of children INodes
     * @param out The DataOutputStream to write
     * @param inSnapshot Whether the parent directory or its ancestor is in
     *                   the deleted list of some snapshot (caused by rename or
     *                   deletion)
     * @param counter Counter to increment for namenode startup progress
     * @return Number of children that are directory
     */
    private int saveChildren(ReadOnlyList<INode> children,
        DataOutputStream out, boolean inSnapshot, Counter counter)
        throws IOException {
      // Write normal children INode.
      out.writeInt(children.size());
      int dirNum = 0;
      int i = 0;
      for(INode child : children) {
        // print all children first
        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
        // here, thus even if the parent directory is in snapshot, we still
        // do not handle INodeUC as those stored in deleted list
        saveINode2Image(child, out, false, referenceMap, counter);
        if (child.isDirectory()) {
          dirNum++;
        } else if (inSnapshot && child.isFile()
            && child.asFile().isUnderConstruction()) {
          // Remember snapshot-only UC files; they are written later in the
          // under-construction section (see the TODO in save()).
          this.snapshotUCMap.put(child.getId(), child.asFile());
        }
        // Poll for cancellation every 50 children to stay responsive.
        if (i++ % 50 == 0) {
          context.checkCancelled();
        }
      }
      return dirNum;
    }

    /**
     * Save file tree image starting from the given root.
     * This is a recursive procedure, which first saves all children and
     * snapshot diffs of a current directory and then moves inside the
     * sub-directories.
     *
     * @param current The current node
     * @param out The DataoutputStream to write the image
     * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
     *                      reference node, its subtree may already have been
     *                      saved before.
     * @param inSnapshot Whether the current directory is in snapshot
     * @param counter Counter to increment for namenode startup progress
     */
    private void saveImage(INodeDirectory current, DataOutputStream out,
        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
        throws IOException {
      // write the inode id of the directory
      out.writeLong(current.getId());

      if (!toSaveSubtree) {
        return;
      }

      final ReadOnlyList<INode> children = current
          .getChildrenList(Snapshot.CURRENT_STATE_ID);
      int dirNum = 0;
      List<INodeDirectory> snapshotDirs = null;
      DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
      if (sf != null) {
        // Directories that exist only in snapshots also need their subtrees
        // written; collect them here and recurse at the end.
        snapshotDirs = new ArrayList<INodeDirectory>();
        sf.getSnapshotDirectory(snapshotDirs);
        dirNum += snapshotDirs.size();
      }

      // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
      // Snapshots
      if (current instanceof INodeDirectorySnapshottable) {
        INodeDirectorySnapshottable snapshottableNode =
            (INodeDirectorySnapshottable) current;
        SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
      } else {
        out.writeInt(-1); // # of snapshots
      }

      // 3. Write children INode
      dirNum += saveChildren(children, out, inSnapshot, counter);

      // 4. Write DirectoryDiff lists, if there is any.
      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);

      // Write sub-tree of sub-directories, including possible snapshots of
      // deleted sub-directories
      out.writeInt(dirNum); // the number of sub-directories
      for(INode child : children) {
        if(!child.isDirectory()) {
          continue;
        }
        // make sure we only save the subtree under a reference node once
        boolean toSave = child.isReference() ?
            referenceMap.toProcessSubtree(child.getId()) : true;
        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
      }
      if (snapshotDirs != null) {
        for (INodeDirectory subDir : snapshotDirs) {
          // make sure we only save the subtree under a reference node once
          boolean toSave = subDir.getParentReference() != null ?
              referenceMap.toProcessSubtree(subDir.getId()) : true;
          saveImage(subDir, out, toSave, true, counter);
        }
      }
    }

    /**
     * Saves inode and increments progress counter.
     *
     * @param inode INode to save
     * @param out DataOutputStream to receive inode
     * @param writeUnderConstruction boolean true if this is under construction
     * @param referenceMap ReferenceMap containing reference inodes
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException thrown if there is an I/O error
     */
    private void saveINode2Image(INode inode, DataOutputStream out,
        boolean writeUnderConstruction, ReferenceMap referenceMap,
        Counter counter) throws IOException {
      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
          referenceMap);
      // Intentionally do not increment counter for reference inodes, because it
      // is too difficult at this point to assess whether or not this is a
      // reference that counts toward quota.
      if (!(inode instanceof INodeReference)) {
        counter.increment();
      }
    }
  }
}