001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.util.Time.now;
021
022import java.io.DataInput;
023import java.io.DataInputStream;
024import java.io.DataOutputStream;
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.FileNotFoundException;
028import java.io.FileOutputStream;
029import java.io.IOException;
030import java.security.DigestInputStream;
031import java.security.DigestOutputStream;
032import java.security.MessageDigest;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.Collection;
036import java.util.HashMap;
037import java.util.List;
038import java.util.Map;
039import java.util.TreeMap;
040
041import org.apache.commons.logging.Log;
042import org.apache.hadoop.classification.InterfaceAudience;
043import org.apache.hadoop.classification.InterfaceStability;
044import org.apache.hadoop.conf.Configuration;
045import org.apache.hadoop.fs.FileSystem;
046import org.apache.hadoop.fs.Path;
047import org.apache.hadoop.fs.PathIsNotDirectoryException;
048import org.apache.hadoop.fs.UnresolvedLinkException;
049import org.apache.hadoop.fs.permission.PermissionStatus;
050import org.apache.hadoop.hdfs.DFSUtil;
051import org.apache.hadoop.hdfs.protocol.HdfsConstants;
052import org.apache.hadoop.hdfs.protocol.LayoutFlags;
053import org.apache.hadoop.hdfs.protocol.LayoutVersion;
054import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
055import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
056import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
057import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
058import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
059import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
060import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature;
061import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList;
062import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
063import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
064import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
065import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
066import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
067import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
068import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
069import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
070import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
071import org.apache.hadoop.hdfs.util.ReadOnlyList;
072import org.apache.hadoop.io.IOUtils;
073import org.apache.hadoop.io.MD5Hash;
074import org.apache.hadoop.io.Text;
075import org.apache.hadoop.util.StringUtils;
076
077import com.google.common.base.Preconditions;
078import com.google.common.annotations.VisibleForTesting;
079
080/**
081 * Contains inner classes for reading or writing the on-disk format for
082 * FSImages.
083 *
084 * In particular, the format of the FSImage looks like:
085 * <pre>
086 * FSImage {
087 *   layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
088 *   namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
 *   generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long,
 *   transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
091 *   numOfSnapshottableDirs: int,
092 *   {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
093 * }
094 *
095 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
096 *   INodeInfo of root, numberOfChildren of root: int
097 *   [list of INodeInfo of root's children],
098 *   [list of INodeDirectoryInfo of root's directory children]
099 * }
100 *
101 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
102 *   [list of INodeInfo of INodes in topological order]
103 * }
104 *
105 * INodeInfo {
106 *   {
107 *     localName: short + byte[]
108 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
109 *   or
110 *   {
111 *     fullPath: byte[]
112 *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
113 *   replicationFactor: short, modificationTime: long,
114 *   accessTime: long, preferredBlockSize: long,
115 *   numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
116 *   {
117 *     nsQuota: long, dsQuota: long,
118 *     {
119 *       isINodeSnapshottable: byte,
120 *       isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
121 *     } (when {@link Feature#SNAPSHOT} is supported),
122 *     fsPermission: short, PermissionStatus
123 *   } for INodeDirectory
124 *   or
125 *   {
126 *     symlinkString, fsPermission: short, PermissionStatus
127 *   } for INodeSymlink
128 *   or
129 *   {
130 *     [list of BlockInfo]
131 *     [list of FileDiff]
132 *     {
133 *       isINodeFileUnderConstructionSnapshot: byte,
134 *       {clientName: short + byte[], clientMachine: short + byte[]} (when
135 *       isINodeFileUnderConstructionSnapshot is true),
136 *     } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
137 *     fsPermission: short, PermissionStatus
138 *   } for INodeFile
139 * }
140 *
141 * INodeDirectoryInfo {
142 *   fullPath of the directory: short + byte[],
143 *   numberOfChildren: int, [list of INodeInfo of children INode],
144 *   {
145 *     numberOfSnapshots: int,
146 *     [list of Snapshot] (when NumberOfSnapshots is positive),
147 *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (when NumberOfDirectoryDiffs is positive),
149 *     number of children that are directories,
150 *     [list of INodeDirectoryInfo of the directory children] (includes
151 *     snapshot copies of deleted sub-directories)
152 *   } (when {@link Feature#SNAPSHOT} is supported),
153 * }
154 *
155 * Snapshot {
156 *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
157 *   the name of the snapshot)
158 * }
159 *
160 * DirectoryDiff {
161 *   full path of the root of the associated Snapshot: short + byte[],
162 *   childrenSize: int,
163 *   isSnapshotRoot: byte,
164 *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
165 *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
166 * }
167 *
168 * Diff {
169 *   createdListSize: int, [Local name of INode in created list],
170 *   deletedListSize: int, [INode in deleted list: INodeInfo]
171 * }
172 *
173 * FileDiff {
174 *   full path of the root of the associated Snapshot: short + byte[],
175 *   fileSize: long,
176 *   snapshotINodeIsNotNull: byte,
177 *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
178 * }
179 * </pre>
180 */
181@InterfaceAudience.Private
182@InterfaceStability.Evolving
183public class FSImageFormat {
184  private static final Log LOG = FSImage.LOG;
185
  // Utility holder for static nested loader/saver classes; never instantiated.
  private FSImageFormat() {}
188
  /** Common view of an fsimage loader, independent of the on-disk format. */
  interface AbstractLoader {
    /** @return MD5 digest of the image file that was loaded. */
    MD5Hash getLoadedImageMd5();
    /** @return transaction ID of the last edit covered by the loaded image. */
    long getLoadedImageTxId();
  }
193
194  static class LoaderDelegator implements AbstractLoader {
195    private AbstractLoader impl;
196    private final Configuration conf;
197    private final FSNamesystem fsn;
198
199    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
200      this.conf = conf;
201      this.fsn = fsn;
202    }
203
204    @Override
205    public MD5Hash getLoadedImageMd5() {
206      return impl.getLoadedImageMd5();
207    }
208
209    @Override
210    public long getLoadedImageTxId() {
211      return impl.getLoadedImageTxId();
212    }
213
214    public void load(File file) throws IOException {
215      Preconditions.checkState(impl == null, "Image already loaded!");
216
217      FileInputStream is = null;
218      try {
219        is = new FileInputStream(file);
220        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
221        IOUtils.readFully(is, magic, 0, magic.length);
222        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
223          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
224              conf, fsn);
225          impl = loader;
226          loader.load(file);
227        } else {
228          Loader loader = new Loader(conf, fsn);
229          impl = loader;
230          loader.load(file);
231        }
232
233      } finally {
234        IOUtils.cleanup(LOG, is);
235      }
236    }
237  }
238
239  /**
240   * Construct a loader class to load the image. It chooses the loader based on
241   * the layout version.
242   */
243  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
244    return new LoaderDelegator(conf, fsn);
245  }
246
247  /**
248   * A one-shot class responsible for loading an image. The load() function
249   * should be called once, after which the getter methods may be used to retrieve
250   * information about the image that was loaded, if loading was successful.
251   */
252  public static class Loader implements AbstractLoader {
253    private final Configuration conf;
254    /** which namesystem this loader is working for */
255    private final FSNamesystem namesystem;
256
257    /** Set to true once a file has been loaded using this loader. */
258    private boolean loaded = false;
259
260    /** The transaction ID of the last edit represented by the loaded file */
261    private long imgTxId;
262    /** The MD5 sum of the loaded file */
263    private MD5Hash imgDigest;
264    
265    private Map<Integer, Snapshot> snapshotMap = null;
266    private final ReferenceMap referenceMap = new ReferenceMap();
267
268    Loader(Configuration conf, FSNamesystem namesystem) {
269      this.conf = conf;
270      this.namesystem = namesystem;
271    }
272
273    /**
274     * Return the MD5 checksum of the image that has been loaded.
275     * @throws IllegalStateException if load() has not yet been called.
276     */
277    @Override
278    public MD5Hash getLoadedImageMd5() {
279      checkLoaded();
280      return imgDigest;
281    }
282
283    @Override
284    public long getLoadedImageTxId() {
285      checkLoaded();
286      return imgTxId;
287    }
288
289    /**
290     * Throw IllegalStateException if load() has not yet been called.
291     */
292    private void checkLoaded() {
293      if (!loaded) {
294        throw new IllegalStateException("Image not yet loaded!");
295      }
296    }
297
298    /**
299     * Throw IllegalStateException if load() has already been called.
300     */
301    private void checkNotLoaded() {
302      if (loaded) {
303        throw new IllegalStateException("Image already loaded!");
304      }
305    }
306
    /**
     * Load a legacy (pre-protobuf) fsimage file into the namesystem. May be
     * called at most once per Loader; afterwards {@link #getLoadedImageMd5()}
     * and {@link #getLoadedImageTxId()} expose the image's digest and txid.
     * Fields are read in a fixed order dictated by the on-disk layout, with
     * optional sections gated by {@link NameNodeLayoutVersion} feature checks.
     *
     * @param curFile the fsimage file to read
     * @throws IOException if the file is inconsistent or cannot be read
     */
    public void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // Every byte read is folded into the MD5 digest via DigestInputStream.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
           new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile, 
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);
        
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }
        
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // From here on, all reads go through the (possibly decompressing)
        // wrapper stream; the digest still sees the raw file bytes via fin.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);
        
        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        loadCacheManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      imgDigest = new MD5Hash(digester.digest());
      loaded = true;
      
      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }
447
448  /** Update the root node's attributes */
449  private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
450    final Quota.Counts q = root.getQuotaCounts();
451    final long nsQuota = q.get(Quota.NAMESPACE);
452    final long dsQuota = q.get(Quota.DISKSPACE);
453    FSDirectory fsDir = namesystem.dir;
454    if (nsQuota != -1 || dsQuota != -1) {
455      fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
456    }
457    fsDir.rootDir.cloneModificationTime(root);
458    fsDir.rootDir.clonePermissionStatus(root);    
459  }
460  
461    /**
462     * Load fsimage files when 1) only local names are stored, 
463     * and 2) snapshot is supported.
464     * 
465     * @param numFiles number of files expected to be read
466     * @param in Image input stream
467     * @param counter Counter to increment for namenode startup progress
468     */
469    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
470        Counter counter) throws IOException {
471      assert NameNodeLayoutVersion.supports(
472          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
473      assert NameNodeLayoutVersion.supports(
474          LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());
475      
476      // load root
477      loadRoot(in, counter);
478      // load rest of the nodes recursively
479      loadDirectoryWithSnapshot(in, counter);
480    }
481    
482  /** 
483   * load fsimage files assuming only local names are stored. Used when
484   * snapshots are not supported by the layout version.
485   *   
486   * @param numFiles number of files expected to be read
487   * @param in image input stream
488   * @param counter Counter to increment for namenode startup progress
489   * @throws IOException
490   */  
491   private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
492       throws IOException {
493     assert NameNodeLayoutVersion.supports(
494         LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
495     assert numFiles > 0;
496
497     // load root
498     loadRoot(in, counter);
499     // have loaded the first file (the root)
500     numFiles--; 
501
502     // load rest of the nodes directory by directory
503     while (numFiles > 0) {
504       numFiles -= loadDirectory(in, counter);
505     }
506     if (numFiles != 0) {
507       throw new IOException("Read unexpect number of files: " + -numFiles);
508     }
509   }
510   
511    /**
512     * Load information about root, and use the information to update the root
513     * directory of NameSystem.
514     * @param in The {@link DataInput} instance to read.
515     * @param counter Counter to increment for namenode startup progress
516     */
517    private void loadRoot(DataInput in, Counter counter)
518        throws IOException {
519      // load root
520      if (in.readShort() != 0) {
521        throw new IOException("First node is not root");
522      }
523      final INodeDirectory root = loadINode(null, false, in, counter)
524        .asDirectory();
525      // update the root's attributes
526      updateRootAttr(root);
527    }
528   
529    /** Load children nodes for the parent directory. */
530    private int loadChildren(INodeDirectory parent, DataInput in,
531        Counter counter) throws IOException {
532      int numChildren = in.readInt();
533      for (int i = 0; i < numChildren; i++) {
534        // load single inode
535        INode newNode = loadINodeWithLocalName(false, in, true, counter);
536        addToParent(parent, newNode);
537      }
538      return numChildren;
539    }
540    
541    /**
542     * Load a directory when snapshot is supported.
543     * @param in The {@link DataInput} instance to read.
544     * @param counter Counter to increment for namenode startup progress
545     */
546    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
547        throws IOException {
548      // Step 1. Identify the parent INode
549      long inodeId = in.readLong();
550      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
551          .asDirectory();
552      
553      // Check if the whole subtree has been saved (for reference nodes)
554      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
555      if (!toLoadSubtree) {
556        return;
557      }
558      
559      // Step 2. Load snapshots if parent is snapshottable
560      int numSnapshots = in.readInt();
561      if (numSnapshots >= 0) {
562        final INodeDirectorySnapshottable snapshottableParent
563            = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
564        // load snapshots and snapshotQuota
565        SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
566            numSnapshots, in, this);
567        if (snapshottableParent.getSnapshotQuota() > 0) {
568          // add the directory to the snapshottable directory list in 
569          // SnapshotManager. Note that we only add root when its snapshot quota
570          // is positive.
571          this.namesystem.getSnapshotManager().addSnapshottable(
572              snapshottableParent);
573        }
574      }
575
576      // Step 3. Load children nodes under parent
577      loadChildren(parent, in, counter);
578      
579      // Step 4. load Directory Diff List
580      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
581      
582      // Recursively load sub-directories, including snapshot copies of deleted
583      // directories
584      int numSubTree = in.readInt();
585      for (int i = 0; i < numSubTree; i++) {
586        loadDirectoryWithSnapshot(in, counter);
587      }
588    }
589    
590   /**
591    * Load all children of a directory
592    * 
593    * @param in input to load from
594    * @param counter Counter to increment for namenode startup progress
595    * @return number of child inodes read
596    * @throws IOException
597    */
598   private int loadDirectory(DataInput in, Counter counter) throws IOException {
599     String parentPath = FSImageSerialization.readString(in);
600     // Rename .snapshot paths if we're doing an upgrade
601     parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
602     final INodeDirectory parent = INodeDirectory.valueOf(
603         namesystem.dir.getNode(parentPath, true), parentPath);
604     return loadChildren(parent, in, counter);
605   }
606
607  /**
608   * load fsimage files assuming full path names are stored
609   * 
610   * @param numFiles total number of files to load
611   * @param in data input stream
612   * @param counter Counter to increment for namenode startup progress
613   * @throws IOException if any error occurs
614   */
615  private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
616      throws IOException {
617    byte[][] pathComponents;
618    byte[][] parentPath = {{}};      
619    FSDirectory fsDir = namesystem.dir;
620    INodeDirectory parentINode = fsDir.rootDir;
621    for (long i = 0; i < numFiles; i++) {
622      pathComponents = FSImageSerialization.readPathComponents(in);
623      for (int j=0; j < pathComponents.length; j++) {
624        byte[] newComponent = renameReservedComponentOnUpgrade
625            (pathComponents[j], getLayoutVersion());
626        if (!Arrays.equals(newComponent, pathComponents[j])) {
627          String oldPath = DFSUtil.byteArray2PathString(pathComponents);
628          pathComponents[j] = newComponent;
629          String newPath = DFSUtil.byteArray2PathString(pathComponents);
630          LOG.info("Renaming reserved path " + oldPath + " to " + newPath);
631        }
632      }
633      final INode newNode = loadINode(
634          pathComponents[pathComponents.length-1], false, in, counter);
635
636      if (isRoot(pathComponents)) { // it is the root
637        // update the root's attributes
638        updateRootAttr(newNode.asDirectory());
639        continue;
640      }
641
642      namesystem.dir.addToInodeMap(newNode);
643      // check if the new inode belongs to the same parent
644      if(!isParent(pathComponents, parentPath)) {
645        parentINode = getParentINodeDirectory(pathComponents);
646        parentPath = getParent(pathComponents);
647      }
648
649      // add new inode
650      addToParent(parentINode, newNode);
651    }
652  }
653
654  private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
655      ) throws FileNotFoundException, PathIsNotDirectoryException,
656      UnresolvedLinkException {
657    if (pathComponents.length < 2) { // root
658      return null;
659    }
660    // Gets the parent INode
661    final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
662        pathComponents);
663    return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
664  }
665
666  /**
667   * Add the child node to parent and, if child is a file, update block map.
668   * This method is only used for image loading so that synchronization,
669   * modification time update and space count update are not needed.
670   */
671  private void addToParent(INodeDirectory parent, INode child) {
672    FSDirectory fsDir = namesystem.dir;
673    if (parent == fsDir.rootDir) {
674        child.setLocalName(renameReservedRootComponentOnUpgrade(
675            child.getLocalNameBytes(), getLayoutVersion()));
676    }
677    // NOTE: This does not update space counts for parents
678    if (!parent.addChild(child)) {
679      return;
680    }
681    namesystem.dir.cacheName(child);
682
683    if (child.isFile()) {
684      updateBlocksMap(child.asFile());
685    }
686  }
687
688    public void updateBlocksMap(INodeFile file) {
689      // Add file->block mapping
690      final BlockInfo[] blocks = file.getBlocks();
691      if (blocks != null) {
692        final BlockManager bm = namesystem.getBlockManager();
693        for (int i = 0; i < blocks.length; i++) {
694          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
695        } 
696      }
697    }
698
    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
    public FSDirectory getFSDirectoryInLoading() {
      return namesystem.dir;
    }

    /**
     * Convenience overload of
     * {@link #loadINodeWithLocalName(boolean, DataInput, boolean, Counter)}
     * that does not report startup progress (passes a null counter).
     */
    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
        boolean updateINodeMap) throws IOException {
      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
    }
708
709    public INode loadINodeWithLocalName(boolean isSnapshotINode,
710        DataInput in, boolean updateINodeMap, Counter counter)
711        throws IOException {
712      byte[] localName = FSImageSerialization.readLocalName(in);
713      localName =
714          renameReservedComponentOnUpgrade(localName, getLayoutVersion());
715      INode inode = loadINode(localName, isSnapshotINode, in, counter);
716      if (updateINodeMap) {
717        namesystem.dir.addToInodeMap(inode);
718      }
719      return inode;
720    }
721  
722  /**
723   * load an inode from fsimage except for its name
724   * 
725   * @param in data input stream from which image is read
726   * @param counter Counter to increment for namenode startup progress
727   * @return an inode
728   */
729  @SuppressWarnings("deprecation")
730  INode loadINode(final byte[] localName, boolean isSnapshotINode,
731      DataInput in, Counter counter) throws IOException {
732    final int imgVersion = getLayoutVersion();
733    if (NameNodeLayoutVersion.supports(
734        LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
735      namesystem.getFSDirectory().verifyINodeName(localName);
736    }
737
738    long inodeId = NameNodeLayoutVersion.supports(
739        LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
740        : namesystem.allocateNewInodeId();
741    
742    final short replication = namesystem.getBlockManager().adjustReplication(
743        in.readShort());
744    final long modificationTime = in.readLong();
745    long atime = 0;
746    if (NameNodeLayoutVersion.supports(
747        LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
748      atime = in.readLong();
749    }
750    final long blockSize = in.readLong();
751    final int numBlocks = in.readInt();
752
753    if (numBlocks >= 0) {
754      // file
755      
756      // read blocks
757      BlockInfo[] blocks = new BlockInfo[numBlocks];
758      for (int j = 0; j < numBlocks; j++) {
759        blocks[j] = new BlockInfo(replication);
760        blocks[j].readFields(in);
761      }
762
763      String clientName = "";
764      String clientMachine = "";
765      boolean underConstruction = false;
766      FileDiffList fileDiffs = null;
767      if (NameNodeLayoutVersion.supports(
768          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
769        // read diffs
770        fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
771
772        if (isSnapshotINode) {
773          underConstruction = in.readBoolean();
774          if (underConstruction) {
775            clientName = FSImageSerialization.readString(in);
776            clientMachine = FSImageSerialization.readString(in);
777            // convert the last block to BlockUC
778            if (blocks.length > 0) {
779              BlockInfo lastBlk = blocks[blocks.length - 1]; 
780              blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
781                  lastBlk, replication);
782            }
783          }
784        }
785      }
786
787      final PermissionStatus permissions = PermissionStatus.read(in);
788
789      // return
790      if (counter != null) {
791        counter.increment();
792      }
793      final INodeFile file = new INodeFile(inodeId, localName, permissions,
794          modificationTime, atime, blocks, replication, blockSize);
795      if (underConstruction) {
796        file.toUnderConstruction(clientName, clientMachine);
797      }
798        return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
799      } else if (numBlocks == -1) {
800        //directory
801      
802      //read quotas
803      final long nsQuota = in.readLong();
804      long dsQuota = -1L;
805      if (NameNodeLayoutVersion.supports(
806          LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
807        dsQuota = in.readLong();
808      }
809
810      //read snapshot info
811      boolean snapshottable = false;
812      boolean withSnapshot = false;
813      if (NameNodeLayoutVersion.supports(
814          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
815        snapshottable = in.readBoolean();
816        if (!snapshottable) {
817          withSnapshot = in.readBoolean();
818        }
819      }
820
821      final PermissionStatus permissions = PermissionStatus.read(in);
822
823      //return
824      if (counter != null) {
825        counter.increment();
826      }
827      final INodeDirectory dir = new INodeDirectory(inodeId, localName,
828          permissions, modificationTime);
829      if (nsQuota >= 0 || dsQuota >= 0) {
830        dir.addDirectoryWithQuotaFeature(nsQuota, dsQuota);
831      }
832      if (withSnapshot) {
833        dir.addSnapshotFeature(null);
834      }
835      return snapshottable ? new INodeDirectorySnapshottable(dir) : dir;
836    } else if (numBlocks == -2) {
837      //symlink
838      if (!FileSystem.areSymlinksEnabled()) {
839        throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
840      }
841
842      final String symlink = Text.readString(in);
843      final PermissionStatus permissions = PermissionStatus.read(in);
844      if (counter != null) {
845        counter.increment();
846      }
847      return new INodeSymlink(inodeId, localName, permissions,
848          modificationTime, atime, symlink);
849    } else if (numBlocks == -3) {
850      //reference
851      // Intentionally do not increment counter, because it is too difficult at
852      // this point to assess whether or not this is a reference that counts
853      // toward quota.
854      
855      final boolean isWithName = in.readBoolean();
856      // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
857      int snapshotId = in.readInt();
858      
859      final INodeReference.WithCount withCount
860          = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
861
862      if (isWithName) {
863          return new INodeReference.WithName(null, withCount, localName,
864              snapshotId);
865      } else {
866        final INodeReference ref = new INodeReference.DstReference(null,
867            withCount, snapshotId);
868        return ref;
869      }
870    }
871    
872    throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
873  }
874
875    /** Load {@link INodeFileAttributes}. */
876    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
877        throws IOException {
878      final int layoutVersion = getLayoutVersion();
879      
880      if (!NameNodeLayoutVersion.supports(
881          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
882        return loadINodeWithLocalName(true, in, false).asFile();
883      }
884  
885      final byte[] name = FSImageSerialization.readLocalName(in);
886      final PermissionStatus permissions = PermissionStatus.read(in);
887      final long modificationTime = in.readLong();
888      final long accessTime = in.readLong();
889  
890      final short replication = namesystem.getBlockManager().adjustReplication(
891          in.readShort());
892      final long preferredBlockSize = in.readLong();
893
894      return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
895          accessTime, replication, preferredBlockSize, null);
896    }
897
898    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
899        throws IOException {
900      final int layoutVersion = getLayoutVersion();
901      
902      if (!NameNodeLayoutVersion.supports(
903          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
904        return loadINodeWithLocalName(true, in, false).asDirectory();
905      }
906  
907      final byte[] name = FSImageSerialization.readLocalName(in);
908      final PermissionStatus permissions = PermissionStatus.read(in);
909      final long modificationTime = in.readLong();
910      
911      //read quotas
912      final long nsQuota = in.readLong();
913      final long dsQuota = in.readLong();
914  
915      return nsQuota == -1L && dsQuota == -1L ? new INodeDirectoryAttributes.SnapshotCopy(
916          name, permissions, null, modificationTime, null)
917        : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
918            null, modificationTime, nsQuota, dsQuota, null);
919    }
920  
    /**
     * Load the under-construction-files section of the image and re-attach
     * each entry to the namespace: the matching already-loaded inode is
     * converted back to under-construction state, its last block is replaced
     * through the block manager, and (unless the file only exists inside a
     * snapshot) a lease is re-registered for its client.
     *
     * @param in image stream positioned at the under-construction section
     * @param supportSnapshot whether the layout supports snapshots
     *        (NOTE(review): not referenced in this body — confirm whether it
     *        is still needed by callers)
     * @param counter progress counter, incremented once per file loaded
     * @throws IOException on read failure or if a recorded file cannot be
     *         resolved in the loaded namespace
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
            namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) && 
            NameNodeLayoutVersion.supports(
                LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
          oldnode = namesystem.dir.getInode(cons.getId()).asFile();
          inSnapshot = true;
        } else {
          // Paths may collide with names reserved in this version; rename
          // them before resolving.
          path = renameReservedPathsOnUpgrade(path, getLayoutVersion());
          final INodesInPath iip = fsDir.getLastINodeInPath(path);
          oldnode = INodeFile.valueOf(iip.getINode(0), path);
        }

        FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature();
        oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine());
        if (oldnode.numBlocks() > 0) {
          BlockInfo ucBlock = cons.getLastBlock();
          // we do not replace the inode, just replace the last block of oldnode
          BlockInfo info = namesystem.getBlockManager().addBlockCollection(
              ucBlock, oldnode);
          oldnode.setBlock(oldnode.numBlocks() - 1, info);
        }

        // Files that only live in snapshots have no active writer, so no
        // lease is recreated for them.
        if (!inSnapshot) {
          namesystem.leaseManager.addLease(cons
              .getFileUnderConstructionFeature().getClientName(), path);
        }
      }
    }
967
968    private void loadSecretManagerState(DataInput in)
969        throws IOException {
970      int imgVersion = getLayoutVersion();
971
972      if (!NameNodeLayoutVersion.supports(
973          LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) {
974        //SecretManagerState is not available.
975        //This must not happen if security is turned on.
976        return; 
977      }
978      namesystem.loadSecretManagerStateCompat(in);
979    }
980
981    private void loadCacheManagerState(DataInput in) throws IOException {
982      int imgVersion = getLayoutVersion();
983      if (!NameNodeLayoutVersion.supports(
984          LayoutVersion.Feature.CACHING, imgVersion)) {
985        return;
986      }
987      namesystem.getCacheManager().loadStateCompat(in);
988    }
989
    /** @return the layout version of the image currently being loaded. */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }
993
994    private boolean isRoot(byte[][] path) {
995      return path.length == 1 &&
996        path[0] == null;    
997    }
998
999    private boolean isParent(byte[][] path, byte[][] parent) {
1000      if (path == null || parent == null)
1001        return false;
1002      if (parent.length == 0 || path.length != parent.length + 1)
1003        return false;
1004      boolean isParent = true;
1005      for (int i = 0; i < parent.length; i++) {
1006        isParent = isParent && Arrays.equals(path[i], parent[i]); 
1007      }
1008      return isParent;
1009    }
1010
    /**
     * Return string representing the parent of the given path.
     */
    // NOTE(review): if path contains no separator, lastIndexOf returns -1 and
    // substring(0, -1) throws StringIndexOutOfBoundsException — callers
    // presumably always pass absolute paths containing '/'; confirm.
    String getParent(String path) {
      return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
    }
1017    
1018    byte[][] getParent(byte[][] path) {
1019      byte[][] result = new byte[path.length - 1][];
1020      for (int i = 0; i < result.length; i++) {
1021        result[i] = new byte[path[i].length];
1022        System.arraycopy(path[i], 0, result[i], 0, path[i].length);
1023      }
1024      return result;
1025    }
1026    
1027    public Snapshot getSnapshot(DataInput in) throws IOException {
1028      return snapshotMap.get(in.readInt());
1029    }
1030  }
1031
  // Maps each reserved path component (e.g. ".snapshot") to the replacement
  // name used when renaming colliding user paths during an upgrade.
  @VisibleForTesting
  public static final TreeMap<String, String> renameReservedMap =
      new TreeMap<String, String>();
1035
1036  /**
1037   * Use the default key-value pairs that will be used to determine how to
1038   * rename reserved paths on upgrade.
1039   */
1040  @VisibleForTesting
1041  public static void useDefaultRenameReservedPairs() {
1042    renameReservedMap.clear();
1043    for (String key: HdfsConstants.RESERVED_PATH_COMPONENTS) {
1044      renameReservedMap.put(
1045          key,
1046          key + "." + HdfsConstants.NAMENODE_LAYOUT_VERSION + "."
1047              + "UPGRADE_RENAMED");
1048    }
1049  }
1050
1051  /**
1052   * Set the key-value pairs that will be used to determine how to rename
1053   * reserved paths on upgrade.
1054   */
1055  @VisibleForTesting
1056  public static void setRenameReservedPairs(String renameReserved) {
1057    // Clear and set the default values
1058    useDefaultRenameReservedPairs();
1059    // Overwrite with provided values
1060    setRenameReservedMapInternal(renameReserved);
1061  }
1062
1063  private static void setRenameReservedMapInternal(String renameReserved) {
1064    Collection<String> pairs =
1065        StringUtils.getTrimmedStringCollection(renameReserved);
1066    for (String p : pairs) {
1067      String[] pair = StringUtils.split(p, '/', '=');
1068      Preconditions.checkArgument(pair.length == 2,
1069          "Could not parse key-value pair " + p);
1070      String key = pair[0];
1071      String value = pair[1];
1072      Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
1073          "Unknown reserved path " + key);
1074      Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
1075          "Invalid rename path for " + key + ": " + value);
1076      LOG.info("Will rename reserved path " + key + " to " + value);
1077      renameReservedMap.put(key, value);
1078    }
1079  }
1080
  /**
   * When upgrading from an old version, the filesystem could contain paths
   * that are now reserved in the new version (e.g. .snapshot). This renames
   * these new reserved paths to a user-specified value to avoid collisions
   * with the reserved name.
   * 
   * @param path Old path potentially containing a reserved path
   * @param layoutVersion layout version of the image being upgraded; used to
   *        decide which reserved names could legally appear as user paths
   * @return New path with reserved path components renamed to user value
   */
  static String renameReservedPathsOnUpgrade(String path,
      final int layoutVersion) {
    final String oldPath = path;
    // If any known LVs aren't supported, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Only need to worry about the root directory: only components[1],
      // the entry directly under root, can collide with ".reserved"
      if (components.length > 1) {
        components[1] = DFSUtil.bytes2String(
            renameReservedRootComponentOnUpgrade(
                DFSUtil.string2Bytes(components[1]),
                layoutVersion));
        path = DFSUtil.strings2PathString(components);
      }
    }
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      String[] components = INode.getPathNames(path);
      // Special case the root path
      if (components.length == 0) {
        return path;
      }
      // ".snapshot" is checked at every depth, so visit each component
      for (int i=0; i<components.length; i++) {
        components[i] = DFSUtil.bytes2String(
            renameReservedComponentOnUpgrade(
                DFSUtil.string2Bytes(components[i]),
                layoutVersion));
      }
      path = DFSUtil.strings2PathString(components);
    }

    if (!path.equals(oldPath)) {
      LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
          + path);
    }
    return path;
  }
1126
  // Guidance shown when an upgrade finds a user path colliding with a
  // reserved name but no rename mapping has been configured.
  private final static String RESERVED_ERROR_MSG = 
      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
      + " this version of HDFS. Please rollback and delete or rename"
      + " this path, or upgrade with the "
      + StartupOption.RENAMERESERVED.getName()
      + " [key-value pairs]"
      + " option to automatically rename these paths during upgrade.";
1135
  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for a
   * single byte array path component: renames a ".snapshot" component when
   * upgrading from a pre-snapshot layout.
   *
   * @param component a single path component
   * @param layoutVersion layout version of the image being upgraded
   * @return the (possibly renamed) component
   * @throws IllegalArgumentException if the component collides with
   *         ".snapshot" and no rename mapping was configured
   */
  private static byte[] renameReservedComponentOnUpgrade(byte[] component,
      final int layoutVersion) {
    // If the LV doesn't support snapshots, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
      if (Arrays.equals(component, HdfsConstants.DOT_SNAPSHOT_DIR_BYTES)) {
        Preconditions.checkArgument(
            renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR),
            RESERVED_ERROR_MSG);
        component =
            DFSUtil.string2Bytes(renameReservedMap
                .get(HdfsConstants.DOT_SNAPSHOT_DIR));
      }
    }
    return component;
  }
1155
  /**
   * Same as {@link #renameReservedPathsOnUpgrade(String, int)}, but for the
   * single byte array component directly under root: renames a ".reserved"
   * component when upgrading from a pre-inode-ID layout.
   *
   * @param component the path component directly under root
   * @param layoutVersion layout version of the image being upgraded
   * @return the (possibly renamed) component
   * @throws IllegalArgumentException if the component collides with
   *         ".reserved" and no rename mapping was configured
   */
  private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
      final int layoutVersion) {
    // If the LV doesn't support inode IDs, we're doing an upgrade
    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
      if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
        Preconditions.checkArgument(
            renameReservedMap.containsKey(FSDirectory.DOT_RESERVED_STRING),
            RESERVED_ERROR_MSG);
        final String renameString = renameReservedMap
            .get(FSDirectory.DOT_RESERVED_STRING);
        component =
            DFSUtil.string2Bytes(renameString);
        LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
            + " to " + renameString);
      }
    }
    return component;
  }
1178
  /**
   * A one-shot class responsible for writing an image file.
   * The save() function should be called once, after which the getter
   * functions may be used to retrieve information about the file that was
   * written.
   *
   * This is replaced by the PB-based FSImage. The class is to maintain
   * compatibility for the external fsimage tool.
   */
  @Deprecated
  static class Saver {
    /** Layout version written into legacy (pre-protobuf) image files. */
    private static final int LAYOUT_VERSION = -51;
    /** Context describing the namespace being saved (source, txid, cancellation). */
    private final SaveNamespaceContext context;
    /** Set to true once an image has been written */
    private boolean saved = false;

    /** The MD5 checksum of the file that was written */
    private MD5Hash savedDigest;
    /** Tracks reference inodes so a shared subtree is serialized only once. */
    private final ReferenceMap referenceMap = new ReferenceMap();

    /**
     * Under-construction files that exist only inside snapshots, collected
     * while walking the tree and later written to the under-construction
     * section using reserved paths (see HDFS-5428).
     */
    private final Map<Long, INodeFile> snapshotUCMap =
        new HashMap<Long, INodeFile>();

    /** @throws IllegalStateException if the instance has not yet saved an image */
    private void checkSaved() {
      if (!saved) {
        throw new IllegalStateException("FSImageSaver has not saved an image");
      }
    }

    /** @throws IllegalStateException if the instance has already saved an image */
    private void checkNotSaved() {
      if (saved) {
        throw new IllegalStateException("FSImageSaver has already saved an image");
      }
    }


    Saver(SaveNamespaceContext context) {
      this.context = context;
    }

    /**
     * Return the MD5 checksum of the image file that was saved.
     */
    MD5Hash getSavedDigest() {
      checkSaved();
      return savedDigest;
    }

    /**
     * Write the entire namespace to {@code newFile} in the legacy image
     * format and record the MD5 digest of the bytes written. May be called
     * at most once per instance.
     *
     * @param newFile destination image file
     * @param compression compression to apply to the inode section
     * @throws IOException on any write failure or if saving is cancelled
     */
    void save(File newFile, FSImageCompression compression) throws IOException {
      checkNotSaved();

      final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
      final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
      final long numINodes = rootDir.getDirectoryWithQuotaFeature()
          .getSpaceConsumed().get(Quota.NAMESPACE);
      String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
      Step step = new Step(StepType.INODES, sdPath);
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginStep(Phase.SAVING_CHECKPOINT, step);
      prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
      Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
      long startTime = now();
      //
      // Write out data
      //
      // All bytes pass through the digest stream so the MD5 of the file is
      // accumulated as a side effect of writing.
      MessageDigest digester = MD5Hash.getDigester();
      FileOutputStream fout = new FileOutputStream(newFile);
      DigestOutputStream fos = new DigestOutputStream(fout, digester);
      DataOutputStream out = new DataOutputStream(fos);
      try {
        // Uncompressed header: layout version, flags, and namespace-wide
        // counters/stamps.
        out.writeInt(LAYOUT_VERSION);
        LayoutFlags.write(out);
        // We use the non-locked version of getNamespaceInfo here since
        // the coordinating thread of saveNamespace already has read-locked
        // the namespace for us. If we attempt to take another readlock
        // from the actual saver thread, there's a potential of a
        // fairness-related deadlock. See the comments on HDFS-2223.
        out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
            .getNamespaceID());
        out.writeLong(numINodes);
        out.writeLong(sourceNamesystem.getGenerationStampV1());
        out.writeLong(sourceNamesystem.getGenerationStampV2());
        out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
        out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
        out.writeLong(context.getTxId());
        out.writeLong(sourceNamesystem.getLastInodeId());


        sourceNamesystem.getSnapshotManager().write(out);

        // write compression info and set up compressed stream
        out = compression.writeHeaderAndWrapStream(fos);
        LOG.info("Saving image file " + newFile +
                 " using " + compression);

        // save the root
        saveINode2Image(rootDir, out, false, referenceMap, counter);
        // save the rest of the nodes
        saveImage(rootDir, out, true, false, counter);
        prog.endStep(Phase.SAVING_CHECKPOINT, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
        // save files under construction
        // TODO: for HDFS-5428, since we cannot break the compatibility of
        // fsimage, we store part of the under-construction files that are only
        // in snapshots in this "under-construction-file" section. As a
        // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
        // paths, so that when loading fsimage we do not put them into the lease
        // map. In the future, we can remove this hack when we can bump the
        // layout version.
        sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);

        context.checkCancelled();
        sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
        context.checkCancelled();
        sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
        context.checkCancelled();
        out.flush();
        context.checkCancelled();
        // Force file contents to disk before declaring the image saved.
        fout.getChannel().force(true);
      } finally {
        out.close();
      }

      saved = true;
      // set md5 of the saved image
      savedDigest = new MD5Hash(digester.digest());

      LOG.info("Image file " + newFile + " of size " + newFile.length() +
          " bytes saved in " + (now() - startTime)/1000 + " seconds.");
    }

    /**
     * Save children INodes.
     * @param children The list of children INodes
     * @param out The DataOutputStream to write
     * @param inSnapshot Whether the parent directory or its ancestor is in
     *                   the deleted list of some snapshot (caused by rename or
     *                   deletion)
     * @param counter Counter to increment for namenode startup progress
     * @return Number of children that are directory
     */
    private int saveChildren(ReadOnlyList<INode> children,
        DataOutputStream out, boolean inSnapshot, Counter counter)
        throws IOException {
      // Write normal children INode.
      out.writeInt(children.size());
      int dirNum = 0;
      int i = 0;
      for(INode child : children) {
        // print all children first
        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
        // here, thus even if the parent directory is in snapshot, we still
        // do not handle INodeUC as those stored in deleted list
        saveINode2Image(child, out, false, referenceMap, counter);
        if (child.isDirectory()) {
          dirNum++;
        } else if (inSnapshot && child.isFile()
            && child.asFile().isUnderConstruction()) {
          // Remember snapshot-only UC files; they are written separately in
          // the under-construction section (HDFS-5428).
          this.snapshotUCMap.put(child.getId(), child.asFile());
        }
        // Poll for cancellation periodically rather than on every child.
        if (i++ % 50 == 0) {
          context.checkCancelled();
        }
      }
      return dirNum;
    }

    /**
     * Save file tree image starting from the given root.
     * This is a recursive procedure, which first saves all children and
     * snapshot diffs of a current directory and then moves inside the
     * sub-directories.
     *
     * @param current The current node
     * @param out The DataoutputStream to write the image
     * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
     *                      reference node, its subtree may already have been
     *                      saved before.
     * @param inSnapshot Whether the current directory is in snapshot
     * @param counter Counter to increment for namenode startup progress
     */
    private void saveImage(INodeDirectory current, DataOutputStream out,
        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
        throws IOException {
      // 1. Write the inode id of the directory
      out.writeLong(current.getId());

      if (!toSaveSubtree) {
        return;
      }

      final ReadOnlyList<INode> children = current
          .getChildrenList(Snapshot.CURRENT_STATE_ID);
      int dirNum = 0;
      List<INodeDirectory> snapshotDirs = null;
      DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
      if (sf != null) {
        // Directories that only exist in snapshots also count toward the
        // sub-directory total written below.
        snapshotDirs = new ArrayList<INodeDirectory>();
        sf.getSnapshotDirectory(snapshotDirs);
        dirNum += snapshotDirs.size();
      }

      // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
      // Snapshots
      if (current instanceof INodeDirectorySnapshottable) {
        INodeDirectorySnapshottable snapshottableNode =
            (INodeDirectorySnapshottable) current;
        SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
      } else {
        out.writeInt(-1); // # of snapshots
      }

      // 3. Write children INode
      dirNum += saveChildren(children, out, inSnapshot, counter);

      // 4. Write DirectoryDiff lists, if there is any.
      SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);

      // Write sub-tree of sub-directories, including possible snapshots of
      // deleted sub-directories
      out.writeInt(dirNum); // the number of sub-directories
      for(INode child : children) {
        if(!child.isDirectory()) {
          continue;
        }
        // make sure we only save the subtree under a reference node once
        boolean toSave = child.isReference() ?
            referenceMap.toProcessSubtree(child.getId()) : true;
        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
      }
      if (snapshotDirs != null) {
        for (INodeDirectory subDir : snapshotDirs) {
          // make sure we only save the subtree under a reference node once
          boolean toSave = subDir.getParentReference() != null ?
              referenceMap.toProcessSubtree(subDir.getId()) : true;
          saveImage(subDir, out, toSave, true, counter);
        }
      }
    }

    /**
     * Saves inode and increments progress counter.
     *
     * @param inode INode to save
     * @param out DataOutputStream to receive inode
     * @param writeUnderConstruction boolean true if this is under construction
     * @param referenceMap ReferenceMap containing reference inodes
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException thrown if there is an I/O error
     */
    private void saveINode2Image(INode inode, DataOutputStream out,
        boolean writeUnderConstruction, ReferenceMap referenceMap,
        Counter counter) throws IOException {
      FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
        referenceMap);
      // Intentionally do not increment counter for reference inodes, because it
      // is too difficult at this point to assess whether or not this is a
      // reference that counts toward quota.
      if (!(inode instanceof INodeReference)) {
        counter.increment();
      }
    }
  }
1445}