001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.util.Time.now;
021
022import java.io.DataInput;
023import java.io.DataInputStream;
024import java.io.File;
025import java.io.FileInputStream;
026import java.io.FileNotFoundException;
027import java.io.IOException;
028import java.security.DigestInputStream;
029import java.security.MessageDigest;
030import java.util.Arrays;
031import java.util.Collection;
032import java.util.Map;
033import java.util.TreeMap;
034
035import org.apache.commons.logging.Log;
036import org.apache.hadoop.classification.InterfaceAudience;
037import org.apache.hadoop.classification.InterfaceStability;
038import org.apache.hadoop.conf.Configuration;
039import org.apache.hadoop.fs.FileSystem;
040import org.apache.hadoop.fs.Path;
041import org.apache.hadoop.fs.PathIsNotDirectoryException;
042import org.apache.hadoop.fs.UnresolvedLinkException;
043import org.apache.hadoop.fs.permission.PermissionStatus;
044import org.apache.hadoop.hdfs.DFSUtil;
045import org.apache.hadoop.hdfs.protocol.HdfsConstants;
046import org.apache.hadoop.hdfs.protocol.LayoutFlags;
047import org.apache.hadoop.hdfs.protocol.LayoutVersion;
048import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
049import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
050import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
051import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
052import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
053import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
054import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList;
055import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
056import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
057import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
058import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
059import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
060import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
061import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
062import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
063import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
064import org.apache.hadoop.io.IOUtils;
065import org.apache.hadoop.io.MD5Hash;
066import org.apache.hadoop.io.Text;
067import org.apache.hadoop.util.StringUtils;
068
069import com.google.common.base.Preconditions;
070import com.google.common.annotations.VisibleForTesting;
071
072/**
073 * This class loads and stores the FSImage of the NameNode. The file
074 * src/main/proto/fsimage.proto describes the on-disk layout of the FSImage.
075 */
076@InterfaceAudience.Private
077@InterfaceStability.Evolving
078public class FSImageFormat {
079  private static final Log LOG = FSImage.LOG;
080
081  // Static-only class
082  private FSImageFormat() {}
083
084  interface AbstractLoader {
085    MD5Hash getLoadedImageMd5();
086    long getLoadedImageTxId();
087  }
088
089  static class LoaderDelegator implements AbstractLoader {
090    private AbstractLoader impl;
091    private final Configuration conf;
092    private final FSNamesystem fsn;
093
094    LoaderDelegator(Configuration conf, FSNamesystem fsn) {
095      this.conf = conf;
096      this.fsn = fsn;
097    }
098
099    @Override
100    public MD5Hash getLoadedImageMd5() {
101      return impl.getLoadedImageMd5();
102    }
103
104    @Override
105    public long getLoadedImageTxId() {
106      return impl.getLoadedImageTxId();
107    }
108
109    public void load(File file) throws IOException {
110      Preconditions.checkState(impl == null, "Image already loaded!");
111
112      FileInputStream is = null;
113      try {
114        is = new FileInputStream(file);
115        byte[] magic = new byte[FSImageUtil.MAGIC_HEADER.length];
116        IOUtils.readFully(is, magic, 0, magic.length);
117        if (Arrays.equals(magic, FSImageUtil.MAGIC_HEADER)) {
118          FSImageFormatProtobuf.Loader loader = new FSImageFormatProtobuf.Loader(
119              conf, fsn);
120          impl = loader;
121          loader.load(file);
122        } else {
123          Loader loader = new Loader(conf, fsn);
124          impl = loader;
125          loader.load(file);
126        }
127
128      } finally {
129        IOUtils.cleanup(LOG, is);
130      }
131    }
132  }
133
134  /**
135   * Construct a loader class to load the image. It chooses the loader based on
136   * the layout version.
137   */
138  public static LoaderDelegator newLoader(Configuration conf, FSNamesystem fsn) {
139    return new LoaderDelegator(conf, fsn);
140  }
141
142  /**
143   * A one-shot class responsible for loading an image. The load() function
144   * should be called once, after which the getter methods may be used to retrieve
145   * information about the image that was loaded, if loading was successful.
146   */
147  public static class Loader implements AbstractLoader {
148    private final Configuration conf;
149    /** which namesystem this loader is working for */
150    private final FSNamesystem namesystem;
151
152    /** Set to true once a file has been loaded using this loader. */
153    private boolean loaded = false;
154
155    /** The transaction ID of the last edit represented by the loaded file */
156    private long imgTxId;
157    /** The MD5 sum of the loaded file */
158    private MD5Hash imgDigest;
159    
160    private Map<Integer, Snapshot> snapshotMap = null;
161    private final ReferenceMap referenceMap = new ReferenceMap();
162
163    Loader(Configuration conf, FSNamesystem namesystem) {
164      this.conf = conf;
165      this.namesystem = namesystem;
166    }
167
168    /**
169     * Return the MD5 checksum of the image that has been loaded.
170     * @throws IllegalStateException if load() has not yet been called.
171     */
172    @Override
173    public MD5Hash getLoadedImageMd5() {
174      checkLoaded();
175      return imgDigest;
176    }
177
178    @Override
179    public long getLoadedImageTxId() {
180      checkLoaded();
181      return imgTxId;
182    }
183
184    /**
185     * Throw IllegalStateException if load() has not yet been called.
186     */
187    private void checkLoaded() {
188      if (!loaded) {
189        throw new IllegalStateException("Image not yet loaded!");
190      }
191    }
192
193    /**
194     * Throw IllegalStateException if load() has already been called.
195     */
196    private void checkNotLoaded() {
197      if (loaded) {
198        throw new IllegalStateException("Image already loaded!");
199      }
200    }
201
    /**
     * Load the image stored in {@code curFile} into the namesystem this loader
     * was created for.
     *
     * The header fields are read in a fixed order, each one only if the
     * image's layout version supports the corresponding feature. The inodes,
     * the files under construction, the secret manager state and the cache
     * manager state follow. The whole file is passed through an MD5 digest,
     * which becomes available through {@link #getLoadedImageMd5()} once this
     * method returns normally.
     *
     * @param curFile the image file to read
     * @throws IllegalStateException if this loader has already loaded an image
     * @throws InconsistentFSStateException if the version stored in the image
     *           differs from {@code getLayoutVersion()}
     * @throws IOException if reading the file fails
     */
    public void load(File curFile) throws IOException {
      // a loader instance is one-shot
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // Every byte read through fin also feeds the MD5 digester.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
           new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile, 
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SNAPSHOT, imgVersion);
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_LAYOUT_FLAGS, imgVersion)) {
          LayoutFlags.read(in);
        }

        // read namespaceID: first appeared in version -2
        // (the value is consumed from the stream but not used here)
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);
        
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }
        
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // From here on read through the stream chosen by the compression
        // setting. It wraps fin, so the digest keeps covering the bytes read
        // from the file.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);
        
        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        // Three inode layouts exist: local names with snapshots, local names
        // without snapshots, and full path names.
        if (NameNodeLayoutVersion.supports(
            LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        loadCacheManagerState(in);

        // make sure to read to the end of file
        // (only enforced when assertions are enabled)
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      // Only reached when no exception was thrown above, so a failed load
      // leaves this loader in the not-loaded state.
      imgDigest = new MD5Hash(digester.digest());
      loaded = true;
      
      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }
342
343  /** Update the root node's attributes */
344  private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
345    final Quota.Counts q = root.getQuotaCounts();
346    final long nsQuota = q.get(Quota.NAMESPACE);
347    final long dsQuota = q.get(Quota.DISKSPACE);
348    FSDirectory fsDir = namesystem.dir;
349    if (nsQuota != -1 || dsQuota != -1) {
350      fsDir.rootDir.getDirectoryWithQuotaFeature().setQuota(nsQuota, dsQuota);
351    }
352    fsDir.rootDir.cloneModificationTime(root);
353    fsDir.rootDir.clonePermissionStatus(root);    
354  }
355  
356    /**
357     * Load fsimage files when 1) only local names are stored, 
358     * and 2) snapshot is supported.
359     * 
360     * @param numFiles number of files expected to be read
361     * @param in Image input stream
362     * @param counter Counter to increment for namenode startup progress
363     */
364    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
365        Counter counter) throws IOException {
366      assert NameNodeLayoutVersion.supports(
367          LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
368      assert NameNodeLayoutVersion.supports(
369          LayoutVersion.Feature.SNAPSHOT, getLayoutVersion());
370      
371      // load root
372      loadRoot(in, counter);
373      // load rest of the nodes recursively
374      loadDirectoryWithSnapshot(in, counter);
375    }
376    
377  /** 
378   * load fsimage files assuming only local names are stored. Used when
379   * snapshots are not supported by the layout version.
380   *   
381   * @param numFiles number of files expected to be read
382   * @param in image input stream
383   * @param counter Counter to increment for namenode startup progress
384   * @throws IOException
385   */  
386   private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
387       throws IOException {
388     assert NameNodeLayoutVersion.supports(
389         LayoutVersion.Feature.FSIMAGE_NAME_OPTIMIZATION, getLayoutVersion());
390     assert numFiles > 0;
391
392     // load root
393     loadRoot(in, counter);
394     // have loaded the first file (the root)
395     numFiles--; 
396
397     // load rest of the nodes directory by directory
398     while (numFiles > 0) {
399       numFiles -= loadDirectory(in, counter);
400     }
401     if (numFiles != 0) {
402       throw new IOException("Read unexpect number of files: " + -numFiles);
403     }
404   }
405   
406    /**
407     * Load information about root, and use the information to update the root
408     * directory of NameSystem.
409     * @param in The {@link DataInput} instance to read.
410     * @param counter Counter to increment for namenode startup progress
411     */
412    private void loadRoot(DataInput in, Counter counter)
413        throws IOException {
414      // load root
415      if (in.readShort() != 0) {
416        throw new IOException("First node is not root");
417      }
418      final INodeDirectory root = loadINode(null, false, in, counter)
419        .asDirectory();
420      // update the root's attributes
421      updateRootAttr(root);
422    }
423   
424    /** Load children nodes for the parent directory. */
425    private int loadChildren(INodeDirectory parent, DataInput in,
426        Counter counter) throws IOException {
427      int numChildren = in.readInt();
428      for (int i = 0; i < numChildren; i++) {
429        // load single inode
430        INode newNode = loadINodeWithLocalName(false, in, true, counter);
431        addToParent(parent, newNode);
432      }
433      return numChildren;
434    }
435    
436    /**
437     * Load a directory when snapshot is supported.
438     * @param in The {@link DataInput} instance to read.
439     * @param counter Counter to increment for namenode startup progress
440     */
441    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
442        throws IOException {
443      // Step 1. Identify the parent INode
444      long inodeId = in.readLong();
445      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
446          .asDirectory();
447      
448      // Check if the whole subtree has been saved (for reference nodes)
449      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
450      if (!toLoadSubtree) {
451        return;
452      }
453      
454      // Step 2. Load snapshots if parent is snapshottable
455      int numSnapshots = in.readInt();
456      if (numSnapshots >= 0) {
457        final INodeDirectorySnapshottable snapshottableParent
458            = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
459        // load snapshots and snapshotQuota
460        SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
461            numSnapshots, in, this);
462        if (snapshottableParent.getSnapshotQuota() > 0) {
463          // add the directory to the snapshottable directory list in 
464          // SnapshotManager. Note that we only add root when its snapshot quota
465          // is positive.
466          this.namesystem.getSnapshotManager().addSnapshottable(
467              snapshottableParent);
468        }
469      }
470
471      // Step 3. Load children nodes under parent
472      loadChildren(parent, in, counter);
473      
474      // Step 4. load Directory Diff List
475      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
476      
477      // Recursively load sub-directories, including snapshot copies of deleted
478      // directories
479      int numSubTree = in.readInt();
480      for (int i = 0; i < numSubTree; i++) {
481        loadDirectoryWithSnapshot(in, counter);
482      }
483    }
484    
485   /**
486    * Load all children of a directory
487    * 
488    * @param in
489    * @param counter Counter to increment for namenode startup progress
490    * @return number of child inodes read
491    * @throws IOException
492    */
493   private int loadDirectory(DataInput in, Counter counter) throws IOException {
494     String parentPath = FSImageSerialization.readString(in);
495     // Rename .snapshot paths if we're doing an upgrade
496     parentPath = renameReservedPathsOnUpgrade(parentPath, getLayoutVersion());
497     final INodeDirectory parent = INodeDirectory.valueOf(
498         namesystem.dir.rootDir.getNode(parentPath, true), parentPath);
499     return loadChildren(parent, in, counter);
500   }
501
502  /**
503   * load fsimage files assuming full path names are stored
504   * 
505   * @param numFiles total number of files to load
506   * @param in data input stream
507   * @param counter Counter to increment for namenode startup progress
508   * @throws IOException if any error occurs
509   */
510  private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
511      throws IOException {
512    byte[][] pathComponents;
513    byte[][] parentPath = {{}};      
514    FSDirectory fsDir = namesystem.dir;
515    INodeDirectory parentINode = fsDir.rootDir;
516    for (long i = 0; i < numFiles; i++) {
517      pathComponents = FSImageSerialization.readPathComponents(in);
518      final INode newNode = loadINode(
519          pathComponents[pathComponents.length-1], false, in, counter);
520
521      if (isRoot(pathComponents)) { // it is the root
522        // update the root's attributes
523        updateRootAttr(newNode.asDirectory());
524        continue;
525      }
526
527      namesystem.dir.addToInodeMap(newNode);
528      // check if the new inode belongs to the same parent
529      if(!isParent(pathComponents, parentPath)) {
530        parentINode = getParentINodeDirectory(pathComponents);
531        parentPath = getParent(pathComponents);
532      }
533
534      // add new inode
535      addToParent(parentINode, newNode);
536    }
537  }
538
539  private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
540      ) throws FileNotFoundException, PathIsNotDirectoryException,
541      UnresolvedLinkException {
542    if (pathComponents.length < 2) { // root
543      return null;
544    }
545    // Gets the parent INode
546    final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
547        pathComponents);
548    return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
549  }
550
551  /**
552   * Add the child node to parent and, if child is a file, update block map.
553   * This method is only used for image loading so that synchronization,
554   * modification time update and space count update are not needed.
555   */
556  private void addToParent(INodeDirectory parent, INode child) {
557    FSDirectory fsDir = namesystem.dir;
558    if (parent == fsDir.rootDir) {
559        child.setLocalName(renameReservedRootComponentOnUpgrade(
560            child.getLocalNameBytes(), getLayoutVersion()));
561    }
562    // NOTE: This does not update space counts for parents
563    if (!parent.addChild(child)) {
564      return;
565    }
566    namesystem.dir.cacheName(child);
567
568    if (child.isFile()) {
569      updateBlocksMap(child.asFile());
570    }
571  }
572
573    public void updateBlocksMap(INodeFile file) {
574      // Add file->block mapping
575      final BlockInfo[] blocks = file.getBlocks();
576      if (blocks != null) {
577        final BlockManager bm = namesystem.getBlockManager();
578        for (int i = 0; i < blocks.length; i++) {
579          file.setBlock(i, bm.addBlockCollection(blocks[i], file));
580        } 
581      }
582    }
583
584    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
585        boolean updateINodeMap) throws IOException {
586      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
587    }
588
589    public INode loadINodeWithLocalName(boolean isSnapshotINode,
590        DataInput in, boolean updateINodeMap, Counter counter)
591        throws IOException {
592      byte[] localName = FSImageSerialization.readLocalName(in);
593      localName =
594          renameReservedComponentOnUpgrade(localName, getLayoutVersion());
595      INode inode = loadINode(localName, isSnapshotINode, in, counter);
596      if (updateINodeMap) {
597        namesystem.dir.addToInodeMap(inode);
598      }
599      return inode;
600    }
601  
  /**
   * Load an inode from fsimage except for its name.
   *
   * The kind of inode is determined by the block count stored in the record:
   * a value of zero or more is a file with that many blocks, -1 is a
   * directory, -2 is a symlink and -3 is a reference. Several fields are only
   * read when the layout version of the image supports the corresponding
   * feature.
   *
   * @param localName the local name to give to the inode; the root is loaded
   *          with null
   * @param isSnapshotINode if true, and the layout supports snapshots, a file
   *          record additionally carries its under-construction state; the
   *          flag is also passed on when loading a reference
   * @param in data input stream from which image is read
   * @param counter Counter to increment for namenode startup progress; may be
   *          null, and is not incremented for reference inodes
   * @return an inode
   * @throws IOException if the record has an unknown type, if it is a symlink
   *           while symlinks are disabled, or if reading fails
   */
  @SuppressWarnings("deprecation")
  INode loadINode(final byte[] localName, boolean isSnapshotINode,
      DataInput in, Counter counter) throws IOException {
    final int imgVersion = getLayoutVersion();
    if (NameNodeLayoutVersion.supports(
        LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
      namesystem.getFSDirectory().verifyINodeName(localName);
    }

    // Images without stored inode ids get a freshly allocated id per inode.
    long inodeId = NameNodeLayoutVersion.supports(
        LayoutVersion.Feature.ADD_INODE_ID, imgVersion) ? in.readLong()
        : namesystem.allocateNewInodeId();
    
    final short replication = namesystem.getBlockManager().adjustReplication(
        in.readShort());
    final long modificationTime = in.readLong();
    long atime = 0;
    if (NameNodeLayoutVersion.supports(
        LayoutVersion.Feature.FILE_ACCESS_TIME, imgVersion)) {
      atime = in.readLong();
    }
    final long blockSize = in.readLong();
    // doubles as the type tag of the record, see the method comment
    final int numBlocks = in.readInt();

    if (numBlocks >= 0) {
      // file
      
      // read blocks
      BlockInfo[] blocks = new BlockInfo[numBlocks];
      for (int j = 0; j < numBlocks; j++) {
        blocks[j] = new BlockInfo(replication);
        blocks[j].readFields(in);
      }

      String clientName = "";
      String clientMachine = "";
      boolean underConstruction = false;
      FileDiffList fileDiffs = null;
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
        // read diffs
        fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);

        // the under-construction state is only stored for snapshot inodes
        if (isSnapshotINode) {
          underConstruction = in.readBoolean();
          if (underConstruction) {
            clientName = FSImageSerialization.readString(in);
            clientMachine = FSImageSerialization.readString(in);
            // convert the last block to BlockUC
            if (blocks != null && blocks.length > 0) {
              BlockInfo lastBlk = blocks[blocks.length - 1]; 
              blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
                  lastBlk, replication);
            }
          }
        }
      }

      final PermissionStatus permissions = PermissionStatus.read(in);

      // return
      if (counter != null) {
        counter.increment();
      }
      final INodeFile file = new INodeFile(inodeId, localName, permissions,
          modificationTime, atime, blocks, replication, blockSize);
      if (underConstruction) {
        file.toUnderConstruction(clientName, clientMachine, null);
      }
        // attach the diff list when one was read
        return fileDiffs == null ? file : new INodeFile(file, fileDiffs);
      } else if (numBlocks == -1) {
        //directory
      
      //read quotas
      final long nsQuota = in.readLong();
      long dsQuota = -1L;
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.DISKSPACE_QUOTA, imgVersion)) {
        dsQuota = in.readLong();
      }

      //read snapshot info
      boolean snapshottable = false;
      boolean withSnapshot = false;
      if (NameNodeLayoutVersion.supports(
          LayoutVersion.Feature.SNAPSHOT, imgVersion)) {
        snapshottable = in.readBoolean();
        // the second flag is only stored for non-snapshottable directories
        if (!snapshottable) {
          withSnapshot = in.readBoolean();
        }
      }

      final PermissionStatus permissions = PermissionStatus.read(in);

      //return
      if (counter != null) {
        counter.increment();
      }
      final INodeDirectory dir = new INodeDirectory(inodeId, localName,
          permissions, modificationTime);
      // a quota feature is only added when at least one quota is set
      if (nsQuota >= 0 || dsQuota >= 0) {
        dir.addDirectoryWithQuotaFeature(nsQuota, dsQuota);
      }
      if (withSnapshot) {
        dir.addSnapshotFeature(null);
      }
      return snapshottable ? new INodeDirectorySnapshottable(dir) : dir;
    } else if (numBlocks == -2) {
      //symlink
      if (!FileSystem.areSymlinksEnabled()) {
        throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
      }

      final String symlink = Text.readString(in);
      final PermissionStatus permissions = PermissionStatus.read(in);
      if (counter != null) {
        counter.increment();
      }
      return new INodeSymlink(inodeId, localName, permissions,
          modificationTime, atime, symlink);
    } else if (numBlocks == -3) {
      //reference
      // Intentionally do not increment counter, because it is too difficult at
      // this point to assess whether or not this is a reference that counts
      // toward quota.
      
      final boolean isWithName = in.readBoolean();
      // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
      int snapshotId = in.readInt();
      
      final INodeReference.WithCount withCount
          = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);

      if (isWithName) {
          return new INodeReference.WithName(null, withCount, localName,
              snapshotId);
      } else {
        final INodeReference ref = new INodeReference.DstReference(null,
            withCount, snapshotId);
        return ref;
      }
    }
    
    // any other negative block count is not a known inode type
    throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
  }
754
755    /** Load {@link INodeFileAttributes}. */
756    public INodeFileAttributes loadINodeFileAttributes(DataInput in)
757        throws IOException {
758      final int layoutVersion = getLayoutVersion();
759      
760      if (!NameNodeLayoutVersion.supports(
761          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
762        return loadINodeWithLocalName(true, in, false).asFile();
763      }
764  
765      final byte[] name = FSImageSerialization.readLocalName(in);
766      final PermissionStatus permissions = PermissionStatus.read(in);
767      final long modificationTime = in.readLong();
768      final long accessTime = in.readLong();
769  
770      final short replication = namesystem.getBlockManager().adjustReplication(
771          in.readShort());
772      final long preferredBlockSize = in.readLong();
773
774      return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
775          accessTime, replication, preferredBlockSize);
776    }
777
778    public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
779        throws IOException {
780      final int layoutVersion = getLayoutVersion();
781      
782      if (!NameNodeLayoutVersion.supports(
783          LayoutVersion.Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
784        return loadINodeWithLocalName(true, in, false).asDirectory();
785      }
786  
787      final byte[] name = FSImageSerialization.readLocalName(in);
788      final PermissionStatus permissions = PermissionStatus.read(in);
789      final long modificationTime = in.readLong();
790      
791      //read quotas
792      final long nsQuota = in.readLong();
793      final long dsQuota = in.readLong();
794  
795      return nsQuota == -1L && dsQuota == -1L?
796          new INodeDirectoryAttributes.SnapshotCopy(name, permissions, null, modificationTime)
797        : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
798            null, modificationTime, nsQuota, dsQuota);
799    }
800  
801    private void loadFilesUnderConstruction(DataInput in,
802        boolean supportSnapshot, Counter counter) throws IOException {
803      FSDirectory fsDir = namesystem.dir;
804      int size = in.readInt();
805
806      LOG.info("Number of files under construction = " + size);
807
808      for (int i = 0; i < size; i++) {
809        INodeFile cons = FSImageSerialization.readINodeUnderConstruction(in,
810            namesystem, getLayoutVersion());
811        counter.increment();
812
813        // verify that file exists in namespace
814        String path = cons.getLocalName();
815        INodeFile oldnode = null;
816        boolean inSnapshot = false;
817        if (path != null && FSDirectory.isReservedName(path) && 
818            NameNodeLayoutVersion.supports(
819                LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
820          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
821          // snapshot. If we support INode ID in the layout version, we can use
822          // the inode id to find the oldnode.
823          oldnode = namesystem.dir.getInode(cons.getId()).asFile();
824          inSnapshot = true;
825        } else {
826          final INodesInPath iip = fsDir.getLastINodeInPath(path);
827          oldnode = INodeFile.valueOf(iip.getINode(0), path);
828        }
829
830        FileUnderConstructionFeature uc = cons.getFileUnderConstructionFeature();
831        oldnode.toUnderConstruction(uc.getClientName(), uc.getClientMachine(),
832            uc.getClientNode());
833        if (oldnode.numBlocks() > 0) {
834          BlockInfo ucBlock = cons.getLastBlock();
835          // we do not replace the inode, just replace the last block of oldnode
836          BlockInfo info = namesystem.getBlockManager().addBlockCollection(
837              ucBlock, oldnode);
838          oldnode.setBlock(oldnode.numBlocks() - 1, info);
839        }
840
841        if (!inSnapshot) {
842          namesystem.leaseManager.addLease(cons
843              .getFileUnderConstructionFeature().getClientName(), path);
844        }
845      }
846    }
847
848    private void loadSecretManagerState(DataInput in)
849        throws IOException {
850      int imgVersion = getLayoutVersion();
851
852      if (!NameNodeLayoutVersion.supports(
853          LayoutVersion.Feature.DELEGATION_TOKEN, imgVersion)) {
854        //SecretManagerState is not available.
855        //This must not happen if security is turned on.
856        return; 
857      }
858      namesystem.loadSecretManagerStateCompat(in);
859    }
860
861    private void loadCacheManagerState(DataInput in) throws IOException {
862      int imgVersion = getLayoutVersion();
863      if (!NameNodeLayoutVersion.supports(
864          LayoutVersion.Feature.CACHING, imgVersion)) {
865        return;
866      }
867      namesystem.getCacheManager().loadStateCompat(in);
868    }
869
870    private int getLayoutVersion() {
871      return namesystem.getFSImage().getStorage().getLayoutVersion();
872    }
873
874    private boolean isRoot(byte[][] path) {
875      return path.length == 1 &&
876        path[0] == null;    
877    }
878
879    private boolean isParent(byte[][] path, byte[][] parent) {
880      if (path == null || parent == null)
881        return false;
882      if (parent.length == 0 || path.length != parent.length + 1)
883        return false;
884      boolean isParent = true;
885      for (int i = 0; i < parent.length; i++) {
886        isParent = isParent && Arrays.equals(path[i], parent[i]); 
887      }
888      return isParent;
889    }
890
891    /**
892     * Return string representing the parent of the given path.
893     */
894    String getParent(String path) {
895      return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
896    }
897    
898    byte[][] getParent(byte[][] path) {
899      byte[][] result = new byte[path.length - 1][];
900      for (int i = 0; i < result.length; i++) {
901        result[i] = new byte[path[i].length];
902        System.arraycopy(path[i], 0, result[i], 0, path[i].length);
903      }
904      return result;
905    }
906    
907    public Snapshot getSnapshot(DataInput in) throws IOException {
908      return snapshotMap.get(in.readInt());
909    }
910  }
911
912  @VisibleForTesting
913  public static final TreeMap<String, String> renameReservedMap =
914      new TreeMap<String, String>();
915
916  /**
917   * Use the default key-value pairs that will be used to determine how to
918   * rename reserved paths on upgrade.
919   */
920  @VisibleForTesting
921  public static void useDefaultRenameReservedPairs() {
922    renameReservedMap.clear();
923    for (String key: HdfsConstants.RESERVED_PATH_COMPONENTS) {
924      renameReservedMap.put(
925          key,
926          key + "." + HdfsConstants.NAMENODE_LAYOUT_VERSION + "."
927              + "UPGRADE_RENAMED");
928    }
929  }
930
931  /**
932   * Set the key-value pairs that will be used to determine how to rename
933   * reserved paths on upgrade.
934   */
935  @VisibleForTesting
936  public static void setRenameReservedPairs(String renameReserved) {
937    // Clear and set the default values
938    useDefaultRenameReservedPairs();
939    // Overwrite with provided values
940    setRenameReservedMapInternal(renameReserved);
941  }
942
943  private static void setRenameReservedMapInternal(String renameReserved) {
944    Collection<String> pairs =
945        StringUtils.getTrimmedStringCollection(renameReserved);
946    for (String p : pairs) {
947      String[] pair = StringUtils.split(p, '/', '=');
948      Preconditions.checkArgument(pair.length == 2,
949          "Could not parse key-value pair " + p);
950      String key = pair[0];
951      String value = pair[1];
952      Preconditions.checkArgument(DFSUtil.isReservedPathComponent(key),
953          "Unknown reserved path " + key);
954      Preconditions.checkArgument(DFSUtil.isValidNameForComponent(value),
955          "Invalid rename path for " + key + ": " + value);
956      LOG.info("Will rename reserved path " + key + " to " + value);
957      renameReservedMap.put(key, value);
958    }
959  }
960
961  /**
962   * When upgrading from an old version, the filesystem could contain paths
963   * that are now reserved in the new version (e.g. .snapshot). This renames
964   * these new reserved paths to a user-specified value to avoid collisions
965   * with the reserved name.
966   * 
967   * @param path Old path potentially containing a reserved path
968   * @return New path with reserved path components renamed to user value
969   */
970  static String renameReservedPathsOnUpgrade(String path,
971      final int layoutVersion) {
972    final String oldPath = path;
973    // If any known LVs aren't supported, we're doing an upgrade
974    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
975      String[] components = INode.getPathNames(path);
976      // Only need to worry about the root directory
977      if (components.length > 1) {
978        components[1] = DFSUtil.bytes2String(
979            renameReservedRootComponentOnUpgrade(
980                DFSUtil.string2Bytes(components[1]),
981                layoutVersion));
982        path = DFSUtil.strings2PathString(components);
983      }
984    }
985    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
986      String[] components = INode.getPathNames(path);
987      // Special case the root path
988      if (components.length == 0) {
989        return path;
990      }
991      for (int i=0; i<components.length; i++) {
992        components[i] = DFSUtil.bytes2String(
993            renameReservedComponentOnUpgrade(
994                DFSUtil.string2Bytes(components[i]),
995                layoutVersion));
996      }
997      path = DFSUtil.strings2PathString(components);
998    }
999
1000    if (!path.equals(oldPath)) {
1001      LOG.info("Upgrade process renamed reserved path " + oldPath + " to "
1002          + path);
1003    }
1004    return path;
1005  }
1006
1007  private final static String RESERVED_ERROR_MSG = 
1008      FSDirectory.DOT_RESERVED_PATH_PREFIX + " is a reserved path and "
1009      + HdfsConstants.DOT_SNAPSHOT_DIR + " is a reserved path component in"
1010      + " this version of HDFS. Please rollback and delete or rename"
1011      + " this path, or upgrade with the "
1012      + StartupOption.RENAMERESERVED.getName()
1013      + " [key-value pairs]"
1014      + " option to automatically rename these paths during upgrade.";
1015
1016  /**
1017   * Same as {@link #renameReservedPathsOnUpgrade}, but for a single
1018   * byte array path component.
1019   */
1020  private static byte[] renameReservedComponentOnUpgrade(byte[] component,
1021      final int layoutVersion) {
1022    // If the LV doesn't support snapshots, we're doing an upgrade
1023    if (!NameNodeLayoutVersion.supports(Feature.SNAPSHOT, layoutVersion)) {
1024      if (Arrays.equals(component, HdfsConstants.DOT_SNAPSHOT_DIR_BYTES)) {
1025        Preconditions.checkArgument(
1026            renameReservedMap != null &&
1027            renameReservedMap.containsKey(HdfsConstants.DOT_SNAPSHOT_DIR),
1028            RESERVED_ERROR_MSG);
1029        component =
1030            DFSUtil.string2Bytes(renameReservedMap
1031                .get(HdfsConstants.DOT_SNAPSHOT_DIR));
1032      }
1033    }
1034    return component;
1035  }
1036
1037  /**
1038   * Same as {@link #renameReservedPathsOnUpgrade}, but for a single
1039   * byte array path component.
1040   */
1041  private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
1042      final int layoutVersion) {
1043    // If the LV doesn't support inode IDs, we're doing an upgrade
1044    if (!NameNodeLayoutVersion.supports(Feature.ADD_INODE_ID, layoutVersion)) {
1045      if (Arrays.equals(component, FSDirectory.DOT_RESERVED)) {
1046        Preconditions.checkArgument(
1047            renameReservedMap != null &&
1048            renameReservedMap.containsKey(FSDirectory.DOT_RESERVED_STRING),
1049            RESERVED_ERROR_MSG);
1050        final String renameString = renameReservedMap
1051            .get(FSDirectory.DOT_RESERVED_STRING);
1052        component =
1053            DFSUtil.string2Bytes(renameString);
1054        LOG.info("Renamed root path " + FSDirectory.DOT_RESERVED_STRING
1055            + " to " + renameString);
1056      }
1057    }
1058    return component;
1059  }
1060}