001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import static org.apache.hadoop.util.Time.now;
021    
022    import java.io.DataInput;
023    import java.io.DataInputStream;
024    import java.io.DataOutputStream;
025    import java.io.File;
026    import java.io.FileInputStream;
027    import java.io.FileNotFoundException;
028    import java.io.FileOutputStream;
029    import java.io.IOException;
030    import java.security.DigestInputStream;
031    import java.security.DigestOutputStream;
032    import java.security.MessageDigest;
033    import java.util.ArrayList;
034    import java.util.Arrays;
035    import java.util.HashMap;
036    import java.util.List;
037    import java.util.Map;
038    
039    import org.apache.commons.logging.Log;
040    import org.apache.hadoop.HadoopIllegalArgumentException;
041    import org.apache.hadoop.classification.InterfaceAudience;
042    import org.apache.hadoop.classification.InterfaceStability;
043    import org.apache.hadoop.conf.Configuration;
044    import org.apache.hadoop.fs.FileSystem;
045    import org.apache.hadoop.fs.Path;
046    import org.apache.hadoop.fs.PathIsNotDirectoryException;
047    import org.apache.hadoop.fs.UnresolvedLinkException;
048    import org.apache.hadoop.fs.permission.PermissionStatus;
049    import org.apache.hadoop.hdfs.protocol.HdfsConstants;
050    import org.apache.hadoop.hdfs.protocol.LayoutVersion;
051    import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
052    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
053    import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
054    import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
055    import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
056    import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList;
057    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
058    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectoryWithSnapshot;
059    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileUnderConstructionWithSnapshot;
060    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
061    import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
062    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
063    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
064    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
065    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
066    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
067    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
068    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
069    import org.apache.hadoop.hdfs.util.ReadOnlyList;
070    import org.apache.hadoop.io.MD5Hash;
071    import org.apache.hadoop.io.Text;
072    
073    /**
074     * Contains inner classes for reading or writing the on-disk format for
075     * FSImages.
076     * 
077     * In particular, the format of the FSImage looks like:
078     * <pre>
079     * FSImage {
080     *   layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
081     *   namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
 *   generationStampAtBlockIdSwitch: long, lastAllocatedBlockId: long,
 *   transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
084     *   numOfSnapshottableDirs: int,
085     *   {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
086     * }
087     * 
088     * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
089     *   INodeInfo of root, numberOfChildren of root: int
090     *   [list of INodeInfo of root's children],
091     *   [list of INodeDirectoryInfo of root's directory children]
092     * }
093     * 
094     * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
095     *   [list of INodeInfo of INodes in topological order]
096     * }
097     * 
098     * INodeInfo {
099     *   {
100     *     localName: short + byte[]
101     *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
102     *   or 
103     *   {
104     *     fullPath: byte[]
105     *   } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
106     *   replicationFactor: short, modificationTime: long,
107     *   accessTime: long, preferredBlockSize: long,
108     *   numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
109     *   { 
110     *     nsQuota: long, dsQuota: long, 
111     *     {
112     *       isINodeSnapshottable: byte,
113     *       isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
114     *     } (when {@link Feature#SNAPSHOT} is supported), 
115     *     fsPermission: short, PermissionStatus
116     *   } for INodeDirectory
117     *   or 
118     *   {
119     *     symlinkString, fsPermission: short, PermissionStatus
120     *   } for INodeSymlink
121     *   or
122     *   {
123     *     [list of BlockInfo]
124     *     [list of FileDiff]
125     *     {
126     *       isINodeFileUnderConstructionSnapshot: byte, 
127     *       {clientName: short + byte[], clientMachine: short + byte[]} (when 
128     *       isINodeFileUnderConstructionSnapshot is true),
129     *     } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode), 
130     *     fsPermission: short, PermissionStatus
131     *   } for INodeFile
132     * }
133     * 
134     * INodeDirectoryInfo {
135     *   fullPath of the directory: short + byte[],
136     *   numberOfChildren: int, [list of INodeInfo of children INode],
137     *   {
138     *     numberOfSnapshots: int,
139     *     [list of Snapshot] (when NumberOfSnapshots is positive),
140     *     numberOfDirectoryDiffs: int,
 *     [list of DirectoryDiff] (when numberOfDirectoryDiffs is positive),
142     *     number of children that are directories,
143     *     [list of INodeDirectoryInfo of the directory children] (includes
144     *     snapshot copies of deleted sub-directories)
145     *   } (when {@link Feature#SNAPSHOT} is supported), 
146     * }
147     * 
148     * Snapshot {
149     *   snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is 
150     *   the name of the snapshot)
151     * }
152     * 
153     * DirectoryDiff {
154     *   full path of the root of the associated Snapshot: short + byte[], 
155     *   childrenSize: int, 
156     *   isSnapshotRoot: byte, 
157     *   snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
158     *   snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff 
159     * }
160     * 
161     * Diff {
162     *   createdListSize: int, [Local name of INode in created list],
163     *   deletedListSize: int, [INode in deleted list: INodeInfo]
164     * }
165     *
166     * FileDiff {
167     *   full path of the root of the associated Snapshot: short + byte[], 
168     *   fileSize: long, 
169     *   snapshotINodeIsNotNull: byte,
170     *   snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff 
171     * }
172     * </pre>
173     */
174    @InterfaceAudience.Private
175    @InterfaceStability.Evolving
176    public class FSImageFormat {
  /** Shared logger; reuses FSImage's log so load/save messages appear together. */
  private static final Log LOG = FSImage.LOG;
  
  // Static-only class: the private constructor prevents instantiation.
  private FSImageFormat() {}
181      
182      /**
183       * A one-shot class responsible for loading an image. The load() function
184       * should be called once, after which the getter methods may be used to retrieve
185       * information about the image that was loaded, if loading was successful.
186       */
187      public static class Loader {
    /** Configuration used while loading, e.g. to resolve compression codecs. */
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader. */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;
    
    /**
     * Snapshot id to Snapshot mapping, populated during load() only when the
     * image layout supports snapshots; remains null otherwise.
     */
    private Map<Integer, Snapshot> snapshotMap = null;
    /** Tracks INodeReference state so referenced subtrees are processed once. */
    private final ReferenceMap referenceMap = new ReferenceMap();

    /**
     * Create a loader bound to the given configuration and namesystem.
     * Call load() exactly once, then use the getters to inspect the result.
     */
    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }
207    
208        /**
209         * Return the MD5 checksum of the image that has been loaded.
210         * @throws IllegalStateException if load() has not yet been called.
211         */
212        MD5Hash getLoadedImageMd5() {
213          checkLoaded();
214          return imgDigest;
215        }
216    
217        long getLoadedImageTxId() {
218          checkLoaded();
219          return imgTxId;
220        }
221    
222        /**
223         * Throw IllegalStateException if load() has not yet been called.
224         */
225        private void checkLoaded() {
226          if (!loaded) {
227            throw new IllegalStateException("Image not yet loaded!");
228          }
229        }
230    
231        /**
232         * Throw IllegalStateException if load() has already been called.
233         */
234        private void checkNotLoaded() {
235          if (loaded) {
236            throw new IllegalStateException("Image already loaded!");
237          }
238        }
239    
    /**
     * Load an FSImage file into the namesystem. May be called at most once per
     * Loader instance; afterwards use {@link #getLoadedImageMd5()} and
     * {@link #getLoadedImageTxId()} to inspect the result.
     *
     * The header fields are read in a fixed on-disk order (see the class
     * javadoc); reordering any of the reads below would corrupt the load.
     *
     * @param curFile the image file on local disk
     * @throws IOException if the file cannot be read or is inconsistent with
     *         the expected layout version
     */
    void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      // The digest stream wraps the raw file stream, so the MD5 covers every
      // byte of the file, including any compressed section read later.
      MessageDigest digester = MD5Hash.getDigester();
      DigestInputStream fin = new DigestInputStream(
           new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile, 
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = LayoutVersion.supports(Feature.SNAPSHOT,
            imgVersion);

        // read namespaceID: first appeared in version -2
        // (value is consumed but unused here)
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);
        
        if (LayoutVersion.supports(Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade from a pre-sequential-block-ID layout.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }
        
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (LayoutVersion.supports(Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // From this point on, 'in' is the (possibly decompressing) wrapper
        // around fin; closing it in the finally block closes fin as well.
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);
        
        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
            imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        in.close();
      }

      // Finalize the digest only after every byte has been consumed above.
      imgDigest = new MD5Hash(digester.digest());
      loaded = true;
      
      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }
370    
371      /** Update the root node's attributes */
372      private void updateRootAttr(INodeWithAdditionalFields root) {                                                           
373        long nsQuota = root.getNsQuota();
374        long dsQuota = root.getDsQuota();
375        FSDirectory fsDir = namesystem.dir;
376        if (nsQuota != -1 || dsQuota != -1) {
377          fsDir.rootDir.setQuota(nsQuota, dsQuota);
378        }
379        fsDir.rootDir.cloneModificationTime(root);
380        fsDir.rootDir.clonePermissionStatus(root);    
381      }
382      
383        /**
384         * Load fsimage files when 1) only local names are stored, 
385         * and 2) snapshot is supported.
386         * 
387         * @param numFiles number of files expected to be read
388         * @param in Image input stream
389         * @param counter Counter to increment for namenode startup progress
390         */
391        private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
392            Counter counter) throws IOException {
393          assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
394              getLayoutVersion());
395          assert LayoutVersion.supports(Feature.SNAPSHOT, getLayoutVersion());
396          
397          // load root
398          loadRoot(in, counter);
399          // load rest of the nodes recursively
400          loadDirectoryWithSnapshot(in, counter);
401        }
402        
403      /** 
404       * load fsimage files assuming only local names are stored
405       *   
406       * @param numFiles number of files expected to be read
407       * @param in image input stream
408       * @param counter Counter to increment for namenode startup progress
409       * @throws IOException
410       */  
411       private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
412           throws IOException {
413         assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
414             getLayoutVersion());
415         assert numFiles > 0;
416    
417         // load root
418         loadRoot(in, counter);
419         // have loaded the first file (the root)
420         numFiles--; 
421    
422         // load rest of the nodes directory by directory
423         while (numFiles > 0) {
424           numFiles -= loadDirectory(in, counter);
425         }
426         if (numFiles != 0) {
427           throw new IOException("Read unexpect number of files: " + -numFiles);
428         }
429       }
430       
431        /**
432         * Load information about root, and use the information to update the root
433         * directory of NameSystem.
434         * @param in The {@link DataInput} instance to read.
435         * @param counter Counter to increment for namenode startup progress
436         */
437        private void loadRoot(DataInput in, Counter counter)
438            throws IOException {
439          // load root
440          if (in.readShort() != 0) {
441            throw new IOException("First node is not root");
442          }
443          final INodeDirectory root = loadINode(null, false, in, counter)
444            .asDirectory();
445          // update the root's attributes
446          updateRootAttr(root);
447        }
448       
449        /** Load children nodes for the parent directory. */
450        private int loadChildren(INodeDirectory parent, DataInput in,
451            Counter counter) throws IOException {
452          int numChildren = in.readInt();
453          for (int i = 0; i < numChildren; i++) {
454            // load single inode
455            INode newNode = loadINodeWithLocalName(false, in, true, counter);
456            addToParent(parent, newNode);
457          }
458          return numChildren;
459        }
460        
461        /**
462         * Load a directory when snapshot is supported.
463         * @param in The {@link DataInput} instance to read.
464         * @param counter Counter to increment for namenode startup progress
465         */
466        private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
467            throws IOException {
468          // Step 1. Identify the parent INode
469          long inodeId = in.readLong();
470          final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
471              .asDirectory();
472          
473          // Check if the whole subtree has been saved (for reference nodes)
474          boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
475          if (!toLoadSubtree) {
476            return;
477          }
478          
479          // Step 2. Load snapshots if parent is snapshottable
480          int numSnapshots = in.readInt();
481          if (numSnapshots >= 0) {
482            final INodeDirectorySnapshottable snapshottableParent
483                = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
484            // load snapshots and snapshotQuota
485            SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
486                numSnapshots, in, this);
487            if (snapshottableParent.getSnapshotQuota() > 0) {
488              // add the directory to the snapshottable directory list in 
489              // SnapshotManager. Note that we only add root when its snapshot quota
490              // is positive.
491              this.namesystem.getSnapshotManager().addSnapshottable(
492                  snapshottableParent);
493            }
494          }
495    
496          // Step 3. Load children nodes under parent
497          loadChildren(parent, in, counter);
498          
499          // Step 4. load Directory Diff List
500          SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);
501          
502          // Recursively load sub-directories, including snapshot copies of deleted
503          // directories
504          int numSubTree = in.readInt();
505          for (int i = 0; i < numSubTree; i++) {
506            loadDirectoryWithSnapshot(in, counter);
507          }
508        }
509        
510       /**
511        * Load all children of a directory
512        * 
513        * @param in
514        * @param counter Counter to increment for namenode startup progress
515        * @return number of child inodes read
516        * @throws IOException
517        */
518       private int loadDirectory(DataInput in, Counter counter) throws IOException {
519         String parentPath = FSImageSerialization.readString(in);
520         final INodeDirectory parent = INodeDirectory.valueOf(
521             namesystem.dir.rootDir.getNode(parentPath, true), parentPath);
522         return loadChildren(parent, in, counter);
523       }
524    
525      /**
526       * load fsimage files assuming full path names are stored
527       * 
528       * @param numFiles total number of files to load
529       * @param in data input stream
530       * @param counter Counter to increment for namenode startup progress
531       * @throws IOException if any error occurs
532       */
533      private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
534          throws IOException {
535        byte[][] pathComponents;
536        byte[][] parentPath = {{}};      
537        FSDirectory fsDir = namesystem.dir;
538        INodeDirectory parentINode = fsDir.rootDir;
539        for (long i = 0; i < numFiles; i++) {
540          pathComponents = FSImageSerialization.readPathComponents(in);
541          final INode newNode = loadINode(
542              pathComponents[pathComponents.length-1], false, in, counter);
543    
544          if (isRoot(pathComponents)) { // it is the root
545            // update the root's attributes
546            updateRootAttr(newNode.asDirectory());
547            continue;
548          }
549          // check if the new inode belongs to the same parent
550          if(!isParent(pathComponents, parentPath)) {
551            parentINode = getParentINodeDirectory(pathComponents);
552            parentPath = getParent(pathComponents);
553          }
554    
555          // add new inode
556          addToParent(parentINode, newNode);
557        }
558      }
559    
560      private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
561          ) throws FileNotFoundException, PathIsNotDirectoryException,
562          UnresolvedLinkException {
563        if (pathComponents.length < 2) { // root
564          return null;
565        }
566        // Gets the parent INode
567        final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
568            pathComponents);
569        return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
570      }
571    
572      /**
573       * Add the child node to parent and, if child is a file, update block map.
574       * This method is only used for image loading so that synchronization,
575       * modification time update and space count update are not needed.
576       */
577      private void addToParent(INodeDirectory parent, INode child) {
578        FSDirectory fsDir = namesystem.dir;
579        if (parent == fsDir.rootDir && FSDirectory.isReservedName(child)) {
580            throw new HadoopIllegalArgumentException("File name \""
581                + child.getLocalName() + "\" is reserved. Please "
582                + " change the name of the existing file or directory to another "
583                + "name before upgrading to this release.");
584        }
585        // NOTE: This does not update space counts for parents
586        if (!parent.addChild(child)) {
587          return;
588        }
589        namesystem.dir.cacheName(child);
590    
591        if (child.isFile()) {
592          updateBlocksMap(child.asFile());
593        }
594      }
595    
596        public void updateBlocksMap(INodeFile file) {
597          // Add file->block mapping
598          final BlockInfo[] blocks = file.getBlocks();
599          if (blocks != null) {
600            final BlockManager bm = namesystem.getBlockManager();
601            for (int i = 0; i < blocks.length; i++) {
602              file.setBlock(i, bm.addBlockCollection(blocks[i], file));
603            } 
604          }
605        }
606    
607        /** @return The FSDirectory of the namesystem where the fsimage is loaded */
608        public FSDirectory getFSDirectoryInLoading() {
609          return namesystem.dir;
610        }
611    
612        public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
613            boolean updateINodeMap) throws IOException {
614          return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
615        }
616    
617        public INode loadINodeWithLocalName(boolean isSnapshotINode,
618            DataInput in, boolean updateINodeMap, Counter counter)
619            throws IOException {
620          final byte[] localName = FSImageSerialization.readLocalName(in);
621          INode inode = loadINode(localName, isSnapshotINode, in, counter);
622          if (updateINodeMap
623              && LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
624            namesystem.dir.addToInodeMap(inode);
625          }
626          return inode;
627        }
628      
629      /**
630       * load an inode from fsimage except for its name
631       * 
632       * @param in data input stream from which image is read
633       * @param counter Counter to increment for namenode startup progress
634       * @return an inode
635       */
636      @SuppressWarnings("deprecation")
637      INode loadINode(final byte[] localName, boolean isSnapshotINode,
638          DataInput in, Counter counter) throws IOException {
639        final int imgVersion = getLayoutVersion();
640        if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
641          namesystem.getFSDirectory().verifyINodeName(localName);
642        }
643    
644        long inodeId = LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion) ? 
645               in.readLong() : namesystem.allocateNewInodeId();
646        
647        final short replication = namesystem.getBlockManager().adjustReplication(
648            in.readShort());
649        final long modificationTime = in.readLong();
650        long atime = 0;
651        if (LayoutVersion.supports(Feature.FILE_ACCESS_TIME, imgVersion)) {
652          atime = in.readLong();
653        }
654        final long blockSize = in.readLong();
655        final int numBlocks = in.readInt();
656    
657        if (numBlocks >= 0) {
658          // file
659          
660          // read blocks
661          BlockInfo[] blocks = null;
662          if (numBlocks >= 0) {
663            blocks = new BlockInfo[numBlocks];
664            for (int j = 0; j < numBlocks; j++) {
665              blocks[j] = new BlockInfo(replication);
666              blocks[j].readFields(in);
667            }
668          }
669    
670          String clientName = "";
671          String clientMachine = "";
672          boolean underConstruction = false;
673          FileDiffList fileDiffs = null;
674          if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
675            // read diffs
676            fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);
677    
678            if (isSnapshotINode) {
679              underConstruction = in.readBoolean();
680              if (underConstruction) {
681                clientName = FSImageSerialization.readString(in);
682                clientMachine = FSImageSerialization.readString(in);
683                // convert the last block to BlockUC
684                if (blocks != null && blocks.length > 0) {
685                  BlockInfo lastBlk = blocks[blocks.length - 1]; 
686                  blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
687                      lastBlk, replication);
688                }
689              }
690            }
691          }
692    
693          final PermissionStatus permissions = PermissionStatus.read(in);
694    
695          // return
696          if (counter != null) {
697            counter.increment();
698          }
699          final INodeFile file = new INodeFile(inodeId, localName, permissions,
700              modificationTime, atime, blocks, replication, blockSize);
701          if (underConstruction) {
702            INodeFileUnderConstruction fileUC = new INodeFileUnderConstruction(
703                file, clientName, clientMachine, null);
704            return fileDiffs == null ? fileUC :
705              new INodeFileUnderConstructionWithSnapshot(fileUC, fileDiffs);
706          } else {
707            return fileDiffs == null ? file : 
708              new INodeFileWithSnapshot(file, fileDiffs);
709          }
710        } else if (numBlocks == -1) {
711          //directory
712          
713          //read quotas
714          final long nsQuota = in.readLong();
715          long dsQuota = -1L;
716          if (LayoutVersion.supports(Feature.DISKSPACE_QUOTA, imgVersion)) {
717            dsQuota = in.readLong();
718          }
719    
720          //read snapshot info
721          boolean snapshottable = false;
722          boolean withSnapshot = false;
723          if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
724            snapshottable = in.readBoolean();
725            if (!snapshottable) {
726              withSnapshot = in.readBoolean();
727            }
728          }
729    
730          final PermissionStatus permissions = PermissionStatus.read(in);
731    
732          //return
733          if (counter != null) {
734            counter.increment();
735          }
736          final INodeDirectory dir = nsQuota >= 0 || dsQuota >= 0?
737              new INodeDirectoryWithQuota(inodeId, localName, permissions,
738                  modificationTime, nsQuota, dsQuota)
739              : new INodeDirectory(inodeId, localName, permissions, modificationTime);
740          return snapshottable ? new INodeDirectorySnapshottable(dir)
741              : withSnapshot ? new INodeDirectoryWithSnapshot(dir)
742              : dir;
743        } else if (numBlocks == -2) {
744          //symlink
745          if (!FileSystem.isSymlinksEnabled()) {
746            throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
747          }
748    
749          final String symlink = Text.readString(in);
750          final PermissionStatus permissions = PermissionStatus.read(in);
751          if (counter != null) {
752            counter.increment();
753          }
754          return new INodeSymlink(inodeId, localName, permissions,
755              modificationTime, atime, symlink);
756        } else if (numBlocks == -3) {
757          //reference
758          // Intentionally do not increment counter, because it is too difficult at
759          // this point to assess whether or not this is a reference that counts
760          // toward quota.
761          
762          final boolean isWithName = in.readBoolean();
763          // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
764          int snapshotId = in.readInt();
765          
766          final INodeReference.WithCount withCount
767              = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);
768    
769          if (isWithName) {
770              return new INodeReference.WithName(null, withCount, localName,
771                  snapshotId);
772          } else {
773            final INodeReference ref = new INodeReference.DstReference(null,
774                withCount, snapshotId);
775            return ref;
776          }
777        }
778        
779        throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
780      }
781    
782        /** Load {@link INodeFileAttributes}. */
783        public INodeFileAttributes loadINodeFileAttributes(DataInput in)
784            throws IOException {
785          final int layoutVersion = getLayoutVersion();
786          
787          if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
788            return loadINodeWithLocalName(true, in, false).asFile();
789          }
790      
791          final byte[] name = FSImageSerialization.readLocalName(in);
792          final PermissionStatus permissions = PermissionStatus.read(in);
793          final long modificationTime = in.readLong();
794          final long accessTime = in.readLong();
795      
796          final short replication = namesystem.getBlockManager().adjustReplication(
797              in.readShort());
798          final long preferredBlockSize = in.readLong();
799          
800          return new INodeFileAttributes.SnapshotCopy(name, permissions, modificationTime,
801              accessTime, replication, preferredBlockSize);
802        }
803    
804        public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
805            throws IOException {
806          final int layoutVersion = getLayoutVersion();
807          
808          if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
809            return loadINodeWithLocalName(true, in, false).asDirectory();
810          }
811      
812          final byte[] name = FSImageSerialization.readLocalName(in);
813          final PermissionStatus permissions = PermissionStatus.read(in);
814          final long modificationTime = in.readLong();
815          
816          //read quotas
817          final long nsQuota = in.readLong();
818          final long dsQuota = in.readLong();
819      
820          return nsQuota == -1L && dsQuota == -1L?
821              new INodeDirectoryAttributes.SnapshotCopy(name, permissions, modificationTime)
822            : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
823                modificationTime, nsQuota, dsQuota);
824        }
825      
    /**
     * Load the files that were open for write (under construction) when the
     * image was saved, and splice each one back into the already-loaded
     * namespace, restoring its lease where appropriate.
     *
     * @param in image input stream, positioned at the UC-file section
     * @param supportSnapshot whether the layout supports snapshots
     *        (currently unused here — TODO confirm it can be removed)
     * @param counter startup-progress counter, incremented per inode read
     * @throws IOException on a malformed section or missing namespace node
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFileUnderConstruction cons = FSImageSerialization
            .readINodeUnderConstruction(in, namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) && 
            LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in 
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
          oldnode = namesystem.dir.getInode(cons.getId()).asFile();
          inSnapshot = true;
        } else {
          // Ordinary path: resolve the existing file node by name.
          final INodesInPath iip = fsDir.getLastINodeInPath(path);
          oldnode = INodeFile.valueOf(iip.getINode(0), path);
        }
        
        // Carry the existing node's identity (name, parent link or parent
        // reference) over to the UC replacement so it occupies the exact
        // same position in the tree.
        cons.setLocalName(oldnode.getLocalNameBytes());
        INodeReference parentRef = oldnode.getParentReference();
        if (parentRef != null) {
          cons.setParentReference(parentRef);
        } else {
          cons.setParent(oldnode.getParent());
        }

        if (oldnode instanceof INodeFileWithSnapshot) {
          // Preserve the snapshot diff list on the replacement node.
          cons = new INodeFileUnderConstructionWithSnapshot(cons,
              ((INodeFileWithSnapshot) oldnode).getDiffs());
        }

        if (!inSnapshot) {
          // Live file: swap the node into the directory tree and restore
          // its lease so the writer's client can recover.
          fsDir.replaceINodeFile(path, oldnode, cons);
          namesystem.leaseManager.addLease(cons.getClientName(), path);
        } else {
          // File exists only in a snapshot (HDFS-5428): no lease is added.
          if (parentRef != null) {
            // replace oldnode with cons
            parentRef.setReferredINode(cons);
          } else {
            // replace old node in its parent's children list and deleted list
            oldnode.getParent().replaceChildFileInSnapshot(oldnode, cons);
            namesystem.dir.addToInodeMap(cons);
            updateBlocksMap(cons);
          }
        }
      }
    }
883    
884        private void loadSecretManagerState(DataInput in)
885            throws IOException {
886          int imgVersion = getLayoutVersion();
887    
888          if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) {
889            //SecretManagerState is not available.
890            //This must not happen if security is turned on.
891            return; 
892          }
893          namesystem.loadSecretManagerState(in);
894        }
895    
    /** @return the layout version of the storage the image is loaded from. */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }
899    
900        private boolean isRoot(byte[][] path) {
901          return path.length == 1 &&
902            path[0] == null;    
903        }
904    
905        private boolean isParent(byte[][] path, byte[][] parent) {
906          if (path == null || parent == null)
907            return false;
908          if (parent.length == 0 || path.length != parent.length + 1)
909            return false;
910          boolean isParent = true;
911          for (int i = 0; i < parent.length; i++) {
912            isParent = isParent && Arrays.equals(path[i], parent[i]); 
913          }
914          return isParent;
915        }
916    
917        /**
918         * Return string representing the parent of the given path.
919         */
920        String getParent(String path) {
921          return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
922        }
923        
924        byte[][] getParent(byte[][] path) {
925          byte[][] result = new byte[path.length - 1][];
926          for (int i = 0; i < result.length; i++) {
927            result[i] = new byte[path[i].length];
928            System.arraycopy(path[i], 0, result[i], 0, path[i].length);
929          }
930          return result;
931        }
932        
933        public Snapshot getSnapshot(DataInput in) throws IOException {
934          return snapshotMap.get(in.readInt());
935        }
936      }
937      
938      /**
939       * A one-shot class responsible for writing an image file.
940       * The write() function should be called once, after which the getter
941       * functions may be used to retrieve information about the file that was written.
942       */
943      static class Saver {
944        private final SaveNamespaceContext context;
945        /** Set to true once an image has been written */
946        private boolean saved = false;
947        
948        /** The MD5 checksum of the file that was written */
949        private MD5Hash savedDigest;
950        private final ReferenceMap referenceMap = new ReferenceMap();
951        
952        private final Map<Long, INodeFileUnderConstruction> snapshotUCMap = 
953            new HashMap<Long, INodeFileUnderConstruction>();
954    
955        /** @throws IllegalStateException if the instance has not yet saved an image */
956        private void checkSaved() {
957          if (!saved) {
958            throw new IllegalStateException("FSImageSaver has not saved an image");
959          }
960        }
961        
962        /** @throws IllegalStateException if the instance has already saved an image */
963        private void checkNotSaved() {
964          if (saved) {
965            throw new IllegalStateException("FSImageSaver has already saved an image");
966          }
967        }
968        
969    
970        Saver(SaveNamespaceContext context) {
971          this.context = context;
972        }
973    
974        /**
975         * Return the MD5 checksum of the image file that was saved.
976         */
977        MD5Hash getSavedDigest() {
978          checkSaved();
979          return savedDigest;
980        }
981    
982        void save(File newFile, FSImageCompression compression) throws IOException {
983          checkNotSaved();
984    
985          final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
986          FSDirectory fsDir = sourceNamesystem.dir;
987          String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
988          Step step = new Step(StepType.INODES, sdPath);
989          StartupProgress prog = NameNode.getStartupProgress();
990          prog.beginStep(Phase.SAVING_CHECKPOINT, step);
991          prog.setTotal(Phase.SAVING_CHECKPOINT, step,
992            fsDir.rootDir.numItemsInTree());
993          Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
994          long startTime = now();
995          //
996          // Write out data
997          //
998          MessageDigest digester = MD5Hash.getDigester();
999          FileOutputStream fout = new FileOutputStream(newFile);
1000          DigestOutputStream fos = new DigestOutputStream(fout, digester);
1001          DataOutputStream out = new DataOutputStream(fos);
1002          try {
1003            out.writeInt(HdfsConstants.LAYOUT_VERSION);
1004            // We use the non-locked version of getNamespaceInfo here since
1005            // the coordinating thread of saveNamespace already has read-locked
1006            // the namespace for us. If we attempt to take another readlock
1007            // from the actual saver thread, there's a potential of a
1008            // fairness-related deadlock. See the comments on HDFS-2223.
1009            out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
1010                .getNamespaceID());
1011            out.writeLong(fsDir.rootDir.numItemsInTree());
1012            out.writeLong(sourceNamesystem.getGenerationStampV1());
1013            out.writeLong(sourceNamesystem.getGenerationStampV2());
1014            out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
1015            out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
1016            out.writeLong(context.getTxId());
1017            out.writeLong(sourceNamesystem.getLastInodeId());
1018    
1019            
1020            sourceNamesystem.getSnapshotManager().write(out);
1021            
1022            // write compression info and set up compressed stream
1023            out = compression.writeHeaderAndWrapStream(fos);
1024            LOG.info("Saving image file " + newFile +
1025                     " using " + compression);
1026    
1027            // save the root
1028            saveINode2Image(fsDir.rootDir, out, false, referenceMap, counter);
1029            // save the rest of the nodes
1030            saveImage(fsDir.rootDir, out, true, false, counter);
1031            prog.endStep(Phase.SAVING_CHECKPOINT, step);
1032            // Now that the step is finished, set counter equal to total to adjust
1033            // for possible under-counting due to reference inodes.
1034            prog.setCount(Phase.SAVING_CHECKPOINT, step,
1035              fsDir.rootDir.numItemsInTree());
1036            // save files under construction
1037            // TODO: for HDFS-5428, since we cannot break the compatibility of 
1038            // fsimage, we store part of the under-construction files that are only
1039            // in snapshots in this "under-construction-file" section. As a 
1040            // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their 
1041            // paths, so that when loading fsimage we do not put them into the lease
1042            // map. In the future, we can remove this hack when we can bump the 
1043            // layout version.
1044            sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
1045            
1046            context.checkCancelled();
1047            sourceNamesystem.saveSecretManagerState(out, sdPath);
1048            context.checkCancelled();
1049            out.flush();
1050            context.checkCancelled();
1051            fout.getChannel().force(true);
1052          } finally {
1053            out.close();
1054          }
1055    
1056          saved = true;
1057          // set md5 of the saved image
1058          savedDigest = new MD5Hash(digester.digest());
1059    
1060          LOG.info("Image file " + newFile + " of size " + newFile.length() +
1061              " bytes saved in " + (now() - startTime)/1000 + " seconds.");
1062        }
1063    
1064        /**
1065         * Save children INodes.
1066         * @param children The list of children INodes
1067         * @param out The DataOutputStream to write
1068         * @param inSnapshot Whether the parent directory or its ancestor is in 
1069         *                   the deleted list of some snapshot (caused by rename or 
1070         *                   deletion)
1071         * @param counter Counter to increment for namenode startup progress
1072         * @return Number of children that are directory
1073         */
1074        private int saveChildren(ReadOnlyList<INode> children,
1075            DataOutputStream out, boolean inSnapshot, Counter counter)
1076            throws IOException {
1077          // Write normal children INode. 
1078          out.writeInt(children.size());
1079          int dirNum = 0;
1080          int i = 0;
1081          for(INode child : children) {
1082            // print all children first
1083            // TODO: for HDFS-5428, we cannot change the format/content of fsimage
1084            // here, thus even if the parent directory is in snapshot, we still
1085            // do not handle INodeUC as those stored in deleted list
1086            saveINode2Image(child, out, false, referenceMap, counter);
1087            if (child.isDirectory()) {
1088              dirNum++;
1089            } else if (inSnapshot && child.isFile()
1090                && child.asFile().isUnderConstruction()) {
1091              this.snapshotUCMap.put(child.getId(),
1092                  (INodeFileUnderConstruction) child.asFile());
1093            }
1094            if (i++ % 50 == 0) {
1095              context.checkCancelled();
1096            }
1097          }
1098          return dirNum;
1099        }
1100        
1101        /**
1102         * Save file tree image starting from the given root.
1103         * This is a recursive procedure, which first saves all children and 
1104         * snapshot diffs of a current directory and then moves inside the 
1105         * sub-directories.
1106         * 
1107         * @param current The current node
1108         * @param out The DataoutputStream to write the image
1109         * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1110         *                      reference node, its subtree may already have been
1111         *                      saved before.
1112         * @param inSnapshot Whether the current directory is in snapshot
1113         * @param counter Counter to increment for namenode startup progress
1114         */
1115        private void saveImage(INodeDirectory current, DataOutputStream out,
1116            boolean toSaveSubtree, boolean inSnapshot, Counter counter)
1117            throws IOException {
1118          // write the inode id of the directory
1119          out.writeLong(current.getId());
1120          
1121          if (!toSaveSubtree) {
1122            return;
1123          }
1124          
1125          final ReadOnlyList<INode> children = current.getChildrenList(null);
1126          int dirNum = 0;
1127          List<INodeDirectory> snapshotDirs = null;
1128          if (current instanceof INodeDirectoryWithSnapshot) {
1129            snapshotDirs = new ArrayList<INodeDirectory>();
1130            ((INodeDirectoryWithSnapshot) current).getSnapshotDirectory(
1131                snapshotDirs);
1132            dirNum += snapshotDirs.size();
1133          }
1134          
1135          // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1136          // Snapshots
1137          if (current instanceof INodeDirectorySnapshottable) {
1138            INodeDirectorySnapshottable snapshottableNode = 
1139                (INodeDirectorySnapshottable) current;
1140            SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
1141          } else {
1142            out.writeInt(-1); // # of snapshots
1143          }
1144    
1145          // 3. Write children INode 
1146          dirNum += saveChildren(children, out, inSnapshot, counter);
1147          
1148          // 4. Write DirectoryDiff lists, if there is any.
1149          SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1150          
1151          // Write sub-tree of sub-directories, including possible snapshots of 
1152          // deleted sub-directories
1153          out.writeInt(dirNum); // the number of sub-directories
1154          for(INode child : children) {
1155            if(!child.isDirectory()) {
1156              continue;
1157            }
1158            // make sure we only save the subtree under a reference node once
1159            boolean toSave = child.isReference() ? 
1160                referenceMap.toProcessSubtree(child.getId()) : true;
1161            saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
1162          }
1163          if (snapshotDirs != null) {
1164            for (INodeDirectory subDir : snapshotDirs) {
1165              // make sure we only save the subtree under a reference node once
1166              boolean toSave = subDir.getParentReference() != null ? 
1167                  referenceMap.toProcessSubtree(subDir.getId()) : true;
1168              saveImage(subDir, out, toSave, true, counter);
1169            }
1170          }
1171        }
1172    
1173        /**
1174         * Saves inode and increments progress counter.
1175         * 
1176         * @param inode INode to save
1177         * @param out DataOutputStream to receive inode
1178         * @param writeUnderConstruction boolean true if this is under construction
1179         * @param referenceMap ReferenceMap containing reference inodes
1180         * @param counter Counter to increment for namenode startup progress
1181         * @throws IOException thrown if there is an I/O error
1182         */
1183        private void saveINode2Image(INode inode, DataOutputStream out,
1184            boolean writeUnderConstruction, ReferenceMap referenceMap,
1185            Counter counter) throws IOException {
1186          FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1187            referenceMap);
1188          // Intentionally do not increment counter for reference inodes, because it
1189          // is too difficult at this point to assess whether or not this is a
1190          // reference that counts toward quota.
1191          if (!(inode instanceof INodeReference)) {
1192            counter.increment();
1193          }
1194        }
1195      }
1196    }