001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.namenode;
019
020 import static org.apache.hadoop.util.Time.now;
021
022 import java.io.DataInput;
023 import java.io.DataInputStream;
024 import java.io.DataOutputStream;
025 import java.io.File;
026 import java.io.FileInputStream;
027 import java.io.FileNotFoundException;
028 import java.io.FileOutputStream;
029 import java.io.IOException;
030 import java.security.DigestInputStream;
031 import java.security.DigestOutputStream;
032 import java.security.MessageDigest;
033 import java.util.ArrayList;
034 import java.util.Arrays;
035 import java.util.HashMap;
036 import java.util.List;
037 import java.util.Map;
038
039 import org.apache.commons.logging.Log;
040 import org.apache.hadoop.HadoopIllegalArgumentException;
041 import org.apache.hadoop.classification.InterfaceAudience;
042 import org.apache.hadoop.classification.InterfaceStability;
043 import org.apache.hadoop.conf.Configuration;
044 import org.apache.hadoop.fs.FileSystem;
045 import org.apache.hadoop.fs.Path;
046 import org.apache.hadoop.fs.PathIsNotDirectoryException;
047 import org.apache.hadoop.fs.UnresolvedLinkException;
048 import org.apache.hadoop.fs.permission.PermissionStatus;
049 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
050 import org.apache.hadoop.hdfs.protocol.LayoutVersion;
051 import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
052 import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
053 import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
054 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
055 import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
056 import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList;
057 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
058 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectoryWithSnapshot;
059 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileUnderConstructionWithSnapshot;
060 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
061 import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
062 import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat;
063 import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotFSImageFormat.ReferenceMap;
064 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
065 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
066 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
067 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
068 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
069 import org.apache.hadoop.hdfs.util.ReadOnlyList;
070 import org.apache.hadoop.io.MD5Hash;
071 import org.apache.hadoop.io.Text;
072
073 /**
074 * Contains inner classes for reading or writing the on-disk format for
075 * FSImages.
076 *
077 * In particular, the format of the FSImage looks like:
078 * <pre>
079 * FSImage {
080 * layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
081 * namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
082 * generationStampAtBlockIdSwitch:long, lastAllocatedBlockId:
083 * long transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
084 * numOfSnapshottableDirs: int,
085 * {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
086 * }
087 *
088 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
089 * INodeInfo of root, numberOfChildren of root: int
090 * [list of INodeInfo of root's children],
091 * [list of INodeDirectoryInfo of root's directory children]
092 * }
093 *
094 * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
095 * [list of INodeInfo of INodes in topological order]
096 * }
097 *
098 * INodeInfo {
099 * {
100 * localName: short + byte[]
101 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
102 * or
103 * {
104 * fullPath: byte[]
105 * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
106 * replicationFactor: short, modificationTime: long,
107 * accessTime: long, preferredBlockSize: long,
108 * numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
109 * {
110 * nsQuota: long, dsQuota: long,
111 * {
112 * isINodeSnapshottable: byte,
113 * isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
114 * } (when {@link Feature#SNAPSHOT} is supported),
115 * fsPermission: short, PermissionStatus
116 * } for INodeDirectory
117 * or
118 * {
119 * symlinkString, fsPermission: short, PermissionStatus
120 * } for INodeSymlink
121 * or
122 * {
123 * [list of BlockInfo]
124 * [list of FileDiff]
125 * {
126 * isINodeFileUnderConstructionSnapshot: byte,
127 * {clientName: short + byte[], clientMachine: short + byte[]} (when
128 * isINodeFileUnderConstructionSnapshot is true),
129 * } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
130 * fsPermission: short, PermissionStatus
131 * } for INodeFile
132 * }
133 *
134 * INodeDirectoryInfo {
135 * fullPath of the directory: short + byte[],
136 * numberOfChildren: int, [list of INodeInfo of children INode],
137 * {
138 * numberOfSnapshots: int,
139 * [list of Snapshot] (when NumberOfSnapshots is positive),
140 * numberOfDirectoryDiffs: int,
 * [list of DirectoryDiff] (when numberOfDirectoryDiffs is positive),
142 * number of children that are directories,
143 * [list of INodeDirectoryInfo of the directory children] (includes
144 * snapshot copies of deleted sub-directories)
145 * } (when {@link Feature#SNAPSHOT} is supported),
146 * }
147 *
148 * Snapshot {
149 * snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
150 * the name of the snapshot)
151 * }
152 *
153 * DirectoryDiff {
154 * full path of the root of the associated Snapshot: short + byte[],
155 * childrenSize: int,
156 * isSnapshotRoot: byte,
157 * snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
158 * snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
159 * }
160 *
161 * Diff {
162 * createdListSize: int, [Local name of INode in created list],
163 * deletedListSize: int, [INode in deleted list: INodeInfo]
164 * }
165 *
166 * FileDiff {
167 * full path of the root of the associated Snapshot: short + byte[],
168 * fileSize: long,
169 * snapshotINodeIsNotNull: byte,
170 * snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
171 * }
172 * </pre>
173 */
174 @InterfaceAudience.Private
175 @InterfaceStability.Evolving
176 public class FSImageFormat {
177 private static final Log LOG = FSImage.LOG;
178
179 // Static-only class
180 private FSImageFormat() {}
181
182 /**
183 * A one-shot class responsible for loading an image. The load() function
184 * should be called once, after which the getter methods may be used to retrieve
185 * information about the image that was loaded, if loading was successful.
186 */
  public static class Loader {
    /** Configuration, used e.g. to resolve the image compression codec. */
    private final Configuration conf;
    /** which namesystem this loader is working for */
    private final FSNamesystem namesystem;

    /** Set to true once a file has been loaded using this loader. */
    private boolean loaded = false;

    /** The transaction ID of the last edit represented by the loaded file */
    private long imgTxId;
    /** The MD5 sum of the loaded file */
    private MD5Hash imgDigest;

    /** Snapshot id -> Snapshot, populated during load() when the image
     *  layout supports snapshots; null otherwise. */
    private Map<Integer, Snapshot> snapshotMap = null;
    /** Tracks inode references so a shared subtree is processed only once. */
    private final ReferenceMap referenceMap = new ReferenceMap();

    /**
     * @param conf configuration for this loader
     * @param namesystem the namesystem whose directory tree will be populated
     */
    Loader(Configuration conf, FSNamesystem namesystem) {
      this.conf = conf;
      this.namesystem = namesystem;
    }
207
208 /**
209 * Return the MD5 checksum of the image that has been loaded.
210 * @throws IllegalStateException if load() has not yet been called.
211 */
212 MD5Hash getLoadedImageMd5() {
213 checkLoaded();
214 return imgDigest;
215 }
216
217 long getLoadedImageTxId() {
218 checkLoaded();
219 return imgTxId;
220 }
221
222 /**
223 * Throw IllegalStateException if load() has not yet been called.
224 */
225 private void checkLoaded() {
226 if (!loaded) {
227 throw new IllegalStateException("Image not yet loaded!");
228 }
229 }
230
231 /**
232 * Throw IllegalStateException if load() has already been called.
233 */
234 private void checkNotLoaded() {
235 if (loaded) {
236 throw new IllegalStateException("Image already loaded!");
237 }
238 }
239
    /**
     * Load the given fsimage file into this loader's namesystem.
     * May be called at most once per Loader; afterwards the image digest
     * and transaction ID are available via the getters. The on-disk header
     * fields must be consumed in exactly the order written by the saver.
     *
     * @param curFile fsimage file on local disk
     * @throws IOException on a corrupt/truncated file or layout mismatch
     * @throws IllegalStateException if an image was already loaded
     */
    void load(File curFile) throws IOException {
      checkNotLoaded();
      assert curFile != null : "curFile is null";

      StartupProgress prog = NameNode.getStartupProgress();
      Step step = new Step(StepType.INODES);
      prog.beginStep(Phase.LOADING_FSIMAGE, step);
      long startTime = now();

      //
      // Load in bits
      //
      MessageDigest digester = MD5Hash.getDigester();
      // every byte read from the file flows through the digester, so the
      // image checksum is computed as a side effect of loading
      DigestInputStream fin = new DigestInputStream(
          new FileInputStream(curFile), digester);

      DataInputStream in = new DataInputStream(fin);
      try {
        // read image version: first appeared in version -1
        int imgVersion = in.readInt();
        if (getLayoutVersion() != imgVersion) {
          throw new InconsistentFSStateException(curFile,
              "imgVersion " + imgVersion +
              " expected to be " + getLayoutVersion());
        }
        boolean supportSnapshot = LayoutVersion.supports(Feature.SNAPSHOT,
            imgVersion);

        // read namespaceID: first appeared in version -2
        in.readInt();

        long numFiles = in.readLong();

        // read in the last generation stamp for legacy blocks.
        long genstamp = in.readLong();
        namesystem.setGenerationStampV1(genstamp);

        if (LayoutVersion.supports(Feature.SEQUENTIAL_BLOCK_ID, imgVersion)) {
          // read the starting generation stamp for sequential block IDs
          genstamp = in.readLong();
          namesystem.setGenerationStampV2(genstamp);

          // read the last generation stamp for blocks created after
          // the switch to sequential block IDs.
          long stampAtIdSwitch = in.readLong();
          namesystem.setGenerationStampV1Limit(stampAtIdSwitch);

          // read the max sequential block ID.
          long maxSequentialBlockId = in.readLong();
          namesystem.setLastAllocatedBlockId(maxSequentialBlockId);
        } else {
          // image predates sequential block IDs: nothing more to read here;
          // initialize the V2 stamp from the namesystem instead
          long startingGenStamp = namesystem.upgradeGenerationStampToV2();
          // This is an upgrade.
          LOG.info("Upgrading to sequential block IDs. Generation stamp " +
                   "for new blocks set to " + startingGenStamp);
        }

        // read the transaction ID of the last edit represented by
        // this image
        if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) {
          imgTxId = in.readLong();
        } else {
          imgTxId = 0;
        }

        // read the last allocated inode id in the fsimage
        if (LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion)) {
          long lastInodeId = in.readLong();
          namesystem.resetLastInodeId(lastInodeId);
          if (LOG.isDebugEnabled()) {
            LOG.debug("load last allocated InodeId from fsimage:" + lastInodeId);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Old layout version doesn't have inode id."
                + " Will assign new id for each inode.");
          }
        }

        // snapshot table must be read before the inode tree, since inode
        // loading resolves snapshot ids through snapshotMap
        if (supportSnapshot) {
          snapshotMap = namesystem.getSnapshotManager().read(in, this);
        }

        // read compression related info
        FSImageCompression compression;
        if (LayoutVersion.supports(Feature.FSIMAGE_COMPRESSION, imgVersion)) {
          compression = FSImageCompression.readCompressionHeader(conf, in);
        } else {
          compression = FSImageCompression.createNoopCompression();
        }
        // everything after the header may be compressed: re-wrap the raw
        // stream (still backed by fin, so the digest keeps accumulating)
        in = compression.unwrapInputStream(fin);

        LOG.info("Loading image file " + curFile + " using " + compression);

        // load all inodes
        LOG.info("Number of files = " + numFiles);
        prog.setTotal(Phase.LOADING_FSIMAGE, step, numFiles);
        Counter counter = prog.getCounter(Phase.LOADING_FSIMAGE, step);
        if (LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
            imgVersion)) {
          if (supportSnapshot) {
            loadLocalNameINodesWithSnapshot(numFiles, in, counter);
          } else {
            loadLocalNameINodes(numFiles, in, counter);
          }
        } else {
          loadFullNameINodes(numFiles, in, counter);
        }

        loadFilesUnderConstruction(in, supportSnapshot, counter);
        prog.endStep(Phase.LOADING_FSIMAGE, step);
        // Now that the step is finished, set counter equal to total to adjust
        // for possible under-counting due to reference inodes.
        prog.setCount(Phase.LOADING_FSIMAGE, step, numFiles);

        loadSecretManagerState(in);

        // make sure to read to the end of file
        boolean eof = (in.read() == -1);
        assert eof : "Should have reached the end of image file " + curFile;
      } finally {
        // "in" may have been reassigned to the decompressing wrapper above;
        // close whichever stream is currently outermost
        in.close();
      }

      imgDigest = new MD5Hash(digester.digest());
      loaded = true;

      LOG.info("Image file " + curFile + " of size " + curFile.length() +
          " bytes loaded in " + (now() - startTime)/1000 + " seconds.");
    }
370
371 /** Update the root node's attributes */
372 private void updateRootAttr(INodeWithAdditionalFields root) {
373 long nsQuota = root.getNsQuota();
374 long dsQuota = root.getDsQuota();
375 FSDirectory fsDir = namesystem.dir;
376 if (nsQuota != -1 || dsQuota != -1) {
377 fsDir.rootDir.setQuota(nsQuota, dsQuota);
378 }
379 fsDir.rootDir.cloneModificationTime(root);
380 fsDir.rootDir.clonePermissionStatus(root);
381 }
382
    /**
     * Load fsimage files when 1) only local names are stored,
     * and 2) snapshot is supported.
     *
     * @param numFiles number of files expected to be read
     * @param in Image input stream
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadLocalNameINodesWithSnapshot(long numFiles, DataInput in,
        Counter counter) throws IOException {
      assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
          getLayoutVersion());
      assert LayoutVersion.supports(Feature.SNAPSHOT, getLayoutVersion());

      // load root: the root inode is serialized first
      loadRoot(in, counter);
      // load rest of the nodes recursively, directory by directory,
      // including each directory's snapshot data
      loadDirectoryWithSnapshot(in, counter);
    }
402
403 /**
404 * load fsimage files assuming only local names are stored
405 *
406 * @param numFiles number of files expected to be read
407 * @param in image input stream
408 * @param counter Counter to increment for namenode startup progress
409 * @throws IOException
410 */
411 private void loadLocalNameINodes(long numFiles, DataInput in, Counter counter)
412 throws IOException {
413 assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
414 getLayoutVersion());
415 assert numFiles > 0;
416
417 // load root
418 loadRoot(in, counter);
419 // have loaded the first file (the root)
420 numFiles--;
421
422 // load rest of the nodes directory by directory
423 while (numFiles > 0) {
424 numFiles -= loadDirectory(in, counter);
425 }
426 if (numFiles != 0) {
427 throw new IOException("Read unexpect number of files: " + -numFiles);
428 }
429 }
430
431 /**
432 * Load information about root, and use the information to update the root
433 * directory of NameSystem.
434 * @param in The {@link DataInput} instance to read.
435 * @param counter Counter to increment for namenode startup progress
436 */
437 private void loadRoot(DataInput in, Counter counter)
438 throws IOException {
439 // load root
440 if (in.readShort() != 0) {
441 throw new IOException("First node is not root");
442 }
443 final INodeDirectory root = loadINode(null, false, in, counter)
444 .asDirectory();
445 // update the root's attributes
446 updateRootAttr(root);
447 }
448
449 /** Load children nodes for the parent directory. */
450 private int loadChildren(INodeDirectory parent, DataInput in,
451 Counter counter) throws IOException {
452 int numChildren = in.readInt();
453 for (int i = 0; i < numChildren; i++) {
454 // load single inode
455 INode newNode = loadINodeWithLocalName(false, in, true, counter);
456 addToParent(parent, newNode);
457 }
458 return numChildren;
459 }
460
    /**
     * Load a directory when snapshot is supported.
     * The serialized order is fixed: parent inode id, snapshot count/list,
     * children, directory-diff list, then sub-directories recursively;
     * reads must follow that order exactly.
     * @param in The {@link DataInput} instance to read.
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadDirectoryWithSnapshot(DataInput in, Counter counter)
        throws IOException {
      // Step 1. Identify the parent INode
      long inodeId = in.readLong();
      final INodeDirectory parent = this.namesystem.dir.getInode(inodeId)
          .asDirectory();

      // Check if the whole subtree has been saved (for reference nodes)
      // NOTE(review): returning here reads nothing further for this entry —
      // presumably the saver also skipped the subtree body; verify against
      // ReferenceMap's save-side logic.
      boolean toLoadSubtree = referenceMap.toProcessSubtree(parent.getId());
      if (!toLoadSubtree) {
        return;
      }

      // Step 2. Load snapshots if parent is snapshottable
      // (a negative count means the directory is not snapshottable)
      int numSnapshots = in.readInt();
      if (numSnapshots >= 0) {
        final INodeDirectorySnapshottable snapshottableParent
            = INodeDirectorySnapshottable.valueOf(parent, parent.getLocalName());
        // load snapshots and snapshotQuota
        SnapshotFSImageFormat.loadSnapshotList(snapshottableParent,
            numSnapshots, in, this);
        if (snapshottableParent.getSnapshotQuota() > 0) {
          // add the directory to the snapshottable directory list in
          // SnapshotManager. Note that we only add root when its snapshot quota
          // is positive.
          this.namesystem.getSnapshotManager().addSnapshottable(
              snapshottableParent);
        }
      }

      // Step 3. Load children nodes under parent
      loadChildren(parent, in, counter);

      // Step 4. load Directory Diff List
      SnapshotFSImageFormat.loadDirectoryDiffList(parent, in, this);

      // Recursively load sub-directories, including snapshot copies of deleted
      // directories
      int numSubTree = in.readInt();
      for (int i = 0; i < numSubTree; i++) {
        loadDirectoryWithSnapshot(in, counter);
      }
    }
509
510 /**
511 * Load all children of a directory
512 *
513 * @param in
514 * @param counter Counter to increment for namenode startup progress
515 * @return number of child inodes read
516 * @throws IOException
517 */
518 private int loadDirectory(DataInput in, Counter counter) throws IOException {
519 String parentPath = FSImageSerialization.readString(in);
520 final INodeDirectory parent = INodeDirectory.valueOf(
521 namesystem.dir.rootDir.getNode(parentPath, true), parentPath);
522 return loadChildren(parent, in, counter);
523 }
524
    /**
     * load fsimage files assuming full path names are stored
     *
     * Inodes appear in topological order (parents before children), so
     * consecutive entries frequently share a parent; the resolved parent
     * directory is cached across iterations to avoid repeated lookups.
     *
     * @param numFiles total number of files to load
     * @param in data input stream
     * @param counter Counter to increment for namenode startup progress
     * @throws IOException if any error occurs
     */
    private void loadFullNameINodes(long numFiles, DataInput in, Counter counter)
        throws IOException {
      byte[][] pathComponents;
      byte[][] parentPath = {{}};
      FSDirectory fsDir = namesystem.dir;
      INodeDirectory parentINode = fsDir.rootDir;
      for (long i = 0; i < numFiles; i++) {
        pathComponents = FSImageSerialization.readPathComponents(in);
        // the last component is the inode's own local name
        final INode newNode = loadINode(
            pathComponents[pathComponents.length-1], false, in, counter);

        if (isRoot(pathComponents)) { // it is the root
          // update the root's attributes
          updateRootAttr(newNode.asDirectory());
          continue;
        }
        // check if the new inode belongs to the same parent
        if(!isParent(pathComponents, parentPath)) {
          // cache miss: resolve and remember the new parent
          parentINode = getParentINodeDirectory(pathComponents);
          parentPath = getParent(pathComponents);
        }

        // add new inode
        addToParent(parentINode, newNode);
      }
    }
559
560 private INodeDirectory getParentINodeDirectory(byte[][] pathComponents
561 ) throws FileNotFoundException, PathIsNotDirectoryException,
562 UnresolvedLinkException {
563 if (pathComponents.length < 2) { // root
564 return null;
565 }
566 // Gets the parent INode
567 final INodesInPath inodes = namesystem.dir.getExistingPathINodes(
568 pathComponents);
569 return INodeDirectory.valueOf(inodes.getINode(-2), pathComponents);
570 }
571
572 /**
573 * Add the child node to parent and, if child is a file, update block map.
574 * This method is only used for image loading so that synchronization,
575 * modification time update and space count update are not needed.
576 */
577 private void addToParent(INodeDirectory parent, INode child) {
578 FSDirectory fsDir = namesystem.dir;
579 if (parent == fsDir.rootDir && FSDirectory.isReservedName(child)) {
580 throw new HadoopIllegalArgumentException("File name \""
581 + child.getLocalName() + "\" is reserved. Please "
582 + " change the name of the existing file or directory to another "
583 + "name before upgrading to this release.");
584 }
585 // NOTE: This does not update space counts for parents
586 if (!parent.addChild(child)) {
587 return;
588 }
589 namesystem.dir.cacheName(child);
590
591 if (child.isFile()) {
592 updateBlocksMap(child.asFile());
593 }
594 }
595
596 public void updateBlocksMap(INodeFile file) {
597 // Add file->block mapping
598 final BlockInfo[] blocks = file.getBlocks();
599 if (blocks != null) {
600 final BlockManager bm = namesystem.getBlockManager();
601 for (int i = 0; i < blocks.length; i++) {
602 file.setBlock(i, bm.addBlockCollection(blocks[i], file));
603 }
604 }
605 }
606
    /** @return The FSDirectory of the namesystem where the fsimage is loaded */
    public FSDirectory getFSDirectoryInLoading() {
      return namesystem.dir;
    }
611
    /**
     * Convenience overload of
     * {@link #loadINodeWithLocalName(boolean, DataInput, boolean, Counter)}
     * for call sites that have no startup-progress counter.
     */
    public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
        boolean updateINodeMap) throws IOException {
      return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
    }
616
617 public INode loadINodeWithLocalName(boolean isSnapshotINode,
618 DataInput in, boolean updateINodeMap, Counter counter)
619 throws IOException {
620 final byte[] localName = FSImageSerialization.readLocalName(in);
621 INode inode = loadINode(localName, isSnapshotINode, in, counter);
622 if (updateINodeMap
623 && LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
624 namesystem.dir.addToInodeMap(inode);
625 }
626 return inode;
627 }
628
    /**
     * load an inode from fsimage except for its name
     *
     * The kind of inode is selected by the numberOfBlocks field:
     * >= 0 file, -1 directory, -2 symlink, -3 reference.
     *
     * @param localName local name of the inode (null for the root)
     * @param isSnapshotINode whether this is a snapshot copy of an inode
     * @param in data input stream from which image is read
     * @param counter Counter to increment for namenode startup progress
     * @return an inode
     * @throws IOException on an unknown inode type or stream error
     */
    @SuppressWarnings("deprecation")
    INode loadINode(final byte[] localName, boolean isSnapshotINode,
        DataInput in, Counter counter) throws IOException {
      final int imgVersion = getLayoutVersion();
      if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
        namesystem.getFSDirectory().verifyINodeName(localName);
      }

      // older layouts do not store inode ids; allocate fresh ones
      long inodeId = LayoutVersion.supports(Feature.ADD_INODE_ID, imgVersion) ?
          in.readLong() : namesystem.allocateNewInodeId();

      final short replication = namesystem.getBlockManager().adjustReplication(
          in.readShort());
      final long modificationTime = in.readLong();
      long atime = 0;
      if (LayoutVersion.supports(Feature.FILE_ACCESS_TIME, imgVersion)) {
        atime = in.readLong();
      }
      final long blockSize = in.readLong();
      final int numBlocks = in.readInt();

      if (numBlocks >= 0) {
        // file

        // read blocks
        BlockInfo[] blocks = null;
        // NOTE(review): this inner check is redundant — numBlocks >= 0
        // already holds in this branch, so blocks is always non-null below.
        if (numBlocks >= 0) {
          blocks = new BlockInfo[numBlocks];
          for (int j = 0; j < numBlocks; j++) {
            blocks[j] = new BlockInfo(replication);
            blocks[j].readFields(in);
          }
        }

        String clientName = "";
        String clientMachine = "";
        boolean underConstruction = false;
        FileDiffList fileDiffs = null;
        if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
          // read diffs
          fileDiffs = SnapshotFSImageFormat.loadFileDiffList(in, this);

          if (isSnapshotINode) {
            underConstruction = in.readBoolean();
            if (underConstruction) {
              clientName = FSImageSerialization.readString(in);
              clientMachine = FSImageSerialization.readString(in);
              // convert the last block to BlockUC
              if (blocks != null && blocks.length > 0) {
                BlockInfo lastBlk = blocks[blocks.length - 1];
                blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
                    lastBlk, replication);
              }
            }
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        // return
        if (counter != null) {
          counter.increment();
        }
        final INodeFile file = new INodeFile(inodeId, localName, permissions,
            modificationTime, atime, blocks, replication, blockSize);
        // wrap in under-construction and/or snapshot variants as needed
        if (underConstruction) {
          INodeFileUnderConstruction fileUC = new INodeFileUnderConstruction(
              file, clientName, clientMachine, null);
          return fileDiffs == null ? fileUC :
              new INodeFileUnderConstructionWithSnapshot(fileUC, fileDiffs);
        } else {
          return fileDiffs == null ? file :
              new INodeFileWithSnapshot(file, fileDiffs);
        }
      } else if (numBlocks == -1) {
        //directory

        //read quotas
        final long nsQuota = in.readLong();
        long dsQuota = -1L;
        if (LayoutVersion.supports(Feature.DISKSPACE_QUOTA, imgVersion)) {
          dsQuota = in.readLong();
        }

        //read snapshot info
        boolean snapshottable = false;
        boolean withSnapshot = false;
        if (LayoutVersion.supports(Feature.SNAPSHOT, imgVersion)) {
          snapshottable = in.readBoolean();
          if (!snapshottable) {
            withSnapshot = in.readBoolean();
          }
        }

        final PermissionStatus permissions = PermissionStatus.read(in);

        //return
        if (counter != null) {
          counter.increment();
        }
        // any explicit quota requires the quota-tracking subclass
        final INodeDirectory dir = nsQuota >= 0 || dsQuota >= 0?
            new INodeDirectoryWithQuota(inodeId, localName, permissions,
                modificationTime, nsQuota, dsQuota)
            : new INodeDirectory(inodeId, localName, permissions, modificationTime);
        return snapshottable ? new INodeDirectorySnapshottable(dir)
            : withSnapshot ? new INodeDirectoryWithSnapshot(dir)
            : dir;
      } else if (numBlocks == -2) {
        //symlink
        if (!FileSystem.isSymlinksEnabled()) {
          throw new IOException("Symlinks not supported - please remove symlink before upgrading to this version of HDFS");
        }

        final String symlink = Text.readString(in);
        final PermissionStatus permissions = PermissionStatus.read(in);
        if (counter != null) {
          counter.increment();
        }
        return new INodeSymlink(inodeId, localName, permissions,
            modificationTime, atime, symlink);
      } else if (numBlocks == -3) {
        //reference
        // Intentionally do not increment counter, because it is too difficult at
        // this point to assess whether or not this is a reference that counts
        // toward quota.

        final boolean isWithName = in.readBoolean();
        // lastSnapshotId for WithName node, dstSnapshotId for DstReference node
        int snapshotId = in.readInt();

        final INodeReference.WithCount withCount
            = referenceMap.loadINodeReferenceWithCount(isSnapshotINode, in, this);

        if (isWithName) {
          return new INodeReference.WithName(null, withCount, localName,
              snapshotId);
        } else {
          final INodeReference ref = new INodeReference.DstReference(null,
              withCount, snapshotId);
          return ref;
        }
      }

      throw new IOException("Unknown inode type: numBlocks=" + numBlocks);
    }
781
782 /** Load {@link INodeFileAttributes}. */
783 public INodeFileAttributes loadINodeFileAttributes(DataInput in)
784 throws IOException {
785 final int layoutVersion = getLayoutVersion();
786
787 if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
788 return loadINodeWithLocalName(true, in, false).asFile();
789 }
790
791 final byte[] name = FSImageSerialization.readLocalName(in);
792 final PermissionStatus permissions = PermissionStatus.read(in);
793 final long modificationTime = in.readLong();
794 final long accessTime = in.readLong();
795
796 final short replication = namesystem.getBlockManager().adjustReplication(
797 in.readShort());
798 final long preferredBlockSize = in.readLong();
799
800 return new INodeFileAttributes.SnapshotCopy(name, permissions, modificationTime,
801 accessTime, replication, preferredBlockSize);
802 }
803
804 public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)
805 throws IOException {
806 final int layoutVersion = getLayoutVersion();
807
808 if (!LayoutVersion.supports(Feature.OPTIMIZE_SNAPSHOT_INODES, layoutVersion)) {
809 return loadINodeWithLocalName(true, in, false).asDirectory();
810 }
811
812 final byte[] name = FSImageSerialization.readLocalName(in);
813 final PermissionStatus permissions = PermissionStatus.read(in);
814 final long modificationTime = in.readLong();
815
816 //read quotas
817 final long nsQuota = in.readLong();
818 final long dsQuota = in.readLong();
819
820 return nsQuota == -1L && dsQuota == -1L?
821 new INodeDirectoryAttributes.SnapshotCopy(name, permissions, modificationTime)
822 : new INodeDirectoryAttributes.CopyWithQuota(name, permissions,
823 modificationTime, nsQuota, dsQuota);
824 }
825
    /**
     * Load the files-under-construction section of the image and splice
     * each under-construction inode back into the already-loaded namespace,
     * replacing the corresponding plain INodeFile.
     *
     * @param in image input stream
     * @param supportSnapshot whether the layout supports snapshots
     * @param counter Counter to increment for namenode startup progress
     */
    private void loadFilesUnderConstruction(DataInput in,
        boolean supportSnapshot, Counter counter) throws IOException {
      FSDirectory fsDir = namesystem.dir;
      int size = in.readInt();

      LOG.info("Number of files under construction = " + size);

      for (int i = 0; i < size; i++) {
        INodeFileUnderConstruction cons = FSImageSerialization
            .readINodeUnderConstruction(in, namesystem, getLayoutVersion());
        counter.increment();

        // verify that file exists in namespace
        String path = cons.getLocalName();
        INodeFile oldnode = null;
        boolean inSnapshot = false;
        if (path != null && FSDirectory.isReservedName(path) &&
            LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
          // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
          // snapshot. If we support INode ID in the layout version, we can use
          // the inode id to find the oldnode.
          oldnode = namesystem.dir.getInode(cons.getId()).asFile();
          inSnapshot = true;
        } else {
          // regular case: resolve by path
          final INodesInPath iip = fsDir.getLastINodeInPath(path);
          oldnode = INodeFile.valueOf(iip.getINode(0), path);
        }

        // carry over the name and parent linkage from the existing inode
        cons.setLocalName(oldnode.getLocalNameBytes());
        INodeReference parentRef = oldnode.getParentReference();
        if (parentRef != null) {
          cons.setParentReference(parentRef);
        } else {
          cons.setParent(oldnode.getParent());
        }

        // preserve the file's diff list when the old inode had snapshots
        if (oldnode instanceof INodeFileWithSnapshot) {
          cons = new INodeFileUnderConstructionWithSnapshot(cons,
              ((INodeFileWithSnapshot) oldnode).getDiffs());
        }

        if (!inSnapshot) {
          // live file: swap it in and re-establish the client's lease
          fsDir.replaceINodeFile(path, oldnode, cons);
          namesystem.leaseManager.addLease(cons.getClientName(), path);
        } else {
          if (parentRef != null) {
            // replace oldnode with cons
            parentRef.setReferredINode(cons);
          } else {
            // replace old node in its parent's children list and deleted list
            oldnode.getParent().replaceChildFileInSnapshot(oldnode, cons);
            namesystem.dir.addToInodeMap(cons);
            updateBlocksMap(cons);
          }
        }
      }
    }
883
884 private void loadSecretManagerState(DataInput in)
885 throws IOException {
886 int imgVersion = getLayoutVersion();
887
888 if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) {
889 //SecretManagerState is not available.
890 //This must not happen if security is turned on.
891 return;
892 }
893 namesystem.loadSecretManagerState(in);
894 }
895
    /**
     * @return the on-disk layout version recorded in the storage backing the
     *         current image (not necessarily the software's latest version).
     */
    private int getLayoutVersion() {
      return namesystem.getFSImage().getStorage().getLayoutVersion();
    }
899
900 private boolean isRoot(byte[][] path) {
901 return path.length == 1 &&
902 path[0] == null;
903 }
904
905 private boolean isParent(byte[][] path, byte[][] parent) {
906 if (path == null || parent == null)
907 return false;
908 if (parent.length == 0 || path.length != parent.length + 1)
909 return false;
910 boolean isParent = true;
911 for (int i = 0; i < parent.length; i++) {
912 isParent = isParent && Arrays.equals(path[i], parent[i]);
913 }
914 return isParent;
915 }
916
917 /**
918 * Return string representing the parent of the given path.
919 */
920 String getParent(String path) {
921 return path.substring(0, path.lastIndexOf(Path.SEPARATOR));
922 }
923
924 byte[][] getParent(byte[][] path) {
925 byte[][] result = new byte[path.length - 1][];
926 for (int i = 0; i < result.length; i++) {
927 result[i] = new byte[path[i].length];
928 System.arraycopy(path[i], 0, result[i], 0, path[i].length);
929 }
930 return result;
931 }
932
933 public Snapshot getSnapshot(DataInput in) throws IOException {
934 return snapshotMap.get(in.readInt());
935 }
936 }
937
938 /**
939 * A one-shot class responsible for writing an image file.
940 * The write() function should be called once, after which the getter
941 * functions may be used to retrieve information about the file that was written.
942 */
943 static class Saver {
944 private final SaveNamespaceContext context;
945 /** Set to true once an image has been written */
946 private boolean saved = false;
947
948 /** The MD5 checksum of the file that was written */
949 private MD5Hash savedDigest;
950 private final ReferenceMap referenceMap = new ReferenceMap();
951
952 private final Map<Long, INodeFileUnderConstruction> snapshotUCMap =
953 new HashMap<Long, INodeFileUnderConstruction>();
954
955 /** @throws IllegalStateException if the instance has not yet saved an image */
956 private void checkSaved() {
957 if (!saved) {
958 throw new IllegalStateException("FSImageSaver has not saved an image");
959 }
960 }
961
962 /** @throws IllegalStateException if the instance has already saved an image */
963 private void checkNotSaved() {
964 if (saved) {
965 throw new IllegalStateException("FSImageSaver has already saved an image");
966 }
967 }
968
969
970 Saver(SaveNamespaceContext context) {
971 this.context = context;
972 }
973
974 /**
975 * Return the MD5 checksum of the image file that was saved.
976 */
977 MD5Hash getSavedDigest() {
978 checkSaved();
979 return savedDigest;
980 }
981
982 void save(File newFile, FSImageCompression compression) throws IOException {
983 checkNotSaved();
984
985 final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
986 FSDirectory fsDir = sourceNamesystem.dir;
987 String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
988 Step step = new Step(StepType.INODES, sdPath);
989 StartupProgress prog = NameNode.getStartupProgress();
990 prog.beginStep(Phase.SAVING_CHECKPOINT, step);
991 prog.setTotal(Phase.SAVING_CHECKPOINT, step,
992 fsDir.rootDir.numItemsInTree());
993 Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
994 long startTime = now();
995 //
996 // Write out data
997 //
998 MessageDigest digester = MD5Hash.getDigester();
999 FileOutputStream fout = new FileOutputStream(newFile);
1000 DigestOutputStream fos = new DigestOutputStream(fout, digester);
1001 DataOutputStream out = new DataOutputStream(fos);
1002 try {
1003 out.writeInt(HdfsConstants.LAYOUT_VERSION);
1004 // We use the non-locked version of getNamespaceInfo here since
1005 // the coordinating thread of saveNamespace already has read-locked
1006 // the namespace for us. If we attempt to take another readlock
1007 // from the actual saver thread, there's a potential of a
1008 // fairness-related deadlock. See the comments on HDFS-2223.
1009 out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
1010 .getNamespaceID());
1011 out.writeLong(fsDir.rootDir.numItemsInTree());
1012 out.writeLong(sourceNamesystem.getGenerationStampV1());
1013 out.writeLong(sourceNamesystem.getGenerationStampV2());
1014 out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
1015 out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
1016 out.writeLong(context.getTxId());
1017 out.writeLong(sourceNamesystem.getLastInodeId());
1018
1019
1020 sourceNamesystem.getSnapshotManager().write(out);
1021
1022 // write compression info and set up compressed stream
1023 out = compression.writeHeaderAndWrapStream(fos);
1024 LOG.info("Saving image file " + newFile +
1025 " using " + compression);
1026
1027 // save the root
1028 saveINode2Image(fsDir.rootDir, out, false, referenceMap, counter);
1029 // save the rest of the nodes
1030 saveImage(fsDir.rootDir, out, true, false, counter);
1031 prog.endStep(Phase.SAVING_CHECKPOINT, step);
1032 // Now that the step is finished, set counter equal to total to adjust
1033 // for possible under-counting due to reference inodes.
1034 prog.setCount(Phase.SAVING_CHECKPOINT, step,
1035 fsDir.rootDir.numItemsInTree());
1036 // save files under construction
1037 // TODO: for HDFS-5428, since we cannot break the compatibility of
1038 // fsimage, we store part of the under-construction files that are only
1039 // in snapshots in this "under-construction-file" section. As a
1040 // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
1041 // paths, so that when loading fsimage we do not put them into the lease
1042 // map. In the future, we can remove this hack when we can bump the
1043 // layout version.
1044 sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
1045
1046 context.checkCancelled();
1047 sourceNamesystem.saveSecretManagerState(out, sdPath);
1048 context.checkCancelled();
1049 out.flush();
1050 context.checkCancelled();
1051 fout.getChannel().force(true);
1052 } finally {
1053 out.close();
1054 }
1055
1056 saved = true;
1057 // set md5 of the saved image
1058 savedDigest = new MD5Hash(digester.digest());
1059
1060 LOG.info("Image file " + newFile + " of size " + newFile.length() +
1061 " bytes saved in " + (now() - startTime)/1000 + " seconds.");
1062 }
1063
1064 /**
1065 * Save children INodes.
1066 * @param children The list of children INodes
1067 * @param out The DataOutputStream to write
1068 * @param inSnapshot Whether the parent directory or its ancestor is in
1069 * the deleted list of some snapshot (caused by rename or
1070 * deletion)
1071 * @param counter Counter to increment for namenode startup progress
1072 * @return Number of children that are directory
1073 */
1074 private int saveChildren(ReadOnlyList<INode> children,
1075 DataOutputStream out, boolean inSnapshot, Counter counter)
1076 throws IOException {
1077 // Write normal children INode.
1078 out.writeInt(children.size());
1079 int dirNum = 0;
1080 int i = 0;
1081 for(INode child : children) {
1082 // print all children first
1083 // TODO: for HDFS-5428, we cannot change the format/content of fsimage
1084 // here, thus even if the parent directory is in snapshot, we still
1085 // do not handle INodeUC as those stored in deleted list
1086 saveINode2Image(child, out, false, referenceMap, counter);
1087 if (child.isDirectory()) {
1088 dirNum++;
1089 } else if (inSnapshot && child.isFile()
1090 && child.asFile().isUnderConstruction()) {
1091 this.snapshotUCMap.put(child.getId(),
1092 (INodeFileUnderConstruction) child.asFile());
1093 }
1094 if (i++ % 50 == 0) {
1095 context.checkCancelled();
1096 }
1097 }
1098 return dirNum;
1099 }
1100
1101 /**
1102 * Save file tree image starting from the given root.
1103 * This is a recursive procedure, which first saves all children and
1104 * snapshot diffs of a current directory and then moves inside the
1105 * sub-directories.
1106 *
1107 * @param current The current node
1108 * @param out The DataoutputStream to write the image
1109 * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
1110 * reference node, its subtree may already have been
1111 * saved before.
1112 * @param inSnapshot Whether the current directory is in snapshot
1113 * @param counter Counter to increment for namenode startup progress
1114 */
1115 private void saveImage(INodeDirectory current, DataOutputStream out,
1116 boolean toSaveSubtree, boolean inSnapshot, Counter counter)
1117 throws IOException {
1118 // write the inode id of the directory
1119 out.writeLong(current.getId());
1120
1121 if (!toSaveSubtree) {
1122 return;
1123 }
1124
1125 final ReadOnlyList<INode> children = current.getChildrenList(null);
1126 int dirNum = 0;
1127 List<INodeDirectory> snapshotDirs = null;
1128 if (current instanceof INodeDirectoryWithSnapshot) {
1129 snapshotDirs = new ArrayList<INodeDirectory>();
1130 ((INodeDirectoryWithSnapshot) current).getSnapshotDirectory(
1131 snapshotDirs);
1132 dirNum += snapshotDirs.size();
1133 }
1134
1135 // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
1136 // Snapshots
1137 if (current instanceof INodeDirectorySnapshottable) {
1138 INodeDirectorySnapshottable snapshottableNode =
1139 (INodeDirectorySnapshottable) current;
1140 SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
1141 } else {
1142 out.writeInt(-1); // # of snapshots
1143 }
1144
1145 // 3. Write children INode
1146 dirNum += saveChildren(children, out, inSnapshot, counter);
1147
1148 // 4. Write DirectoryDiff lists, if there is any.
1149 SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
1150
1151 // Write sub-tree of sub-directories, including possible snapshots of
1152 // deleted sub-directories
1153 out.writeInt(dirNum); // the number of sub-directories
1154 for(INode child : children) {
1155 if(!child.isDirectory()) {
1156 continue;
1157 }
1158 // make sure we only save the subtree under a reference node once
1159 boolean toSave = child.isReference() ?
1160 referenceMap.toProcessSubtree(child.getId()) : true;
1161 saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
1162 }
1163 if (snapshotDirs != null) {
1164 for (INodeDirectory subDir : snapshotDirs) {
1165 // make sure we only save the subtree under a reference node once
1166 boolean toSave = subDir.getParentReference() != null ?
1167 referenceMap.toProcessSubtree(subDir.getId()) : true;
1168 saveImage(subDir, out, toSave, true, counter);
1169 }
1170 }
1171 }
1172
1173 /**
1174 * Saves inode and increments progress counter.
1175 *
1176 * @param inode INode to save
1177 * @param out DataOutputStream to receive inode
1178 * @param writeUnderConstruction boolean true if this is under construction
1179 * @param referenceMap ReferenceMap containing reference inodes
1180 * @param counter Counter to increment for namenode startup progress
1181 * @throws IOException thrown if there is an I/O error
1182 */
1183 private void saveINode2Image(INode inode, DataOutputStream out,
1184 boolean writeUnderConstruction, ReferenceMap referenceMap,
1185 Counter counter) throws IOException {
1186 FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
1187 referenceMap);
1188 // Intentionally do not increment counter for reference inodes, because it
1189 // is too difficult at this point to assess whether or not this is a
1190 // reference that counts toward quota.
1191 if (!(inode instanceof INodeReference)) {
1192 counter.increment();
1193 }
1194 }
1195 }
1196 }