001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
021    import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
022    import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
023    import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
024    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
025    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
026    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
027    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
028    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
029    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
030    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
031    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
032    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
033    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
034    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
035    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
036    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
037    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
038    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
039    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
040    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
041    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
042    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
043    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
044    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
045    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
046    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
047    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
048    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
049    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
050    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
051    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
052    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
053    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
054    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
055    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
056    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
057    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
058    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
059    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
060    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
061    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
062    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
063    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
064    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
065    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
066    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
067    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
068    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
069    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
070    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
071    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
072    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
073    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
074    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
075    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
076    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
077    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
078    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
079    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
080    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
081    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
082    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
083    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
084    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
085    import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
086    import static org.apache.hadoop.util.Time.now;
087    
088    import java.io.BufferedWriter;
089    import java.io.ByteArrayInputStream;
090    import java.io.DataInput;
091    import java.io.DataInputStream;
092    import java.io.DataOutputStream;
093    import java.io.File;
094    import java.io.FileNotFoundException;
095    import java.io.FileOutputStream;
096    import java.io.IOException;
097    import java.io.OutputStreamWriter;
098    import java.io.PrintWriter;
099    import java.io.StringWriter;
100    import java.lang.management.ManagementFactory;
101    import java.net.InetAddress;
102    import java.net.URI;
103    import java.util.ArrayList;
104    import java.util.Arrays;
105    import java.util.Collection;
106    import java.util.Collections;
107    import java.util.Date;
108    import java.util.EnumSet;
109    import java.util.HashMap;
110    import java.util.HashSet;
111    import java.util.Iterator;
112    import java.util.LinkedHashSet;
113    import java.util.List;
114    import java.util.Map;
115    import java.util.Set;
116    import java.util.concurrent.TimeUnit;
117    import java.util.concurrent.locks.ReentrantReadWriteLock;
118    
119    import javax.management.NotCompliantMBeanException;
120    import javax.management.ObjectName;
121    import javax.management.StandardMBean;
122    
123    import org.apache.commons.logging.Log;
124    import org.apache.commons.logging.LogFactory;
125    import org.apache.hadoop.HadoopIllegalArgumentException;
126    import org.apache.hadoop.classification.InterfaceAudience;
127    import org.apache.hadoop.conf.Configuration;
128    import org.apache.hadoop.fs.ContentSummary;
129    import org.apache.hadoop.fs.CreateFlag;
130    import org.apache.hadoop.fs.DirectoryListingStartAfterNotFoundException;
131    import org.apache.hadoop.fs.FileAlreadyExistsException;
132    import org.apache.hadoop.fs.FileStatus;
133    import org.apache.hadoop.fs.FileSystem;
134    import org.apache.hadoop.fs.FsServerDefaults;
135    import org.apache.hadoop.fs.InvalidPathException;
136    import org.apache.hadoop.fs.Options;
137    import org.apache.hadoop.fs.Options.Rename;
138    import org.apache.hadoop.fs.ParentNotDirectoryException;
139    import org.apache.hadoop.fs.Path;
140    import org.apache.hadoop.fs.UnresolvedLinkException;
141    import org.apache.hadoop.fs.permission.FsAction;
142    import org.apache.hadoop.fs.permission.FsPermission;
143    import org.apache.hadoop.fs.permission.PermissionStatus;
144    import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
145    import org.apache.hadoop.ha.ServiceFailedException;
146    import org.apache.hadoop.hdfs.DFSConfigKeys;
147    import org.apache.hadoop.hdfs.DFSUtil;
148    import org.apache.hadoop.hdfs.HAUtil;
149    import org.apache.hadoop.hdfs.HdfsConfiguration;
150    import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
151    import org.apache.hadoop.hdfs.protocol.Block;
152    import org.apache.hadoop.hdfs.protocol.ClientProtocol;
153    import org.apache.hadoop.hdfs.protocol.DatanodeID;
154    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
155    import org.apache.hadoop.hdfs.protocol.DirectoryListing;
156    import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
157    import org.apache.hadoop.hdfs.protocol.HdfsConstants;
158    import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
159    import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
160    import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
161    import org.apache.hadoop.hdfs.protocol.LocatedBlock;
162    import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
163    import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
164    import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
165    import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
166    import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffReportEntry;
167    import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
168    import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
169    import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
170    import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
171    import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
172    import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
173    import org.apache.hadoop.hdfs.server.blockmanagement.*;
174    import org.apache.hadoop.hdfs.server.common.GenerationStamp;
175    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
176    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
177    import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
178    import org.apache.hadoop.hdfs.server.common.Storage;
179    import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
180    import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
181    import org.apache.hadoop.hdfs.server.common.Util;
182    import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
183    import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
184    import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
185    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
186    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
187    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
188    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
189    import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
190    import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
191    import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
192    import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
193    import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
194    import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
195    import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
196    import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
197    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
198    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable.SnapshotDiffInfo;
199    import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
200    import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
201    import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
202    import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
203    import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
204    import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
205    import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
206    import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
207    import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
208    import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
209    import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
210    import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
211    import org.apache.hadoop.io.IOUtils;
212    import org.apache.hadoop.io.Text;
213    import org.apache.hadoop.ipc.RetryCache;
214    import org.apache.hadoop.ipc.RetryCache.CacheEntry;
215    import org.apache.hadoop.ipc.RetryCache.CacheEntryWithPayload;
216    import org.apache.hadoop.ipc.RetriableException;
217    import org.apache.hadoop.ipc.Server;
218    import org.apache.hadoop.ipc.StandbyException;
219    import org.apache.hadoop.metrics2.annotation.Metric;
220    import org.apache.hadoop.metrics2.annotation.Metrics;
221    import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
222    import org.apache.hadoop.metrics2.util.MBeans;
223    import org.apache.hadoop.net.NetworkTopology;
224    import org.apache.hadoop.net.Node;
225    import org.apache.hadoop.security.AccessControlException;
226    import org.apache.hadoop.security.UserGroupInformation;
227    import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
228    import org.apache.hadoop.security.token.SecretManager.InvalidToken;
229    import org.apache.hadoop.security.token.Token;
230    import org.apache.hadoop.security.token.TokenIdentifier;
231    import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier;
232    import org.apache.hadoop.security.token.delegation.DelegationKey;
233    import org.apache.hadoop.util.Daemon;
234    import org.apache.hadoop.util.DataChecksum;
235    import org.apache.hadoop.util.Time;
236    import org.apache.hadoop.util.VersionInfo;
237    import org.mortbay.util.ajax.JSON;
238    
239    import com.google.common.annotations.VisibleForTesting;
240    import com.google.common.base.Charsets;
241    import com.google.common.base.Preconditions;
242    import com.google.common.collect.Lists;
243    
244    /***************************************************
245     * FSNamesystem does the actual bookkeeping work for the
246     * DataNode.
247     *
248     * It tracks several important tables.
249     *
250     * 1)  valid fsname --> blocklist  (kept on disk, logged)
251     * 2)  Set of all valid blocks (inverted #1)
252     * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
253     * 4)  machine --> blocklist (inverted #2)
254     * 5)  LRU cache of updated-heartbeat machines
255     ***************************************************/
256    @InterfaceAudience.Private
257    @Metrics(context="dfs")
258    public class FSNamesystem implements Namesystem, FSClusterStats,
259        FSNamesystemMBean, NameNodeMXBean {
260      public static final Log LOG = LogFactory.getLog(FSNamesystem.class);
261    
  // Per-thread reusable buffer for assembling audit log messages, avoiding a
  // fresh StringBuilder allocation on every audited operation.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };
269    
270      @VisibleForTesting
271      public boolean isAuditEnabled() {
272        return !isDefaultAuditLogger || auditLog.isInfoEnabled();
273      }
274    
275      private HdfsFileStatus getAuditFileInfo(String path, boolean resolveSymlink)
276          throws IOException {
277        return (isAuditEnabled() && isExternalInvocation())
278            ? dir.getFileInfo(path, resolveSymlink) : null;
279      }
280      
  /**
   * Logs an audit event that has no destination path and no file status.
   *
   * @param succeeded whether the audited operation succeeded
   * @param cmd the name of the audited command
   * @param src the source path the command operated on
   * @throws IOException if audit logging fails
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
285      
286      private void logAuditEvent(boolean succeeded, String cmd, String src,
287          String dst, HdfsFileStatus stat) throws IOException {
288        if (isAuditEnabled() && isExternalInvocation()) {
289          logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
290                        cmd, src, dst, stat);
291        }
292      }
293    
294      private void logAuditEvent(boolean succeeded,
295          UserGroupInformation ugi, InetAddress addr, String cmd, String src,
296          String dst, HdfsFileStatus stat) {
297        FileStatus status = null;
298        if (stat != null) {
299          Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
300          Path path = dst != null ? new Path(dst) : new Path(src);
301          status = new FileStatus(stat.getLen(), stat.isDir(),
302              stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
303              stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
304              stat.getGroup(), symlink, path);
305        }
306        for (AuditLogger logger : auditLoggers) {
307          if (logger instanceof HdfsAuditLogger) {
308            HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
309            hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
310                status, ugi, dtSecretManager);
311          } else {
312            logger.logAuditEvent(succeeded, ugi.toString(), addr,
313                cmd, src, dst, status);
314          }
315        }
316      }
317    
318      /**
319       * Logger for audit events, noting successful FSNamesystem operations. Emits
320       * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
321       * <code>key=value</code> pairs to be written for the following properties:
322       * <code>
323       * ugi=&lt;ugi in RPC&gt;
324       * ip=&lt;remote IP&gt;
325       * cmd=&lt;command&gt;
326       * src=&lt;src path&gt;
327       * dst=&lt;dst path (optional)&gt;
328       * perm=&lt;permissions (optional)&gt;
329       * </code>
330       */
331      public static final Log auditLog = LogFactory.getLog(
332          FSNamesystem.class.getName() + ".audit");
333    
334      static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
335      static int BLOCK_DELETION_INCREMENT = 1000;
336      private final boolean isPermissionEnabled;
337      private final UserGroupInformation fsOwner;
338      private final String fsOwnerShortUserName;
339      private final String supergroup;
340      private final boolean standbyShouldCheckpoint;
341      
342      // Scan interval is not configurable.
343      private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
344        TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
345      final DelegationTokenSecretManager dtSecretManager;
346      private final boolean alwaysUseDelegationTokensForTests;
347    
348      private static final Step STEP_AWAITING_REPORTED_BLOCKS =
349        new Step(StepType.AWAITING_REPORTED_BLOCKS);
350    
351      // Tracks whether the default audit logger is the only configured audit
352      // logger; this allows isAuditEnabled() to return false in case the
353      // underlying logger is disabled, and avoid some unnecessary work.
354      private final boolean isDefaultAuditLogger;
355      private final List<AuditLogger> auditLoggers;
356    
  /** The namespace tree. */
  FSDirectory dir;
  /** Block management subsystem; also supplies the DatanodeManager. */
  private final BlockManager blockManager;
  /** Tracks snapshottable directories (see {@link #clear()}). */
  private final SnapshotManager snapshotManager;
  /** Aggregate datanode statistics, obtained from the DatanodeManager. */
  private final DatanodeStatistics datanodeStatistics;

  // Block pool ID used by this namenode
  private String blockPoolId;

  /** Lease bookkeeping for this namesystem (see LeaseManager). */
  final LeaseManager leaseManager = new LeaseManager(this); 

  volatile Daemon smmthread = null;  // SafeModeMonitor thread
  
  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread
  /**
   * When an active namenode will roll its own edit log, in # edits
   */
  private final long editLogRollerThreshold;
  /**
   * Check interval of an active namenode's edit log roller thread 
   */
  private final int editLogRollerInterval;

  // NOTE(review): appears to reflect the resource checker's verdict on disk
  // availability — confirm against NameNodeResourceMonitor usage.
  private volatile boolean hasResourcesAvailable = false;
  // NOTE(review): presumably flipped to false on shutdown — confirm.
  private volatile boolean fsRunning = true;
  
  /** The start time of the namesystem. */
  private final long startTime = now();

  /** The interval of namenode checking for the disk space availability */
  private final long resourceRecheckInterval;

  // The actual resource checker instance.
  NameNodeResourceChecker nnResourceChecker;

  /** Server defaults (block size, replication, checksum, ...) for clients. */
  private final FsServerDefaults serverDefaults;
  /** Whether append is supported; see DFS_SUPPORT_APPEND_KEY. */
  private final boolean supportAppends;
  /** Policy for replacing a datanode on write-pipeline failure. */
  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;

  private volatile SafeModeInfo safeMode;  // safe mode information

  private final long maxFsObjects;          // maximum number of fs objects

  private final long minBlockSize;         // minimum block size
  private final long maxBlocksPerFile;     // maximum # of blocks per file

  /**
   * The global generation stamp for legacy blocks with randomly
   * generated block IDs.
   */
  private final GenerationStamp generationStampV1 = new GenerationStamp();

  /**
   * The global generation stamp for this file system.
   */
  private final GenerationStamp generationStampV2 = new GenerationStamp();

  /**
   * The value of the generation stamp when the first switch to sequential
   * block IDs was made. Blocks with generation stamps below this value
   * have randomly allocated block IDs. Blocks with generation stamps above
   * this value had sequentially allocated block IDs. Read from the fsImage
   * (or initialized as an offset from the V1 (legacy) generation stamp on
   * upgrade).
   */
  private long generationStampV1Limit =
      GenerationStamp.GRANDFATHER_GENERATION_STAMP;

  /**
   * The global block ID space for this file system.
   */
  @VisibleForTesting
  private final SequentialBlockIdGenerator blockIdGenerator;

  // precision of access times.
  private final long accessTimePrecision;

  /** Lock to protect FSNamesystem. */
  private ReentrantReadWriteLock fsLock = new ReentrantReadWriteLock(true);

  /**
   * Used when this NN is in standby state to read from the shared edit log.
   */
  private EditLogTailer editLogTailer = null;

  /**
   * Used when this NN is in standby state to perform checkpoints.
   */
  private StandbyCheckpointer standbyCheckpointer;

  /**
   * Reference to the NN's HAContext object. This is only set once
   * {@link #startCommonServices(Configuration, HAContext)} is called. 
   */
  private HAContext haContext;

  /** Whether HA is enabled for this namenode's nameservice. */
  private final boolean haEnabled;
  
  /**
   * Whether the namenode is in the middle of starting the active service
   */
  private volatile boolean startingActiveService = false;
    
  /** Inode id generator; see {@link #allocateNewInodeId()}. */
  private INodeId inodeId;
  
  // Retry cache for client RPCs; its setup may be skipped (see the
  // ignoreRetryCache parameter of the main constructor).
  private final RetryCache retryCache;
465      
466      /**
467       * Set the last allocated inode id when fsimage or editlog is loaded. 
468       */
469      public void resetLastInodeId(long newValue) throws IOException {
470        try {
471          inodeId.skipTo(newValue);
472        } catch(IllegalStateException ise) {
473          throw new IOException(ise);
474        }
475      }
476    
  /**
   * Resets the inode id counter to an arbitrary value, bypassing the
   * monotonicity enforced by {@link #resetLastInodeId(long)}.
   * Should only be used for tests to reset to any value.
   */
  void resetLastInodeIdWithoutChecking(long newValue) {
    inodeId.setCurrentValue(newValue);
  }
481      
  /** @return the last inode ID handed out by {@link #allocateNewInodeId()}. */
  public long getLastInodeId() {
    return inodeId.getCurrentValue();
  }
486    
  /** Allocate a new inode ID. @return the next id from the generator. */
  public long allocateNewInodeId() {
    return inodeId.nextValue();
  }
491      
492      /**
493       * Clear all loaded data
494       */
495      void clear() {
496        dir.reset();
497        dtSecretManager.reset();
498        generationStampV1.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
499        generationStampV2.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
500        blockIdGenerator.setCurrentValue(
501            SequentialBlockIdGenerator.LAST_RESERVED_BLOCK_ID);
502        generationStampV1Limit = GenerationStamp.GRANDFATHER_GENERATION_STAMP;
503        leaseManager.removeAllLeases();
504        inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID);
505        snapshotManager.clearSnapshottableDirs();
506      }
507    
  /** @return this namesystem's lease manager (exposed for tests). */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
512      
513      /**
514       * Check the supplied configuration for correctness.
515       * @param conf Supplies the configuration to validate.
516       * @throws IOException if the configuration could not be queried.
517       * @throws IllegalArgumentException if the configuration is invalid.
518       */
519      private static void checkConfiguration(Configuration conf)
520          throws IOException {
521    
522        final Collection<URI> namespaceDirs =
523            FSNamesystem.getNamespaceDirs(conf);
524        final Collection<URI> editsDirs =
525            FSNamesystem.getNamespaceEditsDirs(conf);
526        final Collection<URI> requiredEditsDirs =
527            FSNamesystem.getRequiredNamespaceEditsDirs(conf);
528        final Collection<URI> sharedEditsDirs =
529            FSNamesystem.getSharedEditsDirs(conf);
530    
531        for (URI u : requiredEditsDirs) {
532          if (u.toString().compareTo(
533                  DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
534            continue;
535          }
536    
537          // Each required directory must also be in editsDirs or in
538          // sharedEditsDirs.
539          if (!editsDirs.contains(u) &&
540              !sharedEditsDirs.contains(u)) {
541            throw new IllegalArgumentException(
542                "Required edits directory " + u.toString() + " not present in " +
543                DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
544                DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
545                editsDirs.toString() + "; " +
546                DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
547                requiredEditsDirs.toString() + ". " +
548                DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
549                sharedEditsDirs.toString() + ".");
550          }
551        }
552    
553        if (namespaceDirs.size() == 1) {
554          LOG.warn("Only one image storage directory ("
555              + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of dataloss"
556              + " due to lack of redundant storage directories!");
557        }
558        if (editsDirs.size() == 1) {
559          LOG.warn("Only one namespace edits storage directory ("
560              + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of dataloss"
561              + " due to lack of redundant storage directories!");
562        }
563      }
564    
565      /**
566       * Instantiates an FSNamesystem loaded from the image and edits
567       * directories specified in the passed Configuration.
568       *
569       * @param conf the Configuration which specifies the storage directories
570       *             from which to load
571       * @return an FSNamesystem which contains the loaded namespace
572       * @throws IOException if loading fails
573       */
574      public static FSNamesystem loadFromDisk(Configuration conf)
575          throws IOException {
576    
577        checkConfiguration(conf);
578        FSImage fsImage = new FSImage(conf,
579            FSNamesystem.getNamespaceDirs(conf),
580            FSNamesystem.getNamespaceEditsDirs(conf));
581        FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
582        StartupOption startOpt = NameNode.getStartupOption(conf);
583        if (startOpt == StartupOption.RECOVER) {
584          namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
585        }
586    
587        long loadStart = now();
588        String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
589        namesystem.loadFSImage(startOpt, fsImage,
590          HAUtil.isHAEnabled(conf, nameserviceId));
591        long timeTakenToLoadFSImage = now() - loadStart;
592        LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
593        NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
594        if (nnMetrics != null) {
595          nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
596        }
597        return namesystem;
598      }
599      
  /**
   * Convenience constructor: associates this namesystem with the given image
   * without ignoring the retry cache (ignoreRetryCache = false).
   *
   * @param conf configuration
   * @param fsImage the FSImage to associate with
   * @throws IOException on bad configuration
   */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
603      
604      /**
605       * Create an FSNamesystem associated with the specified image.
606       * 
607       * Note that this does not load any data off of disk -- if you would
608       * like that behavior, use {@link #loadFromDisk(Configuration)}
609       *
610       * @param conf configuration
611       * @param fsImage The FSImage to associate with
612       * @param ignoreRetryCache Whether or not should ignore the retry cache setup
613       *                         step. For Secondary NN this should be set to true.
614       * @throws IOException on bad configuration
615       */
616      FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
617          throws IOException {
618        try {
619          resourceRecheckInterval = conf.getLong(
620              DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
621              DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
622    
623          this.blockManager = new BlockManager(this, this, conf);
624          this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
625          this.blockIdGenerator = new SequentialBlockIdGenerator(this.blockManager);
626    
627          this.fsOwner = UserGroupInformation.getCurrentUser();
628          this.fsOwnerShortUserName = fsOwner.getShortUserName();
629          this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
630                                     DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
631          this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
632                                                     DFS_PERMISSIONS_ENABLED_DEFAULT);
633          LOG.info("fsOwner             = " + fsOwner);
634          LOG.info("supergroup          = " + supergroup);
635          LOG.info("isPermissionEnabled = " + isPermissionEnabled);
636    
637          // block allocation has to be persisted in HA using a shared edits directory
638          // so that the standby has up-to-date namespace information
639          String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
640          this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
641          
642          // Sanity check the HA-related config.
643          if (nameserviceId != null) {
644            LOG.info("Determined nameservice ID: " + nameserviceId);
645          }
646          LOG.info("HA Enabled: " + haEnabled);
647          if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
648            LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
649            throw new IOException("Invalid configuration: a shared edits dir " +
650                "must not be specified if HA is not enabled.");
651          }
652    
653          // Get the checksum type from config
654          String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
655          DataChecksum.Type checksumType;
656          try {
657             checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
658          } catch (IllegalArgumentException iae) {
659             throw new IOException("Invalid checksum type in "
660                + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
661          }
662    
663          this.serverDefaults = new FsServerDefaults(
664              conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
665              conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
666              conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
667              (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
668              conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
669              conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
670              conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
671              checksumType);
672          
673          this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
674                                           DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
675    
676          this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
677              DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
678          this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
679              DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
680          this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
681              DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
682          this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
683          LOG.info("Append Enabled: " + supportAppends);
684    
685          this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
686          
687          this.standbyShouldCheckpoint = conf.getBoolean(
688              DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
689          // # edit autoroll threshold is a multiple of the checkpoint threshold 
690          this.editLogRollerThreshold = (long)
691              (conf.getFloat(
692                  DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
693                  DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
694              conf.getLong(
695                  DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
696                  DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
697          this.editLogRollerInterval = conf.getInt(
698              DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
699              DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
700          this.inodeId = new INodeId();
701          
702          // For testing purposes, allow the DT secret manager to be started regardless
703          // of whether security is enabled.
704          alwaysUseDelegationTokensForTests = conf.getBoolean(
705              DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
706              DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
707    
708          this.dtSecretManager = createDelegationTokenSecretManager(conf);
709          this.dir = new FSDirectory(fsImage, this, conf);
710          this.snapshotManager = new SnapshotManager(dir);
711          this.safeMode = new SafeModeInfo(conf);
712          this.auditLoggers = initAuditLoggers(conf);
713          this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
714            auditLoggers.get(0) instanceof DefaultAuditLogger;
715          this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
716        } catch(IOException e) {
717          LOG.error(getClass().getSimpleName() + " initialization failed.", e);
718          close();
719          throw e;
720        } catch (RuntimeException re) {
721          LOG.error(getClass().getSimpleName() + " initialization failed.", re);
722          close();
723          throw re;
724        }
725      }
726      
727      @VisibleForTesting
728      public RetryCache getRetryCache() {
729        return retryCache;
730      }
731      
732      /** Whether or not retry cache is enabled */
733      boolean hasRetryCache() {
734        return retryCache != null;
735      }
736      
737      void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
738        if (retryCache != null) {
739          retryCache.addCacheEntryWithPayload(clientId, callId, payload);
740        }
741      }
742      
743      void addCacheEntry(byte[] clientId, int callId) {
744        if (retryCache != null) {
745          retryCache.addCacheEntry(clientId, callId);
746        }
747      }
748      
749      @VisibleForTesting
750      static RetryCache initRetryCache(Configuration conf) {
751        boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
752            DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
753        LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
754        if (enable) {
755          float heapPercent = conf.getFloat(
756              DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
757              DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
758          long entryExpiryMillis = conf.getLong(
759              DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
760              DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
761          LOG.info("Retry cache will use " + heapPercent
762              + " of total heap and retry cache entry expiry time is "
763              + entryExpiryMillis + " millis");
764          long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
765          return new RetryCache("Namenode Retry Cache", heapPercent,
766              entryExpiryNanos);
767        }
768        return null;
769      }
770    
771      private List<AuditLogger> initAuditLoggers(Configuration conf) {
772        // Initialize the custom access loggers if configured.
773        Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
774        List<AuditLogger> auditLoggers = Lists.newArrayList();
775        if (alClasses != null && !alClasses.isEmpty()) {
776          for (String className : alClasses) {
777            try {
778              AuditLogger logger;
779              if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
780                logger = new DefaultAuditLogger();
781              } else {
782                logger = (AuditLogger) Class.forName(className).newInstance();
783              }
784              logger.initialize(conf);
785              auditLoggers.add(logger);
786            } catch (RuntimeException re) {
787              throw re;
788            } catch (Exception e) {
789              throw new RuntimeException(e);
790            }
791          }
792        }
793    
794        // Make sure there is at least one logger installed.
795        if (auditLoggers.isEmpty()) {
796          auditLoggers.add(new DefaultAuditLogger());
797        }
798        return Collections.unmodifiableList(auditLoggers);
799      }
800    
  /**
   * Load the filesystem image: format first if the FORMAT startup option
   * was given, recover/read the image under the write lock, save the
   * namespace when recovery deems it necessary, and (unless running as an
   * HA standby) open the edit log for write. The image is closed if any
   * step fails.
   */
  void loadFSImage(StartupOption startOpt, FSImage fsImage, boolean haEnabled)
      throws IOException {
    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      // Continue startup as a regular (already formatted) node.
      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      boolean needToSave =
        fsImage.recoverTransitionRead(startOpt, this, recovery) && !haEnabled;
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // Don't leave a half-open image behind on failure.
        fsImage.close();
      }
      writeUnlock();
    }
    dir.imageLoadComplete();
  }
839    
840      private void startSecretManager() {
841        if (dtSecretManager != null) {
842          try {
843            dtSecretManager.startThreads();
844          } catch (IOException e) {
845            // Inability to start secret manager
846            // can't be recovered from.
847            throw new RuntimeException(e);
848          }
849        }
850      }
851      
852      private void startSecretManagerIfNecessary() {
853        boolean shouldRun = shouldUseDelegationTokens() &&
854          !isInSafeMode() && getEditLog().isOpenForWrite();
855        boolean running = dtSecretManager.isRunning();
856        if (shouldRun && !running) {
857          startSecretManager();
858        }
859      }
860    
861      private void stopSecretManager() {
862        if (dtSecretManager != null) {
863          dtSecretManager.stopThreads();
864        }
865      }
866      
867      /** 
868       * Start services common to both active and standby states
869       * @param haContext 
870       * @throws IOException
871       */
872      void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
873        this.registerMBean(); // register the MBean for the FSNamesystemState
874        writeLock();
875        this.haContext = haContext;
876        try {
877          nnResourceChecker = new NameNodeResourceChecker(conf);
878          checkAvailableResources();
879          assert safeMode != null &&
880            !safeMode.isPopulatingReplQueues();
881          StartupProgress prog = NameNode.getStartupProgress();
882          prog.beginPhase(Phase.SAFEMODE);
883          prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
884            getCompleteBlocksTotal());
885          setBlockTotal();
886          blockManager.activate(conf);
887        } finally {
888          writeUnlock();
889        }
890        
891        registerMXBean();
892        DefaultMetricsSystem.instance().register(this);
893      }
894      
895      /** 
896       * Stop services common to both active and standby states
897       * @throws IOException
898       */
899      void stopCommonServices() {
900        writeLock();
901        try {
902          if (blockManager != null) blockManager.close();
903        } finally {
904          writeUnlock();
905        }
906        RetryCache.clear(retryCache);
907      }
908      
909      /**
910       * Start services required in active state
911       * @throws IOException
912       */
913      void startActiveServices() throws IOException {
914        startingActiveService = true;
915        LOG.info("Starting services required for active state");
916        writeLock();
917        try {
918          FSEditLog editLog = dir.fsImage.getEditLog();
919          
920          if (!editLog.isOpenForWrite()) {
921            // During startup, we're already open for write during initialization.
922            editLog.initJournalsForWrite();
923            // May need to recover
924            editLog.recoverUnclosedStreams();
925            
926            LOG.info("Catching up to latest edits from old active before " +
927                "taking over writer role in edits logs");
928            editLogTailer.catchupDuringFailover();
929            
930            blockManager.setPostponeBlocksFromFuture(false);
931            blockManager.getDatanodeManager().markAllDatanodesStale();
932            blockManager.clearQueues();
933            blockManager.processAllPendingDNMessages();
934            
935            if (!isInSafeMode() ||
936                (isInSafeMode() && safeMode.isPopulatingReplQueues())) {
937              LOG.info("Reprocessing replication and invalidation queues");
938              blockManager.processMisReplicatedBlocks();
939            }
940            
941            if (LOG.isDebugEnabled()) {
942              LOG.debug("NameNode metadata after re-processing " +
943                  "replication and invalidation queues during failover:\n" +
944                  metaSaveAsString());
945            }
946            
947            long nextTxId = dir.fsImage.getLastAppliedTxId() + 1;
948            LOG.info("Will take over writing edit logs at txnid " + 
949                nextTxId);
950            editLog.setNextTxId(nextTxId);
951    
952            dir.fsImage.editLog.openForWrite();
953          }
954          if (haEnabled) {
955            // Renew all of the leases before becoming active.
956            // This is because, while we were in standby mode,
957            // the leases weren't getting renewed on this NN.
958            // Give them all a fresh start here.
959            leaseManager.renewAllLeases();
960          }
961          leaseManager.startMonitor();
962          startSecretManagerIfNecessary();
963    
964          //ResourceMonitor required only at ActiveNN. See HDFS-2914
965          this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
966          nnrmthread.start();
967    
968          nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
969              editLogRollerThreshold, editLogRollerInterval));
970          nnEditLogRoller.start();
971    
972        } finally {
973          writeUnlock();
974          startingActiveService = false;
975        }
976      }
977      
978      /**
979       * @return Whether the namenode is transitioning to active state and is in the
980       *         middle of the {@link #startActiveServices()}
981       */
982      public boolean inTransitionToActive() {
983        return haEnabled && haContext != null
984            && haContext.getState().getServiceState() == HAServiceState.ACTIVE
985            && startingActiveService;
986      }
987    
988      private boolean shouldUseDelegationTokens() {
989        return UserGroupInformation.isSecurityEnabled() ||
990          alwaysUseDelegationTokensForTests;
991      }
992    
993      /** 
994       * Stop services required in active state
995       * @throws InterruptedException
996       */
997      void stopActiveServices() {
998        LOG.info("Stopping services started for active state");
999        writeLock();
1000        try {
1001          stopSecretManager();
1002          if (leaseManager != null) {
1003            leaseManager.stopMonitor();
1004          }
1005          if (nnrmthread != null) {
1006            ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1007            nnrmthread.interrupt();
1008          }
1009          if (nnEditLogRoller != null) {
1010            ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1011            nnEditLogRoller.interrupt();
1012          }
1013          if (dir != null && dir.fsImage != null) {
1014            if (dir.fsImage.editLog != null) {
1015              dir.fsImage.editLog.close();
1016            }
1017            // Update the fsimage with the last txid that we wrote
1018            // so that the tailer starts from the right spot.
1019            dir.fsImage.updateLastAppliedTxIdFromWritten();
1020          }
1021        } finally {
1022          writeUnlock();
1023        }
1024      }
1025      
1026      /**
1027       * Start services required in standby state 
1028       * 
1029       * @throws IOException
1030       */
1031      void startStandbyServices(final Configuration conf) throws IOException {
1032        LOG.info("Starting services required for standby state");
1033        if (!dir.fsImage.editLog.isOpenForRead()) {
1034          // During startup, we're already open for read.
1035          dir.fsImage.editLog.initSharedJournalsForRead();
1036        }
1037        
1038        blockManager.setPostponeBlocksFromFuture(true);
1039    
1040        editLogTailer = new EditLogTailer(this, conf);
1041        editLogTailer.start();
1042        if (standbyShouldCheckpoint) {
1043          standbyCheckpointer = new StandbyCheckpointer(conf, this);
1044          standbyCheckpointer.start();
1045        }
1046      }
1047    
1048    
1049      /**
1050       * Called while the NN is in Standby state, but just about to be
1051       * asked to enter Active state. This cancels any checkpoints
1052       * currently being taken.
1053       */
1054      void prepareToStopStandbyServices() throws ServiceFailedException {
1055        if (standbyCheckpointer != null) {
1056          standbyCheckpointer.cancelAndPreventCheckpoints(
1057              "About to leave standby state");
1058        }
1059      }
1060    
1061      /** Stop services required in standby state */
1062      void stopStandbyServices() throws IOException {
1063        LOG.info("Stopping services started for standby state");
1064        if (standbyCheckpointer != null) {
1065          standbyCheckpointer.stop();
1066        }
1067        if (editLogTailer != null) {
1068          editLogTailer.stop();
1069        }
1070        if (dir != null && dir.fsImage != null && dir.fsImage.editLog != null) {
1071          dir.fsImage.editLog.close();
1072        }
1073      }
1074      
1075      @Override
1076      public void checkOperation(OperationCategory op) throws StandbyException {
1077        if (haContext != null) {
1078          // null in some unit tests
1079          haContext.checkOperation(op);
1080        }
1081      }
1082      
1083      /**
1084       * @throws RetriableException
1085       *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1086       *           NameNode is in active state
1087       * @throws SafeModeException
1088       *           Otherwise if NameNode is in SafeMode.
1089       */
1090      private void checkNameNodeSafeMode(String errorMsg)
1091          throws RetriableException, SafeModeException {
1092        if (isInSafeMode()) {
1093          SafeModeException se = new SafeModeException(errorMsg, safeMode);
1094          if (haEnabled && haContext != null
1095              && haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1096            throw new RetriableException(se);
1097          } else {
1098            throw se;
1099          }
1100        }
1101      }
1102      
  /** @return URIs of the configured namespace (fsimage) directories. */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }
1106    
1107      /**
1108       * Get all edits dirs which are required. If any shared edits dirs are
1109       * configured, these are also included in the set of required dirs.
1110       * 
1111       * @param conf the HDFS configuration.
1112       * @return all required dirs.
1113       */
1114      public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1115        Set<URI> ret = new HashSet<URI>();
1116        ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1117        ret.addAll(getSharedEditsDirs(conf));
1118        return ret;
1119      }
1120    
1121      private static Collection<URI> getStorageDirs(Configuration conf,
1122                                                    String propertyName) {
1123        Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
1124        StartupOption startOpt = NameNode.getStartupOption(conf);
1125        if(startOpt == StartupOption.IMPORT) {
1126          // In case of IMPORT this will get rid of default directories 
1127          // but will retain directories specified in hdfs-site.xml
1128          // When importing image from a checkpoint, the name-node can
1129          // start with empty set of storage directories.
1130          Configuration cE = new HdfsConfiguration(false);
1131          cE.addResource("core-default.xml");
1132          cE.addResource("core-site.xml");
1133          cE.addResource("hdfs-default.xml");
1134          Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
1135          dirNames.removeAll(dirNames2);
1136          if(dirNames.isEmpty())
1137            LOG.warn("!!! WARNING !!!" +
1138              "\n\tThe NameNode currently runs without persistent storage." +
1139              "\n\tAny changes to the file system meta-data may be lost." +
1140              "\n\tRecommended actions:" +
1141              "\n\t\t- shutdown and restart NameNode with configured \"" 
1142              + propertyName + "\" in hdfs-site.xml;" +
1143              "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
1144              "of the file system meta-data.");
1145        } else if (dirNames.isEmpty()) {
1146          dirNames = Collections.singletonList(
1147              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
1148        }
1149        return Util.stringCollectionAsURIs(dirNames);
1150      }
1151    
1152      /**
1153       * Return an ordered list of edits directories to write to.
1154       * The list is ordered such that all shared edits directories
1155       * are ordered before non-shared directories, and any duplicates
1156       * are removed. The order they are specified in the configuration
1157       * is retained.
1158       * @return Collection of shared edits directories.
1159       * @throws IOException if multiple shared edits directories are configured
1160       */
1161      public static List<URI> getNamespaceEditsDirs(Configuration conf)
1162          throws IOException {
1163        return getNamespaceEditsDirs(conf, true);
1164      }
1165      
1166      public static List<URI> getNamespaceEditsDirs(Configuration conf,
1167          boolean includeShared)
1168          throws IOException {
1169        // Use a LinkedHashSet so that order is maintained while we de-dup
1170        // the entries.
1171        LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
1172        
1173        if (includeShared) {
1174          List<URI> sharedDirs = getSharedEditsDirs(conf);
1175      
1176          // Fail until multiple shared edits directories are supported (HDFS-2782)
1177          if (sharedDirs.size() > 1) {
1178            throw new IOException(
1179                "Multiple shared edits directories are not yet supported");
1180          }
1181      
1182          // First add the shared edits dirs. It's critical that the shared dirs
1183          // are added first, since JournalSet syncs them in the order they are listed,
1184          // and we need to make sure all edits are in place in the shared storage
1185          // before they are replicated locally. See HDFS-2874.
1186          for (URI dir : sharedDirs) {
1187            if (!editsDirs.add(dir)) {
1188              LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1189                  DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
1190            }
1191          }
1192        }    
1193        // Now add the non-shared dirs.
1194        for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
1195          if (!editsDirs.add(dir)) {
1196            LOG.warn("Edits URI " + dir + " listed multiple times in " + 
1197                DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
1198                DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
1199          }
1200        }
1201    
1202        if (editsDirs.isEmpty()) {
1203          // If this is the case, no edit dirs have been explicitly configured.
1204          // Image dirs are to be used for edits too.
1205          return Lists.newArrayList(getNamespaceDirs(conf));
1206        } else {
1207          return Lists.newArrayList(editsDirs);
1208        }
1209      }
1210      
1211      /**
1212       * Returns edit directories that are shared between primary and secondary.
1213       * @param conf
1214       * @return Collection of edit directories.
1215       */
1216      public static List<URI> getSharedEditsDirs(Configuration conf) {
1217        // don't use getStorageDirs here, because we want an empty default
1218        // rather than the dir in /tmp
1219        Collection<String> dirNames = conf.getTrimmedStringCollection(
1220            DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1221        return Util.stringCollectionAsURIs(dirNames);
1222      }
1223    
  // fsLock is the coarse-grained namesystem read/write lock; these thin
  // wrappers are the canonical entry points for acquiring and releasing it.

  @Override
  public void readLock() {
    this.fsLock.readLock().lock();
  }
  @Override
  public void readUnlock() {
    this.fsLock.readLock().unlock();
  }
  @Override
  public void writeLock() {
    this.fsLock.writeLock().lock();
  }
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.writeLock().lockInterruptibly();
  }
  @Override
  public void writeUnlock() {
    this.fsLock.writeLock().unlock();
  }
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  @Override
  public boolean hasReadLock() {
    // Holding the write lock also counts as holding a read lock.
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1252    
1253      NamespaceInfo getNamespaceInfo() {
1254        readLock();
1255        try {
1256          return unprotectedGetNamespaceInfo();
1257        } finally {
1258          readUnlock();
1259        }
1260      }
1261    
1262      /**
1263       * Version of @see #getNamespaceInfo() that is not protected by a lock.
1264       */
1265      NamespaceInfo unprotectedGetNamespaceInfo() {
1266        return new NamespaceInfo(dir.fsImage.getStorage().getNamespaceID(),
1267            getClusterId(), getBlockPoolId(),
1268            dir.fsImage.getStorage().getCTime());
1269      }
1270    
1271      /**
1272       * Close down this file system manager.
1273       * Causes heartbeat and lease daemons to stop; waits briefly for
1274       * them to finish, but a short timeout returns control back to caller.
1275       */
1276      void close() {
1277        fsRunning = false;
1278        try {
1279          stopCommonServices();
1280          if (smmthread != null) smmthread.interrupt();
1281        } finally {
1282          // using finally to ensure we also wait for lease daemon
1283          try {
1284            stopActiveServices();
1285            stopStandbyServices();
1286            if (dir != null) {
1287              dir.close();
1288            }
1289          } catch (IOException ie) {
1290            LOG.error("Error closing FSDirectory", ie);
1291            IOUtils.cleanup(LOG, dir);
1292          }
1293        }
1294      }
1295    
  /** @return true until {@link #close()} is called. */
  @Override
  public boolean isRunning() {
    return fsRunning;
  }
1300      
1301      @Override
1302      public boolean isInStandbyState() {
1303        if (haContext == null || haContext.getState() == null) {
1304          // We're still starting up. In this case, if HA is
1305          // on for the cluster, we always start in standby. Otherwise
1306          // start in active.
1307          return haEnabled;
1308        }
1309    
1310        return HAServiceState.STANDBY == haContext.getState().getServiceState();
1311      }
1312    
1313      /**
1314       * Dump all metadata into specified file
1315       */
1316      void metaSave(String filename) throws IOException {
1317        checkSuperuserPrivilege();
1318        checkOperation(OperationCategory.UNCHECKED);
1319        writeLock();
1320        try {
1321          checkOperation(OperationCategory.UNCHECKED);
1322          File file = new File(System.getProperty("hadoop.log.dir"), filename);
1323          PrintWriter out = new PrintWriter(new BufferedWriter(
1324              new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1325          metaSave(out);
1326          out.flush();
1327          out.close();
1328        } finally {
1329          writeUnlock();
1330        }
1331      }
1332    
1333      private void metaSave(PrintWriter out) {
1334        assert hasWriteLock();
1335        long totalInodes = this.dir.totalInodes();
1336        long totalBlocks = this.getBlocksTotal();
1337        out.println(totalInodes + " files and directories, " + totalBlocks
1338            + " blocks = " + (totalInodes + totalBlocks) + " total");
1339    
1340        blockManager.metaSave(out);
1341      }
1342    
1343      private String metaSaveAsString() {
1344        StringWriter sw = new StringWriter();
1345        PrintWriter pw = new PrintWriter(sw);
1346        metaSave(pw);
1347        pw.flush();
1348        return sw.toString();
1349      }
1350      
1351    
1352      long getDefaultBlockSize() {
1353        return serverDefaults.getBlockSize();
1354      }
1355    
  /**
   * @return the server-side defaults (block size, checksum type, etc.)
   *     handed out to clients.
   * @throws StandbyException if this namenode cannot currently serve reads
   */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1360    
1361      long getAccessTimePrecision() {
1362        return accessTimePrecision;
1363      }
1364    
1365      private boolean isAccessTimeSupported() {
1366        return accessTimePrecision > 0;
1367      }
1368    
1369      /////////////////////////////////////////////////////////
1370      //
1371      // These methods are called by HadoopFS clients
1372      //
1373      /////////////////////////////////////////////////////////
1374      /**
1375       * Set permissions for an existing file.
1376       * @throws IOException
1377       */
1378      void setPermission(String src, FsPermission permission)
1379          throws AccessControlException, FileNotFoundException, SafeModeException,
1380          UnresolvedLinkException, IOException {
1381        try {
1382          setPermissionInt(src, permission);
1383        } catch (AccessControlException e) {
1384          logAuditEvent(false, "setPermission", src);
1385          throw e;
1386        }
1387      }
1388    
  /**
   * Locked implementation of {@link #setPermission}: resolves the path,
   * verifies ownership, applies the new permission, and audits the result.
   */
  private void setPermissionInt(String src, FsPermission permission)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    // Fail fast before taking the lock if this NN cannot serve writes.
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set permission for " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      // Only the owner (or a superuser) may change permissions.
      checkOwner(pc, src);
      dir.setPermission(src, permission);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the lock to keep the critical section short.
    getEditLog().logSync();
    logAuditEvent(true, "setPermission", src, null, resultingStat);
  }
1410    
1411      /**
1412       * Set owner for an existing file.
1413       * @throws IOException
1414       */
1415      void setOwner(String src, String username, String group)
1416          throws AccessControlException, FileNotFoundException, SafeModeException,
1417          UnresolvedLinkException, IOException {
1418        try {
1419          setOwnerInt(src, username, group);
1420        } catch (AccessControlException e) {
1421          logAuditEvent(false, "setOwner", src);
1422          throw e;
1423        } 
1424      }
1425    
  /**
   * Locked implementation of {@link #setOwner}: resolves the path, enforces
   * the ownership rules, applies the change, and audits the result.
   *
   * Non-superusers may not transfer ownership to another user and may only
   * set a group they are a member of.
   */
  private void setOwnerInt(String src, String username, String group)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    // Fail fast before taking the lock if this NN cannot serve writes.
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set owner for " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      checkOwner(pc, src);
      if (!pc.isSuperUser()) {
        // A plain user may not give the file away to someone else ...
        if (username != null && !pc.getUser().equals(username)) {
          throw new AccessControlException("Non-super user cannot change owner");
        }
        // ... and may only assign a group they belong to.
        if (group != null && !pc.containsGroup(group)) {
          throw new AccessControlException("User does not belong to " + group);
        }
      }
      dir.setOwner(src, username, group);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the lock to keep the critical section short.
    getEditLog().logSync();
    logAuditEvent(true, "setOwner", src, null, resultingStat);
  }
1455    
1456      /**
1457       * Get block locations within the specified range.
1458       * @see ClientProtocol#getBlockLocations(String, long, long)
1459       */
1460      LocatedBlocks getBlockLocations(String clientMachine, String src,
1461          long offset, long length) throws AccessControlException,
1462          FileNotFoundException, UnresolvedLinkException, IOException {
1463        LocatedBlocks blocks = getBlockLocations(src, offset, length, true, true,
1464            true);
1465        if (blocks != null) {
1466          blockManager.getDatanodeManager().sortLocatedBlocks(
1467              clientMachine, blocks.getLocatedBlocks());
1468          
1469          LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1470          if (lastBlock != null) {
1471            ArrayList<LocatedBlock> lastBlockList = new ArrayList<LocatedBlock>();
1472            lastBlockList.add(lastBlock);
1473            blockManager.getDatanodeManager().sortLocatedBlocks(
1474                                  clientMachine, lastBlockList);
1475          }
1476        }
1477        return blocks;
1478      }
1479    
1480      /**
1481       * Get block locations within the specified range.
1482       * @see ClientProtocol#getBlockLocations(String, long, long)
1483       * @throws FileNotFoundException, UnresolvedLinkException, IOException
1484       */
1485      LocatedBlocks getBlockLocations(String src, long offset, long length,
1486          boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode)
1487          throws FileNotFoundException, UnresolvedLinkException, IOException {
1488        try {
1489          return getBlockLocationsInt(src, offset, length, doAccessTime,
1490                                      needBlockToken, checkSafeMode);
1491        } catch (AccessControlException e) {
1492          logAuditEvent(false, "open", src);
1493          throw e;
1494        }
1495      }
1496    
  /**
   * Argument-checked implementation of {@link #getBlockLocations}.
   *
   * When {@code checkSafeMode} is set and the NN is in safe mode, a result
   * containing any block with zero locations is rejected: an HA active NN
   * wraps the failure in RetriableException so the client retries (replicas
   * may still be reporting in); otherwise the SafeModeException is thrown
   * as-is.
   */
  private LocatedBlocks getBlockLocationsInt(String src, long offset,
      long length, boolean doAccessTime, boolean needBlockToken,
      boolean checkSafeMode)
      throws FileNotFoundException, UnresolvedLinkException, IOException {
    if (offset < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative offset is not supported. File: " + src);
    }
    if (length < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative length is not supported. File: " + src);
    }
    final LocatedBlocks ret = getBlockLocationsUpdateTimes(src,
        offset, length, doAccessTime, needBlockToken);  
    // NOTE(review): the successful "open" is audited before the safe-mode
    // location check below, so an audited open may still throw — confirm
    // this ordering is intentional.
    logAuditEvent(true, "open", src);
    if (checkSafeMode && isInSafeMode()) {
      for (LocatedBlock b : ret.getLocatedBlocks()) {
        // if safemode & no block locations yet then throw safemodeException
        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
          SafeModeException se = new SafeModeException(
              "Zero blocklocations for " + src, safeMode);
          if (haEnabled && haContext != null && 
              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
            throw new RetriableException(se);
          } else {
            throw se;
          }
        }
      }
    }
    return ret;
  }
1529    
1530      /*
1531       * Get block locations within the specified range, updating the
1532       * access times if necessary. 
1533       */
1534      private LocatedBlocks getBlockLocationsUpdateTimes(String src, long offset,
1535          long length, boolean doAccessTime, boolean needBlockToken)
1536          throws FileNotFoundException,
1537          UnresolvedLinkException, IOException {
1538        FSPermissionChecker pc = getPermissionChecker();
1539        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1540        for (int attempt = 0; attempt < 2; attempt++) {
1541          boolean isReadOp = (attempt == 0);
1542          if (isReadOp) { // first attempt is with readlock
1543            checkOperation(OperationCategory.READ);
1544            readLock();
1545          }  else { // second attempt is with  write lock
1546            checkOperation(OperationCategory.WRITE);
1547            writeLock(); // writelock is needed to set accesstime
1548          }
1549          src = FSDirectory.resolvePath(src, pathComponents, dir);
1550          try {
1551            if (isReadOp) {
1552              checkOperation(OperationCategory.READ);
1553            } else {
1554              checkOperation(OperationCategory.WRITE);
1555            }
1556            if (isPermissionEnabled) {
1557              checkPathAccess(pc, src, FsAction.READ);
1558            }
1559    
1560            // if the namenode is in safemode, then do not update access time
1561            if (isInSafeMode()) {
1562              doAccessTime = false;
1563            }
1564    
1565            final INodesInPath iip = dir.getLastINodeInPath(src);
1566            final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
1567            if (!iip.isSnapshot() //snapshots are readonly, so don't update atime.
1568                && doAccessTime && isAccessTimeSupported()) {
1569              final long now = now();
1570              if (now > inode.getAccessTime() + getAccessTimePrecision()) {
1571                // if we have to set access time but we only have the readlock, then
1572                // restart this entire operation with the writeLock.
1573                if (isReadOp) {
1574                  continue;
1575                }
1576                dir.setTimes(src, inode, -1, now, false, iip.getLatestSnapshot());
1577              }
1578            }
1579            final long fileSize = iip.isSnapshot() ?
1580                inode.computeFileSize(iip.getPathSnapshot())
1581                : inode.computeFileSizeNotIncludingLastUcBlock();
1582            boolean isUc = inode.isUnderConstruction();
1583            if (iip.isSnapshot()) {
1584              // if src indicates a snapshot file, we need to make sure the returned
1585              // blocks do not exceed the size of the snapshot file.
1586              length = Math.min(length, fileSize - offset);
1587              isUc = false;
1588            }
1589            return blockManager.createLocatedBlocks(inode.getBlocks(), fileSize,
1590                isUc, offset, length, needBlockToken, iip.isSnapshot());
1591          } finally {
1592            if (isReadOp) {
1593              readUnlock();
1594            } else {
1595              writeUnlock();
1596            }
1597          }
1598        }
1599        return null; // can never reach here
1600      }
1601    
1602      /**
1603       * Moves all the blocks from srcs and appends them to trg
1604       * To avoid rollbacks we will verify validitity of ALL of the args
1605       * before we start actual move.
1606       * 
1607       * This does not support ".inodes" relative path
1608       * @param target
1609       * @param srcs
1610       * @throws IOException
1611       */
1612      void concat(String target, String [] srcs) 
1613          throws IOException, UnresolvedLinkException {
1614        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
1615        if (cacheEntry != null && cacheEntry.isSuccess()) {
1616          return; // Return previous response
1617        }
1618        
1619        // Either there is no previous request in progres or it has failed
1620        if(FSNamesystem.LOG.isDebugEnabled()) {
1621          FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) +
1622              " to " + target);
1623        }
1624        
1625        boolean success = false;
1626        try {
1627          concatInt(target, srcs, cacheEntry != null);
1628          success = true;
1629        } catch (AccessControlException e) {
1630          logAuditEvent(false, "concat", Arrays.toString(srcs), target, null);
1631          throw e;
1632        } finally {
1633          RetryCache.setState(cacheEntry, success);
1634        }
1635      }
1636    
  /**
   * Validates the concat arguments (non-empty target, sources present and
   * in the same directory as the target), then performs the move under the
   * write lock via {@link #concatInternal}.
   */
  private void concatInt(String target, String [] srcs, 
      boolean logRetryCache) throws IOException, UnresolvedLinkException {
    // verify args
    if(target.isEmpty()) {
      throw new IllegalArgumentException("Target file name is empty");
    }
    if(srcs == null || srcs.length == 0) {
      throw new IllegalArgumentException("No sources given");
    }
    
    // We require all files be in the same directory
    String trgParent = 
      target.substring(0, target.lastIndexOf(Path.SEPARATOR_CHAR));
    for (String s : srcs) {
      String srcParent = s.substring(0, s.lastIndexOf(Path.SEPARATOR_CHAR));
      if (!srcParent.equals(trgParent)) {
        throw new IllegalArgumentException(
           "Sources and target are not in the same directory");
      }
    }

    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    // Fail fast before taking the lock if this NN cannot serve writes.
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot concat " + target);
      concatInternal(pc, target, srcs, logRetryCache);
      resultingStat = getAuditFileInfo(target, false);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the lock to keep the critical section short.
    getEditLog().logSync();
    logAuditEvent(true, "concat", Arrays.toString(srcs), target, resultingStat);
  }
1673    
1674      /** See {@link #concat(String, String[])} */
1675      private void concatInternal(FSPermissionChecker pc, String target,
1676          String[] srcs, boolean logRetryCache) throws IOException,
1677          UnresolvedLinkException {
1678        assert hasWriteLock();
1679    
1680        // write permission for the target
1681        if (isPermissionEnabled) {
1682          checkPathAccess(pc, target, FsAction.WRITE);
1683    
1684          // and srcs
1685          for(String aSrc: srcs) {
1686            checkPathAccess(pc, aSrc, FsAction.READ); // read the file
1687            checkParentAccess(pc, aSrc, FsAction.WRITE); // for delete 
1688          }
1689        }
1690    
1691        // to make sure no two files are the same
1692        Set<INode> si = new HashSet<INode>();
1693    
1694        // we put the following prerequisite for the operation
1695        // replication and blocks sizes should be the same for ALL the blocks
1696    
1697        // check the target
1698        final INodeFile trgInode = INodeFile.valueOf(dir.getINode4Write(target),
1699            target);
1700        if(trgInode.isUnderConstruction()) {
1701          throw new HadoopIllegalArgumentException("concat: target file "
1702              + target + " is under construction");
1703        }
1704        // per design target shouldn't be empty and all the blocks same size
1705        if(trgInode.numBlocks() == 0) {
1706          throw new HadoopIllegalArgumentException("concat: target file "
1707              + target + " is empty");
1708        }
1709        if (trgInode instanceof INodeFileWithSnapshot) {
1710          throw new HadoopIllegalArgumentException("concat: target file "
1711              + target + " is in a snapshot");
1712        }
1713    
1714        long blockSize = trgInode.getPreferredBlockSize();
1715    
1716        // check the end block to be full
1717        final BlockInfo last = trgInode.getLastBlock();
1718        if(blockSize != last.getNumBytes()) {
1719          throw new HadoopIllegalArgumentException("The last block in " + target
1720              + " is not full; last block size = " + last.getNumBytes()
1721              + " but file block size = " + blockSize);
1722        }
1723    
1724        si.add(trgInode);
1725        final short repl = trgInode.getFileReplication();
1726    
1727        // now check the srcs
1728        boolean endSrc = false; // final src file doesn't have to have full end block
1729        for(int i=0; i<srcs.length; i++) {
1730          String src = srcs[i];
1731          if(i==srcs.length-1)
1732            endSrc=true;
1733    
1734          final INodeFile srcInode = INodeFile.valueOf(dir.getINode4Write(src), src);
1735          if(src.isEmpty() 
1736              || srcInode.isUnderConstruction()
1737              || srcInode.numBlocks() == 0) {
1738            throw new HadoopIllegalArgumentException("concat: source file " + src
1739                + " is invalid or empty or underConstruction");
1740          }
1741    
1742          // check replication and blocks size
1743          if(repl != srcInode.getBlockReplication()) {
1744            throw new HadoopIllegalArgumentException("concat: the soruce file "
1745                + src + " and the target file " + target
1746                + " should have the same replication: source replication is "
1747                + srcInode.getBlockReplication()
1748                + " but target replication is " + repl);
1749          }
1750    
1751          //boolean endBlock=false;
1752          // verify that all the blocks are of the same length as target
1753          // should be enough to check the end blocks
1754          final BlockInfo[] srcBlocks = srcInode.getBlocks();
1755          int idx = srcBlocks.length-1;
1756          if(endSrc)
1757            idx = srcBlocks.length-2; // end block of endSrc is OK not to be full
1758          if(idx >= 0 && srcBlocks[idx].getNumBytes() != blockSize) {
1759            throw new HadoopIllegalArgumentException("concat: the soruce file "
1760                + src + " and the target file " + target
1761                + " should have the same blocks sizes: target block size is "
1762                + blockSize + " but the size of source block " + idx + " is "
1763                + srcBlocks[idx].getNumBytes());
1764          }
1765    
1766          si.add(srcInode);
1767        }
1768    
1769        // make sure no two files are the same
1770        if(si.size() < srcs.length+1) { // trg + srcs
1771          // it means at least two files are the same
1772          throw new HadoopIllegalArgumentException(
1773              "concat: at least two of the source files are the same");
1774        }
1775    
1776        if(NameNode.stateChangeLog.isDebugEnabled()) {
1777          NameNode.stateChangeLog.debug("DIR* NameSystem.concat: " + 
1778              Arrays.toString(srcs) + " to " + target);
1779        }
1780    
1781        dir.concat(target,srcs, logRetryCache);
1782      }
1783      
1784      /**
1785       * stores the modification and access time for this inode. 
1786       * The access time is precise upto an hour. The transaction, if needed, is
1787       * written to the edits log but is not flushed.
1788       */
1789      void setTimes(String src, long mtime, long atime) 
1790          throws IOException, UnresolvedLinkException {
1791        if (!isAccessTimeSupported() && atime != -1) {
1792          throw new IOException("Access time for hdfs is not configured. " +
1793                                " Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter.");
1794        }
1795        try {
1796          setTimesInt(src, mtime, atime);
1797        } catch (AccessControlException e) {
1798          logAuditEvent(false, "setTimes", src);
1799          throw e;
1800        }
1801      }
1802    
  /**
   * Locked implementation of {@link #setTimes}: resolves the path, checks
   * write access, and stores the new times on the inode.
   */
  private void setTimesInt(String src, long mtime, long atime) 
    throws IOException, UnresolvedLinkException {
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    // Fail fast before taking the lock if this NN cannot serve writes.
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set times " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);

      // Write access is required to set access and modification times
      if (isPermissionEnabled) {
        checkPathAccess(pc, src, FsAction.WRITE);
      }
      final INodesInPath iip = dir.getINodesInPath4Write(src);
      final INode inode = iip.getLastINode();
      if (inode != null) {
        // The 'true' flag presumably forces the update to be recorded —
        // see FSDirectory#setTimes for the exact semantics.
        dir.setTimes(src, inode, mtime, atime, true, iip.getLatestSnapshot());
        resultingStat = getAuditFileInfo(src, false);
      } else {
        throw new FileNotFoundException("File/Directory " + src + " does not exist.");
      }
    } finally {
      writeUnlock();
    }
    // NOTE(review): unlike the other mutators in this class there is no
    // getEditLog().logSync() here — confirm whether that is intentional.
    logAuditEvent(true, "setTimes", src, null, resultingStat);
  }
1832    
1833      /**
1834       * Create a symbolic link.
1835       */
1836      @SuppressWarnings("deprecation")
1837      void createSymlink(String target, String link,
1838          PermissionStatus dirPerms, boolean createParent) 
1839          throws IOException, UnresolvedLinkException {
1840        if (!FileSystem.isSymlinksEnabled()) {
1841          throw new UnsupportedOperationException("Symlinks not supported");
1842        }
1843        if (!DFSUtil.isValidName(link)) {
1844          throw new InvalidPathException("Invalid link name: " + link);
1845        }
1846        if (FSDirectory.isReservedName(target)) {
1847          throw new InvalidPathException("Invalid target name: " + target);
1848        }
1849        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
1850        if (cacheEntry != null && cacheEntry.isSuccess()) {
1851          return; // Return previous response
1852        }
1853        boolean success = false;
1854        try {
1855          createSymlinkInt(target, link, dirPerms, createParent, cacheEntry != null);
1856          success = true;
1857        } catch (AccessControlException e) {
1858          logAuditEvent(false, "createSymlink", link, target, null);
1859          throw e;
1860        } finally {
1861          RetryCache.setState(cacheEntry, success);
1862        }
1863      }
1864    
  /**
   * Locked implementation of {@link #createSymlink}: verifies the link
   * path is creatable, checks permissions and the inode limit, then adds
   * the symlink to the namespace.
   */
  private void createSymlinkInt(String target, String link,
      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache) 
      throws IOException, UnresolvedLinkException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target="
          + target + " link=" + link);
    }
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    // Fail fast before taking the lock if this NN cannot serve writes.
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(link);
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create symlink " + link);
      link = FSDirectory.resolvePath(link, pathComponents, dir);
      if (!createParent) {
        verifyParentDir(link);
      }
      if (!dir.isValidToCreate(link)) {
        throw new IOException("failed to create link " + link 
            +" either because the filename is invalid or the file exists");
      }
      if (isPermissionEnabled) {
        checkAncestorAccess(pc, link, FsAction.WRITE);
      }
      // validate that we have enough inodes.
      checkFsObjectLimit();

      // add symbolic link to namespace
      dir.addSymlink(link, target, dirPerms, createParent, logRetryCache);
      resultingStat = getAuditFileInfo(link, false);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the lock to keep the critical section short.
    getEditLog().logSync();
    logAuditEvent(true, "createSymlink", link, target, resultingStat);
  }
1903    
1904      /**
1905       * Set replication for an existing file.
1906       * 
1907       * The NameNode sets new replication and schedules either replication of 
1908       * under-replicated data blocks or removal of the excessive block copies 
1909       * if the blocks are over-replicated.
1910       * 
1911       * @see ClientProtocol#setReplication(String, short)
1912       * @param src file name
1913       * @param replication new replication
1914       * @return true if successful; 
1915       *         false if file does not exist or is a directory
1916       */
1917      boolean setReplication(final String src, final short replication)
1918          throws IOException {
1919        try {
1920          return setReplicationInt(src, replication);
1921        } catch (AccessControlException e) {
1922          logAuditEvent(false, "setReplication", src);
1923          throw e;
1924        }
1925      }
1926    
  /**
   * Locked implementation of {@link #setReplication}: validates the new
   * value, applies it, and lets the block manager schedule re-replication
   * or replica removal as needed.
   */
  private boolean setReplicationInt(String src, final short replication)
      throws IOException {
    // Validate the requested replication factor before taking the lock.
    blockManager.verifyReplication(src, replication, null);
    final boolean isFile;
    FSPermissionChecker pc = getPermissionChecker();
    // Fail fast before taking the lock if this NN cannot serve writes.
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check under the lock: HA state may have changed since above.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set replication for " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      if (isPermissionEnabled) {
        checkPathAccess(pc, src, FsAction.WRITE);
      }

      final short[] blockRepls = new short[2]; // 0: old, 1: new
      final Block[] blocks = dir.setReplication(src, replication, blockRepls);
      // A null block array means src did not resolve to a file.
      isFile = blocks != null;
      if (isFile) {
        blockManager.setReplication(blockRepls[0], blockRepls[1], src, blocks);
      }
    } finally {
      writeUnlock();
    }

    // Sync the edit log outside the lock to keep the critical section short.
    getEditLog().logSync();
    if (isFile) {
      logAuditEvent(true, "setReplication", src);
    }
    return isFile;
  }
1959    
  /**
   * @return the preferred block size of the given file.
   * @throws IOException if the path cannot be resolved to a file
   */
  long getPreferredBlockSize(String filename) 
      throws IOException, UnresolvedLinkException {
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(filename);
    readLock();
    try {
      // Re-check under the lock: HA state may have changed since above.
      checkOperation(OperationCategory.READ);
      filename = FSDirectory.resolvePath(filename, pathComponents, dir);
      // Traverse access along the path is sufficient for this query.
      if (isPermissionEnabled) {
        checkTraverse(pc, filename);
      }
      return dir.getPreferredBlockSize(filename);
    } finally {
      readUnlock();
    }
  }
1977    
1978      /**
1979       * Verify that parent directory of src exists.
1980       */
1981      private void verifyParentDir(String src) throws FileNotFoundException,
1982          ParentNotDirectoryException, UnresolvedLinkException {
1983        assert hasReadLock();
1984        Path parent = new Path(src).getParent();
1985        if (parent != null) {
1986          final INode parentNode = dir.getINode(parent.toString());
1987          if (parentNode == null) {
1988            throw new FileNotFoundException("Parent directory doesn't exist: "
1989                + parent);
1990          } else if (!parentNode.isDirectory() && !parentNode.isSymlink()) {
1991            throw new ParentNotDirectoryException("Parent path is not a directory: "
1992                + parent);
1993          }
1994        }
1995      }
1996      
1997      /**
1998       * Create a new file entry in the namespace.
1999       * 
2000       * For description of parameters and exceptions thrown see
2001       * {@link ClientProtocol#create()}, except it returns valid file status upon
2002       * success
2003       * 
2004       * For retryCache handling details see -
2005       * {@link #getFileStatus(boolean, CacheEntryWithPayload)}
2006       * 
2007       */
2008      HdfsFileStatus startFile(String src, PermissionStatus permissions,
2009          String holder, String clientMachine, EnumSet<CreateFlag> flag,
2010          boolean createParent, short replication, long blockSize)
2011          throws AccessControlException, SafeModeException,
2012          FileAlreadyExistsException, UnresolvedLinkException,
2013          FileNotFoundException, ParentNotDirectoryException, IOException {
2014        HdfsFileStatus status = null;
2015        CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2016            null);
2017        if (cacheEntry != null && cacheEntry.isSuccess()) {
2018          return (HdfsFileStatus) cacheEntry.getPayload();
2019        }
2020        
2021        try {
2022          status = startFileInt(src, permissions, holder, clientMachine, flag,
2023              createParent, replication, blockSize, cacheEntry != null);
2024        } catch (AccessControlException e) {
2025          logAuditEvent(false, "create", src);
2026          throw e;
2027        } finally {
2028          RetryCache.setState(cacheEntry, status != null, status);
2029        }
2030        return status;
2031      }
2032    
2033      private HdfsFileStatus startFileInt(String src, PermissionStatus permissions,
2034          String holder, String clientMachine, EnumSet<CreateFlag> flag,
2035          boolean createParent, short replication, long blockSize,
2036          boolean logRetryCache) throws AccessControlException, SafeModeException,
2037          FileAlreadyExistsException, UnresolvedLinkException,
2038          FileNotFoundException, ParentNotDirectoryException, IOException {
2039        if (NameNode.stateChangeLog.isDebugEnabled()) {
2040          NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: src=" + src
2041              + ", holder=" + holder
2042              + ", clientMachine=" + clientMachine
2043              + ", createParent=" + createParent
2044              + ", replication=" + replication
2045              + ", createFlag=" + flag.toString());
2046        }
2047        if (!DFSUtil.isValidName(src)) {
2048          throw new InvalidPathException(src);
2049        }
2050        blockManager.verifyReplication(src, replication, clientMachine);
2051    
2052        boolean skipSync = false;
2053        HdfsFileStatus stat = null;
2054        FSPermissionChecker pc = getPermissionChecker();
2055        checkOperation(OperationCategory.WRITE);
2056        if (blockSize < minBlockSize) {
2057          throw new IOException("Specified block size is less than configured" +
2058              " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
2059              + "): " + blockSize + " < " + minBlockSize);
2060        }
2061        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2062        boolean create = flag.contains(CreateFlag.CREATE);
2063        boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
2064        writeLock();
2065        try {
2066          checkOperation(OperationCategory.WRITE);
2067          checkNameNodeSafeMode("Cannot create file" + src);
2068          src = FSDirectory.resolvePath(src, pathComponents, dir);
2069          startFileInternal(pc, src, permissions, holder, clientMachine, create,
2070              overwrite, createParent, replication, blockSize, logRetryCache);
2071          stat = dir.getFileInfo(src, false);
2072        } catch (StandbyException se) {
2073          skipSync = true;
2074          throw se;
2075        } finally {
2076          writeUnlock();
2077          // There might be transactions logged while trying to recover the lease.
2078          // They need to be sync'ed even when an exception was thrown.
2079          if (!skipSync) {
2080            getEditLog().logSync();
2081          }
2082        } 
2083        logAuditEvent(true, "create", src, null, stat);
2084        return stat;
2085      }
2086    
2087      /**
2088       * Create a new file or overwrite an existing file<br>
2089       * 
2090       * Once the file is create the client then allocates a new block with the next
2091       * call using {@link NameNode#addBlock()}.
2092       * <p>
2093       * For description of parameters and exceptions thrown see
2094       * {@link ClientProtocol#create()}
2095       */
2096      private void startFileInternal(FSPermissionChecker pc, String src,
2097          PermissionStatus permissions, String holder, String clientMachine,
2098          boolean create, boolean overwrite, boolean createParent,
2099          short replication, long blockSize, boolean logRetryEntry)
2100          throws FileAlreadyExistsException, AccessControlException,
2101          UnresolvedLinkException, FileNotFoundException,
2102          ParentNotDirectoryException, IOException {
2103        assert hasWriteLock();
2104        // Verify that the destination does not exist as a directory already.
2105        final INodesInPath iip = dir.getINodesInPath4Write(src);
2106        final INode inode = iip.getLastINode();
2107        if (inode != null && inode.isDirectory()) {
2108          throw new FileAlreadyExistsException("Cannot create file " + src
2109              + "; already exists as a directory.");
2110        }
2111        final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2112        if (isPermissionEnabled) {
2113          if (overwrite && myFile != null) {
2114            checkPathAccess(pc, src, FsAction.WRITE);
2115          } else {
2116            checkAncestorAccess(pc, src, FsAction.WRITE);
2117          }
2118        }
2119    
2120        if (!createParent) {
2121          verifyParentDir(src);
2122        }
2123    
2124        try {
2125          if (myFile == null) {
2126            if (!create) {
2127              throw new FileNotFoundException("failed to overwrite non-existent file "
2128                + src + " on client " + clientMachine);
2129            }
2130          } else {
2131            if (overwrite) {
2132              try {
2133                deleteInt(src, true, false); // File exists - delete if overwrite
2134              } catch (AccessControlException e) {
2135                logAuditEvent(false, "delete", src);
2136                throw e;
2137              }
2138            } else {
2139              // If lease soft limit time is expired, recover the lease
2140              recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2141              throw new FileAlreadyExistsException("failed to create file " + src
2142                  + " on client " + clientMachine + " because the file exists");
2143            }
2144          }
2145    
2146          checkFsObjectLimit();
2147          final DatanodeDescriptor clientNode = 
2148              blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2149    
2150          INodeFileUnderConstruction newNode = dir.addFile(src, permissions,
2151              replication, blockSize, holder, clientMachine, clientNode);
2152          if (newNode == null) {
2153            throw new IOException("DIR* NameSystem.startFile: " +
2154                                  "Unable to add file to namespace.");
2155          }
2156          leaseManager.addLease(newNode.getClientName(), src);
2157    
2158          // record file record in log, record new generation stamp
2159          getEditLog().logOpenFile(src, newNode, logRetryEntry);
2160          if (NameNode.stateChangeLog.isDebugEnabled()) {
2161            NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: "
2162                                       +"add "+src+" to namespace for "+holder);
2163          }
2164        } catch (IOException ie) {
2165          NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: "
2166                                       +ie.getMessage());
2167          throw ie;
2168        }
2169      }
2170      
2171      /**
2172       * Append to an existing file for append.
2173       * <p>
2174       * 
2175       * The method returns the last block of the file if this is a partial block,
2176       * which can still be used for writing more data. The client uses the returned
2177       * block locations to form the data pipeline for this block.<br>
2178       * The method returns null if the last block is full. The client then
2179       * allocates a new block with the next call using {@link NameNode#addBlock()}.
2180       * <p>
2181       * 
2182       * For description of parameters and exceptions thrown see
2183       * {@link ClientProtocol#append(String, String)}
2184       * 
2185       * @return the last block locations if the block is partial or null otherwise
2186       */
2187      private LocatedBlock appendFileInternal(FSPermissionChecker pc, String src,
2188          String holder, String clientMachine, boolean logRetryCache)
2189          throws AccessControlException, UnresolvedLinkException,
2190          FileNotFoundException, IOException {
2191        assert hasWriteLock();
2192        // Verify that the destination does not exist as a directory already.
2193        final INodesInPath iip = dir.getINodesInPath4Write(src);
2194        final INode inode = iip.getLastINode();
2195        if (inode != null && inode.isDirectory()) {
2196          throw new FileAlreadyExistsException("Cannot append to directory " + src
2197              + "; already exists as a directory.");
2198        }
2199        if (isPermissionEnabled) {
2200          checkPathAccess(pc, src, FsAction.WRITE);
2201        }
2202    
2203        try {
2204          if (inode == null) {
2205            throw new FileNotFoundException("failed to append to non-existent file "
2206              + src + " on client " + clientMachine);
2207          }
2208          INodeFile myFile = INodeFile.valueOf(inode, src, true);
2209          // Opening an existing file for write - may need to recover lease.
2210          recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2211          
2212          // recoverLeaseInternal may create a new InodeFile via 
2213          // finalizeINodeFileUnderConstruction so we need to refresh 
2214          // the referenced file.  
2215          myFile = INodeFile.valueOf(dir.getINode(src), src, true);
2216          
2217          final DatanodeDescriptor clientNode = 
2218              blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2219          return prepareFileForWrite(src, myFile, holder, clientMachine, clientNode,
2220              true, iip.getLatestSnapshot(), logRetryCache);
2221        } catch (IOException ie) {
2222          NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2223          throw ie;
2224        }
2225      }
2226      
2227      /**
2228       * Replace current node with a INodeUnderConstruction.
2229       * Recreate in-memory lease record.
2230       * 
2231       * @param src path to the file
2232       * @param file existing file object
2233       * @param leaseHolder identifier of the lease holder on this file
2234       * @param clientMachine identifier of the client machine
2235       * @param clientNode if the client is collocated with a DN, that DN's descriptor
2236       * @param writeToEditLog whether to persist this change to the edit log
2237       * @param logRetryCache whether to record RPC ids in editlog for retry cache
2238       *                      rebuilding
2239       * @return the last block locations if the block is partial or null otherwise
2240       * @throws UnresolvedLinkException
2241       * @throws IOException
2242       */
2243      LocatedBlock prepareFileForWrite(String src, INodeFile file,
2244          String leaseHolder, String clientMachine, DatanodeDescriptor clientNode,
2245          boolean writeToEditLog, Snapshot latestSnapshot, boolean logRetryCache)
2246          throws IOException {
2247        file = file.recordModification(latestSnapshot, dir.getINodeMap());
2248        final INodeFileUnderConstruction cons = file.toUnderConstruction(
2249            leaseHolder, clientMachine, clientNode);
2250    
2251        dir.replaceINodeFile(src, file, cons);
2252        leaseManager.addLease(cons.getClientName(), src);
2253        
2254        LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons);
2255        if (writeToEditLog) {
2256          getEditLog().logOpenFile(src, cons, logRetryCache);
2257        }
2258        return ret;
2259      }
2260    
2261      /**
2262       * Recover lease;
2263       * Immediately revoke the lease of the current lease holder and start lease
2264       * recovery so that the file can be forced to be closed.
2265       * 
2266       * @param src the path of the file to start lease recovery
2267       * @param holder the lease holder's name
2268       * @param clientMachine the client machine's name
2269       * @return true if the file is already closed
2270       * @throws IOException
2271       */
2272      boolean recoverLease(String src, String holder, String clientMachine)
2273          throws IOException {
2274        if (!DFSUtil.isValidName(src)) {
2275          throw new IOException("Invalid file name: " + src);
2276        }
2277      
2278        boolean skipSync = false;
2279        FSPermissionChecker pc = getPermissionChecker();
2280        checkOperation(OperationCategory.WRITE);
2281        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2282        writeLock();
2283        try {
2284          checkOperation(OperationCategory.WRITE);
2285          checkNameNodeSafeMode("Cannot recover the lease of " + src);
2286          src = FSDirectory.resolvePath(src, pathComponents, dir);
2287          final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src);
2288          if (!inode.isUnderConstruction()) {
2289            return true;
2290          }
2291          if (isPermissionEnabled) {
2292            checkPathAccess(pc, src, FsAction.WRITE);
2293          }
2294      
2295          recoverLeaseInternal(inode, src, holder, clientMachine, true);
2296        } catch (StandbyException se) {
2297          skipSync = true;
2298          throw se;
2299        } finally {
2300          writeUnlock();
2301          // There might be transactions logged while trying to recover the lease.
2302          // They need to be sync'ed even when an exception was thrown.
2303          if (!skipSync) {
2304            getEditLog().logSync();
2305          }
2306        }
2307        return false;
2308      }
2309    
  /**
   * Check the lease state of an under-construction file and, depending on
   * {@code force}, either release the lease immediately or recover it only
   * when the holder's soft limit has expired. A no-op when {@code fileInode}
   * is null or not under construction.
   * <p>
   * Caller must hold the FSNamesystem write lock.
   *
   * @param fileInode the file whose lease is examined (may be null)
   * @param src path to the file
   * @param holder the client requesting create/append/recovery
   * @param clientMachine identifier of the requesting client machine
   * @param force if true, release the lease now without waiting for the soft
   *              limit to expire
   * @throws IOException if the file cannot be (re)opened for this holder
   */
  private void recoverLeaseInternal(INodeFile fileInode, 
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    if (fileInode != null && fileInode.isUnderConstruction()) {
      INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) fileInode;
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);
      //
      // We found the lease for this file. And surprisingly the original
      // holder is trying to recreate this file. This should never occur.
      //
      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        // NOTE(review): the second clause compares lease.getHolder() against
        // the same holder used to fetch the lease, so it appears to always
        // hold when lease != null — confirm intent before changing.
        if ((leaseFile != null && leaseFile.equals(lease)) ||
            lease.getHolder().equals(holder)) { 
          throw new AlreadyBeingCreatedException(
            "failed to create file " + src + " for " + holder +
            " on client " + clientMachine + 
            " because current leaseholder is trying to recreate file.");
        }
      }
      //
      // Find the original holder.
      //
      lease = leaseManager.getLease(pendingFile.getClientName());
      if (lease == null) {
        throw new AlreadyBeingCreatedException(
          "failed to create file " + src + " for " + holder +
          " on client " + clientMachine + 
          " because pendingCreates is non-null but no leases found.");
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + pendingFile.getClientName());
        internalReleaseLease(lease, src, holder);
      } else {
        assert lease.getHolder().equals(pendingFile.getClientName()) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + pendingFile.getClientName();
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + pendingFile.getClientName());
          boolean isClosed = internalReleaseLease(lease, src, null);
          if(!isClosed)
            throw new RecoveryInProgressException(
                "Failed to close file " + src +
                ". Lease recovery is in progress. Try again later.");
        } else {
          // Soft limit not expired: the original writer still owns the file.
          final BlockInfo lastBlock = pendingFile.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException("Recovery in progress, file ["
                + src + "], " + "lease owner [" + lease.getHolder() + "]");
          } else {
            throw new AlreadyBeingCreatedException("Failed to create file ["
                + src + "] for [" + holder + "] on client [" + clientMachine
                + "], because this file is already being created by ["
                + pendingFile.getClientName() + "] on ["
                + pendingFile.getClientMachine() + "]");
          }
        }
      }
    }
  }
2384    
2385      /**
2386       * Append to an existing file in the namespace.
2387       */
2388      LocatedBlock appendFile(String src, String holder, String clientMachine)
2389          throws AccessControlException, SafeModeException,
2390          FileAlreadyExistsException, FileNotFoundException,
2391          ParentNotDirectoryException, IOException {
2392        LocatedBlock lb = null;
2393        CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2394            null);
2395        if (cacheEntry != null && cacheEntry.isSuccess()) {
2396          return (LocatedBlock) cacheEntry.getPayload();
2397        }
2398          
2399        boolean success = false;
2400        try {
2401          lb = appendFileInt(src, holder, clientMachine, cacheEntry != null);
2402          success = true;
2403          return lb;
2404        } catch (AccessControlException e) {
2405          logAuditEvent(false, "append", src);
2406          throw e;
2407        } finally {
2408          RetryCache.setState(cacheEntry, success, lb);
2409        }
2410      }
2411    
2412      private LocatedBlock appendFileInt(String src, String holder,
2413          String clientMachine, boolean logRetryCache)
2414          throws AccessControlException, SafeModeException,
2415          FileAlreadyExistsException, FileNotFoundException,
2416          ParentNotDirectoryException, IOException {
2417        if (NameNode.stateChangeLog.isDebugEnabled()) {
2418          NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: src=" + src
2419              + ", holder=" + holder
2420              + ", clientMachine=" + clientMachine);
2421        }
2422        boolean skipSync = false;
2423        if (!supportAppends) {
2424          throw new UnsupportedOperationException(
2425              "Append is not enabled on this NameNode. Use the " +
2426              DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
2427        }
2428    
2429        LocatedBlock lb = null;
2430        FSPermissionChecker pc = getPermissionChecker();
2431        checkOperation(OperationCategory.WRITE);
2432        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2433        writeLock();
2434        try {
2435          checkOperation(OperationCategory.WRITE);
2436          checkNameNodeSafeMode("Cannot append to file" + src);
2437          src = FSDirectory.resolvePath(src, pathComponents, dir);
2438          lb = appendFileInternal(pc, src, holder, clientMachine, logRetryCache);
2439        } catch (StandbyException se) {
2440          skipSync = true;
2441          throw se;
2442        } finally {
2443          writeUnlock();
2444          // There might be transactions logged while trying to recover the lease.
2445          // They need to be sync'ed even when an exception was thrown.
2446          if (!skipSync) {
2447            getEditLog().logSync();
2448          }
2449        }
2450        if (lb != null) {
2451          if (NameNode.stateChangeLog.isDebugEnabled()) {
2452            NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: file "
2453                +src+" for "+holder+" at "+clientMachine
2454                +" block " + lb.getBlock()
2455                +" block size " + lb.getBlock().getNumBytes());
2456          }
2457        }
2458        logAuditEvent(true, "append", src);
2459        return lb;
2460      }
2461    
2462      ExtendedBlock getExtendedBlock(Block blk) {
2463        return new ExtendedBlock(blockPoolId, blk);
2464      }
2465      
2466      void setBlockPoolId(String bpid) {
2467        blockPoolId = bpid;
2468        blockManager.setBlockPoolId(blockPoolId);
2469      }
2470    
2471      /**
2472       * The client would like to obtain an additional block for the indicated
2473       * filename (which is being written-to).  Return an array that consists
2474       * of the block, plus a set of machines.  The first on this list should
2475       * be where the client writes data.  Subsequent items in the list must
2476       * be provided in the connection to the first datanode.
2477       *
2478       * Make sure the previous blocks have been reported by datanodes and
2479       * are replicated.  Will return an empty 2-elt array if we want the
2480       * client to "try again later".
2481       */
2482      LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
2483          ExtendedBlock previous, HashMap<Node, Node> excludedNodes, 
2484          List<String> favoredNodes)
2485          throws LeaseExpiredException, NotReplicatedYetException,
2486          QuotaExceededException, SafeModeException, UnresolvedLinkException,
2487          IOException {
2488        long blockSize;
2489        int replication;
2490        DatanodeDescriptor clientNode = null;
2491    
2492        if(NameNode.stateChangeLog.isDebugEnabled()) {
2493          NameNode.stateChangeLog.debug(
2494              "BLOCK* NameSystem.getAdditionalBlock: file "
2495              +src+" for "+clientName);
2496        }
2497    
2498        // Part I. Analyze the state of the file with respect to the input data.
2499        checkOperation(OperationCategory.READ);
2500        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2501        readLock();
2502        try {
2503          checkOperation(OperationCategory.READ);
2504          src = FSDirectory.resolvePath(src, pathComponents, dir);
2505          LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2506          final INode[] inodes = analyzeFileState(
2507              src, fileId, clientName, previous, onRetryBlock).getINodes();
2508          final INodeFileUnderConstruction pendingFile =
2509              (INodeFileUnderConstruction) inodes[inodes.length - 1].asFile();
2510    
2511          if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
2512            // This is a retry. Just return the last block if having locations.
2513            return onRetryBlock[0];
2514          }
2515          if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
2516            throw new IOException("File has reached the limit on maximum number of"
2517                + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
2518                + "): " + pendingFile.getBlocks().length + " >= "
2519                + maxBlocksPerFile);
2520          }
2521          blockSize = pendingFile.getPreferredBlockSize();
2522          clientNode = pendingFile.getClientNode();
2523          replication = pendingFile.getFileReplication();
2524        } finally {
2525          readUnlock();
2526        }
2527    
2528        // choose targets for the new block to be allocated.
2529        final DatanodeDescriptor targets[] = getBlockManager().chooseTarget( 
2530            src, replication, clientNode, excludedNodes, blockSize, favoredNodes);
2531    
2532        // Part II.
2533        // Allocate a new block, add it to the INode and the BlocksMap. 
2534        Block newBlock = null;
2535        long offset;
2536        checkOperation(OperationCategory.WRITE);
2537        writeLock();
2538        try {
2539          checkOperation(OperationCategory.WRITE);
2540          // Run the full analysis again, since things could have changed
2541          // while chooseTarget() was executing.
2542          LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2543          INodesInPath inodesInPath =
2544              analyzeFileState(src, fileId, clientName, previous, onRetryBlock);
2545          final INode[] inodes = inodesInPath.getINodes();
2546          final INodeFileUnderConstruction pendingFile =
2547              (INodeFileUnderConstruction) inodes[inodes.length - 1].asFile();
2548    
2549          if (onRetryBlock[0] != null) {
2550            if (onRetryBlock[0].getLocations().length > 0) {
2551              // This is a retry. Just return the last block if having locations.
2552              return onRetryBlock[0];
2553            } else {
2554              // add new chosen targets to already allocated block and return
2555              BlockInfo lastBlockInFile = pendingFile.getLastBlock();
2556              ((BlockInfoUnderConstruction) lastBlockInFile)
2557                  .setExpectedLocations(targets);
2558              offset = pendingFile.computeFileSize();
2559              return makeLocatedBlock(lastBlockInFile, targets, offset);
2560            }
2561          }
2562    
2563          // commit the last block and complete it if it has minimum replicas
2564          commitOrCompleteLastBlock(pendingFile,
2565                                    ExtendedBlock.getLocalBlock(previous));
2566    
2567          // allocate new block, record block locations in INode.
2568          newBlock = createNewBlock();
2569          saveAllocatedBlock(src, inodesInPath, newBlock, targets);
2570    
2571          dir.persistBlocks(src, pendingFile, false);
2572          offset = pendingFile.computeFileSize();
2573        } finally {
2574          writeUnlock();
2575        }
2576        getEditLog().logSync();
2577    
2578        // Return located block
2579        return makeLocatedBlock(newBlock, targets, offset);
2580      }
2581    
  /**
   * Validate the state of an under-construction file for a block allocation
   * request: checks safe mode, fs object limits, the lease, and whether the
   * client's reported previous block matches the file's actual last block.
   * Detects addBlock retries; when a retry's block was already allocated, the
   * block is returned through {@code onRetryBlock[0]}.
   * <p>
   * Caller must hold at least the FSNamesystem read lock.
   *
   * @param onRetryBlock output parameter; set to the previously allocated last
   *                     block when the call is recognized as a retry
   * @return the resolved {@link INodesInPath} for {@code src}
   * @throws IOException if the request is inconsistent with the file state
   */
  INodesInPath analyzeFileState(String src,
                                long fileId,
                                String clientName,
                                ExtendedBlock previous,
                                LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();

    checkBlock(previous);
    onRetryBlock[0] = null;
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INodesInPath iip = dir.getINodesInPath4Write(src);
    final INodeFileUnderConstruction pendingFile
        = checkLease(src, fileId, clientName, iip.getLastINode());
    BlockInfo lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at a block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() == pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        if (NameNode.stateChangeLog.isDebugEnabled()) {
           NameNode.stateChangeLog.debug(
               "BLOCK* NameSystem.allocateBlock: handling block allocation" +
               " writing to a file with a complete previous block: src=" +
               src + " lastBlock=" + lastBlockInFile);
        }
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoUnderConstruction)lastBlockInFile).getExpectedLocations(),
            offset);
        return iip;
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }

    // Check if the penultimate block is minimally replicated
    if (!checkFileProgress(pendingFile, false)) {
      throw new NotReplicatedYetException("Not replicated yet: " + src);
    }
    return iip;
  }
2674    
2675      LocatedBlock makeLocatedBlock(Block blk,
2676                                            DatanodeInfo[] locs,
2677                                            long offset) throws IOException {
2678        LocatedBlock lBlk = new LocatedBlock(
2679            getExtendedBlock(blk), locs, offset);
2680        getBlockManager().setBlockToken(
2681            lBlk, BlockTokenSecretManager.AccessMode.WRITE);
2682        return lBlk;
2683      }
2684    
  /**
   * Choose additional datanodes to replace failed nodes in a write pipeline.
   * Reads the file's client node and preferred block size under the read
   * lock, then asks the block placement policy for new targets outside it.
   *
   * @see NameNode#getAdditionalDatanode(String, ExtendedBlock, DatanodeInfo[], DatanodeInfo[], int, String)
   */
  LocatedBlock getAdditionalDatanode(String src, final ExtendedBlock blk,
      final DatanodeInfo[] existings,  final HashMap<Node, Node> excludes,
      final int numAdditionalNodes, final String clientName
      ) throws IOException {
    //check if the feature is enabled
    dtpReplaceDatanodeOnFailure.checkEnabled();

    final DatanodeDescriptor clientnode;
    final long preferredblocksize;
    final List<DatanodeDescriptor> chosen;
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      //check safe mode
      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
      src = FSDirectory.resolvePath(src, pathComponents, dir);

      //check lease
      final INodeFileUnderConstruction file = checkLease(src, clientName);
      clientnode = file.getClientNode();
      preferredblocksize = file.getPreferredBlockSize();

      //find datanode descriptors
      // Unknown (e.g. dead/removed) existing nodes are silently dropped.
      chosen = new ArrayList<DatanodeDescriptor>();
      for(DatanodeInfo d : existings) {
        final DatanodeDescriptor descriptor = blockManager.getDatanodeManager(
            ).getDatanode(d);
        if (descriptor != null) {
          chosen.add(descriptor);
        }
      }
    } finally {
      readUnlock();
    }

    // choose new datanodes.
    final DatanodeInfo[] targets = blockManager.getBlockPlacementPolicy(
        ).chooseTarget(src, numAdditionalNodes, clientnode, chosen, true,
        excludes, preferredblocksize);
    final LocatedBlock lb = new LocatedBlock(blk, targets);
    blockManager.setBlockToken(lb, AccessMode.COPY);
    return lb;
  }
2731    
2732      /**
2733       * The client would like to let go of the given block
2734       */
2735      boolean abandonBlock(ExtendedBlock b, String src, String holder)
2736          throws LeaseExpiredException, FileNotFoundException,
2737          UnresolvedLinkException, IOException {
2738        if(NameNode.stateChangeLog.isDebugEnabled()) {
2739          NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b
2740              + "of file " + src);
2741        }
2742        checkOperation(OperationCategory.WRITE);
2743        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2744        writeLock();
2745        try {
2746          checkOperation(OperationCategory.WRITE);
2747          checkNameNodeSafeMode("Cannot abandon block " + b + " for fle" + src);
2748          src = FSDirectory.resolvePath(src, pathComponents, dir);
2749    
2750          //
2751          // Remove the block from the pending creates list
2752          //
2753          INodeFileUnderConstruction file = checkLease(src, holder);
2754          boolean removed = dir.removeBlock(src, file,
2755              ExtendedBlock.getLocalBlock(b));
2756          if (!removed) {
2757            return true;
2758          }
2759          if(NameNode.stateChangeLog.isDebugEnabled()) {
2760            NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
2761                                          + b + " is removed from pendingCreates");
2762          }
2763          dir.persistBlocks(src, file, false);
2764        } finally {
2765          writeUnlock();
2766        }
2767        getEditLog().logSync();
2768    
2769        return true;
2770      }
2771      
2772      /** make sure that we still have the lease on this file. */
2773      private INodeFileUnderConstruction checkLease(String src, String holder)
2774          throws LeaseExpiredException, UnresolvedLinkException,
2775          FileNotFoundException {
2776        return checkLease(src, INodeId.GRANDFATHER_INODE_ID, holder,
2777            dir.getINode(src));
2778      }
2779      
2780      private INodeFileUnderConstruction checkLease(String src, long fileId,
2781          String holder, INode inode) throws LeaseExpiredException,
2782          FileNotFoundException {
2783        assert hasReadLock();
2784        if (inode == null || !inode.isFile()) {
2785          Lease lease = leaseManager.getLease(holder);
2786          throw new LeaseExpiredException(
2787              "No lease on " + src + ": File does not exist. "
2788              + (lease != null ? lease.toString()
2789                  : "Holder " + holder + " does not have any open files."));
2790        }
2791        final INodeFile file = inode.asFile();
2792        if (!file.isUnderConstruction()) {
2793          Lease lease = leaseManager.getLease(holder);
2794          throw new LeaseExpiredException(
2795              "No lease on " + src + ": File is not open for writing. "
2796              + (lease != null ? lease.toString()
2797                  : "Holder " + holder + " does not have any open files."));
2798        }
2799        INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)file;
2800        if (holder != null && !pendingFile.getClientName().equals(holder)) {
2801          throw new LeaseExpiredException("Lease mismatch on " + src + " owned by "
2802              + pendingFile.getClientName() + " but is accessed by " + holder);
2803        }
2804        INodeId.checkId(fileId, pendingFile);
2805        return pendingFile;
2806      }
2807     
2808      /**
2809       * Complete in-progress write to the given file.
2810       * @return true if successful, false if the client should continue to retry
2811       *         (e.g if not all blocks have reached minimum replication yet)
2812       * @throws IOException on error (eg lease mismatch, file not open, file deleted)
2813       */
2814      boolean completeFile(String src, String holder,
2815                           ExtendedBlock last, long fileId)
2816        throws SafeModeException, UnresolvedLinkException, IOException {
2817        if (NameNode.stateChangeLog.isDebugEnabled()) {
2818          NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
2819              src + " for " + holder);
2820        }
2821        checkBlock(last);
2822        boolean success = false;
2823        checkOperation(OperationCategory.WRITE);
2824        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2825        writeLock();
2826        try {
2827          checkOperation(OperationCategory.WRITE);
2828          checkNameNodeSafeMode("Cannot complete file " + src);
2829          src = FSDirectory.resolvePath(src, pathComponents, dir);
2830          success = completeFileInternal(src, holder,
2831            ExtendedBlock.getLocalBlock(last), fileId);
2832        } finally {
2833          writeUnlock();
2834        }
2835        getEditLog().logSync();
2836        NameNode.stateChangeLog.info("DIR* completeFile: " + src + " is closed by "
2837            + holder);
2838        return success;
2839      }
2840    
  /**
   * Lock-held core of {@link #completeFile}: verifies the lease, commits the
   * last block, and finalizes the file once all blocks have enough replicas.
   *
   * @return true if the file was closed (or an idempotent retry of a
   *         previously-completed close was detected); false if the client
   *         should retry because blocks are still under-replicated
   */
  private boolean completeFileInternal(String src, 
      String holder, Block last, long fileId) throws SafeModeException,
      UnresolvedLinkException, IOException {
    assert hasWriteLock();
    final INodesInPath iip = dir.getLastINodeInPath(src);
    final INodeFileUnderConstruction pendingFile;
    try {
      pendingFile = checkLease(src, fileId, holder, iip.getINode(0));
    } catch (LeaseExpiredException lee) {
      final INode inode = dir.getINode(src);
      if (inode != null
          && inode.isFile()
          && !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete " + src +
              " which is already closed. But, it appears to be an RPC " +
              "retry. Returning success");
          return true;
        }
      }
      // Not a recognizable retry: surface the original lease failure.
      throw lee;
    }
    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, last);

    // If any block is still short of minimum replication, tell the client
    // to retry rather than closing the file prematurely.
    if (!checkFileProgress(pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        iip.getLatestSnapshot());
    return true;
  }
2882    
2883      /**
2884       * Save allocated block at the given pending filename
2885       * 
2886       * @param src path to the file
2887       * @param inodesInPath representing each of the components of src. 
2888       *                     The last INode is the INode for the file.
2889       * @throws QuotaExceededException If addition of block exceeds space quota
2890       */
2891      BlockInfo saveAllocatedBlock(String src, INodesInPath inodes,
2892          Block newBlock, DatanodeDescriptor targets[]) throws IOException {
2893        assert hasWriteLock();
2894        BlockInfo b = dir.addBlock(src, inodes, newBlock, targets);
2895        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + src + ". "
2896            + getBlockPoolId() + " " + b);
2897        for (DatanodeDescriptor dn : targets) {
2898          dn.incBlocksScheduled();
2899        }
2900        return b;
2901      }
2902    
2903      /**
2904       * Create new block with a unique block id and a new generation stamp.
2905       */
2906      Block createNewBlock() throws IOException {
2907        assert hasWriteLock();
2908        Block b = new Block(nextBlockId(), 0, 0);
2909        // Increment the generation stamp for every new block.
2910        b.setGenerationStamp(nextGenerationStamp(false));
2911        return b;
2912      }
2913    
2914      /**
2915       * Check that the indicated file's blocks are present and
2916       * replicated.  If not, return false. If checkall is true, then check
2917       * all blocks, otherwise check only penultimate block.
2918       */
2919      boolean checkFileProgress(INodeFile v, boolean checkall) {
2920        readLock();
2921        try {
2922          if (checkall) {
2923            //
2924            // check all blocks of the file.
2925            //
2926            for (BlockInfo block: v.getBlocks()) {
2927              if (!block.isComplete()) {
2928                LOG.info("BLOCK* checkFileProgress: " + block
2929                    + " has not reached minimal replication "
2930                    + blockManager.minReplication);
2931                return false;
2932              }
2933            }
2934          } else {
2935            //
2936            // check the penultimate block of this file
2937            //
2938            BlockInfo b = v.getPenultimateBlock();
2939            if (b != null && !b.isComplete()) {
2940              LOG.info("BLOCK* checkFileProgress: " + b
2941                  + " has not reached minimal replication "
2942                  + blockManager.minReplication);
2943              return false;
2944            }
2945          }
2946          return true;
2947        } finally {
2948          readUnlock();
2949        }
2950      }
2951    
2952      ////////////////////////////////////////////////////////////////
2953      // Here's how to handle block-copy failure during client write:
2954      // -- As usual, the client's write should result in a streaming
2955      // backup write to a k-machine sequence.
2956      // -- If one of the backup machines fails, no worries.  Fail silently.
2957      // -- Before client is allowed to close and finalize file, make sure
2958      // that the blocks are backed up.  Namenode may have to issue specific backup
2959      // commands to make up for earlier datanode failures.  Once all copies
2960      // are made, edit namespace and return to client.
2961      ////////////////////////////////////////////////////////////////
2962    
2963      /** 
2964       * Change the indicated filename. 
2965       * @deprecated Use {@link #renameTo(String, String, Options.Rename...)} instead.
2966       */
2967      @Deprecated
2968      boolean renameTo(String src, String dst) 
2969          throws IOException, UnresolvedLinkException {
2970        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
2971        if (cacheEntry != null && cacheEntry.isSuccess()) {
2972          return true; // Return previous response
2973        }
2974        boolean ret = false;
2975        try {
2976          ret = renameToInt(src, dst, cacheEntry != null);
2977        } catch (AccessControlException e) {
2978          logAuditEvent(false, "rename", src, dst, null);
2979          throw e;
2980        } finally {
2981          RetryCache.setState(cacheEntry, ret);
2982        }
2983        return ret;
2984      }
2985    
  /**
   * Validate, lock, and perform the (deprecated-semantics) rename of
   * {@code src} to {@code dst}, then sync the edit log and audit-log on
   * success.
   *
   * @param logRetryCache whether the edit should carry retry-cache info
   * @return true if the rename succeeded
   */
  private boolean renameToInt(String src, String dst, boolean logRetryCache) 
    throws IOException, UnresolvedLinkException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src +
          " to " + dst);
    }
    if (!DFSUtil.isValidName(dst)) {
      throw new IOException("Invalid name: " + dst);
    }
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
    boolean status = false;
    HdfsFileStatus resultingStat = null;
    writeLock();
    try {
      // re-check now that we hold the lock (HA state may have changed)
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot rename " + src);
      src = FSDirectory.resolvePath(src, srcComponents, dir);
      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
      // NOTE(review): this third checkOperation looks redundant with the one
      // three lines above — confirm before removing.
      checkOperation(OperationCategory.WRITE);
      status = renameToInternal(pc, src, dst, logRetryCache);
      if (status) {
        // capture the post-rename status for the audit log
        resultingStat = getAuditFileInfo(dst, false);
      }
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    if (status) {
      logAuditEvent(true, "rename", src, dst, resultingStat);
    }
    return status;
  }
3021    
3022      /** @deprecated See {@link #renameTo(String, String)} */
3023      @Deprecated
3024      private boolean renameToInternal(FSPermissionChecker pc, String src,
3025          String dst, boolean logRetryCache) throws IOException,
3026          UnresolvedLinkException {
3027        assert hasWriteLock();
3028        if (isPermissionEnabled) {
3029          //We should not be doing this.  This is move() not renameTo().
3030          //but for now,
3031          //NOTE: yes, this is bad!  it's assuming much lower level behavior
3032          //      of rewriting the dst
3033          String actualdst = dir.isDir(dst)?
3034              dst + Path.SEPARATOR + new Path(src).getName(): dst;
3035          // Rename does not operates on link targets
3036          // Do not resolveLink when checking permissions of src and dst
3037          // Check write access to parent of src
3038          checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
3039          // Check write access to ancestor of dst
3040          checkPermission(pc, actualdst, false, FsAction.WRITE, null, null, null,
3041              false);
3042        }
3043    
3044        if (dir.renameTo(src, dst, logRetryCache)) {
3045          return true;
3046        }
3047        return false;
3048      }
3049      
3050    
  /**
   * Rename src to dst with the given {@link Options.Rename} semantics
   * (e.g. OVERWRITE). Unlike the deprecated variant, failures are reported
   * by exception rather than a boolean.
   */
  void renameTo(String src, String dst, Options.Rename... options)
      throws IOException, UnresolvedLinkException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: with options - "
          + src + " to " + dst);
    }
    if (!DFSUtil.isValidName(dst)) {
      throw new InvalidPathException("Invalid name: " + dst);
    }
    final FSPermissionChecker pc = getPermissionChecker();
    
    checkOperation(OperationCategory.WRITE);
    // Idempotence: if a retried RPC already succeeded, return silently.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
    HdfsFileStatus resultingStat = null;
    boolean success = false;
    writeLock();
    try {
      // re-check under the lock (HA state may have changed)
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot rename " + src);
      src = FSDirectory.resolvePath(src, srcComponents, dir);
      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
      renameToInternal(pc, src, dst, cacheEntry != null, options);
      resultingStat = getAuditFileInfo(dst, false);
      success = true;
    } finally {
      writeUnlock();
      // record the outcome for future retries of this RPC
      RetryCache.setState(cacheEntry, success);
    }
    getEditLog().logSync();
    if (resultingStat != null) {
      // audit entry includes the rename options that were applied
      StringBuilder cmd = new StringBuilder("rename options=");
      for (Rename option : options) {
        cmd.append(option.value()).append(" ");
      }
      logAuditEvent(true, cmd.toString(), src, dst, resultingStat);
    }
  }
3094    
3095      private void renameToInternal(FSPermissionChecker pc, String src, String dst,
3096          boolean logRetryCache, Options.Rename... options) throws IOException {
3097        assert hasWriteLock();
3098        if (isPermissionEnabled) {
3099          // Rename does not operates on link targets
3100          // Do not resolveLink when checking permissions of src and dst
3101          // Check write access to parent of src
3102          checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
3103          // Check write access to ancestor of dst
3104          checkPermission(pc, dst, false, FsAction.WRITE, null, null, null, false);
3105        }
3106    
3107        dir.renameTo(src, dst, logRetryCache, options);
3108      }
3109      
3110      /**
3111       * Remove the indicated file from namespace.
3112       * 
3113       * @see ClientProtocol#delete(String, boolean) for detailed description and 
3114       * description of exceptions
3115       */
3116      boolean delete(String src, boolean recursive)
3117          throws AccessControlException, SafeModeException,
3118          UnresolvedLinkException, IOException {
3119        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3120        if (cacheEntry != null && cacheEntry.isSuccess()) {
3121          return true; // Return previous response
3122        }
3123        boolean ret = false;
3124        try {
3125          ret = deleteInt(src, recursive, cacheEntry != null);
3126        } catch (AccessControlException e) {
3127          logAuditEvent(false, "delete", src);
3128          throw e;
3129        } finally {
3130          RetryCache.setState(cacheEntry, ret);
3131        }
3132        return ret;
3133      }
3134          
3135      private boolean deleteInt(String src, boolean recursive, boolean logRetryCache)
3136          throws AccessControlException, SafeModeException,
3137          UnresolvedLinkException, IOException {
3138        if (NameNode.stateChangeLog.isDebugEnabled()) {
3139          NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src);
3140        }
3141        boolean status = deleteInternal(src, recursive, true, logRetryCache);
3142        if (status) {
3143          logAuditEvent(true, "delete", src);
3144        }
3145        return status;
3146      }
3147        
3148      private FSPermissionChecker getPermissionChecker()
3149          throws AccessControlException {
3150        try {
3151          return new FSPermissionChecker(fsOwnerShortUserName, supergroup, getRemoteUser());
3152        } catch (IOException ioe) {
3153          throw new AccessControlException(ioe);
3154        }
3155      }
3156      
3157      /**
3158       * Remove a file/directory from the namespace.
3159       * <p>
3160       * For large directories, deletion is incremental. The blocks under
3161       * the directory are collected and deleted a small number at a time holding
3162       * the {@link FSNamesystem} lock.
3163       * <p>
3164       * For small directory or file the deletion is done in one shot.
3165       * 
3166       * @see ClientProtocol#delete(String, boolean) for description of exceptions
3167       */
3168      private boolean deleteInternal(String src, boolean recursive,
3169          boolean enforcePermission, boolean logRetryCache)
3170          throws AccessControlException, SafeModeException, UnresolvedLinkException,
3171                 IOException {
3172        BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
3173        List<INode> removedINodes = new ArrayList<INode>();
3174        FSPermissionChecker pc = getPermissionChecker();
3175        checkOperation(OperationCategory.WRITE);
3176        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3177        boolean ret = false;
3178        writeLock();
3179        try {
3180          checkOperation(OperationCategory.WRITE);
3181          checkNameNodeSafeMode("Cannot delete " + src);
3182          src = FSDirectory.resolvePath(src, pathComponents, dir);
3183          if (!recursive && dir.isNonEmptyDirectory(src)) {
3184            throw new IOException(src + " is non empty");
3185          }
3186          if (enforcePermission && isPermissionEnabled) {
3187            checkPermission(pc, src, false, null, FsAction.WRITE, null,
3188                FsAction.ALL, false);
3189          }
3190          // Unlink the target directory from directory tree
3191          if (!dir.delete(src, collectedBlocks, removedINodes, logRetryCache)) {
3192            return false;
3193          }
3194          ret = true;
3195        } finally {
3196          writeUnlock();
3197        }
3198        getEditLog().logSync(); 
3199        removeBlocks(collectedBlocks); // Incremental deletion of blocks
3200        collectedBlocks.clear();
3201        dir.writeLock();
3202        try {
3203          dir.removeFromInodeMap(removedINodes);
3204        } finally {
3205          dir.writeUnlock();
3206        }
3207        removedINodes.clear();
3208        if (NameNode.stateChangeLog.isDebugEnabled()) {
3209          NameNode.stateChangeLog.debug("DIR* Namesystem.delete: "
3210            + src +" is removed");
3211        }
3212        return ret;
3213      }
3214    
3215      /**
3216       * From the given list, incrementally remove the blocks from blockManager
3217       * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3218       * ensure that other waiters on the lock can get in. See HDFS-2938
3219       * 
3220       * @param blocks
3221       *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3222       *          of blocks that need to be removed from blocksMap
3223       */
3224      void removeBlocks(BlocksMapUpdateInfo blocks) {
3225        int start = 0;
3226        int end = 0;
3227        List<Block> toDeleteList = blocks.getToDeleteList();
3228        while (start < toDeleteList.size()) {
3229          end = BLOCK_DELETION_INCREMENT + start;
3230          end = end > toDeleteList.size() ? toDeleteList.size() : end;
3231          writeLock();
3232          try {
3233            for (int i = start; i < end; i++) {
3234              blockManager.removeBlock(toDeleteList.get(i));
3235            }
3236          } finally {
3237            writeUnlock();
3238          }
3239          start = end;
3240        }
3241      }
3242      
3243      /**
3244       * Remove leases, inodes and blocks related to a given path
3245       * @param src The given path
3246       * @param blocks Containing the list of blocks to be deleted from blocksMap
3247       * @param removedINodes Containing the list of inodes to be removed from 
3248       *                      inodesMap
3249       */
3250      void removePathAndBlocks(String src, BlocksMapUpdateInfo blocks,
3251          List<INode> removedINodes) {
3252        assert hasWriteLock();
3253        leaseManager.removeLeaseWithPrefixPath(src);
3254        // remove inodes from inodesMap
3255        if (removedINodes != null) {
3256          dir.removeFromInodeMap(removedINodes);
3257          removedINodes.clear();
3258        }
3259        if (blocks == null) {
3260          return;
3261        }
3262        
3263        removeBlocksAndUpdateSafemodeTotal(blocks);
3264      }
3265    
3266      /**
3267       * Removes the blocks from blocksmap and updates the safemode blocks total
3268       * 
3269       * @param blocks
3270       *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3271       *          of blocks that need to be removed from blocksMap
3272       */
3273      void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3274        assert hasWriteLock();
3275        // In the case that we are a Standby tailing edits from the
3276        // active while in safe-mode, we need to track the total number
3277        // of blocks and safe blocks in the system.
3278        boolean trackBlockCounts = isSafeModeTrackingBlocks();
3279        int numRemovedComplete = 0, numRemovedSafe = 0;
3280    
3281        for (Block b : blocks.getToDeleteList()) {
3282          if (trackBlockCounts) {
3283            BlockInfo bi = getStoredBlock(b);
3284            if (bi.isComplete()) {
3285              numRemovedComplete++;
3286              if (bi.numNodes() >= blockManager.minReplication) {
3287                numRemovedSafe++;
3288              }
3289            }
3290          }
3291          blockManager.removeBlock(b);
3292        }
3293        if (trackBlockCounts) {
3294          if (LOG.isDebugEnabled()) {
3295            LOG.debug("Adjusting safe-mode totals for deletion."
3296                + "decreasing safeBlocks by " + numRemovedSafe
3297                + ", totalBlocks by " + numRemovedComplete);
3298          }
3299          adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3300        }
3301      }
3302    
3303      /**
3304       * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3305       */
3306      private boolean isSafeModeTrackingBlocks() {
3307        if (!haEnabled) {
3308          // Never track blocks incrementally in non-HA code.
3309          return false;
3310        }
3311        SafeModeInfo sm = this.safeMode;
3312        return sm != null && sm.shouldIncrementallyTrackBlocks();
3313      }
3314    
3315      /**
3316       * Get the file info for a specific file.
3317       *
3318       * @param src The string representation of the path to the file
3319       * @param resolveLink whether to throw UnresolvedLinkException 
3320       *        if src refers to a symlink
3321       *
3322       * @throws AccessControlException if access is denied
3323       * @throws UnresolvedLinkException if a symlink is encountered.
3324       *
3325       * @return object containing information regarding the file
3326       *         or null if file not found
3327       * @throws StandbyException 
3328       */
3329      HdfsFileStatus getFileInfo(String src, boolean resolveLink) 
3330        throws AccessControlException, UnresolvedLinkException,
3331               StandbyException, IOException {
3332        if (!DFSUtil.isValidName(src)) {
3333          throw new InvalidPathException("Invalid file name: " + src);
3334        }
3335        HdfsFileStatus stat = null;
3336        FSPermissionChecker pc = getPermissionChecker();
3337        checkOperation(OperationCategory.READ);
3338        if (!DFSUtil.isValidName(src)) {
3339          throw new InvalidPathException("Invalid file name: " + src);
3340        }
3341        byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3342        readLock();
3343        try {
3344          checkOperation(OperationCategory.READ);
3345          src = FSDirectory.resolvePath(src, pathComponents, dir);
3346          if (isPermissionEnabled) {
3347            checkPermission(pc, src, false, null, null, null, null, resolveLink);
3348          }
3349          stat = dir.getFileInfo(src, resolveLink);
3350        } catch (AccessControlException e) {
3351          logAuditEvent(false, "getfileinfo", src);
3352          throw e;
3353        } finally {
3354          readUnlock();
3355        }
3356        logAuditEvent(true, "getfileinfo", src);
3357        return stat;
3358      }
3359      
3360      /**
3361       * Returns true if the file is closed
3362       */
3363      boolean isFileClosed(String src) 
3364          throws AccessControlException, UnresolvedLinkException,
3365          StandbyException, IOException {
3366        FSPermissionChecker pc = getPermissionChecker();    
3367        checkOperation(OperationCategory.READ);
3368        readLock();
3369        try {
3370          checkOperation(OperationCategory.READ);
3371          if (isPermissionEnabled) {
3372            checkTraverse(pc, src);
3373          }
3374          return !INodeFile.valueOf(dir.getINode(src), src).isUnderConstruction();
3375        } catch (AccessControlException e) {
3376          if (isAuditEnabled() && isExternalInvocation()) {
3377            logAuditEvent(false, "isFileClosed", src);
3378          }
3379          throw e;
3380        } finally {
3381          readUnlock();
3382        }
3383      }
3384    
3385      /**
3386       * Create all the necessary directories
3387       */
3388      boolean mkdirs(String src, PermissionStatus permissions,
3389          boolean createParent) throws IOException, UnresolvedLinkException {
3390        boolean ret = false;
3391        try {
3392          ret = mkdirsInt(src, permissions, createParent);
3393        } catch (AccessControlException e) {
3394          logAuditEvent(false, "mkdirs", src);
3395          throw e;
3396        }
3397        return ret;
3398      }
3399    
  /**
   * Validate, lock, and perform the mkdirs, then sync the edit log and
   * audit-log on success.
   *
   * @return true if the directory exists or was created
   */
  private boolean mkdirsInt(String src, PermissionStatus permissions,
      boolean createParent) throws IOException, UnresolvedLinkException {
    if(NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
    }
    if (!DFSUtil.isValidName(src)) {
      throw new InvalidPathException(src);
    }
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    HdfsFileStatus resultingStat = null;
    boolean status = false;
    writeLock();
    try {
      // re-check under the lock (HA state may have changed)
      checkOperation(OperationCategory.WRITE);   
      checkNameNodeSafeMode("Cannot create directory " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      status = mkdirsInternal(pc, src, permissions, createParent);
      if (status) {
        // capture the created directory's status for the audit log
        resultingStat = dir.getFileInfo(src, false);
      }
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    if (status) {
      logAuditEvent(true, "mkdirs", src, null, resultingStat);
    }
    return status;
  }
3431        
3432      /**
3433       * Create all the necessary directories
3434       */
3435      private boolean mkdirsInternal(FSPermissionChecker pc, String src,
3436          PermissionStatus permissions, boolean createParent) 
3437          throws IOException, UnresolvedLinkException {
3438        assert hasWriteLock();
3439        if (isPermissionEnabled) {
3440          checkTraverse(pc, src);
3441        }
3442        if (dir.isDirMutable(src)) {
3443          // all the users of mkdirs() are used to expect 'true' even if
3444          // a new directory is not created.
3445          return true;
3446        }
3447        if (isPermissionEnabled) {
3448          checkAncestorAccess(pc, src, FsAction.WRITE);
3449        }
3450        if (!createParent) {
3451          verifyParentDir(src);
3452        }
3453    
3454        // validate that we have enough inodes. This is, at best, a 
3455        // heuristic because the mkdirs() operation might need to 
3456        // create multiple inodes.
3457        checkFsObjectLimit();
3458    
3459        if (!dir.mkdirs(src, permissions, false, now())) {
3460          throw new IOException("Failed to create directory: " + src);
3461        }
3462        return true;
3463      }
3464    
  /**
   * Get the content summary (file/dir/quota counts and space usage) rooted
   * at {@code src}.
   *
   * @param src path to summarize; may be a /.reserved path
   * @throws AccessControlException if READ_EXECUTE access is denied
   * @throws FileNotFoundException if the path does not exist
   * @throws UnresolvedLinkException if a symlink is encountered
   * @throws StandbyException if this NN cannot serve reads
   */
  ContentSummary getContentSummary(String src) throws AccessControlException,
      FileNotFoundException, UnresolvedLinkException, StandbyException {
    FSPermissionChecker pc = getPermissionChecker();
    // Fail fast on standby before taking the read lock.
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    readLock();
    try {
      // Re-check under the lock: HA state may have changed while waiting.
      checkOperation(OperationCategory.READ);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      if (isPermissionEnabled) {
        checkPermission(pc, src, false, null, null, null, FsAction.READ_EXECUTE);
      }
      return dir.getContentSummary(src);
    } finally {
      readUnlock();
    }
  }
3482    
3483      /**
3484       * Set the namespace quota and diskspace quota for a directory.
3485       * See {@link ClientProtocol#setQuota(String, long, long)} for the 
3486       * contract.
3487       * 
3488       * Note: This does not support ".inodes" relative path.
3489       */
3490      void setQuota(String path, long nsQuota, long dsQuota) 
3491          throws IOException, UnresolvedLinkException {
3492        checkSuperuserPrivilege();
3493        checkOperation(OperationCategory.WRITE);
3494        writeLock();
3495        try {
3496          checkOperation(OperationCategory.WRITE);
3497          checkNameNodeSafeMode("Cannot set quota on " + path);
3498          dir.setQuota(path, nsQuota, dsQuota);
3499        } finally {
3500          writeUnlock();
3501        }
3502        getEditLog().logSync();
3503      }
3504    
  /** Persist all metadata about this file.
   * @param src The string representation of the path
   * @param clientName The string representation of the client
   * @param lastBlockLength The length of the last block 
   *                        under construction reported from client.
   * @throws IOException if path does not exist, the lease does not belong
   *         to {@code clientName}, or the NN is in safe mode
   */
  void fsync(String src, String clientName, long lastBlockLength) 
      throws IOException, UnresolvedLinkException {
    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
    // Fail fast on standby before taking the write lock.
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot fsync file " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      // Verifies the file is under construction and leased to clientName.
      INodeFileUnderConstruction pendingFile  = checkLease(src, clientName);
      // A non-positive length means the client did not report a new length
      // for the last block; leave it unchanged.
      if (lastBlockLength > 0) {
        pendingFile.updateLengthOfLastBlock(lastBlockLength);
      }
      dir.persistBlocks(src, pendingFile, false);
    } finally {
      writeUnlock();
    }
    // Make the persisted block list durable outside the lock.
    getEditLog().logSync();
  }
3532    
3533      /**
3534       * Move a file that is being written to be immutable.
3535       * @param src The filename
3536       * @param lease The lease for the client creating the file
3537       * @param recoveryLeaseHolder reassign lease to this holder if the last block
3538       *        needs recovery; keep current holder if null.
3539       * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
3540       *         replication;<br>
3541       *         RecoveryInProgressException if lease recovery is in progress.<br>
3542       *         IOException in case of an error.
3543       * @return true  if file has been successfully finalized and closed or 
3544       *         false if block recovery has been initiated. Since the lease owner
3545       *         has been changed and logged, caller should call logSync().
3546       */
3547      boolean internalReleaseLease(Lease lease, String src, 
3548          String recoveryLeaseHolder) throws AlreadyBeingCreatedException, 
3549          IOException, UnresolvedLinkException {
3550        LOG.info("Recovering " + lease + ", src=" + src);
3551        assert !isInSafeMode();
3552        assert hasWriteLock();
3553    
3554        final INodesInPath iip = dir.getLastINodeInPath(src);
3555        final INodeFileUnderConstruction pendingFile
3556            = INodeFileUnderConstruction.valueOf(iip.getINode(0), src);
3557        int nrBlocks = pendingFile.numBlocks();
3558        BlockInfo[] blocks = pendingFile.getBlocks();
3559    
3560        int nrCompleteBlocks;
3561        BlockInfo curBlock = null;
3562        for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
3563          curBlock = blocks[nrCompleteBlocks];
3564          if(!curBlock.isComplete())
3565            break;
3566          assert blockManager.checkMinReplication(curBlock) :
3567                  "A COMPLETE block is not minimally replicated in " + src;
3568        }
3569    
3570        // If there are no incomplete blocks associated with this file,
3571        // then reap lease immediately and close the file.
3572        if(nrCompleteBlocks == nrBlocks) {
3573          finalizeINodeFileUnderConstruction(src, pendingFile,
3574              iip.getLatestSnapshot());
3575          NameNode.stateChangeLog.warn("BLOCK*"
3576            + " internalReleaseLease: All existing blocks are COMPLETE,"
3577            + " lease removed, file closed.");
3578          return true;  // closed!
3579        }
3580    
3581        // Only the last and the penultimate blocks may be in non COMPLETE state.
3582        // If the penultimate block is not COMPLETE, then it must be COMMITTED.
3583        if(nrCompleteBlocks < nrBlocks - 2 ||
3584           nrCompleteBlocks == nrBlocks - 2 &&
3585             curBlock != null &&
3586             curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
3587          final String message = "DIR* NameSystem.internalReleaseLease: "
3588            + "attempt to release a create lock on "
3589            + src + " but file is already closed.";
3590          NameNode.stateChangeLog.warn(message);
3591          throw new IOException(message);
3592        }
3593    
3594        // The last block is not COMPLETE, and
3595        // that the penultimate block if exists is either COMPLETE or COMMITTED
3596        final BlockInfo lastBlock = pendingFile.getLastBlock();
3597        BlockUCState lastBlockState = lastBlock.getBlockUCState();
3598        BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
3599        boolean penultimateBlockMinReplication;
3600        BlockUCState penultimateBlockState;
3601        if (penultimateBlock == null) {
3602          penultimateBlockState = BlockUCState.COMPLETE;
3603          // If penultimate block doesn't exist then its minReplication is met
3604          penultimateBlockMinReplication = true;
3605        } else {
3606          penultimateBlockState = BlockUCState.COMMITTED;
3607          penultimateBlockMinReplication = 
3608            blockManager.checkMinReplication(penultimateBlock);
3609        }
3610        assert penultimateBlockState == BlockUCState.COMPLETE ||
3611               penultimateBlockState == BlockUCState.COMMITTED :
3612               "Unexpected state of penultimate block in " + src;
3613    
3614        switch(lastBlockState) {
3615        case COMPLETE:
3616          assert false : "Already checked that the last block is incomplete";
3617          break;
3618        case COMMITTED:
3619          // Close file if committed blocks are minimally replicated
3620          if(penultimateBlockMinReplication &&
3621              blockManager.checkMinReplication(lastBlock)) {
3622            finalizeINodeFileUnderConstruction(src, pendingFile,
3623                iip.getLatestSnapshot());
3624            NameNode.stateChangeLog.warn("BLOCK*"
3625              + " internalReleaseLease: Committed blocks are minimally replicated,"
3626              + " lease removed, file closed.");
3627            return true;  // closed!
3628          }
3629          // Cannot close file right now, since some blocks 
3630          // are not yet minimally replicated.
3631          // This may potentially cause infinite loop in lease recovery
3632          // if there are no valid replicas on data-nodes.
3633          String message = "DIR* NameSystem.internalReleaseLease: " +
3634              "Failed to release lease for file " + src +
3635              ". Committed blocks are waiting to be minimally replicated." +
3636              " Try again later.";
3637          NameNode.stateChangeLog.warn(message);
3638          throw new AlreadyBeingCreatedException(message);
3639        case UNDER_CONSTRUCTION:
3640        case UNDER_RECOVERY:
3641          final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)lastBlock;
3642          // setup the last block locations from the blockManager if not known
3643          if (uc.getNumExpectedLocations() == 0) {
3644            uc.setExpectedLocations(blockManager.getNodes(lastBlock));
3645          }
3646    
3647          if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
3648            // There is no datanode reported to this block.
3649            // may be client have crashed before writing data to pipeline.
3650            // This blocks doesn't need any recovery.
3651            // We can remove this block and close the file.
3652            pendingFile.removeLastBlock(lastBlock);
3653            finalizeINodeFileUnderConstruction(src, pendingFile,
3654                iip.getLatestSnapshot());
3655            NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
3656                + "Removed empty last block and closed file.");
3657            return true;
3658          }
3659          // start recovery of the last block for this file
3660          long blockRecoveryId = nextGenerationStamp(isLegacyBlock(uc));
3661          lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
3662          uc.initializeBlockRecovery(blockRecoveryId);
3663          leaseManager.renewLease(lease);
3664          // Cannot close file right now, since the last block requires recovery.
3665          // This may potentially cause infinite loop in lease recovery
3666          // if there are no valid replicas on data-nodes.
3667          NameNode.stateChangeLog.warn(
3668                    "DIR* NameSystem.internalReleaseLease: " +
3669                    "File " + src + " has not been closed." +
3670                   " Lease recovery is in progress. " +
3671                    "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
3672          break;
3673        }
3674        return false;
3675      }
3676    
3677      private Lease reassignLease(Lease lease, String src, String newHolder,
3678          INodeFileUnderConstruction pendingFile) {
3679        assert hasWriteLock();
3680        if(newHolder == null)
3681          return lease;
3682        // The following transaction is not synced. Make sure it's sync'ed later.
3683        logReassignLease(lease.getHolder(), src, newHolder);
3684        return reassignLeaseInternal(lease, src, newHolder, pendingFile);
3685      }
3686      
3687      Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
3688          INodeFileUnderConstruction pendingFile) {
3689        assert hasWriteLock();
3690        pendingFile.setClientName(newHolder);
3691        return leaseManager.reassignLease(lease, src, newHolder);
3692      }
3693    
  /**
   * Commit or complete the last block of an under-construction file and,
   * if the block turned out shorter than the preferred block size, release
   * the over-reserved diskspace quota.
   */
  private void commitOrCompleteLastBlock(final INodeFileUnderConstruction fileINode,
      final Block commitBlock) throws IOException {
    assert hasWriteLock();
    // If the block manager made no change, there is nothing to adjust.
    if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
      return;
    }

    // Adjust disk space consumption if required
    // Space was reserved for a full preferred-size block; reclaim the
    // difference (times replication) now that the real length is known.
    final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();    
    if (diff > 0) {
      try {
        String path = leaseManager.findPath(fileINode);
        dir.updateSpaceConsumed(path, 0, -diff*fileINode.getFileReplication());
      } catch (IOException e) {
        // Best-effort: quota drift is logged rather than failing the commit.
        LOG.warn("Unexpected exception while updating disk space.", e);
      }
    }
  }
3712    
  /**
   * Convert an under-construction file into a finalized, immutable INodeFile:
   * drop its lease, record any snapshot modification, swap the inode in the
   * directory tree, close the file in the edit log, and trigger a
   * replication check on the finalized file.
   */
  private void finalizeINodeFileUnderConstruction(String src, 
      INodeFileUnderConstruction pendingFile, Snapshot latestSnapshot) 
      throws IOException, UnresolvedLinkException {
    assert hasWriteLock();
    leaseManager.removeLease(pendingFile.getClientName(), src);
    
    // Capture the pre-close state in the latest snapshot, if any; the call
    // may return a replacement inode to operate on.
    pendingFile = pendingFile.recordModification(latestSnapshot,
        dir.getINodeMap());

    // The file is no longer pending.
    // Create permanent INode, update blocks
    final INodeFile newFile = pendingFile.toINodeFile(now());
    dir.replaceINodeFile(src, pendingFile, newFile);

    // close file and persist block allocations for this file
    dir.closeFile(src, newFile);

    // Schedule replication work in case the file is under-replicated.
    blockManager.checkReplication(newFile);
  }
3732    
  /** Look up the block manager's stored {@link BlockInfo} for a block. */
  @VisibleForTesting
  BlockInfo getStoredBlock(Block block) {
    return blockManager.getStoredBlock(block);
  }
3737      
3738      @Override
3739      public boolean isInSnapshot(BlockInfoUnderConstruction blockUC) {
3740        assert hasReadLock();
3741        final BlockCollection bc = blockUC.getBlockCollection();
3742        if (bc == null || !(bc instanceof INodeFileUnderConstruction)) {
3743          return false;
3744        }
3745    
3746        INodeFileUnderConstruction inodeUC = (INodeFileUnderConstruction) blockUC
3747            .getBlockCollection();
3748        String fullName = inodeUC.getName();
3749        try {
3750          if (fullName != null && fullName.startsWith(Path.SEPARATOR)
3751              && dir.getINode(fullName) == inodeUC) {
3752            // If file exists in normal path then no need to look in snapshot
3753            return false;
3754          }
3755        } catch (UnresolvedLinkException e) {
3756          LOG.error("Error while resolving the link : " + fullName, e);
3757          return false;
3758        }
3759        /*
3760         * 1. if bc is an instance of INodeFileUnderConstructionWithSnapshot, and
3761         * bc is not in the current fsdirectory tree, bc must represent a snapshot
3762         * file. 
3763         * 2. if fullName is not an absolute path, bc cannot be existent in the 
3764         * current fsdirectory tree. 
3765         * 3. if bc is not the current node associated with fullName, bc must be a
3766         * snapshot inode.
3767         */
3768        return true;
3769      }
3770    
  /**
   * Called by a datanode's primary recovery worker to report the outcome of
   * block recovery: update (or delete) the last block of the file, optionally
   * insert the final replica locations, and close the file if requested.
   *
   * NOTE(review): newtargets is dereferenced unconditionally via
   * Arrays.asList() in the log statements; this assumes callers never pass
   * null — confirm against the RPC layer.
   */
  void commitBlockSynchronization(ExtendedBlock lastblock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages)
      throws IOException, UnresolvedLinkException {
    LOG.info("commitBlockSynchronization(lastblock=" + lastblock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    String src = "";
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.
  
      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfo storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(lastblock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + lastblock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + lastblock + ") not found");
        }
      }
      INodeFile iFile = ((INode)storedBlock.getBlockCollection()).asFile();
      // A completed block or finalized file means this is a stale/duplicate
      // report; ignore it quietly.
      if (!iFile.isUnderConstruction() || storedBlock.isComplete()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + lastblock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      // The reported generation stamp must match the recovery id issued by
      // internalReleaseLease(); otherwise this is a stale recovery attempt.
      long recoveryId =
        ((BlockInfoUnderConstruction)storedBlock).getBlockRecoveryId();
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + lastblock); 
      }

      INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)iFile;

      if (deleteblock) {
        // Recovery determined the block has no usable data; drop it.
        Block blockToDel = ExtendedBlock.getLocalBlock(lastblock);
        boolean remove = pendingFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlockFromMap(storedBlock);
        }
      }
      else {
        // update last block
        storedBlock.setGenerationStamp(newgenerationstamp);
        storedBlock.setNumBytes(newlength);

        // find the DatanodeDescriptor objects
        // There should be no locations in the blockManager till now because the
        // file is underConstruction
        List<DatanodeDescriptor> targetList =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        if (newtargets.length > 0) {
          for (DatanodeID newtarget : newtargets) {
            // try to get targetNode
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtarget);
            if (targetNode != null)
              targetList.add(targetNode);
            else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtarget + ") not found");
            }
          }
        }
        if ((closeFile) && !targetList.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (DatanodeDescriptor targetNode : targetList) {
            targetNode.addBlock(storedBlock);
          }
        }
        // add pipeline locations into the INodeUnderConstruction
        DatanodeDescriptor[] targetArray =
            new DatanodeDescriptor[targetList.size()];
        pendingFile.setLastBlock(storedBlock, targetList.toArray(targetArray));
      }

      if (closeFile) {
        src = closeFileCommitBlocks(pendingFile, storedBlock);
      } else {
        // If this commit does not want to close the file, persist blocks
        src = persistBlocks(pendingFile, false);
      }
    } finally {
      writeUnlock();
    }
    // Sync the edits (block update / close) outside the write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(newblock=" + lastblock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + lastblock + ") successful");
    }
  }
3890    
3891      /**
3892       *
3893       * @param pendingFile
3894       * @param storedBlock
3895       * @return Path of the file that was closed.
3896       * @throws IOException
3897       */
3898      @VisibleForTesting
3899      String closeFileCommitBlocks(INodeFileUnderConstruction pendingFile,
3900                                           BlockInfo storedBlock)
3901          throws IOException {
3902    
3903        String src = leaseManager.findPath(pendingFile);
3904    
3905        // commit the last block and complete it if it has minimum replicas
3906        commitOrCompleteLastBlock(pendingFile, storedBlock);
3907    
3908        //remove lease, close file
3909        finalizeINodeFileUnderConstruction(src, pendingFile,
3910                                           Snapshot.findLatestSnapshot(pendingFile, null));
3911    
3912        return src;
3913      }
3914    
3915      /**
3916       * Persist the block list for the given file.
3917       *
3918       * @param pendingFile
3919       * @return Path to the given file.
3920       * @throws IOException
3921       */
3922      @VisibleForTesting
3923      String persistBlocks(INodeFileUnderConstruction pendingFile,
3924          boolean logRetryCache) throws IOException {
3925        String src = leaseManager.findPath(pendingFile);
3926        dir.persistBlocks(src, pendingFile, logRetryCache);
3927        return src;
3928      }
3929    
3930      /**
3931       * Renew the lease(s) held by the given client
3932       */
3933      void renewLease(String holder) throws IOException {
3934        checkOperation(OperationCategory.WRITE);
3935        writeLock();
3936        try {
3937          checkOperation(OperationCategory.WRITE);
3938          checkNameNodeSafeMode("Cannot renew lease for " + holder);
3939          leaseManager.renewLease(holder);
3940        } finally {
3941          writeUnlock();
3942        }
3943      }
3944    
3945      /**
3946       * Get a partial listing of the indicated directory
3947       *
3948       * @param src the directory name
3949       * @param startAfter the name to start after
3950       * @param needLocation if blockLocations need to be returned
3951       * @return a partial listing starting after startAfter
3952       * 
3953       * @throws AccessControlException if access is denied
3954       * @throws UnresolvedLinkException if symbolic link is encountered
3955       * @throws IOException if other I/O error occurred
3956       */
3957      DirectoryListing getListing(String src, byte[] startAfter,
3958          boolean needLocation) 
3959          throws AccessControlException, UnresolvedLinkException, IOException {
3960        try {
3961          return getListingInt(src, startAfter, needLocation);
3962        } catch (AccessControlException e) {
3963          logAuditEvent(false, "listStatus", src);
3964          throw e;
3965        }
3966      }
3967    
  /**
   * Core of {@link #getListing}: resolve reserved paths, translate an
   * inode-path startAfter into a plain file name, check permissions, audit,
   * and fetch the listing under the read lock.
   */
  private DirectoryListing getListingInt(String src, byte[] startAfter,
      boolean needLocation) 
    throws AccessControlException, UnresolvedLinkException, IOException {
    DirectoryListing dl;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    // NOTE(review): uses the platform default charset to decode startAfter;
    // confirm whether a fixed charset (e.g. UTF-8) is expected here.
    String startAfterString = new String(startAfter);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      src = FSDirectory.resolvePath(src, pathComponents, dir);

      // Get file name when startAfter is an INodePath
      if (FSDirectory.isReservedName(startAfterString)) {
        byte[][] startAfterComponents = FSDirectory
            .getPathComponentsForReservedPath(startAfterString);
        try {
          String tmp = FSDirectory.resolvePath(src, startAfterComponents, dir);
          byte[][] regularPath = INode.getPathComponents(tmp);
          // Listing resumes after the last component of the resolved path.
          startAfter = regularPath[regularPath.length - 1];
        } catch (IOException e) {
          // Possibly the inode is deleted
          throw new DirectoryListingStartAfterNotFoundException(
              "Can't find startAfter " + startAfterString);
        }
      }
      
      if (isPermissionEnabled) {
        // Directories need READ_EXECUTE; listing a plain file only needs
        // traversal of its ancestors.
        if (dir.isDir(src)) {
          checkPathAccess(pc, src, FsAction.READ_EXECUTE);
        } else {
          checkTraverse(pc, src);
        }
      }
      logAuditEvent(true, "listStatus", src);
      dl = dir.getListing(src, startAfter, needLocation);
    } finally {
      readUnlock();
    }
    return dl;
  }
4010    
4011      /////////////////////////////////////////////////////////
4012      //
4013      // These methods are called by datanodes
4014      //
4015      /////////////////////////////////////////////////////////
4016      /**
4017       * Register Datanode.
4018       * <p>
4019       * The purpose of registration is to identify whether the new datanode
4020       * serves a new data storage, and will report new data block copies,
4021       * which the namenode was not aware of; or the datanode is a replacement
4022       * node for the data storage that was previously served by a different
4023       * or the same (in terms of host:port) datanode.
4024       * The data storages are distinguished by their storageIDs. When a new
4025       * data storage is reported the namenode issues a new unique storageID.
4026       * <p>
4027       * Finally, the namenode returns its namespaceID as the registrationID
4028       * for the datanodes. 
4029       * namespaceID is a persistent attribute of the name space.
4030       * The registrationID is checked every time the datanode is communicating
4031       * with the namenode. 
4032       * Datanodes with inappropriate registrationID are rejected.
4033       * If the namenode stops, and then restarts it can restore its 
4034       * namespaceID and will continue serving the datanodes that has previously
4035       * registered with the namenode without restarting the whole cluster.
4036       * 
4037       * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4038       */
4039      void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4040        writeLock();
4041        try {
4042          getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4043          checkSafeMode();
4044        } finally {
4045          writeUnlock();
4046        }
4047      }
4048      
4049      /**
4050       * Get registrationID for datanodes based on the namespaceID.
4051       * 
4052       * @see #registerDatanode(DatanodeRegistration)
4053       * @return registration ID
4054       */
4055      String getRegistrationID() {
4056        return Storage.getRegistrationID(dir.fsImage.getStorage());
4057      }
4058    
4059      /**
4060       * The given node has reported in.  This method should:
4061       * 1) Record the heartbeat, so the datanode isn't timed out
4062       * 2) Adjust usage stats for future block allocation
4063       * 
4064       * If a substantial amount of time passed since the last datanode 
4065       * heartbeat then request an immediate block report.  
4066       * 
4067       * @return an array of datanode commands 
4068       * @throws IOException
4069       */
4070      HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4071          long capacity, long dfsUsed, long remaining, long blockPoolUsed,
4072          int xceiverCount, int xmitsInProgress, int failedVolumes) 
4073            throws IOException {
4074        readLock();
4075        try {
4076          final int maxTransfer = blockManager.getMaxReplicationStreams()
4077              - xmitsInProgress;
4078          DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4079              nodeReg, blockPoolId, capacity, dfsUsed, remaining, blockPoolUsed,
4080              xceiverCount, maxTransfer, failedVolumes);
4081          return new HeartbeatResponse(cmds, createHaStatusHeartbeat());
4082        } finally {
4083          readUnlock();
4084        }
4085      }
4086    
4087      private NNHAStatusHeartbeat createHaStatusHeartbeat() {
4088        HAState state = haContext.getState();
4089        return new NNHAStatusHeartbeat(state.getServiceState(),
4090            getFSImage().getLastAppliedOrWrittenTxId());
4091      }
4092    
4093      /**
4094       * Returns whether or not there were available resources at the last check of
4095       * resources.
4096       *
4097       * @return true if there were sufficient resources available, false otherwise.
4098       */
4099      boolean nameNodeHasResourcesAvailable() {
4100        return hasResourcesAvailable;
4101      }
4102    
4103      /**
4104       * Perform resource checks and cache the results.
4105       * @throws IOException
4106       */
4107      void checkAvailableResources() {
4108        Preconditions.checkState(nnResourceChecker != null,
4109            "nnResourceChecker not initialized");
4110        hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4111      }
4112    
4113      /**
4114       * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
4115       * there are found to be insufficient resources available, causes the NN to
4116       * enter safe mode. If resources are later found to have returned to
4117       * acceptable levels, this daemon will cause the NN to exit safe mode.
4118       */
4119      class NameNodeResourceMonitor implements Runnable  {
4120        boolean shouldNNRmRun = true;
4121        @Override
4122        public void run () {
4123          try {
4124            while (fsRunning && shouldNNRmRun) {
4125              checkAvailableResources();
4126              if(!nameNodeHasResourcesAvailable()) {
4127                String lowResourcesMsg = "NameNode low on available disk space. ";
4128                if (!isInSafeMode()) {
4129                  FSNamesystem.LOG.warn(lowResourcesMsg + "Entering safe mode.");
4130                } else {
4131                  FSNamesystem.LOG.warn(lowResourcesMsg + "Already in safe mode.");
4132                }
4133                enterSafeMode(true);
4134              }
4135              try {
4136                Thread.sleep(resourceRecheckInterval);
4137              } catch (InterruptedException ie) {
4138                // Deliberately ignore
4139              }
4140            }
4141          } catch (Exception e) {
4142            FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4143          }
4144        }
4145    
4146        public void stopMonitor() {
4147          shouldNNRmRun = false;
4148        }
4149     }
4150    
4151      class NameNodeEditLogRoller implements Runnable {
4152    
4153        private boolean shouldRun = true;
4154        private final long rollThreshold;
4155        private final long sleepIntervalMs;
4156    
4157        public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
4158            this.rollThreshold = rollThreshold;
4159            this.sleepIntervalMs = sleepIntervalMs;
4160        }
4161    
4162        @Override
4163        public void run() {
4164          while (fsRunning && shouldRun) {
4165            try {
4166              FSEditLog editLog = getFSImage().getEditLog();
4167              long numEdits =
4168                  editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4169              if (numEdits > rollThreshold) {
4170                FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4171                    + " number of edits in open segment exceeds threshold of "
4172                    + rollThreshold);
4173                rollEditLog();
4174              }
4175              Thread.sleep(sleepIntervalMs);
4176            } catch (InterruptedException e) {
4177              FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4178                  + " was interrupted, exiting");
4179              break;
4180            } catch (Exception e) {
4181              FSNamesystem.LOG.error("Swallowing exception in "
4182                  + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4183            }
4184          }
4185        }
4186    
4187        public void stop() {
4188          shouldRun = false;
4189        }
4190      }
4191    
  /** @return the FSImage held by this namesystem's directory. */
  public FSImage getFSImage() {
    return dir.fsImage;
  }
4195    
  /** @return the edit log of the current FSImage. */
  public FSEditLog getEditLog() {
    return getFSImage().getEditLog();
  }
4199    
4200      private void checkBlock(ExtendedBlock block) throws IOException {
4201        if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4202          throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4203              + " - expected " + blockPoolId);
4204        }
4205      }
4206    
  /** @return number of missing blocks, as tracked by the block manager. */
  @Metric({"MissingBlocks", "Number of missing blocks"})
  public long getMissingBlocksCount() {
    // not locking: a slightly stale value is acceptable for a metric
    return blockManager.getMissingBlocksCount();
  }
4212      
  /** @return number of expired datanode heartbeats seen so far. */
  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
  public int getExpiredHeartbeats() {
    return datanodeStatistics.getExpiredHeartbeats();
  }
4217      
  /**
   * @return difference between the last written transaction id and the
   *         transaction id of the most recent checkpoint image.
   */
  @Metric({"TransactionsSinceLastCheckpoint",
      "Number of transactions since last checkpoint"})
  public long getTransactionsSinceLastCheckpoint() {
    return getEditLog().getLastWrittenTxId() -
        getFSImage().getStorage().getMostRecentCheckpointTxId();
  }
4224      
4225      @Metric({"TransactionsSinceLastLogRoll",
4226          "Number of transactions since last edit log roll"})
4227      public long getTransactionsSinceLastLogRoll() {
4228        if (isInStandbyState() || !getEditLog().isSegmentOpen()) {
4229          return 0;
4230        } else {
4231          return getEditLog().getLastWrittenTxId() -
4232            getEditLog().getCurSegmentTxId() + 1;
4233        }
4234      }
4235      
  /** @return the id of the last transaction written to the edit log. */
  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
  public long getLastWrittenTransactionId() {
    return getEditLog().getLastWrittenTxId();
  }
4240      
  /** @return epoch millis of the most recent checkpoint. */
  @Metric({"LastCheckpointTime",
      "Time in milliseconds since the epoch of the last checkpoint"})
  public long getLastCheckpointTime() {
    return getFSImage().getStorage().getMostRecentCheckpointTime();
  }
4246    
4247      /** @see ClientProtocol#getStats() */
4248      long[] getStats() {
4249        final long[] stats = datanodeStatistics.getStats();
4250        stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
4251        stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
4252        stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
4253        return stats;
4254      }
4255    
  /** @return total raw capacity of data nodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityTotal",
      "Total raw capacity of data nodes in bytes"})
  public long getCapacityTotal() {
    return datanodeStatistics.getCapacityTotal();
  }
4262    
  /** @return total raw capacity of data nodes, rounded to GB. */
  @Metric({"CapacityTotalGB",
      "Total raw capacity of data nodes in GB"})
  public float getCapacityTotalGB() {
    return DFSUtil.roundBytesToGB(getCapacityTotal());
  }
4268    
  /** @return total used capacity across all data nodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityUsed",
      "Total used capacity across all data nodes in bytes"})
  public long getCapacityUsed() {
    return datanodeStatistics.getCapacityUsed();
  }
4275    
  /** @return total used capacity across all data nodes, rounded to GB. */
  @Metric({"CapacityUsedGB",
      "Total used capacity across all data nodes in GB"})
  public float getCapacityUsedGB() {
    return DFSUtil.roundBytesToGB(getCapacityUsed());
  }
4281    
  /** @return remaining capacity across all data nodes, in bytes. */
  @Override // FSNamesystemMBean
  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
  public long getCapacityRemaining() {
    return datanodeStatistics.getCapacityRemaining();
  }
4287    
  /** @return remaining capacity across all data nodes, rounded to GB. */
  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
  public float getCapacityRemainingGB() {
    return DFSUtil.roundBytesToGB(getCapacityRemaining());
  }
4292    
  /** @return bytes used on data nodes for non-DFS purposes. */
  @Metric({"CapacityUsedNonDFS",
      "Total space used by data nodes for non DFS purposes in bytes"})
  public long getCapacityUsedNonDFS() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }
4298    
4299      /**
4300       * Total number of connections.
4301       */
4302      @Override // FSNamesystemMBean
4303      @Metric
4304      public int getTotalLoad() {
4305        return datanodeStatistics.getXceiverCount();
4306      }
4307      
  /** @return number of snapshottable directories. */
  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
  public int getNumSnapshottableDirs() {
    return this.snapshotManager.getNumSnapshottableDirs();
  }
4312    
  /** @return number of snapshots currently in the system. */
  @Metric({ "Snapshots", "The number of snapshots" })
  public int getNumSnapshots() {
    return this.snapshotManager.getNumSnapshots();
  }
4317    
4318      int getNumberOfDatanodes(DatanodeReportType type) {
4319        readLock();
4320        try {
4321          return getBlockManager().getDatanodeManager().getDatanodeListForReport(
4322              type).size(); 
4323        } finally {
4324          readUnlock();
4325        }
4326      }
4327    
4328      DatanodeInfo[] datanodeReport(final DatanodeReportType type
4329          ) throws AccessControlException, StandbyException {
4330        checkSuperuserPrivilege();
4331        checkOperation(OperationCategory.UNCHECKED);
4332        readLock();
4333        try {
4334          checkOperation(OperationCategory.UNCHECKED);
4335          final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4336          final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4337    
4338          DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4339          for (int i=0; i<arr.length; i++) {
4340            arr[i] = new DatanodeInfo(results.get(i));
4341          }
4342          return arr;
4343        } finally {
4344          readUnlock();
4345        }
4346      }
4347    
4348      /**
4349       * Save namespace image.
4350       * This will save current namespace into fsimage file and empty edits file.
4351       * Requires superuser privilege and safe mode.
4352       * 
4353       * @throws AccessControlException if superuser privilege is violated.
4354       * @throws IOException if 
4355       */
4356      void saveNamespace() throws AccessControlException, IOException {
4357        checkOperation(OperationCategory.UNCHECKED);
4358        checkSuperuserPrivilege();
4359        
4360        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
4361        if (cacheEntry != null && cacheEntry.isSuccess()) {
4362          return; // Return previous response
4363        }
4364        boolean success = false;
4365        readLock();
4366        try {
4367          checkOperation(OperationCategory.UNCHECKED);
4368          if (!isInSafeMode()) {
4369            throw new IOException("Safe mode should be turned ON "
4370                + "in order to create namespace image.");
4371          }
4372          getFSImage().saveNamespace(this);
4373          success = true;
4374        } finally {
4375          readUnlock();
4376          RetryCache.setState(cacheEntry, success);
4377        }
4378        LOG.info("New namespace image has been created");
4379      }
4380      
4381      /**
4382       * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
4383       * Requires superuser privilege.
4384       * 
4385       * @throws AccessControlException if superuser privilege is violated.
4386       */
4387      boolean restoreFailedStorage(String arg) throws AccessControlException,
4388          StandbyException {
4389        checkSuperuserPrivilege();
4390        checkOperation(OperationCategory.UNCHECKED);
4391        writeLock();
4392        try {
4393          checkOperation(OperationCategory.UNCHECKED);
4394          
4395          // if it is disabled - enable it and vice versa.
4396          if(arg.equals("check"))
4397            return getFSImage().getStorage().getRestoreFailedStorage();
4398          
4399          boolean val = arg.equals("true");  // false if not
4400          getFSImage().getStorage().setRestoreFailedStorage(val);
4401          
4402          return val;
4403        } finally {
4404          writeUnlock();
4405        }
4406      }
4407    
  /** @return the time this namesystem was started, as a new Date. */
  Date getStartTime() {
    return new Date(startTime); 
  }
4411        
  /**
   * Finalize a cluster upgrade by delegating to the FSImage.
   * Requires superuser privilege; takes the write lock.
   */
  void finalizeUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check under the lock in case the HA state changed.
      checkOperation(OperationCategory.WRITE);
      getFSImage().finalizeUpgrade();
    } finally {
      writeUnlock();
    }
  }
4423    
  /**
   * Ask the DatanodeManager to refresh its node set from a freshly loaded
   * HdfsConfiguration. Requires superuser privilege.
   */
  void refreshNodes() throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
  }
4429    
  /**
   * Set the balancer bandwidth via the DatanodeManager.
   * Requires superuser privilege.
   *
   * @param bandwidth new balancer bandwidth value
   */
  void setBalancerBandwidth(long bandwidth) throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
  }
4435    
4436      /**
4437       * SafeModeInfo contains information related to the safe mode.
4438       * <p>
4439       * An instance of {@link SafeModeInfo} is created when the name node
4440       * enters safe mode.
4441       * <p>
4442       * During name node startup {@link SafeModeInfo} counts the number of
4443       * <em>safe blocks</em>, those that have at least the minimal number of
4444       * replicas, and calculates the ratio of safe blocks to the total number
4445       * of blocks in the system, which is the size of blocks in
4446       * {@link FSNamesystem#blockManager}. When the ratio reaches the
4447       * {@link #threshold} it starts the SafeModeMonitor daemon in order
4448       * to monitor whether the safe mode {@link #extension} is passed.
4449       * Then it leaves safe mode and destroys itself.
4450       * <p>
4451       * If safe mode is turned on manually then the number of safe blocks is
4452       * not tracked because the name node is not intended to leave safe mode
4453       * automatically in the case.
4454       *
4455       * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
4456       */
4457      public class SafeModeInfo {
4458        // configuration fields
4459        /** Safe mode threshold condition %.*/
4460        private double threshold;
4461        /** Safe mode minimum number of datanodes alive */
4462        private int datanodeThreshold;
4463        /** Safe mode extension after the threshold. */
4464        private int extension;
4465        /** Min replication required by safe mode. */
4466        private int safeReplication;
4467        /** threshold for populating needed replication queues */
4468        private double replQueueThreshold;
4469          
4470        // internal fields
4471        /** Time when threshold was reached.
4472         * <br> -1 safe mode is off
4473         * <br> 0 safe mode is on, and threshold is not reached yet
4474         * <br> >0 safe mode is on, but we are in extension period 
4475         */
4476        private long reached = -1;  
4477        /** Total number of blocks. */
4478        int blockTotal; 
4479        /** Number of safe blocks. */
4480        int blockSafe;
4481        /** Number of blocks needed to satisfy safe mode threshold condition */
4482        private int blockThreshold;
4483        /** Number of blocks needed before populating replication queues */
4484        private int blockReplQueueThreshold;
4485        /** time of the last status printout */
4486        private long lastStatusReport = 0;
4487        /** flag indicating whether replication queues have been initialized */
4488        boolean initializedReplQueues = false;
4489        /** Was safemode entered automatically because available resources were low. */
4490        private boolean resourcesLow = false;
4491        /** Should safemode adjust its block totals as blocks come in */
4492        private boolean shouldIncrementallyTrackBlocks = false;
4493        /** counter for tracking startup progress of reported blocks */
4494        private Counter awaitingReportedBlocksCounter;
4495        
4496        /**
4497         * Creates SafeModeInfo when the name node enters
4498         * automatic safe mode at startup.
4499         *  
4500         * @param conf configuration
4501         */
4502        private SafeModeInfo(Configuration conf) {
4503          this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
4504              DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
4505          if(threshold > 1.0) {
4506            LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
4507          }
4508          this.datanodeThreshold = conf.getInt(
4509            DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
4510            DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
4511          this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
4512          this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
4513                                             DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
4514          
4515          LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
4516          LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
4517          LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
4518    
4519          // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
4520          this.replQueueThreshold = 
4521            conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
4522                          (float) threshold);
4523          this.blockTotal = 0; 
4524          this.blockSafe = 0;
4525        }
4526    
4527        /**
4528         * In the HA case, the StandbyNode can be in safemode while the namespace
4529         * is modified by the edit log tailer. In this case, the number of total
4530         * blocks changes as edits are processed (eg blocks are added and deleted).
4531         * However, we don't want to do the incremental tracking during the
4532         * startup-time loading process -- only once the initial total has been
4533         * set after the image has been loaded.
4534         */
4535        private boolean shouldIncrementallyTrackBlocks() {
4536          return shouldIncrementallyTrackBlocks;
4537        }
4538    
4539        /**
4540         * Creates SafeModeInfo when safe mode is entered manually, or because
4541         * available resources are low.
4542         *
4543         * The {@link #threshold} is set to 1.5 so that it could never be reached.
4544         * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
4545         * 
4546         * @see SafeModeInfo
4547         */
4548        private SafeModeInfo(boolean resourcesLow, boolean isReplQueuesInited) {
4549          this.threshold = 1.5f;  // this threshold can never be reached
4550          this.datanodeThreshold = Integer.MAX_VALUE;
4551          this.extension = Integer.MAX_VALUE;
4552          this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
4553          this.replQueueThreshold = 1.5f; // can never be reached
4554          this.blockTotal = -1;
4555          this.blockSafe = -1;
4556          this.resourcesLow = resourcesLow;
4557          this.initializedReplQueues = isReplQueuesInited;
4558          enter();
4559          reportStatus("STATE* Safe mode is ON.", true);
4560        }
4561          
4562        /**
4563         * Check if safe mode is on.
4564         * @return true if in safe mode
4565         */
4566        private synchronized boolean isOn() {
4567          doConsistencyCheck();
4568          return this.reached >= 0;
4569        }
4570          
4571        /**
4572         * Check if we are populating replication queues.
4573         */
4574        private synchronized boolean isPopulatingReplQueues() {
4575          return initializedReplQueues;
4576        }
4577    
4578        /**
4579         * Enter safe mode.
4580         */
4581        private void enter() {
4582          this.reached = 0;
4583        }
4584          
4585        /**
4586         * Leave safe mode.
4587         * <p>
4588         * Check for invalid, under- & over-replicated blocks in the end of startup.
4589         */
4590        private synchronized void leave() {
4591          // if not done yet, initialize replication queues.
4592          // In the standby, do not populate repl queues
4593          if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
4594            initializeReplQueues();
4595          }
4596          long timeInSafemode = now() - startTime;
4597          NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
4598                                        + timeInSafemode/1000 + " secs");
4599          NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
4600    
4601          //Log the following only once (when transitioning from ON -> OFF)
4602          if (reached >= 0) {
4603            NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
4604          }
4605          reached = -1;
4606          safeMode = null;
4607          final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
4608          NameNode.stateChangeLog.info("STATE* Network topology has "
4609              + nt.getNumOfRacks() + " racks and "
4610              + nt.getNumOfLeaves() + " datanodes");
4611          NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
4612              + blockManager.numOfUnderReplicatedBlocks() + " blocks");
4613    
4614          startSecretManagerIfNecessary();
4615    
4616          // If startup has not yet completed, end safemode phase.
4617          StartupProgress prog = NameNode.getStartupProgress();
4618          if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
4619            prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
4620            prog.endPhase(Phase.SAFEMODE);
4621          }
4622        }
4623    
4624        /**
4625         * Initialize replication queues.
4626         */
4627        private synchronized void initializeReplQueues() {
4628          LOG.info("initializing replication queues");
4629          assert !isPopulatingReplQueues() : "Already initialized repl queues";
4630          long startTimeMisReplicatedScan = now();
4631          blockManager.processMisReplicatedBlocks();
4632          initializedReplQueues = true;
4633          NameNode.stateChangeLog.info("STATE* Replication Queue initialization "
4634              + "scan for invalid, over- and under-replicated blocks "
4635              + "completed in " + (now() - startTimeMisReplicatedScan)
4636              + " msec");
4637        }
4638    
4639        /**
4640         * Check whether we have reached the threshold for 
4641         * initializing replication queues.
4642         */
4643        private synchronized boolean canInitializeReplQueues() {
4644          return shouldPopulateReplQueues()
4645              && blockSafe >= blockReplQueueThreshold;
4646        }
4647          
4648        /** 
4649         * Safe mode can be turned off iff 
4650         * the threshold is reached and 
4651         * the extension time have passed.
4652         * @return true if can leave or false otherwise.
4653         */
4654        private synchronized boolean canLeave() {
4655          if (reached == 0)
4656            return false;
4657          if (now() - reached < extension) {
4658            reportStatus("STATE* Safe mode ON.", false);
4659            return false;
4660          }
4661          return !needEnter();
4662        }
4663          
4664        /** 
4665         * There is no need to enter safe mode 
4666         * if DFS is empty or {@link #threshold} == 0
4667         */
4668        private boolean needEnter() {
4669          return (threshold != 0 && blockSafe < blockThreshold) ||
4670            (getNumLiveDataNodes() < datanodeThreshold) ||
4671            (!nameNodeHasResourcesAvailable());
4672        }
4673          
4674        /**
4675         * Check and trigger safe mode if needed. 
4676         */
4677        private void checkMode() {
4678          // Have to have write-lock since leaving safemode initializes
4679          // repl queues, which requires write lock
4680          assert hasWriteLock();
4681          // if smmthread is already running, the block threshold must have been 
4682          // reached before, there is no need to enter the safe mode again
4683          if (smmthread == null && needEnter()) {
4684            enter();
4685            // check if we are ready to initialize replication queues
4686            if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
4687              initializeReplQueues();
4688            }
4689            reportStatus("STATE* Safe mode ON.", false);
4690            return;
4691          }
4692          // the threshold is reached or was reached before
4693          if (!isOn() ||                           // safe mode is off
4694              extension <= 0 || threshold <= 0) {  // don't need to wait
4695            this.leave(); // leave safe mode
4696            return;
4697          }
4698          if (reached > 0) {  // threshold has already been reached before
4699            reportStatus("STATE* Safe mode ON.", false);
4700            return;
4701          }
4702          // start monitor
4703          reached = now();
4704          if (smmthread == null) {
4705            smmthread = new Daemon(new SafeModeMonitor());
4706            smmthread.start();
4707            reportStatus("STATE* Safe mode extension entered.", true);
4708          }
4709    
4710          // check if we are ready to initialize replication queues
4711          if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
4712            initializeReplQueues();
4713          }
4714        }
4715          
4716        /**
4717         * Set total number of blocks.
4718         */
4719        private synchronized void setBlockTotal(int total) {
4720          this.blockTotal = total;
4721          this.blockThreshold = (int) (blockTotal * threshold);
4722          this.blockReplQueueThreshold = 
4723            (int) (blockTotal * replQueueThreshold);
4724          if (haEnabled) {
4725            // After we initialize the block count, any further namespace
4726            // modifications done while in safe mode need to keep track
4727            // of the number of total blocks in the system.
4728            this.shouldIncrementallyTrackBlocks = true;
4729          }
4730          if(blockSafe < 0)
4731            this.blockSafe = 0;
4732          checkMode();
4733        }
4734          
4735        /**
4736         * Increment number of safe blocks if current block has 
4737         * reached minimal replication.
4738         * @param replication current replication 
4739         */
4740        private synchronized void incrementSafeBlockCount(short replication) {
4741          if (replication == safeReplication) {
4742            this.blockSafe++;
4743    
4744            // Report startup progress only if we haven't completed startup yet.
4745            StartupProgress prog = NameNode.getStartupProgress();
4746            if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
4747              if (this.awaitingReportedBlocksCounter == null) {
4748                this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
4749                  STEP_AWAITING_REPORTED_BLOCKS);
4750              }
4751              this.awaitingReportedBlocksCounter.increment();
4752            }
4753    
4754            checkMode();
4755          }
4756        }
4757          
4758        /**
4759         * Decrement number of safe blocks if current block has 
4760         * fallen below minimal replication.
4761         * @param replication current replication 
4762         */
4763        private synchronized void decrementSafeBlockCount(short replication) {
4764          if (replication == safeReplication-1) {
4765            this.blockSafe--;
4766            //blockSafe is set to -1 in manual / low resources safemode
4767            assert blockSafe >= 0 || isManual() || areResourcesLow();
4768            checkMode();
4769          }
4770        }
4771    
4772        /**
4773         * Check if safe mode was entered manually
4774         */
4775        private boolean isManual() {
4776          return extension == Integer.MAX_VALUE;
4777        }
4778    
4779        /**
4780         * Set manual safe mode.
4781         */
4782        private synchronized void setManual() {
4783          extension = Integer.MAX_VALUE;
4784        }
4785    
4786        /**
4787         * Check if safe mode was entered due to resources being low.
4788         */
4789        private boolean areResourcesLow() {
4790          return resourcesLow;
4791        }
4792    
4793        /**
4794         * Set that resources are low for this instance of safe mode.
4795         */
4796        private void setResourcesLow() {
4797          resourcesLow = true;
4798        }
4799    
4800        /**
4801         * A tip on how safe mode is to be turned off: manually or automatically.
4802         */
4803        String getTurnOffTip() {
4804          if(!isOn())
4805            return "Safe mode is OFF.";
4806    
4807          //Manual OR low-resource safemode. (Admin intervention required)
4808          String leaveMsg = "It was turned on manually. ";
4809          if (areResourcesLow()) {
4810            leaveMsg = "Resources are low on NN. Please add or free up more "
4811              + "resources then turn off safe mode manually. NOTE:  If you turn off"
4812              + " safe mode before adding resources, "
4813              + "the NN will immediately return to safe mode. ";
4814          }
4815          if (isManual() || areResourcesLow()) {
4816            return leaveMsg
4817              + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
4818          }
4819    
4820          //Automatic safemode. System will come out of safemode automatically.
4821          leaveMsg = "Safe mode will be turned off automatically";
4822          int numLive = getNumLiveDataNodes();
4823          String msg = "";
4824          if (reached == 0) {
4825            if (blockSafe < blockThreshold) {
4826              msg += String.format(
4827                "The reported blocks %d needs additional %d"
4828                + " blocks to reach the threshold %.4f of total blocks %d.\n",
4829                blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
4830            }
4831            if (numLive < datanodeThreshold) {
4832              msg += String.format(
4833                "The number of live datanodes %d needs an additional %d live "
4834                + "datanodes to reach the minimum number %d.\n",
4835                numLive, (datanodeThreshold - numLive), datanodeThreshold);
4836            }
4837          } else {
4838            msg = String.format("The reported blocks %d has reached the threshold"
4839                + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
4840    
4841            msg += String.format("The number of live datanodes %d has reached "
4842                                   + "the minimum number %d. ",
4843                                   numLive, datanodeThreshold);
4844          }
4845          msg += leaveMsg;
4846          // threshold is not reached or manual or resources low
4847          if(reached == 0 || (isManual() && !areResourcesLow())) {
4848            return msg;
4849          }
4850          // extension period is in progress
4851          return msg + (reached + extension - now() > 0 ?
4852            " in " + (reached + extension - now()) / 1000 + " seconds."
4853            : " soon.");
4854        }
4855    
4856        /**
4857         * Print status every 20 seconds.
4858         */
4859        private void reportStatus(String msg, boolean rightNow) {
4860          long curTime = now();
4861          if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
4862            return;
4863          NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
4864          lastStatusReport = curTime;
4865        }
4866    
4867        @Override
4868        public String toString() {
4869          String resText = "Current safe blocks = " 
4870            + blockSafe 
4871            + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
4872            + ". Minimal replication = " + safeReplication + ".";
4873          if (reached > 0) 
4874            resText += " Threshold was reached " + new Date(reached) + ".";
4875          return resText;
4876        }
4877          
4878        /**
4879         * Checks consistency of the class state.
4880         * This is costly so only runs if asserts are enabled.
4881         */
4882        private void doConsistencyCheck() {
4883          boolean assertsOn = false;
4884          assert assertsOn = true; // set to true if asserts are on
4885          if (!assertsOn) return;
4886          
4887          if (blockTotal == -1 && blockSafe == -1) {
4888            return; // manual safe mode
4889          }
4890          int activeBlocks = blockManager.getActiveBlockCount();
4891          if ((blockTotal != activeBlocks) &&
4892              !(blockSafe >= 0 && blockSafe <= blockTotal)) {
4893            throw new AssertionError(
4894                " SafeMode: Inconsistent filesystem state: "
4895            + "SafeMode data: blockTotal=" + blockTotal
4896            + " blockSafe=" + blockSafe + "; "
4897            + "BlockManager data: active="  + activeBlocks);
4898          }
4899        }
4900    
    /**
     * Adjust the safe and total block counts while in safe mode.
     * No-op unless blocks are tracked incrementally, which per the
     * assertion below only happens when HA is enabled.
     * @param deltaSafe change in the number of safe blocks
     * @param deltaTotal change in the number of expected total blocks
     */
    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
      if (!shouldIncrementallyTrackBlocks) {
        return;
      }
      assert haEnabled;
      
      if (LOG.isDebugEnabled()) {
        LOG.debug("Adjusting block totals from " +
            blockSafe + "/" + blockTotal + " to " +
            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
      }
      // Neither counter may be driven negative by the adjustment.
      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
        blockSafe + " by " + deltaSafe + ": would be negative";
      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
        blockTotal + " by " + deltaTotal + ": would be negative";
      
      blockSafe += deltaSafe;
      // setBlockTotal recomputes the leave-safe-mode threshold as well.
      setBlockTotal(blockTotal + deltaTotal);
    }
4920      }
4921        
4922      /**
4923       * Periodically check whether it is time to leave safe mode.
4924       * This thread starts when the threshold level is reached.
4925       *
4926       */
4927      class SafeModeMonitor implements Runnable {
4928        /** interval in msec for checking safe mode: {@value} */
4929        private static final long recheckInterval = 1000;
4930          
4931        /**
4932         */
4933        @Override
4934        public void run() {
4935          while (fsRunning) {
4936            writeLock();
4937            try {
4938              if (safeMode == null) { // Not in safe mode.
4939                break;
4940              }
4941              if (safeMode.canLeave()) {
4942                // Leave safe mode.
4943                safeMode.leave();
4944                smmthread = null;
4945                break;
4946              }
4947            } finally {
4948              writeUnlock();
4949            }
4950    
4951            try {
4952              Thread.sleep(recheckInterval);
4953            } catch (InterruptedException ie) {
4954              // Ignored
4955            }
4956          }
4957          if (!fsRunning) {
4958            LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
4959          }
4960        }
4961      }
4962        
4963      boolean setSafeMode(SafeModeAction action) throws IOException {
4964        if (action != SafeModeAction.SAFEMODE_GET) {
4965          checkSuperuserPrivilege();
4966          switch(action) {
4967          case SAFEMODE_LEAVE: // leave safe mode
4968            leaveSafeMode();
4969            break;
4970          case SAFEMODE_ENTER: // enter safe mode
4971            enterSafeMode(false);
4972            break;
4973          default:
4974            LOG.error("Unexpected safe mode action");
4975          }
4976        }
4977        return isInSafeMode();
4978      }
4979    
4980      @Override
4981      public void checkSafeMode() {
4982        // safeMode is volatile, and may be set to null at any time
4983        SafeModeInfo safeMode = this.safeMode;
4984        if (safeMode != null) {
4985          safeMode.checkMode();
4986        }
4987      }
4988    
4989      @Override
4990      public boolean isInSafeMode() {
4991        // safeMode is volatile, and may be set to null at any time
4992        SafeModeInfo safeMode = this.safeMode;
4993        if (safeMode == null)
4994          return false;
4995        return safeMode.isOn();
4996      }
4997    
4998      @Override
4999      public boolean isInStartupSafeMode() {
5000        // safeMode is volatile, and may be set to null at any time
5001        SafeModeInfo safeMode = this.safeMode;
5002        if (safeMode == null)
5003          return false;
5004        // If the NN is in safemode, and not due to manual / low resources, we
5005        // assume it must be because of startup. If the NN had low resources during
5006        // startup, we assume it came out of startup safemode and it is now in low
5007        // resources safemode
5008        return !safeMode.isManual() && !safeMode.areResourcesLow()
5009          && safeMode.isOn();
5010      }
5011    
5012      /**
5013       * Check if replication queues are to be populated
5014       * @return true when node is HAState.Active and not in the very first safemode
5015       */
5016      @Override
5017      public boolean isPopulatingReplQueues() {
5018        if (!shouldPopulateReplQueues()) {
5019          return false;
5020        }
5021        // safeMode is volatile, and may be set to null at any time
5022        SafeModeInfo safeMode = this.safeMode;
5023        if (safeMode == null)
5024          return true;
5025        return safeMode.isPopulatingReplQueues();
5026      }
5027    
5028      private boolean shouldPopulateReplQueues() {
5029        if(haContext == null || haContext.getState() == null)
5030          return false;
5031        return haContext.getState().shouldPopulateReplQueues();
5032      }
5033    
5034      @Override
5035      public void incrementSafeBlockCount(int replication) {
5036        // safeMode is volatile, and may be set to null at any time
5037        SafeModeInfo safeMode = this.safeMode;
5038        if (safeMode == null)
5039          return;
5040        safeMode.incrementSafeBlockCount((short)replication);
5041      }
5042    
5043      @Override
5044      public void decrementSafeBlockCount(Block b) {
5045        // safeMode is volatile, and may be set to null at any time
5046        SafeModeInfo safeMode = this.safeMode;
5047        if (safeMode == null) // mostly true
5048          return;
5049        BlockInfo storedBlock = getStoredBlock(b);
5050        if (storedBlock.isComplete()) {
5051          safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5052        }
5053      }
5054      
5055      /**
5056       * Adjust the total number of blocks safe and expected during safe mode.
5057       * If safe mode is not currently on, this is a no-op.
5058       * @param deltaSafe the change in number of safe blocks
5059       * @param deltaTotal the change i nnumber of total blocks expected
5060       */
5061      @Override
5062      public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5063        // safeMode is volatile, and may be set to null at any time
5064        SafeModeInfo safeMode = this.safeMode;
5065        if (safeMode == null)
5066          return;
5067        safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5068      }
5069    
5070      /**
5071       * Set the total number of blocks in the system. 
5072       */
5073      public void setBlockTotal() {
5074        // safeMode is volatile, and may be set to null at any time
5075        SafeModeInfo safeMode = this.safeMode;
5076        if (safeMode == null)
5077          return;
5078        safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5079      }
5080    
5081      /**
5082       * Get the total number of blocks in the system. 
5083       */
5084      @Override // FSNamesystemMBean
5085      @Metric
5086      public long getBlocksTotal() {
5087        return blockManager.getTotalBlocks();
5088      }
5089    
5090      /**
5091       * Get the total number of COMPLETE blocks in the system.
5092       * For safe mode only complete blocks are counted.
5093       */
5094      private long getCompleteBlocksTotal() {
5095        // Calculate number of blocks under construction
5096        long numUCBlocks = 0;
5097        readLock();
5098        try {
5099          for (Lease lease : leaseManager.getSortedLeases()) {
5100            for (String path : lease.getPaths()) {
5101              final INodeFileUnderConstruction cons;
5102              try {
5103                cons = INodeFileUnderConstruction.valueOf(dir.getINode(path), path);
5104              } catch (UnresolvedLinkException e) {
5105                throw new AssertionError("Lease files should reside on this FS");
5106              } catch (IOException e) {
5107                throw new RuntimeException(e);
5108              }
5109              BlockInfo[] blocks = cons.getBlocks();
5110              if(blocks == null)
5111                continue;
5112              for(BlockInfo b : blocks) {
5113                if(!b.isComplete())
5114                  numUCBlocks++;
5115              }
5116            }
5117          }
5118          LOG.info("Number of blocks under construction: " + numUCBlocks);
5119          return getBlocksTotal() - numUCBlocks;
5120        } finally {
5121          readUnlock();
5122        }
5123      }
5124    
5125      /**
5126       * Enter safe mode. If resourcesLow is false, then we assume it is manual
5127       * @throws IOException
5128       */
5129      void enterSafeMode(boolean resourcesLow) throws IOException {
5130        writeLock();
5131        try {
5132          // Stop the secret manager, since rolling the master key would
5133          // try to write to the edit log
5134          stopSecretManager();
5135    
5136          // Ensure that any concurrent operations have been fully synced
5137          // before entering safe mode. This ensures that the FSImage
5138          // is entirely stable on disk as soon as we're in safe mode.
5139          boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5140          // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5141          // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5142          if (isEditlogOpenForWrite) {
5143            getEditLog().logSyncAll();
5144          }
5145          if (!isInSafeMode()) {
5146            safeMode = new SafeModeInfo(resourcesLow, isPopulatingReplQueues());
5147            return;
5148          }
5149          if (resourcesLow) {
5150            safeMode.setResourcesLow();
5151          } else {
5152            safeMode.setManual();
5153          }
5154          if (isEditlogOpenForWrite) {
5155            getEditLog().logSyncAll();
5156          }
5157          NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5158              + safeMode.getTurnOffTip());
5159        } finally {
5160          writeUnlock();
5161        }
5162      }
5163    
5164      /**
5165       * Leave safe mode.
5166       * @throws IOException
5167       */
5168      void leaveSafeMode() {
5169        writeLock();
5170        try {
5171          if (!isInSafeMode()) {
5172            NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5173            return;
5174          }
5175          safeMode.leave();
5176        } finally {
5177          writeUnlock();
5178        }
5179      }
5180        
5181      String getSafeModeTip() {
5182        readLock();
5183        try {
5184          if (!isInSafeMode()) {
5185            return "";
5186          }
5187          return safeMode.getTurnOffTip();
5188        } finally {
5189          readUnlock();
5190        }
5191      }
5192    
  /**
   * Roll the edit log (close the current segment, open a new one).
   * Superuser-only; disallowed while in safe mode.
   * @return the checkpoint signature after the roll
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock();
    }
  }
5208    
  /**
   * Start a checkpoint on behalf of a backup node. Retried RPCs are served
   * the previously computed command from the retry cache.
   * @param backupNode registration of the requesting backup node
   * @param activeNamenode registration of the active NN
   * @return the command telling the backup node how to proceed
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    // Replay protection: a retried call returns the cached payload.
    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
        null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return (NamenodeCommand) cacheEntry.getPayload();
    }
    writeLock();
    NamenodeCommand cmd = null;
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);

      checkNameNodeSafeMode("Checkpoint not started");
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      cmd = getFSImage().startCheckpoint(backupNode, activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock();
      // Record success (cmd != null) and the payload for future retries.
      RetryCache.setState(cacheEntry, cmd != null, cmd);
    }
  }
5232    
  /**
   * Process an incremental block report from a datanode, holding the
   * namesystem write lock and delegating to the block manager.
   * @param nodeID the reporting datanode
   * @param poolId the block pool the report applies to
   * @param blockInfos received/deleted block records
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final String poolId, final ReceivedDeletedBlockInfo blockInfos[])
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, poolId, blockInfos);
    } finally {
      writeUnlock();
    }
  }
5243      
  /**
   * Finish a checkpoint previously started with
   * {@link #startCheckpoint}. Retried RPCs short-circuit via the retry
   * cache; note the image update itself only needs the read lock.
   * @param registration registration of the checkpointing node
   * @param sig the checkpoint signature being confirmed
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    // Replay protection: a retried call returns immediately.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    boolean success = false;
    readLock();
    try {
      // Re-check after acquiring the lock: HA state may have changed.
      checkOperation(OperationCategory.CHECKPOINT);

      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
      success = true;
    } finally {
      readUnlock();
      RetryCache.setState(cacheEntry, success);
    }
  }
5265    
  /**
   * Build a {@link PermissionStatus} owned by the filesystem owner and
   * supergroup, with the given permission bits.
   */
  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
  }
5269    
  /** Check that the current user owns {@code path} (doCheckOwner=true). */
  private void checkOwner(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, true, null, null, null, null);
  }
5274    
  /** Check {@code access} on the path's own inode (the 'access' slot). */
  private void checkPathAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, access, null);
  }
5280    
  /** Check {@code access} on the path's parent (the 'parentAccess' slot). */
  private void checkParentAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, access, null, null);
  }
5286    
  /**
   * Check {@code access} on the path's last existing ancestor
   * (the 'ancestorAccess' slot).
   */
  private void checkAncestorAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, access, null, null, null);
  }
5292    
  /** Check only traverse (execute) permission along the path components. */
  private void checkTraverse(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, null, null);
  }
5297    
5298      @Override
5299      public void checkSuperuserPrivilege()
5300          throws AccessControlException {
5301        if (isPermissionEnabled) {
5302          FSPermissionChecker pc = getPermissionChecker();
5303          pc.checkSuperuserPrivilege();
5304        }
5305      }
5306    
5307      /**
5308       * Check whether current user have permissions to access the path. For more
5309       * details of the parameters, see
5310       * {@link FSPermissionChecker#checkPermission()}.
5311       */
5312      private void checkPermission(FSPermissionChecker pc,
5313          String path, boolean doCheckOwner, FsAction ancestorAccess,
5314          FsAction parentAccess, FsAction access, FsAction subAccess)
5315          throws AccessControlException, UnresolvedLinkException {
5316            checkPermission(pc, path, doCheckOwner, ancestorAccess,
5317                parentAccess, access, subAccess, true);
5318      }
5319    
5320      /**
5321       * Check whether current user have permissions to access the path. For more
5322       * details of the parameters, see
5323       * {@link FSPermissionChecker#checkPermission()}.
5324       */
5325      private void checkPermission(FSPermissionChecker pc,
5326          String path, boolean doCheckOwner, FsAction ancestorAccess,
5327          FsAction parentAccess, FsAction access, FsAction subAccess,
5328          boolean resolveLink)
5329          throws AccessControlException, UnresolvedLinkException {
5330        if (!pc.isSuperUser()) {
5331          dir.waitForReady();
5332          readLock();
5333          try {
5334            pc.checkPermission(path, dir.rootDir, doCheckOwner, ancestorAccess,
5335                parentAccess, access, subAccess, resolveLink);
5336          } finally {
5337            readUnlock();
5338          }
5339        }
5340      }
5341      
5342      /**
5343       * Check to see if we have exceeded the limit on the number
5344       * of inodes.
5345       */
5346      void checkFsObjectLimit() throws IOException {
5347        if (maxFsObjects != 0 &&
5348            maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5349          throw new IOException("Exceeded the configured number of objects " +
5350                                 maxFsObjects + " in the filesystem.");
5351        }
5352      }
5353    
5354      /**
5355       * Get the total number of objects in the system. 
5356       */
5357      long getMaxObjects() {
5358        return maxFsObjects;
5359      }
5360    
  /** Total number of inodes in the namespace, read under the read lock. */
  @Override // FSNamesystemMBean
  @Metric
  public long getFilesTotal() {
    readLock();
    try {
      return this.dir.totalInodes();
    } finally {
      readUnlock();
    }
  }
5371    
  /** Number of blocks with replication currently in flight. */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }
5377    
  /** Number of blocks with fewer replicas than their target replication. */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }
5383    
  /** Returns number of blocks with corrupt replicas */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }
5389    
  /** Number of blocks currently scheduled for replication. */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }
5395    
  /** Number of blocks queued for deletion on datanodes. */
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }
5400    
  /** Number of over-replicated blocks (excess replicas to be removed). */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }
5405      
  // HA-only metric: mis-replicated blocks whose processing is deferred
  // until stale datanode state is resolved after failover.
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }
5411    
  // HA-only metric: datanode messages queued for later processing.
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }
5417      
  // HA-only metric: current HA state name.
  // NOTE(review): assumes haContext and its state are non-null here;
  // other callers guard for null (see shouldPopulateReplQueues) — confirm.
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }
5423    
5424      // HA-only metric
5425      @Metric
5426      public long getMillisSinceLastLoadedEdits() {
5427        if (isInStandbyState() && editLogTailer != null) {
5428          return now() - editLogTailer.getLastLoadTimestamp();
5429        } else {
5430          return 0;
5431        }
5432      }
5433      
  /** Current capacity of the block manager's blocks map. */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
5438    
  /** Coarse filesystem state for the MBean: "safeMode" or "Operational". */
  @Override // FSNamesystemMBean
  public String getFSState() {
    return isInSafeMode() ? "safeMode" : "Operational";
  }
5443      
5444      private ObjectName mbeanName;
5445    
5446      /**
5447       * Register the FSNamesystem MBean using the name
5448       *        "hadoop:service=NameNode,name=FSNamesystemState"
5449       */
5450      private void registerMBean() {
5451        // We can only implement one MXBean interface, so we keep the old one.
5452        try {
5453          StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
5454          mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
5455        } catch (NotCompliantMBeanException e) {
5456          throw new RuntimeException("Bad MBean setup", e);
5457        }
5458    
5459        LOG.info("Registered FSNamesystemState MBean");
5460      }
5461    
5462      /**
5463       * shutdown FSNamesystem
5464       */
5465      void shutdown() {
5466        if (mbeanName != null) {
5467          MBeans.unregister(mbeanName);
5468        }
5469        if (dir != null) {
5470          dir.shutdown();
5471        }
5472        if (blockManager != null) {
5473          blockManager.shutdown();
5474        }
5475      }
5476      
5477    
  /** Number of datanodes currently considered live. */
  @Override // FSNamesystemMBean
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }
5482    
  /** Number of datanodes currently considered dead. */
  @Override // FSNamesystemMBean
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }
5487      
  /** Number of datanodes marked stale due to delayed heartbeats. */
  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes", 
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }
5494    
5495      /**
5496       * Sets the current generation stamp for legacy blocks
5497       */
5498      void setGenerationStampV1(long stamp) {
5499        generationStampV1.setCurrentValue(stamp);
5500      }
5501    
5502      /**
5503       * Gets the current generation stamp for legacy blocks
5504       */
5505      long getGenerationStampV1() {
5506        return generationStampV1.getCurrentValue();
5507      }
5508    
5509      /**
5510       * Gets the current generation stamp for this filesystem
5511       */
5512      void setGenerationStampV2(long stamp) {
5513        generationStampV2.setCurrentValue(stamp);
5514      }
5515    
5516      /**
5517       * Gets the current generation stamp for this filesystem
5518       */
5519      long getGenerationStampV2() {
5520        return generationStampV2.getCurrentValue();
5521      }
5522    
5523      /**
5524       * Upgrades the generation stamp for the filesystem
5525       * by reserving a sufficient range for all existing blocks.
5526       * Should be invoked only during the first upgrade to
5527       * sequential block IDs.
5528       */
5529      long upgradeGenerationStampToV2() {
5530        Preconditions.checkState(generationStampV2.getCurrentValue() ==
5531            GenerationStamp.LAST_RESERVED_STAMP);
5532    
5533        generationStampV2.skipTo(
5534            generationStampV1.getCurrentValue() +
5535            HdfsConstants.RESERVED_GENERATION_STAMPS_V1);
5536    
5537        generationStampV1Limit = generationStampV2.getCurrentValue();
5538        return generationStampV2.getCurrentValue();
5539      }
5540    
5541      /**
5542       * Sets the generation stamp that delineates random and sequentially
5543       * allocated block IDs.
5544       * @param stamp
5545       */
5546      void setGenerationStampV1Limit(long stamp) {
5547        Preconditions.checkState(generationStampV1Limit ==
5548                                 GenerationStamp.GRANDFATHER_GENERATION_STAMP);
5549        generationStampV1Limit = stamp;
5550      }
5551    
5552      /**
5553       * Gets the value of the generation stamp that delineates sequential
5554       * and random block IDs.
5555       */
5556      long getGenerationStampAtblockIdSwitch() {
5557        return generationStampV1Limit;
5558      }
5559    
  /** Exposes the block ID generator for tests only. */
  @VisibleForTesting
  SequentialBlockIdGenerator getBlockIdGenerator() {
    return blockIdGenerator;
  }
5564    
5565      /**
5566       * Sets the maximum allocated block ID for this filesystem. This is
5567       * the basis for allocating new block IDs.
5568       */
5569      void setLastAllocatedBlockId(long blockId) {
5570        blockIdGenerator.skipTo(blockId);
5571      }
5572    
5573      /**
5574       * Gets the maximum sequentially allocated block ID for this filesystem
5575       */
5576      long getLastAllocatedBlockId() {
5577        return blockIdGenerator.getCurrentValue();
5578      }
5579    
5580      /**
5581       * Increments, logs and then returns the stamp
5582       */
5583      long nextGenerationStamp(boolean legacyBlock)
5584          throws IOException, SafeModeException {
5585        assert hasWriteLock();
5586        checkNameNodeSafeMode("Cannot get next generation stamp");
5587    
5588        long gs;
5589        if (legacyBlock) {
5590          gs = getNextGenerationStampV1();
5591          getEditLog().logGenerationStampV1(gs);
5592        } else {
5593          gs = getNextGenerationStampV2();
5594          getEditLog().logGenerationStampV2(gs);
5595        }
5596    
5597        // NB: callers sync the log
5598        return gs;
5599      }
5600    
  /**
   * Returns the next V1 (legacy) generation stamp, failing if the reserved
   * V1 range has been exhausted.
   * @throws OutOfV1GenerationStampsException if no V1 stamps remain
   */
  @VisibleForTesting
  long getNextGenerationStampV1() throws IOException {
    long genStampV1 = generationStampV1.nextValue();

    if (genStampV1 >= generationStampV1Limit) {
      // We ran out of generation stamps for legacy blocks. In practice, it
      // is extremely unlikely as we reserved 1T v1 generation stamps. The
      // result is that we can no longer append to the legacy blocks that
      // were created before the upgrade to sequential block IDs.
      throw new OutOfV1GenerationStampsException();
    }

    return genStampV1;
  }
5615    
  /** Returns the next V2 (sequential-block-ID era) generation stamp. */
  @VisibleForTesting
  long getNextGenerationStampV2() {
    return generationStampV2.nextValue();
  }
5620    
  /** Upper bound (exclusive) for V1 generation stamps. */
  long getGenerationStampV1Limit() {
    return generationStampV1Limit;
  }
5624    
5625      /**
5626       * Determine whether the block ID was randomly generated (legacy) or
5627       * sequentially generated. The generation stamp value is used to
5628       * make the distinction.
5629       * @param block
5630       * @return true if the block ID was randomly generated, false otherwise.
5631       */
5632      boolean isLegacyBlock(Block block) {
5633        return block.getGenerationStamp() < getGenerationStampV1Limit();
5634      }
5635    
5636      /**
5637       * Increments, logs and then returns the block ID
5638       */
5639      private long nextBlockId() throws IOException {
5640        assert hasWriteLock();
5641        checkNameNodeSafeMode("Cannot get next block ID");
5642        final long blockId = blockIdGenerator.nextValue();
5643        getEditLog().logAllocateBlockId(blockId);
5644        // NB: callers sync the log
5645        return blockId;
5646      }
5647    
  /**
   * Validate that {@code block} is a known under-construction block whose
   * file is under construction and leased by {@code clientName}.
   * Caller must hold the write lock; disallowed in safe mode.
   * @return the under-construction file inode owning the block
   * @throws IOException if the block or file checks fail
   * @throws LeaseExpiredException if the lease holder does not match
   */
  private INodeFileUnderConstruction checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfo storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        // NOTE(review): message lacks a separator before storedBlock,
        // producing e.g. "...under Constructionnull" — consider fixing.
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file==null || !file.isUnderConstruction()) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease: only the current lease holder may act on the block
    INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)file;
    if (clientName == null || !clientName.equals(pendingFile.getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return pendingFile;
  }
5678      
5679      /**
5680       * Client is reporting some bad block locations.
5681       */
5682      void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
5683        checkOperation(OperationCategory.WRITE);
5684        NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
5685        writeLock();
5686        try {
5687          checkOperation(OperationCategory.WRITE);
5688          for (int i = 0; i < blocks.length; i++) {
5689            ExtendedBlock blk = blocks[i].getBlock();
5690            DatanodeInfo[] nodes = blocks[i].getLocations();
5691            for (int j = 0; j < nodes.length; j++) {
5692              DatanodeInfo dn = nodes[j];
5693              blockManager.findAndMarkBlockAsCorrupt(blk, dn,
5694                  "client machine reported it");
5695            }
5696          }
5697        } finally {
5698          writeUnlock();
5699        }
5700      }
5701    
5702      /**
5703       * Get a new generation stamp together with an access token for 
5704       * a block under construction
5705       * 
5706       * This method is called for recovering a failed pipeline or setting up
5707       * a pipeline to append to a block.
5708       * 
5709       * @param block a block
5710       * @param clientName the name of a client
5711       * @return a located block with a new generation stamp and an access token
5712       * @throws IOException if any error occurs
5713       */
5714      LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
5715          String clientName) throws IOException {
5716        LocatedBlock locatedBlock;
5717        checkOperation(OperationCategory.WRITE);
5718        writeLock();
5719        try {
5720          checkOperation(OperationCategory.WRITE);
5721    
5722          // check vadility of parameters
5723          checkUCBlock(block, clientName);
5724      
5725          // get a new generation stamp and an access token
5726          block.setGenerationStamp(
5727              nextGenerationStamp(isLegacyBlock(block.getLocalBlock())));
5728          locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
5729          blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
5730        } finally {
5731          writeUnlock();
5732        }
5733        // Ensure we record the new generation stamp
5734        getEditLog().logSync();
5735        return locatedBlock;
5736      }
5737      
5738      /**
5739       * Update a pipeline for a block under construction
5740       * 
5741       * @param clientName the name of the client
5742       * @param oldBlock and old block
5743       * @param newBlock a new block with a new generation stamp and length
5744       * @param newNodes datanodes in the pipeline
5745       * @throws IOException if any error occurs
5746       */
5747      void updatePipeline(String clientName, ExtendedBlock oldBlock, 
5748          ExtendedBlock newBlock, DatanodeID[] newNodes)
5749          throws IOException {
5750        checkOperation(OperationCategory.WRITE);
5751        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
5752        if (cacheEntry != null && cacheEntry.isSuccess()) {
5753          return; // Return previous response
5754        }
5755        LOG.info("updatePipeline(block=" + oldBlock
5756                 + ", newGenerationStamp=" + newBlock.getGenerationStamp()
5757                 + ", newLength=" + newBlock.getNumBytes()
5758                 + ", newNodes=" + Arrays.asList(newNodes)
5759                 + ", clientName=" + clientName
5760                 + ")");
5761        writeLock();
5762        boolean success = false;
5763        try {
5764          checkOperation(OperationCategory.WRITE);
5765          checkNameNodeSafeMode("Pipeline not updated");
5766          assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
5767            + oldBlock + " has different block identifier";
5768          updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
5769              cacheEntry != null);
5770          success = true;
5771        } finally {
5772          writeUnlock();
5773          RetryCache.setState(cacheEntry, success);
5774        }
5775        getEditLog().logSync();
5776        LOG.info("updatePipeline(" + oldBlock + ") successfully to " + newBlock);
5777      }
5778    
  /** @see #updatePipeline(String, ExtendedBlock, ExtendedBlock, DatanodeID[]) */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock, 
      ExtendedBlock newBlock, DatanodeID[] newNodes, boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFileUnderConstruction pendingFile
        = checkUCBlock(oldBlock, clientName);
    final BlockInfoUnderConstruction blockinfo
        = (BlockInfoUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected — a pipeline update must
    // advance the generation stamp and must never shrink the block
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " + 
        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockinfo.setNumBytes(newBlock.getNumBytes());
    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());

    // find the DatanodeDescriptor objects
    // (descriptors remains null when the new pipeline has no nodes)
    final DatanodeManager dm = getBlockManager().getDatanodeManager();
    DatanodeDescriptor[] descriptors = null;
    if (newNodes.length > 0) {
      descriptors = new DatanodeDescriptor[newNodes.length];
      for(int i = 0; i < newNodes.length; i++) {
        descriptors[i] = dm.getDatanode(newNodes[i]);
      }
    }
    blockinfo.setExpectedLocations(descriptors);

    // persist the updated block list of the file to the edit log
    String src = leaseManager.findPath(pendingFile);
    dir.persistBlocks(src, pendingFile, logRetryCache);
  }
5818    
  /**
   * A rename was successful. If any part of the renamed subtree had
   * files that were being written to, update the lease records with the
   * new path. Caller must hold the FSN write lock.
   */
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
5825    
5826      /**
5827       * Serializes leases. 
5828       */
5829      void saveFilesUnderConstruction(DataOutputStream out,
5830          Map<Long, INodeFileUnderConstruction> snapshotUCMap) throws IOException {
5831        // This is run by an inferior thread of saveNamespace, which holds a read
5832        // lock on our behalf. If we took the read lock here, we could block
5833        // for fairness if a writer is waiting on the lock.
5834        synchronized (leaseManager) {
5835          Map<String, INodeFileUnderConstruction> nodes =
5836              leaseManager.getINodesUnderConstruction();
5837          for (Map.Entry<String, INodeFileUnderConstruction> entry
5838              : nodes.entrySet()) {
5839            // TODO: for HDFS-5428, because of rename operations, some
5840            // under-construction files that are
5841            // in the current fs directory can also be captured in the
5842            // snapshotUCMap. We should remove them from the snapshotUCMap.
5843            snapshotUCMap.remove(entry.getValue().getId());
5844          }
5845          
5846          out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size    
5847          for (Map.Entry<String, INodeFileUnderConstruction> entry
5848               : nodes.entrySet()) {
5849            FSImageSerialization.writeINodeUnderConstruction(
5850                out, entry.getValue(), entry.getKey());
5851          }
5852          for (Map.Entry<Long, INodeFileUnderConstruction> entry
5853              : snapshotUCMap.entrySet()) {
5854            // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
5855            // as their paths
5856            StringBuilder b = new StringBuilder();
5857            b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
5858                .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
5859                .append(Path.SEPARATOR).append(entry.getValue().getId());
5860            FSImageSerialization.writeINodeUnderConstruction(
5861                out, entry.getValue(), b.toString());
5862          }
5863        }
5864      }
5865    
5866      /**
5867       * Register a Backup name-node, verifying that it belongs
5868       * to the correct namespace, and adding it to the set of
5869       * active journals if necessary.
5870       * 
5871       * @param bnReg registration of the new BackupNode
5872       * @param nnReg registration of this NameNode
5873       * @throws IOException if the namespace IDs do not match
5874       */
5875      void registerBackupNode(NamenodeRegistration bnReg,
5876          NamenodeRegistration nnReg) throws IOException {
5877        writeLock();
5878        try {
5879          if(getFSImage().getStorage().getNamespaceID() 
5880             != bnReg.getNamespaceID())
5881            throw new IOException("Incompatible namespaceIDs: "
5882                + " Namenode namespaceID = "
5883                + getFSImage().getStorage().getNamespaceID() + "; "
5884                + bnReg.getRole() +
5885                " node namespaceID = " + bnReg.getNamespaceID());
5886          if (bnReg.getRole() == NamenodeRole.BACKUP) {
5887            getFSImage().getEditLog().registerBackupNode(
5888                bnReg, nnReg);
5889          }
5890        } finally {
5891          writeUnlock();
5892        }
5893      }
5894    
5895      /**
5896       * Release (unregister) backup node.
5897       * <p>
5898       * Find and remove the backup stream corresponding to the node.
5899       * @param registration
5900       * @throws IOException
5901       */
5902      void releaseBackupNode(NamenodeRegistration registration)
5903        throws IOException {
5904        checkOperation(OperationCategory.WRITE);
5905        writeLock();
5906        try {
5907          checkOperation(OperationCategory.WRITE);
5908          if(getFSImage().getStorage().getNamespaceID()
5909             != registration.getNamespaceID())
5910            throw new IOException("Incompatible namespaceIDs: "
5911                + " Namenode namespaceID = "
5912                + getFSImage().getStorage().getNamespaceID() + "; "
5913                + registration.getRole() +
5914                " node namespaceID = " + registration.getNamespaceID());
5915          getEditLog().releaseBackupStream(registration);
5916        } finally {
5917          writeUnlock();
5918        }
5919      }
5920    
5921      static class CorruptFileBlockInfo {
5922        String path;
5923        Block block;
5924        
5925        public CorruptFileBlockInfo(String p, Block b) {
5926          path = p;
5927          block = b;
5928        }
5929        
5930        @Override
5931        public String toString() {
5932          return block.getBlockName() + "\t" + path;
5933        }
5934      }
5935      /**
5936       * @param path Restrict corrupt files to this portion of namespace.
5937       * @param startBlockAfter Support for continuation; the set of files we return
5938       *  back is ordered by blockid; startBlockAfter tells where to start from
5939       * @return a list in which each entry describes a corrupt file/block
5940       * @throws AccessControlException
5941       * @throws IOException
5942       */
5943      Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
5944            String[] cookieTab) throws IOException {
5945        checkSuperuserPrivilege();
5946        checkOperation(OperationCategory.READ);
5947        readLock();
5948        try {
5949          checkOperation(OperationCategory.READ);
5950          if (!isPopulatingReplQueues()) {
5951            throw new IOException("Cannot run listCorruptFileBlocks because " +
5952                                  "replication queues have not been initialized.");
5953          }
5954          // print a limited # of corrupt files per call
5955          int count = 0;
5956          ArrayList<CorruptFileBlockInfo> corruptFiles = new ArrayList<CorruptFileBlockInfo>();
5957    
5958          final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
5959    
5960          if (cookieTab == null) {
5961            cookieTab = new String[] { null };
5962          }
5963          int skip = getIntCookie(cookieTab[0]);
5964          for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
5965            blkIterator.next();
5966          }
5967    
5968          while (blkIterator.hasNext()) {
5969            Block blk = blkIterator.next();
5970            final INode inode = (INode)blockManager.getBlockCollection(blk);
5971            skip++;
5972            if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
5973              String src = FSDirectory.getFullPathName(inode);
5974              if (src.startsWith(path)){
5975                corruptFiles.add(new CorruptFileBlockInfo(src, blk));
5976                count++;
5977                if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
5978                  break;
5979              }
5980            }
5981          }
5982          cookieTab[0] = String.valueOf(skip);
5983          LOG.info("list corrupt file blocks returned: " + count);
5984          return corruptFiles;
5985        } finally {
5986          readUnlock();
5987        }
5988      }
5989    
5990      /**
5991       * Convert string cookie to integer.
5992       */
5993      private static int getIntCookie(String cookie){
5994        int c;
5995        if(cookie == null){
5996          c = 0;
5997        } else {
5998          try{
5999            c = Integer.parseInt(cookie);
6000          }catch (NumberFormatException e) {
6001            c = 0;
6002          }
6003        }
6004        c = Math.max(0, c);
6005        return c;
6006      }
6007    
6008      /**
6009       * Create delegation token secret manager
6010       */
6011      private DelegationTokenSecretManager createDelegationTokenSecretManager(
6012          Configuration conf) {
6013        return new DelegationTokenSecretManager(conf.getLong(
6014            DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6015            DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6016            conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6017                DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6018            conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6019                DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6020            DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6021            conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6022                DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6023            this);
6024      }
6025    
6026      /**
6027       * Returns the DelegationTokenSecretManager instance in the namesystem.
6028       * @return delegation token secret manager object
6029       */
6030      DelegationTokenSecretManager getDelegationTokenSecretManager() {
6031        return dtSecretManager;
6032      }
6033    
6034      /**
6035       * @param renewer
6036       * @return Token<DelegationTokenIdentifier>
6037       * @throws IOException
6038       */
6039      Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6040          throws IOException {
6041        Token<DelegationTokenIdentifier> token;
6042        checkOperation(OperationCategory.WRITE);
6043        writeLock();
6044        try {
6045          checkOperation(OperationCategory.WRITE);
6046          checkNameNodeSafeMode("Cannot issue delegation token");
6047          if (!isAllowedDelegationTokenOp()) {
6048            throw new IOException(
6049              "Delegation Token can be issued only with kerberos or web authentication");
6050          }
6051          if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6052            LOG.warn("trying to get DT with no secret manager running");
6053            return null;
6054          }
6055    
6056          UserGroupInformation ugi = getRemoteUser();
6057          String user = ugi.getUserName();
6058          Text owner = new Text(user);
6059          Text realUser = null;
6060          if (ugi.getRealUser() != null) {
6061            realUser = new Text(ugi.getRealUser().getUserName());
6062          }
6063          DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6064            renewer, realUser);
6065          token = new Token<DelegationTokenIdentifier>(
6066            dtId, dtSecretManager);
6067          long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6068          getEditLog().logGetDelegationToken(dtId, expiryTime);
6069        } finally {
6070          writeUnlock();
6071        }
6072        getEditLog().logSync();
6073        return token;
6074      }
6075    
6076      /**
6077       * 
6078       * @param token
6079       * @return New expiryTime of the token
6080       * @throws InvalidToken
6081       * @throws IOException
6082       */
6083      long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6084          throws InvalidToken, IOException {
6085        long expiryTime;
6086        checkOperation(OperationCategory.WRITE);
6087        writeLock();
6088        try {
6089          checkOperation(OperationCategory.WRITE);
6090    
6091          checkNameNodeSafeMode("Cannot renew delegation token");
6092          if (!isAllowedDelegationTokenOp()) {
6093            throw new IOException(
6094                "Delegation Token can be renewed only with kerberos or web authentication");
6095          }
6096          String renewer = getRemoteUser().getShortUserName();
6097          expiryTime = dtSecretManager.renewToken(token, renewer);
6098          DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6099          ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6100          DataInputStream in = new DataInputStream(buf);
6101          id.readFields(in);
6102          getEditLog().logRenewDelegationToken(id, expiryTime);
6103        } finally {
6104          writeUnlock();
6105        }
6106        getEditLog().logSync();
6107        return expiryTime;
6108      }
6109    
6110      /**
6111       * 
6112       * @param token
6113       * @throws IOException
6114       */
6115      void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6116          throws IOException {
6117        checkOperation(OperationCategory.WRITE);
6118        writeLock();
6119        try {
6120          checkOperation(OperationCategory.WRITE);
6121    
6122          checkNameNodeSafeMode("Cannot cancel delegation token");
6123          String canceller = getRemoteUser().getUserName();
6124          DelegationTokenIdentifier id = dtSecretManager
6125            .cancelToken(token, canceller);
6126          getEditLog().logCancelDelegationToken(id);
6127        } finally {
6128          writeUnlock();
6129        }
6130        getEditLog().logSync();
6131      }
6132      
6133      /**
6134       * @param out save state of the secret manager
6135       * @param sdPath String storage directory path
6136       */
6137      void saveSecretManagerState(DataOutputStream out, String sdPath)
6138          throws IOException {
6139        dtSecretManager.saveSecretManagerState(out, sdPath);
6140      }
6141    
6142      /**
6143       * @param in load the state of secret manager from input stream
6144       */
6145      void loadSecretManagerState(DataInput in) throws IOException {
6146        dtSecretManager.loadSecretManagerState(in);
6147      }
6148    
6149      /**
6150       * Log the updateMasterKey operation to edit logs
6151       * 
6152       * @param key new delegation key.
6153       */
6154      public void logUpdateMasterKey(DelegationKey key) {
6155        
6156        assert !isInSafeMode() :
6157          "this should never be called while in safemode, since we stop " +
6158          "the DT manager before entering safemode!";
6159        // No need to hold FSN lock since we don't access any internal
6160        // structures, and this is stopped before the FSN shuts itself
6161        // down, etc.
6162        getEditLog().logUpdateMasterKey(key);
6163        getEditLog().logSync();
6164      }
6165      
6166      /**
6167       * Log the cancellation of expired tokens to edit logs
6168       * 
6169       * @param id token identifier to cancel
6170       */
6171      public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6172        assert !isInSafeMode() :
6173          "this should never be called while in safemode, since we stop " +
6174          "the DT manager before entering safemode!";
6175        // No need to hold FSN lock since we don't access any internal
6176        // structures, and this is stopped before the FSN shuts itself
6177        // down, etc.
6178        getEditLog().logCancelDelegationToken(id);
6179      }  
6180      
  /**
   * Write a lease-reassignment record to the edit log.
   * Caller must hold the FSN write lock.
   */
  private void logReassignLease(String leaseHolder, String src,
      String newHolder) {
    assert hasWriteLock();
    getEditLog().logReassignLease(leaseHolder, src, newHolder);
  }
6186      
6187      /**
6188       * 
6189       * @return true if delegation token operation is allowed
6190       */
6191      private boolean isAllowedDelegationTokenOp() throws IOException {
6192        AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6193        if (UserGroupInformation.isSecurityEnabled()
6194            && (authMethod != AuthenticationMethod.KERBEROS)
6195            && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6196            && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6197          return false;
6198        }
6199        return true;
6200      }
6201      
6202      /**
6203       * Returns authentication method used to establish the connection
6204       * @return AuthenticationMethod used to establish connection
6205       * @throws IOException
6206       */
6207      private AuthenticationMethod getConnectionAuthenticationMethod()
6208          throws IOException {
6209        UserGroupInformation ugi = getRemoteUser();
6210        AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6211        if (authMethod == AuthenticationMethod.PROXY) {
6212          authMethod = ugi.getRealUser().getAuthenticationMethod();
6213        }
6214        return authMethod;
6215      }
6216      
6217      /**
6218       * Client invoked methods are invoked over RPC and will be in 
6219       * RPC call context even if the client exits.
6220       */
6221      private boolean isExternalInvocation() {
6222        return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6223      }
6224    
6225      private static InetAddress getRemoteIp() {
6226        InetAddress ip = Server.getRemoteIp();
6227        if (ip != null) {
6228          return ip;
6229        }
6230        return NamenodeWebHdfsMethods.getRemoteIp();
6231      }
6232      
  /**
   * Get the user making the current call. Optimizes ugi lookup for RPC
   * operations to avoid a trip through UGI.getCurrentUser which is synch'ed.
   */
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }
6238      
6239      /**
6240       * Log fsck event in the audit log 
6241       */
6242      void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6243        if (isAuditEnabled()) {
6244          logAuditEvent(true, getRemoteUser(),
6245                        remoteAddress,
6246                        "fsck", src, null, null);
6247        }
6248      }
6249      /**
6250       * Register NameNodeMXBean
6251       */
6252      private void registerMXBean() {
6253        MBeans.register("NameNode", "NameNodeInfo", this);
6254      }
6255    
6256      /**
6257       * Class representing Namenode information for JMX interfaces
6258       */
6259      @Override // NameNodeMXBean
6260      public String getVersion() {
6261        return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6262      }
6263    
6264      @Override // NameNodeMXBean
6265      public long getUsed() {
6266        return this.getCapacityUsed();
6267      }
6268    
6269      @Override // NameNodeMXBean
6270      public long getFree() {
6271        return this.getCapacityRemaining();
6272      }
6273    
6274      @Override // NameNodeMXBean
6275      public long getTotal() {
6276        return this.getCapacityTotal();
6277      }
6278    
6279      @Override // NameNodeMXBean
6280      public String getSafemode() {
6281        if (!this.isInSafeMode())
6282          return "";
6283        return "Safe mode is ON. " + this.getSafeModeTip();
6284      }
6285    
  /** @return whether the last upgrade has been finalized. */
  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }

  /** @return capacity used for non-DFS purposes on the datanodes. */
  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }

  /** @return percentage of the total capacity currently in use. */
  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }

  /** @return space used by this block pool. */
  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }

  /** @return percentage of capacity used by this block pool. */
  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }

  /** @return percentage of the total capacity still remaining. */
  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }

  /** @return total number of blocks in the namespace. */
  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }

  /** @return total number of files and directories in the namespace. */
  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }

  /** @return number of blocks reported missing. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }
  
  /** @return number of live threads in this JVM. */
  @Override // NameNodeMXBean
  public int getThreads() {
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }
6336    
6337      /**
6338       * Returned information is a JSON representation of map with host name as the
6339       * key and value is a map of live node attribute keys to its values
6340       */
6341      @Override // NameNodeMXBean
6342      public String getLiveNodes() {
6343        final Map<String, Map<String,Object>> info = 
6344          new HashMap<String, Map<String,Object>>();
6345        final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6346        blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6347        for (DatanodeDescriptor node : live) {
6348          final Map<String, Object> innerinfo = new HashMap<String, Object>();
6349          innerinfo.put("lastContact", getLastContact(node));
6350          innerinfo.put("usedSpace", getDfsUsed(node));
6351          innerinfo.put("adminState", node.getAdminState().toString());
6352          innerinfo.put("nonDfsUsedSpace", node.getNonDfsUsed());
6353          innerinfo.put("capacity", node.getCapacity());
6354          innerinfo.put("numBlocks", node.numBlocks());
6355          innerinfo.put("version", node.getSoftwareVersion());
6356          info.put(node.getHostName(), innerinfo);
6357        }
6358        return JSON.toString(info);
6359      }
6360    
6361      /**
6362       * Returned information is a JSON representation of map with host name as the
6363       * key and value is a map of dead node attribute keys to its values
6364       */
6365      @Override // NameNodeMXBean
6366      public String getDeadNodes() {
6367        final Map<String, Map<String, Object>> info = 
6368          new HashMap<String, Map<String, Object>>();
6369        final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6370        blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
6371        for (DatanodeDescriptor node : dead) {
6372          final Map<String, Object> innerinfo = new HashMap<String, Object>();
6373          innerinfo.put("lastContact", getLastContact(node));
6374          innerinfo.put("decommissioned", node.isDecommissioned());
6375          info.put(node.getHostName(), innerinfo);
6376        }
6377        return JSON.toString(info);
6378      }
6379    
6380      /**
6381       * Returned information is a JSON representation of map with host name as the
6382       * key and value is a map of decomisioning node attribute keys to its values
6383       */
6384      @Override // NameNodeMXBean
6385      public String getDecomNodes() {
6386        final Map<String, Map<String, Object>> info = 
6387          new HashMap<String, Map<String, Object>>();
6388        final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
6389            ).getDecommissioningNodes();
6390        for (DatanodeDescriptor node : decomNodeList) {
6391          final Map<String, Object> innerinfo = new HashMap<String, Object>();
6392          innerinfo.put("underReplicatedBlocks", node.decommissioningStatus
6393              .getUnderReplicatedBlocks());
6394          innerinfo.put("decommissionOnlyReplicas", node.decommissioningStatus
6395              .getDecommissionOnlyReplicas());
6396          innerinfo.put("underReplicateInOpenFiles", node.decommissioningStatus
6397              .getUnderReplicatedInOpenFiles());
6398          info.put(node.getHostName(), innerinfo);
6399        }
6400        return JSON.toString(info);
6401      }
6402    
  /** @return seconds elapsed since the given datanode's last heartbeat. */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (Time.now() - alivenode.getLastUpdate())/1000;
  }

  /** @return DFS space used on the given datanode. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
6410    
  /** @return the cluster ID recorded in the on-disk storage. */
  @Override  // NameNodeMXBean
  public String getClusterId() {
    return dir.fsImage.getStorage().getClusterID();
  }
  
  /** @return the ID of the block pool served by this namesystem. */
  @Override  // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
6420      
6421      @Override  // NameNodeMXBean
6422      public String getNameDirStatuses() {
6423        Map<String, Map<File, StorageDirType>> statusMap =
6424          new HashMap<String, Map<File, StorageDirType>>();
6425        
6426        Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
6427        for (Iterator<StorageDirectory> it
6428            = getFSImage().getStorage().dirIterator(); it.hasNext();) {
6429          StorageDirectory st = it.next();
6430          activeDirs.put(st.getRoot(), st.getStorageDirType());
6431        }
6432        statusMap.put("active", activeDirs);
6433        
6434        List<Storage.StorageDirectory> removedStorageDirs
6435            = getFSImage().getStorage().getRemovedStorageDirs();
6436        Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
6437        for (StorageDirectory st : removedStorageDirs) {
6438          failedDirs.put(st.getRoot(), st.getStorageDirType());
6439        }
6440        statusMap.put("failed", failedDirs);
6441        
6442        return JSON.toString(statusMap);
6443      }
6444    
6445      @Override // NameNodeMxBean
6446      public String getJournalTransactionInfo() {
6447        Map<String, String> txnIdMap = new HashMap<String, String>();
6448        txnIdMap.put("LastAppliedOrWrittenTxId",
6449            Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
6450        txnIdMap.put("MostRecentCheckpointTxId",
6451            Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
6452        return JSON.toString(txnIdMap);
6453      }
6454      
  /** @return the block manager. */
  public BlockManager getBlockManager() {
    return blockManager;
  }
  /** @return the FSDirectory (the in-memory namespace tree). */
  public FSDirectory getFSDirectory() {
    return dir;
  }
6463    
  /** @return number of distinct datanode software versions in the cluster. */
  @Override  //NameNodeMXBean
  public int getDistinctVersionCount() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
      .size();
  }

  /** @return map of datanode software version to datanode count. */
  @Override  //NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }

  /** @return the software version of this NameNode. */
  @Override  //NameNodeMXBean
  public String getSoftwareVersion() {
    return VersionInfo.getVersion();
  }
6479    
6480      /**
6481       * Verifies that the given identifier and password are valid and match.
6482       * @param identifier Token identifier.
6483       * @param password Password in the token.
6484       */
6485      public synchronized void verifyToken(DelegationTokenIdentifier identifier,
6486          byte[] password) throws InvalidToken, RetriableException {
6487        try {
6488          getDelegationTokenSecretManager().verifyToken(identifier, password);
6489        } catch (InvalidToken it) {
6490          if (inTransitionToActive()) {
6491            throw new RetriableException(it);
6492          }
6493          throw it;
6494        }
6495      }
6496      
6497      @Override
6498      public boolean isGenStampInFuture(Block block) {
6499        if (isLegacyBlock(block)) {
6500          return block.getGenerationStamp() > getGenerationStampV1();
6501        } else {
6502          return block.getGenerationStamp() > getGenerationStampV2();
6503        }
6504      }
6505    
6506      @VisibleForTesting
6507      public EditLogTailer getEditLogTailer() {
6508        return editLogTailer;
6509      }
6510      
6511      @VisibleForTesting
6512      public void setEditLogTailerForTests(EditLogTailer tailer) {
6513        this.editLogTailer = tailer;
6514      }
6515      
6516      @VisibleForTesting
6517      void setFsLockForTests(ReentrantReadWriteLock lock) {
6518        this.fsLock = lock;
6519      }
6520      
6521      @VisibleForTesting
6522      ReentrantReadWriteLock getFsLockForTests() {
6523        return fsLock;
6524      }
6525    
6526      @VisibleForTesting
6527      public SafeModeInfo getSafeModeInfoForTests() {
6528        return safeMode;
6529      }
6530      
6531      @VisibleForTesting
6532      public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
6533        this.nnResourceChecker = nnResourceChecker;
6534      }
6535    
6536      @Override
6537      public boolean isAvoidingStaleDataNodesForWrite() {
6538        return this.blockManager.getDatanodeManager()
6539            .shouldAvoidStaleDataNodesForWrite();
6540      }
6541      
6542      public SnapshotManager getSnapshotManager() {
6543        return snapshotManager;
6544      }
6545      
  /** Allow snapshot on a directory (superuser only; not in safe mode). */
  void allowSnapshot(String path) throws SafeModeException, IOException {
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();

      // Mutating the snapshottable-directory set also requires the
      // FSDirectory lock.
      dir.writeLock();
      try {
        snapshotManager.setSnapshottable(path, true);
      } finally {
        dir.writeUnlock();
      }
      // Log the edit while still holding the namesystem write lock...
      getEditLog().logAllowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // ...but sync it to stable storage after releasing the lock.
    getEditLog().logSync();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "allowSnapshot", path, null, null);
    }
  }
6570      
  /** Disallow snapshot on a directory (superuser only; not in safe mode). */
  void disallowSnapshot(String path) throws SafeModeException, IOException {
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();

      // Mutating the snapshottable-directory set also requires the
      // FSDirectory lock.
      dir.writeLock();
      try {
        snapshotManager.resetSnapshottable(path);
      } finally {
        dir.writeUnlock();
      }
      // Log the edit while still holding the namesystem write lock...
      getEditLog().logDisallowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // ...but sync it to stable storage after releasing the lock.
    getEditLog().logSync();
    
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "disallowSnapshot", path, null, null);
    }
  }
6595      
6596      /**
6597       * Create a snapshot
6598       * @param snapshotRoot The directory path where the snapshot is taken
6599       * @param snapshotName The name of the snapshot
6600       */
6601      String createSnapshot(String snapshotRoot, String snapshotName)
6602          throws SafeModeException, IOException {
6603        checkOperation(OperationCategory.WRITE);
6604        final FSPermissionChecker pc = getPermissionChecker();
6605        CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
6606            null);
6607        if (cacheEntry != null && cacheEntry.isSuccess()) {
6608          return (String) cacheEntry.getPayload();
6609        }
6610        writeLock();
6611        String snapshotPath = null;
6612        try {
6613          checkOperation(OperationCategory.WRITE);
6614          checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
6615          if (isPermissionEnabled) {
6616            checkOwner(pc, snapshotRoot);
6617          }
6618    
6619          if (snapshotName == null || snapshotName.isEmpty()) {
6620            snapshotName = Snapshot.generateDefaultSnapshotName();
6621          }
6622          dir.verifySnapshotName(snapshotName, snapshotRoot);
6623          dir.writeLock();
6624          try {
6625            snapshotPath = snapshotManager.createSnapshot(snapshotRoot, snapshotName);
6626          } finally {
6627            dir.writeUnlock();
6628          }
6629          getEditLog().logCreateSnapshot(snapshotRoot, snapshotName,
6630              cacheEntry != null);
6631        } finally {
6632          writeUnlock();
6633          RetryCache.setState(cacheEntry, snapshotPath != null, snapshotPath);
6634        }
6635        getEditLog().logSync();
6636        
6637        if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6638          logAuditEvent(true, "createSnapshot", snapshotRoot, snapshotPath, null);
6639        }
6640        return snapshotPath;
6641      }
6642      
6643      /**
6644       * Rename a snapshot
6645       * @param path The directory path where the snapshot was taken
6646       * @param snapshotOldName Old snapshot name
6647       * @param snapshotNewName New snapshot name
6648       * @throws SafeModeException
6649       * @throws IOException 
6650       */
6651      void renameSnapshot(String path, String snapshotOldName,
6652          String snapshotNewName) throws SafeModeException, IOException {
6653        checkOperation(OperationCategory.WRITE);
6654        final FSPermissionChecker pc = getPermissionChecker();
6655        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
6656        if (cacheEntry != null && cacheEntry.isSuccess()) {
6657          return; // Return previous response
6658        }
6659        writeLock();
6660        boolean success = false;
6661        try {
6662          checkOperation(OperationCategory.WRITE);
6663          checkNameNodeSafeMode("Cannot rename snapshot for " + path);
6664          if (isPermissionEnabled) {
6665            checkOwner(pc, path);
6666          }
6667          dir.verifySnapshotName(snapshotNewName, path);
6668          
6669          snapshotManager.renameSnapshot(path, snapshotOldName, snapshotNewName);
6670          getEditLog().logRenameSnapshot(path, snapshotOldName, snapshotNewName,
6671              cacheEntry != null);
6672          success = true;
6673        } finally {
6674          writeUnlock();
6675          RetryCache.setState(cacheEntry, success);
6676        }
6677        getEditLog().logSync();
6678        
6679        if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6680          String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
6681          String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
6682          logAuditEvent(true, "renameSnapshot", oldSnapshotRoot, newSnapshotRoot, null);
6683        }
6684      }
6685      
6686      /**
6687       * Get the list of snapshottable directories that are owned 
6688       * by the current user. Return all the snapshottable directories if the 
6689       * current user is a super user.
6690       * @return The list of all the current snapshottable directories
6691       * @throws IOException
6692       */
6693      public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
6694          throws IOException {
6695        SnapshottableDirectoryStatus[] status = null;
6696        final FSPermissionChecker checker = getPermissionChecker();
6697        readLock();
6698        try {
6699          checkOperation(OperationCategory.READ);
6700          final String user = checker.isSuperUser()? null : checker.getUser();
6701          status = snapshotManager.getSnapshottableDirListing(user);
6702        } finally {
6703          readUnlock();
6704        }
6705        if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6706          logAuditEvent(true, "listSnapshottableDirectory", null, null, null);
6707        }
6708        return status;
6709      }
6710      
6711      /**
6712       * Get the difference between two snapshots (or between a snapshot and the
6713       * current status) of a snapshottable directory.
6714       * 
6715       * @param path The full path of the snapshottable directory.
6716       * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
6717       *          or empty string indicates the current tree.
6718       * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
6719       *          empty string indicates the current tree.
6720       * @return A report about the difference between {@code fromSnapshot} and 
6721       *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
6722       *         directories belonging to the snapshottable directories are listed 
6723       *         and labeled as M/-/+/R respectively. 
6724       * @throws IOException
6725       */
6726      SnapshotDiffReport getSnapshotDiffReport(String path,
6727          String fromSnapshot, String toSnapshot) throws IOException {
6728        SnapshotDiffInfo diffs = null;
6729        final FSPermissionChecker pc = getPermissionChecker();
6730        readLock();
6731        try {
6732          checkOperation(OperationCategory.READ);
6733          if (isPermissionEnabled) {
6734            checkSubtreeReadPermission(pc, path, fromSnapshot);
6735            checkSubtreeReadPermission(pc, path, toSnapshot);
6736          }
6737          diffs = snapshotManager.diff(path, fromSnapshot, toSnapshot);
6738        } finally {
6739          readUnlock();
6740        }
6741        
6742        if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6743          logAuditEvent(true, "computeSnapshotDiff", null, null, null);
6744        }
6745        return diffs != null ? diffs.generateReport() : new SnapshotDiffReport(
6746            path, fromSnapshot, toSnapshot,
6747            Collections.<DiffReportEntry> emptyList());
6748      }
6749      
6750      private void checkSubtreeReadPermission(final FSPermissionChecker pc,
6751          final String snapshottablePath, final String snapshot)
6752              throws AccessControlException, UnresolvedLinkException {
6753        final String fromPath = snapshot == null?
6754            snapshottablePath: Snapshot.getSnapshotPath(snapshottablePath, snapshot);
6755        checkPermission(pc, fromPath, false, null, null, FsAction.READ, FsAction.READ);
6756      }
6757      
6758      /**
6759       * Delete a snapshot of a snapshottable directory
6760       * @param snapshotRoot The snapshottable directory
6761       * @param snapshotName The name of the to-be-deleted snapshot
6762       * @throws SafeModeException
6763       * @throws IOException
6764       */
6765      void deleteSnapshot(String snapshotRoot, String snapshotName)
6766          throws SafeModeException, IOException {
6767        checkOperation(OperationCategory.WRITE);
6768        final FSPermissionChecker pc = getPermissionChecker();
6769        
6770        CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
6771        if (cacheEntry != null && cacheEntry.isSuccess()) {
6772          return; // Return previous response
6773        }
6774        boolean success = false;
6775        BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
6776        writeLock();
6777        try {
6778          checkOperation(OperationCategory.WRITE);
6779          checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
6780          if (isPermissionEnabled) {
6781            checkOwner(pc, snapshotRoot);
6782          }
6783    
6784          List<INode> removedINodes = new ArrayList<INode>();
6785          dir.writeLock();
6786          try {
6787            snapshotManager.deleteSnapshot(snapshotRoot, snapshotName,
6788                collectedBlocks, removedINodes);
6789            dir.removeFromInodeMap(removedINodes);
6790          } finally {
6791            dir.writeUnlock();
6792          }
6793          removedINodes.clear();
6794          getEditLog().logDeleteSnapshot(snapshotRoot, snapshotName,
6795              cacheEntry != null);
6796          success = true;
6797        } finally {
6798          writeUnlock();
6799          RetryCache.setState(cacheEntry, success);
6800        }
6801        getEditLog().logSync();
6802    
6803        removeBlocks(collectedBlocks);
6804        collectedBlocks.clear();
6805    
6806        if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6807          String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
6808          logAuditEvent(true, "deleteSnapshot", rootPath, null, null);
6809        }
6810      }
6811    
6812      /**
6813       * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
6814       * @param toRemove the list of INodeDirectorySnapshottable to be removed
6815       */
6816      void removeSnapshottableDirs(List<INodeDirectorySnapshottable> toRemove) {
6817        if (snapshotManager != null) {
6818          snapshotManager.removeSnapshottable(toRemove);
6819        }
6820      }
6821    
6822      /**
6823       * Default AuditLogger implementation; used when no access logger is
6824       * defined in the config file. It can also be explicitly listed in the
6825       * config file.
6826       */
6827      private static class DefaultAuditLogger extends HdfsAuditLogger {
6828    
6829        private boolean logTokenTrackingId;
6830    
6831        @Override
6832        public void initialize(Configuration conf) {
6833          logTokenTrackingId = conf.getBoolean(
6834              DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6835              DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
6836        }
6837    
6838        @Override
6839        public void logAuditEvent(boolean succeeded, String userName,
6840            InetAddress addr, String cmd, String src, String dst,
6841            FileStatus status, UserGroupInformation ugi,
6842            DelegationTokenSecretManager dtSecretManager) {
6843          if (auditLog.isInfoEnabled()) {
6844            final StringBuilder sb = auditBuffer.get();
6845            sb.setLength(0);
6846            sb.append("allowed=").append(succeeded).append("\t");
6847            sb.append("ugi=").append(userName).append("\t");
6848            sb.append("ip=").append(addr).append("\t");
6849            sb.append("cmd=").append(cmd).append("\t");
6850            sb.append("src=").append(src).append("\t");
6851            sb.append("dst=").append(dst).append("\t");
6852            if (null == status) {
6853              sb.append("perm=null");
6854            } else {
6855              sb.append("perm=");
6856              sb.append(status.getOwner()).append(":");
6857              sb.append(status.getGroup()).append(":");
6858              sb.append(status.getPermission());
6859            }
6860            if (logTokenTrackingId) {
6861              sb.append("\t").append("trackingId=");
6862              String trackingId = null;
6863              if (ugi != null && dtSecretManager != null
6864                  && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
6865                for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
6866                  if (tid instanceof DelegationTokenIdentifier) {
6867                    DelegationTokenIdentifier dtid =
6868                        (DelegationTokenIdentifier)tid;
6869                    trackingId = dtSecretManager.getTokenTrackingId(dtid);
6870                    break;
6871                  }
6872                }
6873              }
6874              sb.append(trackingId);
6875            }
6876            auditLog.info(sb);
6877          }
6878        }
6879    
6880      }
6881    
6882    }