001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.namenode;
019
020import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
021import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
022import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
023import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
024import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
025import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
026import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
027import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
028import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
029import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
030import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
031import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
032import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
033import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
034import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
035import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
036import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
037import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
038import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
039import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT;
040import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY;
041import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
042import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
043import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
044import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
045import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
046import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
047import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
048import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
049import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
050import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
051import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
052import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
053import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
054import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
055import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
056import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
057import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
058import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
059import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
060import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
061import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
062import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
063import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
064import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
065import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
066import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
067import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
068import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
069import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
070import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
071import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
072import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
073import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
074import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
075import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
076import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
077import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
078import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
079import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
080import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
081import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
082import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
083import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
084import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
085import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
086import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
087import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
088import static org.apache.hadoop.util.Time.now;
089
090import java.io.BufferedWriter;
091import java.io.ByteArrayInputStream;
092import java.io.DataInput;
093import java.io.DataInputStream;
094import java.io.File;
095import java.io.FileNotFoundException;
096import java.io.FileOutputStream;
097import java.io.IOException;
098import java.io.OutputStreamWriter;
099import java.io.PrintWriter;
100import java.io.StringWriter;
101import java.lang.management.ManagementFactory;
102import java.net.InetAddress;
103import java.net.URI;
104import java.util.ArrayList;
105import java.util.Arrays;
106import java.util.Collection;
107import java.util.Collections;
108import java.util.Date;
109import java.util.EnumSet;
110import java.util.HashMap;
111import java.util.HashSet;
112import java.util.Iterator;
113import java.util.LinkedHashSet;
114import java.util.List;
115import java.util.Map;
116import java.util.Set;
117import java.util.concurrent.TimeUnit;
118import java.util.concurrent.locks.ReentrantLock;
119import java.util.concurrent.locks.ReentrantReadWriteLock;
120
121import javax.management.NotCompliantMBeanException;
122import javax.management.ObjectName;
123import javax.management.StandardMBean;
124
125import org.apache.commons.logging.Log;
126import org.apache.commons.logging.LogFactory;
127import org.apache.commons.logging.impl.Log4JLogger;
128import org.apache.hadoop.HadoopIllegalArgumentException;
129import org.apache.hadoop.classification.InterfaceAudience;
130import org.apache.hadoop.conf.Configuration;
131import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedListEntries;
132import org.apache.hadoop.fs.CacheFlag;
133import org.apache.hadoop.fs.ContentSummary;
134import org.apache.hadoop.fs.CreateFlag;
135import org.apache.hadoop.fs.DirectoryListingStartAfterNotFoundException;
136import org.apache.hadoop.fs.FileAlreadyExistsException;
137import org.apache.hadoop.fs.FileStatus;
138import org.apache.hadoop.fs.FileSystem;
139import org.apache.hadoop.fs.FsServerDefaults;
140import org.apache.hadoop.fs.InvalidPathException;
141import org.apache.hadoop.fs.Options;
142import org.apache.hadoop.fs.Options.Rename;
143import org.apache.hadoop.fs.ParentNotDirectoryException;
144import org.apache.hadoop.fs.Path;
145import org.apache.hadoop.fs.UnresolvedLinkException;
146import org.apache.hadoop.fs.permission.AclEntry;
147import org.apache.hadoop.fs.permission.AclStatus;
148import org.apache.hadoop.fs.permission.FsAction;
149import org.apache.hadoop.fs.permission.FsPermission;
150import org.apache.hadoop.fs.permission.PermissionStatus;
151import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
152import org.apache.hadoop.ha.ServiceFailedException;
153import org.apache.hadoop.hdfs.DFSConfigKeys;
154import org.apache.hadoop.hdfs.DFSUtil;
155import org.apache.hadoop.hdfs.HAUtil;
156import org.apache.hadoop.hdfs.HdfsConfiguration;
157import org.apache.hadoop.hdfs.StorageType;
158import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
159import org.apache.hadoop.hdfs.protocol.Block;
160import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
161import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
162import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
163import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
164import org.apache.hadoop.hdfs.protocol.ClientProtocol;
165import org.apache.hadoop.hdfs.protocol.DatanodeID;
166import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
167import org.apache.hadoop.hdfs.protocol.DirectoryListing;
168import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
169import org.apache.hadoop.hdfs.protocol.HdfsConstants;
170import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
171import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
172import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
173import org.apache.hadoop.hdfs.protocol.LocatedBlock;
174import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
175import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
176import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
177import org.apache.hadoop.hdfs.protocol.RollingUpgradeException;
178import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
179import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
180import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffReportEntry;
181import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
182import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
183import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
184import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
185import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
186import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
187import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager.SecretManagerState;
188import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
189import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
190import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
191import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
192import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
193import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
194import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStatistics;
195import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
196import org.apache.hadoop.hdfs.server.blockmanagement.OutOfV1GenerationStampsException;
197import org.apache.hadoop.hdfs.server.common.GenerationStamp;
198import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
199import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
200import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.RollingUpgradeStartupOption;
201import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
202import org.apache.hadoop.hdfs.server.common.Storage;
203import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
204import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
205import org.apache.hadoop.hdfs.server.common.Util;
206import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
207import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
208import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
209import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
210import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
211import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
212import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
213import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
214import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
215import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
216import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
217import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
218import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable.SnapshotDiffInfo;
219import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
220import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
221import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
222import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
223import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
224import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
225import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
226import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
227import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
228import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
229import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
230import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
231import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
232import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
233import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
234import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
235import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
236import org.apache.hadoop.hdfs.server.protocol.StorageReport;
237import org.apache.hadoop.hdfs.util.ChunkedArrayList;
238import org.apache.hadoop.io.IOUtils;
239import org.apache.hadoop.io.Text;
240import org.apache.hadoop.ipc.RetriableException;
241import org.apache.hadoop.ipc.RetryCache;
242import org.apache.hadoop.ipc.RetryCache.CacheEntry;
243import org.apache.hadoop.ipc.RetryCache.CacheEntryWithPayload;
244import org.apache.hadoop.ipc.Server;
245import org.apache.hadoop.ipc.StandbyException;
246import org.apache.hadoop.metrics2.annotation.Metric;
247import org.apache.hadoop.metrics2.annotation.Metrics;
248import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
249import org.apache.hadoop.metrics2.util.MBeans;
250import org.apache.hadoop.net.NetworkTopology;
251import org.apache.hadoop.net.Node;
252import org.apache.hadoop.security.AccessControlException;
253import org.apache.hadoop.security.UserGroupInformation;
254import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
255import org.apache.hadoop.security.token.SecretManager.InvalidToken;
256import org.apache.hadoop.security.token.Token;
257import org.apache.hadoop.security.token.TokenIdentifier;
258import org.apache.hadoop.security.token.delegation.DelegationKey;
259import org.apache.hadoop.util.Daemon;
260import org.apache.hadoop.util.DataChecksum;
261import org.apache.hadoop.util.StringUtils;
262import org.apache.hadoop.util.Time;
263import org.apache.hadoop.util.VersionInfo;
264import org.apache.log4j.Appender;
265import org.apache.log4j.AsyncAppender;
266import org.apache.log4j.Logger;
267import org.mortbay.util.ajax.JSON;
268
269import com.google.common.annotations.VisibleForTesting;
270import com.google.common.base.Charsets;
271import com.google.common.base.Preconditions;
272import com.google.common.collect.ImmutableMap;
273import com.google.common.collect.Lists;
274
275/***************************************************
276 * FSNamesystem does the actual bookkeeping work for the
277 * DataNode.
278 *
279 * It tracks several important tables.
280 *
281 * 1)  valid fsname --> blocklist  (kept on disk, logged)
282 * 2)  Set of all valid blocks (inverted #1)
283 * 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
284 * 4)  machine --> blocklist (inverted #2)
285 * 5)  LRU cache of updated-heartbeat machines
286 ***************************************************/
287@InterfaceAudience.Private
288@Metrics(context="dfs")
289public class FSNamesystem implements Namesystem, FSClusterStats,
290    FSNamesystemMBean, NameNodeMXBean {
291  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);
292
  /**
   * Reusable per-thread {@link StringBuilder}, avoiding a fresh allocation per
   * audit record. NOTE(review): the usage site is not in this view —
   * presumably consumed by the default audit logger when formatting a log
   * line; confirm before relying on this description.
   */
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
  };
300
301  @VisibleForTesting
302  public boolean isAuditEnabled() {
303    return !isDefaultAuditLogger || auditLog.isInfoEnabled();
304  }
305
306  private HdfsFileStatus getAuditFileInfo(String path, boolean resolveSymlink)
307      throws IOException {
308    return (isAuditEnabled() && isExternalInvocation())
309        ? dir.getFileInfo(path, resolveSymlink) : null;
310  }
311  
  /**
   * Log an audit event that has no destination path or file status; delegates
   * to {@link #logAuditEvent(boolean, String, String, String, HdfsFileStatus)}
   * with null dst and stat.
   */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
316  
317  private void logAuditEvent(boolean succeeded, String cmd, String src,
318      String dst, HdfsFileStatus stat) throws IOException {
319    if (isAuditEnabled() && isExternalInvocation()) {
320      logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
321                    cmd, src, dst, stat);
322    }
323  }
324
325  private void logAuditEvent(boolean succeeded,
326      UserGroupInformation ugi, InetAddress addr, String cmd, String src,
327      String dst, HdfsFileStatus stat) {
328    FileStatus status = null;
329    if (stat != null) {
330      Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
331      Path path = dst != null ? new Path(dst) : new Path(src);
332      status = new FileStatus(stat.getLen(), stat.isDir(),
333          stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
334          stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
335          stat.getGroup(), symlink, path);
336    }
337    for (AuditLogger logger : auditLoggers) {
338      if (logger instanceof HdfsAuditLogger) {
339        HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
340        hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
341            status, ugi, dtSecretManager);
342      } else {
343        logger.logAuditEvent(succeeded, ugi.toString(), addr,
344            cmd, src, dst, status);
345      }
346    }
347  }
348
349  /**
350   * Logger for audit events, noting successful FSNamesystem operations. Emits
351   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
352   * <code>key=value</code> pairs to be written for the following properties:
353   * <code>
354   * ugi=&lt;ugi in RPC&gt;
355   * ip=&lt;remote IP&gt;
356   * cmd=&lt;command&gt;
357   * src=&lt;src path&gt;
358   * dst=&lt;dst path (optional)&gt;
359   * perm=&lt;permissions (optional)&gt;
360   * </code>
361   */
362  public static final Log auditLog = LogFactory.getLog(
363      FSNamesystem.class.getName() + ".audit");
364
  // Cap on corrupt file-block entries returned — TODO confirm usage site,
  // which is outside this view.
  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
  // Batch size for incremental block deletion; non-final, presumably so tests
  // can tune it — confirm.
  static int BLOCK_DELETION_INCREMENT = 1000;
  // From dfs.permissions.enabled (see constructor).
  private final boolean isPermissionEnabled;
  // The user the NameNode process runs as (UserGroupInformation.getCurrentUser()).
  private final UserGroupInformation fsOwner;
  private final String fsOwnerShortUserName;
  // From dfs.permissions.superusergroup (see constructor).
  private final String supergroup;
  private final boolean standbyShouldCheckpoint;
  
  // Scan interval is not configurable: fixed at one hour (expressed in ms).
  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
  final DelegationTokenSecretManager dtSecretManager;
  private final boolean alwaysUseDelegationTokensForTests;
378
379  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
380    new Step(StepType.AWAITING_REPORTED_BLOCKS);
381
382  // Tracks whether the default audit logger is the only configured audit
383  // logger; this allows isAuditEnabled() to return false in case the
384  // underlying logger is disabled, and avoid some unnecessary work.
385  private final boolean isDefaultAuditLogger;
386  private final List<AuditLogger> auditLoggers;
387
388  /** The namespace tree. */
389  FSDirectory dir;
390  private final BlockManager blockManager;
391  private final SnapshotManager snapshotManager;
392  private final CacheManager cacheManager;
393  private final DatanodeStatistics datanodeStatistics;
394
395  private RollingUpgradeInfo rollingUpgradeInfo = null;
396  /**
397   * A flag that indicates whether the checkpointer should checkpoint a rollback
398   * fsimage. The edit log tailer sets this flag. The checkpoint will create a
399   * rollback fsimage if the flag is true, and then change the flag to false.
400   */
401  private volatile boolean needRollbackFsImage;
402
403  // Block pool ID used by this namenode
404  private String blockPoolId;
405
406  final LeaseManager leaseManager = new LeaseManager(this); 
407
408  volatile Daemon smmthread = null;  // SafeModeMonitor thread
409  
410  Daemon nnrmthread = null; // NamenodeResourceMonitor thread
411
412  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread
413  /**
414   * When an active namenode will roll its own edit log, in # edits
415   */
416  private final long editLogRollerThreshold;
417  /**
418   * Check interval of an active namenode's edit log roller thread 
419   */
420  private final int editLogRollerInterval;
421
422  private volatile boolean hasResourcesAvailable = false;
423  private volatile boolean fsRunning = true;
424  
425  /** The start time of the namesystem. */
426  private final long startTime = now();
427
428  /** The interval of namenode checking for the disk space availability */
429  private final long resourceRecheckInterval;
430
431  // The actual resource checker instance.
432  NameNodeResourceChecker nnResourceChecker;
433
434  private final FsServerDefaults serverDefaults;
435  private final boolean supportAppends;
436  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;
437
438  private volatile SafeModeInfo safeMode;  // safe mode information
439
440  private final long maxFsObjects;          // maximum number of fs objects
441
442  private final long minBlockSize;         // minimum block size
443  private final long maxBlocksPerFile;     // maximum # of blocks per file
444
445  /**
446   * The global generation stamp for legacy blocks with randomly
447   * generated block IDs.
448   */
449  private final GenerationStamp generationStampV1 = new GenerationStamp();
450
451  /**
452   * The global generation stamp for this file system.
453   */
454  private final GenerationStamp generationStampV2 = new GenerationStamp();
455
456  /**
457   * The value of the generation stamp when the first switch to sequential
458   * block IDs was made. Blocks with generation stamps below this value
459   * have randomly allocated block IDs. Blocks with generation stamps above
460   * this value had sequentially allocated block IDs. Read from the fsImage
461   * (or initialized as an offset from the V1 (legacy) generation stamp on
462   * upgrade).
463   */
464  private long generationStampV1Limit =
465      GenerationStamp.GRANDFATHER_GENERATION_STAMP;
466
467  /**
468   * The global block ID space for this file system.
469   */
470  @VisibleForTesting
471  private final SequentialBlockIdGenerator blockIdGenerator;
472
473  // precision of access times.
474  private final long accessTimePrecision;
475
476  /** Lock to protect FSNamesystem. */
477  private final FSNamesystemLock fsLock;
478
479  /**
480   * Used when this NN is in standby state to read from the shared edit log.
481   */
482  private EditLogTailer editLogTailer = null;
483
484  /**
485   * Used when this NN is in standby state to perform checkpoints.
486   */
487  private StandbyCheckpointer standbyCheckpointer;
488
489  /**
490   * Reference to the NN's HAContext object. This is only set once
491   * {@link #startCommonServices(Configuration, HAContext)} is called. 
492   */
493  private HAContext haContext;
494
495  private final boolean haEnabled;
496
497  /** flag indicating whether replication queues have been initialized */
498  boolean initializedReplQueues = false;
499
500  /**
501   * Whether the namenode is in the middle of starting the active service
502   */
503  private volatile boolean startingActiveService = false;
504    
505  private INodeId inodeId;
506  
507  private final RetryCache retryCache;
508
509  private final AclConfigFlag aclConfigFlag;
510
511  /**
512   * Set the last allocated inode id when fsimage or editlog is loaded. 
513   */
514  public void resetLastInodeId(long newValue) throws IOException {
515    try {
516      inodeId.skipTo(newValue);
517    } catch(IllegalStateException ise) {
518      throw new IOException(ise);
519    }
520  }
521
  /**
   * Should only be used for tests to reset to any value, bypassing the
   * validation performed by {@link #resetLastInodeId(long)}.
   */
  void resetLastInodeIdWithoutChecking(long newValue) {
    inodeId.setCurrentValue(newValue);
  }
526  
  /** @return the last allocated inode ID; does not advance the counter. */
  public long getLastInodeId() {
    return inodeId.getCurrentValue();
  }
531
  /** Allocate a new inode ID by advancing the sequential inode-id counter. */
  public long allocateNewInodeId() {
    return inodeId.nextValue();
  }
536  
537  /**
538   * Clear all loaded data
539   */
540  void clear() {
541    dir.reset();
542    dtSecretManager.reset();
543    generationStampV1.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
544    generationStampV2.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
545    blockIdGenerator.setCurrentValue(
546        SequentialBlockIdGenerator.LAST_RESERVED_BLOCK_ID);
547    generationStampV1Limit = GenerationStamp.GRANDFATHER_GENERATION_STAMP;
548    leaseManager.removeAllLeases();
549    inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID);
550    snapshotManager.clearSnapshottableDirs();
551    cacheManager.clear();
552  }
553
  /** @return the lease manager for this namesystem; exposed for tests. */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
558  
  /** @return whether HA is enabled for this namesystem. */
  boolean isHaEnabled() {
    return haEnabled;
  }
562  
563  /**
564   * Check the supplied configuration for correctness.
565   * @param conf Supplies the configuration to validate.
566   * @throws IOException if the configuration could not be queried.
567   * @throws IllegalArgumentException if the configuration is invalid.
568   */
569  private static void checkConfiguration(Configuration conf)
570      throws IOException {
571
572    final Collection<URI> namespaceDirs =
573        FSNamesystem.getNamespaceDirs(conf);
574    final Collection<URI> editsDirs =
575        FSNamesystem.getNamespaceEditsDirs(conf);
576    final Collection<URI> requiredEditsDirs =
577        FSNamesystem.getRequiredNamespaceEditsDirs(conf);
578    final Collection<URI> sharedEditsDirs =
579        FSNamesystem.getSharedEditsDirs(conf);
580
581    for (URI u : requiredEditsDirs) {
582      if (u.toString().compareTo(
583              DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
584        continue;
585      }
586
587      // Each required directory must also be in editsDirs or in
588      // sharedEditsDirs.
589      if (!editsDirs.contains(u) &&
590          !sharedEditsDirs.contains(u)) {
591        throw new IllegalArgumentException(
592            "Required edits directory " + u.toString() + " not present in " +
593            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
594            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
595            editsDirs.toString() + "; " +
596            DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
597            requiredEditsDirs.toString() + ". " +
598            DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
599            sharedEditsDirs.toString() + ".");
600      }
601    }
602
603    if (namespaceDirs.size() == 1) {
604      LOG.warn("Only one image storage directory ("
605          + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of dataloss"
606          + " due to lack of redundant storage directories!");
607    }
608    if (editsDirs.size() == 1) {
609      LOG.warn("Only one namespace edits storage directory ("
610          + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of dataloss"
611          + " due to lack of redundant storage directories!");
612    }
613  }
614
615  /**
616   * Instantiates an FSNamesystem loaded from the image and edits
617   * directories specified in the passed Configuration.
618   *
619   * @param conf the Configuration which specifies the storage directories
620   *             from which to load
621   * @return an FSNamesystem which contains the loaded namespace
622   * @throws IOException if loading fails
623   */
624  static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
625
626    checkConfiguration(conf);
627    FSImage fsImage = new FSImage(conf,
628        FSNamesystem.getNamespaceDirs(conf),
629        FSNamesystem.getNamespaceEditsDirs(conf));
630    FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
631    StartupOption startOpt = NameNode.getStartupOption(conf);
632    if (startOpt == StartupOption.RECOVER) {
633      namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
634    }
635
636    long loadStart = now();
637    try {
638      namesystem.loadFSImage(startOpt);
639    } catch (IOException ioe) {
640      LOG.warn("Encountered exception loading fsimage", ioe);
641      fsImage.close();
642      throw ioe;
643    }
644    long timeTakenToLoadFSImage = now() - loadStart;
645    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
646    NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
647    if (nnMetrics != null) {
648      nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
649    }
650    return namesystem;
651  }
652  
  /**
   * Convenience constructor: equivalent to
   * {@code FSNamesystem(conf, fsImage, false)}, i.e. with the retry cache
   * set up (not ignored).
   */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
656  
657  /**
658   * Create an FSNamesystem associated with the specified image.
659   * 
660   * Note that this does not load any data off of disk -- if you would
661   * like that behavior, use {@link #loadFromDisk(Configuration)}
662   *
663   * @param conf configuration
664   * @param fsImage The FSImage to associate with
665   * @param ignoreRetryCache Whether or not should ignore the retry cache setup
666   *                         step. For Secondary NN this should be set to true.
667   * @throws IOException on bad configuration
668   */
669  FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
670      throws IOException {
671    if (conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY,
672                        DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT)) {
673      LOG.info("Enabling async auditlog");
674      enableAsyncAuditLog();
675    }
676    boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true);
677    LOG.info("fsLock is fair:" + fair);
678    fsLock = new FSNamesystemLock(fair);
679    try {
680      resourceRecheckInterval = conf.getLong(
681          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
682          DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
683
684      this.blockManager = new BlockManager(this, this, conf);
685      this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
686      this.blockIdGenerator = new SequentialBlockIdGenerator(this.blockManager);
687
688      this.fsOwner = UserGroupInformation.getCurrentUser();
689      this.fsOwnerShortUserName = fsOwner.getShortUserName();
690      this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, 
691                                 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
692      this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
693                                                 DFS_PERMISSIONS_ENABLED_DEFAULT);
694      LOG.info("fsOwner             = " + fsOwner);
695      LOG.info("supergroup          = " + supergroup);
696      LOG.info("isPermissionEnabled = " + isPermissionEnabled);
697
698      // block allocation has to be persisted in HA using a shared edits directory
699      // so that the standby has up-to-date namespace information
700      String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
701      this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);  
702      
703      // Sanity check the HA-related config.
704      if (nameserviceId != null) {
705        LOG.info("Determined nameservice ID: " + nameserviceId);
706      }
707      LOG.info("HA Enabled: " + haEnabled);
708      if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
709        LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
710        throw new IOException("Invalid configuration: a shared edits dir " +
711            "must not be specified if HA is not enabled.");
712      }
713
714      // Get the checksum type from config
715      String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
716      DataChecksum.Type checksumType;
717      try {
718         checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
719      } catch (IllegalArgumentException iae) {
720         throw new IOException("Invalid checksum type in "
721            + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
722      }
723
724      this.serverDefaults = new FsServerDefaults(
725          conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
726          conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
727          conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
728          (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
729          conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
730          conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
731          conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
732          checksumType);
733      
734      this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY, 
735                                       DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
736
737      this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
738          DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
739      this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
740          DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
741      this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
742          DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
743      this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
744      LOG.info("Append Enabled: " + supportAppends);
745
746      this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
747      
748      this.standbyShouldCheckpoint = conf.getBoolean(
749          DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
750      // # edit autoroll threshold is a multiple of the checkpoint threshold 
751      this.editLogRollerThreshold = (long)
752          (conf.getFloat(
753              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
754              DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
755          conf.getLong(
756              DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
757              DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
758      this.editLogRollerInterval = conf.getInt(
759          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
760          DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
761      this.inodeId = new INodeId();
762      
763      // For testing purposes, allow the DT secret manager to be started regardless
764      // of whether security is enabled.
765      alwaysUseDelegationTokensForTests = conf.getBoolean(
766          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
767          DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
768
769      this.dtSecretManager = createDelegationTokenSecretManager(conf);
770      this.dir = new FSDirectory(fsImage, this, conf);
771      this.snapshotManager = new SnapshotManager(dir);
772      this.cacheManager = new CacheManager(this, conf, blockManager);
773      this.safeMode = new SafeModeInfo(conf);
774      this.auditLoggers = initAuditLoggers(conf);
775      this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
776        auditLoggers.get(0) instanceof DefaultAuditLogger;
777      this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
778      this.aclConfigFlag = new AclConfigFlag(conf);
779    } catch(IOException e) {
780      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
781      close();
782      throw e;
783    } catch (RuntimeException re) {
784      LOG.error(getClass().getSimpleName() + " initialization failed.", re);
785      close();
786      throw re;
787    }
788  }
789  
  /** @return the retry cache, or null when the cache is disabled/ignored. */
  @VisibleForTesting
  public RetryCache getRetryCache() {
    return retryCache;
  }
794
795  void lockRetryCache() {
796    if (retryCache != null) {
797      retryCache.lock();
798    }
799  }
800
801  void unlockRetryCache() {
802    if (retryCache != null) {
803      retryCache.unlock();
804    }
805  }
806
  /** @return true if the retry cache is enabled on this NameNode. */
  boolean hasRetryCache() {
    return retryCache != null;
  }
811  
812  void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
813    if (retryCache != null) {
814      retryCache.addCacheEntryWithPayload(clientId, callId, payload);
815    }
816  }
817  
818  void addCacheEntry(byte[] clientId, int callId) {
819    if (retryCache != null) {
820      retryCache.addCacheEntry(clientId, callId);
821    }
822  }
823
824  @VisibleForTesting
825  static RetryCache initRetryCache(Configuration conf) {
826    boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
827        DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
828    LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
829    if (enable) {
830      float heapPercent = conf.getFloat(
831          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
832          DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
833      long entryExpiryMillis = conf.getLong(
834          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
835          DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
836      LOG.info("Retry cache will use " + heapPercent
837          + " of total heap and retry cache entry expiry time is "
838          + entryExpiryMillis + " millis");
839      long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
840      return new RetryCache("NameNodeRetryCache", heapPercent,
841          entryExpiryNanos);
842    }
843    return null;
844  }
845
846  private List<AuditLogger> initAuditLoggers(Configuration conf) {
847    // Initialize the custom access loggers if configured.
848    Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
849    List<AuditLogger> auditLoggers = Lists.newArrayList();
850    if (alClasses != null && !alClasses.isEmpty()) {
851      for (String className : alClasses) {
852        try {
853          AuditLogger logger;
854          if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
855            logger = new DefaultAuditLogger();
856          } else {
857            logger = (AuditLogger) Class.forName(className).newInstance();
858          }
859          logger.initialize(conf);
860          auditLoggers.add(logger);
861        } catch (RuntimeException re) {
862          throw re;
863        } catch (Exception e) {
864          throw new RuntimeException(e);
865        }
866      }
867    }
868
869    // Make sure there is at least one logger installed.
870    if (auditLoggers.isEmpty()) {
871      auditLoggers.add(new DefaultAuditLogger());
872    }
873    return Collections.unmodifiableList(auditLoggers);
874  }
875
  /**
   * Load the namespace from the associated FSImage and edit logs,
   * formatting first if the FORMAT startup option was given, and save a
   * fresh image when the loaded one is stale (non-HA, not rolling-upgrade).
   *
   * @param startOpt how the NameNode was started (FORMAT/REGULAR/etc.)
   * @throws IOException if the image or edits cannot be read or written
   */
  private void loadFSImage(StartupOption startOpt) throws IOException {
    final FSImage fsImage = getFSImage();

    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) {
      
      fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

      // After formatting, continue the startup as a regular one.
      startOpt = StartupOption.REGULAR;
    }
    boolean success = false;
    writeLock();
    try {
      // We shouldn't be calling saveNamespace if we've come up in standby state.
      MetaRecoveryContext recovery = startOpt.createRecoveryContext();
      final boolean staleImage
          = fsImage.recoverTransitionRead(startOpt, this, recovery);
      if (RollingUpgradeStartupOption.ROLLBACK.matches(startOpt)) {
        rollingUpgradeInfo = null;
      }
      final boolean needToSave = staleImage && !haEnabled && !isRollingUpgrade(); 
      LOG.info("Need to save fs image? " + needToSave
          + " (staleImage=" + staleImage + ", haEnabled=" + haEnabled
          + ", isRollingUpgrade=" + isRollingUpgrade() + ")");
      if (needToSave) {
        fsImage.saveNamespace(this);
      } else {
        // No need to save, so mark the phase done.
        StartupProgress prog = NameNode.getStartupProgress();
        prog.beginPhase(Phase.SAVING_CHECKPOINT);
        prog.endPhase(Phase.SAVING_CHECKPOINT);
      }
      // This will start a new log segment and write to the seen_txid file, so
      // we shouldn't do it when coming up in standby state
      if (!haEnabled || (haEnabled && startOpt == StartupOption.UPGRADE)) {
        fsImage.openEditLogForWrite();
      }
      success = true;
    } finally {
      if (!success) {
        // Loading failed: close the image so storage locks/streams are freed.
        fsImage.close();
      }
      writeUnlock();
    }
    dir.imageLoadComplete();
  }
922
923  private void startSecretManager() {
924    if (dtSecretManager != null) {
925      try {
926        dtSecretManager.startThreads();
927      } catch (IOException e) {
928        // Inability to start secret manager
929        // can't be recovered from.
930        throw new RuntimeException(e);
931      }
932    }
933  }
934  
935  private void startSecretManagerIfNecessary() {
936    boolean shouldRun = shouldUseDelegationTokens() &&
937      !isInSafeMode() && getEditLog().isOpenForWrite();
938    boolean running = dtSecretManager.isRunning();
939    if (shouldRun && !running) {
940      startSecretManager();
941    }
942  }
943
944  private void stopSecretManager() {
945    if (dtSecretManager != null) {
946      dtSecretManager.stopThreads();
947    }
948  }
949  
950  /** 
951   * Start services common to both active and standby states
952   * @param haContext 
953   * @throws IOException
954   */
955  void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
956    this.registerMBean(); // register the MBean for the FSNamesystemState
957    writeLock();
958    this.haContext = haContext;
959    try {
960      nnResourceChecker = new NameNodeResourceChecker(conf);
961      checkAvailableResources();
962      assert safeMode != null && !isPopulatingReplQueues();
963      StartupProgress prog = NameNode.getStartupProgress();
964      prog.beginPhase(Phase.SAFEMODE);
965      prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
966        getCompleteBlocksTotal());
967      setBlockTotal();
968      blockManager.activate(conf);
969    } finally {
970      writeUnlock();
971    }
972    
973    registerMXBean();
974    DefaultMetricsSystem.instance().register(this);
975  }
976  
977  /** 
978   * Stop services common to both active and standby states
979   * @throws IOException
980   */
981  void stopCommonServices() {
982    writeLock();
983    try {
984      if (blockManager != null) blockManager.close();
985    } finally {
986      writeUnlock();
987    }
988    RetryCache.clear(retryCache);
989  }
990  
991  /**
992   * Start services required in active state
993   * @throws IOException
994   */
995  void startActiveServices() throws IOException {
996    startingActiveService = true;
997    LOG.info("Starting services required for active state");
998    writeLock();
999    try {
1000      FSEditLog editLog = dir.fsImage.getEditLog();
1001      
1002      if (!editLog.isOpenForWrite()) {
1003        // During startup, we're already open for write during initialization.
1004        editLog.initJournalsForWrite();
1005        // May need to recover
1006        editLog.recoverUnclosedStreams();
1007        
1008        LOG.info("Catching up to latest edits from old active before " +
1009            "taking over writer role in edits logs");
1010        editLogTailer.catchupDuringFailover();
1011        
1012        blockManager.setPostponeBlocksFromFuture(false);
1013        blockManager.getDatanodeManager().markAllDatanodesStale();
1014        blockManager.clearQueues();
1015        blockManager.processAllPendingDNMessages();
1016
1017        // Only need to re-process the queue, If not in SafeMode.
1018        if (!isInSafeMode()) {
1019          LOG.info("Reprocessing replication and invalidation queues");
1020          initializeReplQueues();
1021        }
1022
1023        if (LOG.isDebugEnabled()) {
1024          LOG.debug("NameNode metadata after re-processing " +
1025              "replication and invalidation queues during failover:\n" +
1026              metaSaveAsString());
1027        }
1028        
1029        long nextTxId = dir.fsImage.getLastAppliedTxId() + 1;
1030        LOG.info("Will take over writing edit logs at txnid " + 
1031            nextTxId);
1032        editLog.setNextTxId(nextTxId);
1033
1034        dir.fsImage.editLog.openForWrite();
1035      }
1036      
1037      if (haEnabled) {
1038        // Renew all of the leases before becoming active.
1039        // This is because, while we were in standby mode,
1040        // the leases weren't getting renewed on this NN.
1041        // Give them all a fresh start here.
1042        leaseManager.renewAllLeases();
1043      }
1044      leaseManager.startMonitor();
1045      startSecretManagerIfNecessary();
1046
1047      //ResourceMonitor required only at ActiveNN. See HDFS-2914
1048      this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
1049      nnrmthread.start();
1050
1051      nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
1052          editLogRollerThreshold, editLogRollerInterval));
1053      nnEditLogRoller.start();
1054
1055      cacheManager.startMonitorThread();
1056      blockManager.getDatanodeManager().setShouldSendCachingCommands(true);
1057    } finally {
1058      writeUnlock();
1059      startingActiveService = false;
1060    }
1061  }
1062
1063  /**
1064   * Initialize replication queues.
1065   */
1066  private void initializeReplQueues() {
1067    LOG.info("initializing replication queues");
1068    blockManager.processMisReplicatedBlocks();
1069    initializedReplQueues = true;
1070  }
1071
  /** @return true if an HA context exists and reports the ACTIVE state. */
  private boolean inActiveState() {
    return haContext != null &&
        haContext.getState().getServiceState() == HAServiceState.ACTIVE;
  }
1076
1077  /**
1078   * @return Whether the namenode is transitioning to active state and is in the
1079   *         middle of the {@link #startActiveServices()}
1080   */
1081  public boolean inTransitionToActive() {
1082    return haEnabled && inActiveState() && startingActiveService;
1083  }
1084
  /**
   * @return true if delegation tokens should be in use: either security is
   *         enabled, or the test-only always-use flag is set.
   */
  private boolean shouldUseDelegationTokens() {
    return UserGroupInformation.isSecurityEnabled() ||
      alwaysUseDelegationTokensForTests;
  }
1089
1090  /** 
1091   * Stop services required in active state
1092   * @throws InterruptedException
1093   */
1094  void stopActiveServices() {
1095    LOG.info("Stopping services started for active state");
1096    writeLock();
1097    try {
1098      stopSecretManager();
1099      if (leaseManager != null) {
1100        leaseManager.stopMonitor();
1101      }
1102      if (nnrmthread != null) {
1103        ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
1104        nnrmthread.interrupt();
1105      }
1106      if (nnEditLogRoller != null) {
1107        ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
1108        nnEditLogRoller.interrupt();
1109      }
1110      if (dir != null && dir.fsImage != null) {
1111        if (dir.fsImage.editLog != null) {
1112          dir.fsImage.editLog.close();
1113        }
1114        // Update the fsimage with the last txid that we wrote
1115        // so that the tailer starts from the right spot.
1116        dir.fsImage.updateLastAppliedTxIdFromWritten();
1117      }
1118      cacheManager.stopMonitorThread();
1119      cacheManager.clearDirectiveStats();
1120      blockManager.getDatanodeManager().clearPendingCachingCommands();
1121      blockManager.getDatanodeManager().setShouldSendCachingCommands(false);
1122      // Don't want to keep replication queues when not in Active.
1123      blockManager.clearQueues();
1124      initializedReplQueues = false;
1125    } finally {
1126      writeUnlock();
1127    }
1128  }
1129  
1130  /**
1131   * Start services required in standby state 
1132   * 
1133   * @throws IOException
1134   */
1135  void startStandbyServices(final Configuration conf) throws IOException {
1136    LOG.info("Starting services required for standby state");
1137    if (!dir.fsImage.editLog.isOpenForRead()) {
1138      // During startup, we're already open for read.
1139      dir.fsImage.editLog.initSharedJournalsForRead();
1140    }
1141    
1142    blockManager.setPostponeBlocksFromFuture(true);
1143
1144    editLogTailer = new EditLogTailer(this, conf);
1145    editLogTailer.start();
1146    if (standbyShouldCheckpoint) {
1147      standbyCheckpointer = new StandbyCheckpointer(conf, this);
1148      standbyCheckpointer.start();
1149    }
1150  }
1151
1152  /**
1153   * Called when the NN is in Standby state and the editlog tailer tails the
1154   * OP_ROLLING_UPGRADE_START.
1155   */
1156  void triggerRollbackCheckpoint() {
1157    setNeedRollbackFsImage(true);
1158    if (standbyCheckpointer != null) {
1159      standbyCheckpointer.triggerRollbackCheckpoint();
1160    }
1161  }
1162
1163  /**
1164   * Called while the NN is in Standby state, but just about to be
1165   * asked to enter Active state. This cancels any checkpoints
1166   * currently being taken.
1167   */
1168  void prepareToStopStandbyServices() throws ServiceFailedException {
1169    if (standbyCheckpointer != null) {
1170      standbyCheckpointer.cancelAndPreventCheckpoints(
1171          "About to leave standby state");
1172    }
1173  }
1174
  /**
   * Stop services required in standby state: the checkpointer, the edit log
   * tailer, and the read-only edit log.
   */
  void stopStandbyServices() throws IOException {
    LOG.info("Stopping services started for standby state");
    if (standbyCheckpointer != null) {
      standbyCheckpointer.stop();
    }
    if (editLogTailer != null) {
      editLogTailer.stop();
    }
    if (dir != null && dir.fsImage != null && dir.fsImage.editLog != null) {
      dir.fsImage.editLog.close();
    }
  }
1188  
  /**
   * Delegate the operation-category check (READ/WRITE/...) to the HA
   * context, which rejects operations not allowed in the current state.
   */
  @Override
  public void checkOperation(OperationCategory op) throws StandbyException {
    if (haContext != null) {
      // null in some unit tests
      haContext.checkOperation(op);
    }
  }
1196  
1197  /**
1198   * @throws RetriableException
1199   *           If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1200   *           NameNode is in active state
1201   * @throws SafeModeException
1202   *           Otherwise if NameNode is in SafeMode.
1203   */
1204  private void checkNameNodeSafeMode(String errorMsg)
1205      throws RetriableException, SafeModeException {
1206    if (isInSafeMode()) {
1207      SafeModeException se = new SafeModeException(errorMsg, safeMode);
1208      if (haEnabled && haContext != null
1209          && haContext.getState().getServiceState() == HAServiceState.ACTIVE
1210          && shouldRetrySafeMode(this.safeMode)) {
1211        throw new RetriableException(se);
1212      } else {
1213        throw se;
1214      }
1215    }
1216  }
1217  
1218  /**
1219   * We already know that the safemode is on. We will throw a RetriableException
1220   * if the safemode is not manual or caused by low resource.
1221   */
1222  private boolean shouldRetrySafeMode(SafeModeInfo safeMode) {
1223    if (safeMode == null) {
1224      return false;
1225    } else {
1226      return !safeMode.isManual() && !safeMode.areResourcesLow();
1227    }
1228  }
1229  
  /** @return the URIs of the configured fsimage (name) directories. */
  public static Collection<URI> getNamespaceDirs(Configuration conf) {
    return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
  }
1233
1234  /**
1235   * Get all edits dirs which are required. If any shared edits dirs are
1236   * configured, these are also included in the set of required dirs.
1237   * 
1238   * @param conf the HDFS configuration.
1239   * @return all required dirs.
1240   */
1241  public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1242    Set<URI> ret = new HashSet<URI>();
1243    ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1244    ret.addAll(getSharedEditsDirs(conf));
1245    return ret;
1246  }
1247
1248  private static Collection<URI> getStorageDirs(Configuration conf,
1249                                                String propertyName) {
1250    Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
1251    StartupOption startOpt = NameNode.getStartupOption(conf);
1252    if(startOpt == StartupOption.IMPORT) {
1253      // In case of IMPORT this will get rid of default directories 
1254      // but will retain directories specified in hdfs-site.xml
1255      // When importing image from a checkpoint, the name-node can
1256      // start with empty set of storage directories.
1257      Configuration cE = new HdfsConfiguration(false);
1258      cE.addResource("core-default.xml");
1259      cE.addResource("core-site.xml");
1260      cE.addResource("hdfs-default.xml");
1261      Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
1262      dirNames.removeAll(dirNames2);
1263      if(dirNames.isEmpty())
1264        LOG.warn("!!! WARNING !!!" +
1265          "\n\tThe NameNode currently runs without persistent storage." +
1266          "\n\tAny changes to the file system meta-data may be lost." +
1267          "\n\tRecommended actions:" +
1268          "\n\t\t- shutdown and restart NameNode with configured \"" 
1269          + propertyName + "\" in hdfs-site.xml;" +
1270          "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
1271          "of the file system meta-data.");
1272    } else if (dirNames.isEmpty()) {
1273      dirNames = Collections.singletonList(
1274          DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
1275    }
1276    return Util.stringCollectionAsURIs(dirNames);
1277  }
1278
1279  /**
1280   * Return an ordered list of edits directories to write to.
1281   * The list is ordered such that all shared edits directories
1282   * are ordered before non-shared directories, and any duplicates
1283   * are removed. The order they are specified in the configuration
1284   * is retained.
1285   * @return Collection of shared edits directories.
1286   * @throws IOException if multiple shared edits directories are configured
1287   */
1288  public static List<URI> getNamespaceEditsDirs(Configuration conf)
1289      throws IOException {
1290    return getNamespaceEditsDirs(conf, true);
1291  }
1292  
  /**
   * Return an ordered, de-duplicated list of edits directories, optionally
   * including the shared (HA) directories first. If no edits directories
   * are configured at all, the image directories are used for edits too.
   *
   * @param conf the HDFS configuration
   * @param includeShared whether to include the shared edits directories
   * @return the ordered edits directories
   * @throws IOException if multiple shared edits directories are configured
   */
  public static List<URI> getNamespaceEditsDirs(Configuration conf,
      boolean includeShared)
      throws IOException {
    // Use a LinkedHashSet so that order is maintained while we de-dup
    // the entries.
    LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
    
    if (includeShared) {
      List<URI> sharedDirs = getSharedEditsDirs(conf);
  
      // Fail until multiple shared edits directories are supported (HDFS-2782)
      if (sharedDirs.size() > 1) {
        throw new IOException(
            "Multiple shared edits directories are not yet supported");
      }
  
      // First add the shared edits dirs. It's critical that the shared dirs
      // are added first, since JournalSet syncs them in the order they are listed,
      // and we need to make sure all edits are in place in the shared storage
      // before they are replicated locally. See HDFS-2874.
      for (URI dir : sharedDirs) {
        if (!editsDirs.add(dir)) {
          LOG.warn("Edits URI " + dir + " listed multiple times in " + 
              DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
        }
      }
    }    
    // Now add the non-shared dirs.
    for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
      if (!editsDirs.add(dir)) {
        LOG.warn("Edits URI " + dir + " listed multiple times in " + 
            DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
            DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
      }
    }

    if (editsDirs.isEmpty()) {
      // If this is the case, no edit dirs have been explicitly configured.
      // Image dirs are to be used for edits too.
      return Lists.newArrayList(getNamespaceDirs(conf));
    } else {
      return Lists.newArrayList(editsDirs);
    }
  }
1337  
1338  /**
1339   * Returns edit directories that are shared between primary and secondary.
1340   * @param conf
1341   * @return Collection of edit directories.
1342   */
1343  public static List<URI> getSharedEditsDirs(Configuration conf) {
1344    // don't use getStorageDirs here, because we want an empty default
1345    // rather than the dir in /tmp
1346    Collection<String> dirNames = conf.getTrimmedStringCollection(
1347        DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1348    return Util.stringCollectionAsURIs(dirNames);
1349  }
1350
  /** Acquire the namesystem read lock (does not touch the long-read lock). */
  @Override
  public void readLock() {
    this.fsLock.readLock().lock();
  }
  /**
   * Interruptibly acquire the long-read lock followed by the regular read
   * lock; pair with {@link #longReadUnlock()}.
   */
  @Override
  public void longReadLockInterruptibly() throws InterruptedException {
    this.fsLock.longReadLock().lockInterruptibly();
    try {
      this.fsLock.readLock().lockInterruptibly();
    } catch (InterruptedException ie) {
      // In the event we're interrupted while getting the normal FSNS read lock,
      // release the long read lock.
      this.fsLock.longReadLock().unlock();
      throw ie;
    }
  }
  /** Release the locks taken by {@link #longReadLockInterruptibly()}. */
  @Override
  public void longReadUnlock() {
    this.fsLock.readLock().unlock();
    this.fsLock.longReadLock().unlock();
  }
  /** Release the namesystem read lock. */
  @Override
  public void readUnlock() {
    this.fsLock.readLock().unlock();
  }
  /** Acquire the long-read lock and then the write lock, in that order. */
  @Override
  public void writeLock() {
    this.fsLock.longReadLock().lock();
    this.fsLock.writeLock().lock();
  }
  /**
   * Interruptibly acquire the long-read lock and then the write lock,
   * mirroring {@link #writeLock()}.
   */
  @Override
  public void writeLockInterruptibly() throws InterruptedException {
    this.fsLock.longReadLock().lockInterruptibly();
    try {
      this.fsLock.writeLock().lockInterruptibly();
    } catch (InterruptedException ie) {
      // In the event we're interrupted while getting the normal FSNS write
      // lock, release the long read lock.
      this.fsLock.longReadLock().unlock();
      throw ie;
    }
  }
  /** Release the write lock and long-read lock in reverse acquisition order. */
  @Override
  public void writeUnlock() {
    this.fsLock.writeLock().unlock();
    this.fsLock.longReadLock().unlock();
  }
  /** @return true if the calling thread holds the write lock. */
  @Override
  public boolean hasWriteLock() {
    return this.fsLock.isWriteLockedByCurrentThread();
  }
  /** @return true if the calling thread holds the read or write lock. */
  @Override
  public boolean hasReadLock() {
    return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
  }
1406
  /** @return the calling thread's read-lock hold count. */
  public int getReadHoldCount() {
    return this.fsLock.getReadHoldCount();
  }
1410
  /** @return the calling thread's write-lock hold count. */
  public int getWriteHoldCount() {
    return this.fsLock.getWriteHoldCount();
  }
1414
  /** @return the namespace info (namespace/cluster/block-pool ids), taken
   *  under the read lock. */
  NamespaceInfo getNamespaceInfo() {
    readLock();
    try {
      return unprotectedGetNamespaceInfo();
    } finally {
      readUnlock();
    }
  }
1423
1424  /**
1425   * Version of @see #getNamespaceInfo() that is not protected by a lock.
1426   */
1427  NamespaceInfo unprotectedGetNamespaceInfo() {
1428    return new NamespaceInfo(dir.fsImage.getStorage().getNamespaceID(),
1429        getClusterId(), getBlockPoolId(),
1430        dir.fsImage.getStorage().getCTime());
1431  }
1432
1433  /**
1434   * Close down this file system manager.
1435   * Causes heartbeat and lease daemons to stop; waits briefly for
1436   * them to finish, but a short timeout returns control back to caller.
1437   */
1438  void close() {
1439    fsRunning = false;
1440    try {
1441      stopCommonServices();
1442      if (smmthread != null) smmthread.interrupt();
1443    } finally {
1444      // using finally to ensure we also wait for lease daemon
1445      try {
1446        stopActiveServices();
1447        stopStandbyServices();
1448        if (dir != null) {
1449          dir.close();
1450        }
1451      } catch (IOException ie) {
1452        LOG.error("Error closing FSDirectory", ie);
1453        IOUtils.cleanup(LOG, dir);
1454      }
1455    }
1456  }
1457
  /** @return true until {@link #close()} has been called. */
  @Override
  public boolean isRunning() {
    return fsRunning;
  }
1462  
  /**
   * @return true if this NN should currently behave as a standby. Before the
   * HA context/state exists we are still starting up: an HA-enabled NN
   * starts in standby, a non-HA NN starts active.
   */
  @Override
  public boolean isInStandbyState() {
    if (haContext == null || haContext.getState() == null) {
      // We're still starting up. In this case, if HA is
      // on for the cluster, we always start in standby. Otherwise
      // start in active.
      return haEnabled;
    }

    return HAServiceState.STANDBY == haContext.getState().getServiceState();
  }
1474
1475  /**
1476   * Dump all metadata into specified file
1477   */
1478  void metaSave(String filename) throws IOException {
1479    checkSuperuserPrivilege();
1480    checkOperation(OperationCategory.UNCHECKED);
1481    writeLock();
1482    try {
1483      checkOperation(OperationCategory.UNCHECKED);
1484      File file = new File(System.getProperty("hadoop.log.dir"), filename);
1485      PrintWriter out = new PrintWriter(new BufferedWriter(
1486          new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1487      metaSave(out);
1488      out.flush();
1489      out.close();
1490    } finally {
1491      writeUnlock();
1492    }
1493  }
1494
1495  private void metaSave(PrintWriter out) {
1496    assert hasWriteLock();
1497    long totalInodes = this.dir.totalInodes();
1498    long totalBlocks = this.getBlocksTotal();
1499    out.println(totalInodes + " files and directories, " + totalBlocks
1500        + " blocks = " + (totalInodes + totalBlocks) + " total");
1501
1502    blockManager.metaSave(out);
1503  }
1504
1505  private String metaSaveAsString() {
1506    StringWriter sw = new StringWriter();
1507    PrintWriter pw = new PrintWriter(sw);
1508    metaSave(pw);
1509    pw.flush();
1510    return sw.toString();
1511  }
1512  
1513
  /** @return the configured default block size from the server defaults. */
  long getDefaultBlockSize() {
    return serverDefaults.getBlockSize();
  }
1517
  /**
   * @return the server defaults advertised to clients.
   * @throws StandbyException if READ operations are not serviceable here
   */
  FsServerDefaults getServerDefaults() throws StandbyException {
    checkOperation(OperationCategory.READ);
    return serverDefaults;
  }
1522
  /** @return the configured access-time precision; 0 disables atime support. */
  long getAccessTimePrecision() {
    return accessTimePrecision;
  }
1526
  /** @return true if access-time tracking is enabled (precision &gt; 0). */
  private boolean isAccessTimeSupported() {
    return accessTimePrecision > 0;
  }
1530
1531  /////////////////////////////////////////////////////////
1532  //
1533  // These methods are called by HadoopFS clients
1534  //
1535  /////////////////////////////////////////////////////////
1536  /**
1537   * Set permissions for an existing file.
1538   * @throws IOException
1539   */
1540  void setPermission(String src, FsPermission permission)
1541      throws AccessControlException, FileNotFoundException, SafeModeException,
1542      UnresolvedLinkException, IOException {
1543    try {
1544      setPermissionInt(src, permission);
1545    } catch (AccessControlException e) {
1546      logAuditEvent(false, "setPermission", src);
1547      throw e;
1548    }
1549  }
1550
  /**
   * Worker for {@link #setPermission}: applies the permission change under
   * the write lock, then syncs the edit log and audits the success.
   */
  private void setPermissionInt(String src, FsPermission permission)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check after acquiring the lock in case the HA state changed.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set permission for " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      checkOwner(pc, src); // ownership check before changing the mode
      dir.setPermission(src, permission);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the lock to avoid blocking other operations.
    getEditLog().logSync();
    logAuditEvent(true, "setPermission", src, null, resultingStat);
  }
1572
1573  /**
1574   * Set owner for an existing file.
1575   * @throws IOException
1576   */
1577  void setOwner(String src, String username, String group)
1578      throws AccessControlException, FileNotFoundException, SafeModeException,
1579      UnresolvedLinkException, IOException {
1580    try {
1581      setOwnerInt(src, username, group);
1582    } catch (AccessControlException e) {
1583      logAuditEvent(false, "setOwner", src);
1584      throw e;
1585    } 
1586  }
1587
  /**
   * Worker for {@link #setOwner}: applies the owner/group change under the
   * write lock, then syncs the edit log and audits the success. A null
   * username or group leaves that attribute unchanged (passed through to
   * {@code dir.setOwner}).
   */
  private void setOwnerInt(String src, String username, String group)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set owner for " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      checkOwner(pc, src);
      if (!pc.isSuperUser()) {
        // A non-superuser cannot give the file away, and may only set a
        // group it is a member of.
        if (username != null && !pc.getUser().equals(username)) {
          throw new AccessControlException("Non-super user cannot change owner");
        }
        if (group != null && !pc.containsGroup(group)) {
          throw new AccessControlException("User does not belong to " + group);
        }
      }
      dir.setOwner(src, username, group);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "setOwner", src, null, resultingStat);
  }
1617
1618  /**
1619   * Get block locations within the specified range.
1620   * @see ClientProtocol#getBlockLocations(String, long, long)
1621   */
1622  LocatedBlocks getBlockLocations(String clientMachine, String src,
1623      long offset, long length) throws AccessControlException,
1624      FileNotFoundException, UnresolvedLinkException, IOException {
1625    LocatedBlocks blocks = getBlockLocations(src, offset, length, true, true,
1626        true);
1627    if (blocks != null) {
1628      blockManager.getDatanodeManager().sortLocatedBlocks(
1629          clientMachine, blocks.getLocatedBlocks());
1630      
1631      LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1632      if (lastBlock != null) {
1633        ArrayList<LocatedBlock> lastBlockList = new ArrayList<LocatedBlock>();
1634        lastBlockList.add(lastBlock);
1635        blockManager.getDatanodeManager().sortLocatedBlocks(
1636                              clientMachine, lastBlockList);
1637      }
1638    }
1639    return blocks;
1640  }
1641
1642  /**
1643   * Get block locations within the specified range.
1644   * @see ClientProtocol#getBlockLocations(String, long, long)
1645   * @throws FileNotFoundException, UnresolvedLinkException, IOException
1646   */
1647  LocatedBlocks getBlockLocations(String src, long offset, long length,
1648      boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode)
1649      throws FileNotFoundException, UnresolvedLinkException, IOException {
1650    try {
1651      return getBlockLocationsInt(src, offset, length, doAccessTime,
1652                                  needBlockToken, checkSafeMode);
1653    } catch (AccessControlException e) {
1654      logAuditEvent(false, "open", src);
1655      throw e;
1656    }
1657  }
1658
  /**
   * Worker for {@link #getBlockLocations}: validates the requested range,
   * fetches the locations (updating atime if needed), audits the open, and
   * optionally rejects results with location-less blocks while in safe mode.
   */
  private LocatedBlocks getBlockLocationsInt(String src, long offset,
      long length, boolean doAccessTime, boolean needBlockToken,
      boolean checkSafeMode)
      throws FileNotFoundException, UnresolvedLinkException, IOException {
    // Reject invalid ranges before touching the namespace.
    if (offset < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative offset is not supported. File: " + src);
    }
    if (length < 0) {
      throw new HadoopIllegalArgumentException(
          "Negative length is not supported. File: " + src);
    }
    final LocatedBlocks ret = getBlockLocationsUpdateTimes(src,
        offset, length, doAccessTime, needBlockToken);
    logAuditEvent(true, "open", src);
    if (checkSafeMode && isInSafeMode()) {
      for (LocatedBlock b : ret.getLocatedBlocks()) {
        // if safemode & no block locations yet then throw safemodeException
        if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
          SafeModeException se = new SafeModeException(
              "Zero blocklocations for " + src, safeMode);
          if (haEnabled && haContext != null && 
              haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
            // On an active HA NN the condition may clear (e.g. once more
            // block reports arrive), so ask the client to retry.
            throw new RetriableException(se);
          } else {
            throw se;
          }
        }
      }
    }
    return ret;
  }
1691
1692  /*
1693   * Get block locations within the specified range, updating the
1694   * access times if necessary. 
1695   */
1696  private LocatedBlocks getBlockLocationsUpdateTimes(String src, long offset,
1697      long length, boolean doAccessTime, boolean needBlockToken)
1698      throws FileNotFoundException,
1699      UnresolvedLinkException, IOException {
1700    FSPermissionChecker pc = getPermissionChecker();
1701    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1702    for (int attempt = 0; attempt < 2; attempt++) {
1703      boolean isReadOp = (attempt == 0);
1704      if (isReadOp) { // first attempt is with readlock
1705        checkOperation(OperationCategory.READ);
1706        readLock();
1707      }  else { // second attempt is with  write lock
1708        checkOperation(OperationCategory.WRITE);
1709        writeLock(); // writelock is needed to set accesstime
1710      }
1711      src = FSDirectory.resolvePath(src, pathComponents, dir);
1712      try {
1713        if (isReadOp) {
1714          checkOperation(OperationCategory.READ);
1715        } else {
1716          checkOperation(OperationCategory.WRITE);
1717        }
1718        if (isPermissionEnabled) {
1719          checkPathAccess(pc, src, FsAction.READ);
1720        }
1721
1722        // if the namenode is in safemode, then do not update access time
1723        if (isInSafeMode()) {
1724          doAccessTime = false;
1725        }
1726
1727        final INodesInPath iip = dir.getLastINodeInPath(src);
1728        final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
1729        if (!iip.isSnapshot() //snapshots are readonly, so don't update atime.
1730            && doAccessTime && isAccessTimeSupported()) {
1731          final long now = now();
1732          if (now > inode.getAccessTime() + getAccessTimePrecision()) {
1733            // if we have to set access time but we only have the readlock, then
1734            // restart this entire operation with the writeLock.
1735            if (isReadOp) {
1736              continue;
1737            }
1738            dir.setTimes(src, inode, -1, now, false, iip.getLatestSnapshotId());
1739          }
1740        }
1741        final long fileSize = iip.isSnapshot() ?
1742            inode.computeFileSize(iip.getPathSnapshotId())
1743            : inode.computeFileSizeNotIncludingLastUcBlock();
1744        boolean isUc = inode.isUnderConstruction();
1745        if (iip.isSnapshot()) {
1746          // if src indicates a snapshot file, we need to make sure the returned
1747          // blocks do not exceed the size of the snapshot file.
1748          length = Math.min(length, fileSize - offset);
1749          isUc = false;
1750        }
1751        LocatedBlocks blocks =
1752          blockManager.createLocatedBlocks(inode.getBlocks(), fileSize,
1753            isUc, offset, length, needBlockToken, iip.isSnapshot());
1754        // Set caching information for the located blocks.
1755        for (LocatedBlock lb: blocks.getLocatedBlocks()) {
1756          cacheManager.setCachedLocations(lb);
1757        }
1758        return blocks;
1759      } finally {
1760        if (isReadOp) {
1761          readUnlock();
1762        } else {
1763          writeUnlock();
1764        }
1765      }
1766    }
1767    return null; // can never reach here
1768  }
1769
1770  /**
1771   * Moves all the blocks from srcs and appends them to trg
1772   * To avoid rollbacks we will verify validitity of ALL of the args
1773   * before we start actual move.
1774   * 
1775   * This does not support ".inodes" relative path
1776   * @param target
1777   * @param srcs
1778   * @throws IOException
1779   */
1780  void concat(String target, String [] srcs) 
1781      throws IOException, UnresolvedLinkException {
1782    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
1783    if (cacheEntry != null && cacheEntry.isSuccess()) {
1784      return; // Return previous response
1785    }
1786    
1787    // Either there is no previous request in progres or it has failed
1788    if(FSNamesystem.LOG.isDebugEnabled()) {
1789      FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) +
1790          " to " + target);
1791    }
1792    
1793    boolean success = false;
1794    try {
1795      concatInt(target, srcs, cacheEntry != null);
1796      success = true;
1797    } catch (AccessControlException e) {
1798      logAuditEvent(false, "concat", Arrays.toString(srcs), target, null);
1799      throw e;
1800    } finally {
1801      RetryCache.setState(cacheEntry, success);
1802    }
1803  }
1804
  /**
   * Worker for {@link #concat}: validates argument shape (non-empty target,
   * at least one source, all in the same parent directory), then performs
   * the move under the write lock, syncs the edit log and audits success.
   */
  private void concatInt(String target, String [] srcs, 
      boolean logRetryCache) throws IOException, UnresolvedLinkException {
    // verify args
    if(target.isEmpty()) {
      throw new IllegalArgumentException("Target file name is empty");
    }
    if(srcs == null || srcs.length == 0) {
      throw new IllegalArgumentException("No sources given");
    }
    
    // We require all files be in the same directory
    String trgParent = 
      target.substring(0, target.lastIndexOf(Path.SEPARATOR_CHAR));
    for (String s : srcs) {
      String srcParent = s.substring(0, s.lastIndexOf(Path.SEPARATOR_CHAR));
      if (!srcParent.equals(trgParent)) {
        throw new IllegalArgumentException(
           "Sources and target are not in the same directory");
      }
    }

    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot concat " + target);
      concatInternal(pc, target, srcs, logRetryCache);
      resultingStat = getAuditFileInfo(target, false);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the lock before auditing the success.
    getEditLog().logSync();
    logAuditEvent(true, "concat", Arrays.toString(srcs), target, resultingStat);
  }
1841
1842  /** See {@link #concat(String, String[])} */
1843  private void concatInternal(FSPermissionChecker pc, String target,
1844      String[] srcs, boolean logRetryCache) throws IOException,
1845      UnresolvedLinkException {
1846    assert hasWriteLock();
1847
1848    // write permission for the target
1849    if (isPermissionEnabled) {
1850      checkPathAccess(pc, target, FsAction.WRITE);
1851
1852      // and srcs
1853      for(String aSrc: srcs) {
1854        checkPathAccess(pc, aSrc, FsAction.READ); // read the file
1855        checkParentAccess(pc, aSrc, FsAction.WRITE); // for delete 
1856      }
1857    }
1858
1859    // to make sure no two files are the same
1860    Set<INode> si = new HashSet<INode>();
1861
1862    // we put the following prerequisite for the operation
1863    // replication and blocks sizes should be the same for ALL the blocks
1864
1865    // check the target
1866    final INodeFile trgInode = INodeFile.valueOf(dir.getINode4Write(target),
1867        target);
1868    if(trgInode.isUnderConstruction()) {
1869      throw new HadoopIllegalArgumentException("concat: target file "
1870          + target + " is under construction");
1871    }
1872    // per design target shouldn't be empty and all the blocks same size
1873    if(trgInode.numBlocks() == 0) {
1874      throw new HadoopIllegalArgumentException("concat: target file "
1875          + target + " is empty");
1876    }
1877    if (trgInode.isWithSnapshot()) {
1878      throw new HadoopIllegalArgumentException("concat: target file "
1879          + target + " is in a snapshot");
1880    }
1881
1882    long blockSize = trgInode.getPreferredBlockSize();
1883
1884    // check the end block to be full
1885    final BlockInfo last = trgInode.getLastBlock();
1886    if(blockSize != last.getNumBytes()) {
1887      throw new HadoopIllegalArgumentException("The last block in " + target
1888          + " is not full; last block size = " + last.getNumBytes()
1889          + " but file block size = " + blockSize);
1890    }
1891
1892    si.add(trgInode);
1893    final short repl = trgInode.getFileReplication();
1894
1895    // now check the srcs
1896    boolean endSrc = false; // final src file doesn't have to have full end block
1897    for(int i=0; i<srcs.length; i++) {
1898      String src = srcs[i];
1899      if(i==srcs.length-1)
1900        endSrc=true;
1901
1902      final INodeFile srcInode = INodeFile.valueOf(dir.getINode4Write(src), src);
1903      if(src.isEmpty() 
1904          || srcInode.isUnderConstruction()
1905          || srcInode.numBlocks() == 0) {
1906        throw new HadoopIllegalArgumentException("concat: source file " + src
1907            + " is invalid or empty or underConstruction");
1908      }
1909
1910      // check replication and blocks size
1911      if(repl != srcInode.getBlockReplication()) {
1912        throw new HadoopIllegalArgumentException("concat: the soruce file "
1913            + src + " and the target file " + target
1914            + " should have the same replication: source replication is "
1915            + srcInode.getBlockReplication()
1916            + " but target replication is " + repl);
1917      }
1918
1919      //boolean endBlock=false;
1920      // verify that all the blocks are of the same length as target
1921      // should be enough to check the end blocks
1922      final BlockInfo[] srcBlocks = srcInode.getBlocks();
1923      int idx = srcBlocks.length-1;
1924      if(endSrc)
1925        idx = srcBlocks.length-2; // end block of endSrc is OK not to be full
1926      if(idx >= 0 && srcBlocks[idx].getNumBytes() != blockSize) {
1927        throw new HadoopIllegalArgumentException("concat: the soruce file "
1928            + src + " and the target file " + target
1929            + " should have the same blocks sizes: target block size is "
1930            + blockSize + " but the size of source block " + idx + " is "
1931            + srcBlocks[idx].getNumBytes());
1932      }
1933
1934      si.add(srcInode);
1935    }
1936
1937    // make sure no two files are the same
1938    if(si.size() < srcs.length+1) { // trg + srcs
1939      // it means at least two files are the same
1940      throw new HadoopIllegalArgumentException(
1941          "concat: at least two of the source files are the same");
1942    }
1943
1944    if(NameNode.stateChangeLog.isDebugEnabled()) {
1945      NameNode.stateChangeLog.debug("DIR* NameSystem.concat: " + 
1946          Arrays.toString(srcs) + " to " + target);
1947    }
1948
1949    dir.concat(target,srcs, logRetryCache);
1950  }
1951  
1952  /**
1953   * stores the modification and access time for this inode. 
1954   * The access time is precise upto an hour. The transaction, if needed, is
1955   * written to the edits log but is not flushed.
1956   */
1957  void setTimes(String src, long mtime, long atime) 
1958      throws IOException, UnresolvedLinkException {
1959    if (!isAccessTimeSupported() && atime != -1) {
1960      throw new IOException("Access time for hdfs is not configured. " +
1961                            " Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter.");
1962    }
1963    try {
1964      setTimesInt(src, mtime, atime);
1965    } catch (AccessControlException e) {
1966      logAuditEvent(false, "setTimes", src);
1967      throw e;
1968    }
1969  }
1970
1971  private void setTimesInt(String src, long mtime, long atime) 
1972    throws IOException, UnresolvedLinkException {
1973    HdfsFileStatus resultingStat = null;
1974    FSPermissionChecker pc = getPermissionChecker();
1975    checkOperation(OperationCategory.WRITE);
1976    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1977    writeLock();
1978    try {
1979      checkOperation(OperationCategory.WRITE);
1980      checkNameNodeSafeMode("Cannot set times " + src);
1981      src = FSDirectory.resolvePath(src, pathComponents, dir);
1982
1983      // Write access is required to set access and modification times
1984      if (isPermissionEnabled) {
1985        checkPathAccess(pc, src, FsAction.WRITE);
1986      }
1987      final INodesInPath iip = dir.getINodesInPath4Write(src);
1988      final INode inode = iip.getLastINode();
1989      if (inode != null) {
1990        dir.setTimes(src, inode, mtime, atime, true, iip.getLatestSnapshotId());
1991        resultingStat = getAuditFileInfo(src, false);
1992      } else {
1993        throw new FileNotFoundException("File/Directory " + src + " does not exist.");
1994      }
1995    } finally {
1996      writeUnlock();
1997    }
1998    logAuditEvent(true, "setTimes", src, null, resultingStat);
1999  }
2000
2001  /**
2002   * Create a symbolic link.
2003   */
2004  @SuppressWarnings("deprecation")
2005  void createSymlink(String target, String link,
2006      PermissionStatus dirPerms, boolean createParent) 
2007      throws IOException, UnresolvedLinkException {
2008    if (!FileSystem.areSymlinksEnabled()) {
2009      throw new UnsupportedOperationException("Symlinks not supported");
2010    }
2011    if (!DFSUtil.isValidName(link)) {
2012      throw new InvalidPathException("Invalid link name: " + link);
2013    }
2014    if (FSDirectory.isReservedName(target)) {
2015      throw new InvalidPathException("Invalid target name: " + target);
2016    }
2017    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
2018    if (cacheEntry != null && cacheEntry.isSuccess()) {
2019      return; // Return previous response
2020    }
2021    boolean success = false;
2022    try {
2023      createSymlinkInt(target, link, dirPerms, createParent, cacheEntry != null);
2024      success = true;
2025    } catch (AccessControlException e) {
2026      logAuditEvent(false, "createSymlink", link, target, null);
2027      throw e;
2028    } finally {
2029      RetryCache.setState(cacheEntry, success);
2030    }
2031  }
2032
  /**
   * Worker for {@link #createSymlink}: adds the symlink to the namespace
   * under the write lock, then syncs the edit log and audits the success.
   */
  private void createSymlinkInt(String target, String link,
      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache) 
      throws IOException, UnresolvedLinkException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target="
          + target + " link=" + link);
    }
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(link);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create symlink " + link);
      link = FSDirectory.resolvePath(link, pathComponents, dir);
      if (!createParent) {
        // Caller wants an error rather than implicit parent creation.
        verifyParentDir(link);
      }
      if (!dir.isValidToCreate(link)) {
        throw new IOException("failed to create link " + link 
            +" either because the filename is invalid or the file exists");
      }
      if (isPermissionEnabled) {
        checkAncestorAccess(pc, link, FsAction.WRITE);
      }
      // validate that we have enough inodes.
      checkFsObjectLimit();

      // add symbolic link to namespace
      dir.addSymlink(link, target, dirPerms, createParent, logRetryCache);
      resultingStat = getAuditFileInfo(link, false);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "createSymlink", link, target, resultingStat);
  }
2071
2072  /**
2073   * Set replication for an existing file.
2074   * 
2075   * The NameNode sets new replication and schedules either replication of 
2076   * under-replicated data blocks or removal of the excessive block copies 
2077   * if the blocks are over-replicated.
2078   * 
2079   * @see ClientProtocol#setReplication(String, short)
2080   * @param src file name
2081   * @param replication new replication
2082   * @return true if successful; 
2083   *         false if file does not exist or is a directory
2084   */
2085  boolean setReplication(final String src, final short replication)
2086      throws IOException {
2087    try {
2088      return setReplicationInt(src, replication);
2089    } catch (AccessControlException e) {
2090      logAuditEvent(false, "setReplication", src);
2091      throw e;
2092    }
2093  }
2094
  /**
   * Worker for {@link #setReplication}: updates the stored replication under
   * the write lock and hands the old/new values to the block manager so it
   * can schedule replica additions or removals.
   */
  private boolean setReplicationInt(String src, final short replication)
      throws IOException {
    // Validate the requested replication factor before taking the lock.
    blockManager.verifyReplication(src, replication, null);
    final boolean isFile;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set replication for " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      if (isPermissionEnabled) {
        checkPathAccess(pc, src, FsAction.WRITE);
      }

      final short[] blockRepls = new short[2]; // 0: old, 1: new
      final Block[] blocks = dir.setReplication(src, replication, blockRepls);
      // A null block list means src is not an existing file.
      isFile = blocks != null;
      if (isFile) {
        blockManager.setReplication(blockRepls[0], blockRepls[1], src, blocks);
      }
    } finally {
      writeUnlock();
    }

    getEditLog().logSync();
    if (isFile) {
      logAuditEvent(true, "setReplication", src);
    }
    return isFile;
  }
2127
  /**
   * @return the preferred block size recorded for the given file.
   * Runs under the read lock; requires traverse permission when permission
   * checking is enabled.
   */
  long getPreferredBlockSize(String filename) 
      throws IOException, UnresolvedLinkException {
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(filename);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      filename = FSDirectory.resolvePath(filename, pathComponents, dir);
      if (isPermissionEnabled) {
        checkTraverse(pc, filename);
      }
      return dir.getPreferredBlockSize(filename);
    } finally {
      readUnlock();
    }
  }
2145
2146  /**
2147   * Verify that parent directory of src exists.
2148   */
2149  private void verifyParentDir(String src) throws FileNotFoundException,
2150      ParentNotDirectoryException, UnresolvedLinkException {
2151    assert hasReadLock();
2152    Path parent = new Path(src).getParent();
2153    if (parent != null) {
2154      final INode parentNode = dir.getINode(parent.toString());
2155      if (parentNode == null) {
2156        throw new FileNotFoundException("Parent directory doesn't exist: "
2157            + parent);
2158      } else if (!parentNode.isDirectory() && !parentNode.isSymlink()) {
2159        throw new ParentNotDirectoryException("Parent path is not a directory: "
2160            + parent);
2161      }
2162    }
2163  }
2164  
2165  /**
2166   * Create a new file entry in the namespace.
2167   * 
2168   * For description of parameters and exceptions thrown see
2169   * {@link ClientProtocol#create()}, except it returns valid file status upon
2170   * success
2171   * 
2172   * For retryCache handling details see -
2173   * {@link #getFileStatus(boolean, CacheEntryWithPayload)}
2174   * 
2175   */
2176  HdfsFileStatus startFile(String src, PermissionStatus permissions,
2177      String holder, String clientMachine, EnumSet<CreateFlag> flag,
2178      boolean createParent, short replication, long blockSize)
2179      throws AccessControlException, SafeModeException,
2180      FileAlreadyExistsException, UnresolvedLinkException,
2181      FileNotFoundException, ParentNotDirectoryException, IOException {
2182    HdfsFileStatus status = null;
2183    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2184        null);
2185    if (cacheEntry != null && cacheEntry.isSuccess()) {
2186      return (HdfsFileStatus) cacheEntry.getPayload();
2187    }
2188    
2189    try {
2190      status = startFileInt(src, permissions, holder, clientMachine, flag,
2191          createParent, replication, blockSize, cacheEntry != null);
2192    } catch (AccessControlException e) {
2193      logAuditEvent(false, "create", src);
2194      throw e;
2195    } finally {
2196      RetryCache.setState(cacheEntry, status != null, status);
2197    }
2198    return status;
2199  }
2200
  /**
   * Worker for {@link #startFile}: validates the path, replication and block
   * size, then creates the file under the write lock and returns its status.
   */
  private HdfsFileStatus startFileInt(String src, PermissionStatus permissions,
      String holder, String clientMachine, EnumSet<CreateFlag> flag,
      boolean createParent, short replication, long blockSize,
      boolean logRetryCache) throws AccessControlException, SafeModeException,
      FileAlreadyExistsException, UnresolvedLinkException,
      FileNotFoundException, ParentNotDirectoryException, IOException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: src=" + src
          + ", holder=" + holder
          + ", clientMachine=" + clientMachine
          + ", createParent=" + createParent
          + ", replication=" + replication
          + ", createFlag=" + flag.toString());
    }
    if (!DFSUtil.isValidName(src)) {
      throw new InvalidPathException(src);
    }
    blockManager.verifyReplication(src, replication, clientMachine);

    boolean skipSync = false;
    HdfsFileStatus stat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    if (blockSize < minBlockSize) {
      throw new IOException("Specified block size is less than configured" +
          " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
          + "): " + blockSize + " < " + minBlockSize);
    }
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    boolean create = flag.contains(CreateFlag.CREATE);
    boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create file" + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      startFileInternal(pc, src, permissions, holder, clientMachine, create,
          overwrite, createParent, replication, blockSize, logRetryCache);
      stat = dir.getFileInfo(src, false);
    } catch (StandbyException se) {
      // Nothing was logged on a standby; skip the (unnecessary) sync below.
      skipSync = true;
      throw se;
    } finally {
      writeUnlock();
      // There might be transactions logged while trying to recover the lease.
      // They need to be sync'ed even when an exception was thrown.
      if (!skipSync) {
        getEditLog().logSync();
      }
    } 
    logAuditEvent(true, "create", src, null, stat);
    return stat;
  }
2254
2255  /**
2256   * Create a new file or overwrite an existing file<br>
2257   * 
2258   * Once the file is create the client then allocates a new block with the next
2259   * call using {@link NameNode#addBlock()}.
2260   * <p>
2261   * For description of parameters and exceptions thrown see
2262   * {@link ClientProtocol#create()}
2263   */
2264  private void startFileInternal(FSPermissionChecker pc, String src,
2265      PermissionStatus permissions, String holder, String clientMachine,
2266      boolean create, boolean overwrite, boolean createParent,
2267      short replication, long blockSize, boolean logRetryEntry)
2268      throws FileAlreadyExistsException, AccessControlException,
2269      UnresolvedLinkException, FileNotFoundException,
2270      ParentNotDirectoryException, IOException {
2271    assert hasWriteLock();
2272    // Verify that the destination does not exist as a directory already.
2273    final INodesInPath iip = dir.getINodesInPath4Write(src);
2274    final INode inode = iip.getLastINode();
2275    if (inode != null && inode.isDirectory()) {
2276      throw new FileAlreadyExistsException(src +
2277          " already exists as a directory");
2278    }
2279    final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2280    if (isPermissionEnabled) {
2281      if (overwrite && myFile != null) {
2282        checkPathAccess(pc, src, FsAction.WRITE);
2283      } else {
2284        checkAncestorAccess(pc, src, FsAction.WRITE);
2285      }
2286    }
2287
2288    if (!createParent) {
2289      verifyParentDir(src);
2290    }
2291
2292    try {
2293      if (myFile == null) {
2294        if (!create) {
2295          throw new FileNotFoundException("Can't overwrite non-existent " +
2296              src + " for client " + clientMachine);
2297        }
2298      } else {
2299        if (overwrite) {
2300          try {
2301            deleteInt(src, true, false); // File exists - delete if overwrite
2302          } catch (AccessControlException e) {
2303            logAuditEvent(false, "delete", src);
2304            throw e;
2305          }
2306        } else {
2307          // If lease soft limit time is expired, recover the lease
2308          recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2309          throw new FileAlreadyExistsException(src + " for client " +
2310              clientMachine + " already exists");
2311        }
2312      }
2313
2314      checkFsObjectLimit();
2315      final DatanodeDescriptor clientNode = 
2316          blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2317
2318      INodeFile newNode = dir.addFile(src, permissions, replication, blockSize,
2319          holder, clientMachine, clientNode);
2320      if (newNode == null) {
2321        throw new IOException("Unable to add " + src +  " to namespace");
2322      }
2323      leaseManager.addLease(newNode.getFileUnderConstructionFeature()
2324          .getClientName(), src);
2325
2326      // record file record in log, record new generation stamp
2327      getEditLog().logOpenFile(src, newNode, logRetryEntry);
2328      if (NameNode.stateChangeLog.isDebugEnabled()) {
2329        NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: added " +
2330            src + " inode " + newNode.getId() + " " + holder);
2331      }
2332    } catch (IOException ie) {
2333      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: " + src + " " +
2334          ie.getMessage());
2335      throw ie;
2336    }
2337  }
2338  
2339  /**
2340   * Append to an existing file for append.
2341   * <p>
2342   * 
2343   * The method returns the last block of the file if this is a partial block,
2344   * which can still be used for writing more data. The client uses the returned
2345   * block locations to form the data pipeline for this block.<br>
2346   * The method returns null if the last block is full. The client then
2347   * allocates a new block with the next call using {@link NameNode#addBlock()}.
2348   * <p>
2349   * 
2350   * For description of parameters and exceptions thrown see
2351   * {@link ClientProtocol#append(String, String)}
2352   * 
2353   * @return the last block locations if the block is partial or null otherwise
2354   */
2355  private LocatedBlock appendFileInternal(FSPermissionChecker pc, String src,
2356      String holder, String clientMachine, boolean logRetryCache)
2357      throws AccessControlException, UnresolvedLinkException,
2358      FileNotFoundException, IOException {
2359    assert hasWriteLock();
2360    // Verify that the destination does not exist as a directory already.
2361    final INodesInPath iip = dir.getINodesInPath4Write(src);
2362    final INode inode = iip.getLastINode();
2363    if (inode != null && inode.isDirectory()) {
2364      throw new FileAlreadyExistsException("Cannot append to directory " + src
2365          + "; already exists as a directory.");
2366    }
2367    if (isPermissionEnabled) {
2368      checkPathAccess(pc, src, FsAction.WRITE);
2369    }
2370
2371    try {
2372      if (inode == null) {
2373        throw new FileNotFoundException("failed to append to non-existent file "
2374          + src + " for client " + clientMachine);
2375      }
2376      INodeFile myFile = INodeFile.valueOf(inode, src, true);
2377      // Opening an existing file for write - may need to recover lease.
2378      recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2379      
2380      // recoverLeaseInternal may create a new InodeFile via 
2381      // finalizeINodeFileUnderConstruction so we need to refresh 
2382      // the referenced file.  
2383      myFile = INodeFile.valueOf(dir.getINode(src), src, true);
2384      final BlockInfo lastBlock = myFile.getLastBlock();
2385      // Check that the block has at least minimum replication.
2386      if(lastBlock != null && lastBlock.isComplete() &&
2387          !getBlockManager().isSufficientlyReplicated(lastBlock)) {
2388        throw new IOException("append: lastBlock=" + lastBlock +
2389            " of src=" + src + " is not sufficiently replicated yet.");
2390      }
2391      final DatanodeDescriptor clientNode = 
2392          blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2393      return prepareFileForWrite(src, myFile, holder, clientMachine, clientNode,
2394          true, iip.getLatestSnapshotId(), logRetryCache);
2395    } catch (IOException ie) {
2396      NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2397      throw ie;
2398    }
2399  }
2400  
2401  /**
2402   * Replace current node with a INodeUnderConstruction.
2403   * Recreate in-memory lease record.
2404   * 
2405   * @param src path to the file
2406   * @param file existing file object
2407   * @param leaseHolder identifier of the lease holder on this file
2408   * @param clientMachine identifier of the client machine
2409   * @param clientNode if the client is collocated with a DN, that DN's descriptor
2410   * @param writeToEditLog whether to persist this change to the edit log
2411   * @param logRetryCache whether to record RPC ids in editlog for retry cache
2412   *                      rebuilding
2413   * @return the last block locations if the block is partial or null otherwise
2414   * @throws UnresolvedLinkException
2415   * @throws IOException
2416   */
2417  LocatedBlock prepareFileForWrite(String src, INodeFile file,
2418      String leaseHolder, String clientMachine, DatanodeDescriptor clientNode,
2419      boolean writeToEditLog, int latestSnapshot, boolean logRetryCache)
2420      throws IOException {
2421    file = file.recordModification(latestSnapshot);
2422    final INodeFile cons = file.toUnderConstruction(leaseHolder, clientMachine,
2423        clientNode);
2424
2425    leaseManager.addLease(cons.getFileUnderConstructionFeature()
2426        .getClientName(), src);
2427    
2428    LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons);
2429    if (writeToEditLog) {
2430      getEditLog().logOpenFile(src, cons, logRetryCache);
2431    }
2432    return ret;
2433  }
2434
2435  /**
2436   * Recover lease;
2437   * Immediately revoke the lease of the current lease holder and start lease
2438   * recovery so that the file can be forced to be closed.
2439   * 
2440   * @param src the path of the file to start lease recovery
2441   * @param holder the lease holder's name
2442   * @param clientMachine the client machine's name
2443   * @return true if the file is already closed
2444   * @throws IOException
2445   */
2446  boolean recoverLease(String src, String holder, String clientMachine)
2447      throws IOException {
2448    if (!DFSUtil.isValidName(src)) {
2449      throw new IOException("Invalid file name: " + src);
2450    }
2451  
2452    boolean skipSync = false;
2453    FSPermissionChecker pc = getPermissionChecker();
2454    checkOperation(OperationCategory.WRITE);
2455    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2456    writeLock();
2457    try {
2458      checkOperation(OperationCategory.WRITE);
2459      checkNameNodeSafeMode("Cannot recover the lease of " + src);
2460      src = FSDirectory.resolvePath(src, pathComponents, dir);
2461      final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src);
2462      if (!inode.isUnderConstruction()) {
2463        return true;
2464      }
2465      if (isPermissionEnabled) {
2466        checkPathAccess(pc, src, FsAction.WRITE);
2467      }
2468  
2469      recoverLeaseInternal(inode, src, holder, clientMachine, true);
2470    } catch (StandbyException se) {
2471      skipSync = true;
2472      throw se;
2473    } finally {
2474      writeUnlock();
2475      // There might be transactions logged while trying to recover the lease.
2476      // They need to be sync'ed even when an exception was thrown.
2477      if (!skipSync) {
2478        getEditLog().logSync();
2479      }
2480    }
2481    return false;
2482  }
2483
  /**
   * Core lease-recovery logic shared by create, append and recoverLease.
   * If {@code fileInode} is under construction, either releases the current
   * holder's lease ({@code force}), triggers soft-limit recovery, or throws
   * to tell the caller the file is busy.
   * Must be called with the FSNamesystem write lock held.
   *
   * @param fileInode the file whose lease may need recovery (may be null)
   * @param src path of the file
   * @param holder client requesting access to the file
   * @param clientMachine identifier of the requesting client's machine
   * @param force if true, revoke the lease immediately and close the file
   */
  private void recoverLeaseInternal(INodeFile fileInode, 
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    if (fileInode != null && fileInode.isUnderConstruction()) {
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);
      //
      // We found the lease for this file. And surprisingly the original
      // holder is trying to recreate this file. This should never occur.
      //
      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        // NOTE(review): lease.getHolder().equals(holder) looks trivially
        // true here since lease was fetched by holder above — confirm
        // whether the leaseFile check is ever the deciding condition.
        if ((leaseFile != null && leaseFile.equals(lease)) ||
            lease.getHolder().equals(holder)) { 
          throw new AlreadyBeingCreatedException(
            "failed to create file " + src + " for " + holder +
            " for client " + clientMachine +
            " because current leaseholder is trying to recreate file.");
        }
      }
      //
      // Find the original holder.
      //
      FileUnderConstructionFeature uc = fileInode.getFileUnderConstructionFeature();
      String clientName = uc.getClientName();
      lease = leaseManager.getLease(clientName);
      if (lease == null) {
        throw new AlreadyBeingCreatedException(
          "failed to create file " + src + " for " + holder +
          " for client " + clientMachine +
          " because pendingCreates is non-null but no leases found.");
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + clientName);
        internalReleaseLease(lease, src, holder);
      } else {
        assert lease.getHolder().equals(clientName) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + clientName;
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + clientName);
          boolean isClosed = internalReleaseLease(lease, src, null);
          if(!isClosed)
            throw new RecoveryInProgressException(
                "Failed to close file " + src +
                ". Lease recovery is in progress. Try again later.");
        } else {
          // Holder is still live within the soft limit: reject the caller.
          final BlockInfo lastBlock = fileInode.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException("Recovery in progress, file ["
                + src + "], " + "lease owner [" + lease.getHolder() + "]");
          } else {
            throw new AlreadyBeingCreatedException("Failed to create file ["
                + src + "] for [" + holder + "] for client [" + clientMachine
                + "], because this file is already being created by ["
                + clientName + "] on ["
                + uc.getClientMachine() + "]");
          }
        }
      }
    }
  }
2559
2560  /**
2561   * Append to an existing file in the namespace.
2562   */
2563  LocatedBlock appendFile(String src, String holder, String clientMachine)
2564      throws AccessControlException, SafeModeException,
2565      FileAlreadyExistsException, FileNotFoundException,
2566      ParentNotDirectoryException, IOException {
2567    LocatedBlock lb = null;
2568    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2569        null);
2570    if (cacheEntry != null && cacheEntry.isSuccess()) {
2571      return (LocatedBlock) cacheEntry.getPayload();
2572    }
2573      
2574    boolean success = false;
2575    try {
2576      lb = appendFileInt(src, holder, clientMachine, cacheEntry != null);
2577      success = true;
2578      return lb;
2579    } catch (AccessControlException e) {
2580      logAuditEvent(false, "append", src);
2581      throw e;
2582    } finally {
2583      RetryCache.setState(cacheEntry, success, lb);
2584    }
2585  }
2586
2587  private LocatedBlock appendFileInt(String src, String holder,
2588      String clientMachine, boolean logRetryCache)
2589      throws AccessControlException, SafeModeException,
2590      FileAlreadyExistsException, FileNotFoundException,
2591      ParentNotDirectoryException, IOException {
2592    if (NameNode.stateChangeLog.isDebugEnabled()) {
2593      NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: src=" + src
2594          + ", holder=" + holder
2595          + ", clientMachine=" + clientMachine);
2596    }
2597    boolean skipSync = false;
2598    if (!supportAppends) {
2599      throw new UnsupportedOperationException(
2600          "Append is not enabled on this NameNode. Use the " +
2601          DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
2602    }
2603
2604    LocatedBlock lb = null;
2605    FSPermissionChecker pc = getPermissionChecker();
2606    checkOperation(OperationCategory.WRITE);
2607    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2608    writeLock();
2609    try {
2610      checkOperation(OperationCategory.WRITE);
2611      checkNameNodeSafeMode("Cannot append to file" + src);
2612      src = FSDirectory.resolvePath(src, pathComponents, dir);
2613      lb = appendFileInternal(pc, src, holder, clientMachine, logRetryCache);
2614    } catch (StandbyException se) {
2615      skipSync = true;
2616      throw se;
2617    } finally {
2618      writeUnlock();
2619      // There might be transactions logged while trying to recover the lease.
2620      // They need to be sync'ed even when an exception was thrown.
2621      if (!skipSync) {
2622        getEditLog().logSync();
2623      }
2624    }
2625    if (lb != null) {
2626      if (NameNode.stateChangeLog.isDebugEnabled()) {
2627        NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: file "
2628            +src+" for "+holder+" at "+clientMachine
2629            +" block " + lb.getBlock()
2630            +" block size " + lb.getBlock().getNumBytes());
2631      }
2632    }
2633    logAuditEvent(true, "append", src);
2634    return lb;
2635  }
2636
2637  ExtendedBlock getExtendedBlock(Block blk) {
2638    return new ExtendedBlock(blockPoolId, blk);
2639  }
2640  
2641  void setBlockPoolId(String bpid) {
2642    blockPoolId = bpid;
2643    blockManager.setBlockPoolId(blockPoolId);
2644  }
2645
2646  /**
2647   * The client would like to obtain an additional block for the indicated
2648   * filename (which is being written-to).  Return an array that consists
2649   * of the block, plus a set of machines.  The first on this list should
2650   * be where the client writes data.  Subsequent items in the list must
2651   * be provided in the connection to the first datanode.
2652   *
2653   * Make sure the previous blocks have been reported by datanodes and
2654   * are replicated.  Will return an empty 2-elt array if we want the
2655   * client to "try again later".
2656   */
2657  LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
2658      ExtendedBlock previous, Set<Node> excludedNodes, 
2659      List<String> favoredNodes)
2660      throws LeaseExpiredException, NotReplicatedYetException,
2661      QuotaExceededException, SafeModeException, UnresolvedLinkException,
2662      IOException {
2663    long blockSize;
2664    int replication;
2665    DatanodeDescriptor clientNode = null;
2666
2667    if(NameNode.stateChangeLog.isDebugEnabled()) {
2668      NameNode.stateChangeLog.debug("BLOCK* NameSystem.getAdditionalBlock: "
2669          + src + " inodeId " +  fileId  + " for " + clientName);
2670    }
2671
2672    // Part I. Analyze the state of the file with respect to the input data.
2673    checkOperation(OperationCategory.READ);
2674    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2675    readLock();
2676    try {
2677      checkOperation(OperationCategory.READ);
2678      src = FSDirectory.resolvePath(src, pathComponents, dir);
2679      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2680      final INode[] inodes = analyzeFileState(
2681          src, fileId, clientName, previous, onRetryBlock).getINodes();
2682      final INodeFile pendingFile = inodes[inodes.length - 1].asFile();
2683
2684      if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
2685        // This is a retry. Just return the last block if having locations.
2686        return onRetryBlock[0];
2687      }
2688      if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
2689        throw new IOException("File has reached the limit on maximum number of"
2690            + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
2691            + "): " + pendingFile.getBlocks().length + " >= "
2692            + maxBlocksPerFile);
2693      }
2694      blockSize = pendingFile.getPreferredBlockSize();
2695      clientNode = pendingFile.getFileUnderConstructionFeature().getClientNode();
2696      replication = pendingFile.getFileReplication();
2697    } finally {
2698      readUnlock();
2699    }
2700
2701    // choose targets for the new block to be allocated.
2702    final DatanodeStorageInfo targets[] = getBlockManager().chooseTarget( 
2703        src, replication, clientNode, excludedNodes, blockSize, favoredNodes);
2704
2705    // Part II.
2706    // Allocate a new block, add it to the INode and the BlocksMap. 
2707    Block newBlock = null;
2708    long offset;
2709    checkOperation(OperationCategory.WRITE);
2710    writeLock();
2711    try {
2712      checkOperation(OperationCategory.WRITE);
2713      // Run the full analysis again, since things could have changed
2714      // while chooseTarget() was executing.
2715      LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2716      INodesInPath inodesInPath =
2717          analyzeFileState(src, fileId, clientName, previous, onRetryBlock);
2718      final INode[] inodes = inodesInPath.getINodes();
2719      final INodeFile pendingFile = inodes[inodes.length - 1].asFile();
2720
2721      if (onRetryBlock[0] != null) {
2722        if (onRetryBlock[0].getLocations().length > 0) {
2723          // This is a retry. Just return the last block if having locations.
2724          return onRetryBlock[0];
2725        } else {
2726          // add new chosen targets to already allocated block and return
2727          BlockInfo lastBlockInFile = pendingFile.getLastBlock();
2728          ((BlockInfoUnderConstruction) lastBlockInFile)
2729              .setExpectedLocations(targets);
2730          offset = pendingFile.computeFileSize();
2731          return makeLocatedBlock(lastBlockInFile, targets, offset);
2732        }
2733      }
2734
2735      // commit the last block and complete it if it has minimum replicas
2736      commitOrCompleteLastBlock(pendingFile,
2737                                ExtendedBlock.getLocalBlock(previous));
2738
2739      // allocate new block, record block locations in INode.
2740      newBlock = createNewBlock();
2741      saveAllocatedBlock(src, inodesInPath, newBlock, targets);
2742
2743      dir.persistNewBlock(src, pendingFile);
2744      offset = pendingFile.computeFileSize();
2745    } finally {
2746      writeUnlock();
2747    }
2748    getEditLog().logSync();
2749
2750    // Return located block
2751    return makeLocatedBlock(newBlock, targets, offset);
2752  }
2753
  /**
   * Validate the state of the file being written before allocating a new
   * block: checks safe mode, lease ownership, the fs-object limit, and that
   * the client's notion of the last block matches the namesystem's.
   * Detected retries are reported through {@code onRetryBlock[0]}.
   * Must be called with at least the read lock held.
   *
   * @param src resolved path of the file
   * @param fileId inode id claimed by the client
   * @param clientName lease holder
   * @param previous the block the client believes is the file's last block
   * @param onRetryBlock out-parameter; set to the already-allocated last
   *                     block when the call is recognized as a retry
   * @return the resolved INodes-in-path for {@code src}
   */
  INodesInPath analyzeFileState(String src,
                                long fileId,
                                String clientName,
                                ExtendedBlock previous,
                                LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();

    checkBlock(previous);
    onRetryBlock[0] = null;
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INodesInPath iip = dir.getINodesInPath4Write(src);
    final INodeFile pendingFile
        = checkLease(src, fileId, clientName, iip.getLastINode());
    BlockInfo lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at a block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget(). 
      //    There are no means to distinguish between the first and 
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() == pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        if (NameNode.stateChangeLog.isDebugEnabled()) {
           NameNode.stateChangeLog.debug(
               "BLOCK* NameSystem.allocateBlock: handling block allocation" +
               " writing to a file with a complete previous block: src=" +
               src + " lastBlock=" + lastBlockInFile);
        }
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          // Data was already written to the supposedly-retried block, so
          // this cannot be case 2; fail rather than lose that data.
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoUnderConstruction)lastBlockInFile).getExpectedStorageLocations(),
            offset);
        return iip;
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }

    // Check if the penultimate block is minimally replicated
    if (!checkFileProgress(pendingFile, false)) {
      throw new NotReplicatedYetException("Not replicated yet: " + src);
    }
    return iip;
  }
2846
2847  LocatedBlock makeLocatedBlock(Block blk, DatanodeStorageInfo[] locs,
2848                                        long offset) throws IOException {
2849    LocatedBlock lBlk = new LocatedBlock(
2850        getExtendedBlock(blk), locs, offset, false);
2851    getBlockManager().setBlockToken(
2852        lBlk, BlockTokenSecretManager.AccessMode.WRITE);
2853    return lBlk;
2854  }
2855
  /** 
   * Choose additional datanodes to replace failed ones in a write pipeline.
   * State is read under the read lock; target selection happens unlocked.
   * @see NameNode#getAdditionalDatanode(String, ExtendedBlock, DatanodeInfo[], DatanodeInfo[], int, String)
   */
  LocatedBlock getAdditionalDatanode(String src, final ExtendedBlock blk,
      final DatanodeInfo[] existings, final String[] storageIDs,
      final Set<Node> excludes,
      final int numAdditionalNodes, final String clientName
      ) throws IOException {
    //check if the feature is enabled
    dtpReplaceDatanodeOnFailure.checkEnabled();

    final DatanodeDescriptor clientnode;
    final long preferredblocksize;
    final List<DatanodeStorageInfo> chosen;
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      //check safe mode
      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
      src = FSDirectory.resolvePath(src, pathComponents, dir);

      //check lease
      final INodeFile file = checkLease(src, clientName);
      clientnode = file.getFileUnderConstructionFeature().getClientNode();
      preferredblocksize = file.getPreferredBlockSize();

      //find datanode storages
      final DatanodeManager dm = blockManager.getDatanodeManager();
      chosen = Arrays.asList(dm.getDatanodeStorageInfos(existings, storageIDs));
    } finally {
      readUnlock();
    }

    // choose new datanodes.
    final DatanodeStorageInfo[] targets = blockManager.getBlockPlacementPolicy(
        ).chooseTarget(src, numAdditionalNodes, clientnode, chosen, true,
            // TODO: get storage type from the file
        excludes, preferredblocksize, StorageType.DEFAULT);
    final LocatedBlock lb = new LocatedBlock(blk, targets);
    blockManager.setBlockToken(lb, AccessMode.COPY);
    return lb;
  }
2898
2899  /**
2900   * The client would like to let go of the given block
2901   */
2902  boolean abandonBlock(ExtendedBlock b, String src, String holder)
2903      throws LeaseExpiredException, FileNotFoundException,
2904      UnresolvedLinkException, IOException {
2905    if(NameNode.stateChangeLog.isDebugEnabled()) {
2906      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b
2907          + "of file " + src);
2908    }
2909    checkOperation(OperationCategory.WRITE);
2910    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2911    writeLock();
2912    try {
2913      checkOperation(OperationCategory.WRITE);
2914      checkNameNodeSafeMode("Cannot abandon block " + b + " for fle" + src);
2915      src = FSDirectory.resolvePath(src, pathComponents, dir);
2916
2917      //
2918      // Remove the block from the pending creates list
2919      //
2920      INodeFile file = checkLease(src, holder);
2921      boolean removed = dir.removeBlock(src, file,
2922          ExtendedBlock.getLocalBlock(b));
2923      if (!removed) {
2924        return true;
2925      }
2926      if(NameNode.stateChangeLog.isDebugEnabled()) {
2927        NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
2928                                      + b + " is removed from pendingCreates");
2929      }
2930      dir.persistBlocks(src, file, false);
2931    } finally {
2932      writeUnlock();
2933    }
2934    getEditLog().logSync();
2935
2936    return true;
2937  }
2938  
2939  /** make sure that we still have the lease on this file. */
2940  private INodeFile checkLease(String src, String holder)
2941      throws LeaseExpiredException, UnresolvedLinkException,
2942      FileNotFoundException {
2943    return checkLease(src, INodeId.GRANDFATHER_INODE_ID, holder,
2944        dir.getINode(src));
2945  }
2946  
2947  private INodeFile checkLease(String src, long fileId, String holder,
2948      INode inode) throws LeaseExpiredException, FileNotFoundException {
2949    assert hasReadLock();
2950    if (inode == null || !inode.isFile()) {
2951      Lease lease = leaseManager.getLease(holder);
2952      throw new LeaseExpiredException(
2953          "No lease on " + src + ": File does not exist. "
2954          + (lease != null ? lease.toString()
2955              : "Holder " + holder + " does not have any open files."));
2956    }
2957    final INodeFile file = inode.asFile();
2958    if (!file.isUnderConstruction()) {
2959      Lease lease = leaseManager.getLease(holder);
2960      throw new LeaseExpiredException(
2961          "No lease on " + src + ": File is not open for writing. "
2962          + (lease != null ? lease.toString()
2963              : "Holder " + holder + " does not have any open files."));
2964    }
2965    // No further modification is allowed on a deleted file.
2966    // A file is considered deleted, if it has no parent or is marked
2967    // as deleted in the snapshot feature.
2968    if (file.getParent() == null || (file.isWithSnapshot() &&
2969        file.getFileWithSnapshotFeature().isCurrentFileDeleted())) {
2970      throw new FileNotFoundException(src);
2971    }
2972    String clientName = file.getFileUnderConstructionFeature().getClientName();
2973    if (holder != null && !clientName.equals(holder)) {
2974      throw new LeaseExpiredException("Lease mismatch on " + src + " owned by "
2975          + clientName + " but is accessed by " + holder);
2976    }
2977    INodeId.checkId(fileId, file);
2978    return file;
2979  }
2980 
2981  /**
2982   * Complete in-progress write to the given file.
2983   * @return true if successful, false if the client should continue to retry
2984   *         (e.g if not all blocks have reached minimum replication yet)
2985   * @throws IOException on error (eg lease mismatch, file not open, file deleted)
2986   */
2987  boolean completeFile(String src, String holder,
2988                       ExtendedBlock last, long fileId)
2989    throws SafeModeException, UnresolvedLinkException, IOException {
2990    if (NameNode.stateChangeLog.isDebugEnabled()) {
2991      NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
2992          src + " for " + holder);
2993    }
2994    checkBlock(last);
2995    boolean success = false;
2996    checkOperation(OperationCategory.WRITE);
2997    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2998    writeLock();
2999    try {
3000      checkOperation(OperationCategory.WRITE);
3001      checkNameNodeSafeMode("Cannot complete file " + src);
3002      src = FSDirectory.resolvePath(src, pathComponents, dir);
3003      success = completeFileInternal(src, holder,
3004        ExtendedBlock.getLocalBlock(last), fileId);
3005    } finally {
3006      writeUnlock();
3007    }
3008    getEditLog().logSync();
3009    if (success) {
3010      NameNode.stateChangeLog.info("DIR* completeFile: " + src
3011          + " is closed by " + holder);
3012    }
3013    return success;
3014  }
3015
  /**
   * Complete the write to {@code src} while holding the FSNamesystem write
   * lock: verify the lease, commit/complete the last block, and finalize
   * the INode when all blocks have reached minimal replication.
   *
   * @param src resolved path of the file being closed
   * @param holder client name that should hold the lease
   * @param last the client's view of the file's last block (may be null)
   * @param fileId inode id of the file, or GRANDFATHER_INODE_ID for
   *               older clients that do not send one
   * @return true if the file was closed (or a retried close succeeded),
   *         false if the client should retry
   */
  private boolean completeFileInternal(String src, 
      String holder, Block last, long fileId) throws SafeModeException,
      UnresolvedLinkException, IOException {
    assert hasWriteLock();
    final INodesInPath iip = dir.getLastINodeInPath(src);
    final INodeFile pendingFile;
    try {
      pendingFile = checkLease(src, fileId, holder, iip.getINode(0));
    } catch (LeaseExpiredException lee) {
      final INode inode = dir.getINode(src);
      if (inode != null
          && inode.isFile()
          && !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete " + src +
              " which is already closed. But, it appears to be an RPC " +
              "retry. Returning success");
          return true;
        }
      }
      // Not a recognizable retry; surface the original lease failure.
      throw lee;
    }
    // Check the state of the penultimate block. It should be completed
    // before attempting to complete the last one.
    if (!checkFileProgress(pendingFile, false)) {
      return false;
    }

    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, last);

    // Re-check with checkall=true: the last block may still lack replicas.
    if (!checkFileProgress(pendingFile, true)) {
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        iip.getLatestSnapshotId());
    return true;
  }
3063
3064  /**
3065   * Save allocated block at the given pending filename
3066   * 
3067   * @param src path to the file
3068   * @param inodesInPath representing each of the components of src. 
3069   *                     The last INode is the INode for the file.
3070   * @throws QuotaExceededException If addition of block exceeds space quota
3071   */
3072  BlockInfo saveAllocatedBlock(String src, INodesInPath inodes,
3073      Block newBlock, DatanodeStorageInfo[] targets)
3074          throws IOException {
3075    assert hasWriteLock();
3076    BlockInfo b = dir.addBlock(src, inodes, newBlock, targets);
3077    NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + src + ". "
3078        + getBlockPoolId() + " " + b);
3079    DatanodeStorageInfo.incrementBlocksScheduled(targets);
3080    return b;
3081  }
3082
3083  /**
3084   * Create new block with a unique block id and a new generation stamp.
3085   */
3086  Block createNewBlock() throws IOException {
3087    assert hasWriteLock();
3088    Block b = new Block(nextBlockId(), 0, 0);
3089    // Increment the generation stamp for every new block.
3090    b.setGenerationStamp(nextGenerationStamp(false));
3091    return b;
3092  }
3093
3094  /**
3095   * Check that the indicated file's blocks are present and
3096   * replicated.  If not, return false. If checkall is true, then check
3097   * all blocks, otherwise check only penultimate block.
3098   */
3099  boolean checkFileProgress(INodeFile v, boolean checkall) {
3100    readLock();
3101    try {
3102      if (checkall) {
3103        //
3104        // check all blocks of the file.
3105        //
3106        for (BlockInfo block: v.getBlocks()) {
3107          if (!block.isComplete()) {
3108            LOG.info("BLOCK* checkFileProgress: " + block
3109                + " has not reached minimal replication "
3110                + blockManager.minReplication);
3111            return false;
3112          }
3113        }
3114      } else {
3115        //
3116        // check the penultimate block of this file
3117        //
3118        BlockInfo b = v.getPenultimateBlock();
3119        if (b != null && !b.isComplete()) {
3120          LOG.warn("BLOCK* checkFileProgress: " + b
3121              + " has not reached minimal replication "
3122              + blockManager.minReplication);
3123          return false;
3124        }
3125      }
3126      return true;
3127    } finally {
3128      readUnlock();
3129    }
3130  }
3131
3132  ////////////////////////////////////////////////////////////////
3133  // Here's how to handle block-copy failure during client write:
3134  // -- As usual, the client's write should result in a streaming
3135  // backup write to a k-machine sequence.
3136  // -- If one of the backup machines fails, no worries.  Fail silently.
3137  // -- Before client is allowed to close and finalize file, make sure
3138  // that the blocks are backed up.  Namenode may have to issue specific backup
3139  // commands to make up for earlier datanode failures.  Once all copies
3140  // are made, edit namespace and return to client.
3141  ////////////////////////////////////////////////////////////////
3142
3143  /** 
3144   * Change the indicated filename. 
3145   * @deprecated Use {@link #renameTo(String, String, Options.Rename...)} instead.
3146   */
3147  @Deprecated
3148  boolean renameTo(String src, String dst) 
3149      throws IOException, UnresolvedLinkException {
3150    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3151    if (cacheEntry != null && cacheEntry.isSuccess()) {
3152      return true; // Return previous response
3153    }
3154    boolean ret = false;
3155    try {
3156      ret = renameToInt(src, dst, cacheEntry != null);
3157    } catch (AccessControlException e) {
3158      logAuditEvent(false, "rename", src, dst, null);
3159      throw e;
3160    } finally {
3161      RetryCache.setState(cacheEntry, ret);
3162    }
3163    return ret;
3164  }
3165
3166  private boolean renameToInt(String src, String dst, boolean logRetryCache) 
3167    throws IOException, UnresolvedLinkException {
3168    if (NameNode.stateChangeLog.isDebugEnabled()) {
3169      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src +
3170          " to " + dst);
3171    }
3172    if (!DFSUtil.isValidName(dst)) {
3173      throw new IOException("Invalid name: " + dst);
3174    }
3175    FSPermissionChecker pc = getPermissionChecker();
3176    checkOperation(OperationCategory.WRITE);
3177    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
3178    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
3179    boolean status = false;
3180    HdfsFileStatus resultingStat = null;
3181    writeLock();
3182    try {
3183      checkOperation(OperationCategory.WRITE);
3184      checkNameNodeSafeMode("Cannot rename " + src);
3185      src = FSDirectory.resolvePath(src, srcComponents, dir);
3186      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
3187      checkOperation(OperationCategory.WRITE);
3188      status = renameToInternal(pc, src, dst, logRetryCache);
3189      if (status) {
3190        resultingStat = getAuditFileInfo(dst, false);
3191      }
3192    } finally {
3193      writeUnlock();
3194    }
3195    getEditLog().logSync();
3196    if (status) {
3197      logAuditEvent(true, "rename", src, dst, resultingStat);
3198    }
3199    return status;
3200  }
3201
3202  /** @deprecated See {@link #renameTo(String, String)} */
3203  @Deprecated
3204  private boolean renameToInternal(FSPermissionChecker pc, String src,
3205      String dst, boolean logRetryCache) throws IOException,
3206      UnresolvedLinkException {
3207    assert hasWriteLock();
3208    if (isPermissionEnabled) {
3209      //We should not be doing this.  This is move() not renameTo().
3210      //but for now,
3211      //NOTE: yes, this is bad!  it's assuming much lower level behavior
3212      //      of rewriting the dst
3213      String actualdst = dir.isDir(dst)?
3214          dst + Path.SEPARATOR + new Path(src).getName(): dst;
3215      // Rename does not operates on link targets
3216      // Do not resolveLink when checking permissions of src and dst
3217      // Check write access to parent of src
3218      checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
3219      // Check write access to ancestor of dst
3220      checkPermission(pc, actualdst, false, FsAction.WRITE, null, null, null,
3221          false);
3222    }
3223
3224    if (dir.renameTo(src, dst, logRetryCache)) {
3225      return true;
3226    }
3227    return false;
3228  }
3229  
3230
3231  /** Rename src to dst */
3232  void renameTo(String src, String dst, Options.Rename... options)
3233      throws IOException, UnresolvedLinkException {
3234    if (NameNode.stateChangeLog.isDebugEnabled()) {
3235      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: with options - "
3236          + src + " to " + dst);
3237    }
3238    if (!DFSUtil.isValidName(dst)) {
3239      throw new InvalidPathException("Invalid name: " + dst);
3240    }
3241    final FSPermissionChecker pc = getPermissionChecker();
3242    
3243    checkOperation(OperationCategory.WRITE);
3244    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3245    if (cacheEntry != null && cacheEntry.isSuccess()) {
3246      return; // Return previous response
3247    }
3248    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
3249    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
3250    HdfsFileStatus resultingStat = null;
3251    boolean success = false;
3252    writeLock();
3253    try {
3254      checkOperation(OperationCategory.WRITE);
3255      checkNameNodeSafeMode("Cannot rename " + src);
3256      src = FSDirectory.resolvePath(src, srcComponents, dir);
3257      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
3258      renameToInternal(pc, src, dst, cacheEntry != null, options);
3259      resultingStat = getAuditFileInfo(dst, false);
3260      success = true;
3261    } finally {
3262      writeUnlock();
3263      RetryCache.setState(cacheEntry, success);
3264    }
3265    getEditLog().logSync();
3266    if (resultingStat != null) {
3267      StringBuilder cmd = new StringBuilder("rename options=");
3268      for (Rename option : options) {
3269        cmd.append(option.value()).append(" ");
3270      }
3271      logAuditEvent(true, cmd.toString(), src, dst, resultingStat);
3272    }
3273  }
3274
3275  private void renameToInternal(FSPermissionChecker pc, String src, String dst,
3276      boolean logRetryCache, Options.Rename... options) throws IOException {
3277    assert hasWriteLock();
3278    if (isPermissionEnabled) {
3279      // Rename does not operates on link targets
3280      // Do not resolveLink when checking permissions of src and dst
3281      // Check write access to parent of src
3282      checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
3283      // Check write access to ancestor of dst
3284      checkPermission(pc, dst, false, FsAction.WRITE, null, null, null, false);
3285    }
3286
3287    dir.renameTo(src, dst, logRetryCache, options);
3288  }
3289  
3290  /**
3291   * Remove the indicated file from namespace.
3292   * 
3293   * @see ClientProtocol#delete(String, boolean) for detailed description and 
3294   * description of exceptions
3295   */
3296  boolean delete(String src, boolean recursive)
3297      throws AccessControlException, SafeModeException,
3298      UnresolvedLinkException, IOException {
3299    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3300    if (cacheEntry != null && cacheEntry.isSuccess()) {
3301      return true; // Return previous response
3302    }
3303    boolean ret = false;
3304    try {
3305      ret = deleteInt(src, recursive, cacheEntry != null);
3306    } catch (AccessControlException e) {
3307      logAuditEvent(false, "delete", src);
3308      throw e;
3309    } finally {
3310      RetryCache.setState(cacheEntry, ret);
3311    }
3312    return ret;
3313  }
3314      
3315  private boolean deleteInt(String src, boolean recursive, boolean logRetryCache)
3316      throws AccessControlException, SafeModeException,
3317      UnresolvedLinkException, IOException {
3318    if (NameNode.stateChangeLog.isDebugEnabled()) {
3319      NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src);
3320    }
3321    boolean status = deleteInternal(src, recursive, true, logRetryCache);
3322    if (status) {
3323      logAuditEvent(true, "delete", src);
3324    }
3325    return status;
3326  }
3327    
3328  private FSPermissionChecker getPermissionChecker()
3329      throws AccessControlException {
3330    try {
3331      return new FSPermissionChecker(fsOwnerShortUserName, supergroup, getRemoteUser());
3332    } catch (IOException ioe) {
3333      throw new AccessControlException(ioe);
3334    }
3335  }
3336  
3337  /**
3338   * Remove a file/directory from the namespace.
3339   * <p>
3340   * For large directories, deletion is incremental. The blocks under
3341   * the directory are collected and deleted a small number at a time holding
3342   * the {@link FSNamesystem} lock.
3343   * <p>
3344   * For small directory or file the deletion is done in one shot.
3345   * 
3346   * @see ClientProtocol#delete(String, boolean) for description of exceptions
3347   */
3348  private boolean deleteInternal(String src, boolean recursive,
3349      boolean enforcePermission, boolean logRetryCache)
3350      throws AccessControlException, SafeModeException, UnresolvedLinkException,
3351             IOException {
3352    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
3353    List<INode> removedINodes = new ChunkedArrayList<INode>();
3354    FSPermissionChecker pc = getPermissionChecker();
3355    checkOperation(OperationCategory.WRITE);
3356    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3357    boolean ret = false;
3358    writeLock();
3359    try {
3360      checkOperation(OperationCategory.WRITE);
3361      checkNameNodeSafeMode("Cannot delete " + src);
3362      src = FSDirectory.resolvePath(src, pathComponents, dir);
3363      if (!recursive && dir.isNonEmptyDirectory(src)) {
3364        throw new IOException(src + " is non empty");
3365      }
3366      if (enforcePermission && isPermissionEnabled) {
3367        checkPermission(pc, src, false, null, FsAction.WRITE, null,
3368            FsAction.ALL, false);
3369      }
3370      // Unlink the target directory from directory tree
3371      if (!dir.delete(src, collectedBlocks, removedINodes, logRetryCache)) {
3372        return false;
3373      }
3374      ret = true;
3375    } finally {
3376      writeUnlock();
3377    }
3378    getEditLog().logSync(); 
3379    removeBlocks(collectedBlocks); // Incremental deletion of blocks
3380    collectedBlocks.clear();
3381
3382    dir.writeLock();
3383    try {
3384      dir.removeFromInodeMap(removedINodes);
3385    } finally {
3386      dir.writeUnlock();
3387    }
3388    removedINodes.clear();
3389    if (NameNode.stateChangeLog.isDebugEnabled()) {
3390      NameNode.stateChangeLog.debug("DIR* Namesystem.delete: "
3391        + src +" is removed");
3392    }
3393    return ret;
3394  }
3395
3396  /**
3397   * From the given list, incrementally remove the blocks from blockManager
3398   * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3399   * ensure that other waiters on the lock can get in. See HDFS-2938
3400   * 
3401   * @param blocks
3402   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3403   *          of blocks that need to be removed from blocksMap
3404   */
3405  void removeBlocks(BlocksMapUpdateInfo blocks) {
3406    List<Block> toDeleteList = blocks.getToDeleteList();
3407    Iterator<Block> iter = toDeleteList.iterator();
3408    while (iter.hasNext()) {
3409      writeLock();
3410      try {
3411        for (int i = 0; i < BLOCK_DELETION_INCREMENT && iter.hasNext(); i++) {
3412          blockManager.removeBlock(iter.next());
3413        }
3414      } finally {
3415        writeUnlock();
3416      }
3417    }
3418  }
3419  
3420  /**
3421   * Remove leases, inodes and blocks related to a given path
3422   * @param src The given path
3423   * @param blocks Containing the list of blocks to be deleted from blocksMap
3424   * @param removedINodes Containing the list of inodes to be removed from 
3425   *                      inodesMap
3426   */
3427  void removePathAndBlocks(String src, BlocksMapUpdateInfo blocks,
3428      List<INode> removedINodes) {
3429    assert hasWriteLock();
3430    leaseManager.removeLeaseWithPrefixPath(src);
3431    // remove inodes from inodesMap
3432    if (removedINodes != null) {
3433      dir.removeFromInodeMap(removedINodes);
3434      removedINodes.clear();
3435    }
3436    if (blocks == null) {
3437      return;
3438    }
3439    
3440    removeBlocksAndUpdateSafemodeTotal(blocks);
3441  }
3442
3443  /**
3444   * Removes the blocks from blocksmap and updates the safemode blocks total
3445   * 
3446   * @param blocks
3447   *          An instance of {@link BlocksMapUpdateInfo} which contains a list
3448   *          of blocks that need to be removed from blocksMap
3449   */
3450  void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3451    assert hasWriteLock();
3452    // In the case that we are a Standby tailing edits from the
3453    // active while in safe-mode, we need to track the total number
3454    // of blocks and safe blocks in the system.
3455    boolean trackBlockCounts = isSafeModeTrackingBlocks();
3456    int numRemovedComplete = 0, numRemovedSafe = 0;
3457
3458    for (Block b : blocks.getToDeleteList()) {
3459      if (trackBlockCounts) {
3460        BlockInfo bi = getStoredBlock(b);
3461        if (bi.isComplete()) {
3462          numRemovedComplete++;
3463          if (bi.numNodes() >= blockManager.minReplication) {
3464            numRemovedSafe++;
3465          }
3466        }
3467      }
3468      blockManager.removeBlock(b);
3469    }
3470    if (trackBlockCounts) {
3471      if (LOG.isDebugEnabled()) {
3472        LOG.debug("Adjusting safe-mode totals for deletion."
3473            + "decreasing safeBlocks by " + numRemovedSafe
3474            + ", totalBlocks by " + numRemovedComplete);
3475      }
3476      adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3477    }
3478  }
3479
3480  /**
3481   * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3482   */
3483  private boolean isSafeModeTrackingBlocks() {
3484    if (!haEnabled) {
3485      // Never track blocks incrementally in non-HA code.
3486      return false;
3487    }
3488    SafeModeInfo sm = this.safeMode;
3489    return sm != null && sm.shouldIncrementallyTrackBlocks();
3490  }
3491
3492  /**
3493   * Get the file info for a specific file.
3494   *
3495   * @param src The string representation of the path to the file
3496   * @param resolveLink whether to throw UnresolvedLinkException 
3497   *        if src refers to a symlink
3498   *
3499   * @throws AccessControlException if access is denied
3500   * @throws UnresolvedLinkException if a symlink is encountered.
3501   *
3502   * @return object containing information regarding the file
3503   *         or null if file not found
3504   * @throws StandbyException 
3505   */
3506  HdfsFileStatus getFileInfo(String src, boolean resolveLink) 
3507    throws AccessControlException, UnresolvedLinkException,
3508           StandbyException, IOException {
3509    if (!DFSUtil.isValidName(src)) {
3510      throw new InvalidPathException("Invalid file name: " + src);
3511    }
3512    HdfsFileStatus stat = null;
3513    FSPermissionChecker pc = getPermissionChecker();
3514    checkOperation(OperationCategory.READ);
3515    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3516    readLock();
3517    try {
3518      checkOperation(OperationCategory.READ);
3519      src = FSDirectory.resolvePath(src, pathComponents, dir);
3520      if (isPermissionEnabled) {
3521        checkPermission(pc, src, false, null, null, null, null, resolveLink);
3522      }
3523      stat = dir.getFileInfo(src, resolveLink);
3524    } catch (AccessControlException e) {
3525      logAuditEvent(false, "getfileinfo", src);
3526      throw e;
3527    } finally {
3528      readUnlock();
3529    }
3530    logAuditEvent(true, "getfileinfo", src);
3531    return stat;
3532  }
3533  
3534  /**
3535   * Returns true if the file is closed
3536   */
3537  boolean isFileClosed(String src) 
3538      throws AccessControlException, UnresolvedLinkException,
3539      StandbyException, IOException {
3540    FSPermissionChecker pc = getPermissionChecker();  
3541    checkOperation(OperationCategory.READ);
3542    readLock();
3543    try {
3544      checkOperation(OperationCategory.READ);
3545      if (isPermissionEnabled) {
3546        checkTraverse(pc, src);
3547      }
3548      return !INodeFile.valueOf(dir.getINode(src), src).isUnderConstruction();
3549    } catch (AccessControlException e) {
3550      if (isAuditEnabled() && isExternalInvocation()) {
3551        logAuditEvent(false, "isFileClosed", src);
3552      }
3553      throw e;
3554    } finally {
3555      readUnlock();
3556    }
3557  }
3558
3559  /**
3560   * Create all the necessary directories
3561   */
3562  boolean mkdirs(String src, PermissionStatus permissions,
3563      boolean createParent) throws IOException, UnresolvedLinkException {
3564    boolean ret = false;
3565    try {
3566      ret = mkdirsInt(src, permissions, createParent);
3567    } catch (AccessControlException e) {
3568      logAuditEvent(false, "mkdirs", src);
3569      throw e;
3570    }
3571    return ret;
3572  }
3573
3574  private boolean mkdirsInt(String src, PermissionStatus permissions,
3575      boolean createParent) throws IOException, UnresolvedLinkException {
3576    if(NameNode.stateChangeLog.isDebugEnabled()) {
3577      NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
3578    }
3579    if (!DFSUtil.isValidName(src)) {
3580      throw new InvalidPathException(src);
3581    }
3582    FSPermissionChecker pc = getPermissionChecker();
3583    checkOperation(OperationCategory.WRITE);
3584    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3585    HdfsFileStatus resultingStat = null;
3586    boolean status = false;
3587    writeLock();
3588    try {
3589      checkOperation(OperationCategory.WRITE);   
3590      checkNameNodeSafeMode("Cannot create directory " + src);
3591      src = FSDirectory.resolvePath(src, pathComponents, dir);
3592      status = mkdirsInternal(pc, src, permissions, createParent);
3593      if (status) {
3594        resultingStat = dir.getFileInfo(src, false);
3595      }
3596    } finally {
3597      writeUnlock();
3598    }
3599    getEditLog().logSync();
3600    if (status) {
3601      logAuditEvent(true, "mkdirs", src, null, resultingStat);
3602    }
3603    return status;
3604  }
3605    
3606  /**
3607   * Create all the necessary directories
3608   */
3609  private boolean mkdirsInternal(FSPermissionChecker pc, String src,
3610      PermissionStatus permissions, boolean createParent) 
3611      throws IOException, UnresolvedLinkException {
3612    assert hasWriteLock();
3613    if (isPermissionEnabled) {
3614      checkTraverse(pc, src);
3615    }
3616    if (dir.isDirMutable(src)) {
3617      // all the users of mkdirs() are used to expect 'true' even if
3618      // a new directory is not created.
3619      return true;
3620    }
3621    if (isPermissionEnabled) {
3622      checkAncestorAccess(pc, src, FsAction.WRITE);
3623    }
3624    if (!createParent) {
3625      verifyParentDir(src);
3626    }
3627
3628    // validate that we have enough inodes. This is, at best, a 
3629    // heuristic because the mkdirs() operation might need to 
3630    // create multiple inodes.
3631    checkFsObjectLimit();
3632
3633    if (!dir.mkdirs(src, permissions, false, now())) {
3634      throw new IOException("Failed to create directory: " + src);
3635    }
3636    return true;
3637  }
3638
3639  /**
3640   * Get the content summary for a specific file/dir.
3641   *
3642   * @param src The string representation of the path to the file
3643   *
3644   * @throws AccessControlException if access is denied
3645   * @throws UnresolvedLinkException if a symlink is encountered.
3646   * @throws FileNotFoundException if no file exists
3647   * @throws StandbyException
3648   * @throws IOException for issues with writing to the audit log
3649   *
3650   * @return object containing information regarding the file
3651   *         or null if file not found
3652   */
3653  ContentSummary getContentSummary(String src) throws IOException {
3654    FSPermissionChecker pc = getPermissionChecker();
3655    checkOperation(OperationCategory.READ);
3656    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3657    readLock();
3658    boolean success = true;
3659    try {
3660      checkOperation(OperationCategory.READ);
3661      src = FSDirectory.resolvePath(src, pathComponents, dir);
3662      if (isPermissionEnabled) {
3663        checkPermission(pc, src, false, null, null, null, FsAction.READ_EXECUTE);
3664      }
3665      return dir.getContentSummary(src);
3666
3667    } catch (AccessControlException ace) {
3668      success = false;
3669      throw ace;
3670    } finally {
3671      readUnlock();
3672      logAuditEvent(success, "contentSummary", src);
3673    }
3674  }
3675
3676  /**
3677   * Set the namespace quota and diskspace quota for a directory.
3678   * See {@link ClientProtocol#setQuota(String, long, long)} for the 
3679   * contract.
3680   * 
3681   * Note: This does not support ".inodes" relative path.
3682   */
3683  void setQuota(String path, long nsQuota, long dsQuota) 
3684      throws IOException, UnresolvedLinkException {
3685    checkSuperuserPrivilege();
3686    checkOperation(OperationCategory.WRITE);
3687    writeLock();
3688    try {
3689      checkOperation(OperationCategory.WRITE);
3690      checkNameNodeSafeMode("Cannot set quota on " + path);
3691      dir.setQuota(path, nsQuota, dsQuota);
3692    } finally {
3693      writeUnlock();
3694    }
3695    getEditLog().logSync();
3696  }
3697
3698  /** Persist all metadata about this file.
3699   * @param src The string representation of the path
3700   * @param clientName The string representation of the client
3701   * @param lastBlockLength The length of the last block 
3702   *                        under construction reported from client.
3703   * @throws IOException if path does not exist
3704   */
3705  void fsync(String src, String clientName, long lastBlockLength) 
3706      throws IOException, UnresolvedLinkException {
3707    NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
3708    checkOperation(OperationCategory.WRITE);
3709    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3710    writeLock();
3711    try {
3712      checkOperation(OperationCategory.WRITE);
3713      checkNameNodeSafeMode("Cannot fsync file " + src);
3714      src = FSDirectory.resolvePath(src, pathComponents, dir);
3715      INodeFile pendingFile  = checkLease(src, clientName);
3716      if (lastBlockLength > 0) {
3717        pendingFile.getFileUnderConstructionFeature().updateLengthOfLastBlock(
3718            pendingFile, lastBlockLength);
3719      }
3720      dir.persistBlocks(src, pendingFile, false);
3721    } finally {
3722      writeUnlock();
3723    }
3724    getEditLog().logSync();
3725  }
3726
3727  /**
3728   * Move a file that is being written to be immutable.
3729   * @param src The filename
3730   * @param lease The lease for the client creating the file
3731   * @param recoveryLeaseHolder reassign lease to this holder if the last block
3732   *        needs recovery; keep current holder if null.
3733   * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
3734   *         replication;<br>
3735   *         RecoveryInProgressException if lease recovery is in progress.<br>
3736   *         IOException in case of an error.
3737   * @return true  if file has been successfully finalized and closed or 
3738   *         false if block recovery has been initiated. Since the lease owner
3739   *         has been changed and logged, caller should call logSync().
3740   */
  boolean internalReleaseLease(Lease lease, String src, 
      String recoveryLeaseHolder) throws AlreadyBeingCreatedException, 
      IOException, UnresolvedLinkException {
    LOG.info("Recovering " + lease + ", src=" + src);
    // Caller must hold the FSN write lock and must not be in safe mode.
    assert !isInSafeMode();
    assert hasWriteLock();

    final INodesInPath iip = dir.getLastINodeInPath(src);
    final INodeFile pendingFile = iip.getINode(0).asFile();
    int nrBlocks = pendingFile.numBlocks();
    BlockInfo[] blocks = pendingFile.getBlocks();

    // Count the prefix of COMPLETE blocks; curBlock is left pointing at the
    // first non-COMPLETE block, if any.
    int nrCompleteBlocks;
    BlockInfo curBlock = null;
    for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
      curBlock = blocks[nrCompleteBlocks];
      if(!curBlock.isComplete())
        break;
      assert blockManager.checkMinReplication(curBlock) :
              "A COMPLETE block is not minimally replicated in " + src;
    }

    // If there are no incomplete blocks associated with this file,
    // then reap lease immediately and close the file.
    if(nrCompleteBlocks == nrBlocks) {
      finalizeINodeFileUnderConstruction(src, pendingFile,
          iip.getLatestSnapshotId());
      NameNode.stateChangeLog.warn("BLOCK*"
        + " internalReleaseLease: All existing blocks are COMPLETE,"
        + " lease removed, file closed.");
      return true;  // closed!
    }

    // Only the last and the penultimate blocks may be in non COMPLETE state.
    // If the penultimate block is not COMPLETE, then it must be COMMITTED.
    // Anything else indicates an inconsistent file and recovery is refused.
    if(nrCompleteBlocks < nrBlocks - 2 ||
       nrCompleteBlocks == nrBlocks - 2 &&
         curBlock != null &&
         curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
      final String message = "DIR* NameSystem.internalReleaseLease: "
        + "attempt to release a create lock on "
        + src + " but file is already closed.";
      NameNode.stateChangeLog.warn(message);
      throw new IOException(message);
    }

    // The last block is not COMPLETE, and
    // that the penultimate block if exists is either COMPLETE or COMMITTED
    final BlockInfo lastBlock = pendingFile.getLastBlock();
    BlockUCState lastBlockState = lastBlock.getBlockUCState();
    BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
    boolean penultimateBlockMinReplication;
    BlockUCState penultimateBlockState;
    if (penultimateBlock == null) {
      penultimateBlockState = BlockUCState.COMPLETE;
      // If penultimate block doesn't exist then its minReplication is met
      penultimateBlockMinReplication = true;
    } else {
      penultimateBlockState = BlockUCState.COMMITTED;
      penultimateBlockMinReplication = 
        blockManager.checkMinReplication(penultimateBlock);
    }
    assert penultimateBlockState == BlockUCState.COMPLETE ||
           penultimateBlockState == BlockUCState.COMMITTED :
           "Unexpected state of penultimate block in " + src;

    switch(lastBlockState) {
    case COMPLETE:
      // Unreachable: the prefix scan above already found an incomplete block.
      assert false : "Already checked that the last block is incomplete";
      break;
    case COMMITTED:
      // Close file if committed blocks are minimally replicated
      if(penultimateBlockMinReplication &&
          blockManager.checkMinReplication(lastBlock)) {
        finalizeINodeFileUnderConstruction(src, pendingFile,
            iip.getLatestSnapshotId());
        NameNode.stateChangeLog.warn("BLOCK*"
          + " internalReleaseLease: Committed blocks are minimally replicated,"
          + " lease removed, file closed.");
        return true;  // closed!
      }
      // Cannot close file right now, since some blocks 
      // are not yet minimally replicated.
      // This may potentially cause infinite loop in lease recovery
      // if there are no valid replicas on data-nodes.
      String message = "DIR* NameSystem.internalReleaseLease: " +
          "Failed to release lease for file " + src +
          ". Committed blocks are waiting to be minimally replicated." +
          " Try again later.";
      NameNode.stateChangeLog.warn(message);
      throw new AlreadyBeingCreatedException(message);
    case UNDER_CONSTRUCTION:
    case UNDER_RECOVERY:
      final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)lastBlock;
      // setup the last block locations from the blockManager if not known
      if (uc.getNumExpectedLocations() == 0) {
        uc.setExpectedLocations(blockManager.getStorages(lastBlock));
      }

      if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
        // There is no datanode reported to this block.
        // may be client have crashed before writing data to pipeline.
        // This blocks doesn't need any recovery.
        // We can remove this block and close the file.
        pendingFile.removeLastBlock(lastBlock);
        finalizeINodeFileUnderConstruction(src, pendingFile,
            iip.getLatestSnapshotId());
        NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
            + "Removed empty last block and closed file.");
        return true;
      }
      // start recovery of the last block for this file
      long blockRecoveryId = nextGenerationStamp(isLegacyBlock(uc));
      // Reassigning the lease logs an (unsynced) edit; caller must logSync().
      lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
      uc.initializeBlockRecovery(blockRecoveryId);
      leaseManager.renewLease(lease);
      // Cannot close file right now, since the last block requires recovery.
      // This may potentially cause infinite loop in lease recovery
      // if there are no valid replicas on data-nodes.
      NameNode.stateChangeLog.warn(
                "DIR* NameSystem.internalReleaseLease: " +
                "File " + src + " has not been closed." +
               " Lease recovery is in progress. " +
                "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
      break;
    }
    return false;
  }
3869
3870  private Lease reassignLease(Lease lease, String src, String newHolder,
3871      INodeFile pendingFile) {
3872    assert hasWriteLock();
3873    if(newHolder == null)
3874      return lease;
3875    // The following transaction is not synced. Make sure it's sync'ed later.
3876    logReassignLease(lease.getHolder(), src, newHolder);
3877    return reassignLeaseInternal(lease, src, newHolder, pendingFile);
3878  }
3879  
3880  Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
3881      INodeFile pendingFile) {
3882    assert hasWriteLock();
3883    pendingFile.getFileUnderConstructionFeature().setClientName(newHolder);
3884    return leaseManager.reassignLease(lease, src, newHolder);
3885  }
3886
3887  private void commitOrCompleteLastBlock(final INodeFile fileINode,
3888      final Block commitBlock) throws IOException {
3889    assert hasWriteLock();
3890    Preconditions.checkArgument(fileINode.isUnderConstruction());
3891    if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
3892      return;
3893    }
3894
3895    // Adjust disk space consumption if required
3896    final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();    
3897    if (diff > 0) {
3898      try {
3899        String path = fileINode.getFullPathName();
3900        dir.updateSpaceConsumed(path, 0, -diff*fileINode.getFileReplication());
3901      } catch (IOException e) {
3902        LOG.warn("Unexpected exception while updating disk space.", e);
3903      }
3904    }
3905  }
3906
3907  private void finalizeINodeFileUnderConstruction(String src,
3908      INodeFile pendingFile, int latestSnapshot) throws IOException,
3909      UnresolvedLinkException {
3910    assert hasWriteLock();
3911    FileUnderConstructionFeature uc = pendingFile.getFileUnderConstructionFeature();
3912    Preconditions.checkArgument(uc != null);
3913    leaseManager.removeLease(uc.getClientName(), src);
3914    
3915    pendingFile = pendingFile.recordModification(latestSnapshot);
3916
3917    // The file is no longer pending.
3918    // Create permanent INode, update blocks. No need to replace the inode here
3919    // since we just remove the uc feature from pendingFile
3920    final INodeFile newFile = pendingFile.toCompleteFile(now());
3921
3922    // close file and persist block allocations for this file
3923    dir.closeFile(src, newFile);
3924
3925    blockManager.checkReplication(newFile);
3926  }
3927
3928  @VisibleForTesting
3929  BlockInfo getStoredBlock(Block block) {
3930    return blockManager.getStoredBlock(block);
3931  }
3932  
3933  @Override
3934  public boolean isInSnapshot(BlockInfoUnderConstruction blockUC) {
3935    assert hasReadLock();
3936    final BlockCollection bc = blockUC.getBlockCollection();
3937    if (bc == null || !(bc instanceof INodeFile)
3938        || !((INodeFile) bc).isUnderConstruction()) {
3939      return false;
3940    }
3941
3942    INodeFile inodeUC = (INodeFile) bc;
3943    String fullName = inodeUC.getName();
3944    try {
3945      if (fullName != null && fullName.startsWith(Path.SEPARATOR)
3946          && dir.getINode(fullName) == inodeUC) {
3947        // If file exists in normal path then no need to look in snapshot
3948        return false;
3949      }
3950    } catch (UnresolvedLinkException e) {
3951      LOG.error("Error while resolving the link : " + fullName, e);
3952      return false;
3953    }
3954    /*
3955     * 1. if bc is an instance of INodeFileUnderConstructionWithSnapshot, and
3956     * bc is not in the current fsdirectory tree, bc must represent a snapshot
3957     * file. 
3958     * 2. if fullName is not an absolute path, bc cannot be existent in the 
3959     * current fsdirectory tree. 
3960     * 3. if bc is not the current node associated with fullName, bc must be a
3961     * snapshot inode.
3962     */
3963    return true;
3964  }
3965
  // Called by a datanode's primary recovery worker to report the outcome of
  // block recovery: updates the last block's generation stamp/length and,
  // when closeFile is set, closes the file. Idempotent for retried calls.
  void commitBlockSynchronization(ExtendedBlock lastblock,
      long newgenerationstamp, long newlength,
      boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
      String[] newtargetstorages)
      throws IOException, UnresolvedLinkException {
    LOG.info("commitBlockSynchronization(lastblock=" + lastblock
             + ", newgenerationstamp=" + newgenerationstamp
             + ", newlength=" + newlength
             + ", newtargets=" + Arrays.asList(newtargets)
             + ", closeFile=" + closeFile
             + ", deleteBlock=" + deleteblock
             + ")");
    checkOperation(OperationCategory.WRITE);
    // src is captured inside the lock so it can be logged after release.
    String src = "";
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      // If a DN tries to commit to the standby, the recovery will
      // fail, and the next retry will succeed on the new NN.
  
      checkNameNodeSafeMode(
          "Cannot commitBlockSynchronization while in safe mode");
      final BlockInfo storedBlock = getStoredBlock(
          ExtendedBlock.getLocalBlock(lastblock));
      if (storedBlock == null) {
        if (deleteblock) {
          // This may be a retry attempt so ignore the failure
          // to locate the block.
          if (LOG.isDebugEnabled()) {
            LOG.debug("Block (=" + lastblock + ") not found");
          }
          return;
        } else {
          throw new IOException("Block (=" + lastblock + ") not found");
        }
      }
      INodeFile iFile = ((INode)storedBlock.getBlockCollection()).asFile();
      // Retried commit after the file was already closed: nothing to do.
      if (!iFile.isUnderConstruction() || storedBlock.isComplete()) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Unexpected block (=" + lastblock
                    + ") since the file (=" + iFile.getLocalName()
                    + ") is not under construction");
        }
        return;
      }

      // Reject stale recovery attempts: the reported generation stamp must
      // match the recovery id handed out by internalReleaseLease.
      long recoveryId =
        ((BlockInfoUnderConstruction)storedBlock).getBlockRecoveryId();
      if(recoveryId != newgenerationstamp) {
        throw new IOException("The recovery id " + newgenerationstamp
                              + " does not match current recovery id "
                              + recoveryId + " for block " + lastblock); 
      }

      if (deleteblock) {
        Block blockToDel = ExtendedBlock.getLocalBlock(lastblock);
        boolean remove = iFile.removeLastBlock(blockToDel);
        if (remove) {
          blockManager.removeBlockFromMap(storedBlock);
        }
      }
      else {
        // update last block
        storedBlock.setGenerationStamp(newgenerationstamp);
        storedBlock.setNumBytes(newlength);

        // find the DatanodeDescriptor objects
        // There should be no locations in the blockManager till now because the
        // file is underConstruction
        ArrayList<DatanodeDescriptor> trimmedTargets =
            new ArrayList<DatanodeDescriptor>(newtargets.length);
        ArrayList<String> trimmedStorages =
            new ArrayList<String>(newtargets.length);
        if (newtargets.length > 0) {
          for (int i = 0; i < newtargets.length; ++i) {
            // try to get targetNode; unknown datanodes are silently dropped
            DatanodeDescriptor targetNode =
                blockManager.getDatanodeManager().getDatanode(newtargets[i]);
            if (targetNode != null) {
              trimmedTargets.add(targetNode);
              trimmedStorages.add(newtargetstorages[i]);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug("DatanodeDescriptor (=" + newtargets[i] + ") not found");
            }
          }
        }
        if ((closeFile) && !trimmedTargets.isEmpty()) {
          // the file is getting closed. Insert block locations into blockManager.
          // Otherwise fsck will report these blocks as MISSING, especially if the
          // blocksReceived from Datanodes take a long time to arrive.
          for (int i = 0; i < trimmedTargets.size(); i++) {
            trimmedTargets.get(i).addBlock(
              trimmedStorages.get(i), storedBlock);
          }
        }

        // add pipeline locations into the INodeUnderConstruction
        DatanodeStorageInfo[] trimmedStorageInfos =
            blockManager.getDatanodeManager().getDatanodeStorageInfos(
                trimmedTargets.toArray(new DatanodeID[trimmedTargets.size()]),
                trimmedStorages.toArray(new String[trimmedStorages.size()]));
        iFile.setLastBlock(storedBlock, trimmedStorageInfos);
      }

      if (closeFile) {
        src = closeFileCommitBlocks(iFile, storedBlock);
      } else {
        // If this commit does not want to close the file, persist blocks
        src = persistBlocks(iFile, false);
      }
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the write lock.
    getEditLog().logSync();
    if (closeFile) {
      LOG.info("commitBlockSynchronization(newblock=" + lastblock
          + ", file=" + src
          + ", newgenerationstamp=" + newgenerationstamp
          + ", newlength=" + newlength
          + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
    } else {
      LOG.info("commitBlockSynchronization(" + lastblock + ") successful");
    }
  }
4090
4091  /**
4092   *
4093   * @param pendingFile
4094   * @param storedBlock
4095   * @return Path of the file that was closed.
4096   * @throws IOException
4097   */
4098  @VisibleForTesting
4099  String closeFileCommitBlocks(INodeFile pendingFile, BlockInfo storedBlock)
4100      throws IOException {
4101    String src = pendingFile.getFullPathName();
4102
4103    // commit the last block and complete it if it has minimum replicas
4104    commitOrCompleteLastBlock(pendingFile, storedBlock);
4105
4106    //remove lease, close file
4107    finalizeINodeFileUnderConstruction(src, pendingFile,
4108        Snapshot.findLatestSnapshot(pendingFile, Snapshot.CURRENT_STATE_ID));
4109
4110    return src;
4111  }
4112
4113  /**
4114   * Persist the block list for the given file.
4115   *
4116   * @param pendingFile
4117   * @return Path to the given file.
4118   * @throws IOException
4119   */
4120  @VisibleForTesting
4121  String persistBlocks(INodeFile pendingFile, boolean logRetryCache)
4122      throws IOException {
4123    String src = pendingFile.getFullPathName();
4124    dir.persistBlocks(src, pendingFile, logRetryCache);
4125    return src;
4126  }
4127
4128  /**
4129   * Renew the lease(s) held by the given client
4130   */
4131  void renewLease(String holder) throws IOException {
4132    checkOperation(OperationCategory.WRITE);
4133    readLock();
4134    try {
4135      checkOperation(OperationCategory.WRITE);
4136      checkNameNodeSafeMode("Cannot renew lease for " + holder);
4137      leaseManager.renewLease(holder);
4138    } finally {
4139      readUnlock();
4140    }
4141  }
4142
4143  /**
4144   * Get a partial listing of the indicated directory
4145   *
4146   * @param src the directory name
4147   * @param startAfter the name to start after
4148   * @param needLocation if blockLocations need to be returned
4149   * @return a partial listing starting after startAfter
4150   * 
4151   * @throws AccessControlException if access is denied
4152   * @throws UnresolvedLinkException if symbolic link is encountered
4153   * @throws IOException if other I/O error occurred
4154   */
4155  DirectoryListing getListing(String src, byte[] startAfter,
4156      boolean needLocation) 
4157      throws AccessControlException, UnresolvedLinkException, IOException {
4158    try {
4159      return getListingInt(src, startAfter, needLocation);
4160    } catch (AccessControlException e) {
4161      logAuditEvent(false, "listStatus", src);
4162      throw e;
4163    }
4164  }
4165
4166  private DirectoryListing getListingInt(String src, byte[] startAfter,
4167      boolean needLocation) 
4168    throws AccessControlException, UnresolvedLinkException, IOException {
4169    DirectoryListing dl;
4170    FSPermissionChecker pc = getPermissionChecker();
4171    checkOperation(OperationCategory.READ);
4172    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
4173    String startAfterString = new String(startAfter);
4174    readLock();
4175    try {
4176      checkOperation(OperationCategory.READ);
4177      src = FSDirectory.resolvePath(src, pathComponents, dir);
4178
4179      // Get file name when startAfter is an INodePath
4180      if (FSDirectory.isReservedName(startAfterString)) {
4181        byte[][] startAfterComponents = FSDirectory
4182            .getPathComponentsForReservedPath(startAfterString);
4183        try {
4184          String tmp = FSDirectory.resolvePath(src, startAfterComponents, dir);
4185          byte[][] regularPath = INode.getPathComponents(tmp);
4186          startAfter = regularPath[regularPath.length - 1];
4187        } catch (IOException e) {
4188          // Possibly the inode is deleted
4189          throw new DirectoryListingStartAfterNotFoundException(
4190              "Can't find startAfter " + startAfterString);
4191        }
4192      }
4193      
4194      if (isPermissionEnabled) {
4195        if (dir.isDir(src)) {
4196          checkPathAccess(pc, src, FsAction.READ_EXECUTE);
4197        } else {
4198          checkTraverse(pc, src);
4199        }
4200      }
4201      logAuditEvent(true, "listStatus", src);
4202      dl = dir.getListing(src, startAfter, needLocation);
4203    } finally {
4204      readUnlock();
4205    }
4206    return dl;
4207  }
4208
4209  /////////////////////////////////////////////////////////
4210  //
4211  // These methods are called by datanodes
4212  //
4213  /////////////////////////////////////////////////////////
4214  /**
4215   * Register Datanode.
4216   * <p>
4217   * The purpose of registration is to identify whether the new datanode
4218   * serves a new data storage, and will report new data block copies,
4219   * which the namenode was not aware of; or the datanode is a replacement
4220   * node for the data storage that was previously served by a different
4221   * or the same (in terms of host:port) datanode.
4222   * The data storages are distinguished by their storageIDs. When a new
4223   * data storage is reported the namenode issues a new unique storageID.
4224   * <p>
4225   * Finally, the namenode returns its namespaceID as the registrationID
4226   * for the datanodes. 
4227   * namespaceID is a persistent attribute of the name space.
4228   * The registrationID is checked every time the datanode is communicating
4229   * with the namenode. 
4230   * Datanodes with inappropriate registrationID are rejected.
4231   * If the namenode stops, and then restarts it can restore its 
4232   * namespaceID and will continue serving the datanodes that has previously
4233   * registered with the namenode without restarting the whole cluster.
4234   * 
4235   * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4236   */
4237  void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4238    writeLock();
4239    try {
4240      getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4241      checkSafeMode();
4242    } finally {
4243      writeUnlock();
4244    }
4245  }
4246  
4247  /**
4248   * Get registrationID for datanodes based on the namespaceID.
4249   * 
4250   * @see #registerDatanode(DatanodeRegistration)
4251   * @return registration ID
4252   */
4253  String getRegistrationID() {
4254    return Storage.getRegistrationID(dir.fsImage.getStorage());
4255  }
4256
4257  /**
4258   * The given node has reported in.  This method should:
4259   * 1) Record the heartbeat, so the datanode isn't timed out
4260   * 2) Adjust usage stats for future block allocation
4261   * 
4262   * If a substantial amount of time passed since the last datanode 
4263   * heartbeat then request an immediate block report.  
4264   * 
4265   * @return an array of datanode commands 
4266   * @throws IOException
4267   */
4268  HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4269      StorageReport[] reports, long cacheCapacity, long cacheUsed,
4270      int xceiverCount, int xmitsInProgress, int failedVolumes)
4271        throws IOException {
4272    readLock();
4273    try {
4274      //get datanode commands
4275      final int maxTransfer = blockManager.getMaxReplicationStreams()
4276          - xmitsInProgress;
4277      DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4278          nodeReg, reports, blockPoolId, cacheCapacity, cacheUsed,
4279          xceiverCount, maxTransfer, failedVolumes);
4280      
4281      //create ha status
4282      final NNHAStatusHeartbeat haState = new NNHAStatusHeartbeat(
4283          haContext.getState().getServiceState(),
4284          getFSImage().getLastAppliedOrWrittenTxId());
4285
4286      return new HeartbeatResponse(cmds, haState, rollingUpgradeInfo);
4287    } finally {
4288      readUnlock();
4289    }
4290  }
4291
4292  /**
4293   * Returns whether or not there were available resources at the last check of
4294   * resources.
4295   *
4296   * @return true if there were sufficient resources available, false otherwise.
4297   */
4298  boolean nameNodeHasResourcesAvailable() {
4299    return hasResourcesAvailable;
4300  }
4301
4302  /**
4303   * Perform resource checks and cache the results.
4304   * @throws IOException
4305   */
4306  void checkAvailableResources() {
4307    Preconditions.checkState(nnResourceChecker != null,
4308        "nnResourceChecker not initialized");
4309    hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
4310  }
4311
4312  /**
4313   * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
4314   * there are found to be insufficient resources available, causes the NN to
4315   * enter safe mode. If resources are later found to have returned to
4316   * acceptable levels, this daemon will cause the NN to exit safe mode.
4317   */
4318  class NameNodeResourceMonitor implements Runnable  {
4319    boolean shouldNNRmRun = true;
4320    @Override
4321    public void run () {
4322      try {
4323        while (fsRunning && shouldNNRmRun) {
4324          checkAvailableResources();
4325          if(!nameNodeHasResourcesAvailable()) {
4326            String lowResourcesMsg = "NameNode low on available disk space. ";
4327            if (!isInSafeMode()) {
4328              FSNamesystem.LOG.warn(lowResourcesMsg + "Entering safe mode.");
4329            } else {
4330              FSNamesystem.LOG.warn(lowResourcesMsg + "Already in safe mode.");
4331            }
4332            enterSafeMode(true);
4333          }
4334          try {
4335            Thread.sleep(resourceRecheckInterval);
4336          } catch (InterruptedException ie) {
4337            // Deliberately ignore
4338          }
4339        }
4340      } catch (Exception e) {
4341        FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
4342      }
4343    }
4344
4345    public void stopMonitor() {
4346      shouldNNRmRun = false;
4347    }
4348 }
4349
4350  class NameNodeEditLogRoller implements Runnable {
4351
4352    private boolean shouldRun = true;
4353    private final long rollThreshold;
4354    private final long sleepIntervalMs;
4355
4356    public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
4357        this.rollThreshold = rollThreshold;
4358        this.sleepIntervalMs = sleepIntervalMs;
4359    }
4360
4361    @Override
4362    public void run() {
4363      while (fsRunning && shouldRun) {
4364        try {
4365          FSEditLog editLog = getFSImage().getEditLog();
4366          long numEdits =
4367              editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
4368          if (numEdits > rollThreshold) {
4369            FSNamesystem.LOG.info("NameNode rolling its own edit log because"
4370                + " number of edits in open segment exceeds threshold of "
4371                + rollThreshold);
4372            rollEditLog();
4373          }
4374          Thread.sleep(sleepIntervalMs);
4375        } catch (InterruptedException e) {
4376          FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
4377              + " was interrupted, exiting");
4378          break;
4379        } catch (Exception e) {
4380          FSNamesystem.LOG.error("Swallowing exception in "
4381              + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
4382        }
4383      }
4384    }
4385
4386    public void stop() {
4387      shouldRun = false;
4388    }
4389  }
4390
4391  public FSImage getFSImage() {
4392    return dir.fsImage;
4393  }
4394
4395  public FSEditLog getEditLog() {
4396    return getFSImage().getEditLog();
4397  }    
4398
4399  private void checkBlock(ExtendedBlock block) throws IOException {
4400    if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4401      throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4402          + " - expected " + blockPoolId);
4403    }
4404  }
4405
4406  @Metric({"MissingBlocks", "Number of missing blocks"})
4407  public long getMissingBlocksCount() {
4408    // not locking
4409    return blockManager.getMissingBlocksCount();
4410  }
4411  
4412  @Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
4413  public int getExpiredHeartbeats() {
4414    return datanodeStatistics.getExpiredHeartbeats();
4415  }
4416  
4417  @Metric({"TransactionsSinceLastCheckpoint",
4418      "Number of transactions since last checkpoint"})
4419  public long getTransactionsSinceLastCheckpoint() {
4420    return getEditLog().getLastWrittenTxId() -
4421        getFSImage().getStorage().getMostRecentCheckpointTxId();
4422  }
4423  
4424  @Metric({"TransactionsSinceLastLogRoll",
4425      "Number of transactions since last edit log roll"})
4426  public long getTransactionsSinceLastLogRoll() {
4427    if (isInStandbyState() || !getEditLog().isSegmentOpen()) {
4428      return 0;
4429    } else {
4430      return getEditLog().getLastWrittenTxId() -
4431        getEditLog().getCurSegmentTxId() + 1;
4432    }
4433  }
4434  
4435  @Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
4436  public long getLastWrittenTransactionId() {
4437    return getEditLog().getLastWrittenTxId();
4438  }
4439  
4440  @Metric({"LastCheckpointTime",
4441      "Time in milliseconds since the epoch of the last checkpoint"})
4442  public long getLastCheckpointTime() {
4443    return getFSImage().getStorage().getMostRecentCheckpointTime();
4444  }
4445
4446  /** @see ClientProtocol#getStats() */
4447  long[] getStats() {
4448    final long[] stats = datanodeStatistics.getStats();
4449    stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
4450    stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
4451    stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
4452    return stats;
4453  }
4454
4455  @Override // FSNamesystemMBean
4456  @Metric({"CapacityTotal",
4457      "Total raw capacity of data nodes in bytes"})
4458  public long getCapacityTotal() {
4459    return datanodeStatistics.getCapacityTotal();
4460  }
4461
4462  @Metric({"CapacityTotalGB",
4463      "Total raw capacity of data nodes in GB"})
4464  public float getCapacityTotalGB() {
4465    return DFSUtil.roundBytesToGB(getCapacityTotal());
4466  }
4467
4468  @Override // FSNamesystemMBean
4469  @Metric({"CapacityUsed",
4470      "Total used capacity across all data nodes in bytes"})
4471  public long getCapacityUsed() {
4472    return datanodeStatistics.getCapacityUsed();
4473  }
4474
4475  @Metric({"CapacityUsedGB",
4476      "Total used capacity across all data nodes in GB"})
4477  public float getCapacityUsedGB() {
4478    return DFSUtil.roundBytesToGB(getCapacityUsed());
4479  }
4480
4481  @Override // FSNamesystemMBean
4482  @Metric({"CapacityRemaining", "Remaining capacity in bytes"})
4483  public long getCapacityRemaining() {
4484    return datanodeStatistics.getCapacityRemaining();
4485  }
4486
4487  @Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
4488  public float getCapacityRemainingGB() {
4489    return DFSUtil.roundBytesToGB(getCapacityRemaining());
4490  }
4491
4492  @Metric({"CapacityUsedNonDFS",
4493      "Total space used by data nodes for non DFS purposes in bytes"})
4494  public long getCapacityUsedNonDFS() {
4495    return datanodeStatistics.getCapacityUsedNonDFS();
4496  }
4497
4498  /**
4499   * Total number of connections.
4500   */
4501  @Override // FSNamesystemMBean
4502  @Metric
4503  public int getTotalLoad() {
4504    return datanodeStatistics.getXceiverCount();
4505  }
4506  
4507  @Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
4508  public int getNumSnapshottableDirs() {
4509    return this.snapshotManager.getNumSnapshottableDirs();
4510  }
4511
4512  @Metric({ "Snapshots", "The number of snapshots" })
4513  public int getNumSnapshots() {
4514    return this.snapshotManager.getNumSnapshots();
4515  }
4516
4517  @Override
4518  public String getSnapshotStats() {
4519    Map<String, Object> info = new HashMap<String, Object>();
4520    info.put("SnapshottableDirectories", this.getNumSnapshottableDirs());
4521    info.put("Snapshots", this.getNumSnapshots());
4522    return JSON.toString(info);
4523  }
4524
4525  int getNumberOfDatanodes(DatanodeReportType type) {
4526    readLock();
4527    try {
4528      return getBlockManager().getDatanodeManager().getDatanodeListForReport(
4529          type).size(); 
4530    } finally {
4531      readUnlock();
4532    }
4533  }
4534
4535  DatanodeInfo[] datanodeReport(final DatanodeReportType type
4536      ) throws AccessControlException, StandbyException {
4537    checkSuperuserPrivilege();
4538    checkOperation(OperationCategory.UNCHECKED);
4539    readLock();
4540    try {
4541      checkOperation(OperationCategory.UNCHECKED);
4542      final DatanodeManager dm = getBlockManager().getDatanodeManager();      
4543      final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4544
4545      DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4546      for (int i=0; i<arr.length; i++) {
4547        arr[i] = new DatanodeInfo(results.get(i));
4548      }
4549      return arr;
4550    } finally {
4551      readUnlock();
4552    }
4553  }
4554
4555  /**
4556   * Save namespace image.
4557   * This will save current namespace into fsimage file and empty edits file.
4558   * Requires superuser privilege and safe mode.
4559   * 
4560   * @throws AccessControlException if superuser privilege is violated.
4561   * @throws IOException if 
4562   */
4563  void saveNamespace() throws AccessControlException, IOException {
4564    checkOperation(OperationCategory.UNCHECKED);
4565    checkSuperuserPrivilege();
4566    
4567    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
4568    if (cacheEntry != null && cacheEntry.isSuccess()) {
4569      return; // Return previous response
4570    }
4571    boolean success = false;
4572    readLock();
4573    try {
4574      checkOperation(OperationCategory.UNCHECKED);
4575
4576      if (!isInSafeMode()) {
4577        throw new IOException("Safe mode should be turned ON "
4578            + "in order to create namespace image.");
4579      }
4580      getFSImage().saveNamespace(this);
4581      success = true;
4582    } finally {
4583      readUnlock();
4584      RetryCache.setState(cacheEntry, success);
4585    }
4586    LOG.info("New namespace image has been created");
4587  }
4588  
4589  /**
4590   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
4591   * Requires superuser privilege.
4592   * 
4593   * @throws AccessControlException if superuser privilege is violated.
4594   */
4595  boolean restoreFailedStorage(String arg) throws AccessControlException,
4596      StandbyException {
4597    checkSuperuserPrivilege();
4598    checkOperation(OperationCategory.UNCHECKED);
4599    writeLock();
4600    try {
4601      checkOperation(OperationCategory.UNCHECKED);
4602      
4603      // if it is disabled - enable it and vice versa.
4604      if(arg.equals("check"))
4605        return getFSImage().getStorage().getRestoreFailedStorage();
4606      
4607      boolean val = arg.equals("true");  // false if not
4608      getFSImage().getStorage().setRestoreFailedStorage(val);
4609      
4610      return val;
4611    } finally {
4612      writeUnlock();
4613    }
4614  }
4615
  /** @return the time this namesystem was started, as a {@link Date}. */
  Date getStartTime() {
    return new Date(startTime); 
  }
4619    
4620  void finalizeUpgrade() throws IOException {
4621    checkSuperuserPrivilege();
4622    checkOperation(OperationCategory.UNCHECKED);
4623    writeLock();
4624    try {
4625      checkOperation(OperationCategory.UNCHECKED);
4626      getFSImage().finalizeUpgrade(this.isHaEnabled() && inActiveState());
4627    } finally {
4628      writeUnlock();
4629    }
4630  }
4631
  /**
   * Refresh datanode membership from a freshly loaded configuration.
   * Requires superuser privilege.
   *
   * @throws IOException if the datanode manager fails to refresh
   */
  void refreshNodes() throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    // A new HdfsConfiguration re-reads the config files from disk.
    getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
  }
4637
  /**
   * Update the balancer bandwidth setting via the datanode manager.
   * Requires superuser privilege.
   *
   * @param bandwidth the new balancer bandwidth value
   * @throws IOException if the update fails
   */
  void setBalancerBandwidth(long bandwidth) throws IOException {
    checkOperation(OperationCategory.UNCHECKED);
    checkSuperuserPrivilege();
    getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
  }
4643
4644  /**
4645   * SafeModeInfo contains information related to the safe mode.
4646   * <p>
4647   * An instance of {@link SafeModeInfo} is created when the name node
4648   * enters safe mode.
4649   * <p>
4650   * During name node startup {@link SafeModeInfo} counts the number of
4651   * <em>safe blocks</em>, those that have at least the minimal number of
4652   * replicas, and calculates the ratio of safe blocks to the total number
4653   * of blocks in the system, which is the size of blocks in
4654   * {@link FSNamesystem#blockManager}. When the ratio reaches the
4655   * {@link #threshold} it starts the SafeModeMonitor daemon in order
4656   * to monitor whether the safe mode {@link #extension} is passed.
4657   * Then it leaves safe mode and destroys itself.
4658   * <p>
4659   * If safe mode is turned on manually then the number of safe blocks is
4660   * not tracked because the name node is not intended to leave safe mode
4661   * automatically in the case.
4662   *
4663   * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
4664   */
4665  public class SafeModeInfo {
4666    // configuration fields
4667    /** Safe mode threshold condition %.*/
4668    private final double threshold;
4669    /** Safe mode minimum number of datanodes alive */
4670    private final int datanodeThreshold;
4671    /** Safe mode extension after the threshold. */
4672    private int extension;
4673    /** Min replication required by safe mode. */
4674    private final int safeReplication;
4675    /** threshold for populating needed replication queues */
4676    private final double replQueueThreshold;
4677    // internal fields
4678    /** Time when threshold was reached.
4679     * <br> -1 safe mode is off
4680     * <br> 0 safe mode is on, and threshold is not reached yet
4681     * <br> >0 safe mode is on, but we are in extension period 
4682     */
4683    private long reached = -1;  
4684    /** Total number of blocks. */
4685    int blockTotal; 
4686    /** Number of safe blocks. */
4687    int blockSafe;
4688    /** Number of blocks needed to satisfy safe mode threshold condition */
4689    private int blockThreshold;
4690    /** Number of blocks needed before populating replication queues */
4691    private int blockReplQueueThreshold;
4692    /** time of the last status printout */
4693    private long lastStatusReport = 0;
4694    /** Was safemode entered automatically because available resources were low. */
4695    private boolean resourcesLow = false;
4696    /** Should safemode adjust its block totals as blocks come in */
4697    private boolean shouldIncrementallyTrackBlocks = false;
4698    /** counter for tracking startup progress of reported blocks */
4699    private Counter awaitingReportedBlocksCounter;
4700    
4701    /**
4702     * Creates SafeModeInfo when the name node enters
4703     * automatic safe mode at startup.
4704     *  
4705     * @param conf configuration
4706     */
4707    private SafeModeInfo(Configuration conf) {
4708      this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
4709          DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
4710      if(threshold > 1.0) {
4711        LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
4712      }
4713      this.datanodeThreshold = conf.getInt(
4714        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
4715        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
4716      this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
4717      this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY, 
4718                                         DFS_NAMENODE_REPLICATION_MIN_DEFAULT);
4719      
4720      LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
4721      LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
4722      LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + "     = " + extension);
4723
4724      // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
4725      this.replQueueThreshold = 
4726        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
4727                      (float) threshold);
4728      this.blockTotal = 0; 
4729      this.blockSafe = 0;
4730    }
4731
4732    /**
4733     * In the HA case, the StandbyNode can be in safemode while the namespace
4734     * is modified by the edit log tailer. In this case, the number of total
4735     * blocks changes as edits are processed (eg blocks are added and deleted).
4736     * However, we don't want to do the incremental tracking during the
4737     * startup-time loading process -- only once the initial total has been
4738     * set after the image has been loaded.
4739     */
4740    private boolean shouldIncrementallyTrackBlocks() {
4741      return shouldIncrementallyTrackBlocks;
4742    }
4743
4744    /**
4745     * Creates SafeModeInfo when safe mode is entered manually, or because
4746     * available resources are low.
4747     *
4748     * The {@link #threshold} is set to 1.5 so that it could never be reached.
4749     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
4750     * 
4751     * @see SafeModeInfo
4752     */
4753    private SafeModeInfo(boolean resourcesLow) {
4754      this.threshold = 1.5f;  // this threshold can never be reached
4755      this.datanodeThreshold = Integer.MAX_VALUE;
4756      this.extension = Integer.MAX_VALUE;
4757      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
4758      this.replQueueThreshold = 1.5f; // can never be reached
4759      this.blockTotal = -1;
4760      this.blockSafe = -1;
4761      this.resourcesLow = resourcesLow;
4762      enter();
4763      reportStatus("STATE* Safe mode is ON.", true);
4764    }
4765      
4766    /**
4767     * Check if safe mode is on.
4768     * @return true if in safe mode
4769     */
4770    private synchronized boolean isOn() {
4771      doConsistencyCheck();
4772      return this.reached >= 0;
4773    }
4774      
4775    /**
4776     * Enter safe mode.
4777     */
4778    private void enter() {
4779      this.reached = 0;
4780    }
4781      
4782    /**
4783     * Leave safe mode.
4784     * <p>
4785     * Check for invalid, under- & over-replicated blocks in the end of startup.
4786     */
4787    private synchronized void leave() {
4788      // if not done yet, initialize replication queues.
4789      // In the standby, do not populate repl queues
4790      if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
4791        initializeReplQueues();
4792      }
4793      long timeInSafemode = now() - startTime;
4794      NameNode.stateChangeLog.info("STATE* Leaving safe mode after " 
4795                                    + timeInSafemode/1000 + " secs");
4796      NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);
4797
4798      //Log the following only once (when transitioning from ON -> OFF)
4799      if (reached >= 0) {
4800        NameNode.stateChangeLog.info("STATE* Safe mode is OFF"); 
4801      }
4802      reached = -1;
4803      safeMode = null;
4804      final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
4805      NameNode.stateChangeLog.info("STATE* Network topology has "
4806          + nt.getNumOfRacks() + " racks and "
4807          + nt.getNumOfLeaves() + " datanodes");
4808      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
4809          + blockManager.numOfUnderReplicatedBlocks() + " blocks");
4810
4811      startSecretManagerIfNecessary();
4812
4813      // If startup has not yet completed, end safemode phase.
4814      StartupProgress prog = NameNode.getStartupProgress();
4815      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
4816        prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
4817        prog.endPhase(Phase.SAFEMODE);
4818      }
4819    }
4820
4821    /**
4822     * Check whether we have reached the threshold for 
4823     * initializing replication queues.
4824     */
4825    private synchronized boolean canInitializeReplQueues() {
4826      return shouldPopulateReplQueues()
4827          && blockSafe >= blockReplQueueThreshold;
4828    }
4829      
4830    /** 
4831     * Safe mode can be turned off iff 
4832     * the threshold is reached and 
4833     * the extension time have passed.
4834     * @return true if can leave or false otherwise.
4835     */
4836    private synchronized boolean canLeave() {
4837      if (reached == 0) {
4838        return false;
4839      }
4840
4841      if (now() - reached < extension) {
4842        reportStatus("STATE* Safe mode ON, in safe mode extension.", false);
4843        return false;
4844      }
4845
4846      if (needEnter()) {
4847        reportStatus("STATE* Safe mode ON, thresholds not met.", false);
4848        return false;
4849      }
4850
4851      return true;
4852    }
4853      
4854    /** 
4855     * There is no need to enter safe mode 
4856     * if DFS is empty or {@link #threshold} == 0
4857     */
4858    private boolean needEnter() {
4859      return (threshold != 0 && blockSafe < blockThreshold) ||
4860        (datanodeThreshold != 0 && getNumLiveDataNodes() < datanodeThreshold) ||
4861        (!nameNodeHasResourcesAvailable());
4862    }
4863      
4864    /**
4865     * Check and trigger safe mode if needed. 
4866     */
4867    private void checkMode() {
4868      // Have to have write-lock since leaving safemode initializes
4869      // repl queues, which requires write lock
4870      assert hasWriteLock();
4871      // if smmthread is already running, the block threshold must have been 
4872      // reached before, there is no need to enter the safe mode again
4873      if (smmthread == null && needEnter()) {
4874        enter();
4875        // check if we are ready to initialize replication queues
4876        if (canInitializeReplQueues() && !isPopulatingReplQueues()
4877            && !haEnabled) {
4878          initializeReplQueues();
4879        }
4880        reportStatus("STATE* Safe mode ON.", false);
4881        return;
4882      }
4883      // the threshold is reached or was reached before
4884      if (!isOn() ||                           // safe mode is off
4885          extension <= 0 || threshold <= 0) {  // don't need to wait
4886        this.leave(); // leave safe mode
4887        return;
4888      }
4889      if (reached > 0) {  // threshold has already been reached before
4890        reportStatus("STATE* Safe mode ON.", false);
4891        return;
4892      }
4893      // start monitor
4894      reached = now();
4895      if (smmthread == null) {
4896        smmthread = new Daemon(new SafeModeMonitor());
4897        smmthread.start();
4898        reportStatus("STATE* Safe mode extension entered.", true);
4899      }
4900
4901      // check if we are ready to initialize replication queues
4902      if (canInitializeReplQueues() && !isPopulatingReplQueues() && !haEnabled) {
4903        initializeReplQueues();
4904      }
4905    }
4906      
4907    /**
4908     * Set total number of blocks.
4909     */
4910    private synchronized void setBlockTotal(int total) {
4911      this.blockTotal = total;
4912      this.blockThreshold = (int) (blockTotal * threshold);
4913      this.blockReplQueueThreshold = 
4914        (int) (blockTotal * replQueueThreshold);
4915      if (haEnabled) {
4916        // After we initialize the block count, any further namespace
4917        // modifications done while in safe mode need to keep track
4918        // of the number of total blocks in the system.
4919        this.shouldIncrementallyTrackBlocks = true;
4920      }
4921      if(blockSafe < 0)
4922        this.blockSafe = 0;
4923      checkMode();
4924    }
4925      
4926    /**
4927     * Increment number of safe blocks if current block has 
4928     * reached minimal replication.
4929     * @param replication current replication 
4930     */
4931    private synchronized void incrementSafeBlockCount(short replication) {
4932      if (replication == safeReplication) {
4933        this.blockSafe++;
4934
4935        // Report startup progress only if we haven't completed startup yet.
4936        StartupProgress prog = NameNode.getStartupProgress();
4937        if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
4938          if (this.awaitingReportedBlocksCounter == null) {
4939            this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
4940              STEP_AWAITING_REPORTED_BLOCKS);
4941          }
4942          this.awaitingReportedBlocksCounter.increment();
4943        }
4944
4945        checkMode();
4946      }
4947    }
4948      
4949    /**
4950     * Decrement number of safe blocks if current block has 
4951     * fallen below minimal replication.
4952     * @param replication current replication 
4953     */
4954    private synchronized void decrementSafeBlockCount(short replication) {
4955      if (replication == safeReplication-1) {
4956        this.blockSafe--;
4957        //blockSafe is set to -1 in manual / low resources safemode
4958        assert blockSafe >= 0 || isManual() || areResourcesLow();
4959        checkMode();
4960      }
4961    }
4962
4963    /**
4964     * Check if safe mode was entered manually
4965     */
4966    private boolean isManual() {
4967      return extension == Integer.MAX_VALUE;
4968    }
4969
4970    /**
4971     * Set manual safe mode.
4972     */
4973    private synchronized void setManual() {
4974      extension = Integer.MAX_VALUE;
4975    }
4976
4977    /**
4978     * Check if safe mode was entered due to resources being low.
4979     */
4980    private boolean areResourcesLow() {
4981      return resourcesLow;
4982    }
4983
4984    /**
4985     * Set that resources are low for this instance of safe mode.
4986     */
4987    private void setResourcesLow() {
4988      resourcesLow = true;
4989    }
4990
4991    /**
4992     * A tip on how safe mode is to be turned off: manually or automatically.
4993     */
4994    String getTurnOffTip() {
4995      if(!isOn()) {
4996        return "Safe mode is OFF.";
4997      }
4998
4999      //Manual OR low-resource safemode. (Admin intervention required)
5000      String adminMsg = "It was turned on manually. ";
5001      if (areResourcesLow()) {
5002        adminMsg = "Resources are low on NN. Please add or free up more "
5003          + "resources then turn off safe mode manually. NOTE:  If you turn off"
5004          + " safe mode before adding resources, "
5005          + "the NN will immediately return to safe mode. ";
5006      }
5007      if (isManual() || areResourcesLow()) {
5008        return adminMsg
5009          + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
5010      }
5011
5012      boolean thresholdsMet = true;
5013      int numLive = getNumLiveDataNodes();
5014      String msg = "";
5015      if (blockSafe < blockThreshold) {
5016        msg += String.format(
5017          "The reported blocks %d needs additional %d"
5018          + " blocks to reach the threshold %.4f of total blocks %d.\n",
5019          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
5020        thresholdsMet = false;
5021      } else {
5022        msg += String.format("The reported blocks %d has reached the threshold"
5023            + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);
5024      }
5025      if (numLive < datanodeThreshold) {
5026        msg += String.format(
5027          "The number of live datanodes %d needs an additional %d live "
5028          + "datanodes to reach the minimum number %d.\n",
5029          numLive, (datanodeThreshold - numLive), datanodeThreshold);
5030        thresholdsMet = false;
5031      } else {
5032        msg += String.format("The number of live datanodes %d has reached "
5033            + "the minimum number %d. ",
5034            numLive, datanodeThreshold);
5035      }
5036      msg += (reached > 0) ? "In safe mode extension. " : "";
5037      msg += "Safe mode will be turned off automatically ";
5038
5039      if (!thresholdsMet) {
5040        msg += "once the thresholds have been reached.";
5041      } else if (reached + extension - now() > 0) {
5042        msg += ("in " + (reached + extension - now()) / 1000 + " seconds.");
5043      } else {
5044        msg += "soon.";
5045      }
5046
5047      return msg;
5048    }
5049
5050    /**
5051     * Print status every 20 seconds.
5052     */
5053    private void reportStatus(String msg, boolean rightNow) {
5054      long curTime = now();
5055      if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
5056        return;
5057      NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
5058      lastStatusReport = curTime;
5059    }
5060
5061    @Override
5062    public String toString() {
5063      String resText = "Current safe blocks = " 
5064        + blockSafe 
5065        + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
5066        + ". Minimal replication = " + safeReplication + ".";
5067      if (reached > 0) 
5068        resText += " Threshold was reached " + new Date(reached) + ".";
5069      return resText;
5070    }
5071      
5072    /**
5073     * Checks consistency of the class state.
5074     * This is costly so only runs if asserts are enabled.
5075     */
5076    private void doConsistencyCheck() {
5077      boolean assertsOn = false;
5078      assert assertsOn = true; // set to true if asserts are on
5079      if (!assertsOn) return;
5080      
5081      if (blockTotal == -1 && blockSafe == -1) {
5082        return; // manual safe mode
5083      }
5084      int activeBlocks = blockManager.getActiveBlockCount();
5085      if ((blockTotal != activeBlocks) &&
5086          !(blockSafe >= 0 && blockSafe <= blockTotal)) {
5087        throw new AssertionError(
5088            " SafeMode: Inconsistent filesystem state: "
5089        + "SafeMode data: blockTotal=" + blockTotal
5090        + " blockSafe=" + blockSafe + "; "
5091        + "BlockManager data: active="  + activeBlocks);
5092      }
5093    }
5094
5095    private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
5096      if (!shouldIncrementallyTrackBlocks) {
5097        return;
5098      }
5099      assert haEnabled;
5100      
5101      if (LOG.isDebugEnabled()) {
5102        LOG.debug("Adjusting block totals from " +
5103            blockSafe + "/" + blockTotal + " to " +
5104            (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
5105      }
5106      assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
5107        blockSafe + " by " + deltaSafe + ": would be negative";
5108      assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
5109        blockTotal + " by " + deltaTotal + ": would be negative";
5110      
5111      blockSafe += deltaSafe;
5112      setBlockTotal(blockTotal + deltaTotal);
5113    }
5114  }
5115    
5116  /**
5117   * Periodically check whether it is time to leave safe mode.
5118   * This thread starts when the threshold level is reached.
5119   *
5120   */
5121  class SafeModeMonitor implements Runnable {
5122    /** interval in msec for checking safe mode: {@value} */
5123    private static final long recheckInterval = 1000;
5124      
5125    /**
5126     */
5127    @Override
5128    public void run() {
5129      while (fsRunning) {
5130        writeLock();
5131        try {
5132          if (safeMode == null) { // Not in safe mode.
5133            break;
5134          }
5135          if (safeMode.canLeave()) {
5136            // Leave safe mode.
5137            safeMode.leave();
5138            smmthread = null;
5139            break;
5140          }
5141        } finally {
5142          writeUnlock();
5143        }
5144
5145        try {
5146          Thread.sleep(recheckInterval);
5147        } catch (InterruptedException ie) {
5148          // Ignored
5149        }
5150      }
5151      if (!fsRunning) {
5152        LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
5153      }
5154    }
5155  }
5156    
5157  boolean setSafeMode(SafeModeAction action) throws IOException {
5158    if (action != SafeModeAction.SAFEMODE_GET) {
5159      checkSuperuserPrivilege();
5160      switch(action) {
5161      case SAFEMODE_LEAVE: // leave safe mode
5162        leaveSafeMode();
5163        break;
5164      case SAFEMODE_ENTER: // enter safe mode
5165        enterSafeMode(false);
5166        break;
5167      default:
5168        LOG.error("Unexpected safe mode action");
5169      }
5170    }
5171    return isInSafeMode();
5172  }
5173
5174  @Override
5175  public void checkSafeMode() {
5176    // safeMode is volatile, and may be set to null at any time
5177    SafeModeInfo safeMode = this.safeMode;
5178    if (safeMode != null) {
5179      safeMode.checkMode();
5180    }
5181  }
5182
5183  @Override
5184  public boolean isInSafeMode() {
5185    // safeMode is volatile, and may be set to null at any time
5186    SafeModeInfo safeMode = this.safeMode;
5187    if (safeMode == null)
5188      return false;
5189    return safeMode.isOn();
5190  }
5191
5192  @Override
5193  public boolean isInStartupSafeMode() {
5194    // safeMode is volatile, and may be set to null at any time
5195    SafeModeInfo safeMode = this.safeMode;
5196    if (safeMode == null)
5197      return false;
5198    // If the NN is in safemode, and not due to manual / low resources, we
5199    // assume it must be because of startup. If the NN had low resources during
5200    // startup, we assume it came out of startup safemode and it is now in low
5201    // resources safemode
5202    return !safeMode.isManual() && !safeMode.areResourcesLow()
5203      && safeMode.isOn();
5204  }
5205
5206  /**
5207   * Check if replication queues are to be populated
5208   * @return true when node is HAState.Active and not in the very first safemode
5209   */
5210  @Override
5211  public boolean isPopulatingReplQueues() {
5212    if (!shouldPopulateReplQueues()) {
5213      return false;
5214    }
5215    return initializedReplQueues;
5216  }
5217
5218  private boolean shouldPopulateReplQueues() {
5219    if(haContext == null || haContext.getState() == null)
5220      return false;
5221    return haContext.getState().shouldPopulateReplQueues();
5222  }
5223
5224  @Override
5225  public void incrementSafeBlockCount(int replication) {
5226    // safeMode is volatile, and may be set to null at any time
5227    SafeModeInfo safeMode = this.safeMode;
5228    if (safeMode == null)
5229      return;
5230    safeMode.incrementSafeBlockCount((short)replication);
5231  }
5232
5233  @Override
5234  public void decrementSafeBlockCount(Block b) {
5235    // safeMode is volatile, and may be set to null at any time
5236    SafeModeInfo safeMode = this.safeMode;
5237    if (safeMode == null) // mostly true
5238      return;
5239    BlockInfo storedBlock = getStoredBlock(b);
5240    if (storedBlock.isComplete()) {
5241      safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5242    }
5243  }
5244  
5245  /**
5246   * Adjust the total number of blocks safe and expected during safe mode.
5247   * If safe mode is not currently on, this is a no-op.
5248   * @param deltaSafe the change in number of safe blocks
5249   * @param deltaTotal the change i nnumber of total blocks expected
5250   */
5251  @Override
5252  public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5253    // safeMode is volatile, and may be set to null at any time
5254    SafeModeInfo safeMode = this.safeMode;
5255    if (safeMode == null)
5256      return;
5257    safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5258  }
5259
5260  /**
5261   * Set the total number of blocks in the system. 
5262   */
5263  public void setBlockTotal() {
5264    // safeMode is volatile, and may be set to null at any time
5265    SafeModeInfo safeMode = this.safeMode;
5266    if (safeMode == null)
5267      return;
5268    safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5269  }
5270
5271  /**
5272   * Get the total number of blocks in the system. 
5273   */
5274  @Override // FSNamesystemMBean
5275  @Metric
5276  public long getBlocksTotal() {
5277    return blockManager.getTotalBlocks();
5278  }
5279
5280  /**
5281   * Get the total number of COMPLETE blocks in the system.
5282   * For safe mode only complete blocks are counted.
5283   */
5284  private long getCompleteBlocksTotal() {
5285    // Calculate number of blocks under construction
5286    long numUCBlocks = 0;
5287    readLock();
5288    try {
5289      for (Lease lease : leaseManager.getSortedLeases()) {
5290        for (String path : lease.getPaths()) {
5291          final INodeFile cons;
5292          try {
5293            cons = dir.getINode(path).asFile();
5294            Preconditions.checkState(cons.isUnderConstruction());
5295          } catch (UnresolvedLinkException e) {
5296            throw new AssertionError("Lease files should reside on this FS");
5297          }
5298          BlockInfo[] blocks = cons.getBlocks();
5299          if(blocks == null)
5300            continue;
5301          for(BlockInfo b : blocks) {
5302            if(!b.isComplete())
5303              numUCBlocks++;
5304          }
5305        }
5306      }
5307      LOG.info("Number of blocks under construction: " + numUCBlocks);
5308      return getBlocksTotal() - numUCBlocks;
5309    } finally {
5310      readUnlock();
5311    }
5312  }
5313
5314  /**
5315   * Enter safe mode. If resourcesLow is false, then we assume it is manual
5316   * @throws IOException
5317   */
5318  void enterSafeMode(boolean resourcesLow) throws IOException {
5319    writeLock();
5320    try {
5321      // Stop the secret manager, since rolling the master key would
5322      // try to write to the edit log
5323      stopSecretManager();
5324
5325      // Ensure that any concurrent operations have been fully synced
5326      // before entering safe mode. This ensures that the FSImage
5327      // is entirely stable on disk as soon as we're in safe mode.
5328      boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5329      // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5330      // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5331      if (isEditlogOpenForWrite) {
5332        getEditLog().logSyncAll();
5333      }
5334      if (!isInSafeMode()) {
5335        safeMode = new SafeModeInfo(resourcesLow);
5336        return;
5337      }
5338      if (resourcesLow) {
5339        safeMode.setResourcesLow();
5340      } else {
5341        safeMode.setManual();
5342      }
5343      if (isEditlogOpenForWrite) {
5344        getEditLog().logSyncAll();
5345      }
5346      NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5347          + safeMode.getTurnOffTip());
5348    } finally {
5349      writeUnlock();
5350    }
5351  }
5352
5353  /**
5354   * Leave safe mode.
5355   * @throws IOException
5356   */
5357  void leaveSafeMode() {
5358    writeLock();
5359    try {
5360      if (!isInSafeMode()) {
5361        NameNode.stateChangeLog.info("STATE* Safe mode is already OFF"); 
5362        return;
5363      }
5364      safeMode.leave();
5365    } finally {
5366      writeUnlock();
5367    }
5368  }
5369    
5370  String getSafeModeTip() {
5371    readLock();
5372    try {
5373      if (!isInSafeMode()) {
5374        return "";
5375      }
5376      return safeMode.getTurnOffTip();
5377    } finally {
5378      readUnlock();
5379    }
5380  }
5381
  /**
   * Roll the edit log via the FSImage.
   * Requires superuser privilege; disallowed while in safe mode.
   *
   * @return the checkpoint signature after the roll
   * @throws IOException if in safe mode or if the roll fails
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check after acquiring the lock; the HA state may have changed.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock();
    }
  }
5397
  /**
   * Start a checkpoint on behalf of a backup/checkpoint node.
   * Disallowed while in safe mode.
   *
   * @param backupNode registration of the node requesting the checkpoint
   * @param activeNamenode registration of the active name node
   * @return the command for the requesting node to execute
   * @throws IOException if in safe mode or the checkpoint cannot start
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    // Retry cache: a retried RPC that already succeeded returns the
    // previously computed command instead of starting a new checkpoint.
    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
        null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return (NamenodeCommand) cacheEntry.getPayload();
    }
    writeLock();
    NamenodeCommand cmd = null;
    try {
      checkOperation(OperationCategory.CHECKPOINT);
      checkNameNodeSafeMode("Checkpoint not started");
      
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      cmd = getFSImage().startCheckpoint(backupNode, activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock();
      // Success is encoded as cmd != null; the command is cached as the
      // payload for retried RPCs.
      RetryCache.setState(cacheEntry, cmd != null, cmd);
    }
  }
5421
  /**
   * Process an incremental (received/deleted) block report from a datanode,
   * delegating to the block manager under the write lock.
   *
   * @param nodeID the reporting datanode
   * @param poolId block pool id of the report; NOTE(review): unused here and
   *        not validated against this namesystem's pool
   * @param srdb the received/deleted blocks, per storage
   * @throws IOException if the report cannot be processed
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final String poolId, final StorageReceivedDeletedBlocks srdb)
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, srdb);
    } finally {
      writeUnlock();
    }
  }
5432  
  /**
   * Complete a checkpoint previously started via startCheckpoint.
   * Idempotent through the retry cache: a retried RPC that already
   * succeeded returns immediately. Runs under the read lock.
   *
   * @param registration the node that performed the checkpoint
   * @param sig signature identifying the checkpointed state
   * @throws IOException if in safe mode or the signature does not match
   */
  void endCheckpoint(NamenodeRegistration registration,
                            CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    boolean success = false;
    readLock();
    try {
      checkOperation(OperationCategory.CHECKPOINT);

      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
      success = true;
    } finally {
      readUnlock();
      RetryCache.setState(cacheEntry, success);
    }
  }
5454
5455  PermissionStatus createFsOwnerPermissions(FsPermission permission) {
5456    return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
5457  }
5458
  /** Require that the current user owns {@code path} (ownership-only check). */
  private void checkOwner(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, true, null, null, null, null);
  }
5463
  /** Require {@code access} on the path's own inode (the "access" slot). */
  private void checkPathAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, access, null);
  }
5469
  /** Require {@code access} on the path's parent directory. */
  private void checkParentAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, access, null, null);
  }
5475
  /** Require {@code access} on the path's deepest existing ancestor. */
  private void checkAncestorAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, access, null, null, null);
  }
5481
  /** Require traverse (execute) permission along the path; no other access. */
  private void checkTraverse(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, null, null);
  }
5486
5487  @Override
5488  public void checkSuperuserPrivilege()
5489      throws AccessControlException {
5490    if (isPermissionEnabled) {
5491      FSPermissionChecker pc = getPermissionChecker();
5492      pc.checkSuperuserPrivilege();
5493    }
5494  }
5495
5496  /**
5497   * Check whether current user have permissions to access the path. For more
5498   * details of the parameters, see
5499   * {@link FSPermissionChecker#checkPermission()}.
5500   */
5501  private void checkPermission(FSPermissionChecker pc,
5502      String path, boolean doCheckOwner, FsAction ancestorAccess,
5503      FsAction parentAccess, FsAction access, FsAction subAccess)
5504      throws AccessControlException, UnresolvedLinkException {
5505        checkPermission(pc, path, doCheckOwner, ancestorAccess,
5506            parentAccess, access, subAccess, true);
5507  }
5508
5509  /**
5510   * Check whether current user have permissions to access the path. For more
5511   * details of the parameters, see
5512   * {@link FSPermissionChecker#checkPermission()}.
5513   */
5514  private void checkPermission(FSPermissionChecker pc,
5515      String path, boolean doCheckOwner, FsAction ancestorAccess,
5516      FsAction parentAccess, FsAction access, FsAction subAccess,
5517      boolean resolveLink)
5518      throws AccessControlException, UnresolvedLinkException {
5519    if (!pc.isSuperUser()) {
5520      dir.waitForReady();
5521      readLock();
5522      try {
5523        pc.checkPermission(path, dir.rootDir, doCheckOwner, ancestorAccess,
5524            parentAccess, access, subAccess, resolveLink);
5525      } finally {
5526        readUnlock();
5527      }
5528    }
5529  }
5530  
5531  /**
5532   * Check to see if we have exceeded the limit on the number
5533   * of inodes.
5534   */
5535  void checkFsObjectLimit() throws IOException {
5536    if (maxFsObjects != 0 &&
5537        maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5538      throw new IOException("Exceeded the configured number of objects " +
5539                             maxFsObjects + " in the filesystem.");
5540    }
5541  }
5542
5543  /**
5544   * Get the total number of objects in the system. 
5545   */
5546  @Override // FSNamesystemMBean
5547  public long getMaxObjects() {
5548    return maxFsObjects;
5549  }
5550
  /** @return the total number of inodes, read under the FSN read lock. */
  @Override // FSNamesystemMBean
  @Metric
  public long getFilesTotal() {
    readLock();
    try {
      return this.dir.totalInodes();
    } finally {
      readUnlock();
    }
  }
5561
  /** @return pending-replication block count, from the BlockManager. */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }
5567
  /** @return under-replicated block count, from the BlockManager. */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }
5573
  /** @return number of blocks that have at least one corrupt replica. */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }
5579
  /** @return count of blocks with replication currently scheduled. */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }
5585
  /** @return count of blocks queued for deletion, from the BlockManager. */
  @Override
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }
5591
  /** @return count of excess (over-replicated) blocks, from the BlockManager. */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }
5596  
  // HA-only metric
  /** @return count of misreplicated blocks whose processing is postponed. */
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }
5602
  // HA-only metric
  /** @return count of queued datanode messages, from the BlockManager. */
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }
5608  
  // HA-only metric
  /** @return string form of the current HA state (e.g. active/standby). */
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }
5614
5615  // HA-only metric
5616  @Metric
5617  public long getMillisSinceLastLoadedEdits() {
5618    if (isInStandbyState() && editLogTailer != null) {
5619      return now() - editLogTailer.getLastLoadTimestamp();
5620    } else {
5621      return 0;
5622    }
5623  }
5624  
  /** @return the block manager's current blocks-map capacity. */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
5629
5630  @Override // FSNamesystemMBean
5631  public String getFSState() {
5632    return isInSafeMode() ? "safeMode" : "Operational";
5633  }
5634  
  /** JMX name of the FSNamesystemState MBean; null until registered and after shutdown(). */
  private ObjectName mbeanName;
  /** JMX name of the MXBean registration (registered elsewhere); nulled in shutdown(). */
  private ObjectName mxbeanName;
5637
5638  /**
5639   * Register the FSNamesystem MBean using the name
5640   *        "hadoop:service=NameNode,name=FSNamesystemState"
5641   */
5642  private void registerMBean() {
5643    // We can only implement one MXBean interface, so we keep the old one.
5644    try {
5645      StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
5646      mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
5647    } catch (NotCompliantMBeanException e) {
5648      throw new RuntimeException("Bad MBean setup", e);
5649    }
5650
5651    LOG.info("Registered FSNamesystemState MBean");
5652  }
5653
5654  /**
5655   * shutdown FSNamesystem
5656   */
5657  void shutdown() {
5658    if (mbeanName != null) {
5659      MBeans.unregister(mbeanName);
5660      mbeanName = null;
5661    }
5662    if (mxbeanName != null) {
5663      MBeans.unregister(mxbeanName);
5664      mxbeanName = null;
5665    }
5666    if (dir != null) {
5667      dir.shutdown();
5668    }
5669    if (blockManager != null) {
5670      blockManager.shutdown();
5671    }
5672  }
5673  
5674
  /** @return number of live datanodes, from the DatanodeManager. */
  @Override // FSNamesystemMBean
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }
5679
  /** @return number of dead datanodes, from the DatanodeManager. */
  @Override // FSNamesystemMBean
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }
5684  
5685  @Override // FSNamesystemMBean
5686  public int getNumDecomLiveDataNodes() {
5687    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
5688    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
5689    int liveDecommissioned = 0;
5690    for (DatanodeDescriptor node : live) {
5691      liveDecommissioned += node.isDecommissioned() ? 1 : 0;
5692    }
5693    return liveDecommissioned;
5694  }
5695
5696  @Override // FSNamesystemMBean
5697  public int getNumDecomDeadDataNodes() {
5698    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
5699    getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
5700    int deadDecommissioned = 0;
5701    for (DatanodeDescriptor node : dead) {
5702      deadDecommissioned += node.isDecommissioned() ? 1 : 0;
5703    }
5704    return deadDecommissioned;
5705  }
5706
  /** @return number of datanodes currently decommissioning. */
  @Override // FSNamesystemMBean
  public int getNumDecommissioningDataNodes() {
    return getBlockManager().getDatanodeManager().getDecommissioningNodes()
        .size();
  }
5712
  /** @return number of datanodes marked stale, from the DatanodeManager. */
  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes", 
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }
5719
5720  /**
5721   * Sets the current generation stamp for legacy blocks
5722   */
5723  void setGenerationStampV1(long stamp) {
5724    generationStampV1.setCurrentValue(stamp);
5725  }
5726
5727  /**
5728   * Gets the current generation stamp for legacy blocks
5729   */
5730  long getGenerationStampV1() {
5731    return generationStampV1.getCurrentValue();
5732  }
5733
5734  /**
5735   * Gets the current generation stamp for this filesystem
5736   */
5737  void setGenerationStampV2(long stamp) {
5738    generationStampV2.setCurrentValue(stamp);
5739  }
5740
5741  /**
5742   * Gets the current generation stamp for this filesystem
5743   */
5744  long getGenerationStampV2() {
5745    return generationStampV2.getCurrentValue();
5746  }
5747
5748  /**
5749   * Upgrades the generation stamp for the filesystem
5750   * by reserving a sufficient range for all existing blocks.
5751   * Should be invoked only during the first upgrade to
5752   * sequential block IDs.
5753   */
5754  long upgradeGenerationStampToV2() {
5755    Preconditions.checkState(generationStampV2.getCurrentValue() ==
5756        GenerationStamp.LAST_RESERVED_STAMP);
5757
5758    generationStampV2.skipTo(
5759        generationStampV1.getCurrentValue() +
5760        HdfsConstants.RESERVED_GENERATION_STAMPS_V1);
5761
5762    generationStampV1Limit = generationStampV2.getCurrentValue();
5763    return generationStampV2.getCurrentValue();
5764  }
5765
5766  /**
5767   * Sets the generation stamp that delineates random and sequentially
5768   * allocated block IDs.
5769   * @param stamp
5770   */
5771  void setGenerationStampV1Limit(long stamp) {
5772    Preconditions.checkState(generationStampV1Limit ==
5773                             GenerationStamp.GRANDFATHER_GENERATION_STAMP);
5774    generationStampV1Limit = stamp;
5775  }
5776
5777  /**
5778   * Gets the value of the generation stamp that delineates sequential
5779   * and random block IDs.
5780   */
5781  long getGenerationStampAtblockIdSwitch() {
5782    return generationStampV1Limit;
5783  }
5784
  /** Test-only accessor for the sequential block ID generator. */
  @VisibleForTesting
  SequentialBlockIdGenerator getBlockIdGenerator() {
    return blockIdGenerator;
  }
5789
5790  /**
5791   * Sets the maximum allocated block ID for this filesystem. This is
5792   * the basis for allocating new block IDs.
5793   */
5794  void setLastAllocatedBlockId(long blockId) {
5795    blockIdGenerator.skipTo(blockId);
5796  }
5797
5798  /**
5799   * Gets the maximum sequentially allocated block ID for this filesystem
5800   */
5801  long getLastAllocatedBlockId() {
5802    return blockIdGenerator.getCurrentValue();
5803  }
5804
5805  /**
5806   * Increments, logs and then returns the stamp
5807   */
5808  long nextGenerationStamp(boolean legacyBlock)
5809      throws IOException, SafeModeException {
5810    assert hasWriteLock();
5811    checkNameNodeSafeMode("Cannot get next generation stamp");
5812
5813    long gs;
5814    if (legacyBlock) {
5815      gs = getNextGenerationStampV1();
5816      getEditLog().logGenerationStampV1(gs);
5817    } else {
5818      gs = getNextGenerationStampV2();
5819      getEditLog().logGenerationStampV2(gs);
5820    }
5821
5822    // NB: callers sync the log
5823    return gs;
5824  }
5825
5826  @VisibleForTesting
5827  long getNextGenerationStampV1() throws IOException {
5828    long genStampV1 = generationStampV1.nextValue();
5829
5830    if (genStampV1 >= generationStampV1Limit) {
5831      // We ran out of generation stamps for legacy blocks. In practice, it
5832      // is extremely unlikely as we reserved 1T v1 generation stamps. The
5833      // result is that we can no longer append to the legacy blocks that
5834      // were created before the upgrade to sequential block IDs.
5835      throw new OutOfV1GenerationStampsException();
5836    }
5837
5838    return genStampV1;
5839  }
5840
  /** Increments and returns the next sequential (V2) generation stamp. */
  @VisibleForTesting
  long getNextGenerationStampV2() {
    return generationStampV2.nextValue();
  }
5845
  /** @return the stamp separating legacy (V1) from sequential (V2) blocks. */
  long getGenerationStampV1Limit() {
    return generationStampV1Limit;
  }
5849
5850  /**
5851   * Determine whether the block ID was randomly generated (legacy) or
5852   * sequentially generated. The generation stamp value is used to
5853   * make the distinction.
5854   * @param block
5855   * @return true if the block ID was randomly generated, false otherwise.
5856   */
5857  boolean isLegacyBlock(Block block) {
5858    return block.getGenerationStamp() < getGenerationStampV1Limit();
5859  }
5860
5861  /**
5862   * Increments, logs and then returns the block ID
5863   */
5864  private long nextBlockId() throws IOException {
5865    assert hasWriteLock();
5866    checkNameNodeSafeMode("Cannot get next block ID");
5867    final long blockId = blockIdGenerator.nextValue();
5868    getEditLog().logAllocateBlockId(blockId);
5869    // NB: callers sync the log
5870    return blockId;
5871  }
5872
  /**
   * Validate a block for pipeline recovery/update: the stored block must be
   * UNDER_CONSTRUCTION, its file must be under construction, and clientName
   * must hold the file's lease. Requires the FSN write lock.
   *
   * @param block the block being recovered/updated
   * @param clientName the client claiming the lease
   * @return the under-construction file that owns the block
   * @throws IOException if the block or file is missing / not under
   *         construction
   * @throws LeaseExpiredException if clientName does not hold the lease
   */
  private INodeFile checkUCBlock(ExtendedBlock block,
      String clientName) throws IOException {
    assert hasWriteLock();
    checkNameNodeSafeMode("Cannot get a new generation stamp and an "
        + "access token for block " + block);
    
    // check stored block state
    BlockInfo storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
    if (storedBlock == null || 
        storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
        // NOTE(review): message concatenates storedBlock directly after the
        // text with no separator, producing "...under ConstructionblockInfo".
        throw new IOException(block + 
            " does not exist or is not under Construction" + storedBlock);
    }
    
    // check file inode
    final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
    if (file == null || !file.isUnderConstruction()) {
      throw new IOException("The file " + storedBlock + 
          " belonged to does not exist or it is not under construction.");
    }
    
    // check lease
    if (clientName == null
        || !clientName.equals(file.getFileUnderConstructionFeature()
            .getClientName())) {
      throw new LeaseExpiredException("Lease mismatch: " + block + 
          " is accessed by a non lease holder " + clientName); 
    }

    return file;
  }
5904  
5905  /**
5906   * Client is reporting some bad block locations.
5907   */
5908  void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
5909    checkOperation(OperationCategory.WRITE);
5910    NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
5911    writeLock();
5912    try {
5913      checkOperation(OperationCategory.WRITE);
5914      for (int i = 0; i < blocks.length; i++) {
5915        ExtendedBlock blk = blocks[i].getBlock();
5916        DatanodeInfo[] nodes = blocks[i].getLocations();
5917        String[] storageIDs = blocks[i].getStorageIDs();
5918        for (int j = 0; j < nodes.length; j++) {
5919          blockManager.findAndMarkBlockAsCorrupt(blk, nodes[j],
5920              storageIDs == null ? null: storageIDs[j], 
5921              "client machine reported it");
5922        }
5923      }
5924    } finally {
5925      writeUnlock();
5926    }
5927  }
5928
5929  /**
5930   * Get a new generation stamp together with an access token for 
5931   * a block under construction
5932   * 
5933   * This method is called for recovering a failed pipeline or setting up
5934   * a pipeline to append to a block.
5935   * 
5936   * @param block a block
5937   * @param clientName the name of a client
5938   * @return a located block with a new generation stamp and an access token
5939   * @throws IOException if any error occurs
5940   */
5941  LocatedBlock updateBlockForPipeline(ExtendedBlock block, 
5942      String clientName) throws IOException {
5943    LocatedBlock locatedBlock;
5944    checkOperation(OperationCategory.WRITE);
5945    writeLock();
5946    try {
5947      checkOperation(OperationCategory.WRITE);
5948
5949      // check vadility of parameters
5950      checkUCBlock(block, clientName);
5951  
5952      // get a new generation stamp and an access token
5953      block.setGenerationStamp(
5954          nextGenerationStamp(isLegacyBlock(block.getLocalBlock())));
5955      locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
5956      blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
5957    } finally {
5958      writeUnlock();
5959    }
5960    // Ensure we record the new generation stamp
5961    getEditLog().logSync();
5962    return locatedBlock;
5963  }
5964  
5965  /**
5966   * Update a pipeline for a block under construction
5967   * 
5968   * @param clientName the name of the client
5969   * @param oldBlock and old block
5970   * @param newBlock a new block with a new generation stamp and length
5971   * @param newNodes datanodes in the pipeline
5972   * @throws IOException if any error occurs
5973   */
5974  void updatePipeline(String clientName, ExtendedBlock oldBlock, 
5975      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs)
5976      throws IOException {
5977    checkOperation(OperationCategory.WRITE);
5978    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
5979    if (cacheEntry != null && cacheEntry.isSuccess()) {
5980      return; // Return previous response
5981    }
5982    LOG.info("updatePipeline(block=" + oldBlock
5983             + ", newGenerationStamp=" + newBlock.getGenerationStamp()
5984             + ", newLength=" + newBlock.getNumBytes()
5985             + ", newNodes=" + Arrays.asList(newNodes)
5986             + ", clientName=" + clientName
5987             + ")");
5988    writeLock();
5989    boolean success = false;
5990    try {
5991      checkOperation(OperationCategory.WRITE);
5992      checkNameNodeSafeMode("Pipeline not updated");
5993      assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
5994        + oldBlock + " has different block identifier";
5995      updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
5996          newStorageIDs, cacheEntry != null);
5997      success = true;
5998    } finally {
5999      writeUnlock();
6000      RetryCache.setState(cacheEntry, success);
6001    }
6002    getEditLog().logSync();
6003    LOG.info("updatePipeline(" + oldBlock + ") successfully to " + newBlock);
6004  }
6005
  /**
   * Core of {@link #updatePipeline}: validates the block/lease, then applies
   * the new generation stamp, length and expected locations, and persists
   * the change. Requires the FSN write lock.
   *
   * @see #updatePipeline(String, ExtendedBlock, ExtendedBlock, DatanodeID[])
   */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock, 
      ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs,
      boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFile pendingFile = checkUCBlock(oldBlock, clientName);
    final BlockInfoUnderConstruction blockinfo
        = (BlockInfoUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " + 
        blockinfo.getNumBytes() + ") to an older state: " + newBlock + 
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockinfo.setNumBytes(newBlock.getNumBytes());
    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());

    // find the DatanodeDescriptor objects
    final DatanodeStorageInfo[] storages = blockManager.getDatanodeManager()
        .getDatanodeStorageInfos(newNodes, newStorageIDs);
    blockinfo.setExpectedLocations(storages);

    String src = pendingFile.getFullPathName();
    dir.persistBlocks(src, pendingFile, logRetryCache);
  }
6039
  // rename was successful. If any part of the renamed subtree had
  // files that were being written to, update with new filename.
  /** Re-key leases from src to dst after a rename. Requires the write lock. */
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
6046
6047  /**
6048   * @return all the under-construction files in the lease map
6049   */
6050  Map<String, INodeFile> getFilesUnderConstruction() {
6051    synchronized (leaseManager) {
6052      return leaseManager.getINodesUnderConstruction();
6053    }
6054  }
6055
6056  /**
6057   * Register a Backup name-node, verifying that it belongs
6058   * to the correct namespace, and adding it to the set of
6059   * active journals if necessary.
6060   * 
6061   * @param bnReg registration of the new BackupNode
6062   * @param nnReg registration of this NameNode
6063   * @throws IOException if the namespace IDs do not match
6064   */
6065  void registerBackupNode(NamenodeRegistration bnReg,
6066      NamenodeRegistration nnReg) throws IOException {
6067    writeLock();
6068    try {
6069      if(getFSImage().getStorage().getNamespaceID() 
6070         != bnReg.getNamespaceID())
6071        throw new IOException("Incompatible namespaceIDs: "
6072            + " Namenode namespaceID = "
6073            + getFSImage().getStorage().getNamespaceID() + "; "
6074            + bnReg.getRole() +
6075            " node namespaceID = " + bnReg.getNamespaceID());
6076      if (bnReg.getRole() == NamenodeRole.BACKUP) {
6077        getFSImage().getEditLog().registerBackupNode(
6078            bnReg, nnReg);
6079      }
6080    } finally {
6081      writeUnlock();
6082    }
6083  }
6084
6085  /**
6086   * Release (unregister) backup node.
6087   * <p>
6088   * Find and remove the backup stream corresponding to the node.
6089   * @param registration
6090   * @throws IOException
6091   */
6092  void releaseBackupNode(NamenodeRegistration registration)
6093    throws IOException {
6094    checkOperation(OperationCategory.WRITE);
6095    writeLock();
6096    try {
6097      checkOperation(OperationCategory.WRITE);
6098      if(getFSImage().getStorage().getNamespaceID()
6099         != registration.getNamespaceID())
6100        throw new IOException("Incompatible namespaceIDs: "
6101            + " Namenode namespaceID = "
6102            + getFSImage().getStorage().getNamespaceID() + "; "
6103            + registration.getRole() +
6104            " node namespaceID = " + registration.getNamespaceID());
6105      getEditLog().releaseBackupStream(registration);
6106    } finally {
6107      writeUnlock();
6108    }
6109  }
6110
6111  static class CorruptFileBlockInfo {
6112    final String path;
6113    final Block block;
6114    
6115    public CorruptFileBlockInfo(String p, Block b) {
6116      path = p;
6117      block = b;
6118    }
6119    
6120    @Override
6121    public String toString() {
6122      return block.getBlockName() + "\t" + path;
6123    }
6124  }
6125  /**
6126   * @param path Restrict corrupt files to this portion of namespace.
6127   * @param startBlockAfter Support for continuation; the set of files we return
6128   *  back is ordered by blockid; startBlockAfter tells where to start from
6129   * @return a list in which each entry describes a corrupt file/block
6130   * @throws AccessControlException
6131   * @throws IOException
6132   */
6133  Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
6134  String[] cookieTab) throws IOException {
6135    checkSuperuserPrivilege();
6136    checkOperation(OperationCategory.READ);
6137    readLock();
6138    try {
6139      checkOperation(OperationCategory.READ);
6140      if (!isPopulatingReplQueues()) {
6141        throw new IOException("Cannot run listCorruptFileBlocks because " +
6142                              "replication queues have not been initialized.");
6143      }
6144      // print a limited # of corrupt files per call
6145      int count = 0;
6146      ArrayList<CorruptFileBlockInfo> corruptFiles = new ArrayList<CorruptFileBlockInfo>();
6147
6148      final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
6149
6150      if (cookieTab == null) {
6151        cookieTab = new String[] { null };
6152      }
6153      int skip = getIntCookie(cookieTab[0]);
6154      for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
6155        blkIterator.next();
6156      }
6157
6158      while (blkIterator.hasNext()) {
6159        Block blk = blkIterator.next();
6160        final INode inode = (INode)blockManager.getBlockCollection(blk);
6161        skip++;
6162        if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
6163          String src = FSDirectory.getFullPathName(inode);
6164          if (src.startsWith(path)){
6165            corruptFiles.add(new CorruptFileBlockInfo(src, blk));
6166            count++;
6167            if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
6168              break;
6169          }
6170        }
6171      }
6172      cookieTab[0] = String.valueOf(skip);
6173      LOG.info("list corrupt file blocks returned: " + count);
6174      return corruptFiles;
6175    } finally {
6176      readUnlock();
6177    }
6178  }
6179
6180  /**
6181   * Convert string cookie to integer.
6182   */
6183  private static int getIntCookie(String cookie){
6184    int c;
6185    if(cookie == null){
6186      c = 0;
6187    } else {
6188      try{
6189        c = Integer.parseInt(cookie);
6190      }catch (NumberFormatException e) {
6191        c = 0;
6192      }
6193    }
6194    c = Math.max(0, c);
6195    return c;
6196  }
6197
6198  /**
6199   * Create delegation token secret manager
6200   */
6201  private DelegationTokenSecretManager createDelegationTokenSecretManager(
6202      Configuration conf) {
6203    return new DelegationTokenSecretManager(conf.getLong(
6204        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6205        DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6206        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6207            DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6208        conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6209            DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6210        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6211        conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6212            DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6213        this);
6214  }
6215
6216  /**
6217   * Returns the DelegationTokenSecretManager instance in the namesystem.
6218   * @return delegation token secret manager object
6219   */
6220  DelegationTokenSecretManager getDelegationTokenSecretManager() {
6221    return dtSecretManager;
6222  }
6223
6224  /**
6225   * @param renewer
6226   * @return Token<DelegationTokenIdentifier>
6227   * @throws IOException
6228   */
6229  Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6230      throws IOException {
6231    Token<DelegationTokenIdentifier> token;
6232    checkOperation(OperationCategory.WRITE);
6233    writeLock();
6234    try {
6235      checkOperation(OperationCategory.WRITE);
6236      checkNameNodeSafeMode("Cannot issue delegation token");
6237      if (!isAllowedDelegationTokenOp()) {
6238        throw new IOException(
6239          "Delegation Token can be issued only with kerberos or web authentication");
6240      }
6241      if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6242        LOG.warn("trying to get DT with no secret manager running");
6243        return null;
6244      }
6245
6246      UserGroupInformation ugi = getRemoteUser();
6247      String user = ugi.getUserName();
6248      Text owner = new Text(user);
6249      Text realUser = null;
6250      if (ugi.getRealUser() != null) {
6251        realUser = new Text(ugi.getRealUser().getUserName());
6252      }
6253      DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6254        renewer, realUser);
6255      token = new Token<DelegationTokenIdentifier>(
6256        dtId, dtSecretManager);
6257      long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6258      getEditLog().logGetDelegationToken(dtId, expiryTime);
6259    } finally {
6260      writeUnlock();
6261    }
6262    getEditLog().logSync();
6263    return token;
6264  }
6265
6266  /**
6267   * 
6268   * @param token
6269   * @return New expiryTime of the token
6270   * @throws InvalidToken
6271   * @throws IOException
6272   */
6273  long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6274      throws InvalidToken, IOException {
6275    long expiryTime;
6276    checkOperation(OperationCategory.WRITE);
6277    writeLock();
6278    try {
6279      checkOperation(OperationCategory.WRITE);
6280
6281      checkNameNodeSafeMode("Cannot renew delegation token");
6282      if (!isAllowedDelegationTokenOp()) {
6283        throw new IOException(
6284            "Delegation Token can be renewed only with kerberos or web authentication");
6285      }
6286      String renewer = getRemoteUser().getShortUserName();
6287      expiryTime = dtSecretManager.renewToken(token, renewer);
6288      DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6289      ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6290      DataInputStream in = new DataInputStream(buf);
6291      id.readFields(in);
6292      getEditLog().logRenewDelegationToken(id, expiryTime);
6293    } finally {
6294      writeUnlock();
6295    }
6296    getEditLog().logSync();
6297    return expiryTime;
6298  }
6299
6300  /**
6301   * 
6302   * @param token
6303   * @throws IOException
6304   */
6305  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
6306      throws IOException {
6307    checkOperation(OperationCategory.WRITE);
6308    writeLock();
6309    try {
6310      checkOperation(OperationCategory.WRITE);
6311
6312      checkNameNodeSafeMode("Cannot cancel delegation token");
6313      String canceller = getRemoteUser().getUserName();
6314      DelegationTokenIdentifier id = dtSecretManager
6315        .cancelToken(token, canceller);
6316      getEditLog().logCancelDelegationToken(id);
6317    } finally {
6318      writeUnlock();
6319    }
6320    getEditLog().logSync();
6321  }
6322
  /** @return a snapshot of the secret manager's state for image saving. */
  SecretManagerState saveSecretManagerState() {
    return dtSecretManager.saveSecretManagerState();
  }
6326
6327  /**
6328   * @param in load the state of secret manager from input stream
6329   */
6330  void loadSecretManagerStateCompat(DataInput in) throws IOException {
6331    dtSecretManager.loadSecretManagerStateCompat(in);
6332  }
6333
6334  void loadSecretManagerState(SecretManagerSection s,
6335      List<SecretManagerSection.DelegationKey> keys,
6336      List<SecretManagerSection.PersistToken> tokens) throws IOException {
6337    dtSecretManager.loadSecretManagerState(new SecretManagerState(s, keys, tokens));
6338  }
6339
6340  /**
6341   * Log the updateMasterKey operation to edit logs
6342   * 
6343   * @param key new delegation key.
6344   */
6345  public void logUpdateMasterKey(DelegationKey key) {
6346    
6347    assert !isInSafeMode() :
6348      "this should never be called while in safemode, since we stop " +
6349      "the DT manager before entering safemode!";
6350    // No need to hold FSN lock since we don't access any internal
6351    // structures, and this is stopped before the FSN shuts itself
6352    // down, etc.
6353    getEditLog().logUpdateMasterKey(key);
6354    getEditLog().logSync();
6355  }
6356  
6357  /**
6358   * Log the cancellation of expired tokens to edit logs
6359   * 
6360   * @param id token identifier to cancel
6361   */
6362  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
6363    assert !isInSafeMode() :
6364      "this should never be called while in safemode, since we stop " +
6365      "the DT manager before entering safemode!";
6366    // No need to hold FSN lock since we don't access any internal
6367    // structures, and this is stopped before the FSN shuts itself
6368    // down, etc.
6369    getEditLog().logCancelDelegationToken(id);
6370  }  
6371  
6372  private void logReassignLease(String leaseHolder, String src,
6373      String newHolder) {
6374    assert hasWriteLock();
6375    getEditLog().logReassignLease(leaseHolder, src, newHolder);
6376  }
6377  
6378  /**
6379   * 
6380   * @return true if delegation token operation is allowed
6381   */
6382  private boolean isAllowedDelegationTokenOp() throws IOException {
6383    return !UserGroupInformation.isSecurityEnabled()
6384        || getConnectionAuthenticationMethod().allowsDelegation();
6385  }
6386  
6387  /**
6388   * Returns authentication method used to establish the connection
6389   * @return AuthenticationMethod used to establish connection
6390   * @throws IOException
6391   */
6392  private AuthenticationMethod getConnectionAuthenticationMethod()
6393      throws IOException {
6394    UserGroupInformation ugi = getRemoteUser();
6395    AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6396    if (authMethod == AuthenticationMethod.PROXY) {
6397      authMethod = ugi.getRealUser().getAuthenticationMethod();
6398    }
6399    return authMethod;
6400  }
6401  
6402  /**
6403   * Client invoked methods are invoked over RPC and will be in 
6404   * RPC call context even if the client exits.
6405   */
6406  private boolean isExternalInvocation() {
6407    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
6408  }
6409
6410  private static InetAddress getRemoteIp() {
6411    InetAddress ip = Server.getRemoteIp();
6412    if (ip != null) {
6413      return ip;
6414    }
6415    return NamenodeWebHdfsMethods.getRemoteIp();
6416  }
6417  
  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser which is synch'ed
  /** @return the user associated with the current RPC/WebHDFS call. */
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }
6423  
6424  /**
6425   * Log fsck event in the audit log 
6426   */
6427  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
6428    if (isAuditEnabled()) {
6429      logAuditEvent(true, getRemoteUser(),
6430                    remoteAddress,
6431                    "fsck", src, null, null);
6432    }
6433  }
6434  /**
6435   * Register NameNodeMXBean
6436   */
6437  private void registerMXBean() {
6438    mxbeanName = MBeans.register("NameNode", "NameNodeInfo", this);
6439  }
6440
6441  /**
6442   * Class representing Namenode information for JMX interfaces
6443   */
6444  @Override // NameNodeMXBean
6445  public String getVersion() {
6446    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
6447  }
6448
  /** @return capacity used cluster-wide; delegates to getCapacityUsed(). */
  @Override // NameNodeMXBean
  public long getUsed() {
    return this.getCapacityUsed();
  }

  /** @return capacity remaining; delegates to getCapacityRemaining(). */
  @Override // NameNodeMXBean
  public long getFree() {
    return this.getCapacityRemaining();
  }

  /** @return total capacity; delegates to getCapacityTotal(). */
  @Override // NameNodeMXBean
  public long getTotal() {
    return this.getCapacityTotal();
  }
6463
6464  @Override // NameNodeMXBean
6465  public String getSafemode() {
6466    if (!this.isInSafeMode())
6467      return "";
6468    return "Safe mode is ON. " + this.getSafeModeTip();
6469  }
6470
  /** @return whether the last upgrade has been finalized on the FSImage. */
  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }

  /** @return non-DFS used space; delegates to datanode statistics. */
  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }

  /** @return percentage of capacity used; delegates to datanode statistics. */
  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }

  /** @return space used by this block pool; delegates to datanode statistics. */
  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }

  /** @return percent of capacity used by this block pool. */
  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }

  /** @return percentage of capacity remaining. */
  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }

  /** @return total cache capacity reported by datanodes. */
  @Override // NameNodeMXBean
  public long getCacheCapacity() {
    return datanodeStatistics.getCacheCapacity();
  }

  /** @return total cache used reported by datanodes. */
  @Override // NameNodeMXBean
  public long getCacheUsed() {
    return datanodeStatistics.getCacheUsed();
  }

  /** @return total number of blocks in the namespace. */
  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }

  /** @return total number of files; also exported as a metric. */
  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }

  /** @return number of blocks with no remaining replicas. */
  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }
  
  /** @return current JVM thread count. */
  @Override // NameNodeMXBean
  public int getThreads() {
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }
6531
6532  /**
6533   * Returned information is a JSON representation of map with host name as the
6534   * key and value is a map of live node attribute keys to its values
6535   */
6536  @Override // NameNodeMXBean
6537  public String getLiveNodes() {
6538    final Map<String, Map<String,Object>> info = 
6539      new HashMap<String, Map<String,Object>>();
6540    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6541    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6542    for (DatanodeDescriptor node : live) {
6543      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
6544          .put("infoAddr", node.getInfoAddr())
6545          .put("infoSecureAddr", node.getInfoSecureAddr())
6546          .put("xferaddr", node.getXferAddr())
6547          .put("lastContact", getLastContact(node))
6548          .put("usedSpace", getDfsUsed(node))
6549          .put("adminState", node.getAdminState().toString())
6550          .put("nonDfsUsedSpace", node.getNonDfsUsed())
6551          .put("capacity", node.getCapacity())
6552          .put("numBlocks", node.numBlocks())
6553          .put("version", node.getSoftwareVersion())
6554          .put("used", node.getDfsUsed())
6555          .put("remaining", node.getRemaining())
6556          .put("blockScheduled", node.getBlocksScheduled())
6557          .put("blockPoolUsed", node.getBlockPoolUsed())
6558          .put("blockPoolUsedPercent", node.getBlockPoolUsedPercent())
6559          .put("volfails", node.getVolumeFailures())
6560          .build();
6561
6562      info.put(node.getHostName(), innerinfo);
6563    }
6564    return JSON.toString(info);
6565  }
6566
6567  /**
6568   * Returned information is a JSON representation of map with host name as the
6569   * key and value is a map of dead node attribute keys to its values
6570   */
6571  @Override // NameNodeMXBean
6572  public String getDeadNodes() {
6573    final Map<String, Map<String, Object>> info = 
6574      new HashMap<String, Map<String, Object>>();
6575    final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6576    blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
6577    for (DatanodeDescriptor node : dead) {
6578      Map<String, Object> innerinfo = ImmutableMap.<String, Object>builder()
6579          .put("lastContact", getLastContact(node))
6580          .put("decommissioned", node.isDecommissioned())
6581          .put("xferaddr", node.getXferAddr())
6582          .build();
6583      info.put(node.getHostName(), innerinfo);
6584    }
6585    return JSON.toString(info);
6586  }
6587
6588  /**
6589   * Returned information is a JSON representation of map with host name as the
6590   * key and value is a map of decomisioning node attribute keys to its values
6591   */
6592  @Override // NameNodeMXBean
6593  public String getDecomNodes() {
6594    final Map<String, Map<String, Object>> info = 
6595      new HashMap<String, Map<String, Object>>();
6596    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
6597        ).getDecommissioningNodes();
6598    for (DatanodeDescriptor node : decomNodeList) {
6599      Map<String, Object> innerinfo = ImmutableMap
6600          .<String, Object> builder()
6601          .put("xferaddr", node.getXferAddr())
6602          .put("underReplicatedBlocks",
6603              node.decommissioningStatus.getUnderReplicatedBlocks())
6604          .put("decommissionOnlyReplicas",
6605              node.decommissioningStatus.getDecommissionOnlyReplicas())
6606          .put("underReplicateInOpenFiles",
6607              node.decommissioningStatus.getUnderReplicatedInOpenFiles())
6608          .build();
6609      info.put(node.getHostName(), innerinfo);
6610    }
6611    return JSON.toString(info);
6612  }
6613
  /** @return seconds elapsed since the node's last heartbeat/update. */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (Time.now() - alivenode.getLastUpdate())/1000;
  }

  /** @return DFS space used on the given node, as reported by it. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
6621
  /** @return the cluster ID recorded in the FSImage storage. */
  @Override  // NameNodeMXBean
  public String getClusterId() {
    return dir.fsImage.getStorage().getClusterID();
  }
  
  /** @return the block pool ID served by this namesystem. */
  @Override  // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
6631  
6632  @Override  // NameNodeMXBean
6633  public String getNameDirStatuses() {
6634    Map<String, Map<File, StorageDirType>> statusMap =
6635      new HashMap<String, Map<File, StorageDirType>>();
6636    
6637    Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
6638    for (Iterator<StorageDirectory> it
6639        = getFSImage().getStorage().dirIterator(); it.hasNext();) {
6640      StorageDirectory st = it.next();
6641      activeDirs.put(st.getRoot(), st.getStorageDirType());
6642    }
6643    statusMap.put("active", activeDirs);
6644    
6645    List<Storage.StorageDirectory> removedStorageDirs
6646        = getFSImage().getStorage().getRemovedStorageDirs();
6647    Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
6648    for (StorageDirectory st : removedStorageDirs) {
6649      failedDirs.put(st.getRoot(), st.getStorageDirType());
6650    }
6651    statusMap.put("failed", failedDirs);
6652    
6653    return JSON.toString(statusMap);
6654  }
6655
6656  @Override // NameNodeMXBean
6657  public String getNodeUsage() {
6658    float median = 0;
6659    float max = 0;
6660    float min = 0;
6661    float dev = 0;
6662
6663    final Map<String, Map<String,Object>> info =
6664        new HashMap<String, Map<String,Object>>();
6665    final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6666    blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6667
6668    if (live.size() > 0) {
6669      float totalDfsUsed = 0;
6670      float[] usages = new float[live.size()];
6671      int i = 0;
6672      for (DatanodeDescriptor dn : live) {
6673        usages[i++] = dn.getDfsUsedPercent();
6674        totalDfsUsed += dn.getDfsUsedPercent();
6675      }
6676      totalDfsUsed /= live.size();
6677      Arrays.sort(usages);
6678      median = usages[usages.length / 2];
6679      max = usages[usages.length - 1];
6680      min = usages[0];
6681
6682      for (i = 0; i < usages.length; i++) {
6683        dev += (usages[i] - totalDfsUsed) * (usages[i] - totalDfsUsed);
6684      }
6685      dev = (float) Math.sqrt(dev / usages.length);
6686    }
6687
6688    final Map<String, Object> innerInfo = new HashMap<String, Object>();
6689    innerInfo.put("min", StringUtils.format("%.2f%%", min));
6690    innerInfo.put("median", StringUtils.format("%.2f%%", median));
6691    innerInfo.put("max", StringUtils.format("%.2f%%", max));
6692    innerInfo.put("stdDev", StringUtils.format("%.2f%%", dev));
6693    info.put("nodeUsage", innerInfo);
6694
6695    return JSON.toString(info);
6696  }
6697
6698  @Override  // NameNodeMXBean
6699  public String getNameJournalStatus() {
6700    List<Map<String, String>> jasList = new ArrayList<Map<String, String>>();
6701    FSEditLog log = getFSImage().getEditLog();
6702    if (log != null) {
6703      boolean openForWrite = log.isOpenForWrite();
6704      for (JournalAndStream jas : log.getJournals()) {
6705        final Map<String, String> jasMap = new HashMap<String, String>();
6706        String manager = jas.getManager().toString();
6707
6708        jasMap.put("required", String.valueOf(jas.isRequired()));
6709        jasMap.put("disabled", String.valueOf(jas.isDisabled()));
6710        jasMap.put("manager", manager);
6711
6712        if (jas.isDisabled()) {
6713          jasMap.put("stream", "Failed");
6714        } else if (openForWrite) {
6715          EditLogOutputStream elos = jas.getCurrentStream();
6716          if (elos != null) {
6717            jasMap.put("stream", elos.generateReport());
6718          } else {
6719            jasMap.put("stream", "not currently writing");
6720          }
6721        } else {
6722          jasMap.put("stream", "open for read");
6723        }
6724        jasList.add(jasMap);
6725      }
6726    }
6727    return JSON.toString(jasList);
6728  }
6729
6730  @Override // NameNodeMxBean
6731  public String getJournalTransactionInfo() {
6732    Map<String, String> txnIdMap = new HashMap<String, String>();
6733    txnIdMap.put("LastAppliedOrWrittenTxId",
6734        Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
6735    txnIdMap.put("MostRecentCheckpointTxId",
6736        Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
6737    return JSON.toString(txnIdMap);
6738  }
6739  
  /** @return string form of the NameNode start time. */
  @Override  // NameNodeMXBean
  public String getNNStarted() {
    return getStartTime().toString();
  }

  /** @return build date, user and branch this software was compiled from. */
  @Override  // NameNodeMXBean
  public String getCompileInfo() {
    return VersionInfo.getDate() + " by " + VersionInfo.getUser() +
        " from " + VersionInfo.getBranch();
  }
6750
  /** @return the block manager for this namesystem. */
  public BlockManager getBlockManager() {
    return blockManager;
  }
  /** @return the FSDirectory backing this namesystem. */
  public FSDirectory getFSDirectory() {
    return dir;
  }
  /** @return the centralized cache manager. */
  public CacheManager getCacheManager() {
    return cacheManager;
  }
6763
6764  @Override  // NameNodeMXBean
6765  public String getCorruptFiles() {
6766    List<String> list = new ArrayList<String>();
6767    Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks;
6768    try {
6769      corruptFileBlocks = listCorruptFileBlocks("/", null);
6770      int corruptFileCount = corruptFileBlocks.size();
6771      if (corruptFileCount != 0) {
6772        for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
6773          list.add(c.toString());
6774        }
6775      }
6776    } catch (IOException e) {
6777      LOG.warn("Get corrupt file blocks returned error: " + e.getMessage());
6778    }
6779    return JSON.toString(list);
6780  }
6781
  /** @return number of distinct datanode software versions in the cluster. */
  @Override  //NameNodeMXBean
  public int getDistinctVersionCount() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
      .size();
  }

  /** @return map of datanode software version to a count, as reported by
   *  the datanode manager. */
  @Override  //NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }

  /** @return the software version of this NameNode build. */
  @Override  //NameNodeMXBean
  public String getSoftwareVersion() {
    return VersionInfo.getVersion();
  }
6797
6798  /**
6799   * Verifies that the given identifier and password are valid and match.
6800   * @param identifier Token identifier.
6801   * @param password Password in the token.
6802   */
6803  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
6804      byte[] password) throws InvalidToken, RetriableException {
6805    try {
6806      getDelegationTokenSecretManager().verifyToken(identifier, password);
6807    } catch (InvalidToken it) {
6808      if (inTransitionToActive()) {
6809        throw new RetriableException(it);
6810      }
6811      throw it;
6812    }
6813  }
6814  
6815  @Override
6816  public boolean isGenStampInFuture(Block block) {
6817    if (isLegacyBlock(block)) {
6818      return block.getGenerationStamp() > getGenerationStampV1();
6819    } else {
6820      return block.getGenerationStamp() > getGenerationStampV2();
6821    }
6822  }
6823
  /** Test hook: the edit log tailer used in standby mode. */
  @VisibleForTesting
  public EditLogTailer getEditLogTailer() {
    return editLogTailer;
  }
  
  /** Test hook: replace the edit log tailer. */
  @VisibleForTesting
  public void setEditLogTailerForTests(EditLogTailer tailer) {
    this.editLogTailer = tailer;
  }
  
  /** Test hook: replace the coarse-grained FSN lock. */
  @VisibleForTesting
  void setFsLockForTests(ReentrantReadWriteLock lock) {
    this.fsLock.coarseLock = lock;
  }
  
  /** Test hook: the coarse-grained FSN lock. */
  @VisibleForTesting
  public ReentrantReadWriteLock getFsLockForTests() {
    return fsLock.coarseLock;
  }
  
  /** Test hook: the long-read lock. */
  @VisibleForTesting
  public ReentrantLock getLongReadLockForTests() {
    return fsLock.longReadLock;
  }

  /** Test hook: the current safe mode information object. */
  @VisibleForTesting
  public SafeModeInfo getSafeModeInfoForTests() {
    return safeMode;
  }
  
  /** Test hook: replace the NameNode resource checker. */
  @VisibleForTesting
  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
    this.nnResourceChecker = nnResourceChecker;
  }
6858
  /** Whether the datanode manager is avoiding stale nodes for writes. */
  @Override
  public boolean isAvoidingStaleDataNodesForWrite() {
    return this.blockManager.getDatanodeManager()
        .shouldAvoidStaleDataNodesForWrite();
  }

  /** @return live datanodes excluding those decommissioned-but-live. */
  @Override // FSClusterStats
  public int getNumDatanodesInService() {
    return getNumLiveDataNodes() - getNumDecomLiveDataNodes();
  }

  /** @return the snapshot manager for this namesystem. */
  public SnapshotManager getSnapshotManager() {
    return snapshotManager;
  }
6873  
  /** Allow snapshot on a directory (superuser only). */
  void allowSnapshot(String path) throws SafeModeException, IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check under the lock: state may have changed (e.g. HA failover).
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();

      dir.writeLock();
      try {
        snapshotManager.setSnapshottable(path, true);
      } finally {
        dir.writeUnlock();
      }
      getEditLog().logAllowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the FSN lock to avoid blocking on I/O.
    getEditLog().logSync();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "allowSnapshot", path, null, null);
    }
  }
6899  
  /** Disallow snapshot on a directory (superuser only). */
  void disallowSnapshot(String path) throws SafeModeException, IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check under the lock: state may have changed (e.g. HA failover).
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();

      dir.writeLock();
      try {
        snapshotManager.resetSnapshottable(path);
      } finally {
        dir.writeUnlock();
      }
      getEditLog().logDisallowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // Sync the edit log outside the FSN lock to avoid blocking on I/O.
    getEditLog().logSync();
    
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "disallowSnapshot", path, null, null);
    }
  }
6925  
6926  /**
6927   * Create a snapshot
6928   * @param snapshotRoot The directory path where the snapshot is taken
6929   * @param snapshotName The name of the snapshot
6930   */
6931  String createSnapshot(String snapshotRoot, String snapshotName)
6932      throws SafeModeException, IOException {
6933    checkOperation(OperationCategory.WRITE);
6934    final FSPermissionChecker pc = getPermissionChecker();
6935    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
6936        null);
6937    if (cacheEntry != null && cacheEntry.isSuccess()) {
6938      return (String) cacheEntry.getPayload();
6939    }
6940    String snapshotPath = null;
6941    writeLock();
6942    try {
6943      checkOperation(OperationCategory.WRITE);
6944      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
6945      if (isPermissionEnabled) {
6946        checkOwner(pc, snapshotRoot);
6947      }
6948
6949      if (snapshotName == null || snapshotName.isEmpty()) {
6950        snapshotName = Snapshot.generateDefaultSnapshotName();
6951      }
6952      if(snapshotName != null){
6953        if (!DFSUtil.isValidNameForComponent(snapshotName)) {
6954            throw new InvalidPathException("Invalid snapshot name: "
6955                + snapshotName);
6956        }
6957      }
6958      dir.verifySnapshotName(snapshotName, snapshotRoot);
6959      dir.writeLock();
6960      try {
6961        snapshotPath = snapshotManager.createSnapshot(snapshotRoot, snapshotName);
6962      } finally {
6963        dir.writeUnlock();
6964      }
6965      getEditLog().logCreateSnapshot(snapshotRoot, snapshotName,
6966          cacheEntry != null);
6967    } finally {
6968      writeUnlock();
6969      RetryCache.setState(cacheEntry, snapshotPath != null, snapshotPath);
6970    }
6971    getEditLog().logSync();
6972    
6973    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
6974      logAuditEvent(true, "createSnapshot", snapshotRoot, snapshotPath, null);
6975    }
6976    return snapshotPath;
6977  }
6978  
6979  /**
6980   * Rename a snapshot
6981   * @param path The directory path where the snapshot was taken
6982   * @param snapshotOldName Old snapshot name
6983   * @param snapshotNewName New snapshot name
6984   * @throws SafeModeException
6985   * @throws IOException 
6986   */
6987  void renameSnapshot(String path, String snapshotOldName,
6988      String snapshotNewName) throws SafeModeException, IOException {
6989    checkOperation(OperationCategory.WRITE);
6990    final FSPermissionChecker pc = getPermissionChecker();
6991    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
6992    if (cacheEntry != null && cacheEntry.isSuccess()) {
6993      return; // Return previous response
6994    }
6995    writeLock();
6996    boolean success = false;
6997    try {
6998      checkOperation(OperationCategory.WRITE);
6999      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
7000      if (isPermissionEnabled) {
7001        checkOwner(pc, path);
7002      }
7003      dir.verifySnapshotName(snapshotNewName, path);
7004      
7005      snapshotManager.renameSnapshot(path, snapshotOldName, snapshotNewName);
7006      getEditLog().logRenameSnapshot(path, snapshotOldName, snapshotNewName,
7007          cacheEntry != null);
7008      success = true;
7009    } finally {
7010      writeUnlock();
7011      RetryCache.setState(cacheEntry, success);
7012    }
7013    getEditLog().logSync();
7014    
7015    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
7016      String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
7017      String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
7018      logAuditEvent(true, "renameSnapshot", oldSnapshotRoot, newSnapshotRoot, null);
7019    }
7020  }
7021  
7022  /**
7023   * Get the list of snapshottable directories that are owned 
7024   * by the current user. Return all the snapshottable directories if the 
7025   * current user is a super user.
7026   * @return The list of all the current snapshottable directories
7027   * @throws IOException
7028   */
7029  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
7030      throws IOException {
7031    SnapshottableDirectoryStatus[] status = null;
7032    checkOperation(OperationCategory.READ);
7033    final FSPermissionChecker checker = getPermissionChecker();
7034    readLock();
7035    try {
7036      checkOperation(OperationCategory.READ);
7037      final String user = checker.isSuperUser()? null : checker.getUser();
7038      status = snapshotManager.getSnapshottableDirListing(user);
7039    } finally {
7040      readUnlock();
7041    }
7042    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
7043      logAuditEvent(true, "listSnapshottableDirectory", null, null, null);
7044    }
7045    return status;
7046  }
7047  
7048  /**
7049   * Get the difference between two snapshots (or between a snapshot and the
7050   * current status) of a snapshottable directory.
7051   * 
7052   * @param path The full path of the snapshottable directory.
7053   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
7054   *          or empty string indicates the current tree.
7055   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
7056   *          empty string indicates the current tree.
7057   * @return A report about the difference between {@code fromSnapshot} and 
7058   *         {@code toSnapshot}. Modified/deleted/created/renamed files and 
7059   *         directories belonging to the snapshottable directories are listed 
7060   *         and labeled as M/-/+/R respectively. 
7061   * @throws IOException
7062   */
7063  SnapshotDiffReport getSnapshotDiffReport(String path,
7064      String fromSnapshot, String toSnapshot) throws IOException {
7065    SnapshotDiffInfo diffs = null;
7066    checkOperation(OperationCategory.READ);
7067    final FSPermissionChecker pc = getPermissionChecker();
7068    readLock();
7069    try {
7070      checkOperation(OperationCategory.READ);
7071      if (isPermissionEnabled) {
7072        checkSubtreeReadPermission(pc, path, fromSnapshot);
7073        checkSubtreeReadPermission(pc, path, toSnapshot);
7074      }
7075      diffs = snapshotManager.diff(path, fromSnapshot, toSnapshot);
7076    } finally {
7077      readUnlock();
7078    }
7079    
7080    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
7081      logAuditEvent(true, "computeSnapshotDiff", null, null, null);
7082    }
7083    return diffs != null ? diffs.generateReport() : new SnapshotDiffReport(
7084        path, fromSnapshot, toSnapshot,
7085        Collections.<DiffReportEntry> emptyList());
7086  }
7087  
7088  private void checkSubtreeReadPermission(final FSPermissionChecker pc,
7089      final String snapshottablePath, final String snapshot)
7090          throws AccessControlException, UnresolvedLinkException {
7091    final String fromPath = snapshot == null?
7092        snapshottablePath: Snapshot.getSnapshotPath(snapshottablePath, snapshot);
7093    checkPermission(pc, fromPath, false, null, null, FsAction.READ, FsAction.READ);
7094  }
7095  
7096  /**
7097   * Delete a snapshot of a snapshottable directory
7098   * @param snapshotRoot The snapshottable directory
7099   * @param snapshotName The name of the to-be-deleted snapshot
7100   * @throws SafeModeException
7101   * @throws IOException
7102   */
7103  void deleteSnapshot(String snapshotRoot, String snapshotName)
7104      throws SafeModeException, IOException {
7105    checkOperation(OperationCategory.WRITE);
7106    final FSPermissionChecker pc = getPermissionChecker();
7107    
7108    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
7109    if (cacheEntry != null && cacheEntry.isSuccess()) {
7110      return; // Return previous response
7111    }
7112    boolean success = false;
7113    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
7114    writeLock();
7115    try {
7116      checkOperation(OperationCategory.WRITE);
7117      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
7118      if (isPermissionEnabled) {
7119        checkOwner(pc, snapshotRoot);
7120      }
7121
7122      List<INode> removedINodes = new ChunkedArrayList<INode>();
7123      dir.writeLock();
7124      try {
7125        snapshotManager.deleteSnapshot(snapshotRoot, snapshotName,
7126            collectedBlocks, removedINodes);
7127        dir.removeFromInodeMap(removedINodes);
7128      } finally {
7129        dir.writeUnlock();
7130      }
7131      removedINodes.clear();
7132      getEditLog().logDeleteSnapshot(snapshotRoot, snapshotName,
7133          cacheEntry != null);
7134      success = true;
7135    } finally {
7136      writeUnlock();
7137      RetryCache.setState(cacheEntry, success);
7138    }
7139    getEditLog().logSync();
7140
7141    removeBlocks(collectedBlocks);
7142    collectedBlocks.clear();
7143
7144    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
7145      String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
7146      logAuditEvent(true, "deleteSnapshot", rootPath, null, null);
7147    }
7148  }
7149
7150  /**
7151   * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
7152   * @param toRemove the list of INodeDirectorySnapshottable to be removed
7153   */
7154  void removeSnapshottableDirs(List<INodeDirectorySnapshottable> toRemove) {
7155    if (snapshotManager != null) {
7156      snapshotManager.removeSnapshottable(toRemove);
7157    }
7158  }
7159
  /**
   * Query the current rolling upgrade status (superuser only).
   * @return the current rolling upgrade info, refreshed with whether a
   *         rollback image exists, or null if no upgrade is in progress
   */
  RollingUpgradeInfo queryRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      if (rollingUpgradeInfo != null) {
        // Refresh the rollback-image flag so clients see current state.
        boolean hasRollbackImage = this.getFSImage().hasRollbackFSImage();
        rollingUpgradeInfo.setCreatedRollbackImages(hasRollbackImage);
      }
      return rollingUpgradeInfo;
    } finally {
      readUnlock();
    }
  }
7174
  /**
   * Start a rolling upgrade (superuser only). In non-HA mode the NN must be
   * in safe mode so a rollback image can be saved; in HA mode it must NOT
   * be in safe mode.
   * @return the rolling upgrade info recorded for this upgrade
   */
  RollingUpgradeInfo startRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      long startTime = now();
      if (!haEnabled) { // for non-HA, we require NN to be in safemode
        startRollingUpgradeInternalForNonHA(startTime);
      } else { // for HA, NN cannot be in safemode
        checkNameNodeSafeMode("Failed to start rolling upgrade");
        startRollingUpgradeInternal(startTime);
      }

      getEditLog().logStartRollingUpgrade(rollingUpgradeInfo.getStartTime());
      if (haEnabled) {
        // roll the edit log to make sure the standby NameNode can tail
        getFSImage().rollEditLog();
      }
    } finally {
      writeUnlock();
    }

    // Sync the edit log outside the FSN lock to avoid blocking on I/O.
    getEditLog().logSync();
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "startRollingUpgrade", null, null, null);
    }
    return rollingUpgradeInfo;
  }
7204
7205  /**
7206   * Update internal state to indicate that a rolling upgrade is in progress.
7207   * @param startTime
7208   */
7209  void startRollingUpgradeInternal(long startTime)
7210      throws IOException {
7211    checkRollingUpgrade("start rolling upgrade");
7212    getFSImage().checkUpgrade(this);
7213    setRollingUpgradeInfo(false, startTime);
7214  }
7215
7216  /**
7217   * Update internal state to indicate that a rolling upgrade is in progress for
7218   * non-HA setup. This requires the namesystem is in SafeMode and after doing a
7219   * checkpoint for rollback the namesystem will quit the safemode automatically 
7220   */
7221  private void startRollingUpgradeInternalForNonHA(long startTime)
7222      throws IOException {
7223    Preconditions.checkState(!haEnabled);
7224    if (!isInSafeMode()) {
7225      throw new IOException("Safe mode should be turned ON "
7226          + "in order to create namespace image.");
7227    }
7228    checkRollingUpgrade("start rolling upgrade");
7229    getFSImage().checkUpgrade(this);
7230    // in non-HA setup, we do an extra ckpt to generate a rollback image
7231    getFSImage().saveNamespace(this, NameNodeFile.IMAGE_ROLLBACK, null);
7232    LOG.info("Successfully saved namespace for preparing rolling upgrade.");
7233
7234    // leave SafeMode automatically
7235    setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
7236    setRollingUpgradeInfo(true, startTime);
7237  }
7238
7239  void setRollingUpgradeInfo(boolean createdRollbackImages, long startTime) {
7240    rollingUpgradeInfo = new RollingUpgradeInfo(blockPoolId,
7241        createdRollbackImages, startTime, 0L);
7242  }
7243
7244  public void setCreatedRollbackImages(boolean created) {
7245    if (rollingUpgradeInfo != null) {
7246      rollingUpgradeInfo.setCreatedRollbackImages(created);
7247    }
7248  }
7249
  /** @return the current rolling upgrade info, or null if none in progress */
  public RollingUpgradeInfo getRollingUpgradeInfo() {
    return rollingUpgradeInfo;
  }
7253
  /** @return true if a rollback fsimage is needed */
  public boolean isNeedRollbackFsImage() {
    return needRollbackFsImage;
  }
7257
  /** @param needRollbackFsImage whether a rollback fsimage is needed */
  public void setNeedRollbackFsImage(boolean needRollbackFsImage) {
    this.needRollbackFsImage = needRollbackFsImage;
  }
7261
7262  @Override  // NameNodeMXBean
7263  public RollingUpgradeInfo.Bean getRollingUpgradeStatus() {
7264    readLock();
7265    try {
7266      RollingUpgradeInfo upgradeInfo = getRollingUpgradeInfo();
7267      if (upgradeInfo != null) {
7268        return new RollingUpgradeInfo.Bean(upgradeInfo);
7269      }
7270      return null;
7271    } finally {
7272      readUnlock();
7273    }
7274  }
7275
  /**
   * Is rolling upgrade in progress?
   * @return true iff a rolling upgrade has been started and not yet finalized
   */
  public boolean isRollingUpgrade() {
    return rollingUpgradeInfo != null;
  }
7280
7281  void checkRollingUpgrade(String action) throws RollingUpgradeException {
7282    if (isRollingUpgrade()) {
7283      throw new RollingUpgradeException("Failed to " + action
7284          + " since a rolling upgrade is already in progress."
7285          + " Existing rolling upgrade info:\n" + rollingUpgradeInfo);
7286    }
7287  }
7288
  /**
   * Finalize the rolling upgrade that is currently in progress. Requires
   * superuser privilege. Saves a new namespace image and promotes the
   * rollback checkpoint to a regular image.
   *
   * @return info about the finalized upgrade, including the finalize time
   * @throws IOException if the caller lacks superuser privilege, the NN is
   *         in safe mode, or no rolling upgrade is in progress
   */
  RollingUpgradeInfo finalizeRollingUpgrade() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.WRITE);
    writeLock();
    final RollingUpgradeInfo returnInfo;
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Failed to finalize rolling upgrade");

      returnInfo = finalizeRollingUpgradeInternal(now());
      getEditLog().logFinalizeRollingUpgrade(returnInfo.getFinalizeTime());
      getFSImage().saveNamespace(this);
      // The rollback image is no longer needed; promote it to a regular
      // checkpoint image.
      getFSImage().renameCheckpoint(NameNodeFile.IMAGE_ROLLBACK,
          NameNodeFile.IMAGE);
    } finally {
      writeUnlock();
    }

    // getEditLog().logSync() is not needed since it does saveNamespace 

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "finalizeRollingUpgrade", null, null, null);
    }
    return returnInfo;
  }
7314
7315  RollingUpgradeInfo finalizeRollingUpgradeInternal(long finalizeTime)
7316      throws RollingUpgradeException {
7317    if (!isRollingUpgrade()) {
7318      throw new RollingUpgradeException(
7319          "Failed to finalize rolling upgrade since there is no rolling upgrade in progress.");
7320    }
7321
7322    final long startTime = rollingUpgradeInfo.getStartTime();
7323    rollingUpgradeInfo = null;
7324    return new RollingUpgradeInfo(blockPoolId, false, startTime, finalizeTime);
7325  }
7326
  /**
   * Add a new cache directive.
   *
   * @param directive the directive to add; must not carry an id (ids are
   *        assigned by the CacheManager)
   * @param flags if FORCE is absent, wait for a CacheManager rescan before
   *        taking the write lock
   * @return the id assigned to the new directive
   * @throws IOException if the NN is in safe mode, an id was supplied, or
   *         the CacheManager rejects the directive
   */
  long addCacheDirective(CacheDirectiveInfo directive, EnumSet<CacheFlag> flags)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    // Payload-carrying retry cache entry: a retried RPC returns the id
    // assigned by the original invocation instead of re-executing.
    CacheEntryWithPayload cacheEntry =
        RetryCache.waitForCompletion(retryCache, null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return (Long) cacheEntry.getPayload();
    }
    boolean success = false;
    if (!flags.contains(CacheFlag.FORCE)) {
      // Rescan wait happens before the write lock is taken.
      cacheManager.waitForRescanIfNeeded();
    }
    writeLock();
    Long result = null;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache directive", safeMode);
      }
      if (directive.getId() != null) {
        throw new IOException("addDirective: you cannot specify an ID " +
            "for this operation.");
      }
      CacheDirectiveInfo effectiveDirective = 
          cacheManager.addDirective(directive, pc, flags);
      getEditLog().logAddCacheDirectiveInfo(effectiveDirective,
          cacheEntry != null);
      result = effectiveDirective.getId();
      success = true;
    } finally {
      writeUnlock();
      // Sync only on success, after releasing the write lock.
      if (success) {
        getEditLog().logSync();
      }
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "addCacheDirective", null, null, null);
      }
      RetryCache.setState(cacheEntry, success, result);
    }
    return result;
  }
7371
7372  void modifyCacheDirective(CacheDirectiveInfo directive,
7373      EnumSet<CacheFlag> flags) throws IOException {
7374    checkOperation(OperationCategory.WRITE);
7375    final FSPermissionChecker pc = isPermissionEnabled ?
7376        getPermissionChecker() : null;
7377    boolean success = false;
7378    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
7379    if (cacheEntry != null && cacheEntry.isSuccess()) {
7380      return;
7381    }
7382    if (!flags.contains(CacheFlag.FORCE)) {
7383      cacheManager.waitForRescanIfNeeded();
7384    }
7385    writeLock();
7386    try {
7387      checkOperation(OperationCategory.WRITE);
7388      if (isInSafeMode()) {
7389        throw new SafeModeException(
7390            "Cannot add cache directive", safeMode);
7391      }
7392      cacheManager.modifyDirective(directive, pc, flags);
7393      getEditLog().logModifyCacheDirectiveInfo(directive,
7394          cacheEntry != null);
7395      success = true;
7396    } finally {
7397      writeUnlock();
7398      if (success) {
7399        getEditLog().logSync();
7400      }
7401      if (isAuditEnabled() && isExternalInvocation()) {
7402        logAuditEvent(success, "modifyCacheDirective", null, null, null);
7403      }
7404      RetryCache.setState(cacheEntry, success);
7405    }
7406  }
7407
  /**
   * Remove the cache directive with the given id.
   *
   * @param id id of the directive to remove
   * @throws IOException if the NN is in safe mode or the CacheManager
   *         rejects the removal
   */
  void removeCacheDirective(Long id) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    // Short-circuit retried RPCs that already succeeded.
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return;
    }
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache directives", safeMode);
      }
      cacheManager.removeDirective(id, pc);
      getEditLog().logRemoveCacheDirectiveInfo(id, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "removeCacheDirective", null, null,
            null);
      }
      RetryCache.setState(cacheEntry, success);
    }
    // NOTE: unlike add/modifyCacheDirective, the sync here runs
    // unconditionally after the lock is released, matching the cache pool
    // methods below.
    getEditLog().logSync();
  }
7437
7438  BatchedListEntries<CacheDirectiveEntry> listCacheDirectives(
7439      long startId, CacheDirectiveInfo filter) throws IOException {
7440    checkOperation(OperationCategory.READ);
7441    final FSPermissionChecker pc = isPermissionEnabled ?
7442        getPermissionChecker() : null;
7443    BatchedListEntries<CacheDirectiveEntry> results;
7444    cacheManager.waitForRescanIfNeeded();
7445    readLock();
7446    boolean success = false;
7447    try {
7448      checkOperation(OperationCategory.READ);
7449      results =
7450          cacheManager.listCacheDirectives(startId, filter, pc);
7451      success = true;
7452    } finally {
7453      readUnlock();
7454      if (isAuditEnabled() && isExternalInvocation()) {
7455        logAuditEvent(success, "listCacheDirectives", null, null,
7456            null);
7457      }
7458    }
7459    return results;
7460  }
7461
  /**
   * Add a new cache pool. Requires superuser privilege when permissions are
   * enabled.
   *
   * @param req the pool to create
   * @throws IOException if the NN is in safe mode, the caller lacks
   *         superuser privilege, or the CacheManager rejects the pool
   */
  public void addCachePool(CachePoolInfo req) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = isPermissionEnabled ?
        getPermissionChecker() : null;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot add cache pool " + req.getPoolName(), safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      // CacheManager fills in defaults; log the effective pool info.
      CachePoolInfo info = cacheManager.addCachePool(req);
      getEditLog().logAddCachePool(info, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "addCachePool", req.getPoolName(), null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }
    
    // Sync unconditionally after releasing the write lock.
    getEditLog().logSync();
  }
7494
  /**
   * Modify an existing cache pool. Requires superuser privilege when
   * permissions are enabled.
   *
   * @param req the pool fields to change, identified by pool name
   * @throws IOException if the NN is in safe mode, the caller lacks
   *         superuser privilege, or the CacheManager rejects the change
   */
  public void modifyCachePool(CachePoolInfo req) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot modify cache pool " + req.getPoolName(), safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      cacheManager.modifyCachePool(req);
      getEditLog().logModifyCachePool(req, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "modifyCachePool", req.getPoolName(), null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }

    // Sync unconditionally after releasing the write lock.
    getEditLog().logSync();
  }
7527
  /**
   * Remove a cache pool by name. Requires superuser privilege when
   * permissions are enabled.
   *
   * @param cachePoolName name of the pool to remove
   * @throws IOException if the NN is in safe mode, the caller lacks
   *         superuser privilege, or the CacheManager rejects the removal
   */
  public void removeCachePool(String cachePoolName) throws IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc =
        isPermissionEnabled ? getPermissionChecker() : null;
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      if (isInSafeMode()) {
        throw new SafeModeException(
            "Cannot remove cache pool " + cachePoolName, safeMode);
      }
      if (pc != null) {
        pc.checkSuperuserPrivilege();
      }
      cacheManager.removeCachePool(cachePoolName);
      getEditLog().logRemoveCachePool(cachePoolName, cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      if (isAuditEnabled() && isExternalInvocation()) {
        logAuditEvent(success, "removeCachePool", cachePoolName, null, null);
      }
      RetryCache.setState(cacheEntry, success);
    }
    
    // Sync unconditionally after releasing the write lock.
    getEditLog().logSync();
  }
7560
7561  public BatchedListEntries<CachePoolEntry> listCachePools(String prevKey)
7562      throws IOException {
7563    final FSPermissionChecker pc =
7564        isPermissionEnabled ? getPermissionChecker() : null;
7565    BatchedListEntries<CachePoolEntry> results;
7566    checkOperation(OperationCategory.READ);
7567    boolean success = false;
7568    cacheManager.waitForRescanIfNeeded();
7569    readLock();
7570    try {
7571      checkOperation(OperationCategory.READ);
7572      results = cacheManager.listCachePools(pc, prevKey);
7573      success = true;
7574    } finally {
7575      readUnlock();
7576      if (isAuditEnabled() && isExternalInvocation()) {
7577        logAuditEvent(success, "listCachePools", null, null, null);
7578      }
7579    }
7580    return results;
7581  }
7582
  /**
   * Merge the given ACL entries into the existing ACL of {@code src}.
   * The caller must own the path.
   *
   * @param src path whose ACL is modified (may be a reserved .inodes path)
   * @param aclSpec the ACL entries to merge in
   * @throws IOException if ACLs are disabled, the NN is in safe mode, or
   *         the caller does not own the path
   */
  void modifyAclEntries(String src, List<AclEntry> aclSpec) throws IOException {
    aclConfigFlag.checkForApiCall();
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    // Resolve reserved /.reserved/.inodes paths before taking the lock.
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot modify ACL entries on " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      checkOwner(pc, src);
      dir.modifyAclEntries(src, aclSpec);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "modifyAclEntries", src, null, resultingStat);
  }
7603
7604  void removeAclEntries(String src, List<AclEntry> aclSpec) throws IOException {
7605    aclConfigFlag.checkForApiCall();
7606    HdfsFileStatus resultingStat = null;
7607    FSPermissionChecker pc = getPermissionChecker();
7608    checkOperation(OperationCategory.WRITE);
7609    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
7610    writeLock();
7611    try {
7612      checkOperation(OperationCategory.WRITE);
7613      checkNameNodeSafeMode("Cannot remove ACL entries on " + src);
7614      src = FSDirectory.resolvePath(src, pathComponents, dir);
7615      checkOwner(pc, src);
7616      dir.removeAclEntries(src, aclSpec);
7617      resultingStat = getAuditFileInfo(src, false);
7618    } finally {
7619      writeUnlock();
7620    }
7621    getEditLog().logSync();
7622    logAuditEvent(true, "removeAclEntries", src, null, resultingStat);
7623  }
7624
7625  void removeDefaultAcl(String src) throws IOException {
7626    aclConfigFlag.checkForApiCall();
7627    HdfsFileStatus resultingStat = null;
7628    FSPermissionChecker pc = getPermissionChecker();
7629    checkOperation(OperationCategory.WRITE);
7630    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
7631    writeLock();
7632    try {
7633      checkOperation(OperationCategory.WRITE);
7634      checkNameNodeSafeMode("Cannot remove default ACL entries on " + src);
7635      src = FSDirectory.resolvePath(src, pathComponents, dir);
7636      checkOwner(pc, src);
7637      dir.removeDefaultAcl(src);
7638      resultingStat = getAuditFileInfo(src, false);
7639    } finally {
7640      writeUnlock();
7641    }
7642    getEditLog().logSync();
7643    logAuditEvent(true, "removeDefaultAcl", src, null, resultingStat);
7644  }
7645
  /**
   * Remove the entire ACL from {@code src}. The caller must own the path.
   *
   * @param src path whose ACL is removed (may be a reserved .inodes path)
   * @throws IOException if ACLs are disabled, the NN is in safe mode, or
   *         the caller does not own the path
   */
  void removeAcl(String src) throws IOException {
    aclConfigFlag.checkForApiCall();
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    // Resolve reserved /.reserved/.inodes paths before taking the lock.
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot remove ACL on " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      checkOwner(pc, src);
      dir.removeAcl(src);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "removeAcl", src, null, resultingStat);
  }
7666
7667  void setAcl(String src, List<AclEntry> aclSpec) throws IOException {
7668    aclConfigFlag.checkForApiCall();
7669    HdfsFileStatus resultingStat = null;
7670    FSPermissionChecker pc = getPermissionChecker();
7671    checkOperation(OperationCategory.WRITE);
7672    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
7673    writeLock();
7674    try {
7675      checkOperation(OperationCategory.WRITE);
7676      checkNameNodeSafeMode("Cannot set ACL on " + src);
7677      src = FSDirectory.resolvePath(src, pathComponents, dir);
7678      checkOwner(pc, src);
7679      dir.setAcl(src, aclSpec);
7680      resultingStat = getAuditFileInfo(src, false);
7681    } finally {
7682      writeUnlock();
7683    }
7684    getEditLog().logSync();
7685    logAuditEvent(true, "setAcl", src, null, resultingStat);
7686  }
7687
  /**
   * Get the ACL of {@code src}.
   *
   * @param src path to query
   * @return the ACL status of the path
   * @throws IOException if ACLs are disabled or the permission check fails
   */
  AclStatus getAclStatus(String src) throws IOException {
    aclConfigFlag.checkForApiCall();
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.READ);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      if (isPermissionEnabled) {
        checkPermission(pc, src, false, null, null, null, null);
      }
      return dir.getAclStatus(src);
    } finally {
      readUnlock();
    }
  }
7703
7704  /**
7705   * Default AuditLogger implementation; used when no access logger is
7706   * defined in the config file. It can also be explicitly listed in the
7707   * config file.
7708   */
7709  private static class DefaultAuditLogger extends HdfsAuditLogger {
7710
7711    private boolean logTokenTrackingId;
7712
7713    @Override
7714    public void initialize(Configuration conf) {
7715      logTokenTrackingId = conf.getBoolean(
7716          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
7717          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
7718    }
7719
7720    @Override
7721    public void logAuditEvent(boolean succeeded, String userName,
7722        InetAddress addr, String cmd, String src, String dst,
7723        FileStatus status, UserGroupInformation ugi,
7724        DelegationTokenSecretManager dtSecretManager) {
7725      if (auditLog.isInfoEnabled()) {
7726        final StringBuilder sb = auditBuffer.get();
7727        sb.setLength(0);
7728        sb.append("allowed=").append(succeeded).append("\t");
7729        sb.append("ugi=").append(userName).append("\t");
7730        sb.append("ip=").append(addr).append("\t");
7731        sb.append("cmd=").append(cmd).append("\t");
7732        sb.append("src=").append(src).append("\t");
7733        sb.append("dst=").append(dst).append("\t");
7734        if (null == status) {
7735          sb.append("perm=null");
7736        } else {
7737          sb.append("perm=");
7738          sb.append(status.getOwner()).append(":");
7739          sb.append(status.getGroup()).append(":");
7740          sb.append(status.getPermission());
7741        }
7742        if (logTokenTrackingId) {
7743          sb.append("\t").append("trackingId=");
7744          String trackingId = null;
7745          if (ugi != null && dtSecretManager != null
7746              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
7747            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
7748              if (tid instanceof DelegationTokenIdentifier) {
7749                DelegationTokenIdentifier dtid =
7750                    (DelegationTokenIdentifier)tid;
7751                trackingId = dtSecretManager.getTokenTrackingId(dtid);
7752                break;
7753              }
7754            }
7755          }
7756          sb.append(trackingId);
7757        }
7758        logAuditMessage(sb.toString());
7759      }
7760    }
7761
7762    public void logAuditMessage(String message) {
7763      auditLog.info(message);
7764    }
7765  }
7766
  /**
   * Wrap the audit logger's existing log4j appenders in a single
   * AsyncAppender so audit writes do not block the calling thread.
   * No-op unless the audit log is backed by log4j.
   */
  private static void enableAsyncAuditLog() {
    if (!(auditLog instanceof Log4JLogger)) {
      LOG.warn("Log4j is required to enable async auditlog");
      return;
    }
    Logger logger = ((Log4JLogger)auditLog).getLogger();
    @SuppressWarnings("unchecked")
    List<Appender> appenders = Collections.list(logger.getAllAppenders());
    // failsafe against trying to async it more than once
    if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
      AsyncAppender asyncAppender = new AsyncAppender();
      // change logger to have an async appender containing all the
      // previously configured appenders
      for (Appender appender : appenders) {
        logger.removeAppender(appender);
        asyncAppender.addAppender(appender);
      }
      logger.addAppender(asyncAppender);        
    }
  }
7787
7788}
7789