001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.namenode;
019
020 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
021 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
022 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
023 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
024 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT;
025 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
026 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_DEFAULT;
027 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY;
028 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_DEFAULT;
029 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY;
030 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT;
031 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_WRITE_PACKET_SIZE_KEY;
032 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT;
033 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY;
034 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
035 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
036 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT;
037 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY;
038 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOGGERS_KEY;
039 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT;
040 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY;
041 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT;
042 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY;
043 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME;
044 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT;
045 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY;
046 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
047 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
048 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT;
049 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
050 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
051 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
052 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
053 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
054 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS;
055 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT;
056 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD;
057 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT;
058 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT;
059 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
060 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
061 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
062 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
063 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
064 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
065 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
066 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT;
067 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY;
068 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT;
069 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY;
070 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT;
071 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY;
072 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY;
073 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT;
074 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
075 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
076 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
077 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
078 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
079 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
080 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
081 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
082 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
083 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
084 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_DEFAULT;
085 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SUPPORT_APPEND_KEY;
086 import static org.apache.hadoop.util.Time.now;
087
088 import java.io.BufferedWriter;
089 import java.io.ByteArrayInputStream;
090 import java.io.DataInput;
091 import java.io.DataInputStream;
092 import java.io.DataOutputStream;
093 import java.io.File;
094 import java.io.FileNotFoundException;
095 import java.io.FileOutputStream;
096 import java.io.IOException;
097 import java.io.OutputStreamWriter;
098 import java.io.PrintWriter;
099 import java.io.StringWriter;
100 import java.lang.management.ManagementFactory;
101 import java.net.InetAddress;
102 import java.net.URI;
103 import java.util.ArrayList;
104 import java.util.Arrays;
105 import java.util.Collection;
106 import java.util.Collections;
107 import java.util.Date;
108 import java.util.EnumSet;
109 import java.util.HashMap;
110 import java.util.HashSet;
111 import java.util.Iterator;
112 import java.util.LinkedHashSet;
113 import java.util.List;
114 import java.util.Map;
115 import java.util.Set;
116 import java.util.concurrent.TimeUnit;
117 import java.util.concurrent.locks.ReentrantReadWriteLock;
118
119 import javax.management.NotCompliantMBeanException;
120 import javax.management.ObjectName;
121 import javax.management.StandardMBean;
122
123 import org.apache.commons.logging.Log;
124 import org.apache.commons.logging.LogFactory;
125 import org.apache.hadoop.HadoopIllegalArgumentException;
126 import org.apache.hadoop.classification.InterfaceAudience;
127 import org.apache.hadoop.conf.Configuration;
128 import org.apache.hadoop.fs.ContentSummary;
129 import org.apache.hadoop.fs.CreateFlag;
130 import org.apache.hadoop.fs.DirectoryListingStartAfterNotFoundException;
131 import org.apache.hadoop.fs.FileAlreadyExistsException;
132 import org.apache.hadoop.fs.FileStatus;
133 import org.apache.hadoop.fs.FileSystem;
134 import org.apache.hadoop.fs.FsServerDefaults;
135 import org.apache.hadoop.fs.InvalidPathException;
136 import org.apache.hadoop.fs.Options;
137 import org.apache.hadoop.fs.Options.Rename;
138 import org.apache.hadoop.fs.ParentNotDirectoryException;
139 import org.apache.hadoop.fs.Path;
140 import org.apache.hadoop.fs.UnresolvedLinkException;
141 import org.apache.hadoop.fs.permission.FsAction;
142 import org.apache.hadoop.fs.permission.FsPermission;
143 import org.apache.hadoop.fs.permission.PermissionStatus;
144 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
145 import org.apache.hadoop.ha.ServiceFailedException;
146 import org.apache.hadoop.hdfs.DFSConfigKeys;
147 import org.apache.hadoop.hdfs.DFSUtil;
148 import org.apache.hadoop.hdfs.HAUtil;
149 import org.apache.hadoop.hdfs.HdfsConfiguration;
150 import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
151 import org.apache.hadoop.hdfs.protocol.Block;
152 import org.apache.hadoop.hdfs.protocol.ClientProtocol;
153 import org.apache.hadoop.hdfs.protocol.DatanodeID;
154 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
155 import org.apache.hadoop.hdfs.protocol.DirectoryListing;
156 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
157 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
158 import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
159 import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
160 import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
161 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
162 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
163 import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
164 import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
165 import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
166 import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffReportEntry;
167 import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
168 import org.apache.hadoop.hdfs.protocol.datatransfer.ReplaceDatanodeOnFailure;
169 import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
170 import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
171 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
172 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
173 import org.apache.hadoop.hdfs.server.blockmanagement.*;
174 import org.apache.hadoop.hdfs.server.common.GenerationStamp;
175 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
176 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
177 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
178 import org.apache.hadoop.hdfs.server.common.Storage;
179 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
180 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
181 import org.apache.hadoop.hdfs.server.common.Util;
182 import org.apache.hadoop.hdfs.server.namenode.INode.BlocksMapUpdateInfo;
183 import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
184 import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
185 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
186 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
187 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
188 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Status;
189 import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
190 import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
191 import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
192 import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
193 import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
194 import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
195 import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
196 import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
197 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
198 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable.SnapshotDiffInfo;
199 import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeFileWithSnapshot;
200 import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
201 import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager;
202 import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
203 import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
204 import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
205 import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
206 import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
207 import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
208 import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
209 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
210 import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
211 import org.apache.hadoop.io.IOUtils;
212 import org.apache.hadoop.io.Text;
213 import org.apache.hadoop.ipc.RetryCache;
214 import org.apache.hadoop.ipc.RetryCache.CacheEntry;
215 import org.apache.hadoop.ipc.RetryCache.CacheEntryWithPayload;
216 import org.apache.hadoop.ipc.RetriableException;
217 import org.apache.hadoop.ipc.Server;
218 import org.apache.hadoop.ipc.StandbyException;
219 import org.apache.hadoop.metrics2.annotation.Metric;
220 import org.apache.hadoop.metrics2.annotation.Metrics;
221 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
222 import org.apache.hadoop.metrics2.util.MBeans;
223 import org.apache.hadoop.net.NetworkTopology;
224 import org.apache.hadoop.net.Node;
225 import org.apache.hadoop.security.AccessControlException;
226 import org.apache.hadoop.security.UserGroupInformation;
227 import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
228 import org.apache.hadoop.security.token.SecretManager.InvalidToken;
229 import org.apache.hadoop.security.token.Token;
230 import org.apache.hadoop.security.token.TokenIdentifier;
231 import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier;
232 import org.apache.hadoop.security.token.delegation.DelegationKey;
233 import org.apache.hadoop.util.Daemon;
234 import org.apache.hadoop.util.DataChecksum;
235 import org.apache.hadoop.util.Time;
236 import org.apache.hadoop.util.VersionInfo;
237 import org.mortbay.util.ajax.JSON;
238
239 import com.google.common.annotations.VisibleForTesting;
240 import com.google.common.base.Charsets;
241 import com.google.common.base.Preconditions;
242 import com.google.common.collect.Lists;
243
244 /***************************************************
 * FSNamesystem does the actual bookkeeping work for the
 * NameNode, tracking the name space and the block state reported by DataNodes.
247 *
248 * It tracks several important tables.
249 *
250 * 1) valid fsname --> blocklist (kept on disk, logged)
251 * 2) Set of all valid blocks (inverted #1)
252 * 3) block --> machinelist (kept in memory, rebuilt dynamically from reports)
253 * 4) machine --> blocklist (inverted #2)
254 * 5) LRU cache of updated-heartbeat machines
255 ***************************************************/
256 @InterfaceAudience.Private
257 @Metrics(context="dfs")
258 public class FSNamesystem implements Namesystem, FSClusterStats,
259 FSNamesystemMBean, NameNodeMXBean {
  public static final Log LOG = LogFactory.getLog(FSNamesystem.class);

  // Per-thread StringBuilder, presumably reused when formatting audit log
  // records to avoid an allocation per audited operation. No use is visible
  // in this part of the file -- confirm against the audit logging code below.
  private static final ThreadLocal<StringBuilder> auditBuffer =
    new ThreadLocal<StringBuilder>() {
      @Override
      protected StringBuilder initialValue() {
        return new StringBuilder();
      }
    };
269
270 @VisibleForTesting
271 public boolean isAuditEnabled() {
272 return !isDefaultAuditLogger || auditLog.isInfoEnabled();
273 }
274
275 private HdfsFileStatus getAuditFileInfo(String path, boolean resolveSymlink)
276 throws IOException {
277 return (isAuditEnabled() && isExternalInvocation())
278 ? dir.getFileInfo(path, resolveSymlink) : null;
279 }
280
  /** Convenience overload: audit an event that has no destination path and
   *  no file status to report. */
  private void logAuditEvent(boolean succeeded, String cmd, String src)
      throws IOException {
    logAuditEvent(succeeded, cmd, src, null, null);
  }
285
286 private void logAuditEvent(boolean succeeded, String cmd, String src,
287 String dst, HdfsFileStatus stat) throws IOException {
288 if (isAuditEnabled() && isExternalInvocation()) {
289 logAuditEvent(succeeded, getRemoteUser(), getRemoteIp(),
290 cmd, src, dst, stat);
291 }
292 }
293
294 private void logAuditEvent(boolean succeeded,
295 UserGroupInformation ugi, InetAddress addr, String cmd, String src,
296 String dst, HdfsFileStatus stat) {
297 FileStatus status = null;
298 if (stat != null) {
299 Path symlink = stat.isSymlink() ? new Path(stat.getSymlink()) : null;
300 Path path = dst != null ? new Path(dst) : new Path(src);
301 status = new FileStatus(stat.getLen(), stat.isDir(),
302 stat.getReplication(), stat.getBlockSize(), stat.getModificationTime(),
303 stat.getAccessTime(), stat.getPermission(), stat.getOwner(),
304 stat.getGroup(), symlink, path);
305 }
306 for (AuditLogger logger : auditLoggers) {
307 if (logger instanceof HdfsAuditLogger) {
308 HdfsAuditLogger hdfsLogger = (HdfsAuditLogger) logger;
309 hdfsLogger.logAuditEvent(succeeded, ugi.toString(), addr, cmd, src, dst,
310 status, ugi, dtSecretManager);
311 } else {
312 logger.logAuditEvent(succeeded, ugi.toString(), addr,
313 cmd, src, dst, status);
314 }
315 }
316 }
317
  /**
   * Logger for audit events, noting successful FSNamesystem operations. Emits
   * to FSNamesystem.audit at INFO. Each event causes a set of tab-separated
   * <code>key=value</code> pairs to be written for the following properties:
   * <code>
   * ugi=&lt;ugi in RPC&gt;
   * ip=&lt;remote IP&gt;
   * cmd=&lt;command&gt;
   * src=&lt;src path&gt;
   * dst=&lt;dst path (optional)&gt;
   * perm=&lt;permissions (optional)&gt;
   * </code>
   */
  public static final Log auditLog = LogFactory.getLog(
      FSNamesystem.class.getName() + ".audit");

  // Cap on entries returned per corrupt-file-blocks query; presumably used by
  // listCorruptFileBlocks -- confirm at the call site (not in this chunk).
  static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
  // NOTE(review): deliberately non-final and package-private, presumably so
  // tests can tune the per-iteration deletion batch size -- confirm.
  static int BLOCK_DELETION_INCREMENT = 1000;
  // Whether permission checking is enforced (dfs.permissions.enabled).
  private final boolean isPermissionEnabled;
  // UGI of the user that instantiated this namesystem (see the constructor,
  // which reads UserGroupInformation.getCurrentUser()).
  private final UserGroupInformation fsOwner;
  // Cached short name of fsOwner.
  private final String fsOwnerShortUserName;
  // Name of the HDFS superuser group (dfs.permissions.superusergroup).
  private final String supergroup;
  // Whether a standby NN should take periodic checkpoints
  // (dfs.ha.standby.checkpoints).
  private final boolean standbyShouldCheckpoint;

  // Scan interval is not configurable.
  private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
    TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
  // Issues/verifies HDFS delegation tokens; created via
  // createDelegationTokenSecretManager(conf) in the constructor.
  final DelegationTokenSecretManager dtSecretManager;
  // Test-only override: start the DT secret manager even without security
  // (dfs.namenode.delegation.token.always-use).
  private final boolean alwaysUseDelegationTokensForTests;

  // Startup-progress step, presumably reported while waiting for block
  // reports during safe mode -- confirm at the use site (not in this chunk).
  private static final Step STEP_AWAITING_REPORTED_BLOCKS =
    new Step(StepType.AWAITING_REPORTED_BLOCKS);

  // Tracks whether the default audit logger is the only configured audit
  // logger; this allows isAuditEnabled() to return false in case the
  // underlying logger is disabled, and avoid some unnecessary work.
  private final boolean isDefaultAuditLogger;
  // All configured audit loggers; events fan out to each of them.
  private final List<AuditLogger> auditLoggers;
356
  /** The namespace tree. */
  FSDirectory dir;
  /** Block state management; constructed alongside this namesystem. */
  private final BlockManager blockManager;
  /** Snapshot bookkeeping, built on top of {@link #dir}. */
  private final SnapshotManager snapshotManager;
  /** Aggregate datanode statistics, obtained from the block manager's
   *  DatanodeManager (see the constructor). */
  private final DatanodeStatistics datanodeStatistics;

  // Block pool ID used by this namenode
  private String blockPoolId;

  /** Tracks leases on files open for write. */
  final LeaseManager leaseManager = new LeaseManager(this);

  volatile Daemon smmthread = null;  // SafeModeMonitor thread

  Daemon nnrmthread = null; // NamenodeResourceMonitor thread

  Daemon nnEditLogRoller = null; // NameNodeEditLogRoller thread
  /**
   * When an active namenode will roll its own edit log, in # edits
   */
  private final long editLogRollerThreshold;
  /**
   * Check interval of an active namenode's edit log roller thread
   */
  private final int editLogRollerInterval;

  // Set by the resource monitor; false means disk space for the NN's
  // storage directories is running out.
  private volatile boolean hasResourcesAvailable = false;
  // Cleared on shutdown so daemon threads can exit.
  private volatile boolean fsRunning = true;

  /** The start time of the namesystem. */
  private final long startTime = now();

  /** The interval of namenode checking for the disk space availability */
  private final long resourceRecheckInterval;

  // The actual resource checker instance.
  NameNodeResourceChecker nnResourceChecker;

  /** Defaults (block size, replication, checksum type, ...) advertised to
   *  clients; assembled from configuration in the constructor. */
  private final FsServerDefaults serverDefaults;
  /** Whether append is enabled (dfs.support.append). */
  private final boolean supportAppends;
  /** Client-side datanode replacement policy on write-pipeline failure. */
  private final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;

  private volatile SafeModeInfo safeMode; // safe mode information

  private final long maxFsObjects; // maximum number of fs objects

  private final long minBlockSize; // minimum block size
  private final long maxBlocksPerFile; // maximum # of blocks per file
404
  /**
   * The global generation stamp for legacy blocks with randomly
   * generated block IDs.
   */
  private final GenerationStamp generationStampV1 = new GenerationStamp();

  /**
   * The global generation stamp for this file system.
   */
  private final GenerationStamp generationStampV2 = new GenerationStamp();

  /**
   * The value of the generation stamp when the first switch to sequential
   * block IDs was made. Blocks with generation stamps below this value
   * have randomly allocated block IDs. Blocks with generation stamps above
   * this value had sequentially allocated block IDs. Read from the fsImage
   * (or initialized as an offset from the V1 (legacy) generation stamp on
   * upgrade).
   */
  private long generationStampV1Limit =
    GenerationStamp.GRANDFATHER_GENERATION_STAMP;

  /**
   * The global block ID space for this file system.
   */
  // NOTE(review): @VisibleForTesting on a private field is unusual; tests
  // presumably reach it reflectively or via an accessor -- confirm.
  @VisibleForTesting
  private final SequentialBlockIdGenerator blockIdGenerator;

  // precision of access times.
  private final long accessTimePrecision;

  /** Lock to protect FSNamesystem. Constructed in fair mode. */
  private ReentrantReadWriteLock fsLock = new ReentrantReadWriteLock(true);

  /**
   * Used when this NN is in standby state to read from the shared edit log.
   */
  private EditLogTailer editLogTailer = null;

  /**
   * Used when this NN is in standby state to perform checkpoints.
   */
  private StandbyCheckpointer standbyCheckpointer;

  /**
   * Reference to the NN's HAContext object. This is only set once
   * {@link #startCommonServices(Configuration, HAContext)} is called.
   */
  private HAContext haContext;

  /** True when HA is configured for this nameservice (see the constructor). */
  private final boolean haEnabled;

  /**
   * Whether the namenode is in the middle of starting the active service
   */
  private volatile boolean startingActiveService = false;

  /** Monotonic inode-id generator; see {@link #allocateNewInodeId()}. */
  private INodeId inodeId;

  /** Cache of recent non-idempotent RPC results; null when the constructor
   *  was told to ignore retry-cache setup (e.g. for the Secondary NN). */
  private final RetryCache retryCache;
465
  /**
   * Set the last allocated inode id when fsimage or editlog is loaded.
   *
   * @param newValue value to advance the inode-id generator to
   * @throws IOException if the generator rejects the value (its
   *         IllegalStateException is wrapped so loading-path callers deal
   *         with a single exception type)
   */
  public void resetLastInodeId(long newValue) throws IOException {
    try {
      inodeId.skipTo(newValue);
    } catch(IllegalStateException ise) {
      throw new IOException(ise);
    }
  }
476
  /** Should only be used for tests to reset to any value; bypasses the
   *  monotonicity check that {@link #resetLastInodeId(long)} performs. */
  void resetLastInodeIdWithoutChecking(long newValue) {
    inodeId.setCurrentValue(newValue);
  }
481
  /** @return the last inode ID handed out by this namesystem. */
  public long getLastInodeId() {
    return inodeId.getCurrentValue();
  }
486
  /** Allocate a new inode ID (advances the sequential generator). */
  public long allocateNewInodeId() {
    return inodeId.nextValue();
  }
491
  /**
   * Clear all loaded data, returning every generator and manager to its
   * initial state so a fresh image/edit log can be loaded. The reset order
   * is preserved as-is; do not reorder without checking the loading path.
   */
  void clear() {
    dir.reset();
    dtSecretManager.reset();
    generationStampV1.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
    generationStampV2.setCurrentValue(GenerationStamp.LAST_RESERVED_STAMP);
    blockIdGenerator.setCurrentValue(
        SequentialBlockIdGenerator.LAST_RESERVED_BLOCK_ID);
    generationStampV1Limit = GenerationStamp.GRANDFATHER_GENERATION_STAMP;
    leaseManager.removeAllLeases();
    inodeId.setCurrentValue(INodeId.LAST_RESERVED_ID);
    snapshotManager.clearSnapshottableDirs();
  }
507
  /** Exposed for tests only. @return this namesystem's lease manager */
  @VisibleForTesting
  LeaseManager getLeaseManager() {
    return leaseManager;
  }
512
513 /**
514 * Check the supplied configuration for correctness.
515 * @param conf Supplies the configuration to validate.
516 * @throws IOException if the configuration could not be queried.
517 * @throws IllegalArgumentException if the configuration is invalid.
518 */
519 private static void checkConfiguration(Configuration conf)
520 throws IOException {
521
522 final Collection<URI> namespaceDirs =
523 FSNamesystem.getNamespaceDirs(conf);
524 final Collection<URI> editsDirs =
525 FSNamesystem.getNamespaceEditsDirs(conf);
526 final Collection<URI> requiredEditsDirs =
527 FSNamesystem.getRequiredNamespaceEditsDirs(conf);
528 final Collection<URI> sharedEditsDirs =
529 FSNamesystem.getSharedEditsDirs(conf);
530
531 for (URI u : requiredEditsDirs) {
532 if (u.toString().compareTo(
533 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT) == 0) {
534 continue;
535 }
536
537 // Each required directory must also be in editsDirs or in
538 // sharedEditsDirs.
539 if (!editsDirs.contains(u) &&
540 !sharedEditsDirs.contains(u)) {
541 throw new IllegalArgumentException(
542 "Required edits directory " + u.toString() + " not present in " +
543 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + ". " +
544 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY + "=" +
545 editsDirs.toString() + "; " +
546 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY + "=" +
547 requiredEditsDirs.toString() + ". " +
548 DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY + "=" +
549 sharedEditsDirs.toString() + ".");
550 }
551 }
552
553 if (namespaceDirs.size() == 1) {
554 LOG.warn("Only one image storage directory ("
555 + DFS_NAMENODE_NAME_DIR_KEY + ") configured. Beware of dataloss"
556 + " due to lack of redundant storage directories!");
557 }
558 if (editsDirs.size() == 1) {
559 LOG.warn("Only one namespace edits storage directory ("
560 + DFS_NAMENODE_EDITS_DIR_KEY + ") configured. Beware of dataloss"
561 + " due to lack of redundant storage directories!");
562 }
563 }
564
565 /**
566 * Instantiates an FSNamesystem loaded from the image and edits
567 * directories specified in the passed Configuration.
568 *
569 * @param conf the Configuration which specifies the storage directories
570 * from which to load
571 * @return an FSNamesystem which contains the loaded namespace
572 * @throws IOException if loading fails
573 */
574 public static FSNamesystem loadFromDisk(Configuration conf)
575 throws IOException {
576
577 checkConfiguration(conf);
578 FSImage fsImage = new FSImage(conf,
579 FSNamesystem.getNamespaceDirs(conf),
580 FSNamesystem.getNamespaceEditsDirs(conf));
581 FSNamesystem namesystem = new FSNamesystem(conf, fsImage, false);
582 StartupOption startOpt = NameNode.getStartupOption(conf);
583 if (startOpt == StartupOption.RECOVER) {
584 namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
585 }
586
587 long loadStart = now();
588 String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
589 namesystem.loadFSImage(startOpt, fsImage,
590 HAUtil.isHAEnabled(conf, nameserviceId));
591 long timeTakenToLoadFSImage = now() - loadStart;
592 LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
593 NameNodeMetrics nnMetrics = NameNode.getNameNodeMetrics();
594 if (nnMetrics != null) {
595 nnMetrics.setFsImageLoadTime((int) timeTakenToLoadFSImage);
596 }
597 return namesystem;
598 }
599
  /** Convenience constructor: builds the namesystem with retry-cache setup
   *  enabled (delegates to the three-argument constructor). */
  FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
    this(conf, fsImage, false);
  }
603
604 /**
605 * Create an FSNamesystem associated with the specified image.
606 *
607 * Note that this does not load any data off of disk -- if you would
608 * like that behavior, use {@link #loadFromDisk(Configuration)}
609 *
610 * @param conf configuration
611 * @param fsImage The FSImage to associate with
612 * @param ignoreRetryCache Whether or not should ignore the retry cache setup
613 * step. For Secondary NN this should be set to true.
614 * @throws IOException on bad configuration
615 */
616 FSNamesystem(Configuration conf, FSImage fsImage, boolean ignoreRetryCache)
617 throws IOException {
618 try {
619 resourceRecheckInterval = conf.getLong(
620 DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY,
621 DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT);
622
623 this.blockManager = new BlockManager(this, this, conf);
624 this.datanodeStatistics = blockManager.getDatanodeManager().getDatanodeStatistics();
625 this.blockIdGenerator = new SequentialBlockIdGenerator(this.blockManager);
626
627 this.fsOwner = UserGroupInformation.getCurrentUser();
628 this.fsOwnerShortUserName = fsOwner.getShortUserName();
629 this.supergroup = conf.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY,
630 DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
631 this.isPermissionEnabled = conf.getBoolean(DFS_PERMISSIONS_ENABLED_KEY,
632 DFS_PERMISSIONS_ENABLED_DEFAULT);
633 LOG.info("fsOwner = " + fsOwner);
634 LOG.info("supergroup = " + supergroup);
635 LOG.info("isPermissionEnabled = " + isPermissionEnabled);
636
637 // block allocation has to be persisted in HA using a shared edits directory
638 // so that the standby has up-to-date namespace information
639 String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
640 this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);
641
642 // Sanity check the HA-related config.
643 if (nameserviceId != null) {
644 LOG.info("Determined nameservice ID: " + nameserviceId);
645 }
646 LOG.info("HA Enabled: " + haEnabled);
647 if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
648 LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
649 throw new IOException("Invalid configuration: a shared edits dir " +
650 "must not be specified if HA is not enabled.");
651 }
652
653 // Get the checksum type from config
654 String checksumTypeStr = conf.get(DFS_CHECKSUM_TYPE_KEY, DFS_CHECKSUM_TYPE_DEFAULT);
655 DataChecksum.Type checksumType;
656 try {
657 checksumType = DataChecksum.Type.valueOf(checksumTypeStr);
658 } catch (IllegalArgumentException iae) {
659 throw new IOException("Invalid checksum type in "
660 + DFS_CHECKSUM_TYPE_KEY + ": " + checksumTypeStr);
661 }
662
663 this.serverDefaults = new FsServerDefaults(
664 conf.getLongBytes(DFS_BLOCK_SIZE_KEY, DFS_BLOCK_SIZE_DEFAULT),
665 conf.getInt(DFS_BYTES_PER_CHECKSUM_KEY, DFS_BYTES_PER_CHECKSUM_DEFAULT),
666 conf.getInt(DFS_CLIENT_WRITE_PACKET_SIZE_KEY, DFS_CLIENT_WRITE_PACKET_SIZE_DEFAULT),
667 (short) conf.getInt(DFS_REPLICATION_KEY, DFS_REPLICATION_DEFAULT),
668 conf.getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT),
669 conf.getBoolean(DFS_ENCRYPT_DATA_TRANSFER_KEY, DFS_ENCRYPT_DATA_TRANSFER_DEFAULT),
670 conf.getLong(FS_TRASH_INTERVAL_KEY, FS_TRASH_INTERVAL_DEFAULT),
671 checksumType);
672
673 this.maxFsObjects = conf.getLong(DFS_NAMENODE_MAX_OBJECTS_KEY,
674 DFS_NAMENODE_MAX_OBJECTS_DEFAULT);
675
676 this.minBlockSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,
677 DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_DEFAULT);
678 this.maxBlocksPerFile = conf.getLong(DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY,
679 DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT);
680 this.accessTimePrecision = conf.getLong(DFS_NAMENODE_ACCESSTIME_PRECISION_KEY,
681 DFS_NAMENODE_ACCESSTIME_PRECISION_DEFAULT);
682 this.supportAppends = conf.getBoolean(DFS_SUPPORT_APPEND_KEY, DFS_SUPPORT_APPEND_DEFAULT);
683 LOG.info("Append Enabled: " + supportAppends);
684
685 this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
686
687 this.standbyShouldCheckpoint = conf.getBoolean(
688 DFS_HA_STANDBY_CHECKPOINTS_KEY, DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
689 // # edit autoroll threshold is a multiple of the checkpoint threshold
690 this.editLogRollerThreshold = (long)
691 (conf.getFloat(
692 DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD,
693 DFS_NAMENODE_EDIT_LOG_AUTOROLL_MULTIPLIER_THRESHOLD_DEFAULT) *
694 conf.getLong(
695 DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
696 DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT));
697 this.editLogRollerInterval = conf.getInt(
698 DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS,
699 DFS_NAMENODE_EDIT_LOG_AUTOROLL_CHECK_INTERVAL_MS_DEFAULT);
700 this.inodeId = new INodeId();
701
702 // For testing purposes, allow the DT secret manager to be started regardless
703 // of whether security is enabled.
704 alwaysUseDelegationTokensForTests = conf.getBoolean(
705 DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
706 DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
707
708 this.dtSecretManager = createDelegationTokenSecretManager(conf);
709 this.dir = new FSDirectory(fsImage, this, conf);
710 this.snapshotManager = new SnapshotManager(dir);
711 this.safeMode = new SafeModeInfo(conf);
712 this.auditLoggers = initAuditLoggers(conf);
713 this.isDefaultAuditLogger = auditLoggers.size() == 1 &&
714 auditLoggers.get(0) instanceof DefaultAuditLogger;
715 this.retryCache = ignoreRetryCache ? null : initRetryCache(conf);
716 } catch(IOException e) {
717 LOG.error(getClass().getSimpleName() + " initialization failed.", e);
718 close();
719 throw e;
720 } catch (RuntimeException re) {
721 LOG.error(getClass().getSimpleName() + " initialization failed.", re);
722 close();
723 throw re;
724 }
725 }
726
727 @VisibleForTesting
728 public RetryCache getRetryCache() {
729 return retryCache;
730 }
731
732 /** Whether or not retry cache is enabled */
733 boolean hasRetryCache() {
734 return retryCache != null;
735 }
736
737 void addCacheEntryWithPayload(byte[] clientId, int callId, Object payload) {
738 if (retryCache != null) {
739 retryCache.addCacheEntryWithPayload(clientId, callId, payload);
740 }
741 }
742
743 void addCacheEntry(byte[] clientId, int callId) {
744 if (retryCache != null) {
745 retryCache.addCacheEntry(clientId, callId);
746 }
747 }
748
749 @VisibleForTesting
750 static RetryCache initRetryCache(Configuration conf) {
751 boolean enable = conf.getBoolean(DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY,
752 DFS_NAMENODE_ENABLE_RETRY_CACHE_DEFAULT);
753 LOG.info("Retry cache on namenode is " + (enable ? "enabled" : "disabled"));
754 if (enable) {
755 float heapPercent = conf.getFloat(
756 DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_KEY,
757 DFS_NAMENODE_RETRY_CACHE_HEAP_PERCENT_DEFAULT);
758 long entryExpiryMillis = conf.getLong(
759 DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_KEY,
760 DFS_NAMENODE_RETRY_CACHE_EXPIRYTIME_MILLIS_DEFAULT);
761 LOG.info("Retry cache will use " + heapPercent
762 + " of total heap and retry cache entry expiry time is "
763 + entryExpiryMillis + " millis");
764 long entryExpiryNanos = entryExpiryMillis * 1000 * 1000;
765 return new RetryCache("Namenode Retry Cache", heapPercent,
766 entryExpiryNanos);
767 }
768 return null;
769 }
770
771 private List<AuditLogger> initAuditLoggers(Configuration conf) {
772 // Initialize the custom access loggers if configured.
773 Collection<String> alClasses = conf.getStringCollection(DFS_NAMENODE_AUDIT_LOGGERS_KEY);
774 List<AuditLogger> auditLoggers = Lists.newArrayList();
775 if (alClasses != null && !alClasses.isEmpty()) {
776 for (String className : alClasses) {
777 try {
778 AuditLogger logger;
779 if (DFS_NAMENODE_DEFAULT_AUDIT_LOGGER_NAME.equals(className)) {
780 logger = new DefaultAuditLogger();
781 } else {
782 logger = (AuditLogger) Class.forName(className).newInstance();
783 }
784 logger.initialize(conf);
785 auditLoggers.add(logger);
786 } catch (RuntimeException re) {
787 throw re;
788 } catch (Exception e) {
789 throw new RuntimeException(e);
790 }
791 }
792 }
793
794 // Make sure there is at least one logger installed.
795 if (auditLoggers.isEmpty()) {
796 auditLoggers.add(new DefaultAuditLogger());
797 }
798 return Collections.unmodifiableList(auditLoggers);
799 }
800
/**
 * Load the filesystem image, optionally saving a fresh checkpoint and
 * opening the edit log for write.
 *
 * @param startOpt startup mode; FORMAT formats first, then proceeds as
 *                 a REGULAR start
 * @param fsImage the image to load/recover from
 * @param haEnabled if true (standby startup), neither saveNamespace nor
 *                  openEditLogForWrite is performed
 * @throws IOException on image/edits load, save, or open failure
 */
void loadFSImage(StartupOption startOpt, FSImage fsImage, boolean haEnabled)
    throws IOException {
  // format before starting up if requested
  if (startOpt == StartupOption.FORMAT) {

    fsImage.format(this, fsImage.getStorage().determineClusterId());// reuse current id

    startOpt = StartupOption.REGULAR;
  }
  boolean success = false;
  writeLock();
  try {
    // We shouldn't be calling saveNamespace if we've come up in standby state.
    MetaRecoveryContext recovery = startOpt.createRecoveryContext();
    boolean needToSave =
      fsImage.recoverTransitionRead(startOpt, this, recovery) && !haEnabled;
    if (needToSave) {
      fsImage.saveNamespace(this);
    } else {
      // No need to save, so mark the phase done.
      StartupProgress prog = NameNode.getStartupProgress();
      prog.beginPhase(Phase.SAVING_CHECKPOINT);
      prog.endPhase(Phase.SAVING_CHECKPOINT);
    }
    // This will start a new log segment and write to the seen_txid file, so
    // we shouldn't do it when coming up in standby state
    if (!haEnabled) {
      fsImage.openEditLogForWrite();
    }
    success = true;
  } finally {
    // On failure, close the image so its storage directories are unlocked.
    if (!success) {
      fsImage.close();
    }
    writeUnlock();
  }
  dir.imageLoadComplete();
}
839
840 private void startSecretManager() {
841 if (dtSecretManager != null) {
842 try {
843 dtSecretManager.startThreads();
844 } catch (IOException e) {
845 // Inability to start secret manager
846 // can't be recovered from.
847 throw new RuntimeException(e);
848 }
849 }
850 }
851
852 private void startSecretManagerIfNecessary() {
853 boolean shouldRun = shouldUseDelegationTokens() &&
854 !isInSafeMode() && getEditLog().isOpenForWrite();
855 boolean running = dtSecretManager.isRunning();
856 if (shouldRun && !running) {
857 startSecretManager();
858 }
859 }
860
861 private void stopSecretManager() {
862 if (dtSecretManager != null) {
863 dtSecretManager.stopThreads();
864 }
865 }
866
/**
 * Start services common to both active and standby states: registers the
 * FSNamesystemState MBean and metrics, starts the resource checker, and
 * begins the SAFEMODE startup phase while holding the write lock.
 * @param haContext HA context consulted later for operation-category checks
 * @throws IOException if the resource checker cannot be created
 */
void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
  this.registerMBean(); // register the MBean for the FSNamesystemState
  writeLock();
  this.haContext = haContext;
  try {
    nnResourceChecker = new NameNodeResourceChecker(conf);
    checkAvailableResources();
    // Replication queues must not be populated until safe mode is left.
    assert safeMode != null &&
      !safeMode.isPopulatingReplQueues();
    StartupProgress prog = NameNode.getStartupProgress();
    prog.beginPhase(Phase.SAFEMODE);
    prog.setTotal(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS,
      getCompleteBlocksTotal());
    setBlockTotal();
    blockManager.activate(conf);
  } finally {
    writeUnlock();
  }

  // MXBean/metrics registration is safe outside the lock.
  registerMXBean();
  DefaultMetricsSystem.instance().register(this);
}
894
895 /**
896 * Stop services common to both active and standby states
897 * @throws IOException
898 */
899 void stopCommonServices() {
900 writeLock();
901 try {
902 if (blockManager != null) blockManager.close();
903 } finally {
904 writeUnlock();
905 }
906 RetryCache.clear(retryCache);
907 }
908
/**
 * Start services required in active state: take over the edit log writer
 * role (catching up to the old active's edits first), renew leases, and
 * launch the lease, resource, and edit-log-roll monitor daemons.
 * @throws IOException if the edit log cannot be recovered or opened for write
 */
void startActiveServices() throws IOException {
  startingActiveService = true;
  LOG.info("Starting services required for active state");
  writeLock();
  try {
    FSEditLog editLog = dir.fsImage.getEditLog();

    if (!editLog.isOpenForWrite()) {
      // During startup, we're already open for write during initialization.
      editLog.initJournalsForWrite();
      // May need to recover
      editLog.recoverUnclosedStreams();

      LOG.info("Catching up to latest edits from old active before " +
        "taking over writer role in edits logs");
      editLogTailer.catchupDuringFailover();

      // While in standby, blocks "from the future" were postponed; now that
      // our namespace is caught up, mark all DNs stale and reprocess all
      // queued datanode messages from scratch.
      blockManager.setPostponeBlocksFromFuture(false);
      blockManager.getDatanodeManager().markAllDatanodesStale();
      blockManager.clearQueues();
      blockManager.processAllPendingDNMessages();

      if (!isInSafeMode() ||
        (isInSafeMode() && safeMode.isPopulatingReplQueues())) {
        LOG.info("Reprocessing replication and invalidation queues");
        blockManager.processMisReplicatedBlocks();
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug("NameNode metadata after re-processing " +
          "replication and invalidation queues during failover:\n" +
          metaSaveAsString());
      }

      // Resume writing right after the last transaction we applied.
      long nextTxId = dir.fsImage.getLastAppliedTxId() + 1;
      LOG.info("Will take over writing edit logs at txnid " +
        nextTxId);
      editLog.setNextTxId(nextTxId);

      dir.fsImage.editLog.openForWrite();
    }
    if (haEnabled) {
      // Renew all of the leases before becoming active.
      // This is because, while we were in standby mode,
      // the leases weren't getting renewed on this NN.
      // Give them all a fresh start here.
      leaseManager.renewAllLeases();
    }
    leaseManager.startMonitor();
    startSecretManagerIfNecessary();

    //ResourceMonitor required only at ActiveNN. See HDFS-2914
    this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
    nnrmthread.start();

    nnEditLogRoller = new Daemon(new NameNodeEditLogRoller(
      editLogRollerThreshold, editLogRollerInterval));
    nnEditLogRoller.start();

  } finally {
    writeUnlock();
    startingActiveService = false;
  }
}
977
978 /**
979 * @return Whether the namenode is transitioning to active state and is in the
980 * middle of the {@link #startActiveServices()}
981 */
982 public boolean inTransitionToActive() {
983 return haEnabled && haContext != null
984 && haContext.getState().getServiceState() == HAServiceState.ACTIVE
985 && startingActiveService;
986 }
987
988 private boolean shouldUseDelegationTokens() {
989 return UserGroupInformation.isSecurityEnabled() ||
990 alwaysUseDelegationTokensForTests;
991 }
992
/**
 * Stop services required in active state: the secret manager, the lease
 * monitor, the resource monitor, the edit-log roller, and the active edit
 * log itself. All teardown happens under the write lock.
 */
void stopActiveServices() {
  LOG.info("Stopping services started for active state");
  writeLock();
  try {
    stopSecretManager();
    if (leaseManager != null) {
      leaseManager.stopMonitor();
    }
    if (nnrmthread != null) {
      ((NameNodeResourceMonitor) nnrmthread.getRunnable()).stopMonitor();
      nnrmthread.interrupt();
    }
    if (nnEditLogRoller != null) {
      ((NameNodeEditLogRoller)nnEditLogRoller.getRunnable()).stop();
      nnEditLogRoller.interrupt();
    }
    if (dir != null && dir.fsImage != null) {
      if (dir.fsImage.editLog != null) {
        dir.fsImage.editLog.close();
      }
      // Update the fsimage with the last txid that we wrote
      // so that the tailer starts from the right spot.
      dir.fsImage.updateLastAppliedTxIdFromWritten();
    }
  } finally {
    writeUnlock();
  }
}
1025
/**
 * Start services required in standby state: open the shared journals for
 * read, postpone processing of blocks "from the future", and launch the
 * edit log tailer (and the standby checkpointer, if enabled).
 *
 * @throws IOException if the shared journals cannot be opened for read
 */
void startStandbyServices(final Configuration conf) throws IOException {
  LOG.info("Starting services required for standby state");
  if (!dir.fsImage.editLog.isOpenForRead()) {
    // During startup, we're already open for read.
    dir.fsImage.editLog.initSharedJournalsForRead();
  }

  // A standby may learn of blocks ahead of its namespace view; defer them
  // until the corresponding edits are tailed.
  blockManager.setPostponeBlocksFromFuture(true);

  editLogTailer = new EditLogTailer(this, conf);
  editLogTailer.start();
  if (standbyShouldCheckpoint) {
    standbyCheckpointer = new StandbyCheckpointer(conf, this);
    standbyCheckpointer.start();
  }
}
1047
1048
1049 /**
1050 * Called while the NN is in Standby state, but just about to be
1051 * asked to enter Active state. This cancels any checkpoints
1052 * currently being taken.
1053 */
1054 void prepareToStopStandbyServices() throws ServiceFailedException {
1055 if (standbyCheckpointer != null) {
1056 standbyCheckpointer.cancelAndPreventCheckpoints(
1057 "About to leave standby state");
1058 }
1059 }
1060
1061 /** Stop services required in standby state */
1062 void stopStandbyServices() throws IOException {
1063 LOG.info("Stopping services started for standby state");
1064 if (standbyCheckpointer != null) {
1065 standbyCheckpointer.stop();
1066 }
1067 if (editLogTailer != null) {
1068 editLogTailer.stop();
1069 }
1070 if (dir != null && dir.fsImage != null && dir.fsImage.editLog != null) {
1071 dir.fsImage.editLog.close();
1072 }
1073 }
1074
1075 @Override
1076 public void checkOperation(OperationCategory op) throws StandbyException {
1077 if (haContext != null) {
1078 // null in some unit tests
1079 haContext.checkOperation(op);
1080 }
1081 }
1082
1083 /**
1084 * @throws RetriableException
1085 * If 1) The NameNode is in SafeMode, 2) HA is enabled, and 3)
1086 * NameNode is in active state
1087 * @throws SafeModeException
1088 * Otherwise if NameNode is in SafeMode.
1089 */
1090 private void checkNameNodeSafeMode(String errorMsg)
1091 throws RetriableException, SafeModeException {
1092 if (isInSafeMode()) {
1093 SafeModeException se = new SafeModeException(errorMsg, safeMode);
1094 if (haEnabled && haContext != null
1095 && haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
1096 throw new RetriableException(se);
1097 } else {
1098 throw se;
1099 }
1100 }
1101 }
1102
1103 public static Collection<URI> getNamespaceDirs(Configuration conf) {
1104 return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
1105 }
1106
1107 /**
1108 * Get all edits dirs which are required. If any shared edits dirs are
1109 * configured, these are also included in the set of required dirs.
1110 *
1111 * @param conf the HDFS configuration.
1112 * @return all required dirs.
1113 */
1114 public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
1115 Set<URI> ret = new HashSet<URI>();
1116 ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
1117 ret.addAll(getSharedEditsDirs(conf));
1118 return ret;
1119 }
1120
1121 private static Collection<URI> getStorageDirs(Configuration conf,
1122 String propertyName) {
1123 Collection<String> dirNames = conf.getTrimmedStringCollection(propertyName);
1124 StartupOption startOpt = NameNode.getStartupOption(conf);
1125 if(startOpt == StartupOption.IMPORT) {
1126 // In case of IMPORT this will get rid of default directories
1127 // but will retain directories specified in hdfs-site.xml
1128 // When importing image from a checkpoint, the name-node can
1129 // start with empty set of storage directories.
1130 Configuration cE = new HdfsConfiguration(false);
1131 cE.addResource("core-default.xml");
1132 cE.addResource("core-site.xml");
1133 cE.addResource("hdfs-default.xml");
1134 Collection<String> dirNames2 = cE.getTrimmedStringCollection(propertyName);
1135 dirNames.removeAll(dirNames2);
1136 if(dirNames.isEmpty())
1137 LOG.warn("!!! WARNING !!!" +
1138 "\n\tThe NameNode currently runs without persistent storage." +
1139 "\n\tAny changes to the file system meta-data may be lost." +
1140 "\n\tRecommended actions:" +
1141 "\n\t\t- shutdown and restart NameNode with configured \""
1142 + propertyName + "\" in hdfs-site.xml;" +
1143 "\n\t\t- use Backup Node as a persistent and up-to-date storage " +
1144 "of the file system meta-data.");
1145 } else if (dirNames.isEmpty()) {
1146 dirNames = Collections.singletonList(
1147 DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_DEFAULT);
1148 }
1149 return Util.stringCollectionAsURIs(dirNames);
1150 }
1151
1152 /**
1153 * Return an ordered list of edits directories to write to.
1154 * The list is ordered such that all shared edits directories
1155 * are ordered before non-shared directories, and any duplicates
1156 * are removed. The order they are specified in the configuration
1157 * is retained.
1158 * @return Collection of shared edits directories.
1159 * @throws IOException if multiple shared edits directories are configured
1160 */
1161 public static List<URI> getNamespaceEditsDirs(Configuration conf)
1162 throws IOException {
1163 return getNamespaceEditsDirs(conf, true);
1164 }
1165
1166 public static List<URI> getNamespaceEditsDirs(Configuration conf,
1167 boolean includeShared)
1168 throws IOException {
1169 // Use a LinkedHashSet so that order is maintained while we de-dup
1170 // the entries.
1171 LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
1172
1173 if (includeShared) {
1174 List<URI> sharedDirs = getSharedEditsDirs(conf);
1175
1176 // Fail until multiple shared edits directories are supported (HDFS-2782)
1177 if (sharedDirs.size() > 1) {
1178 throw new IOException(
1179 "Multiple shared edits directories are not yet supported");
1180 }
1181
1182 // First add the shared edits dirs. It's critical that the shared dirs
1183 // are added first, since JournalSet syncs them in the order they are listed,
1184 // and we need to make sure all edits are in place in the shared storage
1185 // before they are replicated locally. See HDFS-2874.
1186 for (URI dir : sharedDirs) {
1187 if (!editsDirs.add(dir)) {
1188 LOG.warn("Edits URI " + dir + " listed multiple times in " +
1189 DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
1190 }
1191 }
1192 }
1193 // Now add the non-shared dirs.
1194 for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
1195 if (!editsDirs.add(dir)) {
1196 LOG.warn("Edits URI " + dir + " listed multiple times in " +
1197 DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
1198 DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
1199 }
1200 }
1201
1202 if (editsDirs.isEmpty()) {
1203 // If this is the case, no edit dirs have been explicitly configured.
1204 // Image dirs are to be used for edits too.
1205 return Lists.newArrayList(getNamespaceDirs(conf));
1206 } else {
1207 return Lists.newArrayList(editsDirs);
1208 }
1209 }
1210
1211 /**
1212 * Returns edit directories that are shared between primary and secondary.
1213 * @param conf
1214 * @return Collection of edit directories.
1215 */
1216 public static List<URI> getSharedEditsDirs(Configuration conf) {
1217 // don't use getStorageDirs here, because we want an empty default
1218 // rather than the dir in /tmp
1219 Collection<String> dirNames = conf.getTrimmedStringCollection(
1220 DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
1221 return Util.stringCollectionAsURIs(dirNames);
1222 }
1223
1224 @Override
1225 public void readLock() {
1226 this.fsLock.readLock().lock();
1227 }
1228 @Override
1229 public void readUnlock() {
1230 this.fsLock.readLock().unlock();
1231 }
1232 @Override
1233 public void writeLock() {
1234 this.fsLock.writeLock().lock();
1235 }
1236 @Override
1237 public void writeLockInterruptibly() throws InterruptedException {
1238 this.fsLock.writeLock().lockInterruptibly();
1239 }
1240 @Override
1241 public void writeUnlock() {
1242 this.fsLock.writeLock().unlock();
1243 }
1244 @Override
1245 public boolean hasWriteLock() {
1246 return this.fsLock.isWriteLockedByCurrentThread();
1247 }
1248 @Override
1249 public boolean hasReadLock() {
1250 return this.fsLock.getReadHoldCount() > 0 || hasWriteLock();
1251 }
1252
1253 NamespaceInfo getNamespaceInfo() {
1254 readLock();
1255 try {
1256 return unprotectedGetNamespaceInfo();
1257 } finally {
1258 readUnlock();
1259 }
1260 }
1261
/**
 * Version of @see #getNamespaceInfo() that is not protected by a lock.
 * Builds the info from the current storage state: namespace ID, cluster ID,
 * block pool ID, and storage creation time.
 */
NamespaceInfo unprotectedGetNamespaceInfo() {
  return new NamespaceInfo(dir.fsImage.getStorage().getNamespaceID(),
      getClusterId(), getBlockPoolId(),
      dir.fsImage.getStorage().getCTime());
}
1270
/**
 * Close down this file system manager.
 * Causes heartbeat and lease daemons to stop; waits briefly for
 * them to finish, but a short timeout returns control back to caller.
 */
void close() {
  // Mark not-running first so monitor threads observe the shutdown.
  fsRunning = false;
  try {
    stopCommonServices();
    if (smmthread != null) smmthread.interrupt();
  } finally {
    // using finally to ensure we also wait for lease daemon
    try {
      stopActiveServices();
      stopStandbyServices();
      if (dir != null) {
        dir.close();
      }
    } catch (IOException ie) {
      // Best-effort cleanup: log and force-close the directory.
      LOG.error("Error closing FSDirectory", ie);
      IOUtils.cleanup(LOG, dir);
    }
  }
}
1295
1296 @Override
1297 public boolean isRunning() {
1298 return fsRunning;
1299 }
1300
1301 @Override
1302 public boolean isInStandbyState() {
1303 if (haContext == null || haContext.getState() == null) {
1304 // We're still starting up. In this case, if HA is
1305 // on for the cluster, we always start in standby. Otherwise
1306 // start in active.
1307 return haEnabled;
1308 }
1309
1310 return HAServiceState.STANDBY == haContext.getState().getServiceState();
1311 }
1312
1313 /**
1314 * Dump all metadata into specified file
1315 */
1316 void metaSave(String filename) throws IOException {
1317 checkSuperuserPrivilege();
1318 checkOperation(OperationCategory.UNCHECKED);
1319 writeLock();
1320 try {
1321 checkOperation(OperationCategory.UNCHECKED);
1322 File file = new File(System.getProperty("hadoop.log.dir"), filename);
1323 PrintWriter out = new PrintWriter(new BufferedWriter(
1324 new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8)));
1325 metaSave(out);
1326 out.flush();
1327 out.close();
1328 } finally {
1329 writeUnlock();
1330 }
1331 }
1332
1333 private void metaSave(PrintWriter out) {
1334 assert hasWriteLock();
1335 long totalInodes = this.dir.totalInodes();
1336 long totalBlocks = this.getBlocksTotal();
1337 out.println(totalInodes + " files and directories, " + totalBlocks
1338 + " blocks = " + (totalInodes + totalBlocks) + " total");
1339
1340 blockManager.metaSave(out);
1341 }
1342
1343 private String metaSaveAsString() {
1344 StringWriter sw = new StringWriter();
1345 PrintWriter pw = new PrintWriter(sw);
1346 metaSave(pw);
1347 pw.flush();
1348 return sw.toString();
1349 }
1350
1351
1352 long getDefaultBlockSize() {
1353 return serverDefaults.getBlockSize();
1354 }
1355
1356 FsServerDefaults getServerDefaults() throws StandbyException {
1357 checkOperation(OperationCategory.READ);
1358 return serverDefaults;
1359 }
1360
1361 long getAccessTimePrecision() {
1362 return accessTimePrecision;
1363 }
1364
1365 private boolean isAccessTimeSupported() {
1366 return accessTimePrecision > 0;
1367 }
1368
1369 /////////////////////////////////////////////////////////
1370 //
1371 // These methods are called by HadoopFS clients
1372 //
1373 /////////////////////////////////////////////////////////
1374 /**
1375 * Set permissions for an existing file.
1376 * @throws IOException
1377 */
1378 void setPermission(String src, FsPermission permission)
1379 throws AccessControlException, FileNotFoundException, SafeModeException,
1380 UnresolvedLinkException, IOException {
1381 try {
1382 setPermissionInt(src, permission);
1383 } catch (AccessControlException e) {
1384 logAuditEvent(false, "setPermission", src);
1385 throw e;
1386 }
1387 }
1388
/**
 * Set permissions on src under the write lock; only the owner may do so.
 * Syncs the edit log and audit-logs after releasing the lock.
 */
private void setPermissionInt(String src, FsPermission permission)
    throws AccessControlException, FileNotFoundException, SafeModeException,
    UnresolvedLinkException, IOException {
  HdfsFileStatus resultingStat = null;
  FSPermissionChecker pc = getPermissionChecker();
  checkOperation(OperationCategory.WRITE);
  byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
  writeLock();
  try {
    // Re-check after acquiring the lock: HA state may have changed.
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot set permission for " + src);
    src = FSDirectory.resolvePath(src, pathComponents, dir);
    checkOwner(pc, src);
    dir.setPermission(src, permission);
    resultingStat = getAuditFileInfo(src, false);
  } finally {
    writeUnlock();
  }
  // Persist the edit outside the lock, then audit-log success.
  getEditLog().logSync();
  logAuditEvent(true, "setPermission", src, null, resultingStat);
}
1410
1411 /**
1412 * Set owner for an existing file.
1413 * @throws IOException
1414 */
1415 void setOwner(String src, String username, String group)
1416 throws AccessControlException, FileNotFoundException, SafeModeException,
1417 UnresolvedLinkException, IOException {
1418 try {
1419 setOwnerInt(src, username, group);
1420 } catch (AccessControlException e) {
1421 logAuditEvent(false, "setOwner", src);
1422 throw e;
1423 }
1424 }
1425
/**
 * Set owner/group on src under the write lock. Non-superusers may only
 * "change" the owner to themselves and to a group they belong to.
 * Syncs the edit log and audit-logs after releasing the lock.
 */
private void setOwnerInt(String src, String username, String group)
    throws AccessControlException, FileNotFoundException, SafeModeException,
    UnresolvedLinkException, IOException {
  HdfsFileStatus resultingStat = null;
  FSPermissionChecker pc = getPermissionChecker();
  checkOperation(OperationCategory.WRITE);
  byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
  writeLock();
  try {
    // Re-check after acquiring the lock: HA state may have changed.
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot set owner for " + src);
    src = FSDirectory.resolvePath(src, pathComponents, dir);
    checkOwner(pc, src);
    if (!pc.isSuperUser()) {
      if (username != null && !pc.getUser().equals(username)) {
        throw new AccessControlException("Non-super user cannot change owner");
      }
      if (group != null && !pc.containsGroup(group)) {
        throw new AccessControlException("User does not belong to " + group);
      }
    }
    dir.setOwner(src, username, group);
    resultingStat = getAuditFileInfo(src, false);
  } finally {
    writeUnlock();
  }
  // Persist the edit outside the lock, then audit-log success.
  getEditLog().logSync();
  logAuditEvent(true, "setOwner", src, null, resultingStat);
}
1455
1456 /**
1457 * Get block locations within the specified range.
1458 * @see ClientProtocol#getBlockLocations(String, long, long)
1459 */
1460 LocatedBlocks getBlockLocations(String clientMachine, String src,
1461 long offset, long length) throws AccessControlException,
1462 FileNotFoundException, UnresolvedLinkException, IOException {
1463 LocatedBlocks blocks = getBlockLocations(src, offset, length, true, true,
1464 true);
1465 if (blocks != null) {
1466 blockManager.getDatanodeManager().sortLocatedBlocks(
1467 clientMachine, blocks.getLocatedBlocks());
1468
1469 LocatedBlock lastBlock = blocks.getLastLocatedBlock();
1470 if (lastBlock != null) {
1471 ArrayList<LocatedBlock> lastBlockList = new ArrayList<LocatedBlock>();
1472 lastBlockList.add(lastBlock);
1473 blockManager.getDatanodeManager().sortLocatedBlocks(
1474 clientMachine, lastBlockList);
1475 }
1476 }
1477 return blocks;
1478 }
1479
1480 /**
1481 * Get block locations within the specified range.
1482 * @see ClientProtocol#getBlockLocations(String, long, long)
1483 * @throws FileNotFoundException, UnresolvedLinkException, IOException
1484 */
1485 LocatedBlocks getBlockLocations(String src, long offset, long length,
1486 boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode)
1487 throws FileNotFoundException, UnresolvedLinkException, IOException {
1488 try {
1489 return getBlockLocationsInt(src, offset, length, doAccessTime,
1490 needBlockToken, checkSafeMode);
1491 } catch (AccessControlException e) {
1492 logAuditEvent(false, "open", src);
1493 throw e;
1494 }
1495 }
1496
/**
 * Validate offset/length, fetch block locations (updating access time when
 * permitted), audit-log the open, and optionally reject blocks with no
 * reported locations while in safe mode.
 */
private LocatedBlocks getBlockLocationsInt(String src, long offset,
    long length, boolean doAccessTime, boolean needBlockToken,
    boolean checkSafeMode)
    throws FileNotFoundException, UnresolvedLinkException, IOException {
  if (offset < 0) {
    throw new HadoopIllegalArgumentException(
        "Negative offset is not supported. File: " + src);
  }
  if (length < 0) {
    throw new HadoopIllegalArgumentException(
        "Negative length is not supported. File: " + src);
  }
  final LocatedBlocks ret = getBlockLocationsUpdateTimes(src,
      offset, length, doAccessTime, needBlockToken);
  logAuditEvent(true, "open", src);
  if (checkSafeMode && isInSafeMode()) {
    for (LocatedBlock b : ret.getLocatedBlocks()) {
      // if safemode & no block locations yet then throw safemodeException
      if ((b.getLocations() == null) || (b.getLocations().length == 0)) {
        SafeModeException se = new SafeModeException(
            "Zero blocklocations for " + src, safeMode);
        if (haEnabled && haContext != null &&
            haContext.getState().getServiceState() == HAServiceState.ACTIVE) {
          // On an HA active NN, locations may appear once more block
          // reports arrive, so let the client retry.
          throw new RetriableException(se);
        } else {
          throw se;
        }
      }
    }
  }
  return ret;
}
1529
1530 /*
1531 * Get block locations within the specified range, updating the
1532 * access times if necessary.
1533 */
1534 private LocatedBlocks getBlockLocationsUpdateTimes(String src, long offset,
1535 long length, boolean doAccessTime, boolean needBlockToken)
1536 throws FileNotFoundException,
1537 UnresolvedLinkException, IOException {
1538 FSPermissionChecker pc = getPermissionChecker();
1539 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
1540 for (int attempt = 0; attempt < 2; attempt++) {
1541 boolean isReadOp = (attempt == 0);
1542 if (isReadOp) { // first attempt is with readlock
1543 checkOperation(OperationCategory.READ);
1544 readLock();
1545 } else { // second attempt is with write lock
1546 checkOperation(OperationCategory.WRITE);
1547 writeLock(); // writelock is needed to set accesstime
1548 }
1549 src = FSDirectory.resolvePath(src, pathComponents, dir);
1550 try {
1551 if (isReadOp) {
1552 checkOperation(OperationCategory.READ);
1553 } else {
1554 checkOperation(OperationCategory.WRITE);
1555 }
1556 if (isPermissionEnabled) {
1557 checkPathAccess(pc, src, FsAction.READ);
1558 }
1559
1560 // if the namenode is in safemode, then do not update access time
1561 if (isInSafeMode()) {
1562 doAccessTime = false;
1563 }
1564
1565 final INodesInPath iip = dir.getLastINodeInPath(src);
1566 final INodeFile inode = INodeFile.valueOf(iip.getLastINode(), src);
1567 if (!iip.isSnapshot() //snapshots are readonly, so don't update atime.
1568 && doAccessTime && isAccessTimeSupported()) {
1569 final long now = now();
1570 if (now > inode.getAccessTime() + getAccessTimePrecision()) {
1571 // if we have to set access time but we only have the readlock, then
1572 // restart this entire operation with the writeLock.
1573 if (isReadOp) {
1574 continue;
1575 }
1576 dir.setTimes(src, inode, -1, now, false, iip.getLatestSnapshot());
1577 }
1578 }
1579 final long fileSize = iip.isSnapshot() ?
1580 inode.computeFileSize(iip.getPathSnapshot())
1581 : inode.computeFileSizeNotIncludingLastUcBlock();
1582 boolean isUc = inode.isUnderConstruction();
1583 if (iip.isSnapshot()) {
1584 // if src indicates a snapshot file, we need to make sure the returned
1585 // blocks do not exceed the size of the snapshot file.
1586 length = Math.min(length, fileSize - offset);
1587 isUc = false;
1588 }
1589 return blockManager.createLocatedBlocks(inode.getBlocks(), fileSize,
1590 isUc, offset, length, needBlockToken, iip.isSnapshot());
1591 } finally {
1592 if (isReadOp) {
1593 readUnlock();
1594 } else {
1595 writeUnlock();
1596 }
1597 }
1598 }
1599 return null; // can never reach here
1600 }
1601
/**
 * Moves all the blocks from srcs and appends them to trg
 * To avoid rollbacks we will verify validity of ALL of the args
 * before we start actual move.
 *
 * This does not support ".inodes" relative path
 * @param target file the source blocks are appended to
 * @param srcs source files whose blocks are moved
 * @throws IOException on validation or move failure
 */
void concat(String target, String [] srcs)
    throws IOException, UnresolvedLinkException {
  // Retry cache: a retried call that already succeeded returns immediately.
  CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
  if (cacheEntry != null && cacheEntry.isSuccess()) {
    return; // Return previous response
  }

  // Either there is no previous request in progress or it has failed
  if(FSNamesystem.LOG.isDebugEnabled()) {
    FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) +
        " to " + target);
  }

  boolean success = false;
  try {
    concatInt(target, srcs, cacheEntry != null);
    success = true;
  } catch (AccessControlException e) {
    logAuditEvent(false, "concat", Arrays.toString(srcs), target, null);
    throw e;
  } finally {
    // Record the outcome so retries of this call behave correctly.
    RetryCache.setState(cacheEntry, success);
  }
}
1636
1637 private void concatInt(String target, String [] srcs,
1638 boolean logRetryCache) throws IOException, UnresolvedLinkException {
1639 // verify args
1640 if(target.isEmpty()) {
1641 throw new IllegalArgumentException("Target file name is empty");
1642 }
1643 if(srcs == null || srcs.length == 0) {
1644 throw new IllegalArgumentException("No sources given");
1645 }
1646
1647 // We require all files be in the same directory
1648 String trgParent =
1649 target.substring(0, target.lastIndexOf(Path.SEPARATOR_CHAR));
1650 for (String s : srcs) {
1651 String srcParent = s.substring(0, s.lastIndexOf(Path.SEPARATOR_CHAR));
1652 if (!srcParent.equals(trgParent)) {
1653 throw new IllegalArgumentException(
1654 "Sources and target are not in the same directory");
1655 }
1656 }
1657
1658 HdfsFileStatus resultingStat = null;
1659 FSPermissionChecker pc = getPermissionChecker();
1660 checkOperation(OperationCategory.WRITE);
1661 writeLock();
1662 try {
1663 checkOperation(OperationCategory.WRITE);
1664 checkNameNodeSafeMode("Cannot concat " + target);
1665 concatInternal(pc, target, srcs, logRetryCache);
1666 resultingStat = getAuditFileInfo(target, false);
1667 } finally {
1668 writeUnlock();
1669 }
1670 getEditLog().logSync();
1671 logAuditEvent(true, "concat", Arrays.toString(srcs), target, resultingStat);
1672 }
1673
1674 /** See {@link #concat(String, String[])} */
1675 private void concatInternal(FSPermissionChecker pc, String target,
1676 String[] srcs, boolean logRetryCache) throws IOException,
1677 UnresolvedLinkException {
1678 assert hasWriteLock();
1679
1680 // write permission for the target
1681 if (isPermissionEnabled) {
1682 checkPathAccess(pc, target, FsAction.WRITE);
1683
1684 // and srcs
1685 for(String aSrc: srcs) {
1686 checkPathAccess(pc, aSrc, FsAction.READ); // read the file
1687 checkParentAccess(pc, aSrc, FsAction.WRITE); // for delete
1688 }
1689 }
1690
1691 // to make sure no two files are the same
1692 Set<INode> si = new HashSet<INode>();
1693
1694 // we put the following prerequisite for the operation
1695 // replication and blocks sizes should be the same for ALL the blocks
1696
1697 // check the target
1698 final INodeFile trgInode = INodeFile.valueOf(dir.getINode4Write(target),
1699 target);
1700 if(trgInode.isUnderConstruction()) {
1701 throw new HadoopIllegalArgumentException("concat: target file "
1702 + target + " is under construction");
1703 }
1704 // per design target shouldn't be empty and all the blocks same size
1705 if(trgInode.numBlocks() == 0) {
1706 throw new HadoopIllegalArgumentException("concat: target file "
1707 + target + " is empty");
1708 }
1709 if (trgInode instanceof INodeFileWithSnapshot) {
1710 throw new HadoopIllegalArgumentException("concat: target file "
1711 + target + " is in a snapshot");
1712 }
1713
1714 long blockSize = trgInode.getPreferredBlockSize();
1715
1716 // check the end block to be full
1717 final BlockInfo last = trgInode.getLastBlock();
1718 if(blockSize != last.getNumBytes()) {
1719 throw new HadoopIllegalArgumentException("The last block in " + target
1720 + " is not full; last block size = " + last.getNumBytes()
1721 + " but file block size = " + blockSize);
1722 }
1723
1724 si.add(trgInode);
1725 final short repl = trgInode.getFileReplication();
1726
1727 // now check the srcs
1728 boolean endSrc = false; // final src file doesn't have to have full end block
1729 for(int i=0; i<srcs.length; i++) {
1730 String src = srcs[i];
1731 if(i==srcs.length-1)
1732 endSrc=true;
1733
1734 final INodeFile srcInode = INodeFile.valueOf(dir.getINode4Write(src), src);
1735 if(src.isEmpty()
1736 || srcInode.isUnderConstruction()
1737 || srcInode.numBlocks() == 0) {
1738 throw new HadoopIllegalArgumentException("concat: source file " + src
1739 + " is invalid or empty or underConstruction");
1740 }
1741
1742 // check replication and blocks size
1743 if(repl != srcInode.getBlockReplication()) {
1744 throw new HadoopIllegalArgumentException("concat: the soruce file "
1745 + src + " and the target file " + target
1746 + " should have the same replication: source replication is "
1747 + srcInode.getBlockReplication()
1748 + " but target replication is " + repl);
1749 }
1750
1751 //boolean endBlock=false;
1752 // verify that all the blocks are of the same length as target
1753 // should be enough to check the end blocks
1754 final BlockInfo[] srcBlocks = srcInode.getBlocks();
1755 int idx = srcBlocks.length-1;
1756 if(endSrc)
1757 idx = srcBlocks.length-2; // end block of endSrc is OK not to be full
1758 if(idx >= 0 && srcBlocks[idx].getNumBytes() != blockSize) {
1759 throw new HadoopIllegalArgumentException("concat: the soruce file "
1760 + src + " and the target file " + target
1761 + " should have the same blocks sizes: target block size is "
1762 + blockSize + " but the size of source block " + idx + " is "
1763 + srcBlocks[idx].getNumBytes());
1764 }
1765
1766 si.add(srcInode);
1767 }
1768
1769 // make sure no two files are the same
1770 if(si.size() < srcs.length+1) { // trg + srcs
1771 // it means at least two files are the same
1772 throw new HadoopIllegalArgumentException(
1773 "concat: at least two of the source files are the same");
1774 }
1775
1776 if(NameNode.stateChangeLog.isDebugEnabled()) {
1777 NameNode.stateChangeLog.debug("DIR* NameSystem.concat: " +
1778 Arrays.toString(srcs) + " to " + target);
1779 }
1780
1781 dir.concat(target,srcs, logRetryCache);
1782 }
1783
1784 /**
1785 * stores the modification and access time for this inode.
1786 * The access time is precise upto an hour. The transaction, if needed, is
1787 * written to the edits log but is not flushed.
1788 */
1789 void setTimes(String src, long mtime, long atime)
1790 throws IOException, UnresolvedLinkException {
1791 if (!isAccessTimeSupported() && atime != -1) {
1792 throw new IOException("Access time for hdfs is not configured. " +
1793 " Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter.");
1794 }
1795 try {
1796 setTimesInt(src, mtime, atime);
1797 } catch (AccessControlException e) {
1798 logAuditEvent(false, "setTimes", src);
1799 throw e;
1800 }
1801 }
1802
  /**
   * Internal implementation of {@link #setTimes}: resolves the path,
   * checks write permission, and updates the inode's times under the
   * namesystem write lock. Audits success after releasing the lock.
   */
  private void setTimesInt(String src, long mtime, long atime) 
    throws IOException, UnresolvedLinkException {
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check the operation category after acquiring the lock.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set times " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);

      // Write access is required to set access and modification times
      if (isPermissionEnabled) {
        checkPathAccess(pc, src, FsAction.WRITE);
      }
      final INodesInPath iip = dir.getINodesInPath4Write(src);
      final INode inode = iip.getLastINode();
      if (inode != null) {
        // 'true' forces the update even within the atime precision window.
        dir.setTimes(src, inode, mtime, atime, true, iip.getLatestSnapshot());
        resultingStat = getAuditFileInfo(src, false);
      } else {
        throw new FileNotFoundException("File/Directory " + src + " does not exist.");
      }
    } finally {
      writeUnlock();
    }
    logAuditEvent(true, "setTimes", src, null, resultingStat);
  }
1832
1833 /**
1834 * Create a symbolic link.
1835 */
1836 @SuppressWarnings("deprecation")
1837 void createSymlink(String target, String link,
1838 PermissionStatus dirPerms, boolean createParent)
1839 throws IOException, UnresolvedLinkException {
1840 if (!FileSystem.isSymlinksEnabled()) {
1841 throw new UnsupportedOperationException("Symlinks not supported");
1842 }
1843 if (!DFSUtil.isValidName(link)) {
1844 throw new InvalidPathException("Invalid link name: " + link);
1845 }
1846 if (FSDirectory.isReservedName(target)) {
1847 throw new InvalidPathException("Invalid target name: " + target);
1848 }
1849 CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
1850 if (cacheEntry != null && cacheEntry.isSuccess()) {
1851 return; // Return previous response
1852 }
1853 boolean success = false;
1854 try {
1855 createSymlinkInt(target, link, dirPerms, createParent, cacheEntry != null);
1856 success = true;
1857 } catch (AccessControlException e) {
1858 logAuditEvent(false, "createSymlink", link, target, null);
1859 throw e;
1860 } finally {
1861 RetryCache.setState(cacheEntry, success);
1862 }
1863 }
1864
  /**
   * Internal implementation of {@link #createSymlink}: validates the link
   * path, checks quota and permissions, and adds the symlink to the
   * namespace under the write lock.
   */
  private void createSymlinkInt(String target, String link,
      PermissionStatus dirPerms, boolean createParent, boolean logRetryCache) 
      throws IOException, UnresolvedLinkException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target="
          + target + " link=" + link);
    }
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(link);
    writeLock();
    try {
      // Re-check the operation category after acquiring the lock.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create symlink " + link);
      link = FSDirectory.resolvePath(link, pathComponents, dir);
      if (!createParent) {
        // Caller did not ask for parent creation: the parent must exist.
        verifyParentDir(link);
      }
      if (!dir.isValidToCreate(link)) {
        throw new IOException("failed to create link " + link 
            +" either because the filename is invalid or the file exists");
      }
      if (isPermissionEnabled) {
        checkAncestorAccess(pc, link, FsAction.WRITE);
      }
      // validate that we have enough inodes.
      checkFsObjectLimit();

      // add symbolic link to namespace
      dir.addSymlink(link, target, dirPerms, createParent, logRetryCache);
      resultingStat = getAuditFileInfo(link, false);
    } finally {
      writeUnlock();
    }
    // Persist the edit outside the lock, then audit success.
    getEditLog().logSync();
    logAuditEvent(true, "createSymlink", link, target, resultingStat);
  }
1903
1904 /**
1905 * Set replication for an existing file.
1906 *
1907 * The NameNode sets new replication and schedules either replication of
1908 * under-replicated data blocks or removal of the excessive block copies
1909 * if the blocks are over-replicated.
1910 *
1911 * @see ClientProtocol#setReplication(String, short)
1912 * @param src file name
1913 * @param replication new replication
1914 * @return true if successful;
1915 * false if file does not exist or is a directory
1916 */
1917 boolean setReplication(final String src, final short replication)
1918 throws IOException {
1919 try {
1920 return setReplicationInt(src, replication);
1921 } catch (AccessControlException e) {
1922 logAuditEvent(false, "setReplication", src);
1923 throw e;
1924 }
1925 }
1926
  /**
   * Internal implementation of {@link #setReplication}: updates the
   * replication factor in the namespace and hands the old/new values to
   * the block manager so it can schedule re-replication or deletion.
   *
   * @return true if src is an existing file, false otherwise
   */
  private boolean setReplicationInt(String src, final short replication)
      throws IOException {
    // Validate the requested replication against configured min/max first.
    blockManager.verifyReplication(src, replication, null);
    final boolean isFile;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      // Re-check the operation category after acquiring the lock.
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set replication for " + src);
      src = FSDirectory.resolvePath(src, pathComponents, dir);
      if (isPermissionEnabled) {
        checkPathAccess(pc, src, FsAction.WRITE);
      }

      final short[] blockRepls = new short[2]; // 0: old, 1: new
      // Returns null if src does not exist or is not a file.
      final Block[] blocks = dir.setReplication(src, replication, blockRepls);
      isFile = blocks != null;
      if (isFile) {
        blockManager.setReplication(blockRepls[0], blockRepls[1], src, blocks);
      }
    } finally {
      writeUnlock();
    }

    getEditLog().logSync();
    if (isFile) {
      logAuditEvent(true, "setReplication", src);
    }
    return isFile;
  }
1959
1960 long getPreferredBlockSize(String filename)
1961 throws IOException, UnresolvedLinkException {
1962 FSPermissionChecker pc = getPermissionChecker();
1963 checkOperation(OperationCategory.READ);
1964 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(filename);
1965 readLock();
1966 try {
1967 checkOperation(OperationCategory.READ);
1968 filename = FSDirectory.resolvePath(filename, pathComponents, dir);
1969 if (isPermissionEnabled) {
1970 checkTraverse(pc, filename);
1971 }
1972 return dir.getPreferredBlockSize(filename);
1973 } finally {
1974 readUnlock();
1975 }
1976 }
1977
1978 /**
1979 * Verify that parent directory of src exists.
1980 */
1981 private void verifyParentDir(String src) throws FileNotFoundException,
1982 ParentNotDirectoryException, UnresolvedLinkException {
1983 assert hasReadLock();
1984 Path parent = new Path(src).getParent();
1985 if (parent != null) {
1986 final INode parentNode = dir.getINode(parent.toString());
1987 if (parentNode == null) {
1988 throw new FileNotFoundException("Parent directory doesn't exist: "
1989 + parent);
1990 } else if (!parentNode.isDirectory() && !parentNode.isSymlink()) {
1991 throw new ParentNotDirectoryException("Parent path is not a directory: "
1992 + parent);
1993 }
1994 }
1995 }
1996
1997 /**
1998 * Create a new file entry in the namespace.
1999 *
2000 * For description of parameters and exceptions thrown see
2001 * {@link ClientProtocol#create()}, except it returns valid file status upon
2002 * success
2003 *
2004 * For retryCache handling details see -
2005 * {@link #getFileStatus(boolean, CacheEntryWithPayload)}
2006 *
2007 */
2008 HdfsFileStatus startFile(String src, PermissionStatus permissions,
2009 String holder, String clientMachine, EnumSet<CreateFlag> flag,
2010 boolean createParent, short replication, long blockSize)
2011 throws AccessControlException, SafeModeException,
2012 FileAlreadyExistsException, UnresolvedLinkException,
2013 FileNotFoundException, ParentNotDirectoryException, IOException {
2014 HdfsFileStatus status = null;
2015 CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2016 null);
2017 if (cacheEntry != null && cacheEntry.isSuccess()) {
2018 return (HdfsFileStatus) cacheEntry.getPayload();
2019 }
2020
2021 try {
2022 status = startFileInt(src, permissions, holder, clientMachine, flag,
2023 createParent, replication, blockSize, cacheEntry != null);
2024 } catch (AccessControlException e) {
2025 logAuditEvent(false, "create", src);
2026 throw e;
2027 } finally {
2028 RetryCache.setState(cacheEntry, status != null, status);
2029 }
2030 return status;
2031 }
2032
2033 private HdfsFileStatus startFileInt(String src, PermissionStatus permissions,
2034 String holder, String clientMachine, EnumSet<CreateFlag> flag,
2035 boolean createParent, short replication, long blockSize,
2036 boolean logRetryCache) throws AccessControlException, SafeModeException,
2037 FileAlreadyExistsException, UnresolvedLinkException,
2038 FileNotFoundException, ParentNotDirectoryException, IOException {
2039 if (NameNode.stateChangeLog.isDebugEnabled()) {
2040 NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: src=" + src
2041 + ", holder=" + holder
2042 + ", clientMachine=" + clientMachine
2043 + ", createParent=" + createParent
2044 + ", replication=" + replication
2045 + ", createFlag=" + flag.toString());
2046 }
2047 if (!DFSUtil.isValidName(src)) {
2048 throw new InvalidPathException(src);
2049 }
2050 blockManager.verifyReplication(src, replication, clientMachine);
2051
2052 boolean skipSync = false;
2053 HdfsFileStatus stat = null;
2054 FSPermissionChecker pc = getPermissionChecker();
2055 checkOperation(OperationCategory.WRITE);
2056 if (blockSize < minBlockSize) {
2057 throw new IOException("Specified block size is less than configured" +
2058 " minimum value (" + DFSConfigKeys.DFS_NAMENODE_MIN_BLOCK_SIZE_KEY
2059 + "): " + blockSize + " < " + minBlockSize);
2060 }
2061 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2062 boolean create = flag.contains(CreateFlag.CREATE);
2063 boolean overwrite = flag.contains(CreateFlag.OVERWRITE);
2064 writeLock();
2065 try {
2066 checkOperation(OperationCategory.WRITE);
2067 checkNameNodeSafeMode("Cannot create file" + src);
2068 src = FSDirectory.resolvePath(src, pathComponents, dir);
2069 startFileInternal(pc, src, permissions, holder, clientMachine, create,
2070 overwrite, createParent, replication, blockSize, logRetryCache);
2071 stat = dir.getFileInfo(src, false);
2072 } catch (StandbyException se) {
2073 skipSync = true;
2074 throw se;
2075 } finally {
2076 writeUnlock();
2077 // There might be transactions logged while trying to recover the lease.
2078 // They need to be sync'ed even when an exception was thrown.
2079 if (!skipSync) {
2080 getEditLog().logSync();
2081 }
2082 }
2083 logAuditEvent(true, "create", src, null, stat);
2084 return stat;
2085 }
2086
2087 /**
2088 * Create a new file or overwrite an existing file<br>
2089 *
2090 * Once the file is create the client then allocates a new block with the next
2091 * call using {@link NameNode#addBlock()}.
2092 * <p>
2093 * For description of parameters and exceptions thrown see
2094 * {@link ClientProtocol#create()}
2095 */
2096 private void startFileInternal(FSPermissionChecker pc, String src,
2097 PermissionStatus permissions, String holder, String clientMachine,
2098 boolean create, boolean overwrite, boolean createParent,
2099 short replication, long blockSize, boolean logRetryEntry)
2100 throws FileAlreadyExistsException, AccessControlException,
2101 UnresolvedLinkException, FileNotFoundException,
2102 ParentNotDirectoryException, IOException {
2103 assert hasWriteLock();
2104 // Verify that the destination does not exist as a directory already.
2105 final INodesInPath iip = dir.getINodesInPath4Write(src);
2106 final INode inode = iip.getLastINode();
2107 if (inode != null && inode.isDirectory()) {
2108 throw new FileAlreadyExistsException("Cannot create file " + src
2109 + "; already exists as a directory.");
2110 }
2111 final INodeFile myFile = INodeFile.valueOf(inode, src, true);
2112 if (isPermissionEnabled) {
2113 if (overwrite && myFile != null) {
2114 checkPathAccess(pc, src, FsAction.WRITE);
2115 } else {
2116 checkAncestorAccess(pc, src, FsAction.WRITE);
2117 }
2118 }
2119
2120 if (!createParent) {
2121 verifyParentDir(src);
2122 }
2123
2124 try {
2125 if (myFile == null) {
2126 if (!create) {
2127 throw new FileNotFoundException("failed to overwrite non-existent file "
2128 + src + " on client " + clientMachine);
2129 }
2130 } else {
2131 if (overwrite) {
2132 try {
2133 deleteInt(src, true, false); // File exists - delete if overwrite
2134 } catch (AccessControlException e) {
2135 logAuditEvent(false, "delete", src);
2136 throw e;
2137 }
2138 } else {
2139 // If lease soft limit time is expired, recover the lease
2140 recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2141 throw new FileAlreadyExistsException("failed to create file " + src
2142 + " on client " + clientMachine + " because the file exists");
2143 }
2144 }
2145
2146 checkFsObjectLimit();
2147 final DatanodeDescriptor clientNode =
2148 blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2149
2150 INodeFileUnderConstruction newNode = dir.addFile(src, permissions,
2151 replication, blockSize, holder, clientMachine, clientNode);
2152 if (newNode == null) {
2153 throw new IOException("DIR* NameSystem.startFile: " +
2154 "Unable to add file to namespace.");
2155 }
2156 leaseManager.addLease(newNode.getClientName(), src);
2157
2158 // record file record in log, record new generation stamp
2159 getEditLog().logOpenFile(src, newNode, logRetryEntry);
2160 if (NameNode.stateChangeLog.isDebugEnabled()) {
2161 NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: "
2162 +"add "+src+" to namespace for "+holder);
2163 }
2164 } catch (IOException ie) {
2165 NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: "
2166 +ie.getMessage());
2167 throw ie;
2168 }
2169 }
2170
2171 /**
2172 * Append to an existing file for append.
2173 * <p>
2174 *
2175 * The method returns the last block of the file if this is a partial block,
2176 * which can still be used for writing more data. The client uses the returned
2177 * block locations to form the data pipeline for this block.<br>
2178 * The method returns null if the last block is full. The client then
2179 * allocates a new block with the next call using {@link NameNode#addBlock()}.
2180 * <p>
2181 *
2182 * For description of parameters and exceptions thrown see
2183 * {@link ClientProtocol#append(String, String)}
2184 *
2185 * @return the last block locations if the block is partial or null otherwise
2186 */
2187 private LocatedBlock appendFileInternal(FSPermissionChecker pc, String src,
2188 String holder, String clientMachine, boolean logRetryCache)
2189 throws AccessControlException, UnresolvedLinkException,
2190 FileNotFoundException, IOException {
2191 assert hasWriteLock();
2192 // Verify that the destination does not exist as a directory already.
2193 final INodesInPath iip = dir.getINodesInPath4Write(src);
2194 final INode inode = iip.getLastINode();
2195 if (inode != null && inode.isDirectory()) {
2196 throw new FileAlreadyExistsException("Cannot append to directory " + src
2197 + "; already exists as a directory.");
2198 }
2199 if (isPermissionEnabled) {
2200 checkPathAccess(pc, src, FsAction.WRITE);
2201 }
2202
2203 try {
2204 if (inode == null) {
2205 throw new FileNotFoundException("failed to append to non-existent file "
2206 + src + " on client " + clientMachine);
2207 }
2208 INodeFile myFile = INodeFile.valueOf(inode, src, true);
2209 // Opening an existing file for write - may need to recover lease.
2210 recoverLeaseInternal(myFile, src, holder, clientMachine, false);
2211
2212 // recoverLeaseInternal may create a new InodeFile via
2213 // finalizeINodeFileUnderConstruction so we need to refresh
2214 // the referenced file.
2215 myFile = INodeFile.valueOf(dir.getINode(src), src, true);
2216
2217 final DatanodeDescriptor clientNode =
2218 blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
2219 return prepareFileForWrite(src, myFile, holder, clientMachine, clientNode,
2220 true, iip.getLatestSnapshot(), logRetryCache);
2221 } catch (IOException ie) {
2222 NameNode.stateChangeLog.warn("DIR* NameSystem.append: " +ie.getMessage());
2223 throw ie;
2224 }
2225 }
2226
2227 /**
2228 * Replace current node with a INodeUnderConstruction.
2229 * Recreate in-memory lease record.
2230 *
2231 * @param src path to the file
2232 * @param file existing file object
2233 * @param leaseHolder identifier of the lease holder on this file
2234 * @param clientMachine identifier of the client machine
2235 * @param clientNode if the client is collocated with a DN, that DN's descriptor
2236 * @param writeToEditLog whether to persist this change to the edit log
2237 * @param logRetryCache whether to record RPC ids in editlog for retry cache
2238 * rebuilding
2239 * @return the last block locations if the block is partial or null otherwise
2240 * @throws UnresolvedLinkException
2241 * @throws IOException
2242 */
2243 LocatedBlock prepareFileForWrite(String src, INodeFile file,
2244 String leaseHolder, String clientMachine, DatanodeDescriptor clientNode,
2245 boolean writeToEditLog, Snapshot latestSnapshot, boolean logRetryCache)
2246 throws IOException {
2247 file = file.recordModification(latestSnapshot, dir.getINodeMap());
2248 final INodeFileUnderConstruction cons = file.toUnderConstruction(
2249 leaseHolder, clientMachine, clientNode);
2250
2251 dir.replaceINodeFile(src, file, cons);
2252 leaseManager.addLease(cons.getClientName(), src);
2253
2254 LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons);
2255 if (writeToEditLog) {
2256 getEditLog().logOpenFile(src, cons, logRetryCache);
2257 }
2258 return ret;
2259 }
2260
2261 /**
2262 * Recover lease;
2263 * Immediately revoke the lease of the current lease holder and start lease
2264 * recovery so that the file can be forced to be closed.
2265 *
2266 * @param src the path of the file to start lease recovery
2267 * @param holder the lease holder's name
2268 * @param clientMachine the client machine's name
2269 * @return true if the file is already closed
2270 * @throws IOException
2271 */
2272 boolean recoverLease(String src, String holder, String clientMachine)
2273 throws IOException {
2274 if (!DFSUtil.isValidName(src)) {
2275 throw new IOException("Invalid file name: " + src);
2276 }
2277
2278 boolean skipSync = false;
2279 FSPermissionChecker pc = getPermissionChecker();
2280 checkOperation(OperationCategory.WRITE);
2281 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2282 writeLock();
2283 try {
2284 checkOperation(OperationCategory.WRITE);
2285 checkNameNodeSafeMode("Cannot recover the lease of " + src);
2286 src = FSDirectory.resolvePath(src, pathComponents, dir);
2287 final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src);
2288 if (!inode.isUnderConstruction()) {
2289 return true;
2290 }
2291 if (isPermissionEnabled) {
2292 checkPathAccess(pc, src, FsAction.WRITE);
2293 }
2294
2295 recoverLeaseInternal(inode, src, holder, clientMachine, true);
2296 } catch (StandbyException se) {
2297 skipSync = true;
2298 throw se;
2299 } finally {
2300 writeUnlock();
2301 // There might be transactions logged while trying to recover the lease.
2302 // They need to be sync'ed even when an exception was thrown.
2303 if (!skipSync) {
2304 getEditLog().logSync();
2305 }
2306 }
2307 return false;
2308 }
2309
  /**
   * Core lease-recovery logic, shared by create, append and recoverLease.
   *
   * If the file is under construction, either releases the current
   * holder's lease (force), starts soft-limit recovery, or throws to tell
   * the caller the file is busy. Must be called with the write lock held.
   *
   * @param fileInode the file whose lease may need recovery (may be null)
   * @param src path of the file
   * @param holder client requesting the operation
   * @param clientMachine requesting client's machine name
   * @param force if true, revoke the lease immediately regardless of limits
   */
  private void recoverLeaseInternal(INodeFile fileInode, 
      String src, String holder, String clientMachine, boolean force)
      throws IOException {
    assert hasWriteLock();
    if (fileInode != null && fileInode.isUnderConstruction()) {
      INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) fileInode;
      //
      // If the file is under construction , then it must be in our
      // leases. Find the appropriate lease record.
      //
      Lease lease = leaseManager.getLease(holder);
      //
      // We found the lease for this file. And surprisingly the original
      // holder is trying to recreate this file. This should never occur.
      //
      if (!force && lease != null) {
        Lease leaseFile = leaseManager.getLeaseByPath(src);
        if ((leaseFile != null && leaseFile.equals(lease)) ||
            lease.getHolder().equals(holder)) { 
          throw new AlreadyBeingCreatedException(
            "failed to create file " + src + " for " + holder +
            " on client " + clientMachine + 
            " because current leaseholder is trying to recreate file.");
        }
      }
      //
      // Find the original holder.
      //
      lease = leaseManager.getLease(pendingFile.getClientName());
      if (lease == null) {
        // Inconsistent state: under-construction file with no lease record.
        throw new AlreadyBeingCreatedException(
          "failed to create file " + src + " for " + holder +
          " on client " + clientMachine + 
          " because pendingCreates is non-null but no leases found.");
      }
      if (force) {
        // close now: no need to wait for soft lease expiration and 
        // close only the file src
        LOG.info("recoverLease: " + lease + ", src=" + src +
          " from client " + pendingFile.getClientName());
        internalReleaseLease(lease, src, holder);
      } else {
        assert lease.getHolder().equals(pendingFile.getClientName()) :
          "Current lease holder " + lease.getHolder() +
          " does not match file creator " + pendingFile.getClientName();
        //
        // If the original holder has not renewed in the last SOFTLIMIT 
        // period, then start lease recovery.
        //
        if (lease.expiredSoftLimit()) {
          LOG.info("startFile: recover " + lease + ", src=" + src + " client "
              + pendingFile.getClientName());
          // null holder: recovery, not reassignment to the caller.
          boolean isClosed = internalReleaseLease(lease, src, null);
          if(!isClosed)
            throw new RecoveryInProgressException(
                "Failed to close file " + src +
                ". Lease recovery is in progress. Try again later.");
        } else {
          // Lease is still within the soft limit: the file is legitimately
          // busy; tell the caller why it cannot proceed.
          final BlockInfo lastBlock = pendingFile.getLastBlock();
          if (lastBlock != null
              && lastBlock.getBlockUCState() == BlockUCState.UNDER_RECOVERY) {
            throw new RecoveryInProgressException("Recovery in progress, file ["
                + src + "], " + "lease owner [" + lease.getHolder() + "]");
          } else {
            throw new AlreadyBeingCreatedException("Failed to create file ["
                + src + "] for [" + holder + "] on client [" + clientMachine
                + "], because this file is already being created by ["
                + pendingFile.getClientName() + "] on ["
                + pendingFile.getClientMachine() + "]");
          }
        }
      }
    }
  }
2384
2385 /**
2386 * Append to an existing file in the namespace.
2387 */
2388 LocatedBlock appendFile(String src, String holder, String clientMachine)
2389 throws AccessControlException, SafeModeException,
2390 FileAlreadyExistsException, FileNotFoundException,
2391 ParentNotDirectoryException, IOException {
2392 LocatedBlock lb = null;
2393 CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
2394 null);
2395 if (cacheEntry != null && cacheEntry.isSuccess()) {
2396 return (LocatedBlock) cacheEntry.getPayload();
2397 }
2398
2399 boolean success = false;
2400 try {
2401 lb = appendFileInt(src, holder, clientMachine, cacheEntry != null);
2402 success = true;
2403 return lb;
2404 } catch (AccessControlException e) {
2405 logAuditEvent(false, "append", src);
2406 throw e;
2407 } finally {
2408 RetryCache.setState(cacheEntry, success, lb);
2409 }
2410 }
2411
2412 private LocatedBlock appendFileInt(String src, String holder,
2413 String clientMachine, boolean logRetryCache)
2414 throws AccessControlException, SafeModeException,
2415 FileAlreadyExistsException, FileNotFoundException,
2416 ParentNotDirectoryException, IOException {
2417 if (NameNode.stateChangeLog.isDebugEnabled()) {
2418 NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: src=" + src
2419 + ", holder=" + holder
2420 + ", clientMachine=" + clientMachine);
2421 }
2422 boolean skipSync = false;
2423 if (!supportAppends) {
2424 throw new UnsupportedOperationException(
2425 "Append is not enabled on this NameNode. Use the " +
2426 DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
2427 }
2428
2429 LocatedBlock lb = null;
2430 FSPermissionChecker pc = getPermissionChecker();
2431 checkOperation(OperationCategory.WRITE);
2432 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2433 writeLock();
2434 try {
2435 checkOperation(OperationCategory.WRITE);
2436 checkNameNodeSafeMode("Cannot append to file" + src);
2437 src = FSDirectory.resolvePath(src, pathComponents, dir);
2438 lb = appendFileInternal(pc, src, holder, clientMachine, logRetryCache);
2439 } catch (StandbyException se) {
2440 skipSync = true;
2441 throw se;
2442 } finally {
2443 writeUnlock();
2444 // There might be transactions logged while trying to recover the lease.
2445 // They need to be sync'ed even when an exception was thrown.
2446 if (!skipSync) {
2447 getEditLog().logSync();
2448 }
2449 }
2450 if (lb != null) {
2451 if (NameNode.stateChangeLog.isDebugEnabled()) {
2452 NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: file "
2453 +src+" for "+holder+" at "+clientMachine
2454 +" block " + lb.getBlock()
2455 +" block size " + lb.getBlock().getNumBytes());
2456 }
2457 }
2458 logAuditEvent(true, "append", src);
2459 return lb;
2460 }
2461
2462 ExtendedBlock getExtendedBlock(Block blk) {
2463 return new ExtendedBlock(blockPoolId, blk);
2464 }
2465
2466 void setBlockPoolId(String bpid) {
2467 blockPoolId = bpid;
2468 blockManager.setBlockPoolId(blockPoolId);
2469 }
2470
2471 /**
2472 * The client would like to obtain an additional block for the indicated
2473 * filename (which is being written-to). Return an array that consists
2474 * of the block, plus a set of machines. The first on this list should
2475 * be where the client writes data. Subsequent items in the list must
2476 * be provided in the connection to the first datanode.
2477 *
2478 * Make sure the previous blocks have been reported by datanodes and
2479 * are replicated. Will return an empty 2-elt array if we want the
2480 * client to "try again later".
2481 */
2482 LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
2483 ExtendedBlock previous, HashMap<Node, Node> excludedNodes,
2484 List<String> favoredNodes)
2485 throws LeaseExpiredException, NotReplicatedYetException,
2486 QuotaExceededException, SafeModeException, UnresolvedLinkException,
2487 IOException {
2488 long blockSize;
2489 int replication;
2490 DatanodeDescriptor clientNode = null;
2491
2492 if(NameNode.stateChangeLog.isDebugEnabled()) {
2493 NameNode.stateChangeLog.debug(
2494 "BLOCK* NameSystem.getAdditionalBlock: file "
2495 +src+" for "+clientName);
2496 }
2497
2498 // Part I. Analyze the state of the file with respect to the input data.
2499 checkOperation(OperationCategory.READ);
2500 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2501 readLock();
2502 try {
2503 checkOperation(OperationCategory.READ);
2504 src = FSDirectory.resolvePath(src, pathComponents, dir);
2505 LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2506 final INode[] inodes = analyzeFileState(
2507 src, fileId, clientName, previous, onRetryBlock).getINodes();
2508 final INodeFileUnderConstruction pendingFile =
2509 (INodeFileUnderConstruction) inodes[inodes.length - 1].asFile();
2510
2511 if (onRetryBlock[0] != null && onRetryBlock[0].getLocations().length > 0) {
2512 // This is a retry. Just return the last block if having locations.
2513 return onRetryBlock[0];
2514 }
2515 if (pendingFile.getBlocks().length >= maxBlocksPerFile) {
2516 throw new IOException("File has reached the limit on maximum number of"
2517 + " blocks (" + DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_KEY
2518 + "): " + pendingFile.getBlocks().length + " >= "
2519 + maxBlocksPerFile);
2520 }
2521 blockSize = pendingFile.getPreferredBlockSize();
2522 clientNode = pendingFile.getClientNode();
2523 replication = pendingFile.getFileReplication();
2524 } finally {
2525 readUnlock();
2526 }
2527
2528 // choose targets for the new block to be allocated.
2529 final DatanodeDescriptor targets[] = getBlockManager().chooseTarget(
2530 src, replication, clientNode, excludedNodes, blockSize, favoredNodes);
2531
2532 // Part II.
2533 // Allocate a new block, add it to the INode and the BlocksMap.
2534 Block newBlock = null;
2535 long offset;
2536 checkOperation(OperationCategory.WRITE);
2537 writeLock();
2538 try {
2539 checkOperation(OperationCategory.WRITE);
2540 // Run the full analysis again, since things could have changed
2541 // while chooseTarget() was executing.
2542 LocatedBlock[] onRetryBlock = new LocatedBlock[1];
2543 INodesInPath inodesInPath =
2544 analyzeFileState(src, fileId, clientName, previous, onRetryBlock);
2545 final INode[] inodes = inodesInPath.getINodes();
2546 final INodeFileUnderConstruction pendingFile =
2547 (INodeFileUnderConstruction) inodes[inodes.length - 1].asFile();
2548
2549 if (onRetryBlock[0] != null) {
2550 if (onRetryBlock[0].getLocations().length > 0) {
2551 // This is a retry. Just return the last block if having locations.
2552 return onRetryBlock[0];
2553 } else {
2554 // add new chosen targets to already allocated block and return
2555 BlockInfo lastBlockInFile = pendingFile.getLastBlock();
2556 ((BlockInfoUnderConstruction) lastBlockInFile)
2557 .setExpectedLocations(targets);
2558 offset = pendingFile.computeFileSize();
2559 return makeLocatedBlock(lastBlockInFile, targets, offset);
2560 }
2561 }
2562
2563 // commit the last block and complete it if it has minimum replicas
2564 commitOrCompleteLastBlock(pendingFile,
2565 ExtendedBlock.getLocalBlock(previous));
2566
2567 // allocate new block, record block locations in INode.
2568 newBlock = createNewBlock();
2569 saveAllocatedBlock(src, inodesInPath, newBlock, targets);
2570
2571 dir.persistBlocks(src, pendingFile, false);
2572 offset = pendingFile.computeFileSize();
2573 } finally {
2574 writeUnlock();
2575 }
2576 getEditLog().logSync();
2577
2578 // Return located block
2579 return makeLocatedBlock(newBlock, targets, offset);
2580 }
2581
  /**
   * Validate that a new block may be allocated for {@code src}, detecting
   * client RPC retries of a previous getAdditionalBlock() call.
   *
   * Must be called with at least the read lock held (asserted below).
   *
   * @param src resolved path of the file under construction
   * @param fileId expected inode id of the file
   * @param clientName lease holder
   * @param previous the block the client believes is the current last block
   * @param onRetryBlock output parameter: [0] is set to the previously
   *        allocated last block when this request is recognized as a retry,
   *        and left null otherwise
   * @return the resolved INodesInPath for src
   * @throws IOException if the request is inconsistent with the file's state
   */
  INodesInPath analyzeFileState(String src,
                                long fileId,
                                String clientName,
                                ExtendedBlock previous,
                                LocatedBlock[] onRetryBlock)
          throws IOException  {
    assert hasReadLock();

    checkBlock(previous);
    onRetryBlock[0] = null;
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot add block to " + src);

    // have we exceeded the configured limit of fs objects.
    checkFsObjectLimit();

    Block previousBlock = ExtendedBlock.getLocalBlock(previous);
    final INodesInPath iip = dir.getINodesInPath4Write(src);
    // Throws LeaseExpiredException if the lease is not held by clientName.
    final INodeFileUnderConstruction pendingFile
        = checkLease(src, fileId, clientName, iip.getLastINode());
    BlockInfo lastBlockInFile = pendingFile.getLastBlock();
    if (!Block.matchingIdAndGenStamp(previousBlock, lastBlockInFile)) {
      // The block that the client claims is the current last block
      // doesn't match up with what we think is the last block. There are
      // four possibilities:
      // 1) This is the first block allocation of an append() pipeline
      //    which started appending exactly at a block boundary.
      //    In this case, the client isn't passed the previous block,
      //    so it makes the allocateBlock() call with previous=null.
      //    We can distinguish this since the last block of the file
      //    will be exactly a full block.
      // 2) This is a retry from a client that missed the response of a
      //    prior getAdditionalBlock() call, perhaps because of a network
      //    timeout, or because of an HA failover. In that case, we know
      //    by the fact that the client is re-issuing the RPC that it
      //    never began to write to the old block. Hence it is safe to
      //    to return the existing block.
      // 3) This is an entirely bogus request/bug -- we should error out
      //    rather than potentially appending a new block with an empty
      //    one in the middle, etc
      // 4) This is a retry from a client that timed out while
      //    the prior getAdditionalBlock() is still being processed,
      //    currently working on chooseTarget().
      //    There are no means to distinguish between the first and
      //    the second attempts in Part I, because the first one hasn't
      //    changed the namesystem state yet.
      //    We run this analysis again in Part II where case 4 is impossible.

      BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
      if (previous == null &&
          lastBlockInFile != null &&
          lastBlockInFile.getNumBytes() == pendingFile.getPreferredBlockSize() &&
          lastBlockInFile.isComplete()) {
        // Case 1
        if (NameNode.stateChangeLog.isDebugEnabled()) {
           NameNode.stateChangeLog.debug(
               "BLOCK* NameSystem.allocateBlock: handling block allocation" +
               " writing to a file with a complete previous block: src=" +
               src + " lastBlock=" + lastBlockInFile);
        }
      } else if (Block.matchingIdAndGenStamp(penultimateBlock, previousBlock)) {
        if (lastBlockInFile.getNumBytes() != 0) {
          // A true retry could not have written any bytes to the last block.
          throw new IOException(
              "Request looked like a retry to allocate block " +
              lastBlockInFile + " but it already contains " +
              lastBlockInFile.getNumBytes() + " bytes");
        }

        // Case 2
        // Return the last block.
        NameNode.stateChangeLog.info("BLOCK* allocateBlock: " +
            "caught retry for allocation of a new block in " +
            src + ". Returning previously allocated block " + lastBlockInFile);
        long offset = pendingFile.computeFileSize();
        onRetryBlock[0] = makeLocatedBlock(lastBlockInFile,
            ((BlockInfoUnderConstruction)lastBlockInFile).getExpectedLocations(),
            offset);
        return iip;
      } else {
        // Case 3
        throw new IOException("Cannot allocate block in " + src + ": " +
            "passed 'previous' block " + previous + " does not match actual " +
            "last block in file " + lastBlockInFile);
      }
    }

    // Check if the penultimate block is minimally replicated
    if (!checkFileProgress(pendingFile, false)) {
      throw new NotReplicatedYetException("Not replicated yet: " + src);
    }
    return iip;
  }
2674
2675 LocatedBlock makeLocatedBlock(Block blk,
2676 DatanodeInfo[] locs,
2677 long offset) throws IOException {
2678 LocatedBlock lBlk = new LocatedBlock(
2679 getExtendedBlock(blk), locs, offset);
2680 getBlockManager().setBlockToken(
2681 lBlk, BlockTokenSecretManager.AccessMode.WRITE);
2682 return lBlk;
2683 }
2684
  /**
   * Choose additional datanode(s) for an existing pipeline, used when the
   * client replaces a failed datanode during a write.
   *
   * @see NameNode#getAdditionalDatanode(String, ExtendedBlock, DatanodeInfo[], DatanodeInfo[], int, String)
   */
  LocatedBlock getAdditionalDatanode(String src, final ExtendedBlock blk,
      final DatanodeInfo[] existings, final HashMap<Node, Node> excludes,
      final int numAdditionalNodes, final String clientName
      ) throws IOException {
    //check if the feature is enabled
    dtpReplaceDatanodeOnFailure.checkEnabled();

    // State needed for placement is snapshotted under the read lock and the
    // lock is released before the (potentially slow) chooseTarget call.
    final DatanodeDescriptor clientnode;
    final long preferredblocksize;
    final List<DatanodeDescriptor> chosen;
    checkOperation(OperationCategory.READ);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      //check safe mode
      checkNameNodeSafeMode("Cannot add datanode; src=" + src + ", blk=" + blk);
      src = FSDirectory.resolvePath(src, pathComponents, dir);

      //check lease
      final INodeFileUnderConstruction file = checkLease(src, clientName);
      clientnode = file.getClientNode();
      preferredblocksize = file.getPreferredBlockSize();

      //find datanode descriptors
      chosen = new ArrayList<DatanodeDescriptor>();
      for(DatanodeInfo d : existings) {
        final DatanodeDescriptor descriptor = blockManager.getDatanodeManager(
            ).getDatanode(d);
        if (descriptor != null) {
          // Nodes no longer known to the datanode manager are skipped.
          chosen.add(descriptor);
        }
      }
    } finally {
      readUnlock();
    }

    // choose new datanodes.
    final DatanodeInfo[] targets = blockManager.getBlockPlacementPolicy(
        ).chooseTarget(src, numAdditionalNodes, clientnode, chosen, true,
            excludes, preferredblocksize);
    final LocatedBlock lb = new LocatedBlock(blk, targets);
    // COPY token: authorizes the client to copy an existing replica to the
    // new datanode(s).
    blockManager.setBlockToken(lb, AccessMode.COPY);
    return lb;
  }
2731
2732 /**
2733 * The client would like to let go of the given block
2734 */
2735 boolean abandonBlock(ExtendedBlock b, String src, String holder)
2736 throws LeaseExpiredException, FileNotFoundException,
2737 UnresolvedLinkException, IOException {
2738 if(NameNode.stateChangeLog.isDebugEnabled()) {
2739 NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b
2740 + "of file " + src);
2741 }
2742 checkOperation(OperationCategory.WRITE);
2743 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2744 writeLock();
2745 try {
2746 checkOperation(OperationCategory.WRITE);
2747 checkNameNodeSafeMode("Cannot abandon block " + b + " for fle" + src);
2748 src = FSDirectory.resolvePath(src, pathComponents, dir);
2749
2750 //
2751 // Remove the block from the pending creates list
2752 //
2753 INodeFileUnderConstruction file = checkLease(src, holder);
2754 boolean removed = dir.removeBlock(src, file,
2755 ExtendedBlock.getLocalBlock(b));
2756 if (!removed) {
2757 return true;
2758 }
2759 if(NameNode.stateChangeLog.isDebugEnabled()) {
2760 NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
2761 + b + " is removed from pendingCreates");
2762 }
2763 dir.persistBlocks(src, file, false);
2764 } finally {
2765 writeUnlock();
2766 }
2767 getEditLog().logSync();
2768
2769 return true;
2770 }
2771
2772 /** make sure that we still have the lease on this file. */
2773 private INodeFileUnderConstruction checkLease(String src, String holder)
2774 throws LeaseExpiredException, UnresolvedLinkException,
2775 FileNotFoundException {
2776 return checkLease(src, INodeId.GRANDFATHER_INODE_ID, holder,
2777 dir.getINode(src));
2778 }
2779
  /**
   * Verify that {@code holder} still holds the lease on {@code src} and that
   * the file is open for writing.
   *
   * @param src path, used for exception messages
   * @param fileId expected inode id; INodeId.GRANDFATHER_INODE_ID skips the
   *        id check (see {@link INodeId#checkId})
   * @param holder client claiming the lease; if null, the holder comparison
   *        is skipped
   * @param inode resolved inode for src, may be null if the file is gone
   * @return the file as an INodeFileUnderConstruction
   * @throws LeaseExpiredException if the file is missing, not under
   *         construction, or leased by a different client
   */
  private INodeFileUnderConstruction checkLease(String src, long fileId,
      String holder, INode inode) throws LeaseExpiredException,
      FileNotFoundException {
    assert hasReadLock();
    if (inode == null || !inode.isFile()) {
      Lease lease = leaseManager.getLease(holder);
      throw new LeaseExpiredException(
          "No lease on " + src + ": File does not exist. "
          + (lease != null ? lease.toString()
              : "Holder " + holder + " does not have any open files."));
    }
    final INodeFile file = inode.asFile();
    if (!file.isUnderConstruction()) {
      Lease lease = leaseManager.getLease(holder);
      throw new LeaseExpiredException(
          "No lease on " + src + ": File is not open for writing. "
          + (lease != null ? lease.toString()
              : "Holder " + holder + " does not have any open files."));
    }
    INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)file;
    if (holder != null && !pendingFile.getClientName().equals(holder)) {
      throw new LeaseExpiredException("Lease mismatch on " + src + " owned by "
          + pendingFile.getClientName() + " but is accessed by " + holder);
    }
    INodeId.checkId(fileId, pendingFile);
    return pendingFile;
  }
2807
2808 /**
2809 * Complete in-progress write to the given file.
2810 * @return true if successful, false if the client should continue to retry
2811 * (e.g if not all blocks have reached minimum replication yet)
2812 * @throws IOException on error (eg lease mismatch, file not open, file deleted)
2813 */
2814 boolean completeFile(String src, String holder,
2815 ExtendedBlock last, long fileId)
2816 throws SafeModeException, UnresolvedLinkException, IOException {
2817 if (NameNode.stateChangeLog.isDebugEnabled()) {
2818 NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
2819 src + " for " + holder);
2820 }
2821 checkBlock(last);
2822 boolean success = false;
2823 checkOperation(OperationCategory.WRITE);
2824 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
2825 writeLock();
2826 try {
2827 checkOperation(OperationCategory.WRITE);
2828 checkNameNodeSafeMode("Cannot complete file " + src);
2829 src = FSDirectory.resolvePath(src, pathComponents, dir);
2830 success = completeFileInternal(src, holder,
2831 ExtendedBlock.getLocalBlock(last), fileId);
2832 } finally {
2833 writeUnlock();
2834 }
2835 getEditLog().logSync();
2836 NameNode.stateChangeLog.info("DIR* completeFile: " + src + " is closed by "
2837 + holder);
2838 return success;
2839 }
2840
  /**
   * Finalize the file if its lease checks out, tolerating retried close RPCs
   * on an already-closed file (see HDFS-3031).
   *
   * Caller must hold the write lock (asserted below).
   *
   * @param src resolved path of the file
   * @param holder lease holder
   * @param last the client's view of the last block (local block form)
   * @param fileId expected inode id
   * @return true if the file was (or already had been) closed; false if the
   *         client should retry because blocks are not minimally replicated
   */
  private boolean completeFileInternal(String src, 
      String holder, Block last, long fileId) throws SafeModeException,
      UnresolvedLinkException, IOException {
    assert hasWriteLock();
    final INodesInPath iip = dir.getLastINodeInPath(src);
    final INodeFileUnderConstruction pendingFile;
    try {
      pendingFile = checkLease(src, fileId, holder, iip.getINode(0));
    } catch (LeaseExpiredException lee) {
      final INode inode = dir.getINode(src);
      if (inode != null
          && inode.isFile()
          && !inode.asFile().isUnderConstruction()) {
        // This could be a retry RPC - i.e the client tried to close
        // the file, but missed the RPC response. Thus, it is trying
        // again to close the file. If the file still exists and
        // the client's view of the last block matches the actual
        // last block, then we'll treat it as a successful close.
        // See HDFS-3031.
        final Block realLastBlock = inode.asFile().getLastBlock();
        if (Block.matchingIdAndGenStamp(last, realLastBlock)) {
          NameNode.stateChangeLog.info("DIR* completeFile: " +
              "request from " + holder + " to complete " + src +
              " which is already closed. But, it appears to be an RPC " +
              "retry. Returning success");
          return true;
        }
      }
      // Not a recognizable retry; surface the original lease failure.
      throw lee;
    }
    // commit the last block and complete it if it has minimum replicas
    commitOrCompleteLastBlock(pendingFile, last);

    if (!checkFileProgress(pendingFile, true)) {
      // Some block has not reached minimal replication; client must retry.
      return false;
    }

    finalizeINodeFileUnderConstruction(src, pendingFile,
        iip.getLatestSnapshot());
    return true;
  }
2882
2883 /**
2884 * Save allocated block at the given pending filename
2885 *
2886 * @param src path to the file
2887 * @param inodesInPath representing each of the components of src.
2888 * The last INode is the INode for the file.
2889 * @throws QuotaExceededException If addition of block exceeds space quota
2890 */
2891 BlockInfo saveAllocatedBlock(String src, INodesInPath inodes,
2892 Block newBlock, DatanodeDescriptor targets[]) throws IOException {
2893 assert hasWriteLock();
2894 BlockInfo b = dir.addBlock(src, inodes, newBlock, targets);
2895 NameNode.stateChangeLog.info("BLOCK* allocateBlock: " + src + ". "
2896 + getBlockPoolId() + " " + b);
2897 for (DatanodeDescriptor dn : targets) {
2898 dn.incBlocksScheduled();
2899 }
2900 return b;
2901 }
2902
2903 /**
2904 * Create new block with a unique block id and a new generation stamp.
2905 */
2906 Block createNewBlock() throws IOException {
2907 assert hasWriteLock();
2908 Block b = new Block(nextBlockId(), 0, 0);
2909 // Increment the generation stamp for every new block.
2910 b.setGenerationStamp(nextGenerationStamp(false));
2911 return b;
2912 }
2913
2914 /**
2915 * Check that the indicated file's blocks are present and
2916 * replicated. If not, return false. If checkall is true, then check
2917 * all blocks, otherwise check only penultimate block.
2918 */
2919 boolean checkFileProgress(INodeFile v, boolean checkall) {
2920 readLock();
2921 try {
2922 if (checkall) {
2923 //
2924 // check all blocks of the file.
2925 //
2926 for (BlockInfo block: v.getBlocks()) {
2927 if (!block.isComplete()) {
2928 LOG.info("BLOCK* checkFileProgress: " + block
2929 + " has not reached minimal replication "
2930 + blockManager.minReplication);
2931 return false;
2932 }
2933 }
2934 } else {
2935 //
2936 // check the penultimate block of this file
2937 //
2938 BlockInfo b = v.getPenultimateBlock();
2939 if (b != null && !b.isComplete()) {
2940 LOG.info("BLOCK* checkFileProgress: " + b
2941 + " has not reached minimal replication "
2942 + blockManager.minReplication);
2943 return false;
2944 }
2945 }
2946 return true;
2947 } finally {
2948 readUnlock();
2949 }
2950 }
2951
2952 ////////////////////////////////////////////////////////////////
2953 // Here's how to handle block-copy failure during client write:
2954 // -- As usual, the client's write should result in a streaming
2955 // backup write to a k-machine sequence.
2956 // -- If one of the backup machines fails, no worries. Fail silently.
2957 // -- Before client is allowed to close and finalize file, make sure
2958 // that the blocks are backed up. Namenode may have to issue specific backup
2959 // commands to make up for earlier datanode failures. Once all copies
2960 // are made, edit namespace and return to client.
2961 ////////////////////////////////////////////////////////////////
2962
2963 /**
2964 * Change the indicated filename.
2965 * @deprecated Use {@link #renameTo(String, String, Options.Rename...)} instead.
2966 */
2967 @Deprecated
2968 boolean renameTo(String src, String dst)
2969 throws IOException, UnresolvedLinkException {
2970 CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
2971 if (cacheEntry != null && cacheEntry.isSuccess()) {
2972 return true; // Return previous response
2973 }
2974 boolean ret = false;
2975 try {
2976 ret = renameToInt(src, dst, cacheEntry != null);
2977 } catch (AccessControlException e) {
2978 logAuditEvent(false, "rename", src, dst, null);
2979 throw e;
2980 } finally {
2981 RetryCache.setState(cacheEntry, ret);
2982 }
2983 return ret;
2984 }
2985
  /**
   * Internal implementation of the deprecated two-argument rename.
   *
   * @param src path to rename
   * @param dst destination path
   * @param logRetryCache whether to journal the op for retry-cache recovery
   * @return true if the rename succeeded, false otherwise
   */
  private boolean renameToInt(String src, String dst, boolean logRetryCache)
    throws IOException, UnresolvedLinkException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src +
          " to " + dst);
    }
    if (!DFSUtil.isValidName(dst)) {
      throw new IOException("Invalid name: " + dst);
    }
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
    boolean status = false;
    HdfsFileStatus resultingStat = null;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot rename " + src);
      src = FSDirectory.resolvePath(src, srcComponents, dir);
      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
      checkOperation(OperationCategory.WRITE);
      status = renameToInternal(pc, src, dst, logRetryCache);
      if (status) {
        // Capture the destination's status while still under the lock, for
        // the audit log emitted after the lock is released.
        resultingStat = getAuditFileInfo(dst, false);
      }
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    if (status) {
      logAuditEvent(true, "rename", src, dst, resultingStat);
    }
    return status;
  }
3021
3022 /** @deprecated See {@link #renameTo(String, String)} */
3023 @Deprecated
3024 private boolean renameToInternal(FSPermissionChecker pc, String src,
3025 String dst, boolean logRetryCache) throws IOException,
3026 UnresolvedLinkException {
3027 assert hasWriteLock();
3028 if (isPermissionEnabled) {
3029 //We should not be doing this. This is move() not renameTo().
3030 //but for now,
3031 //NOTE: yes, this is bad! it's assuming much lower level behavior
3032 // of rewriting the dst
3033 String actualdst = dir.isDir(dst)?
3034 dst + Path.SEPARATOR + new Path(src).getName(): dst;
3035 // Rename does not operates on link targets
3036 // Do not resolveLink when checking permissions of src and dst
3037 // Check write access to parent of src
3038 checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
3039 // Check write access to ancestor of dst
3040 checkPermission(pc, actualdst, false, FsAction.WRITE, null, null, null,
3041 false);
3042 }
3043
3044 if (dir.renameTo(src, dst, logRetryCache)) {
3045 return true;
3046 }
3047 return false;
3048 }
3049
3050
  /**
   * Rename src to dst with the given {@link Options.Rename} options
   * (e.g. OVERWRITE). Failures are reported by exception rather than a
   * boolean return, unlike the deprecated two-argument variant.
   */
  void renameTo(String src, String dst, Options.Rename... options)
      throws IOException, UnresolvedLinkException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: with options - "
          + src + " to " + dst);
    }
    if (!DFSUtil.isValidName(dst)) {
      throw new InvalidPathException("Invalid name: " + dst);
    }
    final FSPermissionChecker pc = getPermissionChecker();
    
    checkOperation(OperationCategory.WRITE);
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    byte[][] srcComponents = FSDirectory.getPathComponentsForReservedPath(src);
    byte[][] dstComponents = FSDirectory.getPathComponentsForReservedPath(dst);
    HdfsFileStatus resultingStat = null;
    boolean success = false;
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot rename " + src);
      src = FSDirectory.resolvePath(src, srcComponents, dir);
      dst = FSDirectory.resolvePath(dst, dstComponents, dir);
      renameToInternal(pc, src, dst, cacheEntry != null, options);
      // Capture the destination's status under the lock for the audit log.
      resultingStat = getAuditFileInfo(dst, false);
      success = true;
    } finally {
      writeUnlock();
      // Record the outcome so a retried RPC can short-circuit above.
      RetryCache.setState(cacheEntry, success);
    }
    getEditLog().logSync();
    if (resultingStat != null) {
      StringBuilder cmd = new StringBuilder("rename options=");
      for (Rename option : options) {
        cmd.append(option.value()).append(" ");
      }
      logAuditEvent(true, cmd.toString(), src, dst, resultingStat);
    }
  }
3094
  /**
   * Permission-check and perform a rename with options.
   * Caller must hold the write lock (asserted below).
   *
   * @param pc permission checker for the calling user
   * @param src resolved source path
   * @param dst resolved destination path
   * @param logRetryCache whether to journal the op for retry-cache recovery
   * @param options rename options (e.g. OVERWRITE)
   */
  private void renameToInternal(FSPermissionChecker pc, String src, String dst,
      boolean logRetryCache, Options.Rename... options) throws IOException {
    assert hasWriteLock();
    if (isPermissionEnabled) {
      // Rename does not operate on link targets, so do not resolve links
      // when checking permissions of src and dst.
      // Check write access to parent of src
      checkPermission(pc, src, false, null, FsAction.WRITE, null, null, false);
      // Check write access to ancestor of dst
      checkPermission(pc, dst, false, FsAction.WRITE, null, null, null, false);
    }

    dir.renameTo(src, dst, logRetryCache, options);
  }
3109
3110 /**
3111 * Remove the indicated file from namespace.
3112 *
3113 * @see ClientProtocol#delete(String, boolean) for detailed description and
3114 * description of exceptions
3115 */
3116 boolean delete(String src, boolean recursive)
3117 throws AccessControlException, SafeModeException,
3118 UnresolvedLinkException, IOException {
3119 CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
3120 if (cacheEntry != null && cacheEntry.isSuccess()) {
3121 return true; // Return previous response
3122 }
3123 boolean ret = false;
3124 try {
3125 ret = deleteInt(src, recursive, cacheEntry != null);
3126 } catch (AccessControlException e) {
3127 logAuditEvent(false, "delete", src);
3128 throw e;
3129 } finally {
3130 RetryCache.setState(cacheEntry, ret);
3131 }
3132 return ret;
3133 }
3134
3135 private boolean deleteInt(String src, boolean recursive, boolean logRetryCache)
3136 throws AccessControlException, SafeModeException,
3137 UnresolvedLinkException, IOException {
3138 if (NameNode.stateChangeLog.isDebugEnabled()) {
3139 NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src);
3140 }
3141 boolean status = deleteInternal(src, recursive, true, logRetryCache);
3142 if (status) {
3143 logAuditEvent(true, "delete", src);
3144 }
3145 return status;
3146 }
3147
3148 private FSPermissionChecker getPermissionChecker()
3149 throws AccessControlException {
3150 try {
3151 return new FSPermissionChecker(fsOwnerShortUserName, supergroup, getRemoteUser());
3152 } catch (IOException ioe) {
3153 throw new AccessControlException(ioe);
3154 }
3155 }
3156
3157 /**
3158 * Remove a file/directory from the namespace.
3159 * <p>
3160 * For large directories, deletion is incremental. The blocks under
3161 * the directory are collected and deleted a small number at a time holding
3162 * the {@link FSNamesystem} lock.
3163 * <p>
3164 * For small directory or file the deletion is done in one shot.
3165 *
3166 * @see ClientProtocol#delete(String, boolean) for description of exceptions
3167 */
3168 private boolean deleteInternal(String src, boolean recursive,
3169 boolean enforcePermission, boolean logRetryCache)
3170 throws AccessControlException, SafeModeException, UnresolvedLinkException,
3171 IOException {
3172 BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
3173 List<INode> removedINodes = new ArrayList<INode>();
3174 FSPermissionChecker pc = getPermissionChecker();
3175 checkOperation(OperationCategory.WRITE);
3176 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3177 boolean ret = false;
3178 writeLock();
3179 try {
3180 checkOperation(OperationCategory.WRITE);
3181 checkNameNodeSafeMode("Cannot delete " + src);
3182 src = FSDirectory.resolvePath(src, pathComponents, dir);
3183 if (!recursive && dir.isNonEmptyDirectory(src)) {
3184 throw new IOException(src + " is non empty");
3185 }
3186 if (enforcePermission && isPermissionEnabled) {
3187 checkPermission(pc, src, false, null, FsAction.WRITE, null,
3188 FsAction.ALL, false);
3189 }
3190 // Unlink the target directory from directory tree
3191 if (!dir.delete(src, collectedBlocks, removedINodes, logRetryCache)) {
3192 return false;
3193 }
3194 ret = true;
3195 } finally {
3196 writeUnlock();
3197 }
3198 getEditLog().logSync();
3199 removeBlocks(collectedBlocks); // Incremental deletion of blocks
3200 collectedBlocks.clear();
3201 dir.writeLock();
3202 try {
3203 dir.removeFromInodeMap(removedINodes);
3204 } finally {
3205 dir.writeUnlock();
3206 }
3207 removedINodes.clear();
3208 if (NameNode.stateChangeLog.isDebugEnabled()) {
3209 NameNode.stateChangeLog.debug("DIR* Namesystem.delete: "
3210 + src +" is removed");
3211 }
3212 return ret;
3213 }
3214
3215 /**
3216 * From the given list, incrementally remove the blocks from blockManager
3217 * Writelock is dropped and reacquired every BLOCK_DELETION_INCREMENT to
3218 * ensure that other waiters on the lock can get in. See HDFS-2938
3219 *
3220 * @param blocks
3221 * An instance of {@link BlocksMapUpdateInfo} which contains a list
3222 * of blocks that need to be removed from blocksMap
3223 */
3224 void removeBlocks(BlocksMapUpdateInfo blocks) {
3225 int start = 0;
3226 int end = 0;
3227 List<Block> toDeleteList = blocks.getToDeleteList();
3228 while (start < toDeleteList.size()) {
3229 end = BLOCK_DELETION_INCREMENT + start;
3230 end = end > toDeleteList.size() ? toDeleteList.size() : end;
3231 writeLock();
3232 try {
3233 for (int i = start; i < end; i++) {
3234 blockManager.removeBlock(toDeleteList.get(i));
3235 }
3236 } finally {
3237 writeUnlock();
3238 }
3239 start = end;
3240 }
3241 }
3242
3243 /**
3244 * Remove leases, inodes and blocks related to a given path
3245 * @param src The given path
3246 * @param blocks Containing the list of blocks to be deleted from blocksMap
3247 * @param removedINodes Containing the list of inodes to be removed from
3248 * inodesMap
3249 */
3250 void removePathAndBlocks(String src, BlocksMapUpdateInfo blocks,
3251 List<INode> removedINodes) {
3252 assert hasWriteLock();
3253 leaseManager.removeLeaseWithPrefixPath(src);
3254 // remove inodes from inodesMap
3255 if (removedINodes != null) {
3256 dir.removeFromInodeMap(removedINodes);
3257 removedINodes.clear();
3258 }
3259 if (blocks == null) {
3260 return;
3261 }
3262
3263 removeBlocksAndUpdateSafemodeTotal(blocks);
3264 }
3265
3266 /**
3267 * Removes the blocks from blocksmap and updates the safemode blocks total
3268 *
3269 * @param blocks
3270 * An instance of {@link BlocksMapUpdateInfo} which contains a list
3271 * of blocks that need to be removed from blocksMap
3272 */
3273 void removeBlocksAndUpdateSafemodeTotal(BlocksMapUpdateInfo blocks) {
3274 assert hasWriteLock();
3275 // In the case that we are a Standby tailing edits from the
3276 // active while in safe-mode, we need to track the total number
3277 // of blocks and safe blocks in the system.
3278 boolean trackBlockCounts = isSafeModeTrackingBlocks();
3279 int numRemovedComplete = 0, numRemovedSafe = 0;
3280
3281 for (Block b : blocks.getToDeleteList()) {
3282 if (trackBlockCounts) {
3283 BlockInfo bi = getStoredBlock(b);
3284 if (bi.isComplete()) {
3285 numRemovedComplete++;
3286 if (bi.numNodes() >= blockManager.minReplication) {
3287 numRemovedSafe++;
3288 }
3289 }
3290 }
3291 blockManager.removeBlock(b);
3292 }
3293 if (trackBlockCounts) {
3294 if (LOG.isDebugEnabled()) {
3295 LOG.debug("Adjusting safe-mode totals for deletion."
3296 + "decreasing safeBlocks by " + numRemovedSafe
3297 + ", totalBlocks by " + numRemovedComplete);
3298 }
3299 adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
3300 }
3301 }
3302
3303 /**
3304 * @see SafeModeInfo#shouldIncrementallyTrackBlocks
3305 */
3306 private boolean isSafeModeTrackingBlocks() {
3307 if (!haEnabled) {
3308 // Never track blocks incrementally in non-HA code.
3309 return false;
3310 }
3311 SafeModeInfo sm = this.safeMode;
3312 return sm != null && sm.shouldIncrementallyTrackBlocks();
3313 }
3314
3315 /**
3316 * Get the file info for a specific file.
3317 *
3318 * @param src The string representation of the path to the file
3319 * @param resolveLink whether to throw UnresolvedLinkException
3320 * if src refers to a symlink
3321 *
3322 * @throws AccessControlException if access is denied
3323 * @throws UnresolvedLinkException if a symlink is encountered.
3324 *
3325 * @return object containing information regarding the file
3326 * or null if file not found
3327 * @throws StandbyException
3328 */
3329 HdfsFileStatus getFileInfo(String src, boolean resolveLink)
3330 throws AccessControlException, UnresolvedLinkException,
3331 StandbyException, IOException {
3332 if (!DFSUtil.isValidName(src)) {
3333 throw new InvalidPathException("Invalid file name: " + src);
3334 }
3335 HdfsFileStatus stat = null;
3336 FSPermissionChecker pc = getPermissionChecker();
3337 checkOperation(OperationCategory.READ);
3338 if (!DFSUtil.isValidName(src)) {
3339 throw new InvalidPathException("Invalid file name: " + src);
3340 }
3341 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3342 readLock();
3343 try {
3344 checkOperation(OperationCategory.READ);
3345 src = FSDirectory.resolvePath(src, pathComponents, dir);
3346 if (isPermissionEnabled) {
3347 checkPermission(pc, src, false, null, null, null, null, resolveLink);
3348 }
3349 stat = dir.getFileInfo(src, resolveLink);
3350 } catch (AccessControlException e) {
3351 logAuditEvent(false, "getfileinfo", src);
3352 throw e;
3353 } finally {
3354 readUnlock();
3355 }
3356 logAuditEvent(true, "getfileinfo", src);
3357 return stat;
3358 }
3359
3360 /**
3361 * Returns true if the file is closed
3362 */
3363 boolean isFileClosed(String src)
3364 throws AccessControlException, UnresolvedLinkException,
3365 StandbyException, IOException {
3366 FSPermissionChecker pc = getPermissionChecker();
3367 checkOperation(OperationCategory.READ);
3368 readLock();
3369 try {
3370 checkOperation(OperationCategory.READ);
3371 if (isPermissionEnabled) {
3372 checkTraverse(pc, src);
3373 }
3374 return !INodeFile.valueOf(dir.getINode(src), src).isUnderConstruction();
3375 } catch (AccessControlException e) {
3376 if (isAuditEnabled() && isExternalInvocation()) {
3377 logAuditEvent(false, "isFileClosed", src);
3378 }
3379 throw e;
3380 } finally {
3381 readUnlock();
3382 }
3383 }
3384
3385 /**
3386 * Create all the necessary directories
3387 */
3388 boolean mkdirs(String src, PermissionStatus permissions,
3389 boolean createParent) throws IOException, UnresolvedLinkException {
3390 boolean ret = false;
3391 try {
3392 ret = mkdirsInt(src, permissions, createParent);
3393 } catch (AccessControlException e) {
3394 logAuditEvent(false, "mkdirs", src);
3395 throw e;
3396 }
3397 return ret;
3398 }
3399
3400 private boolean mkdirsInt(String src, PermissionStatus permissions,
3401 boolean createParent) throws IOException, UnresolvedLinkException {
3402 if(NameNode.stateChangeLog.isDebugEnabled()) {
3403 NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
3404 }
3405 if (!DFSUtil.isValidName(src)) {
3406 throw new InvalidPathException(src);
3407 }
3408 FSPermissionChecker pc = getPermissionChecker();
3409 checkOperation(OperationCategory.WRITE);
3410 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3411 HdfsFileStatus resultingStat = null;
3412 boolean status = false;
3413 writeLock();
3414 try {
3415 checkOperation(OperationCategory.WRITE);
3416 checkNameNodeSafeMode("Cannot create directory " + src);
3417 src = FSDirectory.resolvePath(src, pathComponents, dir);
3418 status = mkdirsInternal(pc, src, permissions, createParent);
3419 if (status) {
3420 resultingStat = dir.getFileInfo(src, false);
3421 }
3422 } finally {
3423 writeUnlock();
3424 }
3425 getEditLog().logSync();
3426 if (status) {
3427 logAuditEvent(true, "mkdirs", src, null, resultingStat);
3428 }
3429 return status;
3430 }
3431
3432 /**
3433 * Create all the necessary directories
3434 */
3435 private boolean mkdirsInternal(FSPermissionChecker pc, String src,
3436 PermissionStatus permissions, boolean createParent)
3437 throws IOException, UnresolvedLinkException {
3438 assert hasWriteLock();
3439 if (isPermissionEnabled) {
3440 checkTraverse(pc, src);
3441 }
3442 if (dir.isDirMutable(src)) {
3443 // all the users of mkdirs() are used to expect 'true' even if
3444 // a new directory is not created.
3445 return true;
3446 }
3447 if (isPermissionEnabled) {
3448 checkAncestorAccess(pc, src, FsAction.WRITE);
3449 }
3450 if (!createParent) {
3451 verifyParentDir(src);
3452 }
3453
3454 // validate that we have enough inodes. This is, at best, a
3455 // heuristic because the mkdirs() operation might need to
3456 // create multiple inodes.
3457 checkFsObjectLimit();
3458
3459 if (!dir.mkdirs(src, permissions, false, now())) {
3460 throw new IOException("Failed to create directory: " + src);
3461 }
3462 return true;
3463 }
3464
3465 ContentSummary getContentSummary(String src) throws AccessControlException,
3466 FileNotFoundException, UnresolvedLinkException, StandbyException {
3467 FSPermissionChecker pc = getPermissionChecker();
3468 checkOperation(OperationCategory.READ);
3469 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3470 readLock();
3471 try {
3472 checkOperation(OperationCategory.READ);
3473 src = FSDirectory.resolvePath(src, pathComponents, dir);
3474 if (isPermissionEnabled) {
3475 checkPermission(pc, src, false, null, null, null, FsAction.READ_EXECUTE);
3476 }
3477 return dir.getContentSummary(src);
3478 } finally {
3479 readUnlock();
3480 }
3481 }
3482
/**
 * Set the namespace quota and diskspace quota for a directory.
 * See {@link ClientProtocol#setQuota(String, long, long)} for the
 * contract.
 *
 * Note: This does not support ".inodes" relative path.
 *
 * @param path directory whose quota is being set
 * @param nsQuota namespace (inode count) quota
 * @param dsQuota diskspace (byte) quota
 */
void setQuota(String path, long nsQuota, long dsQuota)
    throws IOException, UnresolvedLinkException {
  // Quota changes are a superuser-only operation.
  checkSuperuserPrivilege();
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    // Re-check after acquiring the lock: state may have transitioned.
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot set quota on " + path);
    dir.setQuota(path, nsQuota, dsQuota);
  } finally {
    writeUnlock();
  }
  // Persist the quota edit before returning to the caller.
  getEditLog().logSync();
}
3504
3505 /** Persist all metadata about this file.
3506 * @param src The string representation of the path
3507 * @param clientName The string representation of the client
3508 * @param lastBlockLength The length of the last block
3509 * under construction reported from client.
3510 * @throws IOException if path does not exist
3511 */
3512 void fsync(String src, String clientName, long lastBlockLength)
3513 throws IOException, UnresolvedLinkException {
3514 NameNode.stateChangeLog.info("BLOCK* fsync: " + src + " for " + clientName);
3515 checkOperation(OperationCategory.WRITE);
3516 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3517 writeLock();
3518 try {
3519 checkOperation(OperationCategory.WRITE);
3520 checkNameNodeSafeMode("Cannot fsync file " + src);
3521 src = FSDirectory.resolvePath(src, pathComponents, dir);
3522 INodeFileUnderConstruction pendingFile = checkLease(src, clientName);
3523 if (lastBlockLength > 0) {
3524 pendingFile.updateLengthOfLastBlock(lastBlockLength);
3525 }
3526 dir.persistBlocks(src, pendingFile, false);
3527 } finally {
3528 writeUnlock();
3529 }
3530 getEditLog().logSync();
3531 }
3532
/**
 * Move a file that is being written to be immutable.
 * @param src The filename
 * @param lease The lease for the client creating the file
 * @param recoveryLeaseHolder reassign lease to this holder if the last block
 *        needs recovery; keep current holder if null.
 * @throws AlreadyBeingCreatedException if file is waiting to achieve minimal
 *         replication;<br>
 *         RecoveryInProgressException if lease recovery is in progress.<br>
 *         IOException in case of an error.
 * @return true if file has been successfully finalized and closed or
 *         false if block recovery has been initiated. Since the lease owner
 *         has been changed and logged, caller should call logSync().
 */
boolean internalReleaseLease(Lease lease, String src,
    String recoveryLeaseHolder) throws AlreadyBeingCreatedException,
    IOException, UnresolvedLinkException {
  LOG.info("Recovering " + lease + ", src=" + src);
  assert !isInSafeMode();
  assert hasWriteLock();

  final INodesInPath iip = dir.getLastINodeInPath(src);
  final INodeFileUnderConstruction pendingFile
      = INodeFileUnderConstruction.valueOf(iip.getINode(0), src);
  int nrBlocks = pendingFile.numBlocks();
  BlockInfo[] blocks = pendingFile.getBlocks();

  // Count the leading run of COMPLETE blocks. Any COMPLETE block is
  // expected to already be minimally replicated.
  int nrCompleteBlocks;
  BlockInfo curBlock = null;
  for(nrCompleteBlocks = 0; nrCompleteBlocks < nrBlocks; nrCompleteBlocks++) {
    curBlock = blocks[nrCompleteBlocks];
    if(!curBlock.isComplete())
      break;
    assert blockManager.checkMinReplication(curBlock) :
      "A COMPLETE block is not minimally replicated in " + src;
  }

  // If there are no incomplete blocks associated with this file,
  // then reap lease immediately and close the file.
  if(nrCompleteBlocks == nrBlocks) {
    finalizeINodeFileUnderConstruction(src, pendingFile,
        iip.getLatestSnapshot());
    NameNode.stateChangeLog.warn("BLOCK*"
        + " internalReleaseLease: All existing blocks are COMPLETE,"
        + " lease removed, file closed.");
    return true;  // closed!
  }

  // Only the last and the penultimate blocks may be in non COMPLETE state.
  // If the penultimate block is not COMPLETE, then it must be COMMITTED.
  if(nrCompleteBlocks < nrBlocks - 2 ||
     nrCompleteBlocks == nrBlocks - 2 &&
       curBlock != null &&
       curBlock.getBlockUCState() != BlockUCState.COMMITTED) {
    final String message = "DIR* NameSystem.internalReleaseLease: "
        + "attempt to release a create lock on "
        + src + " but file is already closed.";
    NameNode.stateChangeLog.warn(message);
    throw new IOException(message);
  }

  // The last block is not COMPLETE, and
  // that the penultimate block if exists is either COMPLETE or COMMITTED
  final BlockInfo lastBlock = pendingFile.getLastBlock();
  BlockUCState lastBlockState = lastBlock.getBlockUCState();
  BlockInfo penultimateBlock = pendingFile.getPenultimateBlock();
  boolean penultimateBlockMinReplication;
  BlockUCState penultimateBlockState;
  if (penultimateBlock == null) {
    penultimateBlockState = BlockUCState.COMPLETE;
    // If penultimate block doesn't exist then its minReplication is met
    penultimateBlockMinReplication = true;
  } else {
    // NOTE(review): the state is assigned COMMITTED unconditionally here
    // rather than read from the block; the assert below therefore always
    // holds by construction.
    penultimateBlockState = BlockUCState.COMMITTED;
    penultimateBlockMinReplication =
        blockManager.checkMinReplication(penultimateBlock);
  }
  assert penultimateBlockState == BlockUCState.COMPLETE ||
         penultimateBlockState == BlockUCState.COMMITTED :
         "Unexpected state of penultimate block in " + src;

  switch(lastBlockState) {
  case COMPLETE:
    // The scan above already established the last block is not COMPLETE.
    assert false : "Already checked that the last block is incomplete";
    break;
  case COMMITTED:
    // Close file if committed blocks are minimally replicated
    if(penultimateBlockMinReplication &&
        blockManager.checkMinReplication(lastBlock)) {
      finalizeINodeFileUnderConstruction(src, pendingFile,
          iip.getLatestSnapshot());
      NameNode.stateChangeLog.warn("BLOCK*"
          + " internalReleaseLease: Committed blocks are minimally replicated,"
          + " lease removed, file closed.");
      return true;  // closed!
    }
    // Cannot close file right now, since some blocks
    // are not yet minimally replicated.
    // This may potentially cause infinite loop in lease recovery
    // if there are no valid replicas on data-nodes.
    String message = "DIR* NameSystem.internalReleaseLease: " +
        "Failed to release lease for file " + src +
        ". Committed blocks are waiting to be minimally replicated." +
        " Try again later.";
    NameNode.stateChangeLog.warn(message);
    throw new AlreadyBeingCreatedException(message);
  case UNDER_CONSTRUCTION:
  case UNDER_RECOVERY:
    final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)lastBlock;
    // setup the last block locations from the blockManager if not known
    if (uc.getNumExpectedLocations() == 0) {
      uc.setExpectedLocations(blockManager.getNodes(lastBlock));
    }

    if (uc.getNumExpectedLocations() == 0 && uc.getNumBytes() == 0) {
      // There is no datanode reported to this block.
      // may be client have crashed before writing data to pipeline.
      // This blocks doesn't need any recovery.
      // We can remove this block and close the file.
      pendingFile.removeLastBlock(lastBlock);
      finalizeINodeFileUnderConstruction(src, pendingFile,
          iip.getLatestSnapshot());
      NameNode.stateChangeLog.warn("BLOCK* internalReleaseLease: "
          + "Removed empty last block and closed file.");
      return true;
    }
    // start recovery of the last block for this file
    long blockRecoveryId = nextGenerationStamp(isLegacyBlock(uc));
    lease = reassignLease(lease, src, recoveryLeaseHolder, pendingFile);
    uc.initializeBlockRecovery(blockRecoveryId);
    leaseManager.renewLease(lease);
    // Cannot close file right now, since the last block requires recovery.
    // This may potentially cause infinite loop in lease recovery
    // if there are no valid replicas on data-nodes.
    NameNode.stateChangeLog.warn(
        "DIR* NameSystem.internalReleaseLease: " +
        "File " + src + " has not been closed." +
        " Lease recovery is in progress. " +
        "RecoveryId = " + blockRecoveryId + " for block " + lastBlock);
    break;
  }
  return false;
}
3676
3677 private Lease reassignLease(Lease lease, String src, String newHolder,
3678 INodeFileUnderConstruction pendingFile) {
3679 assert hasWriteLock();
3680 if(newHolder == null)
3681 return lease;
3682 // The following transaction is not synced. Make sure it's sync'ed later.
3683 logReassignLease(lease.getHolder(), src, newHolder);
3684 return reassignLeaseInternal(lease, src, newHolder, pendingFile);
3685 }
3686
/**
 * Apply a lease reassignment in memory: point the under-construction file
 * at the new holder and move the lease in the lease manager. No edit is
 * logged here (the caller is responsible for that).
 */
Lease reassignLeaseInternal(Lease lease, String src, String newHolder,
    INodeFileUnderConstruction pendingFile) {
  assert hasWriteLock();
  pendingFile.setClientName(newHolder);
  return leaseManager.reassignLease(lease, src, newHolder);
}
3693
3694 private void commitOrCompleteLastBlock(final INodeFileUnderConstruction fileINode,
3695 final Block commitBlock) throws IOException {
3696 assert hasWriteLock();
3697 if (!blockManager.commitOrCompleteLastBlock(fileINode, commitBlock)) {
3698 return;
3699 }
3700
3701 // Adjust disk space consumption if required
3702 final long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes();
3703 if (diff > 0) {
3704 try {
3705 String path = leaseManager.findPath(fileINode);
3706 dir.updateSpaceConsumed(path, 0, -diff*fileINode.getFileReplication());
3707 } catch (IOException e) {
3708 LOG.warn("Unexpected exception while updating disk space.", e);
3709 }
3710 }
3711 }
3712
/**
 * Turn an under-construction file into a finalized INodeFile: remove its
 * lease, record the modification against the latest snapshot, replace the
 * pending inode with a permanent one, and persist the close.
 */
private void finalizeINodeFileUnderConstruction(String src,
    INodeFileUnderConstruction pendingFile, Snapshot latestSnapshot)
    throws IOException, UnresolvedLinkException {
  assert hasWriteLock();
  leaseManager.removeLease(pendingFile.getClientName(), src);

  // Record the modification in the latest snapshot; this may return a
  // different inode instance, so the local reference is reassigned.
  pendingFile = pendingFile.recordModification(latestSnapshot,
      dir.getINodeMap());

  // The file is no longer pending.
  // Create permanent INode, update blocks
  final INodeFile newFile = pendingFile.toINodeFile(now());
  dir.replaceINodeFile(src, pendingFile, newFile);

  // close file and persist block allocations for this file
  dir.closeFile(src, newFile);

  // Schedule replication work if the closed file is under-replicated.
  blockManager.checkReplication(newFile);
}
3732
/** Look up the {@link BlockInfo} stored in the blocksMap for a block. */
@VisibleForTesting
BlockInfo getStoredBlock(Block block) {
  return blockManager.getStoredBlock(block);
}
3737
3738 @Override
3739 public boolean isInSnapshot(BlockInfoUnderConstruction blockUC) {
3740 assert hasReadLock();
3741 final BlockCollection bc = blockUC.getBlockCollection();
3742 if (bc == null || !(bc instanceof INodeFileUnderConstruction)) {
3743 return false;
3744 }
3745
3746 INodeFileUnderConstruction inodeUC = (INodeFileUnderConstruction) blockUC
3747 .getBlockCollection();
3748 String fullName = inodeUC.getName();
3749 try {
3750 if (fullName != null && fullName.startsWith(Path.SEPARATOR)
3751 && dir.getINode(fullName) == inodeUC) {
3752 // If file exists in normal path then no need to look in snapshot
3753 return false;
3754 }
3755 } catch (UnresolvedLinkException e) {
3756 LOG.error("Error while resolving the link : " + fullName, e);
3757 return false;
3758 }
3759 /*
3760 * 1. if bc is an instance of INodeFileUnderConstructionWithSnapshot, and
3761 * bc is not in the current fsdirectory tree, bc must represent a snapshot
3762 * file.
3763 * 2. if fullName is not an absolute path, bc cannot be existent in the
3764 * current fsdirectory tree.
3765 * 3. if bc is not the current node associated with fullName, bc must be a
3766 * snapshot inode.
3767 */
3768 return true;
3769 }
3770
/**
 * Handle a datanode's report that block recovery has finished: update (or
 * delete) the recovered last block, and optionally close the file.
 *
 * @param lastblock the block that was recovered
 * @param newgenerationstamp generation stamp assigned for the recovery;
 *        must match the block's recorded recovery id
 * @param newlength final length of the recovered block
 * @param closeFile whether to finalize and close the file
 * @param deleteblock whether the block should be removed instead of updated
 * @param newtargets datanodes that now hold the recovered replica
 * @param newtargetstorages storage ids corresponding to newtargets
 */
void commitBlockSynchronization(ExtendedBlock lastblock,
    long newgenerationstamp, long newlength,
    boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
    String[] newtargetstorages)
    throws IOException, UnresolvedLinkException {
  LOG.info("commitBlockSynchronization(lastblock=" + lastblock
      + ", newgenerationstamp=" + newgenerationstamp
      + ", newlength=" + newlength
      + ", newtargets=" + Arrays.asList(newtargets)
      + ", closeFile=" + closeFile
      + ", deleteBlock=" + deleteblock
      + ")");
  checkOperation(OperationCategory.WRITE);
  String src = "";
  writeLock();
  try {
    checkOperation(OperationCategory.WRITE);
    // If a DN tries to commit to the standby, the recovery will
    // fail, and the next retry will succeed on the new NN.

    checkNameNodeSafeMode(
        "Cannot commitBlockSynchronization while in safe mode");
    final BlockInfo storedBlock = getStoredBlock(
        ExtendedBlock.getLocalBlock(lastblock));
    if (storedBlock == null) {
      if (deleteblock) {
        // This may be a retry attempt so ignore the failure
        // to locate the block.
        if (LOG.isDebugEnabled()) {
          LOG.debug("Block (=" + lastblock + ") not found");
        }
        return;
      } else {
        throw new IOException("Block (=" + lastblock + ") not found");
      }
    }
    INodeFile iFile = ((INode)storedBlock.getBlockCollection()).asFile();
    if (!iFile.isUnderConstruction() || storedBlock.isComplete()) {
      // Likely a retry after the file was already closed; nothing to do.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Unexpected block (=" + lastblock
            + ") since the file (=" + iFile.getLocalName()
            + ") is not under construction");
      }
      return;
    }

    // Reject stale recovery attempts: only the most recently issued
    // recovery id may commit.
    long recoveryId =
        ((BlockInfoUnderConstruction)storedBlock).getBlockRecoveryId();
    if(recoveryId != newgenerationstamp) {
      throw new IOException("The recovery id " + newgenerationstamp
          + " does not match current recovery id "
          + recoveryId + " for block " + lastblock);
    }

    INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)iFile;

    if (deleteblock) {
      Block blockToDel = ExtendedBlock.getLocalBlock(lastblock);
      boolean remove = pendingFile.removeLastBlock(blockToDel);
      if (remove) {
        blockManager.removeBlockFromMap(storedBlock);
      }
    }
    else {
      // update last block
      storedBlock.setGenerationStamp(newgenerationstamp);
      storedBlock.setNumBytes(newlength);

      // find the DatanodeDescriptor objects
      // There should be no locations in the blockManager till now because the
      // file is underConstruction
      List<DatanodeDescriptor> targetList =
          new ArrayList<DatanodeDescriptor>(newtargets.length);
      if (newtargets.length > 0) {
        for (DatanodeID newtarget : newtargets) {
          // try to get targetNode
          DatanodeDescriptor targetNode =
              blockManager.getDatanodeManager().getDatanode(newtarget);
          if (targetNode != null)
            targetList.add(targetNode);
          else if (LOG.isDebugEnabled()) {
            LOG.debug("DatanodeDescriptor (=" + newtarget + ") not found");
          }
        }
      }
      if ((closeFile) && !targetList.isEmpty()) {
        // the file is getting closed. Insert block locations into blockManager.
        // Otherwise fsck will report these blocks as MISSING, especially if the
        // blocksReceived from Datanodes take a long time to arrive.
        for (DatanodeDescriptor targetNode : targetList) {
          targetNode.addBlock(storedBlock);
        }
      }
      // add pipeline locations into the INodeUnderConstruction
      DatanodeDescriptor[] targetArray =
          new DatanodeDescriptor[targetList.size()];
      pendingFile.setLastBlock(storedBlock, targetList.toArray(targetArray));
    }

    if (closeFile) {
      src = closeFileCommitBlocks(pendingFile, storedBlock);
    } else {
      // If this commit does not want to close the file, persist blocks
      src = persistBlocks(pendingFile, false);
    }
  } finally {
    writeUnlock();
  }
  // Sync the edits written above after releasing the lock.
  getEditLog().logSync();
  if (closeFile) {
    LOG.info("commitBlockSynchronization(newblock=" + lastblock
        + ", file=" + src
        + ", newgenerationstamp=" + newgenerationstamp
        + ", newlength=" + newlength
        + ", newtargets=" + Arrays.asList(newtargets) + ") successful");
  } else {
    LOG.info("commitBlockSynchronization(" + lastblock + ") successful");
  }
}
3890
3891 /**
3892 *
3893 * @param pendingFile
3894 * @param storedBlock
3895 * @return Path of the file that was closed.
3896 * @throws IOException
3897 */
3898 @VisibleForTesting
3899 String closeFileCommitBlocks(INodeFileUnderConstruction pendingFile,
3900 BlockInfo storedBlock)
3901 throws IOException {
3902
3903 String src = leaseManager.findPath(pendingFile);
3904
3905 // commit the last block and complete it if it has minimum replicas
3906 commitOrCompleteLastBlock(pendingFile, storedBlock);
3907
3908 //remove lease, close file
3909 finalizeINodeFileUnderConstruction(src, pendingFile,
3910 Snapshot.findLatestSnapshot(pendingFile, null));
3911
3912 return src;
3913 }
3914
/**
 * Persist the block list for the given file.
 *
 * @param pendingFile the under-construction file whose block list is
 *        written to the edit log (not yet synced)
 * @param logRetryCache presumably whether the edit records retry-cache
 *        state for this RPC — confirm against FSDirectory.persistBlocks
 * @return Path to the given file.
 * @throws IOException
 */
@VisibleForTesting
String persistBlocks(INodeFileUnderConstruction pendingFile,
    boolean logRetryCache) throws IOException {
  String src = leaseManager.findPath(pendingFile);
  dir.persistBlocks(src, pendingFile, logRetryCache);
  return src;
}
3929
/**
 * Renew the lease(s) held by the given client.
 *
 * @param holder client name whose leases are renewed
 * @throws IOException if the namenode is in safe mode or cannot serve
 *         write operations
 */
void renewLease(String holder) throws IOException {
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    // Re-check after taking the lock: HA/safe-mode state may have changed.
    checkOperation(OperationCategory.WRITE);
    checkNameNodeSafeMode("Cannot renew lease for " + holder);
    leaseManager.renewLease(holder);
  } finally {
    writeUnlock();
  }
}
3944
3945 /**
3946 * Get a partial listing of the indicated directory
3947 *
3948 * @param src the directory name
3949 * @param startAfter the name to start after
3950 * @param needLocation if blockLocations need to be returned
3951 * @return a partial listing starting after startAfter
3952 *
3953 * @throws AccessControlException if access is denied
3954 * @throws UnresolvedLinkException if symbolic link is encountered
3955 * @throws IOException if other I/O error occurred
3956 */
3957 DirectoryListing getListing(String src, byte[] startAfter,
3958 boolean needLocation)
3959 throws AccessControlException, UnresolvedLinkException, IOException {
3960 try {
3961 return getListingInt(src, startAfter, needLocation);
3962 } catch (AccessControlException e) {
3963 logAuditEvent(false, "listStatus", src);
3964 throw e;
3965 }
3966 }
3967
3968 private DirectoryListing getListingInt(String src, byte[] startAfter,
3969 boolean needLocation)
3970 throws AccessControlException, UnresolvedLinkException, IOException {
3971 DirectoryListing dl;
3972 FSPermissionChecker pc = getPermissionChecker();
3973 checkOperation(OperationCategory.READ);
3974 byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
3975 String startAfterString = new String(startAfter);
3976 readLock();
3977 try {
3978 checkOperation(OperationCategory.READ);
3979 src = FSDirectory.resolvePath(src, pathComponents, dir);
3980
3981 // Get file name when startAfter is an INodePath
3982 if (FSDirectory.isReservedName(startAfterString)) {
3983 byte[][] startAfterComponents = FSDirectory
3984 .getPathComponentsForReservedPath(startAfterString);
3985 try {
3986 String tmp = FSDirectory.resolvePath(src, startAfterComponents, dir);
3987 byte[][] regularPath = INode.getPathComponents(tmp);
3988 startAfter = regularPath[regularPath.length - 1];
3989 } catch (IOException e) {
3990 // Possibly the inode is deleted
3991 throw new DirectoryListingStartAfterNotFoundException(
3992 "Can't find startAfter " + startAfterString);
3993 }
3994 }
3995
3996 if (isPermissionEnabled) {
3997 if (dir.isDir(src)) {
3998 checkPathAccess(pc, src, FsAction.READ_EXECUTE);
3999 } else {
4000 checkTraverse(pc, src);
4001 }
4002 }
4003 logAuditEvent(true, "listStatus", src);
4004 dl = dir.getListing(src, startAfter, needLocation);
4005 } finally {
4006 readUnlock();
4007 }
4008 return dl;
4009 }
4010
4011 /////////////////////////////////////////////////////////
4012 //
4013 // These methods are called by datanodes
4014 //
4015 /////////////////////////////////////////////////////////
4016 /**
4017 * Register Datanode.
4018 * <p>
4019 * The purpose of registration is to identify whether the new datanode
4020 * serves a new data storage, and will report new data block copies,
4021 * which the namenode was not aware of; or the datanode is a replacement
4022 * node for the data storage that was previously served by a different
4023 * or the same (in terms of host:port) datanode.
4024 * The data storages are distinguished by their storageIDs. When a new
4025 * data storage is reported the namenode issues a new unique storageID.
4026 * <p>
4027 * Finally, the namenode returns its namespaceID as the registrationID
4028 * for the datanodes.
4029 * namespaceID is a persistent attribute of the name space.
4030 * The registrationID is checked every time the datanode is communicating
4031 * with the namenode.
4032 * Datanodes with inappropriate registrationID are rejected.
4033 * If the namenode stops, and then restarts it can restore its
4034 * namespaceID and will continue serving the datanodes that has previously
4035 * registered with the namenode without restarting the whole cluster.
4036 *
4037 * @see org.apache.hadoop.hdfs.server.datanode.DataNode
4038 */
4039 void registerDatanode(DatanodeRegistration nodeReg) throws IOException {
4040 writeLock();
4041 try {
4042 getBlockManager().getDatanodeManager().registerDatanode(nodeReg);
4043 checkSafeMode();
4044 } finally {
4045 writeUnlock();
4046 }
4047 }
4048
/**
 * Get registrationID for datanodes based on the namespaceID.
 *
 * @see #registerDatanode(DatanodeRegistration)
 * @return registration ID
 */
String getRegistrationID() {
  // Derived from the persistent storage info of this namenode's image.
  return Storage.getRegistrationID(dir.fsImage.getStorage());
}
4058
4059 /**
4060 * The given node has reported in. This method should:
4061 * 1) Record the heartbeat, so the datanode isn't timed out
4062 * 2) Adjust usage stats for future block allocation
4063 *
4064 * If a substantial amount of time passed since the last datanode
4065 * heartbeat then request an immediate block report.
4066 *
4067 * @return an array of datanode commands
4068 * @throws IOException
4069 */
4070 HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
4071 long capacity, long dfsUsed, long remaining, long blockPoolUsed,
4072 int xceiverCount, int xmitsInProgress, int failedVolumes)
4073 throws IOException {
4074 readLock();
4075 try {
4076 final int maxTransfer = blockManager.getMaxReplicationStreams()
4077 - xmitsInProgress;
4078 DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
4079 nodeReg, blockPoolId, capacity, dfsUsed, remaining, blockPoolUsed,
4080 xceiverCount, maxTransfer, failedVolumes);
4081 return new HeartbeatResponse(cmds, createHaStatusHeartbeat());
4082 } finally {
4083 readUnlock();
4084 }
4085 }
4086
4087 private NNHAStatusHeartbeat createHaStatusHeartbeat() {
4088 HAState state = haContext.getState();
4089 return new NNHAStatusHeartbeat(state.getServiceState(),
4090 getFSImage().getLastAppliedOrWrittenTxId());
4091 }
4092
/**
 * Returns whether or not there were available resources at the last check of
 * resources.
 *
 * @return true if there were sufficient resources available, false otherwise.
 */
boolean nameNodeHasResourcesAvailable() {
  // Reads the cached result of the most recent checkAvailableResources().
  return hasResourcesAvailable;
}
4102
/**
 * Perform resource checks and cache the results.
 * The cached value is read via {@link #nameNodeHasResourcesAvailable()}.
 * @throws IOException
 */
void checkAvailableResources() {
  Preconditions.checkState(nnResourceChecker != null,
      "nnResourceChecker not initialized");
  hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
}
4112
/**
 * Periodically calls hasAvailableResources of NameNodeResourceChecker, and if
 * there are found to be insufficient resources available, causes the NN to
 * enter safe mode. If resources are later found to have returned to
 * acceptable levels, this daemon will cause the NN to exit safe mode.
 */
class NameNodeResourceMonitor implements Runnable {
  // Loop flag; set to false by stopMonitor() to terminate the daemon.
  boolean shouldNNRmRun = true;
  @Override
  public void run () {
    try {
      while (fsRunning && shouldNNRmRun) {
        checkAvailableResources();
        if(!nameNodeHasResourcesAvailable()) {
          String lowResourcesMsg = "NameNode low on available disk space. ";
          if (!isInSafeMode()) {
            FSNamesystem.LOG.warn(lowResourcesMsg + "Entering safe mode.");
          } else {
            FSNamesystem.LOG.warn(lowResourcesMsg + "Already in safe mode.");
          }
          // NOTE(review): the boolean presumably flags resource-triggered
          // safe mode — confirm against enterSafeMode's signature.
          enterSafeMode(true);
        }
        try {
          Thread.sleep(resourceRecheckInterval);
        } catch (InterruptedException ie) {
          // Deliberately ignore
        }
      }
    } catch (Exception e) {
      FSNamesystem.LOG.error("Exception in NameNodeResourceMonitor: ", e);
    }
  }

  public void stopMonitor() {
    // Takes effect on the next loop iteration; does not interrupt a sleep.
    shouldNNRmRun = false;
  }
}
4150
/**
 * Daemon that rolls the NameNode's own edit log once the number of edits
 * in the open segment exceeds {@code rollThreshold}.
 */
class NameNodeEditLogRoller implements Runnable {

  // Loop flag; set to false by stop() to terminate the daemon.
  private boolean shouldRun = true;
  // Roll once the open segment holds more than this many transactions.
  private final long rollThreshold;
  // Pause between threshold checks, in milliseconds.
  private final long sleepIntervalMs;

  public NameNodeEditLogRoller(long rollThreshold, int sleepIntervalMs) {
    this.rollThreshold = rollThreshold;
    this.sleepIntervalMs = sleepIntervalMs;
  }

  @Override
  public void run() {
    while (fsRunning && shouldRun) {
      try {
        FSEditLog editLog = getFSImage().getEditLog();
        // Edits accumulated in the currently open segment.
        long numEdits =
            editLog.getLastWrittenTxId() - editLog.getCurSegmentTxId();
        if (numEdits > rollThreshold) {
          FSNamesystem.LOG.info("NameNode rolling its own edit log because"
              + " number of edits in open segment exceeds threshold of "
              + rollThreshold);
          rollEditLog();
        }
        Thread.sleep(sleepIntervalMs);
      } catch (InterruptedException e) {
        // Interrupt is the shutdown signal for this daemon.
        FSNamesystem.LOG.info(NameNodeEditLogRoller.class.getSimpleName()
            + " was interrupted, exiting");
        break;
      } catch (Exception e) {
        // Keep the roller alive: a single failed roll must not kill it.
        FSNamesystem.LOG.error("Swallowing exception in "
            + NameNodeEditLogRoller.class.getSimpleName() + ":", e);
      }
    }
  }

  public void stop() {
    shouldRun = false;
  }
}
4191
4192 public FSImage getFSImage() {
4193 return dir.fsImage;
4194 }
4195
4196 public FSEditLog getEditLog() {
4197 return getFSImage().getEditLog();
4198 }
4199
4200 private void checkBlock(ExtendedBlock block) throws IOException {
4201 if (block != null && !this.blockPoolId.equals(block.getBlockPoolId())) {
4202 throw new IOException("Unexpected BlockPoolId " + block.getBlockPoolId()
4203 + " - expected " + blockPoolId);
4204 }
4205 }
4206
/** @return number of blocks with no remaining replicas, per the block manager. */
@Metric({"MissingBlocks", "Number of missing blocks"})
public long getMissingBlocksCount() {
  // not locking: a slightly stale metric value is acceptable here
  return blockManager.getMissingBlocksCount();
}
4212
/** @return cumulative count of datanode heartbeats that expired. */
@Metric({"ExpiredHeartbeats", "Number of expired heartbeats"})
public int getExpiredHeartbeats() {
  return datanodeStatistics.getExpiredHeartbeats();
}
4217
4218 @Metric({"TransactionsSinceLastCheckpoint",
4219 "Number of transactions since last checkpoint"})
4220 public long getTransactionsSinceLastCheckpoint() {
4221 return getEditLog().getLastWrittenTxId() -
4222 getFSImage().getStorage().getMostRecentCheckpointTxId();
4223 }
4224
4225 @Metric({"TransactionsSinceLastLogRoll",
4226 "Number of transactions since last edit log roll"})
4227 public long getTransactionsSinceLastLogRoll() {
4228 if (isInStandbyState() || !getEditLog().isSegmentOpen()) {
4229 return 0;
4230 } else {
4231 return getEditLog().getLastWrittenTxId() -
4232 getEditLog().getCurSegmentTxId() + 1;
4233 }
4234 }
4235
/** @return id of the last transaction written to the edit log. */
@Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
public long getLastWrittenTransactionId() {
  return getEditLog().getLastWrittenTxId();
}
4240
/** @return wall-clock time (ms since epoch) of the most recent checkpoint. */
@Metric({"LastCheckpointTime",
    "Time in milliseconds since the epoch of the last checkpoint"})
public long getLastCheckpointTime() {
  return getFSImage().getStorage().getMostRecentCheckpointTime();
}
4246
4247 /** @see ClientProtocol#getStats() */
4248 long[] getStats() {
4249 final long[] stats = datanodeStatistics.getStats();
4250 stats[ClientProtocol.GET_STATS_UNDER_REPLICATED_IDX] = getUnderReplicatedBlocks();
4251 stats[ClientProtocol.GET_STATS_CORRUPT_BLOCKS_IDX] = getCorruptReplicaBlocks();
4252 stats[ClientProtocol.GET_STATS_MISSING_BLOCKS_IDX] = getMissingBlocksCount();
4253 return stats;
4254 }
4255
/** @return total raw capacity across all datanodes, in bytes. */
@Override // FSNamesystemMBean
@Metric({"CapacityTotal",
    "Total raw capacity of data nodes in bytes"})
public long getCapacityTotal() {
  return datanodeStatistics.getCapacityTotal();
}
4262
/** @return total raw capacity across all datanodes, rounded to GB. */
@Metric({"CapacityTotalGB",
    "Total raw capacity of data nodes in GB"})
public float getCapacityTotalGB() {
  return DFSUtil.roundBytesToGB(getCapacityTotal());
}
4268
/** @return total DFS-used capacity across all datanodes, in bytes. */
@Override // FSNamesystemMBean
@Metric({"CapacityUsed",
    "Total used capacity across all data nodes in bytes"})
public long getCapacityUsed() {
  return datanodeStatistics.getCapacityUsed();
}
4275
/** @return total DFS-used capacity across all datanodes, rounded to GB. */
@Metric({"CapacityUsedGB",
    "Total used capacity across all data nodes in GB"})
public float getCapacityUsedGB() {
  return DFSUtil.roundBytesToGB(getCapacityUsed());
}
4281
/** @return remaining DFS capacity across all datanodes, in bytes. */
@Override // FSNamesystemMBean
@Metric({"CapacityRemaining", "Remaining capacity in bytes"})
public long getCapacityRemaining() {
  return datanodeStatistics.getCapacityRemaining();
}
4287
/** @return remaining DFS capacity across all datanodes, rounded to GB. */
@Metric({"CapacityRemainingGB", "Remaining capacity in GB"})
public float getCapacityRemainingGB() {
  return DFSUtil.roundBytesToGB(getCapacityRemaining());
}
4292
/** @return space used on datanode volumes for non-DFS purposes, in bytes. */
@Metric({"CapacityUsedNonDFS",
    "Total space used by data nodes for non DFS purposes in bytes"})
public long getCapacityUsedNonDFS() {
  return datanodeStatistics.getCapacityUsedNonDFS();
}
4298
4299 /**
4300 * Total number of connections.
4301 */
4302 @Override // FSNamesystemMBean
4303 @Metric
4304 public int getTotalLoad() {
4305 return datanodeStatistics.getXceiverCount();
4306 }
4307
/** @return number of directories on which snapshots may be taken. */
@Metric({ "SnapshottableDirectories", "Number of snapshottable directories" })
public int getNumSnapshottableDirs() {
  return this.snapshotManager.getNumSnapshottableDirs();
}
4312
/** @return total number of snapshots currently held. */
@Metric({ "Snapshots", "The number of snapshots" })
public int getNumSnapshots() {
  return this.snapshotManager.getNumSnapshots();
}
4317
4318 int getNumberOfDatanodes(DatanodeReportType type) {
4319 readLock();
4320 try {
4321 return getBlockManager().getDatanodeManager().getDatanodeListForReport(
4322 type).size();
4323 } finally {
4324 readUnlock();
4325 }
4326 }
4327
4328 DatanodeInfo[] datanodeReport(final DatanodeReportType type
4329 ) throws AccessControlException, StandbyException {
4330 checkSuperuserPrivilege();
4331 checkOperation(OperationCategory.UNCHECKED);
4332 readLock();
4333 try {
4334 checkOperation(OperationCategory.UNCHECKED);
4335 final DatanodeManager dm = getBlockManager().getDatanodeManager();
4336 final List<DatanodeDescriptor> results = dm.getDatanodeListForReport(type);
4337
4338 DatanodeInfo[] arr = new DatanodeInfo[results.size()];
4339 for (int i=0; i<arr.length; i++) {
4340 arr[i] = new DatanodeInfo(results.get(i));
4341 }
4342 return arr;
4343 } finally {
4344 readUnlock();
4345 }
4346 }
4347
4348 /**
4349 * Save namespace image.
4350 * This will save current namespace into fsimage file and empty edits file.
4351 * Requires superuser privilege and safe mode.
4352 *
4353 * @throws AccessControlException if superuser privilege is violated.
4354 * @throws IOException if
4355 */
4356 void saveNamespace() throws AccessControlException, IOException {
4357 checkOperation(OperationCategory.UNCHECKED);
4358 checkSuperuserPrivilege();
4359
4360 CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
4361 if (cacheEntry != null && cacheEntry.isSuccess()) {
4362 return; // Return previous response
4363 }
4364 boolean success = false;
4365 readLock();
4366 try {
4367 checkOperation(OperationCategory.UNCHECKED);
4368 if (!isInSafeMode()) {
4369 throw new IOException("Safe mode should be turned ON "
4370 + "in order to create namespace image.");
4371 }
4372 getFSImage().saveNamespace(this);
4373 success = true;
4374 } finally {
4375 readUnlock();
4376 RetryCache.setState(cacheEntry, success);
4377 }
4378 LOG.info("New namespace image has been created");
4379 }
4380
4381 /**
4382 * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
4383 * Requires superuser privilege.
4384 *
4385 * @throws AccessControlException if superuser privilege is violated.
4386 */
4387 boolean restoreFailedStorage(String arg) throws AccessControlException,
4388 StandbyException {
4389 checkSuperuserPrivilege();
4390 checkOperation(OperationCategory.UNCHECKED);
4391 writeLock();
4392 try {
4393 checkOperation(OperationCategory.UNCHECKED);
4394
4395 // if it is disabled - enable it and vice versa.
4396 if(arg.equals("check"))
4397 return getFSImage().getStorage().getRestoreFailedStorage();
4398
4399 boolean val = arg.equals("true"); // false if not
4400 getFSImage().getStorage().setRestoreFailedStorage(val);
4401
4402 return val;
4403 } finally {
4404 writeUnlock();
4405 }
4406 }
4407
/** @return the NameNode start time as a fresh {@link Date} (defensive copy). */
Date getStartTime() {
  return new Date(startTime);
}
4411
/**
 * Finalize a previously started upgrade, discarding the pre-upgrade state.
 * Requires superuser privilege; runs under the write lock.
 */
void finalizeUpgrade() throws IOException {
  checkSuperuserPrivilege();
  checkOperation(OperationCategory.WRITE);
  writeLock();
  try {
    // Re-check under the lock in case of an HA state transition.
    checkOperation(OperationCategory.WRITE);
    getFSImage().finalizeUpgrade();
  } finally {
    writeUnlock();
  }
}
4423
/**
 * Re-read the include/exclude host lists and update datanode membership.
 * A fresh HdfsConfiguration is created so on-disk config changes are
 * picked up. Requires superuser privilege.
 */
void refreshNodes() throws IOException {
  checkOperation(OperationCategory.UNCHECKED);
  checkSuperuserPrivilege();
  getBlockManager().getDatanodeManager().refreshNodes(new HdfsConfiguration());
}
4429
/**
 * Set the bandwidth (bytes/sec) datanodes may use for balancing.
 * Requires superuser privilege.
 */
void setBalancerBandwidth(long bandwidth) throws IOException {
  checkOperation(OperationCategory.UNCHECKED);
  checkSuperuserPrivilege();
  getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
}
4435
4436 /**
4437 * SafeModeInfo contains information related to the safe mode.
4438 * <p>
4439 * An instance of {@link SafeModeInfo} is created when the name node
4440 * enters safe mode.
4441 * <p>
4442 * During name node startup {@link SafeModeInfo} counts the number of
4443 * <em>safe blocks</em>, those that have at least the minimal number of
4444 * replicas, and calculates the ratio of safe blocks to the total number
4445 * of blocks in the system, which is the size of blocks in
4446 * {@link FSNamesystem#blockManager}. When the ratio reaches the
4447 * {@link #threshold} it starts the SafeModeMonitor daemon in order
4448 * to monitor whether the safe mode {@link #extension} is passed.
4449 * Then it leaves safe mode and destroys itself.
4450 * <p>
4451 * If safe mode is turned on manually then the number of safe blocks is
4452 * not tracked because the name node is not intended to leave safe mode
4453 * automatically in the case.
4454 *
4455 * @see ClientProtocol#setSafeMode(HdfsConstants.SafeModeAction, boolean)
4456 */
public class SafeModeInfo {
  // configuration fields
  /** Safe mode threshold condition %.*/
  private double threshold;
  /** Safe mode minimum number of datanodes alive */
  private int datanodeThreshold;
  /** Safe mode extension after the threshold. */
  private int extension;
  /** Min replication required by safe mode. */
  private int safeReplication;
  /** threshold for populating needed replication queues */
  private double replQueueThreshold;

  // internal fields
  /** Time when threshold was reached.
   * <br> -1 safe mode is off
   * <br> 0 safe mode is on, and threshold is not reached yet
   * <br> >0 safe mode is on, but we are in extension period
   */
  private long reached = -1;
  /** Total number of blocks. */
  int blockTotal;
  /** Number of safe blocks. */
  int blockSafe;
  /** Number of blocks needed to satisfy safe mode threshold condition */
  private int blockThreshold;
  /** Number of blocks needed before populating replication queues */
  private int blockReplQueueThreshold;
  /** time of the last status printout */
  private long lastStatusReport = 0;
  /** flag indicating whether replication queues have been initialized */
  boolean initializedReplQueues = false;
  /** Was safemode entered automatically because available resources were low. */
  private boolean resourcesLow = false;
  /** Should safemode adjust its block totals as blocks come in */
  private boolean shouldIncrementallyTrackBlocks = false;
  /** counter for tracking startup progress of reported blocks */
  private Counter awaitingReportedBlocksCounter;

  /**
   * Creates SafeModeInfo when the name node enters
   * automatic safe mode at startup.
   *
   * @param conf configuration
   */
  private SafeModeInfo(Configuration conf) {
    this.threshold = conf.getFloat(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY,
        DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT);
    // A threshold above 1.0 can never be reached; warn but proceed.
    if(threshold > 1.0) {
      LOG.warn("The threshold value should't be greater than 1, threshold: " + threshold);
    }
    this.datanodeThreshold = conf.getInt(
        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY,
        DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT);
    // Default extension is 0: leave safe mode as soon as thresholds are met.
    this.extension = conf.getInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
    this.safeReplication = conf.getInt(DFS_NAMENODE_REPLICATION_MIN_KEY,
        DFS_NAMENODE_REPLICATION_MIN_DEFAULT);

    LOG.info(DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY + " = " + threshold);
    LOG.info(DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY + " = " + datanodeThreshold);
    LOG.info(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + " = " + extension);

    // default to safe mode threshold (i.e., don't populate queues before leaving safe mode)
    this.replQueueThreshold =
        conf.getFloat(DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,
        (float) threshold);
    this.blockTotal = 0;
    this.blockSafe = 0;
  }

  /**
   * In the HA case, the StandbyNode can be in safemode while the namespace
   * is modified by the edit log tailer. In this case, the number of total
   * blocks changes as edits are processed (eg blocks are added and deleted).
   * However, we don't want to do the incremental tracking during the
   * startup-time loading process -- only once the initial total has been
   * set after the image has been loaded.
   */
  private boolean shouldIncrementallyTrackBlocks() {
    return shouldIncrementallyTrackBlocks;
  }

  /**
   * Creates SafeModeInfo when safe mode is entered manually, or because
   * available resources are low.
   *
   * The {@link #threshold} is set to 1.5 so that it could never be reached.
   * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
   *
   * @see SafeModeInfo
   */
  private SafeModeInfo(boolean resourcesLow, boolean isReplQueuesInited) {
    this.threshold = 1.5f; // this threshold can never be reached
    this.datanodeThreshold = Integer.MAX_VALUE;
    // extension == Integer.MAX_VALUE also marks "manual" mode; see isManual()
    this.extension = Integer.MAX_VALUE;
    this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
    this.replQueueThreshold = 1.5f; // can never be reached
    this.blockTotal = -1;
    this.blockSafe = -1;
    this.resourcesLow = resourcesLow;
    this.initializedReplQueues = isReplQueuesInited;
    enter();
    reportStatus("STATE* Safe mode is ON.", true);
  }

  /**
   * Check if safe mode is on.
   * @return true if in safe mode
   */
  private synchronized boolean isOn() {
    doConsistencyCheck();
    return this.reached >= 0;
  }

  /**
   * Check if we are populating replication queues.
   */
  private synchronized boolean isPopulatingReplQueues() {
    return initializedReplQueues;
  }

  /**
   * Enter safe mode.
   */
  private void enter() {
    this.reached = 0;
  }

  /**
   * Leave safe mode.
   * <p>
   * Check for invalid, under- & over-replicated blocks in the end of startup.
   */
  private synchronized void leave() {
    // if not done yet, initialize replication queues.
    // In the standby, do not populate repl queues
    if (!isPopulatingReplQueues() && shouldPopulateReplQueues()) {
      initializeReplQueues();
    }
    long timeInSafemode = now() - startTime;
    NameNode.stateChangeLog.info("STATE* Leaving safe mode after "
        + timeInSafemode/1000 + " secs");
    NameNode.getNameNodeMetrics().setSafeModeTime((int) timeInSafemode);

    //Log the following only once (when transitioning from ON -> OFF)
    if (reached >= 0) {
      NameNode.stateChangeLog.info("STATE* Safe mode is OFF");
    }
    reached = -1;
    // Clearing the outer safeMode reference is what actually turns safe
    // mode off for the rest of FSNamesystem.
    safeMode = null;
    final NetworkTopology nt = blockManager.getDatanodeManager().getNetworkTopology();
    NameNode.stateChangeLog.info("STATE* Network topology has "
        + nt.getNumOfRacks() + " racks and "
        + nt.getNumOfLeaves() + " datanodes");
    NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
        + blockManager.numOfUnderReplicatedBlocks() + " blocks");

    startSecretManagerIfNecessary();

    // If startup has not yet completed, end safemode phase.
    StartupProgress prog = NameNode.getStartupProgress();
    if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
      prog.endStep(Phase.SAFEMODE, STEP_AWAITING_REPORTED_BLOCKS);
      prog.endPhase(Phase.SAFEMODE);
    }
  }

  /**
   * Initialize replication queues.
   */
  private synchronized void initializeReplQueues() {
    LOG.info("initializing replication queues");
    assert !isPopulatingReplQueues() : "Already initialized repl queues";
    long startTimeMisReplicatedScan = now();
    blockManager.processMisReplicatedBlocks();
    initializedReplQueues = true;
    NameNode.stateChangeLog.info("STATE* Replication Queue initialization "
        + "scan for invalid, over- and under-replicated blocks "
        + "completed in " + (now() - startTimeMisReplicatedScan)
        + " msec");
  }

  /**
   * Check whether we have reached the threshold for
   * initializing replication queues.
   */
  private synchronized boolean canInitializeReplQueues() {
    return shouldPopulateReplQueues()
        && blockSafe >= blockReplQueueThreshold;
  }

  /**
   * Safe mode can be turned off iff
   * the threshold is reached and
   * the extension time have passed.
   * @return true if can leave or false otherwise.
   */
  private synchronized boolean canLeave() {
    if (reached == 0)
      return false;
    // Still inside the extension window since the threshold was reached.
    if (now() - reached < extension) {
      reportStatus("STATE* Safe mode ON.", false);
      return false;
    }
    return !needEnter();
  }

  /**
   * There is no need to enter safe mode
   * if DFS is empty or {@link #threshold} == 0
   */
  private boolean needEnter() {
    return (threshold != 0 && blockSafe < blockThreshold) ||
      (getNumLiveDataNodes() < datanodeThreshold) ||
      (!nameNodeHasResourcesAvailable());
  }

  /**
   * Check and trigger safe mode if needed.
   */
  private void checkMode() {
    // Have to have write-lock since leaving safemode initializes
    // repl queues, which requires write lock
    assert hasWriteLock();
    // if smmthread is already running, the block threshold must have been
    // reached before, there is no need to enter the safe mode again
    if (smmthread == null && needEnter()) {
      enter();
      // check if we are ready to initialize replication queues
      if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
        initializeReplQueues();
      }
      reportStatus("STATE* Safe mode ON.", false);
      return;
    }
    // the threshold is reached or was reached before
    if (!isOn() ||                           // safe mode is off
        extension <= 0 || threshold <= 0) {  // don't need to wait
      this.leave(); // leave safe mode
      return;
    }
    if (reached > 0) {  // threshold has already been reached before
      reportStatus("STATE* Safe mode ON.", false);
      return;
    }
    // start monitor
    reached = now();
    if (smmthread == null) {
      smmthread = new Daemon(new SafeModeMonitor());
      smmthread.start();
      reportStatus("STATE* Safe mode extension entered.", true);
    }

    // check if we are ready to initialize replication queues
    if (canInitializeReplQueues() && !isPopulatingReplQueues()) {
      initializeReplQueues();
    }
  }

  /**
   * Set total number of blocks.
   */
  private synchronized void setBlockTotal(int total) {
    this.blockTotal = total;
    this.blockThreshold = (int) (blockTotal * threshold);
    this.blockReplQueueThreshold =
      (int) (blockTotal * replQueueThreshold);
    if (haEnabled) {
      // After we initialize the block count, any further namespace
      // modifications done while in safe mode need to keep track
      // of the number of total blocks in the system.
      this.shouldIncrementallyTrackBlocks = true;
    }
    // blockSafe is -1 in manual/low-resources mode; normalize for counting.
    if(blockSafe < 0)
      this.blockSafe = 0;
    checkMode();
  }

  /**
   * Increment number of safe blocks if current block has
   * reached minimal replication.
   * @param replication current replication
   */
  private synchronized void incrementSafeBlockCount(short replication) {
    // Count exactly once: at the transition to the minimal replication.
    if (replication == safeReplication) {
      this.blockSafe++;

      // Report startup progress only if we haven't completed startup yet.
      StartupProgress prog = NameNode.getStartupProgress();
      if (prog.getStatus(Phase.SAFEMODE) != Status.COMPLETE) {
        if (this.awaitingReportedBlocksCounter == null) {
          // Counter is fetched lazily on first use.
          this.awaitingReportedBlocksCounter = prog.getCounter(Phase.SAFEMODE,
            STEP_AWAITING_REPORTED_BLOCKS);
        }
        this.awaitingReportedBlocksCounter.increment();
      }

      checkMode();
    }
  }

  /**
   * Decrement number of safe blocks if current block has
   * fallen below minimal replication.
   * @param replication current replication
   */
  private synchronized void decrementSafeBlockCount(short replication) {
    // Mirror of incrementSafeBlockCount: uncount at the transition below
    // minimal replication.
    if (replication == safeReplication-1) {
      this.blockSafe--;
      //blockSafe is set to -1 in manual / low resources safemode
      assert blockSafe >= 0 || isManual() || areResourcesLow();
      checkMode();
    }
  }

  /**
   * Check if safe mode was entered manually
   */
  private boolean isManual() {
    // The manual-mode constructor and setManual() both use
    // Integer.MAX_VALUE as the sentinel extension value.
    return extension == Integer.MAX_VALUE;
  }

  /**
   * Set manual safe mode.
   */
  private synchronized void setManual() {
    extension = Integer.MAX_VALUE;
  }

  /**
   * Check if safe mode was entered due to resources being low.
   */
  private boolean areResourcesLow() {
    return resourcesLow;
  }

  /**
   * Set that resources are low for this instance of safe mode.
   */
  private void setResourcesLow() {
    resourcesLow = true;
  }

  /**
   * A tip on how safe mode is to be turned off: manually or automatically.
   */
  String getTurnOffTip() {
    if(!isOn())
      return "Safe mode is OFF.";

    //Manual OR low-resource safemode. (Admin intervention required)
    String leaveMsg = "It was turned on manually. ";
    if (areResourcesLow()) {
      leaveMsg = "Resources are low on NN. Please add or free up more "
        + "resources then turn off safe mode manually. NOTE: If you turn off"
        + " safe mode before adding resources, "
        + "the NN will immediately return to safe mode. ";
    }
    if (isManual() || areResourcesLow()) {
      return leaveMsg
        + "Use \"hdfs dfsadmin -safemode leave\" to turn safe mode off.";
    }

    //Automatic safemode. System will come out of safemode automatically.
    leaveMsg = "Safe mode will be turned off automatically";
    int numLive = getNumLiveDataNodes();
    String msg = "";
    if (reached == 0) {
      if (blockSafe < blockThreshold) {
        // NOTE(review): needEnter() is satisfied once blockSafe reaches
        // blockThreshold, so the "+ 1" below looks like it overstates the
        // needed count by one — confirm before changing the message.
        msg += String.format(
          "The reported blocks %d needs additional %d"
          + " blocks to reach the threshold %.4f of total blocks %d.\n",
          blockSafe, (blockThreshold - blockSafe) + 1, threshold, blockTotal);
      }
      if (numLive < datanodeThreshold) {
        msg += String.format(
          "The number of live datanodes %d needs an additional %d live "
          + "datanodes to reach the minimum number %d.\n",
          numLive, (datanodeThreshold - numLive), datanodeThreshold);
      }
    } else {
      msg = String.format("The reported blocks %d has reached the threshold"
          + " %.4f of total blocks %d. ", blockSafe, threshold, blockTotal);

      msg += String.format("The number of live datanodes %d has reached "
          + "the minimum number %d. ",
          numLive, datanodeThreshold);
    }
    msg += leaveMsg;
    // threshold is not reached or manual or resources low
    if(reached == 0 || (isManual() && !areResourcesLow())) {
      return msg;
    }
    // extension period is in progress
    return msg + (reached + extension - now() > 0 ?
      " in " + (reached + extension - now()) / 1000 + " seconds."
      : " soon.");
  }

  /**
   * Print status every 20 seconds.
   */
  private void reportStatus(String msg, boolean rightNow) {
    long curTime = now();
    // Rate-limit to one report per 20s unless explicitly forced.
    if(!rightNow && (curTime - lastStatusReport < 20 * 1000))
      return;
    NameNode.stateChangeLog.info(msg + " \n" + getTurnOffTip());
    lastStatusReport = curTime;
  }

  @Override
  public String toString() {
    String resText = "Current safe blocks = "
      + blockSafe
      + ". Target blocks = " + blockThreshold + " for threshold = %" + threshold
      + ". Minimal replication = " + safeReplication + ".";
    if (reached > 0)
      resText += " Threshold was reached " + new Date(reached) + ".";
    return resText;
  }

  /**
   * Checks consistency of the class state.
   * This is costly so only runs if asserts are enabled.
   */
  private void doConsistencyCheck() {
    boolean assertsOn = false;
    assert assertsOn = true; // set to true if asserts are on
    if (!assertsOn) return;

    if (blockTotal == -1 && blockSafe == -1) {
      return; // manual safe mode
    }
    int activeBlocks = blockManager.getActiveBlockCount();
    if ((blockTotal != activeBlocks) &&
        !(blockSafe >= 0 && blockSafe <= blockTotal)) {
      throw new AssertionError(
          " SafeMode: Inconsistent filesystem state: "
          + "SafeMode data: blockTotal=" + blockTotal
          + " blockSafe=" + blockSafe + "; "
          + "BlockManager data: active="  + activeBlocks);
    }
  }

  // Adjust blockTotal/blockSafe as the standby tails edits while in safe
  // mode; no-op until setBlockTotal() has armed incremental tracking.
  private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
    if (!shouldIncrementallyTrackBlocks) {
      return;
    }
    assert haEnabled;

    if (LOG.isDebugEnabled()) {
      LOG.debug("Adjusting block totals from " +
          blockSafe + "/" + blockTotal + " to " +
          (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
    }
    assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
      blockSafe + " by " + deltaSafe + ": would be negative";
    assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
      blockTotal + " by " + deltaTotal + ": would be negative";

    blockSafe += deltaSafe;
    setBlockTotal(blockTotal + deltaTotal);
  }
}
4921
4922 /**
4923 * Periodically check whether it is time to leave safe mode.
4924 * This thread starts when the threshold level is reached.
4925 *
4926 */
class SafeModeMonitor implements Runnable {
  /** interval in msec for checking safe mode: {@value} */
  private static final long recheckInterval = 1000;

  /**
   * Poll under the write lock until safe mode can be left, then leave it.
   * The write lock is required because leaving safe mode may initialize
   * the replication queues.
   */
  @Override
  public void run() {
    while (fsRunning) {
      writeLock();
      try {
        if (safeMode == null) { // Not in safe mode.
          break;
        }
        if (safeMode.canLeave()) {
          // Leave safe mode.
          safeMode.leave();
          // Clear the thread reference so a later safe-mode entry can
          // start a fresh monitor.
          smmthread = null;
          break;
        }
      } finally {
        writeUnlock();
      }

      // Sleep outside the lock so other operations can proceed.
      try {
        Thread.sleep(recheckInterval);
      } catch (InterruptedException ie) {
        // Ignored: termination is driven by fsRunning/safeMode state.
      }
    }
    if (!fsRunning) {
      LOG.info("NameNode is being shutdown, exit SafeModeMonitor thread");
    }
  }
}
4962
4963 boolean setSafeMode(SafeModeAction action) throws IOException {
4964 if (action != SafeModeAction.SAFEMODE_GET) {
4965 checkSuperuserPrivilege();
4966 switch(action) {
4967 case SAFEMODE_LEAVE: // leave safe mode
4968 leaveSafeMode();
4969 break;
4970 case SAFEMODE_ENTER: // enter safe mode
4971 enterSafeMode(false);
4972 break;
4973 default:
4974 LOG.error("Unexpected safe mode action");
4975 }
4976 }
4977 return isInSafeMode();
4978 }
4979
4980 @Override
4981 public void checkSafeMode() {
4982 // safeMode is volatile, and may be set to null at any time
4983 SafeModeInfo safeMode = this.safeMode;
4984 if (safeMode != null) {
4985 safeMode.checkMode();
4986 }
4987 }
4988
4989 @Override
4990 public boolean isInSafeMode() {
4991 // safeMode is volatile, and may be set to null at any time
4992 SafeModeInfo safeMode = this.safeMode;
4993 if (safeMode == null)
4994 return false;
4995 return safeMode.isOn();
4996 }
4997
4998 @Override
4999 public boolean isInStartupSafeMode() {
5000 // safeMode is volatile, and may be set to null at any time
5001 SafeModeInfo safeMode = this.safeMode;
5002 if (safeMode == null)
5003 return false;
5004 // If the NN is in safemode, and not due to manual / low resources, we
5005 // assume it must be because of startup. If the NN had low resources during
5006 // startup, we assume it came out of startup safemode and it is now in low
5007 // resources safemode
5008 return !safeMode.isManual() && !safeMode.areResourcesLow()
5009 && safeMode.isOn();
5010 }
5011
5012 /**
5013 * Check if replication queues are to be populated
5014 * @return true when node is HAState.Active and not in the very first safemode
5015 */
5016 @Override
5017 public boolean isPopulatingReplQueues() {
5018 if (!shouldPopulateReplQueues()) {
5019 return false;
5020 }
5021 // safeMode is volatile, and may be set to null at any time
5022 SafeModeInfo safeMode = this.safeMode;
5023 if (safeMode == null)
5024 return true;
5025 return safeMode.isPopulatingReplQueues();
5026 }
5027
5028 private boolean shouldPopulateReplQueues() {
5029 if(haContext == null || haContext.getState() == null)
5030 return false;
5031 return haContext.getState().shouldPopulateReplQueues();
5032 }
5033
5034 @Override
5035 public void incrementSafeBlockCount(int replication) {
5036 // safeMode is volatile, and may be set to null at any time
5037 SafeModeInfo safeMode = this.safeMode;
5038 if (safeMode == null)
5039 return;
5040 safeMode.incrementSafeBlockCount((short)replication);
5041 }
5042
5043 @Override
5044 public void decrementSafeBlockCount(Block b) {
5045 // safeMode is volatile, and may be set to null at any time
5046 SafeModeInfo safeMode = this.safeMode;
5047 if (safeMode == null) // mostly true
5048 return;
5049 BlockInfo storedBlock = getStoredBlock(b);
5050 if (storedBlock.isComplete()) {
5051 safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
5052 }
5053 }
5054
5055 /**
5056 * Adjust the total number of blocks safe and expected during safe mode.
5057 * If safe mode is not currently on, this is a no-op.
5058 * @param deltaSafe the change in number of safe blocks
5059 * @param deltaTotal the change i nnumber of total blocks expected
5060 */
5061 @Override
5062 public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
5063 // safeMode is volatile, and may be set to null at any time
5064 SafeModeInfo safeMode = this.safeMode;
5065 if (safeMode == null)
5066 return;
5067 safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
5068 }
5069
5070 /**
5071 * Set the total number of blocks in the system.
5072 */
5073 public void setBlockTotal() {
5074 // safeMode is volatile, and may be set to null at any time
5075 SafeModeInfo safeMode = this.safeMode;
5076 if (safeMode == null)
5077 return;
5078 safeMode.setBlockTotal((int)getCompleteBlocksTotal());
5079 }
5080
5081 /**
5082 * Get the total number of blocks in the system.
5083 */
5084 @Override // FSNamesystemMBean
5085 @Metric
5086 public long getBlocksTotal() {
5087 return blockManager.getTotalBlocks();
5088 }
5089
5090 /**
5091 * Get the total number of COMPLETE blocks in the system.
5092 * For safe mode only complete blocks are counted.
5093 */
5094 private long getCompleteBlocksTotal() {
5095 // Calculate number of blocks under construction
5096 long numUCBlocks = 0;
5097 readLock();
5098 try {
5099 for (Lease lease : leaseManager.getSortedLeases()) {
5100 for (String path : lease.getPaths()) {
5101 final INodeFileUnderConstruction cons;
5102 try {
5103 cons = INodeFileUnderConstruction.valueOf(dir.getINode(path), path);
5104 } catch (UnresolvedLinkException e) {
5105 throw new AssertionError("Lease files should reside on this FS");
5106 } catch (IOException e) {
5107 throw new RuntimeException(e);
5108 }
5109 BlockInfo[] blocks = cons.getBlocks();
5110 if(blocks == null)
5111 continue;
5112 for(BlockInfo b : blocks) {
5113 if(!b.isComplete())
5114 numUCBlocks++;
5115 }
5116 }
5117 }
5118 LOG.info("Number of blocks under construction: " + numUCBlocks);
5119 return getBlocksTotal() - numUCBlocks;
5120 } finally {
5121 readUnlock();
5122 }
5123 }
5124
5125 /**
5126 * Enter safe mode. If resourcesLow is false, then we assume it is manual
5127 * @throws IOException
5128 */
5129 void enterSafeMode(boolean resourcesLow) throws IOException {
5130 writeLock();
5131 try {
5132 // Stop the secret manager, since rolling the master key would
5133 // try to write to the edit log
5134 stopSecretManager();
5135
5136 // Ensure that any concurrent operations have been fully synced
5137 // before entering safe mode. This ensures that the FSImage
5138 // is entirely stable on disk as soon as we're in safe mode.
5139 boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
5140 // Before Editlog is in OpenForWrite mode, editLogStream will be null. So,
5141 // logSyncAll call can be called only when Edlitlog is in OpenForWrite mode
5142 if (isEditlogOpenForWrite) {
5143 getEditLog().logSyncAll();
5144 }
5145 if (!isInSafeMode()) {
5146 safeMode = new SafeModeInfo(resourcesLow, isPopulatingReplQueues());
5147 return;
5148 }
5149 if (resourcesLow) {
5150 safeMode.setResourcesLow();
5151 } else {
5152 safeMode.setManual();
5153 }
5154 if (isEditlogOpenForWrite) {
5155 getEditLog().logSyncAll();
5156 }
5157 NameNode.stateChangeLog.info("STATE* Safe mode is ON"
5158 + safeMode.getTurnOffTip());
5159 } finally {
5160 writeUnlock();
5161 }
5162 }
5163
5164 /**
5165 * Leave safe mode.
5166 * @throws IOException
5167 */
5168 void leaveSafeMode() {
5169 writeLock();
5170 try {
5171 if (!isInSafeMode()) {
5172 NameNode.stateChangeLog.info("STATE* Safe mode is already OFF");
5173 return;
5174 }
5175 safeMode.leave();
5176 } finally {
5177 writeUnlock();
5178 }
5179 }
5180
5181 String getSafeModeTip() {
5182 readLock();
5183 try {
5184 if (!isInSafeMode()) {
5185 return "";
5186 }
5187 return safeMode.getTurnOffTip();
5188 } finally {
5189 readUnlock();
5190 }
5191 }
5192
  /**
   * Roll the edit log. Superuser-only; disallowed while in safe mode.
   * @return the checkpoint signature after the roll
   * @throws IOException if the log cannot be rolled
   */
  CheckpointSignature rollEditLog() throws IOException {
    checkSuperuserPrivilege();
    checkOperation(OperationCategory.JOURNAL);
    writeLock();
    try {
      // Re-check after acquiring the lock; state may have changed.
      checkOperation(OperationCategory.JOURNAL);
      checkNameNodeSafeMode("Log not rolled");
      if (Server.isRpcInvocation()) {
        LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
      }
      return getFSImage().rollEditLog();
    } finally {
      writeUnlock();
    }
  }
5208
  /**
   * Start a checkpoint on behalf of a backup node. Uses the retry cache
   * so a retried RPC replays the original NamenodeCommand instead of
   * starting a second checkpoint.
   * @param backupNode registration of the requesting backup node
   * @param activeNamenode registration of this namenode
   * @return the command the backup node should execute
   * @throws IOException
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration backupNode,
      NamenodeRegistration activeNamenode) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
        null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      // A previous attempt of this RPC succeeded; replay its result.
      return (NamenodeCommand) cacheEntry.getPayload();
    }
    writeLock();
    NamenodeCommand cmd = null;
    try {
      checkOperation(OperationCategory.CHECKPOINT);

      checkNameNodeSafeMode("Checkpoint not started");
      LOG.info("Start checkpoint for " + backupNode.getAddress());
      cmd = getFSImage().startCheckpoint(backupNode, activeNamenode);
      getEditLog().logSync();
      return cmd;
    } finally {
      writeUnlock();
      // Record success (cmd != null) plus payload for future retries.
      RetryCache.setState(cacheEntry, cmd != null, cmd);
    }
  }
5232
  /**
   * Forward an incremental block report (received/deleted replicas) from
   * a datanode to the block manager, under the namesystem write lock.
   * @param nodeID reporting datanode
   * @param poolId block pool the report applies to
   * @param blockInfos the received/deleted block records
   * @throws IOException
   */
  public void processIncrementalBlockReport(final DatanodeID nodeID,
      final String poolId, final ReceivedDeletedBlockInfo blockInfos[])
      throws IOException {
    writeLock();
    try {
      blockManager.processIncrementalBlockReport(nodeID, poolId, blockInfos);
    } finally {
      writeUnlock();
    }
  }
5243
  /**
   * Complete a checkpoint previously started via startCheckpoint.
   * Idempotent through the retry cache: a retried RPC returns without
   * re-running the end-checkpoint work.
   * @param registration registration of the checkpointing node
   * @param sig signature identifying the checkpoint being ended
   * @throws IOException
   */
  void endCheckpoint(NamenodeRegistration registration,
      CheckpointSignature sig) throws IOException {
    checkOperation(OperationCategory.CHECKPOINT);
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    boolean success = false;
    // NOTE: only the read lock is taken here (unlike startCheckpoint).
    readLock();
    try {
      checkOperation(OperationCategory.CHECKPOINT);

      checkNameNodeSafeMode("Checkpoint not ended");
      LOG.info("End checkpoint for " + registration.getAddress());
      getFSImage().endCheckpoint(sig);
      success = true;
    } finally {
      readUnlock();
      RetryCache.setState(cacheEntry, success);
    }
  }
5265
5266 PermissionStatus createFsOwnerPermissions(FsPermission permission) {
5267 return new PermissionStatus(fsOwner.getShortUserName(), supergroup, permission);
5268 }
5269
  /** Require that the caller owns {@code path}. */
  private void checkOwner(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, true, null, null, null, null);
  }

  /** Require {@code access} on the path itself (the {@code access} slot). */
  private void checkPathAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, access, null);
  }

  /** Require {@code access} on the path's parent (the parentAccess slot). */
  private void checkParentAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, null, access, null, null);
  }

  /** Require {@code access} on the path's ancestor (the ancestorAccess slot). */
  private void checkAncestorAccess(FSPermissionChecker pc,
      String path, FsAction access) throws AccessControlException,
      UnresolvedLinkException {
    checkPermission(pc, path, false, access, null, null, null);
  }

  /** Require only traverse permission along the path (no other checks). */
  private void checkTraverse(FSPermissionChecker pc, String path)
      throws AccessControlException, UnresolvedLinkException {
    checkPermission(pc, path, false, null, null, null, null);
  }
5297
5298 @Override
5299 public void checkSuperuserPrivilege()
5300 throws AccessControlException {
5301 if (isPermissionEnabled) {
5302 FSPermissionChecker pc = getPermissionChecker();
5303 pc.checkSuperuserPrivilege();
5304 }
5305 }
5306
5307 /**
5308 * Check whether current user have permissions to access the path. For more
5309 * details of the parameters, see
5310 * {@link FSPermissionChecker#checkPermission()}.
5311 */
5312 private void checkPermission(FSPermissionChecker pc,
5313 String path, boolean doCheckOwner, FsAction ancestorAccess,
5314 FsAction parentAccess, FsAction access, FsAction subAccess)
5315 throws AccessControlException, UnresolvedLinkException {
5316 checkPermission(pc, path, doCheckOwner, ancestorAccess,
5317 parentAccess, access, subAccess, true);
5318 }
5319
5320 /**
5321 * Check whether current user have permissions to access the path. For more
5322 * details of the parameters, see
5323 * {@link FSPermissionChecker#checkPermission()}.
5324 */
5325 private void checkPermission(FSPermissionChecker pc,
5326 String path, boolean doCheckOwner, FsAction ancestorAccess,
5327 FsAction parentAccess, FsAction access, FsAction subAccess,
5328 boolean resolveLink)
5329 throws AccessControlException, UnresolvedLinkException {
5330 if (!pc.isSuperUser()) {
5331 dir.waitForReady();
5332 readLock();
5333 try {
5334 pc.checkPermission(path, dir.rootDir, doCheckOwner, ancestorAccess,
5335 parentAccess, access, subAccess, resolveLink);
5336 } finally {
5337 readUnlock();
5338 }
5339 }
5340 }
5341
5342 /**
5343 * Check to see if we have exceeded the limit on the number
5344 * of inodes.
5345 */
5346 void checkFsObjectLimit() throws IOException {
5347 if (maxFsObjects != 0 &&
5348 maxFsObjects <= dir.totalInodes() + getBlocksTotal()) {
5349 throw new IOException("Exceeded the configured number of objects " +
5350 maxFsObjects + " in the filesystem.");
5351 }
5352 }
5353
5354 /**
5355 * Get the total number of objects in the system.
5356 */
5357 long getMaxObjects() {
5358 return maxFsObjects;
5359 }
5360
5361 @Override // FSNamesystemMBean
5362 @Metric
5363 public long getFilesTotal() {
5364 readLock();
5365 try {
5366 return this.dir.totalInodes();
5367 } finally {
5368 readUnlock();
5369 }
5370 }
5371
  /** @return number of blocks with pending replication work. */
  @Override // FSNamesystemMBean
  @Metric
  public long getPendingReplicationBlocks() {
    return blockManager.getPendingReplicationBlocksCount();
  }

  /** @return number of under-replicated blocks. */
  @Override // FSNamesystemMBean
  @Metric
  public long getUnderReplicatedBlocks() {
    return blockManager.getUnderReplicatedBlocksCount();
  }

  /** Returns number of blocks with corrupt replicas */
  @Metric({"CorruptBlocks", "Number of blocks with corrupt replicas"})
  public long getCorruptReplicaBlocks() {
    return blockManager.getCorruptReplicaBlocksCount();
  }

  /** @return number of blocks with replication currently scheduled. */
  @Override // FSNamesystemMBean
  @Metric
  public long getScheduledReplicationBlocks() {
    return blockManager.getScheduledReplicationBlocksCount();
  }

  /** @return number of blocks pending deletion. */
  @Metric
  public long getPendingDeletionBlocks() {
    return blockManager.getPendingDeletionBlocksCount();
  }

  /** @return number of excess block replicas. */
  @Metric
  public long getExcessBlocks() {
    return blockManager.getExcessBlocksCount();
  }

  // HA-only metric
  /** @return number of postponed misreplicated blocks. */
  @Metric
  public long getPostponedMisreplicatedBlocks() {
    return blockManager.getPostponedMisreplicatedBlocksCount();
  }

  // HA-only metric
  /** @return number of pending datanode messages held by the block manager. */
  @Metric
  public int getPendingDataNodeMessageCount() {
    return blockManager.getPendingDataNodeMessageCount();
  }

  // HA-only metric
  /** @return current HA state rendered as a string. */
  @Metric
  public String getHAState() {
    return haContext.getState().toString();
  }

  // HA-only metric
  /** @return ms since edits were last loaded; 0 when not in standby state. */
  @Metric
  public long getMillisSinceLastLoadedEdits() {
    if (isInStandbyState() && editLogTailer != null) {
      return now() - editLogTailer.getLastLoadTimestamp();
    } else {
      return 0;
    }
  }

  /** @return capacity reported by the block manager. */
  @Metric
  public int getBlockCapacity() {
    return blockManager.getCapacity();
  }
5438
5439 @Override // FSNamesystemMBean
5440 public String getFSState() {
5441 return isInSafeMode() ? "safeMode" : "Operational";
5442 }
5443
  /** JMX name of the registered FSNamesystemState MBean; null until registered. */
  private ObjectName mbeanName;
5445
5446 /**
5447 * Register the FSNamesystem MBean using the name
5448 * "hadoop:service=NameNode,name=FSNamesystemState"
5449 */
5450 private void registerMBean() {
5451 // We can only implement one MXBean interface, so we keep the old one.
5452 try {
5453 StandardMBean bean = new StandardMBean(this, FSNamesystemMBean.class);
5454 mbeanName = MBeans.register("NameNode", "FSNamesystemState", bean);
5455 } catch (NotCompliantMBeanException e) {
5456 throw new RuntimeException("Bad MBean setup", e);
5457 }
5458
5459 LOG.info("Registered FSNamesystemState MBean");
5460 }
5461
5462 /**
5463 * shutdown FSNamesystem
5464 */
5465 void shutdown() {
5466 if (mbeanName != null) {
5467 MBeans.unregister(mbeanName);
5468 }
5469 if (dir != null) {
5470 dir.shutdown();
5471 }
5472 if (blockManager != null) {
5473 blockManager.shutdown();
5474 }
5475 }
5476
5477
  /** @return number of live datanodes, per the datanode manager. */
  @Override // FSNamesystemMBean
  public int getNumLiveDataNodes() {
    return getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  }

  /** @return number of dead datanodes, per the datanode manager. */
  @Override // FSNamesystemMBean
  public int getNumDeadDataNodes() {
    return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  }

  /** @return number of datanodes currently considered stale. */
  @Override // FSNamesystemMBean
  @Metric({"StaleDataNodes",
    "Number of datanodes marked stale due to delayed heartbeat"})
  public int getNumStaleDataNodes() {
    return getBlockManager().getDatanodeManager().getNumStaleNodes();
  }
5494
5495 /**
5496 * Sets the current generation stamp for legacy blocks
5497 */
5498 void setGenerationStampV1(long stamp) {
5499 generationStampV1.setCurrentValue(stamp);
5500 }
5501
5502 /**
5503 * Gets the current generation stamp for legacy blocks
5504 */
5505 long getGenerationStampV1() {
5506 return generationStampV1.getCurrentValue();
5507 }
5508
5509 /**
5510 * Gets the current generation stamp for this filesystem
5511 */
5512 void setGenerationStampV2(long stamp) {
5513 generationStampV2.setCurrentValue(stamp);
5514 }
5515
5516 /**
5517 * Gets the current generation stamp for this filesystem
5518 */
5519 long getGenerationStampV2() {
5520 return generationStampV2.getCurrentValue();
5521 }
5522
5523 /**
5524 * Upgrades the generation stamp for the filesystem
5525 * by reserving a sufficient range for all existing blocks.
5526 * Should be invoked only during the first upgrade to
5527 * sequential block IDs.
5528 */
5529 long upgradeGenerationStampToV2() {
5530 Preconditions.checkState(generationStampV2.getCurrentValue() ==
5531 GenerationStamp.LAST_RESERVED_STAMP);
5532
5533 generationStampV2.skipTo(
5534 generationStampV1.getCurrentValue() +
5535 HdfsConstants.RESERVED_GENERATION_STAMPS_V1);
5536
5537 generationStampV1Limit = generationStampV2.getCurrentValue();
5538 return generationStampV2.getCurrentValue();
5539 }
5540
5541 /**
5542 * Sets the generation stamp that delineates random and sequentially
5543 * allocated block IDs.
5544 * @param stamp
5545 */
5546 void setGenerationStampV1Limit(long stamp) {
5547 Preconditions.checkState(generationStampV1Limit ==
5548 GenerationStamp.GRANDFATHER_GENERATION_STAMP);
5549 generationStampV1Limit = stamp;
5550 }
5551
5552 /**
5553 * Gets the value of the generation stamp that delineates sequential
5554 * and random block IDs.
5555 */
5556 long getGenerationStampAtblockIdSwitch() {
5557 return generationStampV1Limit;
5558 }
5559
  // Test-only access to the sequential block ID generator.
  @VisibleForTesting
  SequentialBlockIdGenerator getBlockIdGenerator() {
    return blockIdGenerator;
  }

  /**
   * Sets the maximum allocated block ID for this filesystem. This is
   * the basis for allocating new block IDs.
   */
  void setLastAllocatedBlockId(long blockId) {
    blockIdGenerator.skipTo(blockId);
  }

  /**
   * Gets the maximum sequentially allocated block ID for this filesystem
   */
  long getLastAllocatedBlockId() {
    return blockIdGenerator.getCurrentValue();
  }
5579
5580 /**
5581 * Increments, logs and then returns the stamp
5582 */
5583 long nextGenerationStamp(boolean legacyBlock)
5584 throws IOException, SafeModeException {
5585 assert hasWriteLock();
5586 checkNameNodeSafeMode("Cannot get next generation stamp");
5587
5588 long gs;
5589 if (legacyBlock) {
5590 gs = getNextGenerationStampV1();
5591 getEditLog().logGenerationStampV1(gs);
5592 } else {
5593 gs = getNextGenerationStampV2();
5594 getEditLog().logGenerationStampV2(gs);
5595 }
5596
5597 // NB: callers sync the log
5598 return gs;
5599 }
5600
5601 @VisibleForTesting
5602 long getNextGenerationStampV1() throws IOException {
5603 long genStampV1 = generationStampV1.nextValue();
5604
5605 if (genStampV1 >= generationStampV1Limit) {
5606 // We ran out of generation stamps for legacy blocks. In practice, it
5607 // is extremely unlikely as we reserved 1T v1 generation stamps. The
5608 // result is that we can no longer append to the legacy blocks that
5609 // were created before the upgrade to sequential block IDs.
5610 throw new OutOfV1GenerationStampsException();
5611 }
5612
5613 return genStampV1;
5614 }
5615
  // Allocate the next sequential (v2) generation stamp.
  @VisibleForTesting
  long getNextGenerationStampV2() {
    return generationStampV2.nextValue();
  }

  /** @return the stamp delineating v1 (legacy) from v2 generation stamps. */
  long getGenerationStampV1Limit() {
    return generationStampV1Limit;
  }
5624
5625 /**
5626 * Determine whether the block ID was randomly generated (legacy) or
5627 * sequentially generated. The generation stamp value is used to
5628 * make the distinction.
5629 * @param block
5630 * @return true if the block ID was randomly generated, false otherwise.
5631 */
5632 boolean isLegacyBlock(Block block) {
5633 return block.getGenerationStamp() < getGenerationStampV1Limit();
5634 }
5635
5636 /**
5637 * Increments, logs and then returns the block ID
5638 */
5639 private long nextBlockId() throws IOException {
5640 assert hasWriteLock();
5641 checkNameNodeSafeMode("Cannot get next block ID");
5642 final long blockId = blockIdGenerator.nextValue();
5643 getEditLog().logAllocateBlockId(blockId);
5644 // NB: callers sync the log
5645 return blockId;
5646 }
5647
5648 private INodeFileUnderConstruction checkUCBlock(ExtendedBlock block,
5649 String clientName) throws IOException {
5650 assert hasWriteLock();
5651 checkNameNodeSafeMode("Cannot get a new generation stamp and an "
5652 + "access token for block " + block);
5653
5654 // check stored block state
5655 BlockInfo storedBlock = getStoredBlock(ExtendedBlock.getLocalBlock(block));
5656 if (storedBlock == null ||
5657 storedBlock.getBlockUCState() != BlockUCState.UNDER_CONSTRUCTION) {
5658 throw new IOException(block +
5659 " does not exist or is not under Construction" + storedBlock);
5660 }
5661
5662 // check file inode
5663 final INodeFile file = ((INode)storedBlock.getBlockCollection()).asFile();
5664 if (file==null || !file.isUnderConstruction()) {
5665 throw new IOException("The file " + storedBlock +
5666 " belonged to does not exist or it is not under construction.");
5667 }
5668
5669 // check lease
5670 INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction)file;
5671 if (clientName == null || !clientName.equals(pendingFile.getClientName())) {
5672 throw new LeaseExpiredException("Lease mismatch: " + block +
5673 " is accessed by a non lease holder " + clientName);
5674 }
5675
5676 return pendingFile;
5677 }
5678
5679 /**
5680 * Client is reporting some bad block locations.
5681 */
5682 void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
5683 checkOperation(OperationCategory.WRITE);
5684 NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
5685 writeLock();
5686 try {
5687 checkOperation(OperationCategory.WRITE);
5688 for (int i = 0; i < blocks.length; i++) {
5689 ExtendedBlock blk = blocks[i].getBlock();
5690 DatanodeInfo[] nodes = blocks[i].getLocations();
5691 for (int j = 0; j < nodes.length; j++) {
5692 DatanodeInfo dn = nodes[j];
5693 blockManager.findAndMarkBlockAsCorrupt(blk, dn,
5694 "client machine reported it");
5695 }
5696 }
5697 } finally {
5698 writeUnlock();
5699 }
5700 }
5701
5702 /**
5703 * Get a new generation stamp together with an access token for
5704 * a block under construction
5705 *
5706 * This method is called for recovering a failed pipeline or setting up
5707 * a pipeline to append to a block.
5708 *
5709 * @param block a block
5710 * @param clientName the name of a client
5711 * @return a located block with a new generation stamp and an access token
5712 * @throws IOException if any error occurs
5713 */
5714 LocatedBlock updateBlockForPipeline(ExtendedBlock block,
5715 String clientName) throws IOException {
5716 LocatedBlock locatedBlock;
5717 checkOperation(OperationCategory.WRITE);
5718 writeLock();
5719 try {
5720 checkOperation(OperationCategory.WRITE);
5721
5722 // check vadility of parameters
5723 checkUCBlock(block, clientName);
5724
5725 // get a new generation stamp and an access token
5726 block.setGenerationStamp(
5727 nextGenerationStamp(isLegacyBlock(block.getLocalBlock())));
5728 locatedBlock = new LocatedBlock(block, new DatanodeInfo[0]);
5729 blockManager.setBlockToken(locatedBlock, AccessMode.WRITE);
5730 } finally {
5731 writeUnlock();
5732 }
5733 // Ensure we record the new generation stamp
5734 getEditLog().logSync();
5735 return locatedBlock;
5736 }
5737
5738 /**
5739 * Update a pipeline for a block under construction
5740 *
5741 * @param clientName the name of the client
5742 * @param oldBlock and old block
5743 * @param newBlock a new block with a new generation stamp and length
5744 * @param newNodes datanodes in the pipeline
5745 * @throws IOException if any error occurs
5746 */
5747 void updatePipeline(String clientName, ExtendedBlock oldBlock,
5748 ExtendedBlock newBlock, DatanodeID[] newNodes)
5749 throws IOException {
5750 checkOperation(OperationCategory.WRITE);
5751 CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
5752 if (cacheEntry != null && cacheEntry.isSuccess()) {
5753 return; // Return previous response
5754 }
5755 LOG.info("updatePipeline(block=" + oldBlock
5756 + ", newGenerationStamp=" + newBlock.getGenerationStamp()
5757 + ", newLength=" + newBlock.getNumBytes()
5758 + ", newNodes=" + Arrays.asList(newNodes)
5759 + ", clientName=" + clientName
5760 + ")");
5761 writeLock();
5762 boolean success = false;
5763 try {
5764 checkOperation(OperationCategory.WRITE);
5765 checkNameNodeSafeMode("Pipeline not updated");
5766 assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
5767 + oldBlock + " has different block identifier";
5768 updatePipelineInternal(clientName, oldBlock, newBlock, newNodes,
5769 cacheEntry != null);
5770 success = true;
5771 } finally {
5772 writeUnlock();
5773 RetryCache.setState(cacheEntry, success);
5774 }
5775 getEditLog().logSync();
5776 LOG.info("updatePipeline(" + oldBlock + ") successfully to " + newBlock);
5777 }
5778
  /** @see #updatePipeline(String, ExtendedBlock, ExtendedBlock, DatanodeID[]) */
  private void updatePipelineInternal(String clientName, ExtendedBlock oldBlock,
      ExtendedBlock newBlock, DatanodeID[] newNodes, boolean logRetryCache)
      throws IOException {
    assert hasWriteLock();
    // check the validity of the block and lease holder name
    final INodeFileUnderConstruction pendingFile
        = checkUCBlock(oldBlock, clientName);
    final BlockInfoUnderConstruction blockinfo
        = (BlockInfoUnderConstruction)pendingFile.getLastBlock();

    // check new GS & length: this is not expected
    if (newBlock.getGenerationStamp() <= blockinfo.getGenerationStamp() ||
        newBlock.getNumBytes() < blockinfo.getNumBytes()) {
      String msg = "Update " + oldBlock + " (len = " +
        blockinfo.getNumBytes() + ") to an older state: " + newBlock +
        " (len = " + newBlock.getNumBytes() +")";
      LOG.warn(msg);
      throw new IOException(msg);
    }

    // Update old block with the new generation stamp and new length
    blockinfo.setNumBytes(newBlock.getNumBytes());
    blockinfo.setGenerationStampAndVerifyReplicas(newBlock.getGenerationStamp());

    // find the DatanodeDescriptor objects
    final DatanodeManager dm = getBlockManager().getDatanodeManager();
    DatanodeDescriptor[] descriptors = null;
    if (newNodes.length > 0) {
      descriptors = new DatanodeDescriptor[newNodes.length];
      for(int i = 0; i < newNodes.length; i++) {
        descriptors[i] = dm.getDatanode(newNodes[i]);
      }
    }
    blockinfo.setExpectedLocations(descriptors);

    // Persist the updated block list for the file to the edit log.
    String src = leaseManager.findPath(pendingFile);
    dir.persistBlocks(src, pendingFile, logRetryCache);
  }
5818
  // rename was successful. If any part of the renamed subtree had
  // files that were being written to, update with new filename.
  // Caller must hold the write lock; no edit log entry is written here.
  void unprotectedChangeLease(String src, String dst) {
    assert hasWriteLock();
    leaseManager.changeLease(src, dst);
  }
5825
5826 /**
5827 * Serializes leases.
5828 */
5829 void saveFilesUnderConstruction(DataOutputStream out,
5830 Map<Long, INodeFileUnderConstruction> snapshotUCMap) throws IOException {
5831 // This is run by an inferior thread of saveNamespace, which holds a read
5832 // lock on our behalf. If we took the read lock here, we could block
5833 // for fairness if a writer is waiting on the lock.
5834 synchronized (leaseManager) {
5835 Map<String, INodeFileUnderConstruction> nodes =
5836 leaseManager.getINodesUnderConstruction();
5837 for (Map.Entry<String, INodeFileUnderConstruction> entry
5838 : nodes.entrySet()) {
5839 // TODO: for HDFS-5428, because of rename operations, some
5840 // under-construction files that are
5841 // in the current fs directory can also be captured in the
5842 // snapshotUCMap. We should remove them from the snapshotUCMap.
5843 snapshotUCMap.remove(entry.getValue().getId());
5844 }
5845
5846 out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
5847 for (Map.Entry<String, INodeFileUnderConstruction> entry
5848 : nodes.entrySet()) {
5849 FSImageSerialization.writeINodeUnderConstruction(
5850 out, entry.getValue(), entry.getKey());
5851 }
5852 for (Map.Entry<Long, INodeFileUnderConstruction> entry
5853 : snapshotUCMap.entrySet()) {
5854 // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
5855 // as their paths
5856 StringBuilder b = new StringBuilder();
5857 b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
5858 .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
5859 .append(Path.SEPARATOR).append(entry.getValue().getId());
5860 FSImageSerialization.writeINodeUnderConstruction(
5861 out, entry.getValue(), b.toString());
5862 }
5863 }
5864 }
5865
5866 /**
5867 * Register a Backup name-node, verifying that it belongs
5868 * to the correct namespace, and adding it to the set of
5869 * active journals if necessary.
5870 *
5871 * @param bnReg registration of the new BackupNode
5872 * @param nnReg registration of this NameNode
5873 * @throws IOException if the namespace IDs do not match
5874 */
5875 void registerBackupNode(NamenodeRegistration bnReg,
5876 NamenodeRegistration nnReg) throws IOException {
5877 writeLock();
5878 try {
5879 if(getFSImage().getStorage().getNamespaceID()
5880 != bnReg.getNamespaceID())
5881 throw new IOException("Incompatible namespaceIDs: "
5882 + " Namenode namespaceID = "
5883 + getFSImage().getStorage().getNamespaceID() + "; "
5884 + bnReg.getRole() +
5885 " node namespaceID = " + bnReg.getNamespaceID());
5886 if (bnReg.getRole() == NamenodeRole.BACKUP) {
5887 getFSImage().getEditLog().registerBackupNode(
5888 bnReg, nnReg);
5889 }
5890 } finally {
5891 writeUnlock();
5892 }
5893 }
5894
5895 /**
5896 * Release (unregister) backup node.
5897 * <p>
5898 * Find and remove the backup stream corresponding to the node.
5899 * @param registration
5900 * @throws IOException
5901 */
5902 void releaseBackupNode(NamenodeRegistration registration)
5903 throws IOException {
5904 checkOperation(OperationCategory.WRITE);
5905 writeLock();
5906 try {
5907 checkOperation(OperationCategory.WRITE);
5908 if(getFSImage().getStorage().getNamespaceID()
5909 != registration.getNamespaceID())
5910 throw new IOException("Incompatible namespaceIDs: "
5911 + " Namenode namespaceID = "
5912 + getFSImage().getStorage().getNamespaceID() + "; "
5913 + registration.getRole() +
5914 " node namespaceID = " + registration.getNamespaceID());
5915 getEditLog().releaseBackupStream(registration);
5916 } finally {
5917 writeUnlock();
5918 }
5919 }
5920
5921 static class CorruptFileBlockInfo {
5922 String path;
5923 Block block;
5924
5925 public CorruptFileBlockInfo(String p, Block b) {
5926 path = p;
5927 block = b;
5928 }
5929
5930 @Override
5931 public String toString() {
5932 return block.getBlockName() + "\t" + path;
5933 }
5934 }
5935 /**
5936 * @param path Restrict corrupt files to this portion of namespace.
5937 * @param startBlockAfter Support for continuation; the set of files we return
5938 * back is ordered by blockid; startBlockAfter tells where to start from
5939 * @return a list in which each entry describes a corrupt file/block
5940 * @throws AccessControlException
5941 * @throws IOException
5942 */
5943 Collection<CorruptFileBlockInfo> listCorruptFileBlocks(String path,
5944 String[] cookieTab) throws IOException {
5945 checkSuperuserPrivilege();
5946 checkOperation(OperationCategory.READ);
5947 readLock();
5948 try {
5949 checkOperation(OperationCategory.READ);
5950 if (!isPopulatingReplQueues()) {
5951 throw new IOException("Cannot run listCorruptFileBlocks because " +
5952 "replication queues have not been initialized.");
5953 }
5954 // print a limited # of corrupt files per call
5955 int count = 0;
5956 ArrayList<CorruptFileBlockInfo> corruptFiles = new ArrayList<CorruptFileBlockInfo>();
5957
5958 final Iterator<Block> blkIterator = blockManager.getCorruptReplicaBlockIterator();
5959
5960 if (cookieTab == null) {
5961 cookieTab = new String[] { null };
5962 }
5963 int skip = getIntCookie(cookieTab[0]);
5964 for (int i = 0; i < skip && blkIterator.hasNext(); i++) {
5965 blkIterator.next();
5966 }
5967
5968 while (blkIterator.hasNext()) {
5969 Block blk = blkIterator.next();
5970 final INode inode = (INode)blockManager.getBlockCollection(blk);
5971 skip++;
5972 if (inode != null && blockManager.countNodes(blk).liveReplicas() == 0) {
5973 String src = FSDirectory.getFullPathName(inode);
5974 if (src.startsWith(path)){
5975 corruptFiles.add(new CorruptFileBlockInfo(src, blk));
5976 count++;
5977 if (count >= DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED)
5978 break;
5979 }
5980 }
5981 }
5982 cookieTab[0] = String.valueOf(skip);
5983 LOG.info("list corrupt file blocks returned: " + count);
5984 return corruptFiles;
5985 } finally {
5986 readUnlock();
5987 }
5988 }
5989
5990 /**
5991 * Convert string cookie to integer.
5992 */
5993 private static int getIntCookie(String cookie){
5994 int c;
5995 if(cookie == null){
5996 c = 0;
5997 } else {
5998 try{
5999 c = Integer.parseInt(cookie);
6000 }catch (NumberFormatException e) {
6001 c = 0;
6002 }
6003 }
6004 c = Math.max(0, c);
6005 return c;
6006 }
6007
6008 /**
6009 * Create delegation token secret manager
6010 */
6011 private DelegationTokenSecretManager createDelegationTokenSecretManager(
6012 Configuration conf) {
6013 return new DelegationTokenSecretManager(conf.getLong(
6014 DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY,
6015 DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT),
6016 conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY,
6017 DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT),
6018 conf.getLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
6019 DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT),
6020 DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL,
6021 conf.getBoolean(DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
6022 DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT),
6023 this);
6024 }
6025
6026 /**
6027 * Returns the DelegationTokenSecretManager instance in the namesystem.
6028 * @return delegation token secret manager object
6029 */
6030 DelegationTokenSecretManager getDelegationTokenSecretManager() {
6031 return dtSecretManager;
6032 }
6033
6034 /**
6035 * @param renewer
6036 * @return Token<DelegationTokenIdentifier>
6037 * @throws IOException
6038 */
6039 Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
6040 throws IOException {
6041 Token<DelegationTokenIdentifier> token;
6042 checkOperation(OperationCategory.WRITE);
6043 writeLock();
6044 try {
6045 checkOperation(OperationCategory.WRITE);
6046 checkNameNodeSafeMode("Cannot issue delegation token");
6047 if (!isAllowedDelegationTokenOp()) {
6048 throw new IOException(
6049 "Delegation Token can be issued only with kerberos or web authentication");
6050 }
6051 if (dtSecretManager == null || !dtSecretManager.isRunning()) {
6052 LOG.warn("trying to get DT with no secret manager running");
6053 return null;
6054 }
6055
6056 UserGroupInformation ugi = getRemoteUser();
6057 String user = ugi.getUserName();
6058 Text owner = new Text(user);
6059 Text realUser = null;
6060 if (ugi.getRealUser() != null) {
6061 realUser = new Text(ugi.getRealUser().getUserName());
6062 }
6063 DelegationTokenIdentifier dtId = new DelegationTokenIdentifier(owner,
6064 renewer, realUser);
6065 token = new Token<DelegationTokenIdentifier>(
6066 dtId, dtSecretManager);
6067 long expiryTime = dtSecretManager.getTokenExpiryTime(dtId);
6068 getEditLog().logGetDelegationToken(dtId, expiryTime);
6069 } finally {
6070 writeUnlock();
6071 }
6072 getEditLog().logSync();
6073 return token;
6074 }
6075
6076 /**
6077 *
6078 * @param token
6079 * @return New expiryTime of the token
6080 * @throws InvalidToken
6081 * @throws IOException
6082 */
6083 long renewDelegationToken(Token<DelegationTokenIdentifier> token)
6084 throws InvalidToken, IOException {
6085 long expiryTime;
6086 checkOperation(OperationCategory.WRITE);
6087 writeLock();
6088 try {
6089 checkOperation(OperationCategory.WRITE);
6090
6091 checkNameNodeSafeMode("Cannot renew delegation token");
6092 if (!isAllowedDelegationTokenOp()) {
6093 throw new IOException(
6094 "Delegation Token can be renewed only with kerberos or web authentication");
6095 }
6096 String renewer = getRemoteUser().getShortUserName();
6097 expiryTime = dtSecretManager.renewToken(token, renewer);
6098 DelegationTokenIdentifier id = new DelegationTokenIdentifier();
6099 ByteArrayInputStream buf = new ByteArrayInputStream(token.getIdentifier());
6100 DataInputStream in = new DataInputStream(buf);
6101 id.readFields(in);
6102 getEditLog().logRenewDelegationToken(id, expiryTime);
6103 } finally {
6104 writeUnlock();
6105 }
6106 getEditLog().logSync();
6107 return expiryTime;
6108 }
6109
  /**
   * Cancel the given delegation token, invalidating it immediately.
   *
   * @param token the delegation token to cancel
   * @throws IOException if the NameNode is in safe mode, the token is
   *         invalid, or the caller is not permitted to cancel it
   */
  void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
      throws IOException {
    checkOperation(OperationCategory.WRITE);
    writeLock();
    try {
      // Re-check under the write lock in case the HA state changed.
      checkOperation(OperationCategory.WRITE);

      checkNameNodeSafeMode("Cannot cancel delegation token");
      String canceller = getRemoteUser().getUserName();
      DelegationTokenIdentifier id = dtSecretManager
        .cancelToken(token, canceller);
      getEditLog().logCancelDelegationToken(id);
    } finally {
      writeUnlock();
    }
    // Sync outside the lock so other operations are not blocked on I/O.
    getEditLog().logSync();
  }
6132
  /**
   * Save the delegation token secret manager state (keys and tokens),
   * e.g. when writing an FSImage. Delegates to the secret manager.
   *
   * @param out save state of the secret manager
   * @param sdPath String storage directory path
   */
  void saveSecretManagerState(DataOutputStream out, String sdPath)
      throws IOException {
    dtSecretManager.saveSecretManagerState(out, sdPath);
  }

  /**
   * Restore secret manager state previously written by
   * {@link #saveSecretManagerState}.
   *
   * @param in load the state of secret manager from input stream
   */
  void loadSecretManagerState(DataInput in) throws IOException {
    dtSecretManager.loadSecretManagerState(in);
  }
6148
  /**
   * Log the updateMasterKey operation to edit logs.
   * Called by the delegation token secret manager when it rolls a new
   * master key; the key must be persisted so tokens survive restart.
   *
   * @param key new delegation key.
   */
  public void logUpdateMasterKey(DelegationKey key) {

    assert !isInSafeMode() :
      "this should never be called while in safemode, since we stop " +
      "the DT manager before entering safemode!";
    // No need to hold FSN lock since we don't access any internal
    // structures, and this is stopped before the FSN shuts itself
    // down, etc.
    getEditLog().logUpdateMasterKey(key);
    getEditLog().logSync();
  }
6165
  /**
   * Log the cancellation of expired tokens to edit logs.
   * Note: unlike {@link #logUpdateMasterKey}, no logSync() is issued here;
   * the entry is synced with a later batch.
   *
   * @param id token identifier to cancel
   */
  public void logExpireDelegationToken(DelegationTokenIdentifier id) {
    assert !isInSafeMode() :
      "this should never be called while in safemode, since we stop " +
      "the DT manager before entering safemode!";
    // No need to hold FSN lock since we don't access any internal
    // structures, and this is stopped before the FSN shuts itself
    // down, etc.
    getEditLog().logCancelDelegationToken(id);
  }
6180
  /**
   * Record a lease reassignment in the edit log. Caller must hold the
   * FSN write lock (asserted below); the entry is synced by the caller.
   *
   * @param leaseHolder current holder of the lease
   * @param src path whose lease is being reassigned
   * @param newHolder new holder of the lease
   */
  private void logReassignLease(String leaseHolder, String src,
      String newHolder) {
    assert hasWriteLock();
    getEditLog().logReassignLease(leaseHolder, src, newHolder);
  }
6186
6187 /**
6188 *
6189 * @return true if delegation token operation is allowed
6190 */
6191 private boolean isAllowedDelegationTokenOp() throws IOException {
6192 AuthenticationMethod authMethod = getConnectionAuthenticationMethod();
6193 if (UserGroupInformation.isSecurityEnabled()
6194 && (authMethod != AuthenticationMethod.KERBEROS)
6195 && (authMethod != AuthenticationMethod.KERBEROS_SSL)
6196 && (authMethod != AuthenticationMethod.CERTIFICATE)) {
6197 return false;
6198 }
6199 return true;
6200 }
6201
6202 /**
6203 * Returns authentication method used to establish the connection
6204 * @return AuthenticationMethod used to establish connection
6205 * @throws IOException
6206 */
6207 private AuthenticationMethod getConnectionAuthenticationMethod()
6208 throws IOException {
6209 UserGroupInformation ugi = getRemoteUser();
6210 AuthenticationMethod authMethod = ugi.getAuthenticationMethod();
6211 if (authMethod == AuthenticationMethod.PROXY) {
6212 authMethod = ugi.getRealUser().getAuthenticationMethod();
6213 }
6214 return authMethod;
6215 }
6216
  /**
   * Client invoked methods are invoked over RPC and will be in
   * RPC call context even if the client exits.
   *
   * @return true when the current call originated from an RPC client or a
   *         WebHDFS request (as opposed to an internal invocation)
   */
  private boolean isExternalInvocation() {
    return Server.isRpcInvocation() || NamenodeWebHdfsMethods.isWebHdfsInvocation();
  }
6224
6225 private static InetAddress getRemoteIp() {
6226 InetAddress ip = Server.getRemoteIp();
6227 if (ip != null) {
6228 return ip;
6229 }
6230 return NamenodeWebHdfsMethods.getRemoteIp();
6231 }
6232
  // optimize ugi lookup for RPC operations to avoid a trip through
  // UGI.getCurrentUser which is synch'ed
  /** @return the UGI of the caller of the current RPC/WebHDFS request. */
  private static UserGroupInformation getRemoteUser() throws IOException {
    return NameNode.getRemoteUser();
  }
6238
  /**
   * Log fsck event in the audit log.
   *
   * @param src path fsck was run against
   * @param remoteAddress address of the client that requested the fsck
   * @throws IOException if the remote user cannot be determined
   */
  void logFsckEvent(String src, InetAddress remoteAddress) throws IOException {
    if (isAuditEnabled()) {
      logAuditEvent(true, getRemoteUser(),
                    remoteAddress,
                    "fsck", src, null, null);
    }
  }
  /**
   * Register NameNodeMXBean so this namesystem's stats are visible over JMX
   * under "NameNode:NameNodeInfo".
   */
  private void registerMXBean() {
    MBeans.register("NameNode", "NameNodeInfo", this);
  }
6255
  /**
   * NameNodeMXBean implementations: expose Namenode information for JMX
   * interfaces. Each getter below delegates to this namesystem or to the
   * datanode statistics.
   */
  @Override // NameNodeMXBean
  public String getVersion() {
    // Hadoop version plus the source revision it was built from.
    return VersionInfo.getVersion() + ", r" + VersionInfo.getRevision();
  }

  @Override // NameNodeMXBean
  public long getUsed() {
    // DFS capacity currently used, in bytes.
    return this.getCapacityUsed();
  }

  @Override // NameNodeMXBean
  public long getFree() {
    // DFS capacity remaining, in bytes.
    return this.getCapacityRemaining();
  }

  @Override // NameNodeMXBean
  public long getTotal() {
    // Total configured DFS capacity, in bytes.
    return this.getCapacityTotal();
  }

  @Override // NameNodeMXBean
  public String getSafemode() {
    // Empty string means safe mode is off; otherwise a human-readable tip.
    if (!this.isInSafeMode())
      return "";
    return "Safe mode is ON. " + this.getSafeModeTip();
  }

  @Override // NameNodeMXBean
  public boolean isUpgradeFinalized() {
    return this.getFSImage().isUpgradeFinalized();
  }

  @Override // NameNodeMXBean
  public long getNonDfsUsedSpace() {
    return datanodeStatistics.getCapacityUsedNonDFS();
  }

  @Override // NameNodeMXBean
  public float getPercentUsed() {
    return datanodeStatistics.getCapacityUsedPercent();
  }

  @Override // NameNodeMXBean
  public long getBlockPoolUsedSpace() {
    return datanodeStatistics.getBlockPoolUsed();
  }

  @Override // NameNodeMXBean
  public float getPercentBlockPoolUsed() {
    return datanodeStatistics.getPercentBlockPoolUsed();
  }

  @Override // NameNodeMXBean
  public float getPercentRemaining() {
    return datanodeStatistics.getCapacityRemainingPercent();
  }

  @Override // NameNodeMXBean
  public long getTotalBlocks() {
    return getBlocksTotal();
  }

  @Override // NameNodeMXBean
  @Metric
  public long getTotalFiles() {
    return getFilesTotal();
  }

  @Override // NameNodeMXBean
  public long getNumberOfMissingBlocks() {
    return getMissingBlocksCount();
  }

  @Override // NameNodeMXBean
  public int getThreads() {
    // Live thread count of this JVM, from the platform ThreadMXBean.
    return ManagementFactory.getThreadMXBean().getThreadCount();
  }
6336
6337 /**
6338 * Returned information is a JSON representation of map with host name as the
6339 * key and value is a map of live node attribute keys to its values
6340 */
6341 @Override // NameNodeMXBean
6342 public String getLiveNodes() {
6343 final Map<String, Map<String,Object>> info =
6344 new HashMap<String, Map<String,Object>>();
6345 final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
6346 blockManager.getDatanodeManager().fetchDatanodes(live, null, true);
6347 for (DatanodeDescriptor node : live) {
6348 final Map<String, Object> innerinfo = new HashMap<String, Object>();
6349 innerinfo.put("lastContact", getLastContact(node));
6350 innerinfo.put("usedSpace", getDfsUsed(node));
6351 innerinfo.put("adminState", node.getAdminState().toString());
6352 innerinfo.put("nonDfsUsedSpace", node.getNonDfsUsed());
6353 innerinfo.put("capacity", node.getCapacity());
6354 innerinfo.put("numBlocks", node.numBlocks());
6355 innerinfo.put("version", node.getSoftwareVersion());
6356 info.put(node.getHostName(), innerinfo);
6357 }
6358 return JSON.toString(info);
6359 }
6360
6361 /**
6362 * Returned information is a JSON representation of map with host name as the
6363 * key and value is a map of dead node attribute keys to its values
6364 */
6365 @Override // NameNodeMXBean
6366 public String getDeadNodes() {
6367 final Map<String, Map<String, Object>> info =
6368 new HashMap<String, Map<String, Object>>();
6369 final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
6370 blockManager.getDatanodeManager().fetchDatanodes(null, dead, true);
6371 for (DatanodeDescriptor node : dead) {
6372 final Map<String, Object> innerinfo = new HashMap<String, Object>();
6373 innerinfo.put("lastContact", getLastContact(node));
6374 innerinfo.put("decommissioned", node.isDecommissioned());
6375 info.put(node.getHostName(), innerinfo);
6376 }
6377 return JSON.toString(info);
6378 }
6379
  /**
   * Returned information is a JSON representation of map with host name as the
   * key and value is a map of decommissioning node attribute keys to its
   * values.
   */
  @Override // NameNodeMXBean
  public String getDecomNodes() {
    final Map<String, Map<String, Object>> info =
        new HashMap<String, Map<String, Object>>();
    final List<DatanodeDescriptor> decomNodeList = blockManager.getDatanodeManager(
        ).getDecommissioningNodes();
    for (DatanodeDescriptor node : decomNodeList) {
      final Map<String, Object> innerinfo = new HashMap<String, Object>();
      innerinfo.put("underReplicatedBlocks", node.decommissioningStatus
          .getUnderReplicatedBlocks());
      innerinfo.put("decommissionOnlyReplicas", node.decommissioningStatus
          .getDecommissionOnlyReplicas());
      // NOTE(review): key looks misspelled ("underReplicateInOpenFiles") but
      // it is a published JMX attribute name — do not change without checking
      // downstream consumers.
      innerinfo.put("underReplicateInOpenFiles", node.decommissioningStatus
          .getUnderReplicatedInOpenFiles());
      info.put(node.getHostName(), innerinfo);
    }
    return JSON.toString(info);
  }
6402
  /** @return seconds elapsed since the given datanode's last heartbeat. */
  private long getLastContact(DatanodeDescriptor alivenode) {
    return (Time.now() - alivenode.getLastUpdate())/1000;
  }

  /** @return bytes of DFS storage used on the given datanode. */
  private long getDfsUsed(DatanodeDescriptor alivenode) {
    return alivenode.getDfsUsed();
  }
6410
  @Override // NameNodeMXBean
  public String getClusterId() {
    // Cluster ID as recorded in the FSImage storage metadata.
    return dir.fsImage.getStorage().getClusterID();
  }

  @Override // NameNodeMXBean
  public String getBlockPoolId() {
    return blockPoolId;
  }
6420
6421 @Override // NameNodeMXBean
6422 public String getNameDirStatuses() {
6423 Map<String, Map<File, StorageDirType>> statusMap =
6424 new HashMap<String, Map<File, StorageDirType>>();
6425
6426 Map<File, StorageDirType> activeDirs = new HashMap<File, StorageDirType>();
6427 for (Iterator<StorageDirectory> it
6428 = getFSImage().getStorage().dirIterator(); it.hasNext();) {
6429 StorageDirectory st = it.next();
6430 activeDirs.put(st.getRoot(), st.getStorageDirType());
6431 }
6432 statusMap.put("active", activeDirs);
6433
6434 List<Storage.StorageDirectory> removedStorageDirs
6435 = getFSImage().getStorage().getRemovedStorageDirs();
6436 Map<File, StorageDirType> failedDirs = new HashMap<File, StorageDirType>();
6437 for (StorageDirectory st : removedStorageDirs) {
6438 failedDirs.put(st.getRoot(), st.getStorageDirType());
6439 }
6440 statusMap.put("failed", failedDirs);
6441
6442 return JSON.toString(statusMap);
6443 }
6444
6445 @Override // NameNodeMxBean
6446 public String getJournalTransactionInfo() {
6447 Map<String, String> txnIdMap = new HashMap<String, String>();
6448 txnIdMap.put("LastAppliedOrWrittenTxId",
6449 Long.toString(this.getFSImage().getLastAppliedOrWrittenTxId()));
6450 txnIdMap.put("MostRecentCheckpointTxId",
6451 Long.toString(this.getFSImage().getMostRecentCheckpointTxId()));
6452 return JSON.toString(txnIdMap);
6453 }
6454
  /** @return the block manager. */
  public BlockManager getBlockManager() {
    return blockManager;
  }

  /** @return the FSDirectory. */
  public FSDirectory getFSDirectory() {
    return dir;
  }
6463
  @Override //NameNodeMXBean
  public int getDistinctVersionCount() {
    // Number of distinct datanode software versions in the cluster.
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
      .size();
  }

  @Override //NameNodeMXBean
  public Map<String, Integer> getDistinctVersions() {
    // Map from datanode software version to how many nodes run it.
    return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  }

  @Override //NameNodeMXBean
  public String getSoftwareVersion() {
    // Software version of this NameNode itself.
    return VersionInfo.getVersion();
  }
6479
  /**
   * Verifies that the given identifier and password are valid and match.
   * If verification fails while this NameNode is still transitioning to
   * active, the failure is wrapped in a RetriableException so the client
   * retries once the transition completes (the token may simply not have
   * been loaded yet).
   *
   * @param identifier Token identifier.
   * @param password Password in the token.
   * @throws InvalidToken if the token does not verify
   * @throws RetriableException if verification failed during transition
   *         to active
   */
  public synchronized void verifyToken(DelegationTokenIdentifier identifier,
      byte[] password) throws InvalidToken, RetriableException {
    try {
      getDelegationTokenSecretManager().verifyToken(identifier, password);
    } catch (InvalidToken it) {
      if (inTransitionToActive()) {
        throw new RetriableException(it);
      }
      throw it;
    }
  }
6496
6497 @Override
6498 public boolean isGenStampInFuture(Block block) {
6499 if (isLegacyBlock(block)) {
6500 return block.getGenerationStamp() > getGenerationStampV1();
6501 } else {
6502 return block.getGenerationStamp() > getGenerationStampV2();
6503 }
6504 }
6505
  /** Test-only accessor for the standby edit log tailer. */
  @VisibleForTesting
  public EditLogTailer getEditLogTailer() {
    return editLogTailer;
  }

  /** Test-only setter to inject a mock/instrumented edit log tailer. */
  @VisibleForTesting
  public void setEditLogTailerForTests(EditLogTailer tailer) {
    this.editLogTailer = tailer;
  }

  /** Test-only setter to replace the namesystem lock (e.g. with a spy). */
  @VisibleForTesting
  void setFsLockForTests(ReentrantReadWriteLock lock) {
    this.fsLock = lock;
  }

  /** Test-only accessor for the namesystem lock. */
  @VisibleForTesting
  ReentrantReadWriteLock getFsLockForTests() {
    return fsLock;
  }

  /** Test-only accessor for the current safe-mode state object. */
  @VisibleForTesting
  public SafeModeInfo getSafeModeInfoForTests() {
    return safeMode;
  }

  /** Test-only setter to inject a NameNode resource checker. */
  @VisibleForTesting
  public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
    this.nnResourceChecker = nnResourceChecker;
  }
6535
  /**
   * Whether the datanode manager is currently avoiding stale datanodes
   * when choosing write targets; delegates the decision to it.
   */
  @Override
  public boolean isAvoidingStaleDataNodesForWrite() {
    return this.blockManager.getDatanodeManager()
        .shouldAvoidStaleDataNodesForWrite();
  }

  /** @return the snapshot manager. */
  public SnapshotManager getSnapshotManager() {
    return snapshotManager;
  }
6545
  /**
   * Allow snapshot on a directory (mark it snapshottable).
   * Requires superuser privilege; logged to the edit log and audited.
   *
   * @param path directory to mark snapshottable
   */
  void allowSnapshot(String path) throws SafeModeException, IOException {
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot allow snapshot for " + path);
      checkSuperuserPrivilege();

      // The directory tree mutation also needs FSDirectory's own lock.
      dir.writeLock();
      try {
        snapshotManager.setSnapshottable(path, true);
      } finally {
        dir.writeUnlock();
      }
      getEditLog().logAllowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // Sync outside the FSN lock.
    getEditLog().logSync();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "allowSnapshot", path, null, null);
    }
  }
6570
  /**
   * Disallow snapshot on a directory (clear its snapshottable flag).
   * Requires superuser privilege; logged to the edit log and audited.
   *
   * @param path directory to make non-snapshottable
   */
  void disallowSnapshot(String path) throws SafeModeException, IOException {
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot disallow snapshot for " + path);
      checkSuperuserPrivilege();

      // The directory tree mutation also needs FSDirectory's own lock.
      dir.writeLock();
      try {
        snapshotManager.resetSnapshottable(path);
      } finally {
        dir.writeUnlock();
      }
      getEditLog().logDisallowSnapshot(path);
    } finally {
      writeUnlock();
    }
    // Sync outside the FSN lock.
    getEditLog().logSync();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "disallowSnapshot", path, null, null);
    }
  }
6595
  /**
   * Create a snapshot.
   * Idempotent via the retry cache: a retried RPC returns the path from
   * the cached earlier success instead of creating a second snapshot.
   *
   * @param snapshotRoot The directory path where the snapshot is taken
   * @param snapshotName The name of the snapshot; if null or empty a
   *                     default name is generated
   * @return path of the created snapshot
   */
  String createSnapshot(String snapshotRoot, String snapshotName)
      throws SafeModeException, IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = getPermissionChecker();
    CacheEntryWithPayload cacheEntry = RetryCache.waitForCompletion(retryCache,
        null);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      // Retried request: return the previously computed snapshot path.
      return (String) cacheEntry.getPayload();
    }
    writeLock();
    String snapshotPath = null;
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot create snapshot for " + snapshotRoot);
      if (isPermissionEnabled) {
        // Only the owner of the snapshottable directory may snapshot it.
        checkOwner(pc, snapshotRoot);
      }

      if (snapshotName == null || snapshotName.isEmpty()) {
        snapshotName = Snapshot.generateDefaultSnapshotName();
      }
      dir.verifySnapshotName(snapshotName, snapshotRoot);
      dir.writeLock();
      try {
        snapshotPath = snapshotManager.createSnapshot(snapshotRoot, snapshotName);
      } finally {
        dir.writeUnlock();
      }
      getEditLog().logCreateSnapshot(snapshotRoot, snapshotName,
          cacheEntry != null);
    } finally {
      writeUnlock();
      // Record outcome in the retry cache; success iff a path was produced.
      RetryCache.setState(cacheEntry, snapshotPath != null, snapshotPath);
    }
    getEditLog().logSync();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "createSnapshot", snapshotRoot, snapshotPath, null);
    }
    return snapshotPath;
  }
6642
  /**
   * Rename a snapshot.
   * Idempotent via the retry cache: a retried RPC that already succeeded
   * returns immediately.
   *
   * @param path The directory path where the snapshot was taken
   * @param snapshotOldName Old snapshot name
   * @param snapshotNewName New snapshot name
   * @throws SafeModeException
   * @throws IOException
   */
  void renameSnapshot(String path, String snapshotOldName,
      String snapshotNewName) throws SafeModeException, IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = getPermissionChecker();
    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    writeLock();
    boolean success = false;
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot rename snapshot for " + path);
      if (isPermissionEnabled) {
        // Only the owner of the snapshottable directory may rename.
        checkOwner(pc, path);
      }
      dir.verifySnapshotName(snapshotNewName, path);

      snapshotManager.renameSnapshot(path, snapshotOldName, snapshotNewName);
      getEditLog().logRenameSnapshot(path, snapshotOldName, snapshotNewName,
          cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      RetryCache.setState(cacheEntry, success);
    }
    getEditLog().logSync();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      // Audit with the full old and new snapshot paths.
      String oldSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotOldName);
      String newSnapshotRoot = Snapshot.getSnapshotPath(path, snapshotNewName);
      logAuditEvent(true, "renameSnapshot", oldSnapshotRoot, newSnapshotRoot, null);
    }
  }
6685
  /**
   * Get the list of snapshottable directories that are owned
   * by the current user. Return all the snapshottable directories if the
   * current user is a super user.
   * @return The list of all the current snapshottable directories
   * @throws IOException
   */
  public SnapshottableDirectoryStatus[] getSnapshottableDirListing()
      throws IOException {
    SnapshottableDirectoryStatus[] status = null;
    final FSPermissionChecker checker = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      // null user means "no owner filter", i.e. list everything.
      final String user = checker.isSuperUser()? null : checker.getUser();
      status = snapshotManager.getSnapshottableDirListing(user);
    } finally {
      readUnlock();
    }
    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "listSnapshottableDirectory", null, null, null);
    }
    return status;
  }
6710
  /**
   * Get the difference between two snapshots (or between a snapshot and the
   * current status) of a snapshottable directory.
   *
   * @param path The full path of the snapshottable directory.
   * @param fromSnapshot Name of the snapshot to calculate the diff from. Null
   *          or empty string indicates the current tree.
   * @param toSnapshot Name of the snapshot to calculated the diff to. Null or
   *          empty string indicates the current tree.
   * @return A report about the difference between {@code fromSnapshot} and
   *         {@code toSnapshot}. Modified/deleted/created/renamed files and
   *         directories belonging to the snapshottable directories are listed
   *         and labeled as M/-/+/R respectively.
   * @throws IOException
   */
  SnapshotDiffReport getSnapshotDiffReport(String path,
      String fromSnapshot, String toSnapshot) throws IOException {
    SnapshotDiffInfo diffs = null;
    final FSPermissionChecker pc = getPermissionChecker();
    readLock();
    try {
      checkOperation(OperationCategory.READ);
      if (isPermissionEnabled) {
        // Caller must be able to read both endpoints of the diff.
        checkSubtreeReadPermission(pc, path, fromSnapshot);
        checkSubtreeReadPermission(pc, path, toSnapshot);
      }
      diffs = snapshotManager.diff(path, fromSnapshot, toSnapshot);
    } finally {
      readUnlock();
    }

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      logAuditEvent(true, "computeSnapshotDiff", null, null, null);
    }
    // A null diff (nothing to compare) is reported as an empty report.
    return diffs != null ? diffs.generateReport() : new SnapshotDiffReport(
        path, fromSnapshot, toSnapshot,
        Collections.<DiffReportEntry> emptyList());
  }
6749
  /**
   * Check recursive read permission on a snapshottable path, either on the
   * current tree (snapshot == null) or inside the named snapshot.
   *
   * @param pc permission checker for the caller
   * @param snapshottablePath the snapshottable directory
   * @param snapshot snapshot name, or null for the current tree
   */
  private void checkSubtreeReadPermission(final FSPermissionChecker pc,
      final String snapshottablePath, final String snapshot)
      throws AccessControlException, UnresolvedLinkException {
    final String fromPath = snapshot == null?
        snapshottablePath: Snapshot.getSnapshotPath(snapshottablePath, snapshot);
    checkPermission(pc, fromPath, false, null, null, FsAction.READ, FsAction.READ);
  }
6757
  /**
   * Delete a snapshot of a snapshottable directory.
   * Idempotent via the retry cache. Blocks that become unreferenced are
   * collected under the lock but removed afterwards, so the write lock is
   * not held during block deletion.
   *
   * @param snapshotRoot The snapshottable directory
   * @param snapshotName The name of the to-be-deleted snapshot
   * @throws SafeModeException
   * @throws IOException
   */
  void deleteSnapshot(String snapshotRoot, String snapshotName)
      throws SafeModeException, IOException {
    checkOperation(OperationCategory.WRITE);
    final FSPermissionChecker pc = getPermissionChecker();

    CacheEntry cacheEntry = RetryCache.waitForCompletion(retryCache);
    if (cacheEntry != null && cacheEntry.isSuccess()) {
      return; // Return previous response
    }
    boolean success = false;
    BlocksMapUpdateInfo collectedBlocks = new BlocksMapUpdateInfo();
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot delete snapshot for " + snapshotRoot);
      if (isPermissionEnabled) {
        // Only the owner of the snapshottable directory may delete.
        checkOwner(pc, snapshotRoot);
      }

      List<INode> removedINodes = new ArrayList<INode>();
      dir.writeLock();
      try {
        snapshotManager.deleteSnapshot(snapshotRoot, snapshotName,
            collectedBlocks, removedINodes);
        dir.removeFromInodeMap(removedINodes);
      } finally {
        dir.writeUnlock();
      }
      removedINodes.clear();
      getEditLog().logDeleteSnapshot(snapshotRoot, snapshotName,
          cacheEntry != null);
      success = true;
    } finally {
      writeUnlock();
      RetryCache.setState(cacheEntry, success);
    }
    getEditLog().logSync();

    // Remove the now-unreferenced blocks outside the FSN write lock.
    removeBlocks(collectedBlocks);
    collectedBlocks.clear();

    if (auditLog.isInfoEnabled() && isExternalInvocation()) {
      String rootPath = Snapshot.getSnapshotPath(snapshotRoot, snapshotName);
      logAuditEvent(true, "deleteSnapshot", rootPath, null, null);
    }
  }
6811
6812 /**
6813 * Remove a list of INodeDirectorySnapshottable from the SnapshotManager
6814 * @param toRemove the list of INodeDirectorySnapshottable to be removed
6815 */
6816 void removeSnapshottableDirs(List<INodeDirectorySnapshottable> toRemove) {
6817 if (snapshotManager != null) {
6818 snapshotManager.removeSnapshottable(toRemove);
6819 }
6820 }
6821
  /**
   * Default AuditLogger implementation; used when no access logger is
   * defined in the config file. It can also be explicitly listed in the
   * config file.
   */
  private static class DefaultAuditLogger extends HdfsAuditLogger {

    // Whether to append the delegation token tracking id to each entry.
    private boolean logTokenTrackingId;

    @Override
    public void initialize(Configuration conf) {
      logTokenTrackingId = conf.getBoolean(
          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_KEY,
          DFSConfigKeys.DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT);
    }

    /**
     * Format and emit one tab-separated audit entry. The field layout
     * (allowed/ugi/ip/cmd/src/dst/perm[/trackingId]) is a de facto stable
     * format parsed by external tools — do not reorder or rename fields.
     */
    @Override
    public void logAuditEvent(boolean succeeded, String userName,
        InetAddress addr, String cmd, String src, String dst,
        FileStatus status, UserGroupInformation ugi,
        DelegationTokenSecretManager dtSecretManager) {
      if (auditLog.isInfoEnabled()) {
        // Reuse a per-thread StringBuilder to avoid per-event allocation.
        final StringBuilder sb = auditBuffer.get();
        sb.setLength(0);
        sb.append("allowed=").append(succeeded).append("\t");
        sb.append("ugi=").append(userName).append("\t");
        sb.append("ip=").append(addr).append("\t");
        sb.append("cmd=").append(cmd).append("\t");
        sb.append("src=").append(src).append("\t");
        sb.append("dst=").append(dst).append("\t");
        if (null == status) {
          sb.append("perm=null");
        } else {
          sb.append("perm=");
          sb.append(status.getOwner()).append(":");
          sb.append(status.getGroup()).append(":");
          sb.append(status.getPermission());
        }
        if (logTokenTrackingId) {
          sb.append("\t").append("trackingId=");
          String trackingId = null;
          // Only token-authenticated callers carry a delegation token
          // identifier; use the first one found.
          if (ugi != null && dtSecretManager != null
              && ugi.getAuthenticationMethod() == AuthenticationMethod.TOKEN) {
            for (TokenIdentifier tid: ugi.getTokenIdentifiers()) {
              if (tid instanceof DelegationTokenIdentifier) {
                DelegationTokenIdentifier dtid =
                    (DelegationTokenIdentifier)tid;
                trackingId = dtSecretManager.getTokenTrackingId(dtid);
                break;
              }
            }
          }
          sb.append(trackingId);
        }
        auditLog.info(sb);
      }
    }

  }
6881
6882 }