001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.server.datanode; 019 020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS; 021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT; 022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS; 023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT; 024 025import java.io.Closeable; 026import java.io.FileInputStream; 027import java.io.IOException; 028import java.util.Collections; 029import java.util.HashMap; 030import java.util.Iterator; 031import java.util.Set; 032 033import org.apache.commons.io.IOUtils; 034import org.apache.commons.logging.Log; 035import org.apache.commons.logging.LogFactory; 036import org.apache.hadoop.conf.Configuration; 037import org.apache.hadoop.fs.InvalidRequestException; 038import org.apache.hadoop.hdfs.ExtendedBlockId; 039import org.apache.hadoop.hdfs.ShortCircuitShm; 040import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId; 041import org.apache.hadoop.hdfs.ShortCircuitShm.Slot; 042import org.apache.hadoop.hdfs.ShortCircuitShm.SlotId; 043import org.apache.hadoop.io.nativeio.NativeIO; 044import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory; 045import org.apache.hadoop.net.unix.DomainSocket; 046import org.apache.hadoop.net.unix.DomainSocketWatcher; 047 048import com.google.common.base.Preconditions; 049import com.google.common.base.Splitter; 050import com.google.common.collect.HashMultimap; 051import com.google.common.collect.Iterables; 052 053/* 054 * Manages client short-circuit memory segments on the DataNode. 055 * 056 * DFSClients request shared memory segments from the DataNode. The 057 * ShortCircuitRegistry generates and manages these segments. Each segment 058 * has a randomly generated 128-bit ID which uniquely identifies it. The 059 * segments each contain several "slots." 060 * 061 * Before performing a short-circuit read, DFSClients must request a pair of 062 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS 063 * operation. As part of this operation, DFSClients pass the ID of the shared 064 * memory segment they would like to use to communicate information about this 065 * replica, as well as the slot number within that segment they would like to 066 * use. Slot allocation is always done by the client. 067 * 068 * Slots are used to track the state of the block on the both the client and 069 * datanode. When this DataNode mlocks a block, the corresponding slots for the 070 * replicas are marked as "anchorable". Anchorable blocks can be safely read 071 * without verifying the checksum. This means that BlockReaderLocal objects 072 * using these replicas can skip checksumming. It also means that we can do 073 * zero-copy reads on these replicas (the ZCR interface has no way of 074 * verifying checksums.) 075 * 076 * When a DN needs to munlock a block, it needs to first wait for the block to 077 * be unanchored by clients doing a no-checksum read or a zero-copy read. The 078 * DN also marks the block's slots as "unanchorable" to prevent additional 079 * clients from initiating these operations in the future. 080 * 081 * The counterpart fo this class on the client is {@link DfsClientShmManager}. 082 */ 083public class ShortCircuitRegistry { 084 public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class); 085 086 private static final int SHM_LENGTH = 8192; 087 088 private static class RegisteredShm extends ShortCircuitShm 089 implements DomainSocketWatcher.Handler { 090 private final ShortCircuitRegistry registry; 091 092 RegisteredShm(ShmId shmId, FileInputStream stream, 093 ShortCircuitRegistry registry) throws IOException { 094 super(shmId, stream); 095 this.registry = registry; 096 } 097 098 @Override 099 public boolean handle(DomainSocket sock) { 100 synchronized (registry) { 101 synchronized (this) { 102 registry.removeShm(this); 103 } 104 } 105 return true; 106 } 107 } 108 109 public synchronized void removeShm(ShortCircuitShm shm) { 110 if (LOG.isTraceEnabled()) { 111 LOG.debug("removing shm " + shm); 112 } 113 // Stop tracking the shmId. 114 RegisteredShm removedShm = segments.remove(shm.getShmId()); 115 Preconditions.checkState(removedShm == shm, 116 "failed to remove " + shm.getShmId()); 117 // Stop tracking the slots. 118 for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) { 119 Slot slot = iter.next(); 120 boolean removed = slots.remove(slot.getBlockId(), slot); 121 Preconditions.checkState(removed); 122 slot.makeInvalid(); 123 } 124 // De-allocate the memory map and close the shared file. 125 shm.free(); 126 } 127 128 /** 129 * Whether or not the registry is enabled. 130 */ 131 private boolean enabled; 132 133 /** 134 * The factory which creates shared file descriptors. 135 */ 136 private final SharedFileDescriptorFactory shmFactory; 137 138 /** 139 * A watcher which sends out callbacks when the UNIX domain socket 140 * associated with a shared memory segment closes. 141 */ 142 private final DomainSocketWatcher watcher; 143 144 private final HashMap<ShmId, RegisteredShm> segments = 145 new HashMap<ShmId, RegisteredShm>(0); 146 147 private final HashMultimap<ExtendedBlockId, Slot> slots = 148 HashMultimap.create(0, 1); 149 150 public ShortCircuitRegistry(Configuration conf) throws IOException { 151 boolean enabled = false; 152 SharedFileDescriptorFactory shmFactory = null; 153 DomainSocketWatcher watcher = null; 154 try { 155 int interruptCheck = conf.getInt( 156 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS, 157 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT); 158 if (interruptCheck <= 0) { 159 throw new IOException( 160 DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS + 161 " was set to " + interruptCheck); 162 } 163 String shmPaths[] = 164 conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS); 165 if (shmPaths.length == 0) { 166 shmPaths = 167 DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(","); 168 } 169 shmFactory = SharedFileDescriptorFactory. 170 create("HadoopShortCircuitShm_", shmPaths); 171 String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason(); 172 if (dswLoadingFailure != null) { 173 throw new IOException(dswLoadingFailure); 174 } 175 watcher = new DomainSocketWatcher(interruptCheck); 176 enabled = true; 177 if (LOG.isDebugEnabled()) { 178 LOG.debug("created new ShortCircuitRegistry with interruptCheck=" + 179 interruptCheck + ", shmPath=" + shmFactory.getPath()); 180 } 181 } catch (IOException e) { 182 if (LOG.isDebugEnabled()) { 183 LOG.debug("Disabling ShortCircuitRegistry", e); 184 } 185 } finally { 186 this.enabled = enabled; 187 this.shmFactory = shmFactory; 188 this.watcher = watcher; 189 } 190 } 191 192 /** 193 * Process a block mlock event from the FsDatasetCache. 194 * 195 * @param blockId The block that was mlocked. 196 */ 197 public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) { 198 if (!enabled) return; 199 Set<Slot> affectedSlots = slots.get(blockId); 200 for (Slot slot : affectedSlots) { 201 slot.makeAnchorable(); 202 } 203 } 204 205 /** 206 * Mark any slots associated with this blockId as unanchorable. 207 * 208 * @param blockId The block ID. 209 * @return True if we should allow the munlock request. 210 */ 211 public synchronized boolean processBlockMunlockRequest( 212 ExtendedBlockId blockId) { 213 if (!enabled) return true; 214 boolean allowMunlock = true; 215 Set<Slot> affectedSlots = slots.get(blockId); 216 for (Slot slot : affectedSlots) { 217 slot.makeUnanchorable(); 218 if (slot.isAnchored()) { 219 allowMunlock = false; 220 } 221 } 222 return allowMunlock; 223 } 224 225 public static class NewShmInfo implements Closeable { 226 public final ShmId shmId; 227 public final FileInputStream stream; 228 229 NewShmInfo(ShmId shmId, FileInputStream stream) { 230 this.shmId = shmId; 231 this.stream = stream; 232 } 233 234 @Override 235 public void close() throws IOException { 236 stream.close(); 237 } 238 } 239 240 /** 241 * Handle a DFSClient request to create a new memory segment. 242 * 243 * @param clientName Client name as reported by the client. 244 * @param sock The DomainSocket to associate with this memory 245 * segment. When this socket is closed, or the 246 * other side writes anything to the socket, the 247 * segment will be closed. This can happen at any 248 * time, including right after this function returns. 249 * @return A NewShmInfo object. The caller must close the 250 * NewShmInfo object once they are done with it. 251 * @throws IOException If the new memory segment could not be created. 252 */ 253 public NewShmInfo createNewMemorySegment(String clientName, 254 DomainSocket sock) throws IOException { 255 NewShmInfo info = null; 256 RegisteredShm shm = null; 257 ShmId shmId = null; 258 synchronized (this) { 259 if (!enabled) { 260 if (LOG.isTraceEnabled()) { 261 LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " + 262 "not enabled."); 263 } 264 throw new UnsupportedOperationException(); 265 } 266 FileInputStream fis = null; 267 try { 268 do { 269 shmId = ShmId.createRandom(); 270 } while (segments.containsKey(shmId)); 271 fis = shmFactory.createDescriptor(clientName, SHM_LENGTH); 272 shm = new RegisteredShm(shmId, fis, this); 273 } finally { 274 if (shm == null) { 275 IOUtils.closeQuietly(fis); 276 } 277 } 278 info = new NewShmInfo(shmId, fis); 279 segments.put(shmId, shm); 280 } 281 // Drop the registry lock to prevent deadlock. 282 // After this point, RegisteredShm#handle may be called at any time. 283 watcher.add(sock, shm); 284 if (LOG.isTraceEnabled()) { 285 LOG.trace("createNewMemorySegment: created " + info.shmId); 286 } 287 return info; 288 } 289 290 public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId, 291 boolean isCached) throws InvalidRequestException { 292 if (!enabled) { 293 if (LOG.isTraceEnabled()) { 294 LOG.trace(this + " can't register a slot because the " + 295 "ShortCircuitRegistry is not enabled."); 296 } 297 throw new UnsupportedOperationException(); 298 } 299 ShmId shmId = slotId.getShmId(); 300 RegisteredShm shm = segments.get(shmId); 301 if (shm == null) { 302 throw new InvalidRequestException("there is no shared memory segment " + 303 "registered with shmId " + shmId); 304 } 305 Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId); 306 if (isCached) { 307 slot.makeAnchorable(); 308 } else { 309 slot.makeUnanchorable(); 310 } 311 boolean added = slots.put(blockId, slot); 312 Preconditions.checkState(added); 313 if (LOG.isTraceEnabled()) { 314 LOG.trace(this + ": registered " + blockId + " with slot " + 315 slotId + " (isCached=" + isCached + ")"); 316 } 317 } 318 319 public synchronized void unregisterSlot(SlotId slotId) 320 throws InvalidRequestException { 321 if (!enabled) { 322 if (LOG.isTraceEnabled()) { 323 LOG.trace("unregisterSlot: ShortCircuitRegistry is " + 324 "not enabled."); 325 } 326 throw new UnsupportedOperationException(); 327 } 328 ShmId shmId = slotId.getShmId(); 329 RegisteredShm shm = segments.get(shmId); 330 if (shm == null) { 331 throw new InvalidRequestException("there is no shared memory segment " + 332 "registered with shmId " + shmId); 333 } 334 Slot slot = shm.getSlot(slotId.getSlotIdx()); 335 slot.makeInvalid(); 336 shm.unregisterSlot(slotId.getSlotIdx()); 337 slots.remove(slot.getBlockId(), slot); 338 } 339 340 public void shutdown() { 341 synchronized (this) { 342 if (!enabled) return; 343 enabled = false; 344 } 345 IOUtils.closeQuietly(watcher); 346 } 347}