001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode;
019
020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024
025import java.io.Closeable;
026import java.io.FileInputStream;
027import java.io.IOException;
028import java.util.Collections;
029import java.util.HashMap;
030import java.util.Iterator;
031import java.util.Set;
032
033import org.apache.commons.io.IOUtils;
034import org.apache.commons.logging.Log;
035import org.apache.commons.logging.LogFactory;
036import org.apache.hadoop.conf.Configuration;
037import org.apache.hadoop.fs.InvalidRequestException;
038import org.apache.hadoop.hdfs.ExtendedBlockId;
039import org.apache.hadoop.hdfs.ShortCircuitShm;
040import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
041import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
042import org.apache.hadoop.hdfs.ShortCircuitShm.SlotId;
043import org.apache.hadoop.io.nativeio.NativeIO;
044import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
045import org.apache.hadoop.net.unix.DomainSocket;
046import org.apache.hadoop.net.unix.DomainSocketWatcher;
047
048import com.google.common.base.Preconditions;
049import com.google.common.base.Splitter;
050import com.google.common.collect.HashMultimap;
051import com.google.common.collect.Iterables;
052
053/*
054 * Manages client short-circuit memory segments on the DataNode.
055 *
056 * DFSClients request shared memory segments from the DataNode.  The 
057 * ShortCircuitRegistry generates and manages these segments.  Each segment
058 * has a randomly generated 128-bit ID which uniquely identifies it.  The
059 * segments each contain several "slots."
060 *
061 * Before performing a short-circuit read, DFSClients must request a pair of
062 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
063 * operation.  As part of this operation, DFSClients pass the ID of the shared
064 * memory segment they would like to use to communicate information about this
065 * replica, as well as the slot number within that segment they would like to
066 * use.  Slot allocation is always done by the client.
067 *
 * Slots are used to track the state of the block on both the client and
069 * datanode. When this DataNode mlocks a block, the corresponding slots for the
070 * replicas are marked as "anchorable".  Anchorable blocks can be safely read
071 * without verifying the checksum.  This means that BlockReaderLocal objects
072 * using these replicas can skip checksumming.  It also means that we can do
073 * zero-copy reads on these replicas (the ZCR interface has no way of
074 * verifying checksums.)
075 * 
076 * When a DN needs to munlock a block, it needs to first wait for the block to
077 * be unanchored by clients doing a no-checksum read or a zero-copy read. The 
078 * DN also marks the block's slots as "unanchorable" to prevent additional 
079 * clients from initiating these operations in the future.
080 * 
 * The counterpart of this class on the client is {@link DfsClientShmManager}.
082 */
083public class ShortCircuitRegistry {
084  public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
085
086  private static final int SHM_LENGTH = 8192;
087
088  private static class RegisteredShm extends ShortCircuitShm
089      implements DomainSocketWatcher.Handler {
090    private final ShortCircuitRegistry registry;
091
092    RegisteredShm(ShmId shmId, FileInputStream stream,
093        ShortCircuitRegistry registry) throws IOException {
094      super(shmId, stream);
095      this.registry = registry;
096    }
097
098    @Override
099    public boolean handle(DomainSocket sock) {
100      synchronized (registry) {
101        synchronized (this) {
102          registry.removeShm(this);
103        }
104      }
105      return true;
106    }
107  }
108
109  public synchronized void removeShm(ShortCircuitShm shm) {
110    if (LOG.isTraceEnabled()) {
111      LOG.debug("removing shm " + shm);
112    }
113    // Stop tracking the shmId.
114    RegisteredShm removedShm = segments.remove(shm.getShmId());
115    Preconditions.checkState(removedShm == shm,
116        "failed to remove " + shm.getShmId());
117    // Stop tracking the slots.
118    for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
119      Slot slot = iter.next();
120      boolean removed = slots.remove(slot.getBlockId(), slot);
121      Preconditions.checkState(removed);
122      slot.makeInvalid();
123    }
124    // De-allocate the memory map and close the shared file. 
125    shm.free();
126  }
127
128  /**
129   * Whether or not the registry is enabled.
130   */
131  private boolean enabled;
132
133  /**
134   * The factory which creates shared file descriptors.
135   */
136  private final SharedFileDescriptorFactory shmFactory;
137  
138  /**
139   * A watcher which sends out callbacks when the UNIX domain socket
140   * associated with a shared memory segment closes.
141   */
142  private final DomainSocketWatcher watcher;
143
144  private final HashMap<ShmId, RegisteredShm> segments =
145      new HashMap<ShmId, RegisteredShm>(0);
146  
147  private final HashMultimap<ExtendedBlockId, Slot> slots =
148      HashMultimap.create(0, 1);
149  
150  public ShortCircuitRegistry(Configuration conf) throws IOException {
151    boolean enabled = false;
152    SharedFileDescriptorFactory shmFactory = null;
153    DomainSocketWatcher watcher = null;
154    try {
155      int interruptCheck = conf.getInt(
156          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
157          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
158      if (interruptCheck <= 0) {
159        throw new IOException(
160            DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
161            " was set to " + interruptCheck);
162      }
163      String shmPaths[] =
164          conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
165      if (shmPaths.length == 0) {
166        shmPaths =
167            DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
168      }
169      shmFactory = SharedFileDescriptorFactory.
170          create("HadoopShortCircuitShm_", shmPaths);
171      String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
172      if (dswLoadingFailure != null) {
173        throw new IOException(dswLoadingFailure);
174      }
175      watcher = new DomainSocketWatcher(interruptCheck);
176      enabled = true;
177      if (LOG.isDebugEnabled()) {
178        LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
179                  interruptCheck + ", shmPath=" + shmFactory.getPath());
180      }
181    } catch (IOException e) {
182      if (LOG.isDebugEnabled()) {
183        LOG.debug("Disabling ShortCircuitRegistry", e);
184      }
185    } finally {
186      this.enabled = enabled;
187      this.shmFactory = shmFactory;
188      this.watcher = watcher;
189    }
190  }
191
192  /**
193   * Process a block mlock event from the FsDatasetCache.
194   *
195   * @param blockId    The block that was mlocked.
196   */
197  public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
198    if (!enabled) return;
199    Set<Slot> affectedSlots = slots.get(blockId);
200    for (Slot slot : affectedSlots) {
201      slot.makeAnchorable();
202    }
203  }
204
205  /**
206   * Mark any slots associated with this blockId as unanchorable.
207   *
208   * @param blockId        The block ID.
209   * @return               True if we should allow the munlock request.
210   */
211  public synchronized boolean processBlockMunlockRequest(
212      ExtendedBlockId blockId) {
213    if (!enabled) return true;
214    boolean allowMunlock = true;
215    Set<Slot> affectedSlots = slots.get(blockId);
216    for (Slot slot : affectedSlots) {
217      slot.makeUnanchorable();
218      if (slot.isAnchored()) {
219        allowMunlock = false;
220      }
221    }
222    return allowMunlock;
223  }
224  
225  public static class NewShmInfo implements Closeable {
226    public final ShmId shmId;
227    public final FileInputStream stream;
228
229    NewShmInfo(ShmId shmId, FileInputStream stream) {
230      this.shmId = shmId;
231      this.stream = stream;
232    }
233
234    @Override
235    public void close() throws IOException {
236      stream.close();
237    }
238  }
239
240  /**
241   * Handle a DFSClient request to create a new memory segment.
242   *
243   * @param clientName    Client name as reported by the client.
244   * @param sock          The DomainSocket to associate with this memory
245   *                        segment.  When this socket is closed, or the
246   *                        other side writes anything to the socket, the
247   *                        segment will be closed.  This can happen at any
248   *                        time, including right after this function returns.
249   * @return              A NewShmInfo object.  The caller must close the
250   *                        NewShmInfo object once they are done with it.
251   * @throws IOException  If the new memory segment could not be created.
252   */
253  public NewShmInfo createNewMemorySegment(String clientName,
254      DomainSocket sock) throws IOException {
255    NewShmInfo info = null;
256    RegisteredShm shm = null;
257    ShmId shmId = null;
258    synchronized (this) {
259      if (!enabled) {
260        if (LOG.isTraceEnabled()) {
261          LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
262              "not enabled.");
263        }
264        throw new UnsupportedOperationException();
265      }
266      FileInputStream fis = null;
267      try {
268        do {
269          shmId = ShmId.createRandom();
270        } while (segments.containsKey(shmId));
271        fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
272        shm = new RegisteredShm(shmId, fis, this);
273      } finally {
274        if (shm == null) {
275          IOUtils.closeQuietly(fis);
276        }
277      }
278      info = new NewShmInfo(shmId, fis);
279      segments.put(shmId, shm);
280    }
281    // Drop the registry lock to prevent deadlock.
282    // After this point, RegisteredShm#handle may be called at any time.
283    watcher.add(sock, shm);
284    if (LOG.isTraceEnabled()) {
285      LOG.trace("createNewMemorySegment: created " + info.shmId);
286    }
287    return info;
288  }
289  
290  public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
291      boolean isCached) throws InvalidRequestException {
292    if (!enabled) {
293      if (LOG.isTraceEnabled()) {
294        LOG.trace(this + " can't register a slot because the " +
295            "ShortCircuitRegistry is not enabled.");
296      }
297      throw new UnsupportedOperationException();
298    }
299    ShmId shmId = slotId.getShmId();
300    RegisteredShm shm = segments.get(shmId);
301    if (shm == null) {
302      throw new InvalidRequestException("there is no shared memory segment " +
303          "registered with shmId " + shmId);
304    }
305    Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
306    if (isCached) {
307      slot.makeAnchorable();
308    } else {
309      slot.makeUnanchorable();
310    }
311    boolean added = slots.put(blockId, slot);
312    Preconditions.checkState(added);
313    if (LOG.isTraceEnabled()) {
314      LOG.trace(this + ": registered " + blockId + " with slot " +
315        slotId + " (isCached=" + isCached + ")");
316    }
317  }
318  
319  public synchronized void unregisterSlot(SlotId slotId)
320      throws InvalidRequestException {
321    if (!enabled) {
322      if (LOG.isTraceEnabled()) {
323        LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
324            "not enabled.");
325      }
326      throw new UnsupportedOperationException();
327    }
328    ShmId shmId = slotId.getShmId();
329    RegisteredShm shm = segments.get(shmId);
330    if (shm == null) {
331      throw new InvalidRequestException("there is no shared memory segment " +
332          "registered with shmId " + shmId);
333    }
334    Slot slot = shm.getSlot(slotId.getSlotIdx());
335    slot.makeInvalid();
336    shm.unregisterSlot(slotId.getSlotIdx());
337    slots.remove(slot.getBlockId(), slot);
338  }
339  
340  public void shutdown() {
341    synchronized (this) {
342      if (!enabled) return;
343      enabled = false;
344    }
345    IOUtils.closeQuietly(watcher);
346  }
347}