001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode;
019
020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS;
021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT;
022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS;
023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT;
024
025import java.io.Closeable;
026import java.io.FileInputStream;
027import java.io.IOException;
028import java.util.HashMap;
029import java.util.Iterator;
030import java.util.Set;
031
032import org.apache.commons.io.IOUtils;
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.fs.InvalidRequestException;
037import org.apache.hadoop.hdfs.ExtendedBlockId;
038import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm;
039import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
040import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
041import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
042import org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory;
043import org.apache.hadoop.net.unix.DomainSocket;
044import org.apache.hadoop.net.unix.DomainSocketWatcher;
045
046import com.google.common.base.Preconditions;
047import com.google.common.collect.HashMultimap;
048
049/*
050 * Manages client short-circuit memory segments on the DataNode.
051 *
052 * DFSClients request shared memory segments from the DataNode.  The 
053 * ShortCircuitRegistry generates and manages these segments.  Each segment
054 * has a randomly generated 128-bit ID which uniquely identifies it.  The
055 * segments each contain several "slots."
056 *
057 * Before performing a short-circuit read, DFSClients must request a pair of
058 * file descriptors from the DataNode via the REQUEST_SHORT_CIRCUIT_FDS
059 * operation.  As part of this operation, DFSClients pass the ID of the shared
060 * memory segment they would like to use to communicate information about this
061 * replica, as well as the slot number within that segment they would like to
062 * use.  Slot allocation is always done by the client.
063 *
064 * Slots are used to track the state of the block on the both the client and
065 * datanode. When this DataNode mlocks a block, the corresponding slots for the
066 * replicas are marked as "anchorable".  Anchorable blocks can be safely read
067 * without verifying the checksum.  This means that BlockReaderLocal objects
068 * using these replicas can skip checksumming.  It also means that we can do
069 * zero-copy reads on these replicas (the ZCR interface has no way of
070 * verifying checksums.)
071 * 
072 * When a DN needs to munlock a block, it needs to first wait for the block to
073 * be unanchored by clients doing a no-checksum read or a zero-copy read. The 
074 * DN also marks the block's slots as "unanchorable" to prevent additional 
075 * clients from initiating these operations in the future.
076 * 
077 * The counterpart fo this class on the client is {@link DfsClientShmManager}.
078 */
079public class ShortCircuitRegistry {
080  public static final Log LOG = LogFactory.getLog(ShortCircuitRegistry.class);
081
082  private static final int SHM_LENGTH = 8192;
083
084  private static class RegisteredShm extends ShortCircuitShm
085      implements DomainSocketWatcher.Handler {
086    private final ShortCircuitRegistry registry;
087
088    RegisteredShm(ShmId shmId, FileInputStream stream,
089        ShortCircuitRegistry registry) throws IOException {
090      super(shmId, stream);
091      this.registry = registry;
092    }
093
094    @Override
095    public boolean handle(DomainSocket sock) {
096      synchronized (registry) {
097        synchronized (this) {
098          registry.removeShm(this);
099        }
100      }
101      return true;
102    }
103  }
104
105  public synchronized void removeShm(ShortCircuitShm shm) {
106    if (LOG.isTraceEnabled()) {
107      LOG.debug("removing shm " + shm);
108    }
109    // Stop tracking the shmId.
110    RegisteredShm removedShm = segments.remove(shm.getShmId());
111    Preconditions.checkState(removedShm == shm,
112        "failed to remove " + shm.getShmId());
113    // Stop tracking the slots.
114    for (Iterator<Slot> iter = shm.slotIterator(); iter.hasNext(); ) {
115      Slot slot = iter.next();
116      boolean removed = slots.remove(slot.getBlockId(), slot);
117      Preconditions.checkState(removed);
118      slot.makeInvalid();
119    }
120    // De-allocate the memory map and close the shared file. 
121    shm.free();
122  }
123
124  /**
125   * Whether or not the registry is enabled.
126   */
127  private boolean enabled;
128
129  /**
130   * The factory which creates shared file descriptors.
131   */
132  private final SharedFileDescriptorFactory shmFactory;
133  
134  /**
135   * A watcher which sends out callbacks when the UNIX domain socket
136   * associated with a shared memory segment closes.
137   */
138  private final DomainSocketWatcher watcher;
139
140  private final HashMap<ShmId, RegisteredShm> segments =
141      new HashMap<ShmId, RegisteredShm>(0);
142  
143  private final HashMultimap<ExtendedBlockId, Slot> slots =
144      HashMultimap.create(0, 1);
145  
146  public ShortCircuitRegistry(Configuration conf) throws IOException {
147    boolean enabled = false;
148    SharedFileDescriptorFactory shmFactory = null;
149    DomainSocketWatcher watcher = null;
150    try {
151      int interruptCheck = conf.getInt(
152          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
153          DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT);
154      if (interruptCheck <= 0) {
155        throw new IOException(
156            DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS +
157            " was set to " + interruptCheck);
158      }
159      String shmPaths[] =
160          conf.getTrimmedStrings(DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS);
161      if (shmPaths.length == 0) {
162        shmPaths =
163            DFS_DATANODE_SHARED_FILE_DESCRIPTOR_PATHS_DEFAULT.split(",");
164      }
165      shmFactory = SharedFileDescriptorFactory.
166          create("HadoopShortCircuitShm_", shmPaths);
167      String dswLoadingFailure = DomainSocketWatcher.getLoadingFailureReason();
168      if (dswLoadingFailure != null) {
169        throw new IOException(dswLoadingFailure);
170      }
171      watcher = new DomainSocketWatcher(interruptCheck);
172      enabled = true;
173      if (LOG.isDebugEnabled()) {
174        LOG.debug("created new ShortCircuitRegistry with interruptCheck=" +
175                  interruptCheck + ", shmPath=" + shmFactory.getPath());
176      }
177    } catch (IOException e) {
178      if (LOG.isDebugEnabled()) {
179        LOG.debug("Disabling ShortCircuitRegistry", e);
180      }
181    } finally {
182      this.enabled = enabled;
183      this.shmFactory = shmFactory;
184      this.watcher = watcher;
185    }
186  }
187
188  /**
189   * Process a block mlock event from the FsDatasetCache.
190   *
191   * @param blockId    The block that was mlocked.
192   */
193  public synchronized void processBlockMlockEvent(ExtendedBlockId blockId) {
194    if (!enabled) return;
195    Set<Slot> affectedSlots = slots.get(blockId);
196    for (Slot slot : affectedSlots) {
197      slot.makeAnchorable();
198    }
199  }
200
201  /**
202   * Mark any slots associated with this blockId as unanchorable.
203   *
204   * @param blockId        The block ID.
205   * @return               True if we should allow the munlock request.
206   */
207  public synchronized boolean processBlockMunlockRequest(
208      ExtendedBlockId blockId) {
209    if (!enabled) return true;
210    boolean allowMunlock = true;
211    Set<Slot> affectedSlots = slots.get(blockId);
212    for (Slot slot : affectedSlots) {
213      slot.makeUnanchorable();
214      if (slot.isAnchored()) {
215        allowMunlock = false;
216      }
217    }
218    return allowMunlock;
219  }
220  
221  public static class NewShmInfo implements Closeable {
222    public final ShmId shmId;
223    public final FileInputStream stream;
224
225    NewShmInfo(ShmId shmId, FileInputStream stream) {
226      this.shmId = shmId;
227      this.stream = stream;
228    }
229
230    @Override
231    public void close() throws IOException {
232      stream.close();
233    }
234  }
235
236  /**
237   * Handle a DFSClient request to create a new memory segment.
238   *
239   * @param clientName    Client name as reported by the client.
240   * @param sock          The DomainSocket to associate with this memory
241   *                        segment.  When this socket is closed, or the
242   *                        other side writes anything to the socket, the
243   *                        segment will be closed.  This can happen at any
244   *                        time, including right after this function returns.
245   * @return              A NewShmInfo object.  The caller must close the
246   *                        NewShmInfo object once they are done with it.
247   * @throws IOException  If the new memory segment could not be created.
248   */
249  public NewShmInfo createNewMemorySegment(String clientName,
250      DomainSocket sock) throws IOException {
251    NewShmInfo info = null;
252    RegisteredShm shm = null;
253    ShmId shmId = null;
254    synchronized (this) {
255      if (!enabled) {
256        if (LOG.isTraceEnabled()) {
257          LOG.trace("createNewMemorySegment: ShortCircuitRegistry is " +
258              "not enabled.");
259        }
260        throw new UnsupportedOperationException();
261      }
262      FileInputStream fis = null;
263      try {
264        do {
265          shmId = ShmId.createRandom();
266        } while (segments.containsKey(shmId));
267        fis = shmFactory.createDescriptor(clientName, SHM_LENGTH);
268        shm = new RegisteredShm(shmId, fis, this);
269      } finally {
270        if (shm == null) {
271          IOUtils.closeQuietly(fis);
272        }
273      }
274      info = new NewShmInfo(shmId, fis);
275      segments.put(shmId, shm);
276    }
277    // Drop the registry lock to prevent deadlock.
278    // After this point, RegisteredShm#handle may be called at any time.
279    watcher.add(sock, shm);
280    if (LOG.isTraceEnabled()) {
281      LOG.trace("createNewMemorySegment: created " + info.shmId);
282    }
283    return info;
284  }
285  
286  public synchronized void registerSlot(ExtendedBlockId blockId, SlotId slotId,
287      boolean isCached) throws InvalidRequestException {
288    if (!enabled) {
289      if (LOG.isTraceEnabled()) {
290        LOG.trace(this + " can't register a slot because the " +
291            "ShortCircuitRegistry is not enabled.");
292      }
293      throw new UnsupportedOperationException();
294    }
295    ShmId shmId = slotId.getShmId();
296    RegisteredShm shm = segments.get(shmId);
297    if (shm == null) {
298      throw new InvalidRequestException("there is no shared memory segment " +
299          "registered with shmId " + shmId);
300    }
301    Slot slot = shm.registerSlot(slotId.getSlotIdx(), blockId);
302    if (isCached) {
303      slot.makeAnchorable();
304    } else {
305      slot.makeUnanchorable();
306    }
307    boolean added = slots.put(blockId, slot);
308    Preconditions.checkState(added);
309    if (LOG.isTraceEnabled()) {
310      LOG.trace(this + ": registered " + blockId + " with slot " +
311        slotId + " (isCached=" + isCached + ")");
312    }
313  }
314  
315  public synchronized void unregisterSlot(SlotId slotId)
316      throws InvalidRequestException {
317    if (!enabled) {
318      if (LOG.isTraceEnabled()) {
319        LOG.trace("unregisterSlot: ShortCircuitRegistry is " +
320            "not enabled.");
321      }
322      throw new UnsupportedOperationException();
323    }
324    ShmId shmId = slotId.getShmId();
325    RegisteredShm shm = segments.get(shmId);
326    if (shm == null) {
327      throw new InvalidRequestException("there is no shared memory segment " +
328          "registered with shmId " + shmId);
329    }
330    Slot slot = shm.getSlot(slotId.getSlotIdx());
331    slot.makeInvalid();
332    shm.unregisterSlot(slotId.getSlotIdx());
333    slots.remove(slot.getBlockId(), slot);
334  }
335  
336  public void shutdown() {
337    synchronized (this) {
338      if (!enabled) return;
339      enabled = false;
340    }
341    IOUtils.closeQuietly(watcher);
342  }
343}