001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.shortcircuit;
019
020import java.io.BufferedOutputStream;
021import java.io.Closeable;
022import java.io.DataOutputStream;
023import java.io.EOFException;
024import java.io.FileInputStream;
025import java.io.IOException;
026import java.util.HashMap;
027import java.util.Map.Entry;
028import java.util.TreeMap;
029import java.util.concurrent.locks.Condition;
030import java.util.concurrent.locks.ReentrantLock;
031
032import org.apache.commons.lang.mutable.MutableBoolean;
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.apache.hadoop.classification.InterfaceAudience;
036import org.apache.hadoop.hdfs.ExtendedBlockId;
037import org.apache.hadoop.hdfs.net.DomainPeer;
038import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
039import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
040import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
041import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
042import org.apache.hadoop.hdfs.protocolPB.PBHelper;
043import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
044import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.ShmId;
045import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
046import org.apache.hadoop.io.IOUtils;
047import org.apache.hadoop.net.unix.DomainSocket;
048import org.apache.hadoop.net.unix.DomainSocketWatcher;
049
050import com.google.common.annotations.VisibleForTesting;
051import com.google.common.base.Preconditions;
052
053/**
054 * Manages short-circuit memory segments for an HDFS client.
055 * 
056 * Clients are responsible for requesting and releasing shared memory segments used
057 * for communicating with the DataNode. The client will try to allocate new slots
 * in the set of existing segments, falling back to getting a new segment from the
 * DataNode via {@link DataTransferProtocol#requestShortCircuitShm}.
060 * 
061 * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
062 * See {@link ShortCircuitRegistry} for more information on the communication protocol.
063 */
064@InterfaceAudience.Private
065public class DfsClientShmManager implements Closeable {
066  private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);
067
068  /**
069   * Manages short-circuit memory segments that pertain to a given DataNode.
070   */
071  class EndpointShmManager {
072    /**
073     * The datanode we're managing.
074     */
075    private final DatanodeInfo datanode;
076
077    /**
078     * Shared memory segments which have no empty slots.
079     *
080     * Protected by the manager lock.
081     */
082    private final TreeMap<ShmId, DfsClientShm> full =
083        new TreeMap<ShmId, DfsClientShm>();
084
085    /**
086     * Shared memory segments which have at least one empty slot.
087     *
088     * Protected by the manager lock.
089     */
090    private final TreeMap<ShmId, DfsClientShm> notFull =
091        new TreeMap<ShmId, DfsClientShm>();
092
093    /**
094     * True if this datanode doesn't support short-circuit shared memory
095     * segments.
096     *
097     * Protected by the manager lock.
098     */
099    private boolean disabled = false;
100
101    /**
102     * True if we're in the process of loading a shared memory segment from
103     * this DataNode.
104     *
105     * Protected by the manager lock.
106     */
107    private boolean loading = false;
108
    /**
     * Create a manager for the shared memory segments associated with a
     * single DataNode.
     *
     * @param datanode    The DataNode whose segments we will manage.
     */
    EndpointShmManager (DatanodeInfo datanode) {
      this.datanode = datanode;
    }
112
113    /**
114     * Pull a slot out of a preexisting shared memory segment.
115     *
116     * Must be called with the manager lock held.
117     *
118     * @param blockId     The blockId to put inside the Slot object.
119     *
120     * @return            null if none of our shared memory segments contain a
121     *                      free slot; the slot object otherwise.
122     */
123    private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
124      if (notFull.isEmpty()) {
125        return null;
126      }
127      Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
128      DfsClientShm shm = entry.getValue();
129      ShmId shmId = shm.getShmId();
130      Slot slot = shm.allocAndRegisterSlot(blockId);
131      if (shm.isFull()) {
132        if (LOG.isTraceEnabled()) {
133          LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
134              " out of " + shm);
135        }
136        DfsClientShm removedShm = notFull.remove(shmId);
137        Preconditions.checkState(removedShm == shm);
138        full.put(shmId, shm);
139      } else {
140        if (LOG.isTraceEnabled()) {
141          LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
142              " out of " + shm);
143        }
144      }
145      return slot;
146    }
147
148    /**
149     * Ask the DataNode for a new shared memory segment.  This function must be
150     * called with the manager lock held.  We will release the lock while
151     * communicating with the DataNode.
152     *
153     * @param clientName    The current client name.
154     * @param peer          The peer to use to talk to the DataNode.
155     *
156     * @return              Null if the DataNode does not support shared memory
157     *                        segments, or experienced an error creating the
158     *                        shm.  The shared memory segment itself on success.
159     * @throws IOException  If there was an error communicating over the socket.
160     *                        We will not throw an IOException unless the socket
161     *                        itself (or the network) is the problem.
162     */
163    private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
164        throws IOException {
165      final DataOutputStream out = 
166          new DataOutputStream(
167              new BufferedOutputStream(peer.getOutputStream()));
168      new Sender(out).requestShortCircuitShm(clientName);
169      ShortCircuitShmResponseProto resp = 
170          ShortCircuitShmResponseProto.parseFrom(
171              PBHelper.vintPrefixed(peer.getInputStream()));
172      String error = resp.hasError() ? resp.getError() : "(unknown)";
173      switch (resp.getStatus()) {
174      case SUCCESS:
175        DomainSocket sock = peer.getDomainSocket();
176        byte buf[] = new byte[1];
177        FileInputStream fis[] = new FileInputStream[1];
178        if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
179          throw new EOFException("got EOF while trying to transfer the " +
180              "file descriptor for the shared memory segment.");
181        }
182        if (fis[0] == null) {
183          throw new IOException("the datanode " + datanode + " failed to " +
184              "pass a file descriptor for the shared memory segment.");
185        }
186        try {
187          DfsClientShm shm = 
188              new DfsClientShm(PBHelper.convert(resp.getId()),
189                  fis[0], this, peer);
190          if (LOG.isTraceEnabled()) {
191            LOG.trace(this + ": createNewShm: created " + shm);
192          }
193          return shm;
194        } finally {
195          IOUtils.cleanup(LOG,  fis[0]);
196        }
197      case ERROR_UNSUPPORTED:
198        // The DataNode just does not support short-circuit shared memory
199        // access, and we should stop asking.
200        LOG.info(this + ": datanode does not support short-circuit " +
201            "shared memory access: " + error);
202        disabled = true;
203        return null;
204      default:
205        // The datanode experienced some kind of unexpected error when trying to
206        // create the short-circuit shared memory segment.
207        LOG.warn(this + ": error requesting short-circuit shared memory " +
208            "access: " + error);
209        return null;
210      }
211    }
212
213    /**
214     * Allocate a new shared memory slot connected to this datanode.
215     *
216     * Must be called with the EndpointShmManager lock held.
217     *
218     * @param peer          The peer to use to talk to the DataNode.
219     * @param clientName    The client name.
220     * @param usedPeer      (out param) Will be set to true if we used the peer.
221     *                        When a peer is used
222     *
223     * @return              null if the DataNode does not support shared memory
224     *                        segments, or experienced an error creating the
225     *                        shm.  The shared memory segment itself on success.
226     * @throws IOException  If there was an error communicating over the socket.
227     */
228    Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
229        String clientName, ExtendedBlockId blockId) throws IOException {
230      while (true) {
231        if (closed) {
232          if (LOG.isTraceEnabled()) {
233            LOG.trace(this + ": the DfsClientShmManager has been closed.");
234          }
235          return null;
236        }
237        if (disabled) {
238          if (LOG.isTraceEnabled()) {
239            LOG.trace(this + ": shared memory segment access is disabled.");
240          }
241          return null;
242        }
243        // Try to use an existing slot.
244        Slot slot = allocSlotFromExistingShm(blockId);
245        if (slot != null) {
246          return slot;
247        }
248        // There are no free slots.  If someone is loading more slots, wait
249        // for that to finish.
250        if (loading) {
251          if (LOG.isTraceEnabled()) {
252            LOG.trace(this + ": waiting for loading to finish...");
253          }
254          finishedLoading.awaitUninterruptibly();
255        } else {
256          // Otherwise, load the slot ourselves.
257          loading = true;
258          lock.unlock();
259          DfsClientShm shm;
260          try {
261            shm = requestNewShm(clientName, peer);
262            if (shm == null) continue;
263            // See #{DfsClientShmManager#domainSocketWatcher} for details
264            // about why we do this before retaking the manager lock.
265            domainSocketWatcher.add(peer.getDomainSocket(), shm);
266            // The DomainPeer is now our responsibility, and should not be
267            // closed by the caller.
268            usedPeer.setValue(true);
269          } finally {
270            lock.lock();
271            loading = false;
272            finishedLoading.signalAll();
273          }
274          if (shm.isStale()) {
275            // If the peer closed immediately after the shared memory segment
276            // was created, the DomainSocketWatcher callback might already have
277            // fired and marked the shm as stale.  In this case, we obviously
278            // don't want to add the SharedMemorySegment to our list of valid
279            // not-full segments.
280            if (LOG.isDebugEnabled()) {
281              LOG.debug(this + ": the UNIX domain socket associated with " +
282                  "this short-circuit memory closed before we could make " +
283                  "use of the shm.");
284            }
285          } else {
286            notFull.put(shm.getShmId(), shm);
287          }
288        }
289      }
290    }
291    
292    /**
293     * Stop tracking a slot.
294     *
295     * Must be called with the EndpointShmManager lock held.
296     *
297     * @param slot          The slot to release.
298     */
299    void freeSlot(Slot slot) {
300      DfsClientShm shm = (DfsClientShm)slot.getShm();
301      shm.unregisterSlot(slot.getSlotIdx());
302      if (shm.isStale()) {
303        // Stale shared memory segments should not be tracked here.
304        Preconditions.checkState(!full.containsKey(shm.getShmId()));
305        Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
306        if (shm.isEmpty()) {
307          if (LOG.isTraceEnabled()) {
308            LOG.trace(this + ": freeing empty stale " + shm);
309          }
310          shm.free();
311        }
312      } else {
313        ShmId shmId = shm.getShmId();
314        full.remove(shmId); // The shm can't be full if we just freed a slot.
315        if (shm.isEmpty()) {
316          notFull.remove(shmId);
317  
318          // If the shared memory segment is now empty, we call shutdown(2) on
319          // the UNIX domain socket associated with it.  The DomainSocketWatcher,
320          // which is watching this socket, will call DfsClientShm#handle,
321          // cleaning up this shared memory segment.
322          //
323          // See #{DfsClientShmManager#domainSocketWatcher} for details about why
324          // we don't want to call DomainSocketWatcher#remove directly here.
325          //
326          // Note that we could experience 'fragmentation' here, where the
327          // DFSClient allocates a bunch of slots in different shared memory
328          // segments, and then frees most of them, but never fully empties out
329          // any segment.  We make some attempt to avoid this fragmentation by
330          // always allocating new slots out of the shared memory segment with the
331          // lowest ID, but it could still occur.  In most workloads,
332          // fragmentation should not be a major concern, since it doesn't impact
333          // peak file descriptor usage or the speed of allocation.
334          if (LOG.isTraceEnabled()) {
335            LOG.trace(this + ": shutting down UNIX domain socket for " +
336                "empty " + shm);
337          }
338          shutdown(shm);
339        } else {
340          notFull.put(shmId, shm);
341        }
342      }
343    }
344    
345    /**
346     * Unregister a shared memory segment.
347     *
348     * Once a segment is unregistered, we will not allocate any more slots
349     * inside that segment.
350     *
351     * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
352     * lock.
353     *
354     * @param shmId         The ID of the shared memory segment to unregister.
355     */
356    void unregisterShm(ShmId shmId) {
357      lock.lock();
358      try {
359        full.remove(shmId);
360        notFull.remove(shmId);
361      } finally {
362        lock.unlock();
363      }
364    }
365
366    @Override
367    public String toString() {
368      return String.format("EndpointShmManager(%s, parent=%s)",
369          datanode, DfsClientShmManager.this);
370    }
371
    /**
     * Snapshot this endpoint's state for a test visitor.
     *
     * NOTE(review): the returned object aliases the live full/notFull maps
     * rather than copying them; visitors run while the manager lock is held
     * and should not retain or mutate the maps.
     */
    PerDatanodeVisitorInfo getVisitorInfo() {
      return new PerDatanodeVisitorInfo(full, notFull, disabled);
    }
375
    /**
     * Call shutdown(2) on the UNIX domain socket associated with a shared
     * memory segment.  The DomainSocketWatcher watching the socket will then
     * run its cleanup callback for the segment.
     *
     * An IOException from shutdown is logged, not propagated.
     *
     * @param shm           The segment whose socket should be shut down.
     */
    final void shutdown(DfsClientShm shm) {
      try {
        shm.getPeer().getDomainSocket().shutdown();
      } catch (IOException e) {
        LOG.warn(this + ": error shutting down shm: got IOException calling " +
            "shutdown(SHUT_RDWR)", e);
      }
    }
384  }
385
  /**
   * True once {@link #close()} has been called.
   *
   * Protected by the manager lock.
   */
  private boolean closed = false;

  /**
   * The manager lock, guarding all mutable state in this class and in the
   * per-datanode EndpointShmManagers.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * A condition variable which is signalled when we finish loading a segment
   * from the Datanode.
   */
  private final Condition finishedLoading = lock.newCondition();

  /**
   * Information about each Datanode.
   */
  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
      new HashMap<DatanodeInfo, EndpointShmManager>(1);

  /**
   * The DomainSocketWatcher which keeps track of the UNIX domain socket
   * associated with each shared memory segment.
   *
   * Note: because the DomainSocketWatcher makes callbacks into this
   * DfsClientShmManager object, you must MUST NOT attempt to take the
   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
   * or else deadlock might result.   This means that most DomainSocketWatcher
   * methods are off-limits unless you release the manager lock first.
   */
  private final DomainSocketWatcher domainSocketWatcher;
413  
  /**
   * Create a DfsClientShmManager.
   *
   * @param interruptCheckPeriodMs  Interrupt check period, in milliseconds,
   *                                  passed through to the
   *                                  DomainSocketWatcher.
   * @throws IOException            If the DomainSocketWatcher could not be
   *                                  created.
   */
  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
  }
417  
418  public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
419      MutableBoolean usedPeer, ExtendedBlockId blockId,
420      String clientName) throws IOException {
421    lock.lock();
422    try {
423      if (closed) {
424        LOG.trace(this + ": the DfsClientShmManager isclosed.");
425        return null;
426      }
427      EndpointShmManager shmManager = datanodes.get(datanode);
428      if (shmManager == null) {
429        shmManager = new EndpointShmManager(datanode);
430        datanodes.put(datanode, shmManager);
431      }
432      return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
433    } finally {
434      lock.unlock();
435    }
436  }
437  
438  public void freeSlot(Slot slot) {
439    lock.lock();
440    try {
441      DfsClientShm shm = (DfsClientShm)slot.getShm();
442      shm.getEndpointShmManager().freeSlot(slot);
443    } finally {
444      lock.unlock();
445    }
446  }
447
  /**
   * Read-only view of one EndpointShmManager's state, for tests.
   *
   * NOTE(review): the fields alias the manager's live internal maps (see
   * the visitor machinery); visitors run under the manager lock and must
   * not retain or mutate them.
   */
  @VisibleForTesting
  public static class PerDatanodeVisitorInfo {
    public final TreeMap<ShmId, DfsClientShm> full;
    public final TreeMap<ShmId, DfsClientShm> notFull;
    public final boolean disabled;

    PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
        TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
      this.full = full;
      this.notFull = notFull;
      this.disabled = disabled;
    }
  }
461
  /**
   * Callback interface for inspecting per-datanode shared memory state,
   * for tests.
   */
  @VisibleForTesting
  public interface Visitor {
    /**
     * Visit the current state.  Invoked while the manager lock is held, so
     * implementations should be quick and must not re-enter the manager.
     *
     * @param info  Map from datanode to its visitor info.
     * @throws IOException  If the visitor fails.
     */
    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
        throws IOException;
  }
467
468  @VisibleForTesting
469  public void visit(Visitor visitor) throws IOException {
470    lock.lock();
471    try {
472      HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info = 
473          new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
474      for (Entry<DatanodeInfo, EndpointShmManager> entry :
475            datanodes.entrySet()) {
476        info.put(entry.getKey(), entry.getValue().getVisitorInfo());
477      }
478      visitor.visit(info);
479    } finally {
480      lock.unlock();
481    }
482  }
483
484  /**
485   * Close the DfsClientShmManager.
486   */
487  @Override
488  public void close() throws IOException {
489    lock.lock();
490    try {
491      if (closed) return;
492      closed = true;
493    } finally {
494      lock.unlock();
495    }
496    // When closed, the domainSocketWatcher will issue callbacks that mark
497    // all the outstanding DfsClientShm segments as stale.
498    IOUtils.cleanup(LOG, domainSocketWatcher);
499  }
500
501
502  @Override
503  public String toString() {
504    return String.format("ShortCircuitShmManager(%08x)",
505        System.identityHashCode(this));
506  }
507
  /**
   * Expose the DomainSocketWatcher, for tests.
   *
   * @return  The DomainSocketWatcher used by this manager.
   */
  @VisibleForTesting
  public DomainSocketWatcher getDomainSocketWatcher() {
    return domainSocketWatcher;
  }
512}