001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.client;
019
020import com.google.common.annotations.VisibleForTesting;
021import com.google.common.base.Preconditions;
022
023import java.io.BufferedOutputStream;
024import java.io.Closeable;
025import java.io.DataOutputStream;
026import java.io.EOFException;
027import java.io.FileInputStream;
028import java.io.IOException;
029import java.util.HashMap;
030import java.util.TreeMap;
031import java.util.Map.Entry;
032import java.util.concurrent.locks.Condition;
033import java.util.concurrent.locks.ReentrantLock;
034
035import org.apache.commons.lang.mutable.MutableBoolean;
036import org.apache.commons.logging.Log;
037import org.apache.commons.logging.LogFactory;
038import org.apache.hadoop.hdfs.ExtendedBlockId;
039import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
040import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
041import org.apache.hadoop.hdfs.net.DomainPeer;
042import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
043import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
044import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
045import org.apache.hadoop.hdfs.protocolPB.PBHelper;
046import org.apache.hadoop.io.IOUtils;
047import org.apache.hadoop.net.unix.DomainSocket;
048import org.apache.hadoop.net.unix.DomainSocketWatcher;
049import org.apache.hadoop.classification.InterfaceAudience;
050
/**
 * Manages short-circuit memory segments for an HDFS client.
 * 
 * Clients are responsible for requesting and releasing the shared memory
 * segments used for communicating with the DataNode. The client first tries to
 * allocate new slots from its existing segments, and falls back to requesting
 * a new segment from the DataNode via {@link Sender#requestShortCircuitShm}.
 * 
 * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
 * See {@link ShortCircuitRegistry} for more information on the communication protocol.
 */
062@InterfaceAudience.Private
063public class DfsClientShmManager implements Closeable {
064  private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);
065
066  /**
067   * Manages short-circuit memory segments that pertain to a given DataNode.
068   */
069  class EndpointShmManager {
070    /**
071     * The datanode we're managing.
072     */
073    private final DatanodeInfo datanode;
074
075    /**
076     * Shared memory segments which have no empty slots.
077     *
078     * Protected by the manager lock.
079     */
080    private final TreeMap<ShmId, DfsClientShm> full =
081        new TreeMap<ShmId, DfsClientShm>();
082
083    /**
084     * Shared memory segments which have at least one empty slot.
085     *
086     * Protected by the manager lock.
087     */
088    private final TreeMap<ShmId, DfsClientShm> notFull =
089        new TreeMap<ShmId, DfsClientShm>();
090
091    /**
092     * True if this datanode doesn't support short-circuit shared memory
093     * segments.
094     *
095     * Protected by the manager lock.
096     */
097    private boolean disabled = false;
098
099    /**
100     * True if we're in the process of loading a shared memory segment from
101     * this DataNode.
102     *
103     * Protected by the manager lock.
104     */
105    private boolean loading = false;
106
107    EndpointShmManager (DatanodeInfo datanode) {
108      this.datanode = datanode;
109    }
110
111    /**
112     * Pull a slot out of a preexisting shared memory segment.
113     *
114     * Must be called with the manager lock held.
115     *
116     * @param blockId     The blockId to put inside the Slot object.
117     *
118     * @return            null if none of our shared memory segments contain a
119     *                      free slot; the slot object otherwise.
120     */
121    private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
122      if (notFull.isEmpty()) {
123        return null;
124      }
125      Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
126      DfsClientShm shm = entry.getValue();
127      ShmId shmId = shm.getShmId();
128      Slot slot = shm.allocAndRegisterSlot(blockId);
129      if (shm.isFull()) {
130        if (LOG.isTraceEnabled()) {
131          LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
132              " out of " + shm);
133        }
134        DfsClientShm removedShm = notFull.remove(shmId);
135        Preconditions.checkState(removedShm == shm);
136        full.put(shmId, shm);
137      } else {
138        if (LOG.isTraceEnabled()) {
139          LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
140              " out of " + shm);
141        }
142      }
143      return slot;
144    }
145
146    /**
147     * Ask the DataNode for a new shared memory segment.  This function must be
148     * called with the manager lock held.  We will release the lock while
149     * communicating with the DataNode.
150     *
151     * @param clientName    The current client name.
152     * @param peer          The peer to use to talk to the DataNode.
153     *
154     * @return              Null if the DataNode does not support shared memory
155     *                        segments, or experienced an error creating the
156     *                        shm.  The shared memory segment itself on success.
157     * @throws IOException  If there was an error communicating over the socket.
158     *                        We will not throw an IOException unless the socket
159     *                        itself (or the network) is the problem.
160     */
161    private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
162        throws IOException {
163      final DataOutputStream out = 
164          new DataOutputStream(
165              new BufferedOutputStream(peer.getOutputStream()));
166      new Sender(out).requestShortCircuitShm(clientName);
167      ShortCircuitShmResponseProto resp = 
168          ShortCircuitShmResponseProto.parseFrom(
169              PBHelper.vintPrefixed(peer.getInputStream()));
170      String error = resp.hasError() ? resp.getError() : "(unknown)";
171      switch (resp.getStatus()) {
172      case SUCCESS:
173        DomainSocket sock = peer.getDomainSocket();
174        byte buf[] = new byte[1];
175        FileInputStream fis[] = new FileInputStream[1];
176        if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
177          throw new EOFException("got EOF while trying to transfer the " +
178              "file descriptor for the shared memory segment.");
179        }
180        if (fis[0] == null) {
181          throw new IOException("the datanode " + datanode + " failed to " +
182              "pass a file descriptor for the shared memory segment.");
183        }
184        try {
185          DfsClientShm shm = 
186              new DfsClientShm(PBHelper.convert(resp.getId()),
187                  fis[0], this, peer);
188          if (LOG.isTraceEnabled()) {
189            LOG.trace(this + ": createNewShm: created " + shm);
190          }
191          return shm;
192        } finally {
193          IOUtils.cleanup(LOG,  fis[0]);
194        }
195      case ERROR_UNSUPPORTED:
196        // The DataNode just does not support short-circuit shared memory
197        // access, and we should stop asking.
198        LOG.info(this + ": datanode does not support short-circuit " +
199            "shared memory access: " + error);
200        disabled = true;
201        return null;
202      default:
203        // The datanode experienced some kind of unexpected error when trying to
204        // create the short-circuit shared memory segment.
205        LOG.warn(this + ": error requesting short-circuit shared memory " +
206            "access: " + error);
207        return null;
208      }
209    }
210
211    /**
212     * Allocate a new shared memory slot connected to this datanode.
213     *
214     * Must be called with the EndpointShmManager lock held.
215     *
216     * @param peer          The peer to use to talk to the DataNode.
217     * @param clientName    The client name.
218     * @param usedPeer      (out param) Will be set to true if we used the peer.
219     *                        When a peer is used
220     *
221     * @return              null if the DataNode does not support shared memory
222     *                        segments, or experienced an error creating the
223     *                        shm.  The shared memory segment itself on success.
224     * @throws IOException  If there was an error communicating over the socket.
225     */
226    Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
227        String clientName, ExtendedBlockId blockId) throws IOException {
228      while (true) {
229        if (closed) {
230          if (LOG.isTraceEnabled()) {
231            LOG.trace(this + ": the DfsClientShmManager has been closed.");
232          }
233          return null;
234        }
235        if (disabled) {
236          if (LOG.isTraceEnabled()) {
237            LOG.trace(this + ": shared memory segment access is disabled.");
238          }
239          return null;
240        }
241        // Try to use an existing slot.
242        Slot slot = allocSlotFromExistingShm(blockId);
243        if (slot != null) {
244          return slot;
245        }
246        // There are no free slots.  If someone is loading more slots, wait
247        // for that to finish.
248        if (loading) {
249          if (LOG.isTraceEnabled()) {
250            LOG.trace(this + ": waiting for loading to finish...");
251          }
252          finishedLoading.awaitUninterruptibly();
253        } else {
254          // Otherwise, load the slot ourselves.
255          loading = true;
256          lock.unlock();
257          DfsClientShm shm;
258          try {
259            shm = requestNewShm(clientName, peer);
260            if (shm == null) continue;
261            // See #{DfsClientShmManager#domainSocketWatcher} for details
262            // about why we do this before retaking the manager lock.
263            domainSocketWatcher.add(peer.getDomainSocket(), shm);
264            // The DomainPeer is now our responsibility, and should not be
265            // closed by the caller.
266            usedPeer.setValue(true);
267          } finally {
268            lock.lock();
269            loading = false;
270            finishedLoading.signalAll();
271          }
272          if (shm.isStale()) {
273            // If the peer closed immediately after the shared memory segment
274            // was created, the DomainSocketWatcher callback might already have
275            // fired and marked the shm as stale.  In this case, we obviously
276            // don't want to add the SharedMemorySegment to our list of valid
277            // not-full segments.
278            if (LOG.isDebugEnabled()) {
279              LOG.debug(this + ": the UNIX domain socket associated with " +
280                  "this short-circuit memory closed before we could make " +
281                  "use of the shm.");
282            }
283          } else {
284            notFull.put(shm.getShmId(), shm);
285          }
286        }
287      }
288    }
289    
290    /**
291     * Stop tracking a slot.
292     *
293     * Must be called with the EndpointShmManager lock held.
294     *
295     * @param slot          The slot to release.
296     */
297    void freeSlot(Slot slot) {
298      DfsClientShm shm = (DfsClientShm)slot.getShm();
299      shm.unregisterSlot(slot.getSlotIdx());
300      if (shm.isStale()) {
301        // Stale shared memory segments should not be tracked here.
302        Preconditions.checkState(!full.containsKey(shm.getShmId()));
303        Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
304        if (shm.isEmpty()) {
305          if (LOG.isTraceEnabled()) {
306            LOG.trace(this + ": freeing empty stale " + shm);
307          }
308          shm.free();
309        }
310      } else {
311        ShmId shmId = shm.getShmId();
312        full.remove(shmId); // The shm can't be full if we just freed a slot.
313        if (shm.isEmpty()) {
314          notFull.remove(shmId);
315  
316          // If the shared memory segment is now empty, we call shutdown(2) on
317          // the UNIX domain socket associated with it.  The DomainSocketWatcher,
318          // which is watching this socket, will call DfsClientShm#handle,
319          // cleaning up this shared memory segment.
320          //
321          // See #{DfsClientShmManager#domainSocketWatcher} for details about why
322          // we don't want to call DomainSocketWatcher#remove directly here.
323          //
324          // Note that we could experience 'fragmentation' here, where the
325          // DFSClient allocates a bunch of slots in different shared memory
326          // segments, and then frees most of them, but never fully empties out
327          // any segment.  We make some attempt to avoid this fragmentation by
328          // always allocating new slots out of the shared memory segment with the
329          // lowest ID, but it could still occur.  In most workloads,
330          // fragmentation should not be a major concern, since it doesn't impact
331          // peak file descriptor usage or the speed of allocation.
332          if (LOG.isTraceEnabled()) {
333            LOG.trace(this + ": shutting down UNIX domain socket for " +
334                "empty " + shm);
335          }
336          shutdown(shm);
337        } else {
338          notFull.put(shmId, shm);
339        }
340      }
341    }
342    
343    /**
344     * Unregister a shared memory segment.
345     *
346     * Once a segment is unregistered, we will not allocate any more slots
347     * inside that segment.
348     *
349     * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
350     * lock.
351     *
352     * @param shmId         The ID of the shared memory segment to unregister.
353     */
354    void unregisterShm(ShmId shmId) {
355      lock.lock();
356      try {
357        full.remove(shmId);
358        notFull.remove(shmId);
359      } finally {
360        lock.unlock();
361      }
362    }
363
364    @Override
365    public String toString() {
366      return String.format("EndpointShmManager(%s, parent=%s)",
367          datanode, DfsClientShmManager.this);
368    }
369
370    PerDatanodeVisitorInfo getVisitorInfo() {
371      return new PerDatanodeVisitorInfo(full, notFull, disabled);
372    }
373
374    final void shutdown(DfsClientShm shm) {
375      try {
376        shm.getPeer().getDomainSocket().shutdown();
377      } catch (IOException e) {
378        LOG.warn(this + ": error shutting down shm: got IOException calling " +
379            "shutdown(SHUT_RDWR)", e);
380      }
381    }
382  }
383
384  private boolean closed = false;
385
386  private final ReentrantLock lock = new ReentrantLock();
387
388  /**
389   * A condition variable which is signalled when we finish loading a segment
390   * from the Datanode.
391   */
392  private final Condition finishedLoading = lock.newCondition();
393
394  /**
395   * Information about each Datanode.
396   */
397  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
398      new HashMap<DatanodeInfo, EndpointShmManager>(1);
399  
400  /**
401   * The DomainSocketWatcher which keeps track of the UNIX domain socket
402   * associated with each shared memory segment.
403   *
404   * Note: because the DomainSocketWatcher makes callbacks into this
405   * DfsClientShmManager object, you must MUST NOT attempt to take the
406   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
407   * or else deadlock might result.   This means that most DomainSocketWatcher
408   * methods are off-limits unless you release the manager lock first.
409   */
410  private final DomainSocketWatcher domainSocketWatcher;
411  
412  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
413    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
414  }
415  
416  public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
417      MutableBoolean usedPeer, ExtendedBlockId blockId,
418      String clientName) throws IOException {
419    lock.lock();
420    try {
421      if (closed) {
422        LOG.trace(this + ": the DfsClientShmManager isclosed.");
423        return null;
424      }
425      EndpointShmManager shmManager = datanodes.get(datanode);
426      if (shmManager == null) {
427        shmManager = new EndpointShmManager(datanode);
428        datanodes.put(datanode, shmManager);
429      }
430      return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
431    } finally {
432      lock.unlock();
433    }
434  }
435  
436  public void freeSlot(Slot slot) {
437    lock.lock();
438    try {
439      DfsClientShm shm = (DfsClientShm)slot.getShm();
440      shm.getEndpointShmManager().freeSlot(slot);
441    } finally {
442      lock.unlock();
443    }
444  }
445
446  @VisibleForTesting
447  public static class PerDatanodeVisitorInfo {
448    public final TreeMap<ShmId, DfsClientShm> full;
449    public final TreeMap<ShmId, DfsClientShm> notFull;
450    public final boolean disabled;
451
452    PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
453        TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
454      this.full = full;
455      this.notFull = notFull;
456      this.disabled = disabled;
457    }
458  }
459
460  @VisibleForTesting
461  public interface Visitor {
462    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
463        throws IOException;
464  }
465
466  @VisibleForTesting
467  public void visit(Visitor visitor) throws IOException {
468    lock.lock();
469    try {
470      HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info = 
471          new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
472      for (Entry<DatanodeInfo, EndpointShmManager> entry :
473            datanodes.entrySet()) {
474        info.put(entry.getKey(), entry.getValue().getVisitorInfo());
475      }
476      visitor.visit(info);
477    } finally {
478      lock.unlock();
479    }
480  }
481
482  /**
483   * Close the DfsClientShmManager.
484   */
485  @Override
486  public void close() throws IOException {
487    lock.lock();
488    try {
489      if (closed) return;
490      closed = true;
491    } finally {
492      lock.unlock();
493    }
494    // When closed, the domainSocketWatcher will issue callbacks that mark
495    // all the outstanding DfsClientShm segments as stale.
496    IOUtils.cleanup(LOG, domainSocketWatcher);
497  }
498
499
500  @Override
501  public String toString() {
502    return String.format("ShortCircuitShmManager(%08x)",
503        System.identityHashCode(this));
504  }
505
506  @VisibleForTesting
507  public DomainSocketWatcher getDomainSocketWatcher() {
508    return domainSocketWatcher;
509  }
510}