001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.client;
019
020import java.io.BufferedOutputStream;
021import java.io.Closeable;
022import java.io.DataInputStream;
023import java.io.DataOutputStream;
024
025import org.apache.hadoop.classification.InterfaceAudience;
026
027import java.io.IOException;
028import java.nio.MappedByteBuffer;
029import java.util.HashMap;
030import java.util.Map;
031import java.util.Map.Entry;
032import java.util.TreeMap;
033import java.util.concurrent.ScheduledFuture;
034import java.util.concurrent.ScheduledThreadPoolExecutor;
035import java.util.concurrent.TimeUnit;
036import java.util.concurrent.locks.Condition;
037import java.util.concurrent.locks.ReentrantLock;
038
039import org.apache.commons.lang.mutable.MutableBoolean;
040import org.apache.commons.logging.Log;
041import org.apache.commons.logging.LogFactory;
042import org.apache.hadoop.conf.Configuration;
043import org.apache.hadoop.hdfs.ExtendedBlockId;
044import org.apache.hadoop.hdfs.DFSConfigKeys;
045import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
046import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
047import org.apache.hadoop.hdfs.net.DomainPeer;
048import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
049import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
050import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
051import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
052import org.apache.hadoop.hdfs.protocolPB.PBHelper;
053import org.apache.hadoop.io.IOUtils;
054import org.apache.hadoop.ipc.RetriableException;
055import org.apache.hadoop.net.unix.DomainSocket;
056import org.apache.hadoop.net.unix.DomainSocketWatcher;
057import org.apache.hadoop.security.token.SecretManager.InvalidToken;
058import org.apache.hadoop.util.StringUtils;
059import org.apache.hadoop.util.Time;
060import org.apache.hadoop.util.Waitable;
061
062import com.google.common.annotations.VisibleForTesting;
063import com.google.common.base.Preconditions;
064import com.google.common.util.concurrent.ThreadFactoryBuilder;
065
066/**
067 * The ShortCircuitCache tracks things which the client needs to access
068 * HDFS block files via short-circuit.
069 *
070 * These things include: memory-mapped regions, file descriptors, and shared
071 * memory areas for communicating with the DataNode.
072 */
073@InterfaceAudience.Private
074public class ShortCircuitCache implements Closeable {
075  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
076
077  /**
078   * Expiry thread which makes sure that the file descriptors get closed
079   * after a while.
080   */
081  private class CacheCleaner implements Runnable, Closeable {
082    private ScheduledFuture<?> future;
083
084    /**
085     * Run the CacheCleaner thread.
086     *
087     * Whenever a thread requests a ShortCircuitReplica object, we will make
088     * sure it gets one.  That ShortCircuitReplica object can then be re-used
089     * when another thread requests a ShortCircuitReplica object for the same
090     * block.  So in that sense, there is no maximum size to the cache.
091     *
092     * However, when a ShortCircuitReplica object is unreferenced by the
093     * thread(s) that are using it, it becomes evictable.  There are two
094     * separate eviction lists-- one for mmaped objects, and another for
095     * non-mmaped objects.  We do this in order to avoid having the regular
096     * files kick the mmaped files out of the cache too quickly.  Reusing
097     * an already-existing mmap gives a huge performance boost, since the
098     * page table entries don't have to be re-populated.  Both the mmap
099     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
100     */
101    @Override
102    public void run() {
103      ShortCircuitCache.this.lock.lock();
104      try {
105        if (ShortCircuitCache.this.closed) return;
106        long curMs = Time.monotonicNow();
107
108        if (LOG.isDebugEnabled()) {
109          LOG.debug(this + ": cache cleaner running at " + curMs);
110        }
111
112        int numDemoted = demoteOldEvictableMmaped(curMs);
113        int numPurged = 0;
114        Long evictionTimeNs = Long.valueOf(0);
115        while (true) {
116          Entry<Long, ShortCircuitReplica> entry = 
117              evictableMmapped.ceilingEntry(evictionTimeNs);
118          if (entry == null) break;
119          evictionTimeNs = entry.getKey();
120          long evictionTimeMs = 
121              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
122          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
123          ShortCircuitReplica replica = entry.getValue();
124          if (LOG.isTraceEnabled()) {
125            LOG.trace("CacheCleaner: purging " + replica + ": " + 
126                  StringUtils.getStackTrace(Thread.currentThread()));
127          }
128          purge(replica);
129          numPurged++;
130        }
131
132        if (LOG.isDebugEnabled()) {
133          LOG.debug(this + ": finishing cache cleaner run started at " +
134            curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
135            "purged " + numPurged + " replicas.");
136        }
137      } finally {
138        ShortCircuitCache.this.lock.unlock();
139      }
140    }
141
142    @Override
143    public void close() throws IOException {
144      if (future != null) {
145        future.cancel(false);
146      }
147    }
148
149    public void setFuture(ScheduledFuture<?> future) {
150      this.future = future;
151    }
152
153    /**
154     * Get the rate at which this cleaner thread should be scheduled.
155     *
156     * We do this by taking the minimum expiration time and dividing by 4.
157     *
158     * @return the rate in milliseconds at which this thread should be
159     *         scheduled.
160     */
161    public long getRateInMs() {
162      long minLifespanMs =
163          Math.min(maxNonMmappedEvictableLifespanMs,
164              maxEvictableMmapedLifespanMs);
165      long sampleTimeMs = minLifespanMs / 4;
166      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
167    }
168  }
169
170  /**
171   * A task which asks the DataNode to release a short-circuit shared memory
172   * slot.  If successful, this will tell the DataNode to stop monitoring
173   * changes to the mlock status of the replica associated with the slot.
174   * It will also allow us (the client) to re-use this slot for another
175   * replica.  If we can't communicate with the DataNode for some reason,
176   * we tear down the shared memory segment to avoid being in an inconsistent
177   * state.
178   */
179  private class SlotReleaser implements Runnable {
180    /**
181     * The slot that we need to release.
182     */
183    private final Slot slot;
184
185    SlotReleaser(Slot slot) {
186      this.slot = slot;
187    }
188
189    @Override
190    public void run() {
191      if (LOG.isTraceEnabled()) {
192        LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
193      }
194      final DfsClientShm shm = (DfsClientShm)slot.getShm();
195      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
196      DomainSocket sock = null;
197      DataOutputStream out = null;
198      final String path = shmSock.getPath();
199      boolean success = false;
200      try {
201        sock = DomainSocket.connect(path);
202        out = new DataOutputStream(
203            new BufferedOutputStream(sock.getOutputStream()));
204        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
205        DataInputStream in = new DataInputStream(sock.getInputStream());
206        ReleaseShortCircuitAccessResponseProto resp =
207            ReleaseShortCircuitAccessResponseProto.parseFrom(
208                PBHelper.vintPrefixed(in));
209        if (resp.getStatus() != Status.SUCCESS) {
210          String error = resp.hasError() ? resp.getError() : "(unknown)";
211          throw new IOException(resp.getStatus().toString() + ": " + error);
212        }
213        if (LOG.isTraceEnabled()) {
214          LOG.trace(ShortCircuitCache.this + ": released " + slot);
215        }
216        success = true;
217      } catch (IOException e) {
218        LOG.error(ShortCircuitCache.this + ": failed to release " +
219            "short-circuit shared memory slot " + slot + " by sending " +
220            "ReleaseShortCircuitAccessRequestProto to " + path +
221            ".  Closing shared memory segment.", e);
222      } finally {
223        if (success) {
224          shmManager.freeSlot(slot);
225        } else {
226          shm.getEndpointShmManager().shutdown(shm);
227        }
228        IOUtils.cleanup(LOG, sock, out);
229      }
230    }
231  }
232
  /**
   * Callback used by the cache to load a replica it does not currently hold.
   *
   * An implementation is passed to fetchOrCreate, which invokes it when no
   * usable entry for the requested key is found in the cache.
   */
  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * @return a non-null ShortCircuitReplicaInfo object.
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }
243
  /**
   * Lock protecting the cache.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
  = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
          build());

  /**
   * The executor service whose single daemon thread is named
   * "ShortCircuitCache_SlotReleaser".
   *
   * NOTE(review): presumably this runs SlotReleaser tasks; the place where
   * tasks are submitted is not visible in this part of the file -- confirm.
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
          build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> 
      replicaInfoMap = new HashMap<ExtendedBlockId,
          Waitable<ShortCircuitReplicaInfo>>();

  /**
   * The CacheCleaner.  We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * Tree of evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictable =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum total size of the cache, including both mmapped and
   * non-mmapped elements.  trimEvictionMaps compares it against the
   * combined size of the two eviction maps.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * Tree of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum number of mmaped evictable elements.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements older than this will be closed.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.
   * Null when the constructor did not (or could not) create a manager.
   */
  private final DfsClientShmManager shmManager;
343
344  /**
345   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
346   */
347  public static ShortCircuitCache fromConf(Configuration conf) {
348    return new ShortCircuitCache(
349        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
350            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
351        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
352            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
353        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
354            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
355        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
356            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
357        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
358            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
359        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
360            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
361        conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
362            DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
363  }
364
365  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
366      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
367      long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
368    Preconditions.checkArgument(maxTotalSize >= 0);
369    this.maxTotalSize = maxTotalSize;
370    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
371    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
372    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
373    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
374    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
375    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
376    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
377    this.staleThresholdMs = staleThresholdMs;
378    DfsClientShmManager shmManager = null;
379    if ((shmInterruptCheckMs > 0) &&
380        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
381      try {
382        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
383      } catch (IOException e) {
384        LOG.error("failed to create ShortCircuitShmManager", e);
385      }
386    }
387    this.shmManager = shmManager;
388  }
389
390  public long getMmapRetryTimeoutMs() {
391    return mmapRetryTimeoutMs;
392  }
393
394  public long getStaleThresholdMs() {
395    return staleThresholdMs;
396  }
397
398  /**
399   * Increment the reference count of a replica, and remove it from any free
400   * list it may be in.
401   *
402   * You must hold the cache lock while calling this function.
403   *
404   * @param replica      The replica we're removing.
405   */
406  private void ref(ShortCircuitReplica replica) {
407    lock.lock();
408    try {
409      Preconditions.checkArgument(replica.refCount > 0,
410          "can't ref " + replica + " because its refCount reached " +
411          replica.refCount);
412      Long evictableTimeNs = replica.getEvictableTimeNs();
413      replica.refCount++;
414      if (evictableTimeNs != null) {
415        String removedFrom = removeEvictable(replica);
416        if (LOG.isTraceEnabled()) {
417          LOG.trace(this + ": " + removedFrom +
418              " no longer contains " + replica + ".  refCount " +
419              (replica.refCount - 1) + " -> " + replica.refCount +
420              StringUtils.getStackTrace(Thread.currentThread()));
421
422        }
423      } else if (LOG.isTraceEnabled()) {
424        LOG.trace(this + ": replica  refCount " +
425            (replica.refCount - 1) + " -> " + replica.refCount +
426            StringUtils.getStackTrace(Thread.currentThread()));
427      }
428    } finally {
429      lock.unlock();
430    }
431  }
432
  /**
   * Unreference a replica.
   *
   * Drops one reference.  Depending on the resulting count the replica is
   * closed (count 0), made evictable (count 1 and not purged), or left alone.
   *
   * The cache lock is acquired here; it is reentrant, so callers that already
   * hold it (such as purge) may call this too.
   *
   * @param replica   The replica being unreferenced.
   */
  void unref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      // If the replica is stale, but we haven't purged it yet, let's do that.
      // It would be a shame to evict a non-stale replica so that we could put
      // a stale one into the cache.
      // Note that purge() itself calls unref(), so this drops one reference
      // in addition to the one dropped below.
      if ((!replica.purged) && replica.isStale()) {
        purge(replica);
      }
      String addedString = "";
      boolean shouldTrimEvictionMaps = false;
      int newRefCount = --replica.refCount;
      if (newRefCount == 0) {
        // Close replica, since there are no remaining references to it.
        Preconditions.checkArgument(replica.purged,
            "Replica " + replica + " reached a refCount of 0 without " +
            "being purged");
        replica.close();
      } else if (newRefCount == 1) {
        // A single remaining reference: the replica must not already be in
        // an eviction map at this point.
        Preconditions.checkState(null == replica.getEvictableTimeNs(),
            "Replica " + replica + " had a refCount higher than 1, " +
              "but was still evictable (evictableTimeNs = " +
                replica.getEvictableTimeNs() + ")");
        if (!replica.purged) {
          // Add the replica to the end of an eviction list.
          // Eviction lists are sorted by time.
          if (replica.hasMmap()) {
            insertEvictable(System.nanoTime(), replica, evictableMmapped);
            addedString = "added to evictableMmapped, ";
          } else {
            insertEvictable(System.nanoTime(), replica, evictable);
            addedString = "added to evictable, ";
          }
          // An eviction map just grew, so the size limits are re-checked
          // once the trace message below has been written.
          shouldTrimEvictionMaps = true;
        }
      } else {
        Preconditions.checkArgument(replica.refCount >= 0,
            "replica's refCount went negative (refCount = " +
            replica.refCount + " for " + replica + ")");
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": unref replica " + replica +
            ": " + addedString + " refCount " +
            (newRefCount + 1) + " -> " + newRefCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      if (shouldTrimEvictionMaps) {
        trimEvictionMaps();
      }
    } finally {
      lock.unlock();
    }
  }
493
494  /**
495   * Demote old evictable mmaps into the regular eviction map.
496   *
497   * You must hold the cache lock while calling this function.
498   *
499   * @param now   Current time in monotonic milliseconds.
500   * @return      Number of replicas demoted.
501   */
502  private int demoteOldEvictableMmaped(long now) {
503    int numDemoted = 0;
504    boolean needMoreSpace = false;
505    Long evictionTimeNs = Long.valueOf(0);
506
507    while (true) {
508      Entry<Long, ShortCircuitReplica> entry = 
509          evictableMmapped.ceilingEntry(evictionTimeNs);
510      if (entry == null) break;
511      evictionTimeNs = entry.getKey();
512      long evictionTimeMs = 
513          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
514      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
515        if (evictableMmapped.size() < maxEvictableMmapedSize) {
516          break;
517        }
518        needMoreSpace = true;
519      }
520      ShortCircuitReplica replica = entry.getValue();
521      if (LOG.isTraceEnabled()) {
522        String rationale = needMoreSpace ? "because we need more space" : 
523            "because it's too old";
524        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
525            rationale + ": " +
526            StringUtils.getStackTrace(Thread.currentThread()));
527      }
528      removeEvictable(replica, evictableMmapped);
529      munmap(replica);
530      insertEvictable(evictionTimeNs, replica, evictable);
531      numDemoted++;
532    }
533    return numDemoted;
534  }
535
536  /**
537   * Trim the eviction lists.
538   */
539  private void trimEvictionMaps() {
540    long now = Time.monotonicNow();
541    demoteOldEvictableMmaped(now);
542
543    while (true) {
544      long evictableSize = evictable.size();
545      long evictableMmappedSize = evictableMmapped.size();
546      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
547        return;
548      }
549      ShortCircuitReplica replica;
550      if (evictableSize == 0) {
551       replica = evictableMmapped.firstEntry().getValue();
552      } else {
553       replica = evictable.firstEntry().getValue();
554      }
555      if (LOG.isTraceEnabled()) {
556        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
557          StringUtils.getStackTrace(Thread.currentThread()));
558      }
559      purge(replica);
560    }
561  }
562
563  /**
564   * Munmap a replica, updating outstandingMmapCount.
565   *
566   * @param replica  The replica to munmap.
567   */
568  private void munmap(ShortCircuitReplica replica) {
569    replica.munmap();
570    outstandingMmapCount--;
571  }
572
573  /**
574   * Remove a replica from an evictable map.
575   *
576   * @param replica   The replica to remove.
577   * @return          The map it was removed from.
578   */
579  private String removeEvictable(ShortCircuitReplica replica) {
580    if (replica.hasMmap()) {
581      removeEvictable(replica, evictableMmapped);
582      return "evictableMmapped";
583    } else {
584      removeEvictable(replica, evictable);
585      return "evictable";
586    }
587  }
588
589  /**
590   * Remove a replica from an evictable map.
591   *
592   * @param replica   The replica to remove.
593   * @param map       The map to remove it from.
594   */
595  private void removeEvictable(ShortCircuitReplica replica,
596      TreeMap<Long, ShortCircuitReplica> map) {
597    Long evictableTimeNs = replica.getEvictableTimeNs();
598    Preconditions.checkNotNull(evictableTimeNs);
599    ShortCircuitReplica removed = map.remove(evictableTimeNs);
600    Preconditions.checkState(removed == replica,
601        "failed to make " + replica + " unevictable");
602    replica.setEvictableTimeNs(null);
603  }
604
605  /**
606   * Insert a replica into an evictable map.
607   *
608   * If an element already exists with this eviction time, we add a nanosecond
609   * to it until we find an unused key.
610   *
611   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
612   * @param replica          The replica to insert.
613   * @param map              The map to insert it into.
614   */
615  private void insertEvictable(Long evictionTimeNs,
616      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
617    while (map.containsKey(evictionTimeNs)) {
618      evictionTimeNs++;
619    }
620    Preconditions.checkState(null == replica.getEvictableTimeNs());
621    Long time = Long.valueOf(evictionTimeNs);
622    replica.setEvictableTimeNs(time);
623    map.put(time, replica);
624  }
625
626  /**
627   * Purge a replica from the cache.
628   *
629   * This doesn't necessarily close the replica, since there may be
630   * outstanding references to it.  However, it does mean the cache won't
631   * hand it out to anyone after this.
632   *
633   * You must hold the cache lock while calling this function.
634   *
635   * @param replica   The replica being removed.
636   */
637  private void purge(ShortCircuitReplica replica) {
638    boolean removedFromInfoMap = false;
639    String evictionMapName = null;
640    Preconditions.checkArgument(!replica.purged);
641    replica.purged = true;
642    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
643    if (val != null) {
644      ShortCircuitReplicaInfo info = val.getVal();
645      if ((info != null) && (info.getReplica() == replica)) {
646        replicaInfoMap.remove(replica.key);
647        removedFromInfoMap = true;
648      }
649    }
650    Long evictableTimeNs = replica.getEvictableTimeNs();
651    if (evictableTimeNs != null) {
652      evictionMapName = removeEvictable(replica);
653    }
654    if (LOG.isTraceEnabled()) {
655      StringBuilder builder = new StringBuilder();
656      builder.append(this).append(": ").append(": purged ").
657          append(replica).append(" from the cache.");
658      if (removedFromInfoMap) {
659        builder.append("  Removed from the replicaInfoMap.");
660      }
661      if (evictionMapName != null) {
662        builder.append("  Removed from ").append(evictionMapName);
663      }
664      LOG.trace(builder.toString());
665    }
666    unref(replica);
667  }
668
  /**
   * Fetch or create a replica.
   *
   * This method acquires the cache lock itself while it looks the key up.
   * If no usable entry is obtained, a new Waitable is registered for the key
   * and the replica is loaded through the creator after this method has
   * released its hold on the lock.
   *
   * @param key          Key to use for lookup.
   * @param creator      Replica creator callback.  Will be called without
   *                     the cache lock being held.
   *
   * @return             Null if the cache is closed.  Otherwise a
   *                     ShortCircuitReplicaInfo, which may hold a replica,
   *                     an InvalidToken exception, or neither.
   */
  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator) {
    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
    lock.lock();
    try {
      ShortCircuitReplicaInfo info = null;
      do {
        if (closed) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": can't fetchOrCreate " + key +
                " because the cache is closed.");
          }
          return null;
        }
        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
        if (waitable != null) {
          try {
            info = fetch(key, waitable);
          } catch (RetriableException e) {
            if (LOG.isDebugEnabled()) {
              LOG.debug(this + ": retrying " + e.getMessage());
            }
            // NOTE(review): in a do/while loop, "continue" jumps to the loop
            // condition, which is the constant false.  So this leaves the
            // loop with info == null instead of looking the key up again,
            // and the code below registers a new Waitable and creates the
            // replica.  Confirm that this is the intended "retry".
            continue;
          }
        }
      } while (false);
      if (info != null) return info;
      // We need to load the replica ourselves.
      // The put below replaces whatever Waitable was registered for the key.
      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
      replicaInfoMap.put(key, newWaitable);
    } finally {
      lock.unlock();
    }
    return create(key, creator, newWaitable);
  }
716
717  /**
718   * Fetch an existing ReplicaInfo object.
719   *
720   * @param key       The key that we're using.
721   * @param waitable  The waitable object to wait on.
722   * @return          The existing ReplicaInfo object, or null if there is
723   *                  none.
724   *
725   * @throws RetriableException   If the caller needs to retry.
726   */
727  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
728      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
729    // Another thread is already in the process of loading this
730    // ShortCircuitReplica.  So we simply wait for it to complete.
731    ShortCircuitReplicaInfo info;
732    try {
733      if (LOG.isTraceEnabled()) {
734        LOG.trace(this + ": found waitable for " + key);
735      }
736      info = waitable.await();
737    } catch (InterruptedException e) {
738      LOG.info(this + ": interrupted while waiting for " + key);
739      Thread.currentThread().interrupt();
740      throw new RetriableException("interrupted");
741    }
742    if (info.getInvalidTokenException() != null) {
743      LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
744            "exception.", info.getInvalidTokenException());
745      return info;
746    }
747    ShortCircuitReplica replica = info.getReplica();
748    if (replica == null) {
749      LOG.warn(this + ": failed to get " + key);
750      return info;
751    }
752    if (replica.purged) {
753      // Ignore replicas that have already been purged from the cache.
754      throw new RetriableException("Ignoring purged replica " +
755          replica + ".  Retrying.");
756    }
757    // Check if the replica is stale before using it.
758    // If it is, purge it and retry.
759    if (replica.isStale()) {
760      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
761          "this replica from the replicaInfoMap and retrying.");
762      // Remove the cache's reference to the replica.  This may or may not
763      // trigger a close.
764      purge(replica);
765      throw new RetriableException("ignoring stale replica " + replica);
766    }
767    ref(replica);
768    return info;
769  }
770
771  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
772      ShortCircuitReplicaCreator creator,
773      Waitable<ShortCircuitReplicaInfo> newWaitable) {
774    // Handle loading a new replica.
775    ShortCircuitReplicaInfo info = null;
776    try {
777      if (LOG.isTraceEnabled()) {
778        LOG.trace(this + ": loading " + key);
779      }
780      info = creator.createShortCircuitReplicaInfo();
781    } catch (RuntimeException e) {
782      LOG.warn(this + ": failed to load " + key, e);
783    }
784    if (info == null) info = new ShortCircuitReplicaInfo();
785    lock.lock();
786    try {
787      if (info.getReplica() != null) {
788        // On success, make sure the cache cleaner thread is running.
789        if (LOG.isTraceEnabled()) {
790          LOG.trace(this + ": successfully loaded " + info.getReplica());
791        }
792        startCacheCleanerThreadIfNeeded();
793        // Note: new ShortCircuitReplicas start with a refCount of 2,
794        // indicating that both this cache and whoever requested the 
795        // creation of the replica hold a reference.  So we don't need
796        // to increment the reference count here.
797      } else {
798        // On failure, remove the waitable from the replicaInfoMap.
799        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
800        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
801        if (info.getInvalidTokenException() != null) {
802          LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
803              "exception.", info.getInvalidTokenException());
804        } else {
805          LOG.warn(this + ": failed to load " + key);
806        }
807      }
808      newWaitable.provide(info);
809    } finally {
810      lock.unlock();
811    }
812    return info;
813  }
814
815  private void startCacheCleanerThreadIfNeeded() {
816    if (cacheCleaner == null) {
817      cacheCleaner = new CacheCleaner();
818      long rateMs = cacheCleaner.getRateInMs();
819      ScheduledFuture<?> future =
820          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
821              TimeUnit.MILLISECONDS);
822      cacheCleaner.setFuture(future);
823      if (LOG.isDebugEnabled()) {
824        LOG.debug(this + ": starting cache cleaner thread which will run " +
825          "every " + rateMs + " ms");
826      }
827    }
828  }
829
830  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
831      boolean anchored) {
832    Condition newCond;
833    lock.lock();
834    try {
835      while (replica.mmapData != null) {
836        if (replica.mmapData instanceof MappedByteBuffer) {
837          ref(replica);
838          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
839          return new ClientMmap(replica, mmap, anchored);
840        } else if (replica.mmapData instanceof Long) {
841          long lastAttemptTimeMs = (Long)replica.mmapData;
842          long delta = Time.monotonicNow() - lastAttemptTimeMs;
843          if (delta < staleThresholdMs) {
844            if (LOG.isTraceEnabled()) {
845              LOG.trace(this + ": can't create client mmap for " +
846                  replica + " because we failed to " +
847                  "create one just " + delta + "ms ago.");
848            }
849            return null;
850          }
851          if (LOG.isTraceEnabled()) {
852            LOG.trace(this + ": retrying client mmap for " + replica +
853                ", " + delta + " ms after the previous failure.");
854          }
855        } else if (replica.mmapData instanceof Condition) {
856          Condition cond = (Condition)replica.mmapData;
857          cond.awaitUninterruptibly();
858        } else {
859          Preconditions.checkState(false, "invalid mmapData type " +
860              replica.mmapData.getClass().getName());
861        }
862      }
863      newCond = lock.newCondition();
864      replica.mmapData = newCond;
865    } finally {
866      lock.unlock();
867    }
868    MappedByteBuffer map = replica.loadMmapInternal();
869    lock.lock();
870    try {
871      if (map == null) {
872        replica.mmapData = Long.valueOf(Time.monotonicNow());
873        newCond.signalAll();
874        return null;
875      } else {
876        outstandingMmapCount++;
877        replica.mmapData = map;
878        ref(replica);
879        newCond.signalAll();
880        return new ClientMmap(replica, map, anchored);
881      }
882    } finally {
883      lock.unlock();
884    }
885  }
886
887  /**
888   * Close the cache and free all associated resources.
889   */
890  @Override
891  public void close() {
892    try {
893      lock.lock();
894      if (closed) return;
895      closed = true;
896      LOG.info(this + ": closing");
897      maxNonMmappedEvictableLifespanMs = 0;
898      maxEvictableMmapedSize = 0;
899      // Close and join cacheCleaner thread.
900      IOUtils.cleanup(LOG, cacheCleaner);
901      // Purge all replicas.
902      while (true) {
903        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
904        if (entry == null) break;
905        purge(entry.getValue());
906      }
907      while (true) {
908        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
909        if (entry == null) break;
910        purge(entry.getValue());
911      }
912    } finally {
913      lock.unlock();
914    }
915    IOUtils.cleanup(LOG, shmManager);
916  }
917
  /**
   * Callback through which {@link #accept(CacheVisitor)} exposes the
   * internal state of the cache.
   */
  @VisibleForTesting // ONLY for testing
  public interface CacheVisitor {
    /**
     * Inspect the state of the cache.
     *
     * @param numOutstandingMmaps  The cache's outstanding mmap count.
     * @param replicas             Successfully loaded replicas, by block ID.
     * @param failedLoads          Failed loads, by block ID.  The value is
     *                               the InvalidToken exception for the load,
     *                               and may be null when the failure was not
     *                               caused by an invalid block token.
     * @param evictable            The cache's evictable map (the live map,
     *                               not a copy).
     * @param evictableMmapped     The cache's evictableMmapped map (the live
     *                               map, not a copy).
     */
    void visit(int numOutstandingMmaps,
        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
        Map<ExtendedBlockId, InvalidToken> failedLoads,
        Map<Long, ShortCircuitReplica> evictable,
        Map<Long, ShortCircuitReplica> evictableMmapped);
  }
926
927  @VisibleForTesting // ONLY for testing
928  public void accept(CacheVisitor visitor) {
929    lock.lock();
930    try {
931      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
932          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
933      Map<ExtendedBlockId, InvalidToken> failedLoads =
934          new HashMap<ExtendedBlockId, InvalidToken>();
935      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
936            replicaInfoMap.entrySet()) {
937        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
938        if (waitable.hasVal()) {
939          if (waitable.getVal().getReplica() != null) {
940            replicas.put(entry.getKey(), waitable.getVal().getReplica());
941          } else {
942            // The exception may be null here, indicating a failed load that
943            // isn't the result of an invalid block token.
944            failedLoads.put(entry.getKey(),
945                waitable.getVal().getInvalidTokenException());
946          }
947        }
948      }
949      if (LOG.isDebugEnabled()) {
950        StringBuilder builder = new StringBuilder();
951        builder.append("visiting ").append(visitor.getClass().getName()).
952            append("with outstandingMmapCount=").append(outstandingMmapCount).
953            append(", replicas=");
954        String prefix = "";
955        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
956          builder.append(prefix).append(entry.getValue());
957          prefix = ",";
958        }
959        prefix = "";
960        builder.append(", failedLoads=");
961        for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
962          builder.append(prefix).append(entry.getValue());
963          prefix = ",";
964        }
965        prefix = "";
966        builder.append(", evictable=");
967        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
968          builder.append(prefix).append(entry.getKey()).
969              append(":").append(entry.getValue());
970          prefix = ",";
971        }
972        prefix = "";
973        builder.append(", evictableMmapped=");
974        for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
975          builder.append(prefix).append(entry.getKey()).
976              append(":").append(entry.getValue());
977          prefix = ",";
978        }
979        LOG.debug(builder.toString());
980      }
981      visitor.visit(outstandingMmapCount, replicas, failedLoads,
982            evictable, evictableMmapped);
983    } finally {
984      lock.unlock();
985    }
986  }
987
988  @Override
989  public String toString() {
990    return "ShortCircuitCache(0x" +
991        Integer.toHexString(System.identityHashCode(this)) + ")";
992  }
993
994  /**
995   * Allocate a new shared memory slot.
996   *
997   * @param datanode       The datanode to allocate a shm slot with.
998   * @param peer           A peer connected to the datanode.
999   * @param usedPeer       Will be set to true if we use up the provided peer.
1000   * @param blockId        The block id and block pool id of the block we're 
1001   *                         allocating this slot for.
1002   * @param clientName     The name of the DFSClient allocating the shared
1003   *                         memory.
1004   * @return               Null if short-circuit shared memory is disabled;
1005   *                         a short-circuit memory slot otherwise.
1006   * @throws IOException   An exception if there was an error talking to 
1007   *                         the datanode.
1008   */
1009  public Slot allocShmSlot(DatanodeInfo datanode,
1010        DomainPeer peer, MutableBoolean usedPeer,
1011        ExtendedBlockId blockId, String clientName) throws IOException {
1012    if (shmManager != null) {
1013      return shmManager.allocSlot(datanode, peer, usedPeer,
1014          blockId, clientName);
1015    } else {
1016      return null;
1017    }
1018  }
1019
1020  /**
1021   * Free a slot immediately.
1022   *
1023   * ONLY use this if the DataNode is not yet aware of the slot.
1024   * 
1025   * @param slot           The slot to free.
1026   */
1027  public void freeSlot(Slot slot) {
1028    Preconditions.checkState(shmManager != null);
1029    slot.makeInvalid();
1030    shmManager.freeSlot(slot);
1031  }
1032  
1033  /**
1034   * Schedule a shared memory slot to be released.
1035   *
1036   * @param slot           The slot to release.
1037   */
1038  public void scheduleSlotReleaser(Slot slot) {
1039    Preconditions.checkState(shmManager != null);
1040    releaserExecutor.execute(new SlotReleaser(slot));
1041  }
1042
  /**
   * Get the shared memory manager used by this cache.
   *
   * @return               The shared memory manager.  Other methods of this
   *                         class null-check it, so callers should be
   *                         prepared for null.
   */
  @VisibleForTesting
  public DfsClientShmManager getDfsClientShmManager() {
    return shmManager;
  }
1047}