/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.shortcircuit;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.DomainSocketWatcher;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Waitable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * The ShortCircuitCache tracks things which the client needs to access
 * HDFS block files via short-circuit.
 *
 * These things include: memory-mapped regions, file descriptors, and shared
 * memory areas for communicating with the DataNode.
 *
 * All mutable cache state is guarded by a single reentrant lock.
 */
@InterfaceAudience.Private
public class ShortCircuitCache implements Closeable {
  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);

  /**
   * Maximum number of times {@link #fetchOrCreate} will try to use an
   * existing cache entry before giving up and loading the replica itself.
   * The bound keeps an interrupted thread from retrying forever.
   */
  private static final int FETCH_OR_CREATE_RETRY_TIMES = 3;

  /**
   * Expiry thread which makes sure that the file descriptors get closed
   * after a while.
   */
  private class CacheCleaner implements Runnable, Closeable {
    private ScheduledFuture<?> future;

    /**
     * Run the CacheCleaner thread.
     *
     * Whenever a thread requests a ShortCircuitReplica object, we will make
     * sure it gets one. That ShortCircuitReplica object can then be re-used
     * when another thread requests a ShortCircuitReplica object for the same
     * block. So in that sense, there is no maximum size to the cache.
     *
     * However, when a ShortCircuitReplica object is unreferenced by the
     * thread(s) that are using it, it becomes evictable. There are two
     * separate eviction lists-- one for mmaped objects, and another for
     * non-mmaped objects. We do this in order to avoid having the regular
     * files kick the mmaped files out of the cache too quickly. Reusing
     * an already-existing mmap gives a huge performance boost, since the
     * page table entries don't have to be re-populated. Both the mmap
     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
     */
    @Override
    public void run() {
      ShortCircuitCache.this.lock.lock();
      try {
        if (ShortCircuitCache.this.closed) return;
        long curMs = Time.monotonicNow();

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": cache cleaner running at " + curMs);
        }

        int numDemoted = demoteOldEvictableMmaped(curMs);
        int numPurged = 0;
        Long evictionTimeNs = Long.valueOf(0);
        while (true) {
          // purge() removes the entry we just looked at, so asking for the
          // ceiling of the same key yields the next-oldest entry.
          Entry<Long, ShortCircuitReplica> entry =
              evictable.ceilingEntry(evictionTimeNs);
          if (entry == null) break;
          evictionTimeNs = entry.getKey();
          long evictionTimeMs =
              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
          // Entries are ordered by insertion time; the first one that is
          // still young enough ends the scan.
          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
          ShortCircuitReplica replica = entry.getValue();
          if (LOG.isTraceEnabled()) {
            LOG.trace("CacheCleaner: purging " + replica + ": " +
                StringUtils.getStackTrace(Thread.currentThread()));
          }
          purge(replica);
          numPurged++;
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": finishing cache cleaner run started at " +
              curMs + ". Demoted " + numDemoted + " mmapped replicas; " +
              "purged " + numPurged + " replicas.");
        }
      } finally {
        ShortCircuitCache.this.lock.unlock();
      }
    }

    /**
     * Cancel the scheduled future, if one was set. A run that is already
     * in progress is not interrupted.
     */
    @Override
    public void close() throws IOException {
      if (future != null) {
        future.cancel(false);
      }
    }

    public void setFuture(ScheduledFuture<?> future) {
      this.future = future;
    }

    /**
     * Get the rate at which this cleaner thread should be scheduled.
     *
     * We do this by taking the minimum expiration time and dividing by 4.
     *
     * @return the rate in milliseconds at which this thread should be
     *         scheduled. Never less than 1.
     */
    public long getRateInMs() {
      long minLifespanMs =
          Math.min(maxNonMmappedEvictableLifespanMs,
              maxEvictableMmapedLifespanMs);
      long sampleTimeMs = minLifespanMs / 4;
      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
    }
  }

  /**
   * A task which asks the DataNode to release a short-circuit shared memory
   * slot. If successful, this will tell the DataNode to stop monitoring
   * changes to the mlock status of the replica associated with the slot.
   * It will also allow us (the client) to re-use this slot for another
   * replica. If we can't communicate with the DataNode for some reason,
   * we tear down the shared memory segment to avoid being in an inconsistent
   * state.
   */
  private class SlotReleaser implements Runnable {
    /**
     * The slot that we need to release.
     */
    private final Slot slot;

    SlotReleaser(Slot slot) {
      this.slot = slot;
    }

    @Override
    public void run() {
      if (LOG.isTraceEnabled()) {
        LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
      }
      final DfsClientShm shm = (DfsClientShm)slot.getShm();
      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
      DomainSocket sock = null;
      DataOutputStream out = null;
      final String path = shmSock.getPath();
      boolean success = false;
      try {
        // Open a fresh connection to the same domain socket path that the
        // shared memory segment's peer uses.
        sock = DomainSocket.connect(path);
        out = new DataOutputStream(
            new BufferedOutputStream(sock.getOutputStream()));
        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
        DataInputStream in = new DataInputStream(sock.getInputStream());
        ReleaseShortCircuitAccessResponseProto resp =
            ReleaseShortCircuitAccessResponseProto.parseFrom(
                PBHelper.vintPrefixed(in));
        if (resp.getStatus() != Status.SUCCESS) {
          String error = resp.hasError() ? resp.getError() : "(unknown)";
          throw new IOException(resp.getStatus().toString() + ": " + error);
        }
        if (LOG.isTraceEnabled()) {
          LOG.trace(ShortCircuitCache.this + ": released " + slot);
        }
        success = true;
      } catch (IOException e) {
        LOG.error(ShortCircuitCache.this + ": failed to release " +
            "short-circuit shared memory slot " + slot + " by sending " +
            "ReleaseShortCircuitAccessRequestProto to " + path +
            ". Closing shared memory segment.", e);
      } finally {
        if (success) {
          shmManager.freeSlot(slot);
        } else {
          // We could not tell the DataNode about the release, so the
          // whole segment is shut down rather than reusing the slot.
          shm.getEndpointShmManager().shutdown(shm);
        }
        IOUtils.cleanup(LOG, sock, out);
      }
    }
  }

  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * @return a non-null ShortCircuitReplicaInfo object.
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }

  /**
   * Lock protecting the cache.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
          build());

  /**
   * The executor service that runs the SlotReleaser tasks.
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
          build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
      replicaInfoMap = new HashMap<ExtendedBlockId,
          Waitable<ShortCircuitReplicaInfo>>();

  /**
   * The CacheCleaner. We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * Tree of evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictable =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum total size of the cache, including both mmapped and
   * non-mmapped elements.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * Tree of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum number of mmaped evictable elements.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements older than this will be closed.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.
   * Null when shared memory is disabled or could not be set up.
   */
  private final DfsClientShmManager shmManager;

  /**
   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
   */
  public static ShortCircuitCache fromConf(Configuration conf) {
    return new ShortCircuitCache(
        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
            DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
  }

  /**
   * Create a ShortCircuitCache.
   *
   * @param maxTotalSize                     Maximum number of evictable
   *                                         replicas (mmapped plus
   *                                         non-mmapped). Must be >= 0.
   * @param maxNonMmappedEvictableLifespanMs Lifespan of non-mmapped
   *                                         evictable replicas. Must be >= 0.
   * @param maxEvictableMmapedSize           Maximum number of mmapped
   *                                         evictable replicas. Must be >= 0.
   * @param maxEvictableMmapedLifespanMs     Lifespan of mmapped evictable
   *                                         replicas. Must be >= 0.
   * @param mmapRetryTimeoutMs               Minimum wait after a failed mmap
   *                                         before trying again.
   * @param staleThresholdMs                 Age after which replicas are
   *                                         declared stale.
   * @param shmInterruptCheckMs              Passed to the shared memory
   *                                         manager; a value <= 0 disables
   *                                         shared memory.
   */
  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
      long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
    Preconditions.checkArgument(maxTotalSize >= 0);
    this.maxTotalSize = maxTotalSize;
    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
    this.staleThresholdMs = staleThresholdMs;
    DfsClientShmManager shmManager = null;
    if ((shmInterruptCheckMs > 0) &&
        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
      try {
        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
      } catch (IOException e) {
        // Shared memory is optional; the cache works without it.
        LOG.error("failed to create ShortCircuitShmManager", e);
      }
    }
    this.shmManager = shmManager;
  }

  public long getStaleThresholdMs() {
    return staleThresholdMs;
  }

  /**
   * Increment the reference count of a replica, and remove it from any free
   * list it may be in.
   *
   * This method takes the cache lock itself. The lock is reentrant, so it
   * may also be called while the lock is already held.
   *
   * @param replica The replica we're removing.
   */
  private void ref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      Preconditions.checkArgument(replica.refCount > 0,
          "can't ref " + replica + " because its refCount reached " +
          replica.refCount);
      Long evictableTimeNs = replica.getEvictableTimeNs();
      replica.refCount++;
      if (evictableTimeNs != null) {
        String removedFrom = removeEvictable(replica);
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": " + removedFrom +
              " no longer contains " + replica + ". refCount " +
              (replica.refCount - 1) + " -> " + replica.refCount +
              StringUtils.getStackTrace(Thread.currentThread()));

        }
      } else if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": replica refCount " +
            (replica.refCount - 1) + " -> " + replica.refCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Unreference a replica.
   *
   * This method takes the cache lock itself. The lock is reentrant, so it
   * may also be called while the lock is already held.
   *
   * @param replica The replica being unreferenced.
   */
  void unref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      // If the replica is stale or unusable, but we haven't purged it yet,
      // let's do that. It would be a shame to evict a non-stale replica so
      // that we could put a stale or unusable one into the cache.
      if (!replica.purged) {
        String purgeReason = null;
        if (!replica.getDataStream().getChannel().isOpen()) {
          purgeReason = "purging replica because its data channel is closed.";
        } else if (!replica.getMetaStream().getChannel().isOpen()) {
          purgeReason = "purging replica because its meta channel is closed.";
        } else if (replica.isStale()) {
          purgeReason = "purging replica because it is stale.";
        }
        if (purgeReason != null) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(this + ": " + purgeReason);
          }
          purge(replica);
        }
      }
      String addedString = "";
      boolean shouldTrimEvictionMaps = false;
      int newRefCount = --replica.refCount;
      if (newRefCount == 0) {
        // Close replica, since there are no remaining references to it.
        Preconditions.checkArgument(replica.purged,
            "Replica " + replica + " reached a refCount of 0 without " +
            "being purged");
        replica.close();
      } else if (newRefCount == 1) {
        // Only the cache's own reference remains.
        Preconditions.checkState(null == replica.getEvictableTimeNs(),
            "Replica " + replica + " had a refCount higher than 1, " +
            "but was still evictable (evictableTimeNs = " +
            replica.getEvictableTimeNs() + ")");
        if (!replica.purged) {
          // Add the replica to the end of an eviction list.
          // Eviction lists are sorted by time.
          if (replica.hasMmap()) {
            insertEvictable(System.nanoTime(), replica, evictableMmapped);
            addedString = "added to evictableMmapped, ";
          } else {
            insertEvictable(System.nanoTime(), replica, evictable);
            addedString = "added to evictable, ";
          }
          shouldTrimEvictionMaps = true;
        }
      } else {
        Preconditions.checkArgument(replica.refCount >= 0,
            "replica's refCount went negative (refCount = " +
            replica.refCount + " for " + replica + ")");
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": unref replica " + replica +
            ": " + addedString + " refCount " +
            (newRefCount + 1) + " -> " + newRefCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      if (shouldTrimEvictionMaps) {
        trimEvictionMaps();
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Demote old evictable mmaps into the regular eviction map.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param now Current time in monotonic milliseconds.
   * @return Number of replicas demoted.
   */
  private int demoteOldEvictableMmaped(long now) {
    int numDemoted = 0;
    boolean needMoreSpace = false;
    Long evictionTimeNs = Long.valueOf(0);

    while (true) {
      Entry<Long, ShortCircuitReplica> entry =
          evictableMmapped.ceilingEntry(evictionTimeNs);
      if (entry == null) break;
      evictionTimeNs = entry.getKey();
      long evictionTimeMs =
          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
        // This entry has not expired. Stop unless the mmapped list has
        // reached its size limit, in which case we keep demoting.
        if (evictableMmapped.size() < maxEvictableMmapedSize) {
          break;
        }
        needMoreSpace = true;
      }
      ShortCircuitReplica replica = entry.getValue();
      if (LOG.isTraceEnabled()) {
        String rationale = needMoreSpace ? "because we need more space" :
            "because it's too old";
        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
            rationale + ": " +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      removeEvictable(replica, evictableMmapped);
      munmap(replica);
      // Keep the original eviction time so the replica retains its age.
      insertEvictable(evictionTimeNs, replica, evictable);
      numDemoted++;
    }
    return numDemoted;
  }

  /**
   * Trim the eviction lists.
   *
   * Demotes expired mmapped replicas, then purges the oldest evictable
   * replicas until the two lists together hold at most maxTotalSize
   * elements. Non-mmapped replicas are purged before mmapped ones.
   */
  private void trimEvictionMaps() {
    long now = Time.monotonicNow();
    demoteOldEvictableMmaped(now);

    while (true) {
      long evictableSize = evictable.size();
      long evictableMmappedSize = evictableMmapped.size();
      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
        return;
      }
      ShortCircuitReplica replica;
      if (evictableSize == 0) {
        replica = evictableMmapped.firstEntry().getValue();
      } else {
        replica = evictable.firstEntry().getValue();
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      purge(replica);
    }
  }

  /**
   * Munmap a replica, updating outstandingMmapCount.
   *
   * @param replica The replica to munmap.
   */
  private void munmap(ShortCircuitReplica replica) {
    replica.munmap();
    outstandingMmapCount--;
  }

  /**
   * Remove a replica from an evictable map.
   *
   * @param replica The replica to remove.
   * @return The map it was removed from.
   */
  private String removeEvictable(ShortCircuitReplica replica) {
    if (replica.hasMmap()) {
      removeEvictable(replica, evictableMmapped);
      return "evictableMmapped";
    } else {
      removeEvictable(replica, evictable);
      return "evictable";
    }
  }

  /**
   * Remove a replica from an evictable map.
   *
   * @param replica The replica to remove.
   * @param map     The map to remove it from.
   */
  private void removeEvictable(ShortCircuitReplica replica,
      TreeMap<Long, ShortCircuitReplica> map) {
    Long evictableTimeNs = replica.getEvictableTimeNs();
    Preconditions.checkNotNull(evictableTimeNs);
    ShortCircuitReplica removed = map.remove(evictableTimeNs);
    Preconditions.checkState(removed == replica,
        "failed to make " + replica + " unevictable");
    replica.setEvictableTimeNs(null);
  }

  /**
   * Insert a replica into an evictable map.
   *
   * If an element already exists with this eviction time, we add a nanosecond
   * to it until we find an unused key.
   *
   * @param evictionTimeNs The eviction time in absolute nanoseconds.
   * @param replica        The replica to insert.
   * @param map            The map to insert it into.
   */
  private void insertEvictable(Long evictionTimeNs,
      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
    while (map.containsKey(evictionTimeNs)) {
      evictionTimeNs++;
    }
    Preconditions.checkState(null == replica.getEvictableTimeNs());
    replica.setEvictableTimeNs(evictionTimeNs);
    map.put(evictionTimeNs, replica);
  }

  /**
   * Purge a replica from the cache.
   *
   * This doesn't necessarily close the replica, since there may be
   * outstanding references to it. However, it does mean the cache won't
   * hand it out to anyone after this.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param replica The replica being removed.
   */
  private void purge(ShortCircuitReplica replica) {
    boolean removedFromInfoMap = false;
    String evictionMapName = null;
    Preconditions.checkArgument(!replica.purged);
    replica.purged = true;
    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
    if (val != null) {
      ShortCircuitReplicaInfo info = val.getVal();
      // Only drop the map entry if it still refers to this replica; the
      // key may already have been re-used for a newer one.
      if ((info != null) && (info.getReplica() == replica)) {
        replicaInfoMap.remove(replica.key);
        removedFromInfoMap = true;
      }
    }
    Long evictableTimeNs = replica.getEvictableTimeNs();
    if (evictableTimeNs != null) {
      evictionMapName = removeEvictable(replica);
    }
    if (LOG.isTraceEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append(this).append(": ").append("purged ").
          append(replica).append(" from the cache.");
      if (removedFromInfoMap) {
        builder.append(" Removed from the replicaInfoMap.");
      }
      if (evictionMapName != null) {
        builder.append(" Removed from ").append(evictionMapName);
      }
      LOG.trace(builder.toString());
    }
    // Drop the cache's own reference to the replica.
    unref(replica);
  }

  /**
   * Fetch or create a replica.
   *
   * This method takes the cache lock itself; callers do not need to hold
   * it. The creator callback is invoked without the lock held.
   *
   * If an entry for the key already exists but turns out to be unusable
   * (purged, stale, or the wait was interrupted), the lookup is retried a
   * bounded number of times before this thread loads the replica itself.
   *
   * @param key     Key to use for lookup.
   * @param creator Replica creator callback. Will be called without
   *                the cache lock being held.
   *
   * @return Null if the cache is closed. Otherwise a
   *         ShortCircuitReplicaInfo, which may hold a replica, an
   *         InvalidToken exception, or neither if loading failed.
   */
  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator) {
    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
    lock.lock();
    try {
      ShortCircuitReplicaInfo info = null;
      for (int attempt = 0; attempt < FETCH_OR_CREATE_RETRY_TIMES; attempt++) {
        // fetch() can release the lock while waiting, so the closed flag
        // and the map entry have to be re-read on every attempt.
        if (closed) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": can't fetchOrCreate " + key +
                " because the cache is closed.");
          }
          return null;
        }
        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
        if (waitable == null) {
          // Nothing cached (any more) for this key; load it ourselves.
          break;
        }
        try {
          info = fetch(key, waitable);
          break;
        } catch (RetriableException e) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(this + ": retrying " + e.getMessage());
          }
        }
      }
      if (info != null) return info;
      // We need to load the replica ourselves.
      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
      replicaInfoMap.put(key, newWaitable);
    } finally {
      lock.unlock();
    }
    return create(key, creator, newWaitable);
  }

  /**
   * Fetch an existing ReplicaInfo object.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param key      The key that we're using.
   * @param waitable The waitable object to wait on.
   * @return The existing ReplicaInfo object. It may carry an
   *         InvalidToken exception or no replica at all if the
   *         load failed.
   *
   * @throws RetriableException If the caller needs to retry.
   */
  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
    // Another thread is already in the process of loading this
    // ShortCircuitReplica. So we simply wait for it to complete.
    ShortCircuitReplicaInfo info;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": found waitable for " + key);
      }
      info = waitable.await();
    } catch (InterruptedException e) {
      LOG.info(this + ": interrupted while waiting for " + key);
      Thread.currentThread().interrupt();
      throw new RetriableException("interrupted");
    }
    if (info.getInvalidTokenException() != null) {
      LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
          "exception.", info.getInvalidTokenException());
      return info;
    }
    ShortCircuitReplica replica = info.getReplica();
    if (replica == null) {
      LOG.warn(this + ": failed to get " + key);
      return info;
    }
    if (replica.purged) {
      // Ignore replicas that have already been purged from the cache.
      throw new RetriableException("Ignoring purged replica " +
          replica + ". Retrying.");
    }
    // Check if the replica is stale before using it.
    // If it is, purge it and retry.
    if (replica.isStale()) {
      LOG.info(this + ": got stale replica " + replica + ". Removing " +
          "this replica from the replicaInfoMap and retrying.");
      // Remove the cache's reference to the replica. This may or may not
      // trigger a close.
      purge(replica);
      throw new RetriableException("ignoring stale replica " + replica);
    }
    ref(replica);
    return info;
  }

  /**
   * Load a new replica through the creator callback and publish the result
   * to any threads waiting on the supplied waitable.
   *
   * The callback runs without the cache lock; the lock is taken afterwards
   * to update the cache state.
   *
   * @param key         The key being loaded.
   * @param creator     Replica creator callback.
   * @param newWaitable The waitable registered in the replicaInfoMap for
   *                    this load.
   * @return A non-null ShortCircuitReplicaInfo; it is empty if the
   *         callback returned null or threw a RuntimeException.
   */
  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator,
      Waitable<ShortCircuitReplicaInfo> newWaitable) {
    // Handle loading a new replica.
    ShortCircuitReplicaInfo info = null;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": loading " + key);
      }
      info = creator.createShortCircuitReplicaInfo();
    } catch (RuntimeException e) {
      LOG.warn(this + ": failed to load " + key, e);
    }
    if (info == null) info = new ShortCircuitReplicaInfo();
    lock.lock();
    try {
      if (info.getReplica() != null) {
        // On success, make sure the cache cleaner thread is running.
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": successfully loaded " + info.getReplica());
        }
        startCacheCleanerThreadIfNeeded();
        // Note: new ShortCircuitReplicas start with a refCount of 2,
        // indicating that both this cache and whoever requested the
        // creation of the replica hold a reference. So we don't need
        // to increment the reference count here.
      } else {
        // On failure, remove the waitable from the replicaInfoMap.
        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
        if (info.getInvalidTokenException() != null) {
          LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
              "exception.", info.getInvalidTokenException());
        } else {
          LOG.warn(this + ": failed to load " + key);
        }
      }
      newWaitable.provide(info);
    } finally {
      lock.unlock();
    }
    return info;
  }

  /**
   * Create and schedule the CacheCleaner the first time it is needed.
   *
   * You must hold the cache lock while calling this function.
   */
  private void startCacheCleanerThreadIfNeeded() {
    if (cacheCleaner == null) {
      cacheCleaner = new CacheCleaner();
      long rateMs = cacheCleaner.getRateInMs();
      ScheduledFuture<?> future =
          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
              TimeUnit.MILLISECONDS);
      cacheCleaner.setFuture(future);
      if (LOG.isDebugEnabled()) {
        LOG.debug(this + ": starting cache cleaner thread which will run " +
            "every " + rateMs + " ms");
      }
    }
  }

  /**
   * Get the existing mmap for a replica, or create one.
   *
   * The replica's mmapData field records the state of the mmap: null (no
   * attempt yet), a MappedByteBuffer (mapped), a Long (monotonic time in
   * milliseconds of the last failed attempt), or a Condition (another
   * thread is creating the mmap right now).
   *
   * @param replica  The replica to mmap.
   * @param anchored Passed through to the ClientMmap constructor.
   * @return A ClientMmap, or null if the mmap could not be created or
   *         a previous attempt failed less than mmapRetryTimeoutMs ago.
   */
  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
      boolean anchored) {
    Condition newCond;
    lock.lock();
    try {
      while (replica.mmapData != null) {
        if (replica.mmapData instanceof MappedByteBuffer) {
          ref(replica);
          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
          return new ClientMmap(replica, mmap, anchored);
        } else if (replica.mmapData instanceof Long) {
          long lastAttemptTimeMs = (Long)replica.mmapData;
          long delta = Time.monotonicNow() - lastAttemptTimeMs;
          if (delta < mmapRetryTimeoutMs) {
            if (LOG.isTraceEnabled()) {
              LOG.trace(this + ": can't create client mmap for " +
                  replica + " because we failed to " +
                  "create one just " + delta + "ms ago.");
            }
            return null;
          }
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": retrying client mmap for " + replica +
                ", " + delta + " ms after the previous failure.");
          }
          // The retry timeout has elapsed. Leave the loop so that a new
          // Condition is installed below and the mmap is attempted again.
          // Without this the loop would spin forever with the lock held,
          // because nothing here changes mmapData.
          break;
        } else if (replica.mmapData instanceof Condition) {
          // Another thread is creating the mmap; wait for it to finish
          // and then re-examine mmapData.
          Condition cond = (Condition)replica.mmapData;
          cond.awaitUninterruptibly();
        } else {
          Preconditions.checkState(false, "invalid mmapData type " +
              replica.mmapData.getClass().getName());
        }
      }
      newCond = lock.newCondition();
      replica.mmapData = newCond;
    } finally {
      lock.unlock();
    }
    // Create the mapping without holding the cache lock.
    MappedByteBuffer map = replica.loadMmapInternal();
    lock.lock();
    try {
      if (map == null) {
        // Remember when the attempt failed so that callers back off.
        replica.mmapData = Long.valueOf(Time.monotonicNow());
        newCond.signalAll();
        return null;
      } else {
        outstandingMmapCount++;
        replica.mmapData = map;
        ref(replica);
        newCond.signalAll();
        return new ClientMmap(replica, map, anchored);
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Close the cache and free all associated resources.
   *
   * Calling this on an already-closed cache does nothing.
   */
  @Override
  public void close() {
    // Acquire the lock before entering the try block, so that the finally
    // clause never tries to unlock a lock this thread does not hold.
    lock.lock();
    try {
      if (closed) return;
      closed = true;
      LOG.info(this + ": closing");
      maxNonMmappedEvictableLifespanMs = 0;
      maxEvictableMmapedSize = 0;
      // Cancel the cacheCleaner's scheduled future.
      IOUtils.cleanup(LOG, cacheCleaner);
      // Purge all replicas.
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
        if (entry == null) break;
        purge(entry.getValue());
      }
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
        if (entry == null) break;
        purge(entry.getValue());
      }
    } finally {
      lock.unlock();
    }
    IOUtils.cleanup(LOG, shmManager);
  }

  @VisibleForTesting // ONLY for testing
  public interface CacheVisitor {
    void visit(int numOutstandingMmaps,
        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
        Map<ExtendedBlockId, InvalidToken> failedLoads,
        Map<Long, ShortCircuitReplica> evictable,
        Map<Long, ShortCircuitReplica> evictableMmapped);
  }

  /**
   * Pass the current cache state to a visitor while holding the cache lock.
   *
   * The replicas and failedLoads maps are fresh copies built from the
   * replicaInfoMap; the two evictable maps are the cache's own.
   *
   * @param visitor The visitor to call.
   */
  @VisibleForTesting // ONLY for testing
  public void accept(CacheVisitor visitor) {
    lock.lock();
    try {
      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
      Map<ExtendedBlockId, InvalidToken> failedLoads =
          new HashMap<ExtendedBlockId, InvalidToken>();
      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
          replicaInfoMap.entrySet()) {
        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
        // Entries that are still being loaded have no value yet; skip them.
        if (waitable.hasVal()) {
          if (waitable.getVal().getReplica() != null) {
            replicas.put(entry.getKey(), waitable.getVal().getReplica());
          } else {
            // The exception may be null here, indicating a failed load that
            // isn't the result of an invalid block token.
            failedLoads.put(entry.getKey(),
                waitable.getVal().getInvalidTokenException());
          }
        }
      }
      if (LOG.isDebugEnabled()) {
        StringBuilder builder = new StringBuilder();
        builder.append("visiting ").append(visitor.getClass().getName()).
            append(" with outstandingMmapCount=").append(outstandingMmapCount).
            append(", replicas=");
        String prefix = "";
        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", failedLoads=");
        for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictable=");
        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictableMmapped=");
        for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        LOG.debug(builder.toString());
      }
      visitor.visit(outstandingMmapCount, replicas, failedLoads,
          evictable, evictableMmapped);
    } finally {
      lock.unlock();
    }
  }

  @Override
  public String toString() {
    return "ShortCircuitCache(0x" +
        Integer.toHexString(System.identityHashCode(this)) + ")";
  }

  /**
   * Allocate a new shared memory slot.
   *
   * @param datanode   The datanode to allocate a shm slot with.
   * @param peer       A peer connected to the datanode.
   * @param usedPeer   Will be set to true if we use up the provided peer.
   * @param blockId    The block id and block pool id of the block we're
   *                   allocating this slot for.
   * @param clientName The name of the DFSClient allocating the shared
   *                   memory.
   * @return Null if short-circuit shared memory is disabled;
   *         a short-circuit memory slot otherwise.
   * @throws IOException An exception if there was an error talking to
   *                     the datanode.
   */
  public Slot allocShmSlot(DatanodeInfo datanode,
      DomainPeer peer, MutableBoolean usedPeer,
      ExtendedBlockId blockId, String clientName) throws IOException {
    if (shmManager != null) {
      return shmManager.allocSlot(datanode, peer, usedPeer,
          blockId, clientName);
    } else {
      return null;
    }
  }

  /**
   * Free a slot immediately.
   *
   * ONLY use this if the DataNode is not yet aware of the slot.
   *
   * @param slot The slot to free.
   * @throws IllegalStateException if shared memory is disabled.
   */
  public void freeSlot(Slot slot) {
    Preconditions.checkState(shmManager != null);
    slot.makeInvalid();
    shmManager.freeSlot(slot);
  }

  /**
   * Schedule a shared memory slot to be released.
   *
   * @param slot The slot to release.
   * @throws IllegalStateException if shared memory is disabled.
   */
  public void scheduleSlotReleaser(Slot slot) {
    Preconditions.checkState(shmManager != null);
    releaserExecutor.execute(new SlotReleaser(slot));
  }

  @VisibleForTesting
  public DfsClientShmManager getDfsClientShmManager() {
    return shmManager;
  }
}