/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.client;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.classification.InterfaceAudience;

import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ReleaseShortCircuitAccessResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.DomainSocketWatcher;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Waitable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * The ShortCircuitCache tracks things which the client needs to access
 * HDFS block files via short-circuit.
 *
 * These things include: memory-mapped regions, file descriptors, and shared
 * memory areas for communicating with the DataNode.
 *
 * All mutable cache state (the replica map, both eviction maps, the closed
 * flag and the mmap counter) is guarded by a single {@link ReentrantLock}.
 */
@InterfaceAudience.Private
public class ShortCircuitCache implements Closeable {
  public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);

  /**
   * Maximum number of times {@link #fetchOrCreate} will look up and wait on
   * an existing entry before giving up and loading the replica itself.
   */
  private static final int FETCH_OR_CREATE_RETRY_TIMES = 3;

  /**
   * Expiry thread which makes sure that the file descriptors get closed
   * after a while.
   */
  private class CacheCleaner implements Runnable, Closeable {
    private ScheduledFuture<?> future;

    /**
     * Run the CacheCleaner thread.
     *
     * Whenever a thread requests a ShortCircuitReplica object, we will make
     * sure it gets one.  That ShortCircuitReplica object can then be re-used
     * when another thread requests a ShortCircuitReplica object for the same
     * block.  So in that sense, there is no maximum size to the cache.
     *
     * However, when a ShortCircuitReplica object is unreferenced by the
     * thread(s) that are using it, it becomes evictable.  There are two
     * separate eviction lists-- one for mmaped objects, and another for
     * non-mmaped objects.  We do this in order to avoid having the regular
     * files kick the mmaped files out of the cache too quickly.  Reusing
     * an already-existing mmap gives a huge performance boost, since the
     * page table entries don't have to be re-populated.  Both the mmap
     * and non-mmap evictable lists have maximum sizes and maximum lifespans.
     */
    @Override
    public void run() {
      ShortCircuitCache.this.lock.lock();
      try {
        if (ShortCircuitCache.this.closed) return;
        long curMs = Time.monotonicNow();

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": cache cleaner running at " + curMs);
        }

        // First move expired mmapped replicas into the regular eviction map.
        int numDemoted = demoteOldEvictableMmaped(curMs);
        int numPurged = 0;
        Long evictionTimeNs = Long.valueOf(0);
        while (true) {
          // Walk the NON-mmapped map in insertion-time order; the lifespan
          // tested below is the non-mmapped one, so this must not scan
          // evictableMmapped.
          Entry<Long, ShortCircuitReplica> entry =
              evictable.ceilingEntry(evictionTimeNs);
          if (entry == null) break;
          evictionTimeNs = entry.getKey();
          long evictionTimeMs =
              TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
          // Entries are sorted by time, so the first young one ends the scan.
          if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
          ShortCircuitReplica replica = entry.getValue();
          if (LOG.isTraceEnabled()) {
            LOG.trace("CacheCleaner: purging " + replica + ": " +
                  StringUtils.getStackTrace(Thread.currentThread()));
          }
          purge(replica);
          numPurged++;
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug(this + ": finishing cache cleaner run started at " +
            curMs + ".  Demoted " + numDemoted + " mmapped replicas; " +
            "purged " + numPurged + " replicas.");
        }
      } finally {
        ShortCircuitCache.this.lock.unlock();
      }
    }

    /**
     * Cancel the scheduled cleaner, if it was ever scheduled.  A run that is
     * already in progress is not interrupted.
     */
    @Override
    public void close() throws IOException {
      if (future != null) {
        future.cancel(false);
      }
    }

    public void setFuture(ScheduledFuture<?> future) {
      this.future = future;
    }

    /**
     * Get the rate at which this cleaner thread should be scheduled.
     *
     * We do this by taking the minimum expiration time and dividing by 4.
     *
     * @return the rate in milliseconds at which this thread should be
     *         scheduled (never less than 1).
     */
    public long getRateInMs() {
      long minLifespanMs =
          Math.min(maxNonMmappedEvictableLifespanMs,
              maxEvictableMmapedLifespanMs);
      long sampleTimeMs = minLifespanMs / 4;
      return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
    }
  }

  /**
   * A task which asks the DataNode to release a short-circuit shared memory
   * slot.  If successful, this will tell the DataNode to stop monitoring
   * changes to the mlock status of the replica associated with the slot.
   * It will also allow us (the client) to re-use this slot for another
   * replica.  If we can't communicate with the DataNode for some reason,
   * we tear down the shared memory segment to avoid being in an inconsistent
   * state.
   */
  private class SlotReleaser implements Runnable {
    /**
     * The slot that we need to release.
     */
    private final Slot slot;

    SlotReleaser(Slot slot) {
      this.slot = slot;
    }

    @Override
    public void run() {
      if (LOG.isTraceEnabled()) {
        LOG.trace(ShortCircuitCache.this + ": about to release " + slot);
      }
      final DfsClientShm shm = (DfsClientShm)slot.getShm();
      final DomainSocket shmSock = shm.getPeer().getDomainSocket();
      DomainSocket sock = null;
      DataOutputStream out = null;
      final String path = shmSock.getPath();
      boolean success = false;
      try {
        // Open a fresh connection to the same domain socket path that the
        // shared memory segment's peer uses.
        sock = DomainSocket.connect(path);
        out = new DataOutputStream(
            new BufferedOutputStream(sock.getOutputStream()));
        new Sender(out).releaseShortCircuitFds(slot.getSlotId());
        DataInputStream in = new DataInputStream(sock.getInputStream());
        ReleaseShortCircuitAccessResponseProto resp =
            ReleaseShortCircuitAccessResponseProto.parseFrom(
                PBHelper.vintPrefixed(in));
        if (resp.getStatus() != Status.SUCCESS) {
          String error = resp.hasError() ? resp.getError() : "(unknown)";
          throw new IOException(resp.getStatus().toString() + ": " + error);
        }
        if (LOG.isTraceEnabled()) {
          LOG.trace(ShortCircuitCache.this + ": released " + slot);
        }
        success = true;
      } catch (IOException e) {
        LOG.error(ShortCircuitCache.this + ": failed to release " +
            "short-circuit shared memory slot " + slot + " by sending " +
            "ReleaseShortCircuitAccessRequestProto to " + path +
            ".  Closing shared memory segment.", e);
      } finally {
        if (success) {
          shmManager.freeSlot(slot);
        } else {
          // We could not confirm the release, so drop the whole segment.
          shm.getEndpointShmManager().shutdown(shm);
        }
        IOUtils.cleanup(LOG, sock, out);
      }
    }
  }

  public interface ShortCircuitReplicaCreator {
    /**
     * Attempt to create a ShortCircuitReplica object.
     *
     * This callback will be made without holding any locks.
     *
     * @return a non-null ShortCircuitReplicaInfo object.
     */
    ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
  }

  /**
   * Lock protecting the cache.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * The executor service that runs the cacheCleaner.
   */
  private final ScheduledThreadPoolExecutor cleanerExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_Cleaner").
          build());

  /**
   * The executor service that runs the SlotReleaser tasks.
   */
  private final ScheduledThreadPoolExecutor releaserExecutor
      = new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
          setDaemon(true).setNameFormat("ShortCircuitCache_SlotReleaser").
          build());

  /**
   * A map containing all ShortCircuitReplicaInfo objects, organized by Key.
   * ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
   * exception.
   */
  private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
      replicaInfoMap = new HashMap<ExtendedBlockId,
          Waitable<ShortCircuitReplicaInfo>>();

  /**
   * The CacheCleaner.  We don't create this and schedule it until it becomes
   * necessary.
   */
  private CacheCleaner cacheCleaner;

  /**
   * Tree of evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictable =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum total size of the cache, including both mmapped and
   * non-mmapped elements.
   */
  private final int maxTotalSize;

  /**
   * Non-mmaped elements older than this will be closed.
   */
  private long maxNonMmappedEvictableLifespanMs;

  /**
   * Tree of mmaped evictable elements.
   *
   * Maps (unique) insertion time in nanoseconds to the element.
   */
  private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
      new TreeMap<Long, ShortCircuitReplica>();

  /**
   * Maximum number of mmaped evictable elements.
   */
  private int maxEvictableMmapedSize;

  /**
   * Mmaped elements older than this will be closed.
   */
  private final long maxEvictableMmapedLifespanMs;

  /**
   * The minimum number of milliseconds we'll wait after an unsuccessful
   * mmap attempt before trying again.
   */
  private final long mmapRetryTimeoutMs;

  /**
   * How long we will keep replicas in the cache before declaring them
   * to be stale.
   */
  private final long staleThresholdMs;

  /**
   * True if the ShortCircuitCache is closed.
   */
  private boolean closed = false;

  /**
   * Number of existing mmaps associated with this cache.
   */
  private int outstandingMmapCount = 0;

  /**
   * Manages short-circuit shared memory segments for the client.
   * Null when shared memory is disabled or could not be initialized.
   */
  private final DfsClientShmManager shmManager;

  /**
   * Create a {@link ShortCircuitCache} object from a {@link Configuration}
   */
  public static ShortCircuitCache fromConf(Configuration conf) {
    return new ShortCircuitCache(
        conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
            DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
            DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
        conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
            DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT),
        conf.getInt(DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS,
            DFSConfigKeys.DFS_SHORT_CIRCUIT_SHARED_MEMORY_WATCHER_INTERRUPT_CHECK_MS_DEFAULT));
  }

  /**
   * Create a cache with explicit limits.
   *
   * @param maxTotalSize                      Maximum number of evictable
   *                                          replicas (mmapped plus
   *                                          non-mmapped); must be >= 0.
   * @param maxNonMmappedEvictableLifespanMs  Lifespan of non-mmapped evictable
   *                                          replicas; must be >= 0.
   * @param maxEvictableMmapedSize            Maximum number of mmapped
   *                                          evictable replicas; must be >= 0.
   * @param maxEvictableMmapedLifespanMs      Lifespan of mmapped evictable
   *                                          replicas; must be >= 0.
   * @param mmapRetryTimeoutMs                Minimum wait after a failed mmap
   *                                          before trying again.
   * @param staleThresholdMs                  Age after which replicas are
   *                                          declared stale.
   * @param shmInterruptCheckMs               Passed to the shared memory
   *                                          manager; a value <= 0 disables
   *                                          shared memory.
   */
  public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
      int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
      long mmapRetryTimeoutMs, long staleThresholdMs, int shmInterruptCheckMs) {
    Preconditions.checkArgument(maxTotalSize >= 0);
    this.maxTotalSize = maxTotalSize;
    Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
    this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
    Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
    this.maxEvictableMmapedSize = maxEvictableMmapedSize;
    Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
    this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
    this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
    this.staleThresholdMs = staleThresholdMs;
    DfsClientShmManager shmManager = null;
    if ((shmInterruptCheckMs > 0) &&
        (DomainSocketWatcher.getLoadingFailureReason() == null)) {
      try {
        shmManager = new DfsClientShmManager(shmInterruptCheckMs);
      } catch (IOException e) {
        // Best effort: the cache still works without shared memory.
        LOG.error("failed to create ShortCircuitShmManager", e);
      }
    }
    this.shmManager = shmManager;
  }

  public long getMmapRetryTimeoutMs() {
    return mmapRetryTimeoutMs;
  }

  public long getStaleThresholdMs() {
    return staleThresholdMs;
  }

  /**
   * Increment the reference count of a replica, and remove it from any free
   * list it may be in.
   *
   * This method acquires the cache lock itself; because the lock is
   * reentrant it may also be called with the lock already held.
   *
   * @param replica      The replica being referenced.
   */
  private void ref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      Preconditions.checkArgument(replica.refCount > 0,
          "can't ref " + replica + " because its refCount reached " +
          replica.refCount);
      Long evictableTimeNs = replica.getEvictableTimeNs();
      replica.refCount++;
      if (evictableTimeNs != null) {
        // The replica is in use again, so it must not be evictable.
        String removedFrom = removeEvictable(replica);
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": " + removedFrom +
              " no longer contains " + replica + ".  refCount " +
              (replica.refCount - 1) + " -> " + replica.refCount +
              StringUtils.getStackTrace(Thread.currentThread()));

        }
      } else if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": replica  refCount " +
            (replica.refCount - 1) + " -> " + replica.refCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Unreference a replica.
   *
   * This method acquires the cache lock itself; because the lock is
   * reentrant it may also be called with the lock already held.
   *
   * @param replica   The replica being unreferenced.
   */
  void unref(ShortCircuitReplica replica) {
    lock.lock();
    try {
      // If the replica is stale, but we haven't purged it yet, let's do that.
      // It would be a shame to evict a non-stale replica so that we could put
      // a stale one into the cache.
      if ((!replica.purged) && replica.isStale()) {
        purge(replica);
      }
      String addedString = "";
      boolean shouldTrimEvictionMaps = false;
      int newRefCount = --replica.refCount;
      if (newRefCount == 0) {
        // Close replica, since there are no remaining references to it.
        Preconditions.checkArgument(replica.purged,
            "Replica " + replica + " reached a refCount of 0 without " +
            "being purged");
        replica.close();
      } else if (newRefCount == 1) {
        // Only the cache's own reference remains.
        Preconditions.checkState(null == replica.getEvictableTimeNs(),
            "Replica " + replica + " had a refCount higher than 1, " +
            "but was still evictable (evictableTimeNs = " +
            replica.getEvictableTimeNs() + ")");
        if (!replica.purged) {
          // Add the replica to the end of an eviction list.
          // Eviction lists are sorted by time.
          if (replica.hasMmap()) {
            insertEvictable(System.nanoTime(), replica, evictableMmapped);
            addedString = "added to evictableMmapped, ";
          } else {
            insertEvictable(System.nanoTime(), replica, evictable);
            addedString = "added to evictable, ";
          }
          shouldTrimEvictionMaps = true;
        }
      } else {
        Preconditions.checkArgument(replica.refCount >= 0,
            "replica's refCount went negative (refCount = " +
            replica.refCount + " for " + replica + ")");
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": unref replica " + replica +
            ": " + addedString + " refCount " +
            (newRefCount + 1) + " -> " + newRefCount +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      if (shouldTrimEvictionMaps) {
        trimEvictionMaps();
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Demote old evictable mmaps into the regular eviction map.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param now   Current time in monotonic milliseconds.
   * @return      Number of replicas demoted.
   */
  private int demoteOldEvictableMmaped(long now) {
    int numDemoted = 0;
    boolean needMoreSpace = false;
    Long evictionTimeNs = Long.valueOf(0);

    while (true) {
      Entry<Long, ShortCircuitReplica> entry =
          evictableMmapped.ceilingEntry(evictionTimeNs);
      if (entry == null) break;
      evictionTimeNs = entry.getKey();
      long evictionTimeMs =
          TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
      if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
        // Not too old; only demote it if the mmapped map is at capacity.
        if (evictableMmapped.size() < maxEvictableMmapedSize) {
          break;
        }
        needMoreSpace = true;
      }
      ShortCircuitReplica replica = entry.getValue();
      if (LOG.isTraceEnabled()) {
        String rationale = needMoreSpace ? "because we need more space" :
            "because it's too old";
        LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
            rationale + ": " +
            StringUtils.getStackTrace(Thread.currentThread()));
      }
      removeEvictable(replica, evictableMmapped);
      munmap(replica);
      // Keep the original eviction time so its age carries over.
      insertEvictable(evictionTimeNs, replica, evictable);
      numDemoted++;
    }
    return numDemoted;
  }

  /**
   * Trim the eviction lists.
   *
   * Demotes old mmapped replicas, then purges the oldest evictable replicas
   * (non-mmapped first) until the combined size of both eviction maps is at
   * most maxTotalSize.
   */
  private void trimEvictionMaps() {
    long now = Time.monotonicNow();
    demoteOldEvictableMmaped(now);

    while (true) {
      long evictableSize = evictable.size();
      long evictableMmappedSize = evictableMmapped.size();
      if (evictableSize + evictableMmappedSize <= maxTotalSize) {
        return;
      }
      ShortCircuitReplica replica;
      if (evictableSize == 0) {
        replica = evictableMmapped.firstEntry().getValue();
      } else {
        replica = evictable.firstEntry().getValue();
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": trimEvictionMaps is purging " + replica +
          StringUtils.getStackTrace(Thread.currentThread()));
      }
      purge(replica);
    }
  }

  /**
   * Munmap a replica, updating outstandingMmapCount.
   *
   * @param replica  The replica to munmap.
   */
  private void munmap(ShortCircuitReplica replica) {
    replica.munmap();
    outstandingMmapCount--;
  }

  /**
   * Remove a replica from an evictable map.
   *
   * The map is chosen by whether the replica currently has an mmap.
   *
   * @param replica   The replica to remove.
   * @return          The name of the map it was removed from.
   */
  private String removeEvictable(ShortCircuitReplica replica) {
    if (replica.hasMmap()) {
      removeEvictable(replica, evictableMmapped);
      return "evictableMmapped";
    } else {
      removeEvictable(replica, evictable);
      return "evictable";
    }
  }

  /**
   * Remove a replica from an evictable map.
   *
   * @param replica   The replica to remove.
   * @param map       The map to remove it from.
   */
  private void removeEvictable(ShortCircuitReplica replica,
      TreeMap<Long, ShortCircuitReplica> map) {
    Long evictableTimeNs = replica.getEvictableTimeNs();
    Preconditions.checkNotNull(evictableTimeNs);
    ShortCircuitReplica removed = map.remove(evictableTimeNs);
    Preconditions.checkState(removed == replica,
        "failed to make " + replica + " unevictable");
    replica.setEvictableTimeNs(null);
  }

  /**
   * Insert a replica into an evictable map.
   *
   * If an element already exists with this eviction time, we add a nanosecond
   * to it until we find an unused key.
   *
   * @param evictionTimeNs   The eviction time in absolute nanoseconds.
   * @param replica          The replica to insert.
   * @param map              The map to insert it into.
   */
  private void insertEvictable(Long evictionTimeNs,
      ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
    while (map.containsKey(evictionTimeNs)) {
      evictionTimeNs++;
    }
    Preconditions.checkState(null == replica.getEvictableTimeNs());
    Long time = Long.valueOf(evictionTimeNs);
    replica.setEvictableTimeNs(time);
    map.put(time, replica);
  }

  /**
   * Purge a replica from the cache.
   *
   * This doesn't necessarily close the replica, since there may be
   * outstanding references to it.  However, it does mean the cache won't
   * hand it out to anyone after this.
   *
   * You must hold the cache lock while calling this function.
   *
   * @param replica   The replica being removed.
   */
  private void purge(ShortCircuitReplica replica) {
    boolean removedFromInfoMap = false;
    String evictionMapName = null;
    Preconditions.checkArgument(!replica.purged);
    replica.purged = true;
    Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
    if (val != null) {
      ShortCircuitReplicaInfo info = val.getVal();
      // Only remove the map entry if it still refers to this replica.
      if ((info != null) && (info.getReplica() == replica)) {
        replicaInfoMap.remove(replica.key);
        removedFromInfoMap = true;
      }
    }
    Long evictableTimeNs = replica.getEvictableTimeNs();
    if (evictableTimeNs != null) {
      evictionMapName = removeEvictable(replica);
    }
    if (LOG.isTraceEnabled()) {
      StringBuilder builder = new StringBuilder();
      builder.append(this).append(": purged ").
          append(replica).append(" from the cache.");
      if (removedFromInfoMap) {
        builder.append("  Removed from the replicaInfoMap.");
      }
      if (evictionMapName != null) {
        builder.append("  Removed from ").append(evictionMapName);
      }
      LOG.trace(builder.toString());
    }
    // Drop the cache's own reference to the replica.
    unref(replica);
  }

  /**
   * Fetch or create a replica.
   *
   * This method acquires the cache lock itself.  If an entry for the key
   * already exists, it is fetched, retrying a bounded number of times when
   * the fetch reports a retriable condition.  If nothing usable is found,
   * the replica is loaded through the creator.
   *
   * @param key          Key to use for lookup.
   * @param creator      Replica creator callback.  Will be called without
   *                     the cache lock being held.
   *
   * @return             Null if the cache is closed.  Otherwise a
   *                     ShortCircuitReplicaInfo, which may contain a replica,
   *                     an InvalidToken exception, or neither when the load
   *                     failed.
   */
  public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator) {
    Waitable<ShortCircuitReplicaInfo> newWaitable = null;
    lock.lock();
    try {
      ShortCircuitReplicaInfo info = null;
      for (int attempt = 0; attempt < FETCH_OR_CREATE_RETRY_TIMES; attempt++) {
        // Re-check on every attempt: fetch() may have released the lock
        // while waiting, so the cache can have been closed meanwhile.
        if (closed) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": can't fetchOrCreate " + key +
                " because the cache is closed.");
          }
          return null;
        }
        Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
        if (waitable == null) {
          // Nothing cached for this key; we must load it ourselves.
          break;
        }
        try {
          info = fetch(key, waitable);
          break;
        } catch (RetriableException e) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(this + ": retrying " + e.getMessage());
          }
        }
      }
      if (info != null) return info;
      // We need to load the replica ourselves.
      newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
      replicaInfoMap.put(key, newWaitable);
    } finally {
      lock.unlock();
    }
    return create(key, creator, newWaitable);
  }

  /**
   * Fetch an existing ReplicaInfo object.
   *
   * @param key       The key that we're using.
   * @param waitable  The waitable object to wait on.
   * @return          The existing ReplicaInfo object.  It may hold an
   *                  InvalidToken exception or no replica at all.
   *
   * @throws RetriableException   If the caller needs to retry.
   */
  private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
      Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
    // Another thread is already in the process of loading this
    // ShortCircuitReplica.  So we simply wait for it to complete.
    ShortCircuitReplicaInfo info;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": found waitable for " + key);
      }
      info = waitable.await();
    } catch (InterruptedException e) {
      LOG.info(this + ": interrupted while waiting for " + key);
      // Preserve the interrupt status for our caller.
      Thread.currentThread().interrupt();
      throw new RetriableException("interrupted");
    }
    if (info.getInvalidTokenException() != null) {
      LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
          "exception.", info.getInvalidTokenException());
      return info;
    }
    ShortCircuitReplica replica = info.getReplica();
    if (replica == null) {
      LOG.warn(this + ": failed to get " + key);
      return info;
    }
    if (replica.purged) {
      // Ignore replicas that have already been purged from the cache.
      throw new RetriableException("Ignoring purged replica " +
          replica + ".  Retrying.");
    }
    // Check if the replica is stale before using it.
    // If it is, purge it and retry.
    if (replica.isStale()) {
      LOG.info(this + ": got stale replica " + replica + ".  Removing " +
          "this replica from the replicaInfoMap and retrying.");
      // Remove the cache's reference to the replica.  This may or may not
      // trigger a close.
      purge(replica);
      throw new RetriableException("ignoring stale replica " + replica);
    }
    ref(replica);
    return info;
  }

  /**
   * Load a new replica through the creator and publish the result to any
   * threads waiting on newWaitable.
   *
   * The creator is invoked before the cache lock is taken.
   *
   * @param key          The key being loaded.
   * @param creator      The callback which creates the replica info.
   * @param newWaitable  The waitable registered for this key.
   * @return             A non-null ShortCircuitReplicaInfo.
   */
  private ShortCircuitReplicaInfo create(ExtendedBlockId key,
      ShortCircuitReplicaCreator creator,
      Waitable<ShortCircuitReplicaInfo> newWaitable) {
    // Handle loading a new replica.
    ShortCircuitReplicaInfo info = null;
    try {
      if (LOG.isTraceEnabled()) {
        LOG.trace(this + ": loading " + key);
      }
      info = creator.createShortCircuitReplicaInfo();
    } catch (RuntimeException e) {
      LOG.warn(this + ": failed to load " + key, e);
    }
    if (info == null) info = new ShortCircuitReplicaInfo();
    lock.lock();
    try {
      if (info.getReplica() != null) {
        // On success, make sure the cache cleaner thread is running.
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": successfully loaded " + info.getReplica());
        }
        startCacheCleanerThreadIfNeeded();
        // Note: new ShortCircuitReplicas start with a refCount of 2,
        // indicating that both this cache and whoever requested the
        // creation of the replica hold a reference.  So we don't need
        // to increment the reference count here.
      } else {
        // On failure, remove the waitable from the replicaInfoMap.
        Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
        if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
        if (info.getInvalidTokenException() != null) {
          LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
              "exception.", info.getInvalidTokenException());
        } else {
          LOG.warn(this + ": failed to load " + key);
        }
      }
      newWaitable.provide(info);
    } finally {
      lock.unlock();
    }
    return info;
  }

  /**
   * Create and schedule the CacheCleaner on first use.
   *
   * Called from create() with the cache lock held.
   */
  private void startCacheCleanerThreadIfNeeded() {
    if (cacheCleaner == null) {
      cacheCleaner = new CacheCleaner();
      long rateMs = cacheCleaner.getRateInMs();
      ScheduledFuture<?> future =
          cleanerExecutor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
              TimeUnit.MILLISECONDS);
      cacheCleaner.setFuture(future);
      if (LOG.isDebugEnabled()) {
        LOG.debug(this + ": starting cache cleaner thread which will run " +
          "every " + rateMs + " ms");
      }
    }
  }

  /**
   * Get the existing mmap for a replica, or create one.
   *
   * replica.mmapData holds one of: null (no attempt yet), a MappedByteBuffer
   * (the mmap), a Long (monotonic time in ms of the last failed attempt), or
   * a Condition (another thread is creating the mmap right now).
   *
   * @param replica   The replica to mmap.
   * @param anchored  Passed through to the ClientMmap.
   * @return          The ClientMmap, or null if the mmap could not be created
   *                  or a previous attempt failed less than
   *                  mmapRetryTimeoutMs ago.
   */
  ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica,
      boolean anchored) {
    Condition newCond;
    lock.lock();
    try {
      while (replica.mmapData != null) {
        if (replica.mmapData instanceof MappedByteBuffer) {
          ref(replica);
          MappedByteBuffer mmap = (MappedByteBuffer)replica.mmapData;
          return new ClientMmap(replica, mmap, anchored);
        } else if (replica.mmapData instanceof Long) {
          long lastAttemptTimeMs = (Long)replica.mmapData;
          long delta = Time.monotonicNow() - lastAttemptTimeMs;
          if (delta < mmapRetryTimeoutMs) {
            if (LOG.isTraceEnabled()) {
              LOG.trace(this + ": can't create client mmap for " +
                  replica + " because we failed to " +
                  "create one just " + delta + "ms ago.");
            }
            return null;
          }
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": retrying client mmap for " + replica +
                ", " + delta + " ms after the previous failure.");
          }
          // Leave the wait loop so that we install a fresh Condition below
          // and retry; mmapData is still a Long, so looping again would
          // spin forever with the lock held.
          break;
        } else if (replica.mmapData instanceof Condition) {
          // Another thread is loading the mmap; wait for it to finish and
          // then re-examine mmapData.
          Condition cond = (Condition)replica.mmapData;
          cond.awaitUninterruptibly();
        } else {
          Preconditions.checkState(false, "invalid mmapData type " +
              replica.mmapData.getClass().getName());
        }
      }
      newCond = lock.newCondition();
      replica.mmapData = newCond;
    } finally {
      lock.unlock();
    }
    // Perform the actual mmap without holding the cache lock.
    MappedByteBuffer map = replica.loadMmapInternal();
    lock.lock();
    try {
      if (map == null) {
        // Remember when we failed so that callers back off for a while.
        replica.mmapData = Long.valueOf(Time.monotonicNow());
        newCond.signalAll();
        return null;
      } else {
        outstandingMmapCount++;
        replica.mmapData = map;
        ref(replica);
        newCond.signalAll();
        return new ClientMmap(replica, map, anchored);
      }
    } finally {
      lock.unlock();
    }
  }

  /**
   * Close the cache and free all associated resources.
   *
   * Calling this more than once is harmless: only the first call purges the
   * replicas.
   */
  @Override
  public void close() {
    // Acquire the lock before entering the try block so that the finally
    // clause never unlocks a lock we failed to take.
    lock.lock();
    try {
      if (closed) return;
      closed = true;
      LOG.info(this + ": closing");
      maxNonMmappedEvictableLifespanMs = 0;
      maxEvictableMmapedSize = 0;
      // Cancel the scheduled cacheCleaner.
      IOUtils.cleanup(LOG, cacheCleaner);
      // Purge all replicas.
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
        if (entry == null) break;
        purge(entry.getValue());
      }
      while (true) {
        Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
        if (entry == null) break;
        purge(entry.getValue());
      }
    } finally {
      lock.unlock();
    }
    IOUtils.cleanup(LOG, shmManager);
  }

  @VisibleForTesting // ONLY for testing
  public interface CacheVisitor {
    void visit(int numOutstandingMmaps,
        Map<ExtendedBlockId, ShortCircuitReplica> replicas,
        Map<ExtendedBlockId, InvalidToken> failedLoads,
        Map<Long, ShortCircuitReplica> evictable,
        Map<Long, ShortCircuitReplica> evictableMmapped);
  }

  /**
   * Show the current cache state to a visitor while holding the cache lock.
   *
   * The replicas and failedLoads maps are snapshots built for this call; the
   * two eviction maps passed to the visitor are the cache's own maps.
   *
   * @param visitor  The visitor to invoke.
   */
  @VisibleForTesting // ONLY for testing
  public void accept(CacheVisitor visitor) {
    lock.lock();
    try {
      Map<ExtendedBlockId, ShortCircuitReplica> replicas =
          new HashMap<ExtendedBlockId, ShortCircuitReplica>();
      Map<ExtendedBlockId, InvalidToken> failedLoads =
          new HashMap<ExtendedBlockId, InvalidToken>();
      for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
            replicaInfoMap.entrySet()) {
        Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
        // Entries still being loaded have no value yet and are skipped.
        if (waitable.hasVal()) {
          if (waitable.getVal().getReplica() != null) {
            replicas.put(entry.getKey(), waitable.getVal().getReplica());
          } else {
            // The exception may be null here, indicating a failed load that
            // isn't the result of an invalid block token.
            failedLoads.put(entry.getKey(),
                waitable.getVal().getInvalidTokenException());
          }
        }
      }
      if (LOG.isDebugEnabled()) {
        StringBuilder builder = new StringBuilder();
        builder.append("visiting ").append(visitor.getClass().getName()).
            append(" with outstandingMmapCount=").append(outstandingMmapCount).
            append(", replicas=");
        String prefix = "";
        for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", failedLoads=");
        for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
          builder.append(prefix).append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictable=");
        for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        prefix = "";
        builder.append(", evictableMmapped=");
        for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
          builder.append(prefix).append(entry.getKey()).
              append(":").append(entry.getValue());
          prefix = ",";
        }
        LOG.debug(builder.toString());
      }
      visitor.visit(outstandingMmapCount, replicas, failedLoads,
            evictable, evictableMmapped);
    } finally {
      lock.unlock();
    }
  }

  @Override
  public String toString() {
    return "ShortCircuitCache(0x" +
        Integer.toHexString(System.identityHashCode(this)) + ")";
  }

  /**
   * Allocate a new shared memory slot.
   *
   * @param datanode       The datanode to allocate a shm slot with.
   * @param peer           A peer connected to the datanode.
   * @param usedPeer       Will be set to true if we use up the provided peer.
   * @param blockId        The block id and block pool id of the block we're
   *                         allocating this slot for.
   * @param clientName     The name of the DFSClient allocating the shared
   *                         memory.
   * @return               Null if short-circuit shared memory is disabled;
   *                         a short-circuit memory slot otherwise.
   * @throws IOException   An exception if there was an error talking to
   *                         the datanode.
   */
  public Slot allocShmSlot(DatanodeInfo datanode,
      DomainPeer peer, MutableBoolean usedPeer,
      ExtendedBlockId blockId, String clientName) throws IOException {
    if (shmManager != null) {
      return shmManager.allocSlot(datanode, peer, usedPeer,
          blockId, clientName);
    } else {
      return null;
    }
  }

  /**
   * Free a slot immediately.
   *
   * ONLY use this if the DataNode is not yet aware of the slot.
   *
   * @param slot           The slot to free.
   */
  public void freeSlot(Slot slot) {
    Preconditions.checkState(shmManager != null);
    slot.makeInvalid();
    shmManager.freeSlot(slot);
  }

  /**
   * Schedule a shared memory slot to be released.
   *
   * @param slot           The slot to release.
   */
  public void scheduleSlotReleaser(Slot slot) {
    Preconditions.checkState(shmManager != null);
    releaserExecutor.execute(new SlotReleaser(slot));
  }

  @VisibleForTesting
  public DfsClientShmManager getDfsClientShmManager() {
    return shmManager;
  }
}