001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hdfs.shortcircuit; 019 020import java.io.FileInputStream; 021import java.io.IOException; 022import java.lang.reflect.Field; 023import java.util.BitSet; 024import java.util.Iterator; 025import java.util.NoSuchElementException; 026import java.util.Random; 027 028import org.apache.commons.lang.builder.EqualsBuilder; 029import org.apache.commons.lang.builder.HashCodeBuilder; 030import org.apache.commons.logging.Log; 031import org.apache.commons.logging.LogFactory; 032import org.apache.hadoop.fs.InvalidRequestException; 033import org.apache.hadoop.hdfs.ExtendedBlockId; 034import org.apache.hadoop.io.nativeio.NativeIO; 035import org.apache.hadoop.io.nativeio.NativeIO.POSIX; 036import org.apache.hadoop.util.Shell; 037import org.apache.hadoop.util.StringUtils; 038 039import sun.misc.Unsafe; 040 041import com.google.common.base.Preconditions; 042import com.google.common.collect.ComparisonChain; 043import com.google.common.primitives.Ints; 044 045/** 046 * A shared memory segment used to implement short-circuit reads. 047 */ 048public class ShortCircuitShm { 049 private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class); 050 051 protected static final int BYTES_PER_SLOT = 64; 052 053 private static final Unsafe unsafe = safetyDance(); 054 055 private static Unsafe safetyDance() { 056 try { 057 Field f = Unsafe.class.getDeclaredField("theUnsafe"); 058 f.setAccessible(true); 059 return (Unsafe)f.get(null); 060 } catch (Throwable e) { 061 LOG.error("failed to load misc.Unsafe", e); 062 } 063 return null; 064 } 065 066 /** 067 * Calculate the usable size of a shared memory segment. 068 * We round down to a multiple of the slot size and do some validation. 069 * 070 * @param stream The stream we're using. 071 * @return The usable size of the shared memory segment. 072 */ 073 private static int getUsableLength(FileInputStream stream) 074 throws IOException { 075 int intSize = Ints.checkedCast(stream.getChannel().size()); 076 int slots = intSize / BYTES_PER_SLOT; 077 if (slots == 0) { 078 throw new IOException("size of shared memory segment was " + 079 intSize + ", but that is not enough to hold even one slot."); 080 } 081 return slots * BYTES_PER_SLOT; 082 } 083 084 /** 085 * Identifies a DfsClientShm. 086 */ 087 public static class ShmId implements Comparable<ShmId> { 088 private static final Random random = new Random(); 089 private final long hi; 090 private final long lo; 091 092 /** 093 * Generate a random ShmId. 094 * 095 * We generate ShmIds randomly to prevent a malicious client from 096 * successfully guessing one and using that to interfere with another 097 * client. 098 */ 099 public static ShmId createRandom() { 100 return new ShmId(random.nextLong(), random.nextLong()); 101 } 102 103 public ShmId(long hi, long lo) { 104 this.hi = hi; 105 this.lo = lo; 106 } 107 108 public long getHi() { 109 return hi; 110 } 111 112 public long getLo() { 113 return lo; 114 } 115 116 @Override 117 public boolean equals(Object o) { 118 if ((o == null) || (o.getClass() != this.getClass())) { 119 return false; 120 } 121 ShmId other = (ShmId)o; 122 return new EqualsBuilder(). 123 append(hi, other.hi). 124 append(lo, other.lo). 125 isEquals(); 126 } 127 128 @Override 129 public int hashCode() { 130 return new HashCodeBuilder(). 131 append(this.hi). 132 append(this.lo). 133 toHashCode(); 134 } 135 136 @Override 137 public String toString() { 138 return String.format("%016x%016x", hi, lo); 139 } 140 141 @Override 142 public int compareTo(ShmId other) { 143 return ComparisonChain.start(). 144 compare(hi, other.hi). 145 compare(lo, other.lo). 146 result(); 147 } 148 }; 149 150 /** 151 * Uniquely identifies a slot. 152 */ 153 public static class SlotId { 154 private final ShmId shmId; 155 private final int slotIdx; 156 157 public SlotId(ShmId shmId, int slotIdx) { 158 this.shmId = shmId; 159 this.slotIdx = slotIdx; 160 } 161 162 public ShmId getShmId() { 163 return shmId; 164 } 165 166 public int getSlotIdx() { 167 return slotIdx; 168 } 169 170 @Override 171 public boolean equals(Object o) { 172 if ((o == null) || (o.getClass() != this.getClass())) { 173 return false; 174 } 175 SlotId other = (SlotId)o; 176 return new EqualsBuilder(). 177 append(shmId, other.shmId). 178 append(slotIdx, other.slotIdx). 179 isEquals(); 180 } 181 182 @Override 183 public int hashCode() { 184 return new HashCodeBuilder(). 185 append(this.shmId). 186 append(this.slotIdx). 187 toHashCode(); 188 } 189 190 @Override 191 public String toString() { 192 return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx); 193 } 194 } 195 196 public class SlotIterator implements Iterator<Slot> { 197 int slotIdx = -1; 198 199 @Override 200 public boolean hasNext() { 201 synchronized (ShortCircuitShm.this) { 202 return allocatedSlots.nextSetBit(slotIdx + 1) != -1; 203 } 204 } 205 206 @Override 207 public Slot next() { 208 synchronized (ShortCircuitShm.this) { 209 int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1); 210 if (nextSlotIdx == -1) { 211 throw new NoSuchElementException(); 212 } 213 slotIdx = nextSlotIdx; 214 return slots[nextSlotIdx]; 215 } 216 } 217 218 @Override 219 public void remove() { 220 throw new UnsupportedOperationException("SlotIterator " + 221 "doesn't support removal"); 222 } 223 } 224 225 /** 226 * A slot containing information about a replica. 227 * 228 * The format is: 229 * word 0 230 * bit 0:32 Slot flags (see below). 231 * bit 33:63 Anchor count. 232 * word 1:7 233 * Reserved for future use, such as statistics. 234 * Padding is also useful for avoiding false sharing. 235 * 236 * Little-endian versus big-endian is not relevant here since both the client 237 * and the server reside on the same computer and use the same orientation. 238 */ 239 public class Slot { 240 /** 241 * Flag indicating that the slot is valid. 242 * 243 * The DFSClient sets this flag when it allocates a new slot within one of 244 * its shared memory regions. 245 * 246 * The DataNode clears this flag when the replica associated with this slot 247 * is no longer valid. The client itself also clears this flag when it 248 * believes that the DataNode is no longer using this slot to communicate. 249 */ 250 private static final long VALID_FLAG = 1L<<63; 251 252 /** 253 * Flag indicating that the slot can be anchored. 254 */ 255 private static final long ANCHORABLE_FLAG = 1L<<62; 256 257 /** 258 * The slot address in memory. 259 */ 260 private final long slotAddress; 261 262 /** 263 * BlockId of the block this slot is used for. 264 */ 265 private final ExtendedBlockId blockId; 266 267 Slot(long slotAddress, ExtendedBlockId blockId) { 268 this.slotAddress = slotAddress; 269 this.blockId = blockId; 270 } 271 272 /** 273 * Get the short-circuit memory segment associated with this Slot. 274 * 275 * @return The enclosing short-circuit memory segment. 276 */ 277 public ShortCircuitShm getShm() { 278 return ShortCircuitShm.this; 279 } 280 281 /** 282 * Get the ExtendedBlockId associated with this slot. 283 * 284 * @return The ExtendedBlockId of this slot. 285 */ 286 public ExtendedBlockId getBlockId() { 287 return blockId; 288 } 289 290 /** 291 * Get the SlotId of this slot, containing both shmId and slotIdx. 292 * 293 * @return The SlotId of this slot. 294 */ 295 public SlotId getSlotId() { 296 return new SlotId(getShmId(), getSlotIdx()); 297 } 298 299 /** 300 * Get the Slot index. 301 * 302 * @return The index of this slot. 303 */ 304 public int getSlotIdx() { 305 return Ints.checkedCast( 306 (slotAddress - baseAddress) / BYTES_PER_SLOT); 307 } 308 309 private boolean isSet(long flag) { 310 long prev = unsafe.getLongVolatile(null, this.slotAddress); 311 return (prev & flag) != 0; 312 } 313 314 private void setFlag(long flag) { 315 long prev; 316 do { 317 prev = unsafe.getLongVolatile(null, this.slotAddress); 318 if ((prev & flag) != 0) { 319 return; 320 } 321 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 322 prev, prev | flag)); 323 } 324 325 private void clearFlag(long flag) { 326 long prev; 327 do { 328 prev = unsafe.getLongVolatile(null, this.slotAddress); 329 if ((prev & flag) == 0) { 330 return; 331 } 332 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 333 prev, prev & (~flag))); 334 } 335 336 public boolean isValid() { 337 return isSet(VALID_FLAG); 338 } 339 340 public void makeValid() { 341 setFlag(VALID_FLAG); 342 } 343 344 public void makeInvalid() { 345 clearFlag(VALID_FLAG); 346 } 347 348 public boolean isAnchorable() { 349 return isSet(ANCHORABLE_FLAG); 350 } 351 352 public void makeAnchorable() { 353 setFlag(ANCHORABLE_FLAG); 354 } 355 356 public void makeUnanchorable() { 357 clearFlag(ANCHORABLE_FLAG); 358 } 359 360 public boolean isAnchored() { 361 long prev = unsafe.getLongVolatile(null, this.slotAddress); 362 if ((prev & VALID_FLAG) == 0) { 363 // Slot is no longer valid. 364 return false; 365 } 366 return ((prev & 0x7fffffff) != 0); 367 } 368 369 /** 370 * Try to add an anchor for a given slot. 371 * 372 * When a slot is anchored, we know that the block it refers to is resident 373 * in memory. 374 * 375 * @return True if the slot is anchored. 376 */ 377 public boolean addAnchor() { 378 long prev; 379 do { 380 prev = unsafe.getLongVolatile(null, this.slotAddress); 381 if ((prev & VALID_FLAG) == 0) { 382 // Slot is no longer valid. 383 return false; 384 } 385 if ((prev & ANCHORABLE_FLAG) == 0) { 386 // Slot can't be anchored right now. 387 return false; 388 } 389 if ((prev & 0x7fffffff) == 0x7fffffff) { 390 // Too many other threads have anchored the slot (2 billion?) 391 return false; 392 } 393 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 394 prev, prev + 1)); 395 return true; 396 } 397 398 /** 399 * Remove an anchor for a given slot. 400 */ 401 public void removeAnchor() { 402 long prev; 403 do { 404 prev = unsafe.getLongVolatile(null, this.slotAddress); 405 Preconditions.checkState((prev & 0x7fffffff) != 0, 406 "Tried to remove anchor for slot " + slotAddress +", which was " + 407 "not anchored."); 408 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 409 prev, prev - 1)); 410 } 411 412 @Override 413 public String toString() { 414 return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")"; 415 } 416 } 417 418 /** 419 * ID for this SharedMemorySegment. 420 */ 421 private final ShmId shmId; 422 423 /** 424 * The base address of the memory-mapped file. 425 */ 426 private final long baseAddress; 427 428 /** 429 * The mmapped length of the shared memory segment 430 */ 431 private final int mmappedLength; 432 433 /** 434 * The slots associated with this shared memory segment. 435 * slot[i] contains the slot at offset i * BYTES_PER_SLOT, 436 * or null if that slot is not allocated. 437 */ 438 private final Slot slots[]; 439 440 /** 441 * A bitset where each bit represents a slot which is in use. 442 */ 443 private final BitSet allocatedSlots; 444 445 /** 446 * Create the ShortCircuitShm. 447 * 448 * @param shmId The ID to use. 449 * @param stream The stream that we're going to use to create this 450 * shared memory segment. 451 * 452 * Although this is a FileInputStream, we are going to 453 * assume that the underlying file descriptor is writable 454 * as well as readable. It would be more appropriate to use 455 * a RandomAccessFile here, but that class does not have 456 * any public accessor which returns a FileDescriptor, 457 * unlike FileInputStream. 458 */ 459 public ShortCircuitShm(ShmId shmId, FileInputStream stream) 460 throws IOException { 461 if (!NativeIO.isAvailable()) { 462 throw new UnsupportedOperationException("NativeIO is not available."); 463 } 464 if (Shell.WINDOWS) { 465 throw new UnsupportedOperationException( 466 "DfsClientShm is not yet implemented for Windows."); 467 } 468 if (unsafe == null) { 469 throw new UnsupportedOperationException( 470 "can't use DfsClientShm because we failed to " + 471 "load misc.Unsafe."); 472 } 473 this.shmId = shmId; 474 this.mmappedLength = getUsableLength(stream); 475 this.baseAddress = POSIX.mmap(stream.getFD(), 476 POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength); 477 this.slots = new Slot[mmappedLength / BYTES_PER_SLOT]; 478 this.allocatedSlots = new BitSet(slots.length); 479 if (LOG.isTraceEnabled()) { 480 LOG.trace("creating " + this.getClass().getSimpleName() + 481 "(shmId=" + shmId + 482 ", mmappedLength=" + mmappedLength + 483 ", baseAddress=" + String.format("%x", baseAddress) + 484 ", slots.length=" + slots.length + ")"); 485 } 486 } 487 488 public final ShmId getShmId() { 489 return shmId; 490 } 491 492 /** 493 * Determine if this shared memory object is empty. 494 * 495 * @return True if the shared memory object is empty. 496 */ 497 synchronized final public boolean isEmpty() { 498 return allocatedSlots.nextSetBit(0) == -1; 499 } 500 501 /** 502 * Determine if this shared memory object is full. 503 * 504 * @return True if the shared memory object is full. 505 */ 506 synchronized final public boolean isFull() { 507 return allocatedSlots.nextClearBit(0) >= slots.length; 508 } 509 510 /** 511 * Calculate the base address of a slot. 512 * 513 * @param slotIdx Index of the slot. 514 * @return The base address of the slot. 515 */ 516 private final long calculateSlotAddress(int slotIdx) { 517 long offset = slotIdx; 518 offset *= BYTES_PER_SLOT; 519 return this.baseAddress + offset; 520 } 521 522 /** 523 * Allocate a new slot and register it. 524 * 525 * This function chooses an empty slot, initializes it, and then returns 526 * the relevant Slot object. 527 * 528 * @return The new slot. 529 */ 530 synchronized public final Slot allocAndRegisterSlot( 531 ExtendedBlockId blockId) { 532 int idx = allocatedSlots.nextClearBit(0); 533 if (idx >= slots.length) { 534 throw new RuntimeException(this + ": no more slots are available."); 535 } 536 allocatedSlots.set(idx, true); 537 Slot slot = new Slot(calculateSlotAddress(idx), blockId); 538 slot.makeValid(); 539 slots[idx] = slot; 540 if (LOG.isTraceEnabled()) { 541 LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots + 542 StringUtils.getStackTrace(Thread.currentThread())); 543 } 544 return slot; 545 } 546 547 synchronized public final Slot getSlot(int slotIdx) 548 throws InvalidRequestException { 549 if (!allocatedSlots.get(slotIdx)) { 550 throw new InvalidRequestException(this + ": slot " + slotIdx + 551 " does not exist."); 552 } 553 return slots[slotIdx]; 554 } 555 556 /** 557 * Register a slot. 558 * 559 * This function looks at a slot which has already been initialized (by 560 * another process), and registers it with us. Then, it returns the 561 * relevant Slot object. 562 * 563 * @return The slot. 564 * 565 * @throws InvalidRequestException 566 * If the slot index we're trying to allocate has not been 567 * initialized, or is already in use. 568 */ 569 synchronized public final Slot registerSlot(int slotIdx, 570 ExtendedBlockId blockId) throws InvalidRequestException { 571 if (slotIdx < 0) { 572 throw new InvalidRequestException(this + ": invalid negative slot " + 573 "index " + slotIdx); 574 } 575 if (slotIdx >= slots.length) { 576 throw new InvalidRequestException(this + ": invalid slot " + 577 "index " + slotIdx); 578 } 579 if (allocatedSlots.get(slotIdx)) { 580 throw new InvalidRequestException(this + ": slot " + slotIdx + 581 " is already in use."); 582 } 583 Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId); 584 if (!slot.isValid()) { 585 throw new InvalidRequestException(this + ": slot " + slotIdx + 586 " has not been allocated."); 587 } 588 slots[slotIdx] = slot; 589 allocatedSlots.set(slotIdx, true); 590 if (LOG.isTraceEnabled()) { 591 LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots + 592 StringUtils.getStackTrace(Thread.currentThread())); 593 } 594 return slot; 595 } 596 597 /** 598 * Unregisters a slot. 599 * 600 * This doesn't alter the contents of the slot. It just means 601 * 602 * @param slotIdx Index of the slot to unregister. 603 */ 604 synchronized public final void unregisterSlot(int slotIdx) { 605 Preconditions.checkState(allocatedSlots.get(slotIdx), 606 "tried to unregister slot " + slotIdx + ", which was not registered."); 607 allocatedSlots.set(slotIdx, false); 608 slots[slotIdx] = null; 609 if (LOG.isTraceEnabled()) { 610 LOG.trace(this + ": unregisterSlot " + slotIdx); 611 } 612 } 613 614 /** 615 * Iterate over all allocated slots. 616 * 617 * Note that this method isn't safe if 618 * 619 * @return The slot iterator. 620 */ 621 public SlotIterator slotIterator() { 622 return new SlotIterator(); 623 } 624 625 public void free() { 626 try { 627 POSIX.munmap(baseAddress, mmappedLength); 628 } catch (IOException e) { 629 LOG.warn(this + ": failed to munmap", e); 630 } 631 LOG.trace(this + ": freed"); 632 } 633 634 @Override 635 public String toString() { 636 return this.getClass().getSimpleName() + "(" + shmId + ")"; 637 } 638}