001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.security.SecureRandom;
import java.util.BitSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Random;

import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.InvalidRequestException;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.io.nativeio.NativeIO.POSIX;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;

import com.google.common.base.Preconditions;
import com.google.common.collect.ComparisonChain;
import com.google.common.primitives.Ints;

import sun.misc.Unsafe;
043
044/**
045 * A shared memory segment used to implement short-circuit reads.
046 */
047public class ShortCircuitShm {
048  private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class);
049
050  protected static final int BYTES_PER_SLOT = 64;
051
052  private static final Unsafe unsafe = safetyDance();
053
054  private static Unsafe safetyDance() {
055    try {
056      Field f = Unsafe.class.getDeclaredField("theUnsafe");
057      f.setAccessible(true);
058      return (Unsafe)f.get(null);
059    } catch (Throwable e) {
060      LOG.error("failed to load misc.Unsafe", e);
061    }
062    return null;
063  }
064
065  /**
066   * Calculate the usable size of a shared memory segment.
067   * We round down to a multiple of the slot size and do some validation.
068   *
069   * @param stream The stream we're using.
070   * @return       The usable size of the shared memory segment.
071   */
072  private static int getUsableLength(FileInputStream stream)
073      throws IOException {
074    int intSize = Ints.checkedCast(stream.getChannel().size());
075    int slots = intSize / BYTES_PER_SLOT;
076    if (slots == 0) {
077      throw new IOException("size of shared memory segment was " +
078          intSize + ", but that is not enough to hold even one slot.");
079    }
080    return slots * BYTES_PER_SLOT;
081  }
082
083  /**
084   * Identifies a DfsClientShm.
085   */
086  public static class ShmId implements Comparable<ShmId> {
087    private static final Random random = new Random();
088    private final long hi;
089    private final long lo;
090
091    /**
092     * Generate a random ShmId.
093     * 
094     * We generate ShmIds randomly to prevent a malicious client from
095     * successfully guessing one and using that to interfere with another
096     * client.
097     */
098    public static ShmId createRandom() {
099      return new ShmId(random.nextLong(), random.nextLong());
100    }
101
102    public ShmId(long hi, long lo) {
103      this.hi = hi;
104      this.lo = lo;
105    }
106    
107    public long getHi() {
108      return hi;
109    }
110    
111    public long getLo() {
112      return lo;
113    }
114
115    @Override
116    public boolean equals(Object o) {
117      if ((o == null) || (o.getClass() != this.getClass())) {
118        return false;
119      }
120      ShmId other = (ShmId)o;
121      return new EqualsBuilder().
122          append(hi, other.hi).
123          append(lo, other.lo).
124          isEquals();
125    }
126
127    @Override
128    public int hashCode() {
129      return new HashCodeBuilder().
130          append(this.hi).
131          append(this.lo).
132          toHashCode();
133    }
134
135    @Override
136    public String toString() {
137      return String.format("%016x%016x", hi, lo);
138    }
139
140    @Override
141    public int compareTo(ShmId other) {
142      return ComparisonChain.start().
143          compare(hi, other.hi).
144          compare(lo, other.lo).
145          result();
146    }
147  };
148
149  /**
150   * Uniquely identifies a slot.
151   */
152  public static class SlotId {
153    private final ShmId shmId;
154    private final int slotIdx;
155    
156    public SlotId(ShmId shmId, int slotIdx) {
157      this.shmId = shmId;
158      this.slotIdx = slotIdx;
159    }
160
161    public ShmId getShmId() {
162      return shmId;
163    }
164
165    public int getSlotIdx() {
166      return slotIdx;
167    }
168
169    @Override
170    public boolean equals(Object o) {
171      if ((o == null) || (o.getClass() != this.getClass())) {
172        return false;
173      }
174      SlotId other = (SlotId)o;
175      return new EqualsBuilder().
176          append(shmId, other.shmId).
177          append(slotIdx, other.slotIdx).
178          isEquals();
179    }
180
181    @Override
182    public int hashCode() {
183      return new HashCodeBuilder().
184          append(this.shmId).
185          append(this.slotIdx).
186          toHashCode();
187    }
188
189    @Override
190    public String toString() {
191      return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx);
192    }
193  }
194
195  public class SlotIterator implements Iterator<Slot> {
196    int slotIdx = -1;
197
198    @Override
199    public boolean hasNext() {
200      synchronized (ShortCircuitShm.this) {
201        return allocatedSlots.nextSetBit(slotIdx + 1) != -1;
202      }
203    }
204
205    @Override
206    public Slot next() {
207      synchronized (ShortCircuitShm.this) {
208        int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1);
209        if (nextSlotIdx == -1) {
210          throw new NoSuchElementException();
211        }
212        slotIdx = nextSlotIdx;
213        return slots[nextSlotIdx];
214      }
215    }
216
217    @Override
218    public void remove() {
219      throw new UnsupportedOperationException("SlotIterator " +
220          "doesn't support removal");
221    }
222  }
223  
224  /**
225   * A slot containing information about a replica.
226   *
227   * The format is:
228   * word 0
229   *   bit 0:32   Slot flags (see below).
230   *   bit 33:63  Anchor count.
231   * word 1:7
232   *   Reserved for future use, such as statistics.
233   *   Padding is also useful for avoiding false sharing.
234   *
235   * Little-endian versus big-endian is not relevant here since both the client
236   * and the server reside on the same computer and use the same orientation.
237   */
238  public class Slot {
239    /**
240     * Flag indicating that the slot is valid.  
241     * 
242     * The DFSClient sets this flag when it allocates a new slot within one of
243     * its shared memory regions.
244     * 
245     * The DataNode clears this flag when the replica associated with this slot
246     * is no longer valid.  The client itself also clears this flag when it
247     * believes that the DataNode is no longer using this slot to communicate.
248     */
249    private static final long VALID_FLAG =          1L<<63;
250
251    /**
252     * Flag indicating that the slot can be anchored.
253     */
254    private static final long ANCHORABLE_FLAG =     1L<<62;
255
256    /**
257     * The slot address in memory.
258     */
259    private final long slotAddress;
260
261    /**
262     * BlockId of the block this slot is used for.
263     */
264    private final ExtendedBlockId blockId;
265
266    Slot(long slotAddress, ExtendedBlockId blockId) {
267      this.slotAddress = slotAddress;
268      this.blockId = blockId;
269    }
270
271    /**
272     * Get the short-circuit memory segment associated with this Slot.
273     *
274     * @return      The enclosing short-circuit memory segment.
275     */
276    public ShortCircuitShm getShm() {
277      return ShortCircuitShm.this;
278    }
279
280    /**
281     * Get the ExtendedBlockId associated with this slot.
282     *
283     * @return      The ExtendedBlockId of this slot.
284     */
285    public ExtendedBlockId getBlockId() {
286      return blockId;
287    }
288
289    /**
290     * Get the SlotId of this slot, containing both shmId and slotIdx.
291     *
292     * @return      The SlotId of this slot.
293     */
294    public SlotId getSlotId() {
295      return new SlotId(getShmId(), getSlotIdx());
296    }
297
298    /**
299     * Get the Slot index.
300     *
301     * @return      The index of this slot.
302     */
303    public int getSlotIdx() {
304      return Ints.checkedCast(
305          (slotAddress - baseAddress) / BYTES_PER_SLOT);
306    }
307
308    private boolean isSet(long flag) {
309      long prev = unsafe.getLongVolatile(null, this.slotAddress);
310      return (prev & flag) != 0;
311    }
312
313    private void setFlag(long flag) {
314      long prev;
315      do {
316        prev = unsafe.getLongVolatile(null, this.slotAddress);
317        if ((prev & flag) != 0) {
318          return;
319        }
320      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
321                  prev, prev | flag));
322    }
323
324    private void clearFlag(long flag) {
325      long prev;
326      do {
327        prev = unsafe.getLongVolatile(null, this.slotAddress);
328        if ((prev & flag) == 0) {
329          return;
330        }
331      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
332                  prev, prev & (~flag)));
333    }
334    
335    public boolean isValid() {
336      return isSet(VALID_FLAG);
337    }
338
339    public void makeValid() {
340      setFlag(VALID_FLAG);
341    }
342
343    public void makeInvalid() {
344      clearFlag(VALID_FLAG);
345    }
346
347    public boolean isAnchorable() {
348      return isSet(ANCHORABLE_FLAG);
349    }
350
351    public void makeAnchorable() {
352      setFlag(ANCHORABLE_FLAG);
353    }
354
355    public void makeUnanchorable() {
356      clearFlag(ANCHORABLE_FLAG);
357    }
358
359    public boolean isAnchored() {
360      long prev = unsafe.getLongVolatile(null, this.slotAddress);
361      if ((prev & VALID_FLAG) == 0) {
362        // Slot is no longer valid.
363        return false;
364      }
365      return ((prev & 0x7fffffff) != 0);
366    }
367
368    /**
369     * Try to add an anchor for a given slot.
370     *
371     * When a slot is anchored, we know that the block it refers to is resident
372     * in memory.
373     *
374     * @return          True if the slot is anchored.
375     */
376    public boolean addAnchor() {
377      long prev;
378      do {
379        prev = unsafe.getLongVolatile(null, this.slotAddress);
380        if ((prev & VALID_FLAG) == 0) {
381          // Slot is no longer valid.
382          return false;
383        }
384        if ((prev & ANCHORABLE_FLAG) == 0) {
385          // Slot can't be anchored right now.
386          return false;
387        }
388        if ((prev & 0x7fffffff) == 0x7fffffff) {
389          // Too many other threads have anchored the slot (2 billion?)
390          return false;
391        }
392      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
393                  prev, prev + 1));
394      return true;
395    }
396
397    /**
398     * Remove an anchor for a given slot.
399     */
400    public void removeAnchor() {
401      long prev;
402      do {
403        prev = unsafe.getLongVolatile(null, this.slotAddress);
404        Preconditions.checkState((prev & 0x7fffffff) != 0,
405            "Tried to remove anchor for slot " + slotAddress +", which was " +
406            "not anchored.");
407      } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
408                  prev, prev - 1));
409    }
410
411    @Override
412    public String toString() {
413      return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")";
414    }
415  }
416
417  /**
418   * ID for this SharedMemorySegment.
419   */
420  private final ShmId shmId;
421
422  /**
423   * The base address of the memory-mapped file.
424   */
425  private final long baseAddress;
426
427  /**
428   * The mmapped length of the shared memory segment
429   */
430  private final int mmappedLength;
431
432  /**
433   * The slots associated with this shared memory segment.
434   * slot[i] contains the slot at offset i * BYTES_PER_SLOT,
435   * or null if that slot is not allocated.
436   */
437  private final Slot slots[];
438
439  /**
440   * A bitset where each bit represents a slot which is in use.
441   */
442  private final BitSet allocatedSlots;
443
444  /**
445   * Create the ShortCircuitShm.
446   * 
447   * @param shmId       The ID to use.
448   * @param stream      The stream that we're going to use to create this 
449   *                    shared memory segment.
450   *                    
451   *                    Although this is a FileInputStream, we are going to
452   *                    assume that the underlying file descriptor is writable
453   *                    as well as readable. It would be more appropriate to use
454   *                    a RandomAccessFile here, but that class does not have
455   *                    any public accessor which returns a FileDescriptor,
456   *                    unlike FileInputStream.
457   */
458  public ShortCircuitShm(ShmId shmId, FileInputStream stream)
459        throws IOException {
460    if (!NativeIO.isAvailable()) {
461      throw new UnsupportedOperationException("NativeIO is not available.");
462    }
463    if (Shell.WINDOWS) {
464      throw new UnsupportedOperationException(
465          "DfsClientShm is not yet implemented for Windows.");
466    }
467    if (unsafe == null) {
468      throw new UnsupportedOperationException(
469          "can't use DfsClientShm because we failed to " +
470          "load misc.Unsafe.");
471    }
472    this.shmId = shmId;
473    this.mmappedLength = getUsableLength(stream);
474    this.baseAddress = POSIX.mmap(stream.getFD(), 
475        POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength);
476    this.slots = new Slot[mmappedLength / BYTES_PER_SLOT];
477    this.allocatedSlots = new BitSet(slots.length);
478    if (LOG.isTraceEnabled()) {
479      LOG.trace("creating " + this.getClass().getSimpleName() +
480          "(shmId=" + shmId +
481          ", mmappedLength=" + mmappedLength +
482          ", baseAddress=" + String.format("%x", baseAddress) +
483          ", slots.length=" + slots.length + ")");
484    }
485  }
486
487  public final ShmId getShmId() {
488    return shmId;
489  }
490  
491  /**
492   * Determine if this shared memory object is empty.
493   *
494   * @return    True if the shared memory object is empty.
495   */
496  synchronized final public boolean isEmpty() {
497    return allocatedSlots.nextSetBit(0) == -1;
498  }
499
500  /**
501   * Determine if this shared memory object is full.
502   *
503   * @return    True if the shared memory object is full.
504   */
505  synchronized final public boolean isFull() {
506    return allocatedSlots.nextClearBit(0) >= slots.length;
507  }
508
509  /**
510   * Calculate the base address of a slot.
511   *
512   * @param slotIdx   Index of the slot.
513   * @return          The base address of the slot.
514   */
515  private final long calculateSlotAddress(int slotIdx) {
516    long offset = slotIdx;
517    offset *= BYTES_PER_SLOT;
518    return this.baseAddress + offset;
519  }
520
521  /**
522   * Allocate a new slot and register it.
523   *
524   * This function chooses an empty slot, initializes it, and then returns
525   * the relevant Slot object.
526   *
527   * @return    The new slot.
528   */
529  synchronized public final Slot allocAndRegisterSlot(
530      ExtendedBlockId blockId) {
531    int idx = allocatedSlots.nextClearBit(0);
532    if (idx >= slots.length) {
533      throw new RuntimeException(this + ": no more slots are available.");
534    }
535    allocatedSlots.set(idx, true);
536    Slot slot = new Slot(calculateSlotAddress(idx), blockId);
537    slot.makeValid();
538    slots[idx] = slot;
539    if (LOG.isTraceEnabled()) {
540      LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots +
541                  StringUtils.getStackTrace(Thread.currentThread()));
542    }
543    return slot;
544  }
545
546  synchronized public final Slot getSlot(int slotIdx)
547      throws InvalidRequestException {
548    if (!allocatedSlots.get(slotIdx)) {
549      throw new InvalidRequestException(this + ": slot " + slotIdx +
550          " does not exist.");
551    }
552    return slots[slotIdx];
553  }
554
555  /**
556   * Register a slot.
557   *
558   * This function looks at a slot which has already been initialized (by
559   * another process), and registers it with us.  Then, it returns the 
560   * relevant Slot object.
561   *
562   * @return    The slot.
563   *
564   * @throws InvalidRequestException
565   *            If the slot index we're trying to allocate has not been
566   *            initialized, or is already in use.
567   */
568  synchronized public final Slot registerSlot(int slotIdx,
569      ExtendedBlockId blockId) throws InvalidRequestException {
570    if (slotIdx < 0) {
571      throw new InvalidRequestException(this + ": invalid negative slot " +
572          "index " + slotIdx);
573    }
574    if (slotIdx >= slots.length) {
575      throw new InvalidRequestException(this + ": invalid slot " +
576          "index " + slotIdx);
577    }
578    if (allocatedSlots.get(slotIdx)) {
579      throw new InvalidRequestException(this + ": slot " + slotIdx +
580          " is already in use.");
581    }
582    Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId);
583    if (!slot.isValid()) {
584      throw new InvalidRequestException(this + ": slot " + slotIdx +
585          " has not been allocated.");
586    }
587    slots[slotIdx] = slot;
588    allocatedSlots.set(slotIdx, true);
589    if (LOG.isTraceEnabled()) {
590      LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots +
591                  StringUtils.getStackTrace(Thread.currentThread()));
592    }
593    return slot;
594  }
595
596  /**
597   * Unregisters a slot.
598   * 
599   * This doesn't alter the contents of the slot.  It just means
600   *
601   * @param slotIdx  Index of the slot to unregister.
602   */
603  synchronized public final void unregisterSlot(int slotIdx) {
604    Preconditions.checkState(allocatedSlots.get(slotIdx),
605        "tried to unregister slot " + slotIdx + ", which was not registered.");
606    allocatedSlots.set(slotIdx, false);
607    slots[slotIdx] = null;
608    if (LOG.isTraceEnabled()) {
609      LOG.trace(this + ": unregisterSlot " + slotIdx);
610    }
611  }
612  
613  /**
614   * Iterate over all allocated slots.
615   * 
616   * Note that this method isn't safe if 
617   *
618   * @return        The slot iterator.
619   */
620  public SlotIterator slotIterator() {
621    return new SlotIterator();
622  }
623
624  public void free() {
625    try {
626      POSIX.munmap(baseAddress, mmappedLength);
627    } catch (IOException e) {
628      LOG.warn(this + ": failed to munmap", e);
629    }
630    LOG.trace(this + ": freed");
631  }
632  
633  @Override
634  public String toString() {
635    return this.getClass().getSimpleName() + "(" + shmId + ")";
636  }
637}