/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ByteBufferUtil;
import org.apache.hadoop.fs.CanSetDropBehind;
import org.apache.hadoop.fs.CanSetReadahead;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.HasEnhancedByteBufferAccess;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
import org.apache.hadoop.io.ByteBufferPool;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.IdentityHashStore;

import com.google.common.annotations.VisibleForTesting;

/****************************************************************
 * DFSInputStream provides bytes from a named file.  It handles
 * negotiation with the namenode and various datanodes as necessary.
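 *
 * <p>Illustrative usage (a sketch, not an API guarantee; assumes a
 * DFSClient has already been constructed and the path exists):
 * <pre>{@code
 *   DFSInputStream in = dfsClient.open("/user/alice/data.bin");
 *   byte[] buf = new byte[8192];
 *   int n;
 *   while ((n = in.read(buf, 0, buf.length)) > 0) {
 *     // process buf[0..n)
 *   }
 *   in.close();
 * }</pre>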
 ****************************************************************/
@InterfaceAudience.Private
public class DFSInputStream extends FSInputStream
implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
    HasEnhancedByteBufferAccess {
  @VisibleForTesting
  static boolean tcpReadsDisabledForTesting = false;
  private final DFSClient dfsClient;
  private boolean closed = false;
  private final String src;
  private BlockReader blockReader = null;
  private final boolean verifyChecksum;
  private LocatedBlocks locatedBlocks = null;
  private long lastBlockBeingWrittenLength = 0;
  private DatanodeInfo currentNode = null;
  private LocatedBlock currentLocatedBlock = null;
  private long pos = 0;
  private long blockEnd = -1;
  private CachingStrategy cachingStrategy;
  private final ReadStatistics readStatistics = new ReadStatistics();

  /**
   * Track the ByteBuffers that we have handed out to readers.
   *
   * The value type can be either ByteBufferPool or ClientMmap, depending on
   * whether this is a memory-mapped buffer or not.
   */
  private final IdentityHashStore<ByteBuffer, Object>
      extendedReadBuffers = new IdentityHashStore<ByteBuffer, Object>(0);

  public static class ReadStatistics {
    public ReadStatistics() {
      this.totalBytesRead = 0;
      this.totalLocalBytesRead = 0;
      this.totalShortCircuitBytesRead = 0;
      this.totalZeroCopyBytesRead = 0;
    }

    public ReadStatistics(ReadStatistics rhs) {
      this.totalBytesRead = rhs.getTotalBytesRead();
      this.totalLocalBytesRead = rhs.getTotalLocalBytesRead();
      this.totalShortCircuitBytesRead = rhs.getTotalShortCircuitBytesRead();
      this.totalZeroCopyBytesRead = rhs.getTotalZeroCopyBytesRead();
    }

    /**
     * @return The total bytes read.  This will always be at least as
     * high as the other numbers, since it includes all of them.
     */
    public long getTotalBytesRead() {
      return totalBytesRead;
    }

    /**
     * @return The total local bytes read.  This will always be at least
     * as high as totalShortCircuitBytesRead, since all short-circuit
     * reads are also local.
     */
    public long getTotalLocalBytesRead() {
      return totalLocalBytesRead;
    }

    /**
     * @return The total short-circuit local bytes read.
     */
    public long getTotalShortCircuitBytesRead() {
      return totalShortCircuitBytesRead;
    }

    /**
     * @return The total number of zero-copy bytes read.
     */
    public long getTotalZeroCopyBytesRead() {
      return totalZeroCopyBytesRead;
    }

    /**
     * @return The total number of bytes read which were not local.
     */
    public long getRemoteBytesRead() {
      return totalBytesRead - totalLocalBytesRead;
    }

    void addRemoteBytes(long amt) {
      this.totalBytesRead += amt;
    }

    void addLocalBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
    }

    void addShortCircuitBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
    }

    void addZeroCopyBytes(long amt) {
      this.totalBytesRead += amt;
      this.totalLocalBytesRead += amt;
      this.totalShortCircuitBytesRead += amt;
      this.totalZeroCopyBytesRead += amt;
    }

    private long totalBytesRead;

    private long totalLocalBytesRead;

    private long totalShortCircuitBytesRead;

    private long totalZeroCopyBytesRead;
  }
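
  // Illustrative sketch (not part of the original code): the counters above
  // nest, so totalBytesRead >= totalLocalBytesRead >=
  // totalShortCircuitBytesRead >= totalZeroCopyBytesRead always holds.
  // Assuming an accessor such as getReadStatistics() is available on the
  // stream, a caller could derive the breakdown like this:
  //
  //   ReadStatistics stats = new ReadStatistics(in.getReadStatistics());
  //   long remote = stats.getRemoteBytesRead();            // total - local
  //   long localOverTcp = stats.getTotalLocalBytesRead()
  //       - stats.getTotalShortCircuitBytesRead();         // local, non-short-circuit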

  /**
   * This variable tracks the number of failures since the start of the
   * most recent user-facing operation. That is to say, it should be reset
   * whenever the user makes a call on this stream, and if at any point
   * during the retry logic, the failure count exceeds a threshold,
   * the errors will be thrown back to the operation.
   *
   * Specifically, this counts the number of times the client has gone
   * back to the namenode to get a new list of block locations, and is
   * capped at maxBlockAcquireFailures.
   */
  private int failures = 0;

  /* XXX Use of ConcurrentHashMap is a temporary fix. Need to fix
   * parallel accesses to DFSInputStream (through preads) properly. */
  private final ConcurrentHashMap<DatanodeInfo, DatanodeInfo> deadNodes =
             new ConcurrentHashMap<DatanodeInfo, DatanodeInfo>();
  private int buffersize = 1;

  private final byte[] oneByteBuf = new byte[1]; // used for 'int read()'

  void addToDeadNodes(DatanodeInfo dnInfo) {
    deadNodes.put(dnInfo, dnInfo);
  }

  DFSInputStream(DFSClient dfsClient, String src, int buffersize, boolean verifyChecksum
                 ) throws IOException, UnresolvedLinkException {
    this.dfsClient = dfsClient;
    this.verifyChecksum = verifyChecksum;
    this.buffersize = buffersize;
    this.src = src;
    this.cachingStrategy =
        dfsClient.getDefaultReadCachingStrategy();
    openInfo();
  }

  /**
   * Grab the open-file info from namenode
   */
  synchronized void openInfo() throws IOException, UnresolvedLinkException {
    lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
    int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
    while (retriesForLastBlockLength > 0) {
      // Getting last block length as -1 is a special case. When the cluster
      // restarts, DNs may not report immediately. At that point the partial
      // block's locations are not yet available from the NN for getting its
      // length. Let's retry a few times to get the length.
      if (lastBlockBeingWrittenLength == -1) {
        DFSClient.LOG.warn("Last block locations not available. "
            + "Datanodes might not have reported blocks completely."
            + " Will retry for " + retriesForLastBlockLength + " times");
        waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
        lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
      } else {
        break;
      }
      retriesForLastBlockLength--;
    }
    if (retriesForLastBlockLength == 0) {
      throw new IOException("Could not obtain the last block locations.");
    }
  }
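
  // Tuning sketch (illustrative, assuming the standard client retry keys):
  // the retry count and interval used by openInfo() come from configuration,
  // so a client that must tolerate slow DN re-registration after a cluster
  // restart might raise them, e.g.:
  //
  //   Configuration conf = new HdfsConfiguration();
  //   conf.setInt("dfs.client.retry.times.get-last-block-length", 5);
  //   conf.setInt("dfs.client.retry.interval-ms.get-last-block-length", 4000);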

  private void waitFor(int waitTime) throws IOException {
    try {
      Thread.sleep(waitTime);
    } catch (InterruptedException e) {
      throw new IOException(
          "Interrupted while getting the last block length.");
    }
  }

  private long fetchLocatedBlocksAndGetLastBlockLength() throws IOException {
    final LocatedBlocks newInfo = dfsClient.getLocatedBlocks(src, 0);
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("newInfo = " + newInfo);
    }
    if (newInfo == null) {
      throw new IOException("Cannot open filename " + src);
    }

    if (locatedBlocks != null) {
      Iterator<LocatedBlock> oldIter = locatedBlocks.getLocatedBlocks().iterator();
      Iterator<LocatedBlock> newIter = newInfo.getLocatedBlocks().iterator();
      while (oldIter.hasNext() && newIter.hasNext()) {
        if (! oldIter.next().getBlock().equals(newIter.next().getBlock())) {
          throw new IOException("Blocklist for " + src + " has changed!");
        }
      }
    }
    locatedBlocks = newInfo;
    long lastBlockBeingWrittenLength = 0;
    if (!locatedBlocks.isLastBlockComplete()) {
      final LocatedBlock last = locatedBlocks.getLastLocatedBlock();
      if (last != null) {
        if (last.getLocations().length == 0) {
          if (last.getBlockSize() == 0) {
            // if the length is zero, then no data has been written to the
            // datanode, so there is no need to wait for the locations.
            return 0;
          }
          return -1;
        }
        final long len = readBlockLength(last);
        last.getBlock().setNumBytes(len);
        lastBlockBeingWrittenLength = len;
      }
    }

    currentNode = null;
    return lastBlockBeingWrittenLength;
  }

  /** Read the block length from one of the datanodes. */
  private long readBlockLength(LocatedBlock locatedblock) throws IOException {
    assert locatedblock != null : "LocatedBlock cannot be null";
    int replicaNotFoundCount = locatedblock.getLocations().length;

    for(DatanodeInfo datanode : locatedblock.getLocations()) {
      ClientDatanodeProtocol cdp = null;

      try {
        cdp = DFSUtil.createClientDatanodeProtocolProxy(datanode,
            dfsClient.getConfiguration(), dfsClient.getConf().socketTimeout,
            dfsClient.getConf().connectToDnViaHostname, locatedblock);

        final long n = cdp.getReplicaVisibleLength(locatedblock.getBlock());

        if (n >= 0) {
          return n;
        }
      }
      catch(IOException ioe) {
        if (ioe instanceof RemoteException &&
          (((RemoteException) ioe).unwrapRemoteException() instanceof
            ReplicaNotFoundException)) {
          // special case : replica might not be on the DN, treat as 0 length
          replicaNotFoundCount--;
        }

        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Failed to getReplicaVisibleLength from datanode "
              + datanode + " for block " + locatedblock.getBlock(), ioe);
        }
      } finally {
        if (cdp != null) {
          RPC.stopProxy(cdp);
        }
      }
    }

    // The namenode told us about these locations, but none of them knows
    // about the replica. This means we hit the race between pipeline creation
    // start and end. We require all of them to report ReplicaNotFound because
    // some other exception could have happened on a DN that has the replica,
    // and we want to report that error instead.
    if (replicaNotFoundCount == 0) {
      return 0;
    }

    throw new IOException("Cannot obtain block length for " + locatedblock);
  }

  public synchronized long getFileLength() {
    return locatedBlocks == null? 0:
        locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
  }

  // Short circuit local reads are forbidden for files that are
  // under construction.  See HDFS-2757.
  synchronized boolean shortCircuitForbidden() {
    return locatedBlocks.isUnderConstruction();
  }

  /**
   * Returns the datanode from which the stream is currently reading.
   */
  public DatanodeInfo getCurrentDatanode() {
    return currentNode;
  }

  /**
   * Returns the block containing the target position.
   */
  synchronized public ExtendedBlock getCurrentBlock() {
    if (currentLocatedBlock == null){
      return null;
    }
    return currentLocatedBlock.getBlock();
  }

  /**
   * Return the collection of blocks that have already been located.
   */
  public synchronized List<LocatedBlock> getAllBlocks() throws IOException {
    return getBlockRange(0, getFileLength());
  }

  /**
   * Get block at the specified position.
   * Fetch it from the namenode if not cached.
   *
   * @param offset
   * @param updatePosition whether to update current position
   * @return located block
   * @throws IOException
   */
  private synchronized LocatedBlock getBlockAt(long offset,
      boolean updatePosition) throws IOException {
    assert (locatedBlocks != null) : "locatedBlocks is null";

    final LocatedBlock blk;

    //check offset
    if (offset < 0 || offset >= getFileLength()) {
      throw new IOException("offset < 0 || offset >= getFileLength(), offset="
          + offset
          + ", updatePosition=" + updatePosition
          + ", locatedBlocks=" + locatedBlocks);
    }
    else if (offset >= locatedBlocks.getFileLength()) {
      // offset to the portion of the last block,
      // which is not known to the name-node yet;
      // getting the last block
      blk = locatedBlocks.getLastLocatedBlock();
    }
    else {
      // search cached blocks first
      int targetBlockIdx = locatedBlocks.findBlock(offset);
      if (targetBlockIdx < 0) { // block is not cached
        targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
        // fetch more blocks
        final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
        assert (newBlocks != null) : "Could not find target position " + offset;
        locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
      }
      blk = locatedBlocks.get(targetBlockIdx);
    }

    // update current position
    if (updatePosition) {
      pos = offset;
      blockEnd = blk.getStartOffset() + blk.getBlockSize() - 1;
      currentLocatedBlock = blk;
    }
    return blk;
  }

  /** Fetch a block from namenode and cache it */
  private synchronized void fetchBlockAt(long offset) throws IOException {
    int targetBlockIdx = locatedBlocks.findBlock(offset);
    if (targetBlockIdx < 0) { // block is not cached
      targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
    }
    // fetch blocks
    final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
    if (newBlocks == null) {
      throw new IOException("Could not find target position " + offset);
    }
    locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
  }

  /**
   * Get blocks in the specified range.
   * Fetch them from the namenode if not cached. This function
   * does not allow a read request beyond the EOF.
   * @param offset
   * @param length
   * @return a consecutive segment of located blocks
   * @throws IOException
   */
  private synchronized List<LocatedBlock> getBlockRange(long offset,
                                                        long length)
                                                      throws IOException {
    // getFileLength(): returns total file length
    // locatedBlocks.getFileLength(): returns length of completed blocks
    if (offset >= getFileLength()) {
      throw new IOException("Offset: " + offset +
        " exceeds file length: " + getFileLength());
    }

    final List<LocatedBlock> blocks;
    final long lengthOfCompleteBlk = locatedBlocks.getFileLength();
    final boolean readOffsetWithinCompleteBlk = offset < lengthOfCompleteBlk;
    final boolean readLengthPastCompleteBlk = offset + length > lengthOfCompleteBlk;

    if (readOffsetWithinCompleteBlk) {
      //get the blocks of finalized (completed) block range
      blocks = getFinalizedBlockRange(offset,
        Math.min(length, lengthOfCompleteBlk - offset));
    } else {
      blocks = new ArrayList<LocatedBlock>(1);
    }

    // get the blocks from incomplete block range
    if (readLengthPastCompleteBlk) {
       blocks.add(locatedBlocks.getLastLocatedBlock());
    }

    return blocks;
  }

  /**
   * Get blocks in the specified range.
   * Includes only the complete blocks.
   * Fetch them from the namenode if not cached.
   */
  private synchronized List<LocatedBlock> getFinalizedBlockRange(
      long offset, long length) throws IOException {
    assert (locatedBlocks != null) : "locatedBlocks is null";
    List<LocatedBlock> blockRange = new ArrayList<LocatedBlock>();
    // search cached blocks first
    int blockIdx = locatedBlocks.findBlock(offset);
    if (blockIdx < 0) { // block is not cached
      blockIdx = LocatedBlocks.getInsertIndex(blockIdx);
    }
    long remaining = length;
    long curOff = offset;
    while(remaining > 0) {
      LocatedBlock blk = null;
      if(blockIdx < locatedBlocks.locatedBlockCount())
        blk = locatedBlocks.get(blockIdx);
      if (blk == null || curOff < blk.getStartOffset()) {
        LocatedBlocks newBlocks;
        newBlocks = dfsClient.getLocatedBlocks(src, curOff, remaining);
        locatedBlocks.insertRange(blockIdx, newBlocks.getLocatedBlocks());
        continue;
      }
      assert curOff >= blk.getStartOffset() : "Block not found";
      blockRange.add(blk);
      long bytesRead = blk.getStartOffset() + blk.getBlockSize() - curOff;
      remaining -= bytesRead;
      curOff += bytesRead;
      blockIdx++;
    }
    return blockRange;
  }
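
  // Worked example for getFinalizedBlockRange() (illustrative, assuming a
  // 128 MB block size): a call with offset = 200 MB and length = 100 MB
  // starts at the block covering [128 MB, 256 MB):
  //   blk[1]: bytesRead = 128 + 128 - 200 = 56 MB, remaining = 44 MB
  //   blk[2] covering [256 MB, 384 MB): bytesRead = 128 MB, remaining < 0, stop
  // so the returned range is { blk[1], blk[2] }.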

  /**
   * Open a DataInputStream to a DataNode so that it can be read from.
   * We get the block ID and the IDs of the destination datanodes from the
   * namenode at startup.
   */
  private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
    if (target >= getFileLength()) {
      throw new IOException("Attempted to read past end of file");
    }

    // Will be getting a new BlockReader.
    if (blockReader != null) {
      blockReader.close();
      blockReader = null;
    }

    //
    // Connect to best DataNode for desired Block, with potential offset
    //
    DatanodeInfo chosenNode = null;
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    boolean connectFailedOnce = false;

    while (true) {
      //
      // Compute desired block
      //
      LocatedBlock targetBlock = getBlockAt(target, true);
      assert (target==pos) : "Wrong position " + pos + " expect " + target;
      long offsetIntoBlock = target - targetBlock.getStartOffset();

      DNAddrPair retval = chooseDataNode(targetBlock, null);
      chosenNode = retval.info;
      InetSocketAddress targetAddr = retval.addr;

      try {
        ExtendedBlock blk = targetBlock.getBlock();
        Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
        blockReader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setFileName(src).
            setBlock(blk).
            setBlockToken(accessToken).
            setStartOffset(offsetIntoBlock).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(blk.getNumBytes() - offsetIntoBlock).
            setCachingStrategy(cachingStrategy).
            setAllowShortCircuitLocalReads(!shortCircuitForbidden()).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            build();
        if(connectFailedOnce) {
          DFSClient.LOG.info("Successfully connected to " + targetAddr +
                             " for " + blk);
        }
        return chosenNode;
      } catch (IOException ex) {
        if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + ex);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
        } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
          refetchToken--;
          fetchBlockAt(target);
        } else {
          connectFailedOnce = true;
          DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
            + ", add to deadNodes and continue. " + ex, ex);
          // Put chosen node into dead list, continue
          addToDeadNodes(chosenNode);
        }
      }
    }
  }

  /**
   * Close it down!
   */
  @Override
  public synchronized void close() throws IOException {
    if (closed) {
      return;
    }
    dfsClient.checkOpen();

    if (!extendedReadBuffers.isEmpty()) {
      final StringBuilder builder = new StringBuilder();
      extendedReadBuffers.visitAll(new IdentityHashStore.Visitor<ByteBuffer, Object>() {
        private String prefix = "";
        @Override
        public void accept(ByteBuffer k, Object v) {
          builder.append(prefix).append(k);
          prefix = ", ";
        }
      });
      DFSClient.LOG.warn("closing file " + src + ", but there are still " +
          "unreleased ByteBuffers allocated by read().  " +
          "Please release " + builder.toString() + ".");
    }
    if (blockReader != null) {
      blockReader.close();
      blockReader = null;
    }
    super.close();
    closed = true;
  }

  @Override
  public synchronized int read() throws IOException {
    int ret = read( oneByteBuf, 0, 1 );
    return ( ret <= 0 ) ? -1 : (oneByteBuf[0] & 0xff);
  }

  /**
   * Wraps different possible read implementations so that readBuffer can be
   * strategy-agnostic.
   */
  private interface ReaderStrategy {
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException;
  }

  private static void updateReadStatistics(ReadStatistics readStatistics,
        int nRead, BlockReader blockReader) {
    if (nRead <= 0) return;
    if (blockReader.isShortCircuit()) {
      readStatistics.addShortCircuitBytes(nRead);
    } else if (blockReader.isLocal()) {
      readStatistics.addLocalBytes(nRead);
    } else {
      readStatistics.addRemoteBytes(nRead);
    }
  }

  /**
   * Used to read bytes into a byte[]
   */
  private static class ByteArrayStrategy implements ReaderStrategy {
    final byte[] buf;

    public ByteArrayStrategy(byte[] buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
            ReadStatistics readStatistics) throws ChecksumException, IOException {
        int nRead = blockReader.read(buf, off, len);
        updateReadStatistics(readStatistics, nRead, blockReader);
        return nRead;
    }
  }

  /**
   * Used to read bytes into a user-supplied ByteBuffer
   */
  private static class ByteBufferStrategy implements ReaderStrategy {
    final ByteBuffer buf;
    ByteBufferStrategy(ByteBuffer buf) {
      this.buf = buf;
    }

    @Override
    public int doRead(BlockReader blockReader, int off, int len,
        ReadStatistics readStatistics) throws ChecksumException, IOException {
      int oldpos = buf.position();
      int oldlimit = buf.limit();
      boolean success = false;
      try {
        int ret = blockReader.read(buf);
        success = true;
        updateReadStatistics(readStatistics, ret, blockReader);
        return ret;
      } finally {
        if (!success) {
          // Reset to original state so that retries work correctly.
          buf.position(oldpos);
          buf.limit(oldlimit);
        }
      }
    }
  }
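
  // Illustrative note (not part of the original code): the save/restore in
  // ByteBufferStrategy is the standard idiom for making a partially-failed
  // ByteBuffer read retryable. A minimal standalone sketch, where `channel`
  // is any hypothetical ReadableByteChannel:
  //
  //   ByteBuffer buf = ByteBuffer.allocate(8192);
  //   int oldPos = buf.position(), oldLimit = buf.limit();
  //   try {
  //     channel.read(buf);                       // may fail or read partially
  //   } catch (IOException e) {
  //     buf.position(oldPos);                    // rewind so a retry sees the
  //     buf.limit(oldLimit);                     // same window of the buffer
  //     throw e;
  //   }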

  /* This is used by the regular read() and handles ChecksumExceptions.
   * The name readBuffer() is chosen to imply similarity to readBuffer() in
   * ChecksumFileSystem.
   */
  private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    IOException ioe;

    /* We retry the current node only once, so this is set to true only here.
     * The intention is to handle one common case of an error that is not a
     * failure on the datanode or the client: the DataNode closing the
     * connection because the client was idle. If there are other cases of
     * "non-errors" then a datanode might be retried by setting this to true
     * again.
     */
    boolean retryCurrentNode = true;

    while (true) {
      // retry as many times as seekToNewSource allows.
      try {
        return reader.doRead(blockReader, off, len, readStatistics);
      } catch ( ChecksumException ce ) {
        DFSClient.LOG.warn("Found Checksum error for "
            + getCurrentBlock() + " from " + currentNode
            + " at " + ce.getPos());
        ioe = ce;
        retryCurrentNode = false;
        // we want to remember which block replicas we have tried
        addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
            corruptedBlockMap);
      } catch ( IOException e ) {
        if (!retryCurrentNode) {
          DFSClient.LOG.warn("Exception while reading from "
              + getCurrentBlock() + " of " + src + " from "
              + currentNode, e);
        }
        ioe = e;
      }
      boolean sourceFound = false;
      if (retryCurrentNode) {
        /* possibly retry the same node so that transient errors don't
         * result in application level failures (e.g. Datanode could have
         * closed the connection because the client is idle for too long).
         */
        sourceFound = seekToBlockSource(pos);
      } else {
        addToDeadNodes(currentNode);
        sourceFound = seekToNewSource(pos);
      }
      if (!sourceFound) {
        throw ioe;
      }
      retryCurrentNode = false;
    }
  }

  private int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
    dfsClient.checkOpen();
    if (closed) {
      throw new IOException("Stream closed");
    }
    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap
      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    failures = 0;
    if (pos < getFileLength()) {
      int retries = 2;
      while (retries > 0) {
        try {
          // currentNode can be left as null if previous read had a checksum
          // error on the same block. See HDFS-3067
          if (pos > blockEnd || currentNode == null) {
            currentNode = blockSeekTo(pos);
          }
          int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
          if (locatedBlocks.isLastBlockComplete()) {
            realLen = (int) Math.min(realLen, locatedBlocks.getFileLength());
          }
          int result = readBuffer(strategy, off, realLen, corruptedBlockMap);

          if (result >= 0) {
            pos += result;
          } else {
            // got an EOS from the reader though we expect more data on it.
            throw new IOException("Unexpected EOS from the reader");
          }
          if (dfsClient.stats != null && result != -1) {
            dfsClient.stats.incrementBytesRead(result);
          }
          return result;
        } catch (ChecksumException ce) {
          throw ce;
        } catch (IOException e) {
          if (retries == 1) {
            DFSClient.LOG.warn("DFS Read", e);
          }
          blockEnd = -1;
          if (currentNode != null) { addToDeadNodes(currentNode); }
          if (--retries == 0) {
            throw e;
          }
        } finally {
          // Check whether we need to report block replica corruption, whether
          // the read was successful or a ChecksumException occurred.
          reportCheckSumFailure(corruptedBlockMap,
              currentLocatedBlock.getLocations().length);
        }
      }
    }
    return -1;
  }

  /**
   * Read up to the specified number of bytes into the given buffer.
   */
  @Override
  public synchronized int read(final byte buf[], int off, int len) throws IOException {
    ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);

    return readWithStrategy(byteArrayReader, off, len);
  }

  @Override
  public synchronized int read(final ByteBuffer buf) throws IOException {
    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);

    return readWithStrategy(byteBufferReader, 0, buf.remaining());
  }


  /**
   * Add corrupted block replica into map.
   * @param corruptedBlockMap
   */
  private void addIntoCorruptedBlockMap(ExtendedBlock blk, DatanodeInfo node,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
    Set<DatanodeInfo> dnSet = null;
    if((corruptedBlockMap.containsKey(blk))) {
      dnSet = corruptedBlockMap.get(blk);
    }else {
      dnSet = new HashSet<DatanodeInfo>();
    }
    if (!dnSet.contains(node)) {
      dnSet.add(node);
      corruptedBlockMap.put(blk, dnSet);
    }
  }

  private DNAddrPair chooseDataNode(LocatedBlock block,
      Collection<DatanodeInfo> ignoredNodes) throws IOException {
    while (true) {
      DatanodeInfo[] nodes = block.getLocations();
      try {
        return getBestNodeDNAddrPair(nodes, ignoredNodes);
      } catch (IOException ie) {
        String errMsg =
          getBestNodeDNAddrPairErrorString(nodes, deadNodes, ignoredNodes);
        String blockInfo = block.getBlock() + " file=" + src;
        if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
          String description = "Could not obtain block: " + blockInfo;
          DFSClient.LOG.warn(description + errMsg
              + ". Throwing a BlockMissingException");
          throw new BlockMissingException(src, description,
              block.getStartOffset());
        }

        if (nodes == null || nodes.length == 0) {
          DFSClient.LOG.info("No node available for " + blockInfo);
        }
        DFSClient.LOG.info("Could not obtain " + block.getBlock()
            + " from any node: " + ie + errMsg
            + ". Will get new block locations from namenode and retry...");
        try {
          // Introducing a random factor to the wait time before another retry.
          // The wait time is dependent on # of failures and a random factor.
          // At the first time of getting a BlockMissingException, the wait time
          // is a random number between 0..3000 ms. If the first retry
          // still fails, we will wait 3000 ms grace period before the 2nd retry.
          // Also at the second retry, the waiting window is expanded to 6000 ms
          // alleviating the request rate from the server. Similarly the 3rd retry
          // will wait 6000ms grace period before retry and the waiting window is
          // expanded to 9000ms.
          final int timeWindow = dfsClient.getConf().timeWindow;
          double waitTime = timeWindow * failures +       // grace period for the last round of attempt
            timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
          DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
          Thread.sleep((long)waitTime);
        } catch (InterruptedException iex) {
        }
        deadNodes.clear(); //2nd option is to remove only nodes[blockId]
        openInfo();
        block = getBlockAt(block.getStartOffset(), false);
        failures++;
        continue;
      }
    }
  }
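
  // Worked example of the backoff in chooseDataNode() (illustrative, assuming
  // the default base window of 3000 ms):
  //   failures = 0: wait is uniform in [0, 3000) ms
  //   failures = 1: wait is 3000 ms plus uniform in [0, 6000) ms
  //   failures = 2: wait is 6000 ms plus uniform in [0, 9000) ms
  // i.e. a fixed grace period that grows linearly, plus an expanding random
  // window that spreads retries from many clients.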

  /**
   * Get the best node.
   * @param nodes Nodes to choose from.
   * @param ignoredNodes Do not choose nodes in this collection (may be null)
   * @return The DNAddrPair of the best node.
   * @throws IOException
   */
  private DNAddrPair getBestNodeDNAddrPair(final DatanodeInfo[] nodes,
      Collection<DatanodeInfo> ignoredNodes) throws IOException {
    DatanodeInfo chosenNode = bestNode(nodes, deadNodes, ignoredNodes);
    final String dnAddr =
        chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
    }
    InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
    return new DNAddrPair(chosenNode, targetAddr);
  }

  private static String getBestNodeDNAddrPairErrorString(
      DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
      DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {
    StringBuilder errMsgr = new StringBuilder(
        " No live nodes contain current block ");
    errMsgr.append("Block locations:");
    for (DatanodeInfo datanode : nodes) {
      errMsgr.append(" ");
      errMsgr.append(datanode.toString());
    }
    errMsgr.append(" Dead nodes: ");
    for (DatanodeInfo datanode : deadNodes.keySet()) {
      errMsgr.append(" ");
      errMsgr.append(datanode.toString());
    }
    if (ignoredNodes != null) {
      errMsgr.append(" Ignored nodes: ");
      for (DatanodeInfo datanode : ignoredNodes) {
        errMsgr.append(" ");
        errMsgr.append(datanode.toString());
      }
    }
    return errMsgr.toString();
  }

  private void fetchBlockByteRange(LocatedBlock block, long start, long end,
      byte[] buf, int offset,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    block = getBlockAt(block.getStartOffset(), false);
    while (true) {
      DNAddrPair addressPair = chooseDataNode(block, null);
      try {
        actualGetFromOneDataNode(addressPair, block, start, end, buf, offset,
            corruptedBlockMap);
        return;
      } catch (IOException e) {
        // Ignore. Already processed inside the function.
        // Loop through to try the next node.
      }
    }
  }

  private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode,
      final LocatedBlock block, final long start, final long end,
      final ByteBuffer bb,
      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      final CountDownLatch latch) {
    return new Callable<ByteBuffer>() {
      @Override
      public ByteBuffer call() throws Exception {
        try {
          byte[] buf = bb.array();
          int offset = bb.position();
          actualGetFromOneDataNode(datanode, block, start, end, buf, offset,
              corruptedBlockMap);
          return bb;
        } finally {
          latch.countDown();
        }
      }
    };
  }

  private void actualGetFromOneDataNode(final DNAddrPair datanode,
      LocatedBlock block, final long start, final long end, byte[] buf,
      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    DFSClientFaultInjector.get().startFetchFromDatanode();
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once

    while (true) {
      // cached block locations may have been updated by chooseDataNode()
      // or fetchBlockAt(). Always get the latest list of locations at the
      // start of the loop.
      CachingStrategy curCachingStrategy;
      boolean allowShortCircuitLocalReads;
      synchronized (this) {
        block = getBlockAt(block.getStartOffset(), false);
        curCachingStrategy = cachingStrategy;
        allowShortCircuitLocalReads = !shortCircuitForbidden();
      }
      DatanodeInfo chosenNode = datanode.info;
      InetSocketAddress targetAddr = datanode.addr;
      BlockReader reader = null;

      try {
        Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
        int len = (int) (end - start + 1);
        reader = new BlockReaderFactory(dfsClient.getConf()).
            setInetSocketAddress(targetAddr).
            setRemotePeerFactory(dfsClient).
            setDatanodeInfo(chosenNode).
            setFileName(src).
            setBlock(block.getBlock()).
            setBlockToken(blockToken).
            setStartOffset(start).
            setVerifyChecksum(verifyChecksum).
            setClientName(dfsClient.clientName).
            setLength(len).
            setCachingStrategy(curCachingStrategy).
            setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
            setClientCacheContext(dfsClient.getClientContext()).
            setUserGroupInformation(dfsClient.ugi).
            setConfiguration(dfsClient.getConfiguration()).
            build();
        int nread = reader.readAll(buf, offset, len);
        if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
                                "expected " + len + ", got " + nread);
        }
        return;
      } catch (ChecksumException e) {
        String msg = "fetchBlockByteRange(). Got a checksum exception for "
            + src + " at " + block.getBlock() + ":" + e.getPos() + " from "
            + chosenNode;
        DFSClient.LOG.warn(msg);
        // we want to remember what we have tried
        addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
        addToDeadNodes(chosenNode);
        throw new IOException(msg);
      } catch (IOException e) {
        if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
          DFSClient.LOG.info("Will fetch a new encryption key and retry, "
              + "encryption key was invalid when connecting to " + targetAddr
              + " : " + e);
          // The encryption key used is invalid.
          refetchEncryptionKey--;
          dfsClient.clearDataEncryptionKey();
          continue;
        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
          refetchToken--;
          try {
            fetchBlockAt(block.getStartOffset());
          } catch (IOException fbae) {
            // ignore IOE, since we can retry it later in a loop
          }
          continue;
        } else {
          String msg = "Failed to connect to " + targetAddr + " for file "
              + src + " for block " + block.getBlock() + ":" + e;
          DFSClient.LOG.warn("Connection failure: " + msg, e);
          addToDeadNodes(chosenNode);
          throw new IOException(msg);
        }
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
  }

  /**
   * Like {@link #fetchBlockByteRange(LocatedBlock, long, long, byte[],
   * int, Map)} except we start up a second, parallel, 'hedged' read
   * if the first read is taking longer than the configured amount of
   * time.  We then wait on whichever read returns first.
   *
   * @param block
   * @param start
   * @param end
   * @param buf
   * @param offset
   * @param corruptedBlockMap
   * @throws IOException
   */
  private void hedgedFetchBlockByteRange(LocatedBlock block, long start,
      long end, byte[] buf, int offset,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    ArrayList<Future<ByteBuffer>> futures = new ArrayList<Future<ByteBuffer>>();
    ArrayList<DatanodeInfo> ignored = new ArrayList<DatanodeInfo>();
    ByteBuffer bb = null;
    int len = (int) (end - start + 1);
    block = getBlockAt(block.getStartOffset(), false);
    // Latch shared by all outstanding reads.  The first to finish counts it
    // down.
    CountDownLatch hasReceivedResult = new CountDownLatch(1);
    while (true) {
      DNAddrPair chosenNode = null;
      Future<ByteBuffer> future = null;
      // futures is empty if no request is already executing.
      if (futures.isEmpty()) {
        // chooseDataNode is a commitment.  If no node can be chosen, we go to
        // the NN to re-fetch block locations.  Only go here on the first read.
        chosenNode = chooseDataNode(block, ignored);
        bb = ByteBuffer.wrap(buf, offset, len);
        future = getHedgedReadFuture(chosenNode, block, start, end, bb,
          corruptedBlockMap, hasReceivedResult);
        try {
          future.get(dfsClient.getHedgedReadTimeout(), TimeUnit.MILLISECONDS);
          return;
        } catch (TimeoutException e) {
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Waited " + dfsClient.getHedgedReadTimeout() +
              "ms to read from " + chosenNode.info + "; spawning hedged read");
          }
          // Ignore this node on next go around.
          ignored.add(chosenNode.info);
          dfsClient.getHedgedReadMetrics().incHedgedReadOps();
          futures.add(future);
          continue; // no need to refresh block locations
        } catch (InterruptedException e) {
          // Ignore
        } catch (ExecutionException e) {
          // Ignore already logged in the call.
        }
      } else {
        // We are starting up a 'hedged' read.  We have a read already
        // ongoing. Call getBestNodeDNAddrPair instead of chooseDataNode.
        // If no nodes to do hedged reads against, pass.
        try {
          chosenNode = getBestNodeDNAddrPair(block.getLocations(), ignored);
          bb = ByteBuffer.allocate(len);
          future = getHedgedReadFuture(chosenNode, block, start, end, bb,
            corruptedBlockMap, hasReceivedResult);
          futures.add(future);
        } catch (IOException ioe) {
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Failed getting node for hedged read: " +
              ioe.getMessage());
          }
        }
        // Whether or not a new hedged read was submitted, wait here for the
        // first of the outstanding reads to complete.
        try {
          ByteBuffer result = getFirstToComplete(futures, hasReceivedResult);
          // cancel the rest.
          cancelAll(futures);
          if (result.array() != buf) { // compare the array pointers
            dfsClient.getHedgedReadMetrics().incHedgedReadWins();
            System.arraycopy(result.array(), result.position(), buf, offset,
                len);
          } else {
            dfsClient.getHedgedReadMetrics().incHedgedReadOps();
          }
          return;
        } catch (InterruptedException ie) {
          // Ignore
        } catch (ExecutionException e) {
          // exception already handled in the call method. getFirstToComplete
          // will remove the failing future from the list. nothing more to do.
        }
        // We get here on an exception.  Ignore this node on the next go
        // around IFF we found a chosenNode to hedge the read against.
        if (chosenNode != null && chosenNode.info != null) {
          ignored.add(chosenNode.info);
        }
      }
      // executed if we get an error from a data node
      block = getBlockAt(block.getStartOffset(), false);
    }
  }
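
  // Configuration sketch (illustrative): hedged reads stay disabled until a
  // client configures a dedicated thread pool. Assuming the standard client
  // keys, enabling them might look like:
  //
  //   Configuration conf = new HdfsConfiguration();
  //   conf.setInt("dfs.client.hedged.read.threadpool.size", 10);    // > 0 enables hedging
  //   conf.setLong("dfs.client.hedged.read.threshold.millis", 500); // spawn 2nd read after 500 ms
  //   FileSystem fs = FileSystem.get(conf);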

  private Future<ByteBuffer> getHedgedReadFuture(final DNAddrPair chosenNode,
      final LocatedBlock block, long start,
      final long end, final ByteBuffer bb,
      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      final CountDownLatch hasReceivedResult) {
    Callable<ByteBuffer> getFromDataNodeCallable =
        getFromOneDataNode(chosenNode, block, start, end, bb,
          corruptedBlockMap, hasReceivedResult);
    return dfsClient.getHedgedReadsThreadPool().submit(getFromDataNodeCallable);
  }

  private ByteBuffer getFirstToComplete(ArrayList<Future<ByteBuffer>> futures,
      CountDownLatch latch) throws ExecutionException, InterruptedException {
    latch.await();
    for (Future<ByteBuffer> future : futures) {
      if (future.isDone()) {
        try {
          return future.get();
        } catch (ExecutionException e) {
          // already logged in the Callable
          futures.remove(future);
          throw e;
        }
      }
    }
    throw new InterruptedException("latch has counted down to zero but no "
        + "result is available yet; for safety, try to request another one "
        + "from the outside loop. This should be rare.");
  }

  private void cancelAll(List<Future<ByteBuffer>> futures) {
    for (Future<ByteBuffer> future : futures) {
      // Unfortunately, hdfs reads do not take kindly to interruption.
      // Threads return a variety of interrupted-type exceptions but
      // also complaints about invalid pbs -- likely because the read
      // is interrupted before it gets the whole protobuf message. There is
      // also verbose WARN logging. So, for now, do not interrupt a running read.
      future.cancel(false);
    }
  }

  /**
   * Should the block access token be refetched on an exception?
   *
   * @param ex Exception received
   * @param targetAddr Target datanode address from where exception was received
   * @return true if block access token has expired or invalid and it should be
   *         refetched
   */
  private static boolean tokenRefetchNeeded(IOException ex,
      InetSocketAddress targetAddr) {
    /*
     * Get a new access token and retry. Retry is needed in 2 cases. 1)
     * When both NN and DN re-started while DFSClient holding a cached
     * access token. 2) In the case that NN fails to update its
     * access key at pre-set interval (by a wide margin) and
     * subsequently restarts. In this case, DN re-registers itself with
     * NN and receives a new access key, but DN will delete the old
     * access key from its memory since it's considered expired based on
     * the estimated expiration date.
     */
    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
      DFSClient.LOG.info("Access token was invalid when connecting to "
          + targetAddr + " : " + ex);
      return true;
    }
    return false;
  }

  /**
   * Read bytes starting from the specified position.
   *
   * @param position start read from this position
   * @param buffer read buffer
   * @param offset offset into buffer
   * @param length number of bytes to read
   *
   * @return actual number of bytes read
   */
  @Override
  public int read(long position, byte[] buffer, int offset, int length)
    throws IOException {
    // sanity checks
    dfsClient.checkOpen();
    if (closed) {
      throw new IOException("Stream closed");
    }
    failures = 0;
    long filelen = getFileLength();
    if ((position < 0) || (position >= filelen)) {
      return -1;
    }
    int realLen = length;
    if ((position + length) > filelen) {
      realLen = (int)(filelen - position);
    }

    // determine the block and byte range within the block
    // corresponding to position and realLen
    List<LocatedBlock> blockRange = getBlockRange(position, realLen);
    int remaining = realLen;
    Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap
      = new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
    for (LocatedBlock blk : blockRange) {
      long targetStart = position - blk.getStartOffset();
      long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart);
      try {
        if (dfsClient.isHedgedReadsEnabled()) {
          hedgedFetchBlockByteRange(blk, targetStart, targetStart + bytesToRead
              - 1, buffer, offset, corruptedBlockMap);
        } else {
          fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1,
              buffer, offset, corruptedBlockMap);
        }
      } finally {
        // Check and report if any block replicas are corrupted.
        // BlockMissingException may be caught if all block replicas are
        // corrupted.
        reportCheckSumFailure(corruptedBlockMap, blk.getLocations().length);
      }

      remaining -= bytesToRead;
      position += bytesToRead;
      offset += bytesToRead;
    }
    assert remaining == 0 : "Wrong number of bytes read.";
    if (dfsClient.stats != null) {
      dfsClient.stats.incrementBytesRead(realLen);
    }
    return realLen;
  }
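
  // Usage sketch (illustrative): the positional read above does not move the
  // stream's seek pointer, so independent offsets can be read without
  // seek()/read() coordination:
  //
  //   byte[] page = new byte[4096];
  //   int n = in.read(1L << 20, page, 0, page.length); // read at offset 1 MB
  //   // in.getPos() is unchanged by the call above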

  /**
   * DFSInputStream reports checksum failure.
   * Case I : client has tried multiple data nodes and at least one of the
   * attempts has succeeded. We report the other failures as corrupted block to
   * namenode.
   * Case II: client has tried out all data nodes, but all failed. We
   * only report if the total number of replicas is 1. We do not
   * report otherwise, since the failure may be due to the client itself
   * being unable to read properly.
   * @param corruptedBlockMap map of corrupted blocks
   * @param dataNodeCount number of data nodes that contain the block replicas
   */
  private void reportCheckSumFailure(
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      int dataNodeCount) {
    if (corruptedBlockMap.isEmpty()) {
      return;
    }
    Iterator<Entry<ExtendedBlock, Set<DatanodeInfo>>> it = corruptedBlockMap
        .entrySet().iterator();
    Entry<ExtendedBlock, Set<DatanodeInfo>> entry = it.next();
    ExtendedBlock blk = entry.getKey();
    Set<DatanodeInfo> dnSet = entry.getValue();
    if (((dnSet.size() < dataNodeCount) && (dnSet.size() > 0))
        || ((dataNodeCount == 1) && (dnSet.size() == dataNodeCount))) {
      DatanodeInfo[] locs = new DatanodeInfo[dnSet.size()];
      int i = 0;
      for (DatanodeInfo dn:dnSet) {
        locs[i++] = dn;
      }
      LocatedBlock [] lblocks = { new LocatedBlock(blk, locs) };
      dfsClient.reportChecksumFailure(src, lblocks);
    }
    corruptedBlockMap.clear();
  }
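
  // Worked example of the reporting rule above (illustrative): with
  // dataNodeCount = 3 replicas,
  //   dnSet.size() == 2  -> report (some replica worked, two were corrupt)
  //   dnSet.size() == 3  -> do not report (every replica failed; the client
  //                         itself may be the problem)
  // and with dataNodeCount = 1, a single corrupt replica is always reported.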

  @Override
  public long skip(long n) throws IOException {
    if ( n > 0 ) {
      long curPos = getPos();
      long fileLen = getFileLength();
      if( n+curPos > fileLen ) {
        n = fileLen - curPos;
      }
      seek(curPos+n);
      return n;
    }
    return n < 0 ? -1 : 0;
  }

  /**
   * Seek to a new arbitrary location
   */
  @Override
  public synchronized void seek(long targetPos) throws IOException {
    if (targetPos > getFileLength()) {
      throw new IOException("Cannot seek after EOF");
    }
    if (targetPos < 0) {
      throw new IOException("Cannot seek to negative offset");
    }
    if (closed) {
      throw new IOException("Stream is closed!");
    }
    boolean done = false;
    if (pos <= targetPos && targetPos <= blockEnd) {
      //
      // If this seek is to a positive position in the current
      // block, and this piece of data might already be lying in
      // the TCP buffer, then just eat up the intervening data.
      //
      int diff = (int)(targetPos - pos);
      if (diff <= blockReader.available()) {
        try {
          pos += blockReader.skip(diff);
          if (pos == targetPos) {
            done = true;
          } else {
            // The range was already checked. If the block reader returns
            // something unexpected instead of throwing an exception, it is
            // most likely a bug.
            String errMsg = "BlockReader failed to seek to " +
                targetPos + ". Instead, it seeked to " + pos + ".";
            DFSClient.LOG.warn(errMsg);
            throw new IOException(errMsg);
          }
1408        } catch (IOException e) {//make following read to retry
1409          if(DFSClient.LOG.isDebugEnabled()) {
1410            DFSClient.LOG.debug("Exception while seek to " + targetPos
1411                + " from " + getCurrentBlock() + " of " + src + " from "
1412                + currentNode, e);
1413          }
1414        }
1415      }
1416    }
1417    if (!done) {
1418      pos = targetPos;
1419      blockEnd = -1;
1420    }
1421  }
1422
  /**
   * Same as {@link #seekToNewSource(long)} except that it does not exclude
   * the current datanode and might connect to the same node.
   */
  private synchronized boolean seekToBlockSource(long targetPos)
                                                 throws IOException {
    currentNode = blockSeekTo(targetPos);
    return true;
  }

  /**
   * Seek to the given position on a node other than the current node.
   * @return true if a node other than the current node was found;
   *         false otherwise.
   */
  @Override
  public synchronized boolean seekToNewSource(long targetPos) throws IOException {
    boolean markedDead = deadNodes.containsKey(currentNode);
    addToDeadNodes(currentNode);
    DatanodeInfo oldNode = currentNode;
    DatanodeInfo newNode = blockSeekTo(targetPos);
    if (!markedDead) {
      /* Remove it from deadNodes. blockSeekTo could have cleared
       * deadNodes and added currentNode again. That's OK. */
      deadNodes.remove(oldNode);
    }
    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
      currentNode = newNode;
      return true;
    } else {
      return false;
    }
  }

  /**
   * Return the current position in the file.
   */
  @Override
  public synchronized long getPos() throws IOException {
    return pos;
  }

  /** Return the number of remaining available bytes if it is less
   * than or equal to {@link Integer#MAX_VALUE}; otherwise, return
   * {@link Integer#MAX_VALUE}.
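   * For example, with a 3 GB file and the position at 0, the true remaining
   * count overflows an int, so {@link Integer#MAX_VALUE} is returned instead.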
   */
  @Override
  public synchronized int available() throws IOException {
    if (closed) {
      throw new IOException("Stream closed");
    }

    final long remaining = getFileLength() - pos;
    return remaining <= Integer.MAX_VALUE ? (int)remaining : Integer.MAX_VALUE;
  }

  /**
   * Mark/reset are not supported: {@link #mark(int)} is a no-op and
   * {@link #reset()} always throws.
   */
  @Override
  public boolean markSupported() {
    return false;
  }

  @Override
  public void mark(int readLimit) {
  }

  @Override
  public void reset() throws IOException {
    throw new IOException("Mark/reset not supported");
  }

  /**
   * Pick the best node from which to stream the data.
   * Entries in <i>nodes</i> are already in priority order; the first node
   * that is neither dead nor ignored is returned.
   */
  static DatanodeInfo bestNode(DatanodeInfo[] nodes,
      AbstractMap<DatanodeInfo, DatanodeInfo> deadNodes,
      Collection<DatanodeInfo> ignoredNodes) throws IOException {
    if (nodes != null) {
      for (int i = 0; i < nodes.length; i++) {
        if (!deadNodes.containsKey(nodes[i])
            && (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
          return nodes[i];
        }
      }
    }
    throw new IOException("No live nodes contain current block");
  }

  /** Utility class to encapsulate data node info and its address. */
  static class DNAddrPair {
    final DatanodeInfo info;
    final InetSocketAddress addr;

    DNAddrPair(DatanodeInfo info, InetSocketAddress addr) {
      this.info = info;
      this.addr = addr;
    }
  }

  /**
   * Get statistics about the reads which this DFSInputStream has done.
   */
  public synchronized ReadStatistics getReadStatistics() {
    return new ReadStatistics(readStatistics);
  }

  private synchronized void closeCurrentBlockReader() {
    if (blockReader == null) return;
    // Close the current block reader so that the new caching settings can
    // take effect immediately.
    try {
      blockReader.close();
    } catch (IOException e) {
      DFSClient.LOG.error("error closing blockReader", e);
    }
    blockReader = null;
  }

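  /**
   * Set the readahead hint for this stream; a {@code null} value requests
   * the default readahead policy.  A usage sketch with illustrative values
   * ({@code in} is a hypothetical open stream):
   * <pre>{@code
   *   in.setReadahead(4L * 1024 * 1024); // hint: read ahead ~4 MB
   *   in.setReadahead(null);             // revert to the default policy
   * }</pre>
   * The current block reader is closed so the new strategy takes effect on
   * the next read.
   */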
  @Override
  public synchronized void setReadahead(Long readahead)
      throws IOException {
    this.cachingStrategy =
        new CachingStrategy.Builder(this.cachingStrategy).
            setReadahead(readahead).build();
    closeCurrentBlockReader();
  }

  @Override
  public synchronized void setDropBehind(Boolean dropBehind)
      throws IOException {
    this.cachingStrategy =
        new CachingStrategy.Builder(this.cachingStrategy).
            setDropBehind(dropBehind).build();
    closeCurrentBlockReader();
  }

  /**
   * The immutable empty buffer we return when we reach EOF when doing a
   * zero-copy read.
   */
  private static final ByteBuffer EMPTY_BUFFER =
    ByteBuffer.allocateDirect(0).asReadOnlyBuffer();

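  /**
   * Read up to {@code maxLength} bytes, using a zero-copy mmap when possible
   * and falling back to a normal buffered read otherwise.  Returns
   * {@code null} at EOF.  Every non-empty buffer handed out here must later
   * be returned via {@link #releaseBuffer(ByteBuffer)}.  A usage sketch
   * ({@code in} and {@code pool} are hypothetical):
   * <pre>{@code
   *   ByteBuffer buf = in.read(pool, 4096,
   *       EnumSet.of(ReadOption.SKIP_CHECKSUMS));
   *   if (buf != null) {
   *     try {
   *       // ... consume buf ...
   *     } finally {
   *       in.releaseBuffer(buf);
   *     }
   *   }
   * }</pre>
   */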
  @Override
  public synchronized ByteBuffer read(ByteBufferPool bufferPool,
      int maxLength, EnumSet<ReadOption> opts)
          throws IOException, UnsupportedOperationException {
    if (maxLength == 0) {
      return EMPTY_BUFFER;
    } else if (maxLength < 0) {
      throw new IllegalArgumentException("can't read a negative " +
          "number of bytes.");
    }
    if ((blockReader == null) || (blockEnd == -1)) {
      if (pos >= getFileLength()) {
        return null;
      }
      /*
       * If we don't have a blockReader, or the one we have has no more bytes
       * left to read, we call seekToBlockSource to get a new blockReader and
       * recalculate blockEnd.  Note that we assume we're not at EOF here
       * (we check this above).
       */
      if ((!seekToBlockSource(pos)) || (blockReader == null)) {
        throw new IOException("failed to allocate new BlockReader " +
            "at position " + pos);
      }
    }
    ByteBuffer buffer = null;
    if (dfsClient.getConf().shortCircuitMmapEnabled) {
      buffer = tryReadZeroCopy(maxLength, opts);
    }
    if (buffer != null) {
      return buffer;
    }
    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
    if (buffer != null) {
      extendedReadBuffers.put(buffer, bufferPool);
    }
    return buffer;
  }

  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
      EnumSet<ReadOption> opts) throws IOException {
    // Copy 'pos' and 'blockEnd' to local variables to make it easier for the
    // JVM to optimize this function.
    final long curPos = pos;
    final long curEnd = blockEnd;
    final long blockStartInFile = currentLocatedBlock.getStartOffset();
    final long blockPos = curPos - blockStartInFile;

    // Shorten this read if the end of the block is nearby.
    long length63;
    if ((curPos + maxLength) <= (curEnd + 1)) {
      length63 = maxLength;
    } else {
      length63 = 1 + curEnd - curPos;
      if (length63 <= 0) {
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
            curPos + " of " + src + "; " + length63 + " bytes left in block.  " +
            "blockPos=" + blockPos + "; curPos=" + curPos +
            "; curEnd=" + curEnd);
        }
        return null;
      }
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("Reducing read length from " + maxLength +
            " to " + length63 + " to avoid going more than one byte " +
            "past the end of the block.  blockPos=" + blockPos +
            "; curPos=" + curPos + "; curEnd=" + curEnd);
      }
    }
    // Make sure we don't go beyond 31-bit offsets in the MappedByteBuffer.
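    // For example (illustrative numbers): with blockPos = 2^31 - 100 and
    // length63 = 300, length31 = Integer.MAX_VALUE - blockPos = 99, so the
    // read is trimmed to 99 bytes and blockPos + length stays within 2^31 - 1.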
    int length;
    if (blockPos + length63 <= Integer.MAX_VALUE) {
      length = (int)length63;
    } else {
      long length31 = Integer.MAX_VALUE - blockPos;
      if (length31 <= 0) {
        // Java ByteBuffers can't be longer than 2 GB, because they use
        // 4-byte signed integers to represent capacity, etc.
        // So we can't mmap the parts of the block higher than the 2 GB offset.
        // FIXME: we could work around this with multiple memory maps.
        // See HDFS-5101.
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Unable to perform a zero-copy read from offset " +
            curPos + " of " + src + "; 31-bit MappedByteBuffer limit " +
            "exceeded.  blockPos=" + blockPos + ", curEnd=" + curEnd);
        }
        return null;
      }
      length = (int)length31;
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("Reducing read length from " + maxLength +
            " to " + length + " to avoid 31-bit limit.  " +
            "blockPos=" + blockPos + "; curPos=" + curPos +
            "; curEnd=" + curEnd);
      }
    }
    final ClientMmap clientMmap = blockReader.getClientMmap(opts);
    if (clientMmap == null) {
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
          curPos + " of " + src + "; BlockReader#getClientMmap returned " +
          "null.");
      }
      return null;
    }
    boolean success = false;
    ByteBuffer buffer;
    try {
      seek(curPos + length);
      buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
      buffer.position((int)blockPos);
      buffer.limit((int)(blockPos + length));
      extendedReadBuffers.put(buffer, clientMmap);
      readStatistics.addZeroCopyBytes(length);
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("readZeroCopy read " + length +
            " bytes from offset " + curPos + " via the zero-copy read " +
            "path.  blockEnd = " + blockEnd);
      }
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeQuietly(clientMmap);
      }
    }
    return buffer;
  }

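  /**
   * Release a buffer previously handed out by
   * {@link #read(ByteBufferPool, int, EnumSet)}: an mmap-backed buffer has
   * its mapping reference released, while a pooled buffer is returned to
   * its pool.
   */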
  @Override
  public synchronized void releaseBuffer(ByteBuffer buffer) {
    if (buffer == EMPTY_BUFFER) return;
    Object val = extendedReadBuffers.remove(buffer);
    if (val == null) {
      throw new IllegalArgumentException("tried to release a buffer " +
          "that was not created by this stream, " + buffer);
    }
    if (val instanceof ClientMmap) {
      IOUtils.closeQuietly((ClientMmap)val);
    } else if (val instanceof ByteBufferPool) {
      ((ByteBufferPool)val).putBuffer(buffer);
    }
  }
}