001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import java.io.BufferedOutputStream;
021import java.io.DataInputStream;
022import java.io.DataOutputStream;
023import java.io.FileInputStream;
024import java.io.IOException;
025import java.net.InetSocketAddress;
026
027import org.apache.commons.lang.mutable.MutableBoolean;
028import org.apache.commons.logging.Log;
029import org.apache.commons.logging.LogFactory;
030import org.apache.hadoop.classification.InterfaceAudience;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.hdfs.net.DomainPeer;
033import org.apache.hadoop.hdfs.net.Peer;
034import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
035import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
036import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
037import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
038import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
039import org.apache.hadoop.hdfs.protocolPB.PBHelper;
040import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
041import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
042import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
043import org.apache.hadoop.hdfs.shortcircuit.DomainSocketFactory;
044import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache;
045import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.ShortCircuitReplicaCreator;
046import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplica;
047import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplicaInfo;
048import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
049import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
050import org.apache.hadoop.io.IOUtils;
051import org.apache.hadoop.ipc.RemoteException;
052import org.apache.hadoop.net.unix.DomainSocket;
053import org.apache.hadoop.security.AccessControlException;
054import org.apache.hadoop.security.UserGroupInformation;
055import org.apache.hadoop.security.token.SecretManager.InvalidToken;
056import org.apache.hadoop.security.token.Token;
057import org.apache.hadoop.util.Time;
058
059import com.google.common.annotations.VisibleForTesting;
060import com.google.common.base.Preconditions;
061
062
063/** 
064 * Utility class to create BlockReader implementations.
065 */
066@InterfaceAudience.Private
067public class BlockReaderFactory implements ShortCircuitReplicaCreator {
068  static final Log LOG = LogFactory.getLog(BlockReaderFactory.class);
069
070  @VisibleForTesting
071  static ShortCircuitReplicaCreator
072      createShortCircuitReplicaInfoCallback = null;
073
074  private final DFSClient.Conf conf;
075
076  /**
077   * The file name, for logging and debugging purposes.
078   */
079  private String fileName;
080
081  /**
082   * The block ID and block pool ID to use.
083   */
084  private ExtendedBlock block;
085
086  /**
087   * The block token to use for security purposes.
088   */
089  private Token<BlockTokenIdentifier> token;
090
091  /**
092   * The offset within the block to start reading at.
093   */
094  private long startOffset;
095
096  /**
097   * If false, we won't try to verify the block checksum.
098   */
099  private boolean verifyChecksum;
100
101  /**
102   * The name of this client.
103   */
104  private String clientName; 
105
106  /**
107   * The DataNode we're talking to.
108   */
109  private DatanodeInfo datanode;
110
111  /**
112   * If false, we won't try short-circuit local reads.
113   */
114  private boolean allowShortCircuitLocalReads;
115
116  /**
117   * The ClientContext to use for things like the PeerCache.
118   */
119  private ClientContext clientContext;
120
121  /**
122   * Number of bytes to read.  -1 indicates no limit.
123   */
124  private long length = -1;
125
126  /**
127   * Caching strategy to use when reading the block.
128   */
129  private CachingStrategy cachingStrategy;
130
131  /**
132   * Socket address to use to connect to peer.
133   */
134  private InetSocketAddress inetSocketAddress;
135
136  /**
137   * Remote peer factory to use to create a peer, if needed.
138   */
139  private RemotePeerFactory remotePeerFactory;
140
141  /**
142   * UserGroupInformation  to use for legacy block reader local objects, if needed.
143   */
144  private UserGroupInformation userGroupInformation;
145
146  /**
147   * Configuration to use for legacy block reader local objects, if needed.
148   */
149  private Configuration configuration;
150
151  /**
152   * Information about the domain socket path we should use to connect to the
153   * local peer-- or null if we haven't examined the local domain socket.
154   */
155  private DomainSocketFactory.PathInfo pathInfo;
156
157  /**
158   * The remaining number of times that we'll try to pull a socket out of the
159   * cache.
160   */
161  private int remainingCacheTries;
162
163  public BlockReaderFactory(DFSClient.Conf conf) {
164    this.conf = conf;
165    this.remainingCacheTries = conf.nCachedConnRetry;
166  }
167
168  public BlockReaderFactory setFileName(String fileName) {
169    this.fileName = fileName;
170    return this;
171  }
172
173  public BlockReaderFactory setBlock(ExtendedBlock block) {
174    this.block = block;
175    return this;
176  }
177
178  public BlockReaderFactory setBlockToken(Token<BlockTokenIdentifier> token) {
179    this.token = token;
180    return this;
181  }
182
183  public BlockReaderFactory setStartOffset(long startOffset) {
184    this.startOffset = startOffset;
185    return this;
186  }
187
188  public BlockReaderFactory setVerifyChecksum(boolean verifyChecksum) {
189    this.verifyChecksum = verifyChecksum;
190    return this;
191  }
192
193  public BlockReaderFactory setClientName(String clientName) {
194    this.clientName = clientName;
195    return this;
196  }
197
198  public BlockReaderFactory setDatanodeInfo(DatanodeInfo datanode) {
199    this.datanode = datanode;
200    return this;
201  }
202
203  public BlockReaderFactory setAllowShortCircuitLocalReads(
204      boolean allowShortCircuitLocalReads) {
205    this.allowShortCircuitLocalReads = allowShortCircuitLocalReads;
206    return this;
207  }
208
209  public BlockReaderFactory setClientCacheContext(
210      ClientContext clientContext) {
211    this.clientContext = clientContext;
212    return this;
213  }
214
215  public BlockReaderFactory setLength(long length) {
216    this.length = length;
217    return this;
218  }
219
220  public BlockReaderFactory setCachingStrategy(
221      CachingStrategy cachingStrategy) {
222    this.cachingStrategy = cachingStrategy;
223    return this;
224  }
225
226  public BlockReaderFactory setInetSocketAddress (
227      InetSocketAddress inetSocketAddress) {
228    this.inetSocketAddress = inetSocketAddress;
229    return this;
230  }
231
232  public BlockReaderFactory setUserGroupInformation(
233      UserGroupInformation userGroupInformation) {
234    this.userGroupInformation = userGroupInformation;
235    return this;
236  }
237
238  public BlockReaderFactory setRemotePeerFactory(
239      RemotePeerFactory remotePeerFactory) {
240    this.remotePeerFactory = remotePeerFactory;
241    return this;
242  }
243
244  public BlockReaderFactory setConfiguration(
245      Configuration configuration) {
246    this.configuration = configuration;
247    return this;
248  }
249
250  /**
251   * Build a BlockReader with the given options.
252   *
253   * This function will do the best it can to create a block reader that meets
254   * all of our requirements.  We prefer short-circuit block readers
255   * (BlockReaderLocal and BlockReaderLocalLegacy) over remote ones, since the
256   * former avoid the overhead of socket communication.  If short-circuit is
257   * unavailable, our next fallback is data transfer over UNIX domain sockets,
258   * if dfs.client.domain.socket.data.traffic has been enabled.  If that doesn't
259   * work, we will try to create a remote block reader that operates over TCP
260   * sockets.
261   *
262   * There are a few caches that are important here.
263   *
264   * The ShortCircuitCache stores file descriptor objects which have been passed
265   * from the DataNode. 
266   *
267   * The DomainSocketFactory stores information about UNIX domain socket paths
268   * that we not been able to use in the past, so that we don't waste time
269   * retrying them over and over.  (Like all the caches, it does have a timeout,
270   * though.)
271   *
272   * The PeerCache stores peers that we have used in the past.  If we can reuse
273   * one of these peers, we avoid the overhead of re-opening a socket.  However,
274   * if the socket has been timed out on the remote end, our attempt to reuse
275   * the socket may end with an IOException.  For that reason, we limit our
276   * attempts at socket reuse to dfs.client.cached.conn.retry times.  After
277   * that, we create new sockets.  This avoids the problem where a thread tries
278   * to talk to a peer that it hasn't talked to in a while, and has to clean out
279   * every entry in a socket cache full of stale entries.
280   *
281   * @return The new BlockReader.  We will not return null.
282   *
283   * @throws InvalidToken
284   *             If the block token was invalid.
285   *         InvalidEncryptionKeyException
286   *             If the encryption key was invalid.
287   *         Other IOException
288   *             If there was another problem.
289   */
290  public BlockReader build() throws IOException {
291    BlockReader reader = null;
292
293    Preconditions.checkNotNull(configuration);
294    if (conf.shortCircuitLocalReads && allowShortCircuitLocalReads) {
295      if (clientContext.getUseLegacyBlockReaderLocal()) {
296        reader = getLegacyBlockReaderLocal();
297        if (reader != null) {
298          if (LOG.isTraceEnabled()) {
299            LOG.trace(this + ": returning new legacy block reader local.");
300          }
301          return reader;
302        }
303      } else {
304        reader = getBlockReaderLocal();
305        if (reader != null) {
306          if (LOG.isTraceEnabled()) {
307            LOG.trace(this + ": returning new block reader local.");
308          }
309          return reader;
310        }
311      }
312    }
313    if (conf.domainSocketDataTraffic) {
314      reader = getRemoteBlockReaderFromDomain();
315      if (reader != null) {
316        if (LOG.isTraceEnabled()) {
317          LOG.trace(this + ": returning new remote block reader using " +
318              "UNIX domain socket on " + pathInfo.getPath());
319        }
320        return reader;
321      }
322    }
323    Preconditions.checkState(!DFSInputStream.tcpReadsDisabledForTesting,
324        "TCP reads were disabled for testing, but we failed to " +
325        "do a non-TCP read.");
326    return getRemoteBlockReaderFromTcp();
327  }
328
329  /**
330   * Get {@link BlockReaderLocalLegacy} for short circuited local reads.
331   * This block reader implements the path-based style of local reads
332   * first introduced in HDFS-2246.
333   */
334  private BlockReader getLegacyBlockReaderLocal() throws IOException {
335    if (LOG.isTraceEnabled()) {
336      LOG.trace(this + ": trying to construct BlockReaderLocalLegacy");
337    }
338    if (!DFSClient.isLocalAddress(inetSocketAddress)) {
339      if (LOG.isTraceEnabled()) {
340        LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
341            "the address " + inetSocketAddress + " is not local");
342      }
343      return null;
344    }
345    if (clientContext.getDisableLegacyBlockReaderLocal()) {
346      if (LOG.isTraceEnabled()) {
347        LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
348            "disableLegacyBlockReaderLocal is set.");
349      }
350      return null;
351    }
352    IOException ioe = null;
353    try {
354      return BlockReaderLocalLegacy.newBlockReader(conf,
355          userGroupInformation, configuration, fileName, block, token,
356          datanode, startOffset, length);
357    } catch (RemoteException remoteException) {
358      ioe = remoteException.unwrapRemoteException(
359                InvalidToken.class, AccessControlException.class);
360    } catch (IOException e) {
361      ioe = e;
362    }
363    if ((!(ioe instanceof AccessControlException)) &&
364        isSecurityException(ioe)) {
365      // Handle security exceptions.
366      // We do not handle AccessControlException here, since
367      // BlockReaderLocalLegacy#newBlockReader uses that exception to indicate
368      // that the user is not in dfs.block.local-path-access.user, a condition
369      // which requires us to disable legacy SCR.
370      throw ioe;
371    }
372    LOG.warn(this + ": error creating legacy BlockReaderLocal.  " +
373        "Disabling legacy local reads.", ioe);
374    clientContext.setDisableLegacyBlockReaderLocal();
375    return null;
376  }
377
378  private BlockReader getBlockReaderLocal() throws InvalidToken {
379    if (LOG.isTraceEnabled()) {
380      LOG.trace(this + ": trying to construct a BlockReaderLocal " +
381          "for short-circuit reads.");
382    }
383    if (pathInfo == null) {
384      pathInfo = clientContext.getDomainSocketFactory().
385                      getPathInfo(inetSocketAddress, conf);
386    }
387    if (!pathInfo.getPathState().getUsableForShortCircuit()) {
388      if (LOG.isTraceEnabled()) {
389        LOG.trace(this + ": " + pathInfo + " is not " +
390            "usable for short circuit; giving up on BlockReaderLocal.");
391      }
392      return null;
393    }
394    ShortCircuitCache cache = clientContext.getShortCircuitCache();
395    ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
396    ShortCircuitReplicaInfo info = cache.fetchOrCreate(key, this);
397    InvalidToken exc = info.getInvalidTokenException();
398    if (exc != null) {
399      if (LOG.isTraceEnabled()) {
400        LOG.trace(this + ": got InvalidToken exception while trying to " +
401            "construct BlockReaderLocal via " + pathInfo.getPath());
402      }
403      throw exc;
404    }
405    if (info.getReplica() == null) {
406      if (LOG.isTraceEnabled()) {
407        LOG.trace(this + ": failed to get ShortCircuitReplica.  " +
408            "Cannot construct BlockReaderLocal via " + pathInfo.getPath());
409      }
410      return null;
411    }
412    return new BlockReaderLocal.Builder(conf).
413        setFilename(fileName).
414        setBlock(block).
415        setStartOffset(startOffset).
416        setShortCircuitReplica(info.getReplica()).
417        setVerifyChecksum(verifyChecksum).
418        setCachingStrategy(cachingStrategy).
419        build();
420  }
421
422  /**
423   * Fetch a pair of short-circuit block descriptors from a local DataNode.
424   *
425   * @return    Null if we could not communicate with the datanode,
426   *            a new ShortCircuitReplicaInfo object otherwise.
427   *            ShortCircuitReplicaInfo objects may contain either an InvalidToken
428   *            exception, or a ShortCircuitReplica object ready to use.
429   */
430  @Override
431  public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
432    if (createShortCircuitReplicaInfoCallback != null) {
433      ShortCircuitReplicaInfo info =
434        createShortCircuitReplicaInfoCallback.createShortCircuitReplicaInfo();
435      if (info != null) return info;
436    }
437    if (LOG.isTraceEnabled()) {
438      LOG.trace(this + ": trying to create ShortCircuitReplicaInfo.");
439    }
440    BlockReaderPeer curPeer;
441    while (true) {
442      curPeer = nextDomainPeer();
443      if (curPeer == null) break;
444      if (curPeer.fromCache) remainingCacheTries--;
445      DomainPeer peer = (DomainPeer)curPeer.peer;
446      Slot slot = null;
447      ShortCircuitCache cache = clientContext.getShortCircuitCache();
448      try {
449        MutableBoolean usedPeer = new MutableBoolean(false);
450        slot = cache.allocShmSlot(datanode, peer, usedPeer,
451            new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId()),
452            clientName);
453        if (usedPeer.booleanValue()) {
454          if (LOG.isTraceEnabled()) {
455            LOG.trace(this + ": allocShmSlot used up our previous socket " +
456              peer.getDomainSocket() + ".  Allocating a new one...");
457          }
458          curPeer = nextDomainPeer();
459          if (curPeer == null) break;
460          peer = (DomainPeer)curPeer.peer;
461        }
462        ShortCircuitReplicaInfo info = requestFileDescriptors(peer, slot);
463        clientContext.getPeerCache().put(datanode, peer);
464        return info;
465      } catch (IOException e) {
466        if (slot != null) {
467          cache.freeSlot(slot);
468        }
469        if (curPeer.fromCache) {
470          // Handle an I/O error we got when using a cached socket.
471          // These are considered less serious, because the socket may be stale.
472          if (LOG.isDebugEnabled()) {
473            LOG.debug(this + ": closing stale domain peer " + peer, e);
474          }
475          IOUtils.cleanup(LOG, peer);
476        } else {
477          // Handle an I/O error we got when using a newly created socket.
478          // We temporarily disable the domain socket path for a few minutes in
479          // this case, to prevent wasting more time on it.
480          LOG.warn(this + ": I/O error requesting file descriptors.  " + 
481              "Disabling domain socket " + peer.getDomainSocket(), e);
482          IOUtils.cleanup(LOG, peer);
483          clientContext.getDomainSocketFactory()
484              .disableDomainSocketPath(pathInfo.getPath());
485          return null;
486        }
487      }
488    }
489    return null;
490  }
491
492  /**
493   * Request file descriptors from a DomainPeer.
494   *
495   * @param peer   The peer to use for communication.
496   * @param slot   If non-null, the shared memory slot to associate with the 
497   *               new ShortCircuitReplica.
498   * 
499   * @return  A ShortCircuitReplica object if we could communicate with the
500   *          datanode; null, otherwise. 
501   * @throws  IOException If we encountered an I/O exception while communicating
502   *          with the datanode.
503   */
504  private ShortCircuitReplicaInfo requestFileDescriptors(DomainPeer peer,
505          Slot slot) throws IOException {
506    ShortCircuitCache cache = clientContext.getShortCircuitCache();
507    final DataOutputStream out =
508        new DataOutputStream(new BufferedOutputStream(peer.getOutputStream()));
509    SlotId slotId = slot == null ? null : slot.getSlotId();
510    new Sender(out).requestShortCircuitFds(block, token, slotId, 1);
511    DataInputStream in = new DataInputStream(peer.getInputStream());
512    BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(
513        PBHelper.vintPrefixed(in));
514    DomainSocket sock = peer.getDomainSocket();
515    switch (resp.getStatus()) {
516    case SUCCESS:
517      byte buf[] = new byte[1];
518      FileInputStream fis[] = new FileInputStream[2];
519      sock.recvFileInputStreams(fis, buf, 0, buf.length);
520      ShortCircuitReplica replica = null;
521      try {
522        ExtendedBlockId key =
523            new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
524        replica = new ShortCircuitReplica(key, fis[0], fis[1], cache,
525            Time.monotonicNow(), slot);
526      } catch (IOException e) {
527        // This indicates an error reading from disk, or a format error.  Since
528        // it's not a socket communication problem, we return null rather than
529        // throwing an exception.
530        LOG.warn(this + ": error creating ShortCircuitReplica.", e);
531        return null;
532      } finally {
533        if (replica == null) {
534          IOUtils.cleanup(DFSClient.LOG, fis[0], fis[1]);
535        }
536      }
537      return new ShortCircuitReplicaInfo(replica);
538    case ERROR_UNSUPPORTED:
539      if (!resp.hasShortCircuitAccessVersion()) {
540        LOG.warn("short-circuit read access is disabled for " +
541            "DataNode " + datanode + ".  reason: " + resp.getMessage());
542        clientContext.getDomainSocketFactory()
543            .disableShortCircuitForPath(pathInfo.getPath());
544      } else {
545        LOG.warn("short-circuit read access for the file " +
546            fileName + " is disabled for DataNode " + datanode +
547            ".  reason: " + resp.getMessage());
548      }
549      return null;
550    case ERROR_ACCESS_TOKEN:
551      String msg = "access control error while " +
552          "attempting to set up short-circuit access to " +
553          fileName + resp.getMessage();
554      if (LOG.isDebugEnabled()) {
555        LOG.debug(this + ":" + msg);
556      }
557      return new ShortCircuitReplicaInfo(new InvalidToken(msg));
558    default:
559      LOG.warn(this + ": unknown response code " + resp.getStatus() +
560          " while attempting to set up short-circuit access. " +
561          resp.getMessage());
562      clientContext.getDomainSocketFactory()
563          .disableShortCircuitForPath(pathInfo.getPath());
564      return null;
565    }
566  }
567
568  /**
569   * Get a RemoteBlockReader that communicates over a UNIX domain socket.
570   *
571   * @return The new BlockReader, or null if we failed to create the block
572   * reader.
573   *
574   * @throws InvalidToken    If the block token was invalid.
575   * Potentially other security-related execptions.
576   */
577  private BlockReader getRemoteBlockReaderFromDomain() throws IOException {
578    if (pathInfo == null) {
579      pathInfo = clientContext.getDomainSocketFactory().
580                      getPathInfo(inetSocketAddress, conf);
581    }
582    if (!pathInfo.getPathState().getUsableForDataTransfer()) {
583      if (LOG.isTraceEnabled()) {
584        LOG.trace(this + ": not trying to create a remote block reader " +
585            "because the UNIX domain socket at " + pathInfo +
586            " is not usable.");
587      }
588      return null;
589    }
590    if (LOG.isTraceEnabled()) {
591      LOG.trace(this + ": trying to create a remote block reader from the " +
592          "UNIX domain socket at " + pathInfo.getPath());
593    }
594
595    while (true) {
596      BlockReaderPeer curPeer = nextDomainPeer();
597      if (curPeer == null) break;
598      if (curPeer.fromCache) remainingCacheTries--;
599      DomainPeer peer = (DomainPeer)curPeer.peer;
600      BlockReader blockReader = null;
601      try {
602        blockReader = getRemoteBlockReader(peer);
603        return blockReader;
604      } catch (IOException ioe) {
605        IOUtils.cleanup(LOG, peer);
606        if (isSecurityException(ioe)) {
607          if (LOG.isTraceEnabled()) {
608            LOG.trace(this + ": got security exception while constructing " +
609                "a remote block reader from the unix domain socket at " +
610                pathInfo.getPath(), ioe);
611          }
612          throw ioe;
613        }
614        if (curPeer.fromCache) {
615          // Handle an I/O error we got when using a cached peer.  These are
616          // considered less serious, because the underlying socket may be stale.
617          if (LOG.isDebugEnabled()) {
618            LOG.debug("Closed potentially stale domain peer " + peer, ioe);
619          }
620        } else {
621          // Handle an I/O error we got when using a newly created domain peer.
622          // We temporarily disable the domain socket path for a few minutes in
623          // this case, to prevent wasting more time on it.
624          LOG.warn("I/O error constructing remote block reader.  Disabling " +
625              "domain socket " + peer.getDomainSocket(), ioe);
626          clientContext.getDomainSocketFactory()
627              .disableDomainSocketPath(pathInfo.getPath());
628          return null;
629        }
630      } finally {
631        if (blockReader == null) {
632          IOUtils.cleanup(LOG, peer);
633        }
634      }
635    }
636    return null;
637  }
638
639  /**
640   * Get a RemoteBlockReader that communicates over a TCP socket.
641   *
642   * @return The new BlockReader.  We will not return null, but instead throw
643   *         an exception if this fails.
644   *
645   * @throws InvalidToken
646   *             If the block token was invalid.
647   *         InvalidEncryptionKeyException
648   *             If the encryption key was invalid.
649   *         Other IOException
650   *             If there was another problem.
651   */
652  private BlockReader getRemoteBlockReaderFromTcp() throws IOException {
653    if (LOG.isTraceEnabled()) {
654      LOG.trace(this + ": trying to create a remote block reader from a " +
655          "TCP socket");
656    }
657    BlockReader blockReader = null;
658    while (true) {
659      BlockReaderPeer curPeer = null;
660      Peer peer = null;
661      try {
662        curPeer = nextTcpPeer();
663        if (curPeer == null) break;
664        if (curPeer.fromCache) remainingCacheTries--;
665        peer = curPeer.peer;
666        blockReader = getRemoteBlockReader(peer);
667        return blockReader;
668      } catch (IOException ioe) {
669        if (isSecurityException(ioe)) {
670          if (LOG.isTraceEnabled()) {
671            LOG.trace(this + ": got security exception while constructing " +
672                "a remote block reader from " + peer, ioe);
673          }
674          throw ioe;
675        }
676        if ((curPeer != null) && curPeer.fromCache) {
677          // Handle an I/O error we got when using a cached peer.  These are
678          // considered less serious, because the underlying socket may be
679          // stale.
680          if (LOG.isDebugEnabled()) {
681            LOG.debug("Closed potentially stale remote peer " + peer, ioe);
682          }
683        } else {
684          // Handle an I/O error we got when using a newly created peer.
685          LOG.warn("I/O error constructing remote block reader.", ioe);
686          throw ioe;
687        }
688      } finally {
689        if (blockReader == null) {
690          IOUtils.cleanup(LOG, peer);
691        }
692      }
693    }
694    return null;
695  }
696
697  public static class BlockReaderPeer {
698    final Peer peer;
699    final boolean fromCache;
700    
701    BlockReaderPeer(Peer peer, boolean fromCache) {
702      this.peer = peer;
703      this.fromCache = fromCache;
704    }
705  }
706
707  /**
708   * Get the next DomainPeer-- either from the cache or by creating it.
709   *
710   * @return the next DomainPeer, or null if we could not construct one.
711   */
712  private BlockReaderPeer nextDomainPeer() {
713    if (remainingCacheTries > 0) {
714      Peer peer = clientContext.getPeerCache().get(datanode, true);
715      if (peer != null) {
716        if (LOG.isTraceEnabled()) {
717          LOG.trace("nextDomainPeer: reusing existing peer " + peer);
718        }
719        return new BlockReaderPeer(peer, true);
720      }
721    }
722    DomainSocket sock = clientContext.getDomainSocketFactory().
723        createSocket(pathInfo, conf.socketTimeout);
724    if (sock == null) return null;
725    return new BlockReaderPeer(new DomainPeer(sock), false);
726  }
727
728  /**
729   * Get the next TCP-based peer-- either from the cache or by creating it.
730   *
731   * @return the next Peer, or null if we could not construct one.
732   *
733   * @throws IOException  If there was an error while constructing the peer
734   *                      (such as an InvalidEncryptionKeyException)
735   */
736  private BlockReaderPeer nextTcpPeer() throws IOException {
737    if (remainingCacheTries > 0) {
738      Peer peer = clientContext.getPeerCache().get(datanode, false);
739      if (peer != null) {
740        if (LOG.isTraceEnabled()) {
741          LOG.trace("nextTcpPeer: reusing existing peer " + peer);
742        }
743        return new BlockReaderPeer(peer, true);
744      }
745    }
746    try {
747      Peer peer = remotePeerFactory.newConnectedPeer(inetSocketAddress);
748      if (LOG.isTraceEnabled()) {
749        LOG.trace("nextTcpPeer: created newConnectedPeer " + peer);
750      }
751      return new BlockReaderPeer(peer, false);
752    } catch (IOException e) {
753      if (LOG.isTraceEnabled()) {
754        LOG.trace("nextTcpPeer: failed to create newConnectedPeer " +
755                  "connected to " + datanode);
756      }
757      throw e;
758    }
759  }
760
761  /**
762   * Determine if an exception is security-related.
763   *
764   * We need to handle these exceptions differently than other IOExceptions.
765   * They don't indicate a communication problem.  Instead, they mean that there
766   * is some action the client needs to take, such as refetching block tokens,
767   * renewing encryption keys, etc.
768   *
769   * @param ioe    The exception
770   * @return       True only if the exception is security-related.
771   */
772  private static boolean isSecurityException(IOException ioe) {
773    return (ioe instanceof InvalidToken) ||
774            (ioe instanceof InvalidEncryptionKeyException) ||
775            (ioe instanceof InvalidBlockTokenException) ||
776            (ioe instanceof AccessControlException);
777  }
778
779  @SuppressWarnings("deprecation")
780  private BlockReader getRemoteBlockReader(Peer peer) throws IOException {
781    if (conf.useLegacyBlockReader) {
782      return RemoteBlockReader.newBlockReader(fileName,
783          block, token, startOffset, length, conf.ioBufferSize,
784          verifyChecksum, clientName, peer, datanode,
785          clientContext.getPeerCache(), cachingStrategy);
786    } else {
787      return RemoteBlockReader2.newBlockReader(
788          fileName, block, token, startOffset, length,
789          verifyChecksum, clientName, peer, datanode,
790          clientContext.getPeerCache(), cachingStrategy);
791    }
792  }
793
794  @Override
795  public String toString() {
796    return "BlockReaderFactory(fileName=" + fileName + ", block=" + block + ")";
797  }
798
799  /**
800   * File name to print when accessing a block directly (from servlets)
801   * @param s Address of the block location
802   * @param poolId Block pool ID of the block
803   * @param blockId Block ID of the block
804   * @return string that has a file name for debug purposes
805   */
806  public static String getFileName(final InetSocketAddress s,
807      final String poolId, final long blockId) {
808    return s.toString() + ":" + poolId + ":" + blockId;
809  }
810}