001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
020
021import com.google.common.base.Preconditions;
022import com.google.common.util.concurrent.ThreadFactoryBuilder;
023
024import java.io.FileInputStream;
025import java.io.FileNotFoundException;
026import java.io.IOException;
027import java.util.ArrayList;
028import java.util.HashMap;
029import java.util.Iterator;
030import java.util.List;
031import java.util.Map.Entry;
032import java.util.concurrent.Executor;
033import java.util.concurrent.LinkedBlockingQueue;
034import java.util.concurrent.ThreadFactory;
035import java.util.concurrent.ThreadPoolExecutor;
036import java.util.concurrent.TimeUnit;
037import java.util.concurrent.atomic.AtomicLong;
038
039import org.apache.commons.io.IOUtils;
040import org.apache.commons.logging.Log;
041import org.apache.commons.logging.LogFactory;
042import org.apache.hadoop.classification.InterfaceAudience;
043import org.apache.hadoop.classification.InterfaceStability;
044import org.apache.hadoop.fs.ChecksumException;
045import org.apache.hadoop.hdfs.ExtendedBlockId;
046import org.apache.hadoop.hdfs.DFSConfigKeys;
047import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
048import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
049import org.apache.hadoop.io.nativeio.NativeIO;
050
051/**
052 * Manages caching for an FsDatasetImpl by using the mmap(2) and mlock(2)
053 * system calls to lock blocks into memory. Block checksums are verified upon
054 * entry into the cache.
055 */
056@InterfaceAudience.Private
057@InterfaceStability.Unstable
058public class FsDatasetCache {
059  /**
060   * MappableBlocks that we know about.
061   */
062  private static final class Value {
063    final State state;
064    final MappableBlock mappableBlock;
065
066    Value(MappableBlock mappableBlock, State state) {
067      this.mappableBlock = mappableBlock;
068      this.state = state;
069    }
070  }
071
072  private enum State {
073    /**
074     * The MappableBlock is in the process of being cached.
075     */
076    CACHING,
077
078    /**
079     * The MappableBlock was in the process of being cached, but it was
080     * cancelled.  Only the FsDatasetCache#WorkerTask can remove cancelled
081     * MappableBlock objects.
082     */
083    CACHING_CANCELLED,
084
085    /**
086     * The MappableBlock is in the cache.
087     */
088    CACHED,
089
090    /**
091     * The MappableBlock is in the process of uncaching.
092     */
093    UNCACHING;
094
095    /**
096     * Whether we should advertise this block as cached to the NameNode and
097     * clients.
098     */
099    public boolean shouldAdvertise() {
100      return (this == CACHED);
101    }
102  }
103
  private static final Log LOG = LogFactory.getLog(FsDatasetCache.class);

  /**
   * Stores MappableBlock objects and the states they're in.
   * All access is guarded by synchronizing on this FsDatasetCache instance.
   */
  private final HashMap<ExtendedBlockId, Value> mappableBlockMap =
      new HashMap<ExtendedBlockId, Value>();

  // Number of blocks currently in State.CACHED; reported via
  // getNumBlocksCached() for the FSDatasetMBean stats.
  private final AtomicLong numBlocksCached = new AtomicLong(0);

  // The dataset whose blocks this cache manages; also used to reach the
  // DataNode's metrics and short-circuit registry.
  private final FsDatasetImpl dataset;

  // Runs UncachingTask instances; configured in the constructor with a
  // single core thread that is allowed to time out when idle.
  private final ThreadPoolExecutor uncachingExecutor;

  /**
   * The approximate amount of cache space in use.
   *
   * This number is an overestimate, counting bytes that will be used only
   * if pending caching operations succeed.  It does not take into account
   * pending uncaching operations.
   *
   * This overestimate is more useful to the NameNode than an underestimate,
   * since we don't want the NameNode to assign us more replicas than
   * we can cache, because of the current batch of operations.
   */
  private final UsedBytesCount usedBytesCount;
130
131  public static class PageRounder {
132    private final long osPageSize =
133        NativeIO.POSIX.getCacheManipulator().getOperatingSystemPageSize();
134
135    /**
136     * Round up a number to the operating system page size.
137     */
138    public long round(long count) {
139      long newCount = 
140          (count + (osPageSize - 1)) / osPageSize;
141      return newCount * osPageSize;
142    }
143  }
144
145  private class UsedBytesCount {
146    private final AtomicLong usedBytes = new AtomicLong(0);
147    
148    private final PageRounder rounder = new PageRounder();
149
150    /**
151     * Try to reserve more bytes.
152     *
153     * @param count    The number of bytes to add.  We will round this
154     *                 up to the page size.
155     *
156     * @return         The new number of usedBytes if we succeeded;
157     *                 -1 if we failed.
158     */
159    long reserve(long count) {
160      count = rounder.round(count);
161      while (true) {
162        long cur = usedBytes.get();
163        long next = cur + count;
164        if (next > maxBytes) {
165          return -1;
166        }
167        if (usedBytes.compareAndSet(cur, next)) {
168          return next;
169        }
170      }
171    }
172    
173    /**
174     * Release some bytes that we're using.
175     *
176     * @param count    The number of bytes to release.  We will round this
177     *                 up to the page size.
178     *
179     * @return         The new number of usedBytes.
180     */
181    long release(long count) {
182      count = rounder.round(count);
183      return usedBytes.addAndGet(-count);
184    }
185    
186    long get() {
187      return usedBytes.get();
188    }
189  }
190
191  /**
192   * The total cache capacity in bytes.
193   */
194  private final long maxBytes;
195
196  /**
197   * Number of cache commands that could not be completed successfully
198   */
199  final AtomicLong numBlocksFailedToCache = new AtomicLong(0);
200  /**
201   * Number of uncache commands that could not be completed successfully
202   */
203  final AtomicLong numBlocksFailedToUncache = new AtomicLong(0);
204
205  public FsDatasetCache(FsDatasetImpl dataset) {
206    this.dataset = dataset;
207    this.maxBytes = dataset.datanode.getDnConf().getMaxLockedMemory();
208    ThreadFactory workerFactory = new ThreadFactoryBuilder()
209        .setDaemon(true)
210        .setNameFormat("FsDatasetCache-%d-" + dataset.toString())
211        .build();
212    this.usedBytesCount = new UsedBytesCount();
213    this.uncachingExecutor = new ThreadPoolExecutor(
214            0, 1,
215            60, TimeUnit.SECONDS,
216            new LinkedBlockingQueue<Runnable>(),
217            workerFactory);
218    this.uncachingExecutor.allowCoreThreadTimeOut(true);
219  }
220
221  /**
222   * @return List of cached blocks suitable for translation into a
223   * {@link BlockListAsLongs} for a cache report.
224   */
225  synchronized List<Long> getCachedBlocks(String bpid) {
226    List<Long> blocks = new ArrayList<Long>();
227    for (Iterator<Entry<ExtendedBlockId, Value>> iter =
228        mappableBlockMap.entrySet().iterator(); iter.hasNext(); ) {
229      Entry<ExtendedBlockId, Value> entry = iter.next();
230      if (entry.getKey().getBlockPoolId().equals(bpid)) {
231        if (entry.getValue().state.shouldAdvertise()) {
232          blocks.add(entry.getKey().getBlockId());
233        }
234      }
235    }
236    return blocks;
237  }
238
239  /**
240   * Attempt to begin caching a block.
241   */
242  synchronized void cacheBlock(long blockId, String bpid,
243      String blockFileName, long length, long genstamp,
244      Executor volumeExecutor) {
245    ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
246    Value prevValue = mappableBlockMap.get(key);
247    if (prevValue != null) {
248      if (LOG.isDebugEnabled()) {
249        LOG.debug("Block with id " + blockId + ", pool " + bpid +
250            " already exists in the FsDatasetCache with state " +
251            prevValue.state);
252      }
253      numBlocksFailedToCache.incrementAndGet();
254      return;
255    }
256    mappableBlockMap.put(key, new Value(null, State.CACHING));
257    volumeExecutor.execute(
258        new CachingTask(key, blockFileName, length, genstamp));
259    if (LOG.isDebugEnabled()) {
260      LOG.debug("Initiating caching for Block with id " + blockId +
261          ", pool " + bpid);
262    }
263  }
264
265  synchronized void uncacheBlock(String bpid, long blockId) {
266    ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
267    Value prevValue = mappableBlockMap.get(key);
268
269    if (!dataset.datanode.getShortCircuitRegistry().
270            processBlockMunlockRequest(key)) {
271      // TODO: we probably want to forcibly uncache the block (and close the 
272      // shm) after a certain timeout has elapsed.
273      if (LOG.isDebugEnabled()) {
274        LOG.debug(key + " is anchored, and can't be uncached now.");
275      }
276      return;
277    }
278    if (prevValue == null) {
279      if (LOG.isDebugEnabled()) {
280        LOG.debug("Block with id " + blockId + ", pool " + bpid + " " +
281            "does not need to be uncached, because it is not currently " +
282            "in the mappableBlockMap.");
283      }
284      numBlocksFailedToUncache.incrementAndGet();
285      return;
286    }
287    switch (prevValue.state) {
288    case CACHING:
289      if (LOG.isDebugEnabled()) {
290        LOG.debug("Cancelling caching for block with id " + blockId +
291            ", pool " + bpid + ".");
292      }
293      mappableBlockMap.put(key,
294          new Value(prevValue.mappableBlock, State.CACHING_CANCELLED));
295      break;
296    case CACHED:
297      if (LOG.isDebugEnabled()) {
298        LOG.debug("Block with id " + blockId + ", pool " + bpid + " " +
299            "has been scheduled for uncaching.");
300      }
301      mappableBlockMap.put(key,
302          new Value(prevValue.mappableBlock, State.UNCACHING));
303      uncachingExecutor.execute(new UncachingTask(key));
304      break;
305    default:
306      if (LOG.isDebugEnabled()) {
307        LOG.debug("Block with id " + blockId + ", pool " + bpid + " " +
308            "does not need to be uncached, because it is " +
309            "in state " + prevValue.state + ".");
310      }
311      numBlocksFailedToUncache.incrementAndGet();
312      break;
313    }
314  }
315
316  /**
317   * Background worker that mmaps, mlocks, and checksums a block
318   */
319  private class CachingTask implements Runnable {
320    private final ExtendedBlockId key; 
321    private final String blockFileName;
322    private final long length;
323    private final long genstamp;
324
325    CachingTask(ExtendedBlockId key, String blockFileName, long length, long genstamp) {
326      this.key = key;
327      this.blockFileName = blockFileName;
328      this.length = length;
329      this.genstamp = genstamp;
330    }
331
332    @Override
333    public void run() {
334      boolean success = false;
335      FileInputStream blockIn = null, metaIn = null;
336      MappableBlock mappableBlock = null;
337      ExtendedBlock extBlk = new ExtendedBlock(key.getBlockPoolId(),
338          key.getBlockId(), length, genstamp);
339      long newUsedBytes = usedBytesCount.reserve(length);
340      boolean reservedBytes = false;
341      try {
342        if (newUsedBytes < 0) {
343          LOG.warn("Failed to cache " + key + ": could not reserve " + length +
344              " more bytes in the cache: " +
345              DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY +
346              " of " + maxBytes + " exceeded.");
347          return;
348        }
349        reservedBytes = true;
350        try {
351          blockIn = (FileInputStream)dataset.getBlockInputStream(extBlk, 0);
352          metaIn = (FileInputStream)dataset.getMetaDataInputStream(extBlk)
353              .getWrappedStream();
354        } catch (ClassCastException e) {
355          LOG.warn("Failed to cache " + key +
356              ": Underlying blocks are not backed by files.", e);
357          return;
358        } catch (FileNotFoundException e) {
359          LOG.info("Failed to cache " + key + ": failed to find backing " +
360              "files.");
361          return;
362        } catch (IOException e) {
363          LOG.warn("Failed to cache " + key + ": failed to open file", e);
364          return;
365        }
366        try {
367          mappableBlock = MappableBlock.
368              load(length, blockIn, metaIn, blockFileName);
369        } catch (ChecksumException e) {
370          // Exception message is bogus since this wasn't caused by a file read
371          LOG.warn("Failed to cache " + key + ": checksum verification failed.");
372          return;
373        } catch (IOException e) {
374          LOG.warn("Failed to cache " + key, e);
375          return;
376        }
377        synchronized (FsDatasetCache.this) {
378          Value value = mappableBlockMap.get(key);
379          Preconditions.checkNotNull(value);
380          Preconditions.checkState(value.state == State.CACHING ||
381                                   value.state == State.CACHING_CANCELLED);
382          if (value.state == State.CACHING_CANCELLED) {
383            mappableBlockMap.remove(key);
384            LOG.warn("Caching of " + key + " was cancelled.");
385            return;
386          }
387          mappableBlockMap.put(key, new Value(mappableBlock, State.CACHED));
388        }
389        if (LOG.isDebugEnabled()) {
390          LOG.debug("Successfully cached " + key + ".  We are now caching " +
391              newUsedBytes + " bytes in total.");
392        }
393        dataset.datanode.getShortCircuitRegistry().processBlockMlockEvent(key);
394        numBlocksCached.addAndGet(1);
395        dataset.datanode.getMetrics().incrBlocksCached(1);
396        success = true;
397      } finally {
398        IOUtils.closeQuietly(blockIn);
399        IOUtils.closeQuietly(metaIn);
400        if (!success) {
401          if (reservedBytes) {
402            newUsedBytes = usedBytesCount.release(length);
403          }
404          if (LOG.isDebugEnabled()) {
405            LOG.debug("Caching of " + key + " was aborted.  We are now " +
406                "caching only " + newUsedBytes + " + bytes in total.");
407          }
408          if (mappableBlock != null) {
409            mappableBlock.close();
410          }
411          numBlocksFailedToCache.incrementAndGet();
412
413          synchronized (FsDatasetCache.this) {
414            mappableBlockMap.remove(key);
415          }
416        }
417      }
418    }
419  }
420
  /**
   * Background worker that closes a cached block's mapping and releases its
   * reserved cache space.  Scheduled only by uncacheBlock() after the map
   * entry has been moved to State.UNCACHING.
   */
  private class UncachingTask implements Runnable {
    private final ExtendedBlockId key; 

    UncachingTask(ExtendedBlockId key) {
      this.key = key;
    }

    @Override
    public void run() {
      Value value;
      
      synchronized (FsDatasetCache.this) {
        value = mappableBlockMap.get(key);
      }
      // uncacheBlock() put the entry into UNCACHING before scheduling us,
      // so the entry must exist and still be in that state.
      Preconditions.checkNotNull(value);
      Preconditions.checkArgument(value.state == State.UNCACHING);
      // TODO: we will eventually need to do revocation here if any clients
      // are reading via mmap with checksums enabled.  See HDFS-5182.
      IOUtils.closeQuietly(value.mappableBlock);
      synchronized (FsDatasetCache.this) {
        mappableBlockMap.remove(key);
      }
      // NOTE(review): getLength() is called after close(); presumably the
      // length is stored in MappableBlock rather than read from the mapping
      // -- confirm in MappableBlock.
      long newUsedBytes =
          usedBytesCount.release(value.mappableBlock.getLength());
      numBlocksCached.addAndGet(-1);
      dataset.datanode.getMetrics().incrBlocksUncached(1);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Uncaching of " + key + " completed.  " +
            "usedBytes = " + newUsedBytes);
      }
    }
  }
453
454  // Stats related methods for FSDatasetMBean
455
456  /**
457   * Get the approximate amount of cache space used.
458   */
459  public long getCacheUsed() {
460    return usedBytesCount.get();
461  }
462
463  /**
464   * Get the maximum amount of bytes we can cache.  This is a constant.
465   */
466  public long getCacheCapacity() {
467    return maxBytes;
468  }
469
470  public long getNumBlocksFailedToCache() {
471    return numBlocksFailedToCache.get();
472  }
473
474  public long getNumBlocksFailedToUncache() {
475    return numBlocksFailedToUncache.get();
476  }
477
478  public long getNumBlocksCached() {
479    return numBlocksCached.get();
480  }
481
482  public synchronized boolean isCached(String bpid, long blockId) {
483    ExtendedBlockId block = new ExtendedBlockId(blockId, bpid);
484    Value val = mappableBlockMap.get(block);
485    return (val != null) && val.state.shouldAdvertise();
486  }
487}