001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.datanode.fsdataset;
019
020import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
021import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY;
022import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
023import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY;
024
025import java.io.IOException;
026import java.util.ArrayList;
027import java.util.List;
028import java.util.Random;
029
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.conf.Configurable;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
035
036/**
037 * A DN volume choosing policy which takes into account the amount of free
038 * space on each of the available volumes when considering where to assign a
039 * new replica allocation. By default this policy prefers assigning replicas to
040 * those volumes with more available free space, so as to over time balance the
041 * available space of all the volumes within a DN.
042 */
043public class AvailableSpaceVolumeChoosingPolicy<V extends FsVolumeSpi>
044    implements VolumeChoosingPolicy<V>, Configurable {
045  
046  private static final Log LOG = LogFactory.getLog(AvailableSpaceVolumeChoosingPolicy.class);
047  
048  private final Random random;
049  
050  private long balancedSpaceThreshold = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT;
051  private float balancedPreferencePercent = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT;
052
053  AvailableSpaceVolumeChoosingPolicy(Random random) {
054    this.random = random;
055  }
056
057  public AvailableSpaceVolumeChoosingPolicy() {
058    this(new Random());
059  }
060
061  @Override
062  public synchronized void setConf(Configuration conf) {
063    balancedSpaceThreshold = conf.getLong(
064        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY,
065        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT);
066    balancedPreferencePercent = conf.getFloat(
067        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY,
068        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT);
069    
070    LOG.info("Available space volume choosing policy initialized: " +
071        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY +
072        " = " + balancedSpaceThreshold + ", " +
073        DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
074        " = " + balancedPreferencePercent);
075
076    if (balancedPreferencePercent > 1.0) {
077      LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
078               " is greater than 1.0 but should be in the range 0.0 - 1.0");
079    }
080
081    if (balancedPreferencePercent < 0.5) {
082      LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY +
083               " is less than 0.5 so volumes with less available disk space will receive more block allocations");
084    }
085  }
086  
087  @Override
088  public synchronized Configuration getConf() {
089    // Nothing to do. Only added to fulfill the Configurable contract.
090    return null;
091  }
092  
093  private final VolumeChoosingPolicy<V> roundRobinPolicyBalanced =
094      new RoundRobinVolumeChoosingPolicy<V>();
095  private final VolumeChoosingPolicy<V> roundRobinPolicyHighAvailable =
096      new RoundRobinVolumeChoosingPolicy<V>();
097  private final VolumeChoosingPolicy<V> roundRobinPolicyLowAvailable =
098      new RoundRobinVolumeChoosingPolicy<V>();
099
100  @Override
101  public synchronized V chooseVolume(List<V> volumes,
102      final long replicaSize) throws IOException {
103    if (volumes.size() < 1) {
104      throw new DiskOutOfSpaceException("No more available volumes");
105    }
106    
107    AvailableSpaceVolumeList volumesWithSpaces =
108        new AvailableSpaceVolumeList(volumes);
109    
110    if (volumesWithSpaces.areAllVolumesWithinFreeSpaceThreshold()) {
111      // If they're actually not too far out of whack, fall back on pure round
112      // robin.
113      V volume = roundRobinPolicyBalanced.chooseVolume(volumes, replicaSize);
114      if (LOG.isDebugEnabled()) {
115        LOG.debug("All volumes are within the configured free space balance " +
116            "threshold. Selecting " + volume + " for write of block size " +
117            replicaSize);
118      }
119      return volume;
120    } else {
121      V volume = null;
122      // If none of the volumes with low free space have enough space for the
123      // replica, always try to choose a volume with a lot of free space.
124      long mostAvailableAmongLowVolumes = volumesWithSpaces
125          .getMostAvailableSpaceAmongVolumesWithLowAvailableSpace();
126      
127      List<V> highAvailableVolumes = extractVolumesFromPairs(
128          volumesWithSpaces.getVolumesWithHighAvailableSpace());
129      List<V> lowAvailableVolumes = extractVolumesFromPairs(
130          volumesWithSpaces.getVolumesWithLowAvailableSpace());
131      
132      float preferencePercentScaler =
133          (highAvailableVolumes.size() * balancedPreferencePercent) +
134          (lowAvailableVolumes.size() * (1 - balancedPreferencePercent));
135      float scaledPreferencePercent =
136          (highAvailableVolumes.size() * balancedPreferencePercent) /
137          preferencePercentScaler;
138      if (mostAvailableAmongLowVolumes < replicaSize ||
139          random.nextFloat() < scaledPreferencePercent) {
140        volume = roundRobinPolicyHighAvailable.chooseVolume(
141            highAvailableVolumes,
142            replicaSize);
143        if (LOG.isDebugEnabled()) {
144          LOG.debug("Volumes are imbalanced. Selecting " + volume +
145              " from high available space volumes for write of block size "
146              + replicaSize);
147        }
148      } else {
149        volume = roundRobinPolicyLowAvailable.chooseVolume(
150            lowAvailableVolumes,
151            replicaSize);
152        if (LOG.isDebugEnabled()) {
153          LOG.debug("Volumes are imbalanced. Selecting " + volume +
154              " from low available space volumes for write of block size "
155              + replicaSize);
156        }
157      }
158      return volume;
159    }
160  }
161  
162  /**
163   * Used to keep track of the list of volumes we're choosing from.
164   */
165  private class AvailableSpaceVolumeList {
166    private final List<AvailableSpaceVolumePair> volumes;
167    
168    public AvailableSpaceVolumeList(List<V> volumes) throws IOException {
169      this.volumes = new ArrayList<AvailableSpaceVolumePair>();
170      for (V volume : volumes) {
171        this.volumes.add(new AvailableSpaceVolumePair(volume));
172      }
173    }
174    
175    /**
176     * @return true if all volumes' free space is within the
177     *         configured threshold, false otherwise.
178     */
179    public boolean areAllVolumesWithinFreeSpaceThreshold() {
180      long leastAvailable = Long.MAX_VALUE;
181      long mostAvailable = 0;
182      for (AvailableSpaceVolumePair volume : volumes) {
183        leastAvailable = Math.min(leastAvailable, volume.getAvailable());
184        mostAvailable = Math.max(mostAvailable, volume.getAvailable());
185      }
186      return (mostAvailable - leastAvailable) < balancedSpaceThreshold;
187    }
188    
189    /**
190     * @return the minimum amount of space available on a single volume,
191     *         across all volumes.
192     */
193    private long getLeastAvailableSpace() {
194      long leastAvailable = Long.MAX_VALUE;
195      for (AvailableSpaceVolumePair volume : volumes) {
196        leastAvailable = Math.min(leastAvailable, volume.getAvailable());
197      }
198      return leastAvailable;
199    }
200    
201    /**
202     * @return the maximum amount of space available across volumes with low space.
203     */
204    public long getMostAvailableSpaceAmongVolumesWithLowAvailableSpace() {
205      long mostAvailable = Long.MIN_VALUE;
206      for (AvailableSpaceVolumePair volume : getVolumesWithLowAvailableSpace()) {
207        mostAvailable = Math.max(mostAvailable, volume.getAvailable());
208      }
209      return mostAvailable;
210    }
211    
212    /**
213     * @return the list of volumes with relatively low available space.
214     */
215    public List<AvailableSpaceVolumePair> getVolumesWithLowAvailableSpace() {
216      long leastAvailable = getLeastAvailableSpace();
217      List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
218      for (AvailableSpaceVolumePair volume : volumes) {
219        if (volume.getAvailable() <= leastAvailable + balancedSpaceThreshold) {
220          ret.add(volume);
221        }
222      }
223      return ret;
224    }
225    
226    /**
227     * @return the list of volumes with a lot of available space.
228     */
229    public List<AvailableSpaceVolumePair> getVolumesWithHighAvailableSpace() {
230      long leastAvailable = getLeastAvailableSpace();
231      List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>();
232      for (AvailableSpaceVolumePair volume : volumes) {
233        if (volume.getAvailable() > leastAvailable + balancedSpaceThreshold) {
234          ret.add(volume);
235        }
236      }
237      return ret;
238    }
239    
240  }
241  
242  /**
243   * Used so that we only check the available space on a given volume once, at
244   * the beginning of {@link AvailableSpaceVolumeChoosingPolicy#chooseVolume(List, long)}.
245   */
246  private class AvailableSpaceVolumePair {
247    private final V volume;
248    private final long availableSpace;
249    
250    public AvailableSpaceVolumePair(V volume) throws IOException {
251      this.volume = volume;
252      this.availableSpace = volume.getAvailable();
253    }
254    
255    public long getAvailable() {
256      return availableSpace;
257    }
258    
259    public V getVolume() {
260      return volume;
261    }
262  }
263  
264  private List<V> extractVolumesFromPairs(List<AvailableSpaceVolumePair> volumes) {
265    List<V> ret = new ArrayList<V>();
266    for (AvailableSpaceVolumePair volume : volumes) {
267      ret.add(volume.getVolume());
268    }
269    return ret;
270  }
271
272}