001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs.server.blockmanagement;
019
020import java.util.ArrayList;
021import java.util.Collection;
022import java.util.HashMap;
023import java.util.List;
024import java.util.Map;
025import java.util.Set;
026
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.hdfs.DFSUtil;
029import org.apache.hadoop.hdfs.StorageType;
030import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
031import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
032import org.apache.hadoop.net.NetworkTopology;
033import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
034import org.apache.hadoop.net.Node;
035import org.apache.hadoop.net.NodeBase;
036
/** The class is responsible for choosing the desired number of targets
 * for placing block replicas in an environment with a node-group layer.
 * The replica placement strategy is adjusted to:
 * If the writer is on a datanode, the 1st replica is placed on the local 
 *     node (or local node-group), otherwise on a random datanode. 
 * The 2nd replica is placed on a datanode that is on a different rack from
 *     the 1st replica node. 
 * The 3rd replica is placed on a datanode which is on a different node-group
 *     but the same rack as the second replica node.
 */
047public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {
048
049  protected BlockPlacementPolicyWithNodeGroup(Configuration conf,  FSClusterStats stats,
050      NetworkTopology clusterMap, DatanodeManager datanodeManager) {
051    initialize(conf, stats, clusterMap, host2datanodeMap);
052  }
053
  /**
   * Creates an instance without calling {@code initialize}.
   * NOTE(review): presumably the creator is expected to call
   * {@code initialize(...)} before using the policy - confirm against the
   * instantiation site.
   */
  protected BlockPlacementPolicyWithNodeGroup() {
  }
056
057  public void initialize(Configuration conf,  FSClusterStats stats,
058          NetworkTopology clusterMap, 
059          Host2NodesMap host2datanodeMap) {
060    super.initialize(conf, stats, clusterMap, host2datanodeMap);
061  }
062
063  /** choose local node of localMachine as the target.
064   * if localMachine is not available, choose a node on the same nodegroup or 
065   * rack instead.
066   * @return the chosen node
067   */
068  @Override
069  protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
070      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
071      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
072      StorageType storageType, boolean fallbackToLocalRack
073      ) throws NotEnoughReplicasException {
074    // if no local machine, randomly choose one node
075    if (localMachine == null)
076      return chooseRandom(NodeBase.ROOT, excludedNodes, 
077          blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
078
079    // otherwise try local machine first
080    if (localMachine instanceof DatanodeDescriptor) {
081      DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
082      if (excludedNodes.add(localMachine)) { // was not in the excluded list
083        for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
084            localDataNode.getStorageInfos())) {
085          if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
086              maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
087            return localStorage;
088          }
089        }
090      }
091    }
092
093    // try a node on local node group
094    DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
095        (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, 
096        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
097    if (chosenStorage != null) {
098      return chosenStorage;
099    }
100
101    if (!fallbackToLocalRack) {
102      return null;
103    }
104    // try a node on local rack
105    return chooseLocalRack(localMachine, excludedNodes, 
106        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
107  }
108
109  /** @return the node of the second replica */
110  private static DatanodeDescriptor secondNode(Node localMachine,
111      List<DatanodeStorageInfo> results) {
112    // find the second replica
113    for(DatanodeStorageInfo nextStorage : results) {
114      DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
115      if (nextNode != localMachine) {
116        return nextNode;
117      }
118    }
119    return null;
120  }
121
122  @Override
123  protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
124      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
125      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
126      StorageType storageType) throws NotEnoughReplicasException {
127    // no local machine, so choose a random machine
128    if (localMachine == null) {
129      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
130          maxNodesPerRack, results, avoidStaleNodes, storageType);
131    }
132
133    // choose one from the local rack, but off-nodegroup
134    try {
135      final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
136      return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
137          results, avoidStaleNodes, storageType);
138    } catch (NotEnoughReplicasException e1) {
139      // find the second replica
140      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
141      if (newLocal != null) {
142        try {
143          return chooseRandom(
144              clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
145              blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
146        } catch(NotEnoughReplicasException e2) {
147          //otherwise randomly choose one from the network
148          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
149              maxNodesPerRack, results, avoidStaleNodes, storageType);
150        }
151      } else {
152        //otherwise randomly choose one from the network
153        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
154            maxNodesPerRack, results, avoidStaleNodes, storageType);
155      }
156    }
157  }
158
159  /**
160   * {@inheritDoc}
161   */
162  @Override
163  protected void chooseRemoteRack(int numOfReplicas,
164      DatanodeDescriptor localMachine, Set<Node> excludedNodes,
165      long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
166      boolean avoidStaleNodes, StorageType storageType)
167          throws NotEnoughReplicasException {
168    int oldNumOfReplicas = results.size();
169
170    final String rackLocation = NetworkTopology.getFirstHalf(
171        localMachine.getNetworkLocation());
172    try {
173      // randomly choose from remote racks
174      chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
175          maxReplicasPerRack, results, avoidStaleNodes, storageType);
176    } catch (NotEnoughReplicasException e) {
177      // fall back to the local rack
178      chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
179          rackLocation, excludedNodes, blocksize,
180          maxReplicasPerRack, results, avoidStaleNodes, storageType);
181    }
182  }
183
184  /* choose one node from the nodegroup that <i>localMachine</i> is on.
185   * if no such node is available, choose one node from the nodegroup where
186   * a second replica is on.
187   * if still no such node is available, choose a random node in the cluster.
188   * @return the chosen node
189   */
190  private DatanodeStorageInfo chooseLocalNodeGroup(
191      NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
192      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
193      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
194      StorageType storageType) throws NotEnoughReplicasException {
195    // no local machine, so choose a random machine
196    if (localMachine == null) {
197      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
198          maxNodesPerRack, results, avoidStaleNodes, storageType);
199    }
200
201    // choose one from the local node group
202    try {
203      return chooseRandom(
204          clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
205          excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
206          storageType);
207    } catch (NotEnoughReplicasException e1) {
208      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
209      if (newLocal != null) {
210        try {
211          return chooseRandom(
212              clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
213              excludedNodes, blocksize, maxNodesPerRack, results,
214              avoidStaleNodes, storageType);
215        } catch(NotEnoughReplicasException e2) {
216          //otherwise randomly choose one from the network
217          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
218              maxNodesPerRack, results, avoidStaleNodes, storageType);
219        }
220      } else {
221        //otherwise randomly choose one from the network
222        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
223            maxNodesPerRack, results, avoidStaleNodes, storageType);
224      }
225    }
226  }
227
228  @Override
229  protected String getRack(final DatanodeInfo cur) {
230    String nodeGroupString = cur.getNetworkLocation();
231    return NetworkTopology.getFirstHalf(nodeGroupString);
232  }
233  
234  /**
235   * Find other nodes in the same nodegroup of <i>localMachine</i> and add them
236   * into <i>excludeNodes</i> as replica should not be duplicated for nodes 
237   * within the same nodegroup
238   * @return number of new excluded nodes
239   */
240  @Override
241  protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
242      Set<Node> excludedNodes) {
243    int countOfExcludedNodes = 0;
244    String nodeGroupScope = chosenNode.getNetworkLocation();
245    List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
246    for (Node leafNode : leafNodes) {
247      if (excludedNodes.add(leafNode)) {
248        // not a existing node in excludedNodes
249        countOfExcludedNodes++;
250      }
251    }
252    
253    countOfExcludedNodes += addDependentNodesToExcludedNodes(
254        chosenNode, excludedNodes);
255    return countOfExcludedNodes;
256  }
257  
258  /**
259   * Add all nodes from a dependent nodes list to excludedNodes.
260   * @return number of new excluded nodes
261   */
262  private int addDependentNodesToExcludedNodes(DatanodeDescriptor chosenNode,
263      Set<Node> excludedNodes) {
264    if (this.host2datanodeMap == null) {
265      return 0;
266    }
267    int countOfExcludedNodes = 0;
268    for(String hostname : chosenNode.getDependentHostNames()) {
269      DatanodeDescriptor node =
270          this.host2datanodeMap.getDataNodeByHostName(hostname);
271      if(node!=null) {
272        if (excludedNodes.add(node)) {
273          countOfExcludedNodes++;
274        }
275      } else {
276        LOG.warn("Not able to find datanode " + hostname
277            + " which has dependency with datanode "
278            + chosenNode.getHostName());
279      }
280    }
281    
282    return countOfExcludedNodes;
283  }
284
285  /**
286   * Pick up replica node set for deleting replica as over-replicated. 
287   * First set contains replica nodes on rack with more than one
288   * replica while second set contains remaining replica nodes.
289   * If first is not empty, divide first set into two subsets:
290   *   moreThanOne contains nodes on nodegroup with more than one replica
291   *   exactlyOne contains the remaining nodes in first set
292   * then pickup priSet if not empty.
293   * If first is empty, then pick second.
294   */
295  @Override
296  public Collection<DatanodeDescriptor> pickupReplicaSet(
297      Collection<DatanodeDescriptor> first,
298      Collection<DatanodeDescriptor> second) {
299    // If no replica within same rack, return directly.
300    if (first.isEmpty()) {
301      return second;
302    }
303    // Split data nodes in the first set into two sets, 
304    // moreThanOne contains nodes on nodegroup with more than one replica
305    // exactlyOne contains the remaining nodes
306    Map<String, List<DatanodeDescriptor>> nodeGroupMap = 
307        new HashMap<String, List<DatanodeDescriptor>>();
308    
309    for(DatanodeDescriptor node : first) {
310      final String nodeGroupName = 
311          NetworkTopology.getLastHalf(node.getNetworkLocation());
312      List<DatanodeDescriptor> datanodeList = 
313          nodeGroupMap.get(nodeGroupName);
314      if (datanodeList == null) {
315        datanodeList = new ArrayList<DatanodeDescriptor>();
316        nodeGroupMap.put(nodeGroupName, datanodeList);
317      }
318      datanodeList.add(node);
319    }
320    
321    final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
322    final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
323    // split nodes into two sets
324    for(List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
325      if (datanodeList.size() == 1 ) {
326        // exactlyOne contains nodes on nodegroup with exactly one replica
327        exactlyOne.add(datanodeList.get(0));
328      } else {
329        // moreThanOne contains nodes on nodegroup with more than one replica
330        moreThanOne.addAll(datanodeList);
331      }
332    }
333    
334    return moreThanOne.isEmpty()? exactlyOne : moreThanOne;
335  }
336  
337}