/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.blockmanagement;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.NetworkTopologyWithNodeGroup;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;

/** The class is responsible for choosing the desired number of targets
 * for placing block replicas on environment with node-group layer.
 * The replica placement strategy is adjusted to:
 * If the writer is on a datanode, the 1st replica is placed on the local
 *     node (or local node-group), otherwise a random datanode.
 * The 2nd replica is placed on a datanode that is on a different rack with 1st
 *     replica node.
 * The 3rd replica is placed on a datanode which is on a different node-group
 *     but the same rack as the second replica node.
 */
public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault {

  /**
   * Create a policy and initialize it immediately.
   *
   * @param conf configuration handed to {@link #initialize}
   * @param stats cluster statistics handed to {@link #initialize}
   * @param clusterMap network topology handed to {@link #initialize}
   * @param datanodeManager manager whose host-to-datanode map is handed to
   *        {@link #initialize}; when null, a null map is passed instead
   */
  protected BlockPlacementPolicyWithNodeGroup(Configuration conf,
      FSClusterStats stats, NetworkTopology clusterMap,
      DatanodeManager datanodeManager) {
    // The map has to come from the supplied manager: the inherited
    // host2datanodeMap field has not been assigned by this class yet, so
    // forwarding it would leave dependent-node exclusion switched off.
    // NOTE(review): relies on DatanodeManager#getHost2DatanodeMap() --
    // confirm it is available in this code base.
    final Host2NodesMap hostMap = (datanodeManager == null)
        ? null : datanodeManager.getHost2DatanodeMap();
    initialize(conf, stats, clusterMap, hostMap);
  }

  /**
   * Create an uninitialized policy; {@link #initialize} is expected to be
   * called separately before use.
   */
  protected BlockPlacementPolicyWithNodeGroup() {
  }

  /**
   * Initialize this policy by delegating to the superclass.
   *
   * @param conf configuration
   * @param stats cluster statistics
   * @param clusterMap network topology; {@link #chooseLocalStorage} casts it
   *        to {@link NetworkTopologyWithNodeGroup}
   * @param host2datanodeMap host name to datanode lookup; may be null, in
   *        which case dependent nodes are not added to the excluded set
   */
  @Override
  public void initialize(Configuration conf,  FSClusterStats stats,
          NetworkTopology clusterMap,
          Host2NodesMap host2datanodeMap) {
    super.initialize(conf, stats, clusterMap, host2datanodeMap);
  }

  /** choose local node of localMachine as the target.
   * if localMachine is not available, choose a node on the same nodegroup or
   * rack instead.
   * @return the chosen node; may be null when no node-group candidate was
   *         returned and <i>fallbackToLocalRack</i> is false
   */
  @Override
  protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
      StorageType storageType, boolean fallbackToLocalRack
      ) throws NotEnoughReplicasException {
    // if no local machine, randomly choose one node
    if (localMachine == null) {
      return chooseRandomAnywhere(excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageType);
    }

    // otherwise try local machine first
    if (localMachine instanceof DatanodeDescriptor) {
      DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
      if (excludedNodes.add(localMachine)) { // was not in the excluded list
        for (DatanodeStorageInfo localStorage : DFSUtil.shuffle(
            localDataNode.getStorageInfos())) {
          if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
              maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
            return localStorage;
          }
        }
      }
    }

    // try a node on local node group
    DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
        (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
    if (chosenStorage != null) {
      return chosenStorage;
    }

    if (!fallbackToLocalRack) {
      return null;
    }
    // try a node on local rack
    return chooseLocalRack(localMachine, excludedNodes,
        blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
  }

  /**
   * @return the node of the second replica: the first entry of
   *         <i>results</i> whose datanode is not the same object as
   *         <i>localMachine</i>, or null if there is none
   */
  private static DatanodeDescriptor secondNode(Node localMachine,
      List<DatanodeStorageInfo> results) {
    // find the second replica
    for (DatanodeStorageInfo nextStorage : results) {
      DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
      // identity comparison is intentional: we look for a different instance
      if (nextNode != localMachine) {
        return nextNode;
      }
    }
    return null;
  }

  /**
   * Last-resort choice: pick a random target from the whole cluster
   * (scope {@link NodeBase#ROOT}).
   * @return the chosen node
   */
  private DatanodeStorageInfo chooseRandomAnywhere(Set<Node> excludedNodes,
      long blocksize, int maxNodesPerRack, List<DatanodeStorageInfo> results,
      boolean avoidStaleNodes, StorageType storageType)
      throws NotEnoughReplicasException {
    return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
        maxNodesPerRack, results, avoidStaleNodes, storageType);
  }

  /**
   * choose one node from the rack that <i>localMachine</i> is on.
   * if no such node is available, choose one node from the rack where
   * a second replica is on.
   * if still no such node is available, choose a random node in the cluster.
   * @return the chosen node
   */
  @Override
  protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
      StorageType storageType) throws NotEnoughReplicasException {
    // no local machine, so choose a random machine
    if (localMachine == null) {
      return chooseRandomAnywhere(excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageType);
    }

    // choose one from the local rack, but off-nodegroup
    try {
      final String scope =
          NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
      return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageType);
    } catch (NotEnoughReplicasException e1) {
      // find the second replica
      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
      if (newLocal != null) {
        try {
          return chooseRandom(
              clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
              blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
        } catch (NotEnoughReplicasException e2) {
          // the second replica's rack has no candidate either; fall through
        }
      }
      // otherwise randomly choose one from the network
      return chooseRandomAnywhere(excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageType);
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  protected void chooseRemoteRack(int numOfReplicas,
      DatanodeDescriptor localMachine, Set<Node> excludedNodes,
      long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
      boolean avoidStaleNodes, StorageType storageType)
      throws NotEnoughReplicasException {
    int oldNumOfReplicas = results.size();

    final String rackLocation = NetworkTopology.getFirstHalf(
        localMachine.getNetworkLocation());
    try {
      // randomly choose from remote racks
      chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
          maxReplicasPerRack, results, avoidStaleNodes, storageType);
    } catch (NotEnoughReplicasException e) {
      // fall back to the local rack for the replicas still missing
      chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
          rackLocation, excludedNodes, blocksize,
          maxReplicasPerRack, results, avoidStaleNodes, storageType);
    }
  }

  /**
   * choose one node from the nodegroup that <i>localMachine</i> is on.
   * if no such node is available, choose one node from the nodegroup where
   * a second replica is on.
   * if still no such node is available, choose a random node in the cluster.
   * @return the chosen node
   */
  private DatanodeStorageInfo chooseLocalNodeGroup(
      NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
      Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
      List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
      StorageType storageType) throws NotEnoughReplicasException {
    // no local machine, so choose a random machine
    if (localMachine == null) {
      return chooseRandomAnywhere(excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageType);
    }

    // choose one from the local node group
    try {
      return chooseRandom(
          clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
          excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
          storageType);
    } catch (NotEnoughReplicasException e1) {
      final DatanodeDescriptor newLocal = secondNode(localMachine, results);
      if (newLocal != null) {
        try {
          return chooseRandom(
              clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
              excludedNodes, blocksize, maxNodesPerRack, results,
              avoidStaleNodes, storageType);
        } catch (NotEnoughReplicasException e2) {
          // the second replica's node group has no candidate; fall through
        }
      }
      // otherwise randomly choose one from the network
      return chooseRandomAnywhere(excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageType);
    }
  }

  /**
   * @return the first half of the node's network location, as split by
   *         {@link NetworkTopology#getFirstHalf(String)} (presumably the
   *         rack part of a rack/nodegroup location -- verify against
   *         NetworkTopology)
   */
  @Override
  protected String getRack(final DatanodeInfo cur) {
    String nodeGroupString = cur.getNetworkLocation();
    return NetworkTopology.getFirstHalf(nodeGroupString);
  }

  /**
   * Find other nodes in the same nodegroup of <i>chosenNode</i> and add them
   * into <i>excludedNodes</i> as replica should not be duplicated for nodes
   * within the same nodegroup. Nodes that <i>chosenNode</i> declares a
   * dependency on are excluded as well.
   * @return number of new excluded nodes
   */
  @Override
  protected int addToExcludedNodes(DatanodeDescriptor chosenNode,
      Set<Node> excludedNodes) {
    int countOfExcludedNodes = 0;
    String nodeGroupScope = chosenNode.getNetworkLocation();
    List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope);
    for (Node leafNode : leafNodes) {
      if (excludedNodes.add(leafNode)) {
        // not an existing node in excludedNodes
        countOfExcludedNodes++;
      }
    }

    countOfExcludedNodes += addDependentNodesToExcludedNodes(
        chosenNode, excludedNodes);
    return countOfExcludedNodes;
  }

  /**
   * Add all nodes from a dependent nodes list to excludedNodes.
   * Does nothing when no host-to-datanode map is available; host names that
   * cannot be resolved are logged and skipped.
   * @return number of new excluded nodes
   */
  private int addDependentNodesToExcludedNodes(DatanodeDescriptor chosenNode,
      Set<Node> excludedNodes) {
    if (this.host2datanodeMap == null) {
      return 0;
    }
    int countOfExcludedNodes = 0;
    for (String hostname : chosenNode.getDependentHostNames()) {
      DatanodeDescriptor node =
          this.host2datanodeMap.getDataNodeByHostName(hostname);
      if (node != null) {
        if (excludedNodes.add(node)) {
          countOfExcludedNodes++;
        }
      } else {
        LOG.warn("Not able to find datanode " + hostname
            + " which has dependency with datanode "
            + chosenNode.getHostName());
      }
    }

    return countOfExcludedNodes;
  }

  /**
   * Pick up replica node set for deleting replica as over-replicated.
   * First set contains replica nodes on rack with more than one
   * replica while second set contains remaining replica nodes.
   * If first is not empty, divide first set into two subsets:
   *   moreThanOne contains nodes on nodegroup with more than one replica
   *   exactlyOne contains the remaining nodes in first set
   * then pick moreThanOne if it is not empty, otherwise exactlyOne.
   * If first is empty, then pick second.
   *
   * @param first replica nodes on racks holding more than one replica
   * @param second the remaining replica nodes
   * @return the set to delete a replica from
   */
  @Override
  public Collection<DatanodeDescriptor> pickupReplicaSet(
      Collection<DatanodeDescriptor> first,
      Collection<DatanodeDescriptor> second) {
    // If no replica within same rack, return directly.
    if (first.isEmpty()) {
      return second;
    }
    // Group the nodes of the first set by the last half of their network
    // location, which is used here as the node-group name.
    Map<String, List<DatanodeDescriptor>> nodeGroupMap =
        new HashMap<String, List<DatanodeDescriptor>>();

    for (DatanodeDescriptor node : first) {
      final String nodeGroupName =
          NetworkTopology.getLastHalf(node.getNetworkLocation());
      List<DatanodeDescriptor> datanodeList =
          nodeGroupMap.get(nodeGroupName);
      if (datanodeList == null) {
        datanodeList = new ArrayList<DatanodeDescriptor>();
        nodeGroupMap.put(nodeGroupName, datanodeList);
      }
      datanodeList.add(node);
    }

    final List<DatanodeDescriptor> moreThanOne = new ArrayList<DatanodeDescriptor>();
    final List<DatanodeDescriptor> exactlyOne = new ArrayList<DatanodeDescriptor>();
    // split nodes into two sets
    for (List<DatanodeDescriptor> datanodeList : nodeGroupMap.values()) {
      if (datanodeList.size() == 1) {
        // exactlyOne contains nodes on nodegroup with exactly one replica
        exactlyOne.add(datanodeList.get(0));
      } else {
        // moreThanOne contains nodes on nodegroup with more than one replica
        moreThanOne.addAll(datanodeList);
      }
    }

    return moreThanOne.isEmpty() ? exactlyOne : moreThanOne;
  }

}