001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.ha;
019
020import java.io.IOException;
021import java.io.PrintStream;
022import java.util.Arrays;
023import java.util.Map;
024
025import org.apache.commons.cli.Options;
026import org.apache.commons.cli.CommandLine;
027import org.apache.commons.cli.GnuParser;
028import org.apache.commons.cli.ParseException;
029import org.apache.commons.logging.Log;
030import org.apache.commons.logging.LogFactory;
031
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.conf.Configured;
035import org.apache.hadoop.fs.CommonConfigurationKeys;
036import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
037import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
038import org.apache.hadoop.util.Tool;
039import org.apache.hadoop.util.ToolRunner;
040
041import com.google.common.base.Preconditions;
042import com.google.common.collect.ImmutableMap;
043
044/**
045 * A command-line tool for making calls in the HAServiceProtocol.
 * For example, this can be used to force a service to standby or active
047 * mode, or to trigger a health-check.
048 */
049@InterfaceAudience.Private
050
051public abstract class HAAdmin extends Configured implements Tool {
052  
053  private static final String FORCEFENCE  = "forcefence";
054  private static final String FORCEACTIVE = "forceactive";
055  
056  /**
057   * Undocumented flag which allows an administrator to use manual failover
058   * state transitions even when auto-failover is enabled. This is an unsafe
059   * operation, which is why it is not documented in the usage below.
060   */
061  private static final String FORCEMANUAL = "forcemanual";
062  private static final Log LOG = LogFactory.getLog(HAAdmin.class);
063
064  private int rpcTimeoutForChecks = -1;
065  
066  protected final static Map<String, UsageInfo> USAGE =
067    ImmutableMap.<String, UsageInfo>builder()
068    .put("-transitionToActive",
069        new UsageInfo("<serviceId>", "Transitions the service into Active state"))
070    .put("-transitionToStandby",
071        new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
072    .put("-failover",
073        new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
074            "Failover from the first service to the second.\n" +
075            "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
076            "Try to failover to the target service even if it is not ready if the " + 
077            FORCEACTIVE + " option is used."))
078    .put("-getServiceState",
079        new UsageInfo("<serviceId>", "Returns the state of the service"))
080    .put("-checkHealth",
081        new UsageInfo("<serviceId>",
082            "Requests that the service perform a health check.\n" + 
083            "The HAAdmin tool will exit with a non-zero exit code\n" +
084            "if the check fails."))
085    .put("-help",
086        new UsageInfo("<command>", "Displays help on the specified command"))
087    .build();
088
089  /** Output stream for errors, for use in tests */
090  protected PrintStream errOut = System.err;
091  protected PrintStream out = System.out;
092  private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
093
094  protected HAAdmin() {
095    super();
096  }
097
098  protected HAAdmin(Configuration conf) {
099    super(conf);
100  }
101
102  protected abstract HAServiceTarget resolveTarget(String string);
103
104  protected String getUsageString() {
105    return "Usage: HAAdmin";
106  }
107
108  protected void printUsage(PrintStream errOut) {
109    errOut.println(getUsageString());
110    for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
111      String cmd = e.getKey();
112      UsageInfo usage = e.getValue();
113      
114      errOut.println("    [" + cmd + " " + usage.args + "]"); 
115    }
116    errOut.println();
117    ToolRunner.printGenericCommandUsage(errOut);    
118  }
119  
120  private static void printUsage(PrintStream errOut, String cmd) {
121    UsageInfo usage = USAGE.get(cmd);
122    if (usage == null) {
123      throw new RuntimeException("No usage for cmd " + cmd);
124    }
125    errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]");
126  }
127
128  private int transitionToActive(final CommandLine cmd)
129      throws IOException, ServiceFailedException {
130    String[] argv = cmd.getArgs();
131    if (argv.length != 1) {
132      errOut.println("transitionToActive: incorrect number of arguments");
133      printUsage(errOut, "-transitionToActive");
134      return -1;
135    }
136    HAServiceTarget target = resolveTarget(argv[0]);
137    if (!checkManualStateManagementOK(target)) {
138      return -1;
139    }
140    HAServiceProtocol proto = target.getProxy(
141        getConf(), 0);
142    HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
143    return 0;
144  }
145
146  private int transitionToStandby(final CommandLine cmd)
147      throws IOException, ServiceFailedException {
148    String[] argv = cmd.getArgs();
149    if (argv.length != 1) {
150      errOut.println("transitionToStandby: incorrect number of arguments");
151      printUsage(errOut, "-transitionToStandby");
152      return -1;
153    }
154    
155    HAServiceTarget target = resolveTarget(argv[0]);
156    if (!checkManualStateManagementOK(target)) {
157      return -1;
158    }
159    HAServiceProtocol proto = target.getProxy(
160        getConf(), 0);
161    HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
162    return 0;
163  }
164  /**
165   * Ensure that we are allowed to manually manage the HA state of the target
166   * service. If automatic failover is configured, then the automatic
167   * failover controllers should be doing state management, and it is generally
168   * an error to use the HAAdmin command line to do so.
169   * 
170   * @param target the target to check
171   * @return true if manual state management is allowed
172   */
173  private boolean checkManualStateManagementOK(HAServiceTarget target) {
174    if (target.isAutoFailoverEnabled()) {
175      if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
176        errOut.println(
177            "Automatic failover is enabled for " + target + "\n" +
178            "Refusing to manually manage HA state, since it may cause\n" +
179            "a split-brain scenario or other incorrect state.\n" +
180            "If you are very sure you know what you are doing, please \n" +
181            "specify the " + FORCEMANUAL + " flag.");
182        return false;
183      } else {
184        LOG.warn("Proceeding with manual HA state management even though\n" +
185            "automatic failover is enabled for " + target);
186        return true;
187      }
188    }
189    return true;
190  }
191
192  private StateChangeRequestInfo createReqInfo() {
193    return new StateChangeRequestInfo(requestSource);
194  }
195
196  private int failover(CommandLine cmd)
197      throws IOException, ServiceFailedException {
198    boolean forceFence = cmd.hasOption(FORCEFENCE);
199    boolean forceActive = cmd.hasOption(FORCEACTIVE);
200
201    int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
202    final String[] args = cmd.getArgs();
203
204    if (numOpts > 3 || args.length != 2) {
205      errOut.println("failover: incorrect arguments");
206      printUsage(errOut, "-failover");
207      return -1;
208    }
209
210    HAServiceTarget fromNode = resolveTarget(args[0]);
211    HAServiceTarget toNode = resolveTarget(args[1]);
212    
213    // Check that auto-failover is consistently configured for both nodes.
214    Preconditions.checkState(
215        fromNode.isAutoFailoverEnabled() ==
216          toNode.isAutoFailoverEnabled(),
217          "Inconsistent auto-failover configs between %s and %s!",
218          fromNode, toNode);
219    
220    if (fromNode.isAutoFailoverEnabled()) {
221      if (forceFence || forceActive) {
222        // -forceActive doesn't make sense with auto-HA, since, if the node
223        // is not healthy, then its ZKFC will immediately quit the election
224        // again the next time a health check runs.
225        //
226        // -forceFence doesn't seem to have any real use cases with auto-HA
227        // so it isn't implemented.
228        errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
229            "supported with auto-failover enabled.");
230        return -1;
231      }
232      return gracefulFailoverThroughZKFCs(toNode);
233    }
234    
235    FailoverController fc = new FailoverController(getConf(),
236        requestSource);
237    
238    try {
239      fc.failover(fromNode, toNode, forceFence, forceActive); 
240      out.println("Failover from "+args[0]+" to "+args[1]+" successful");
241    } catch (FailoverFailedException ffe) {
242      errOut.println("Failover failed: " + ffe.getLocalizedMessage());
243      return -1;
244    }
245    return 0;
246  }
247  
248
249  /**
250   * Initiate a graceful failover by talking to the target node's ZKFC.
251   * This sends an RPC to the ZKFC, which coordinates the failover.
252   * 
253   * @param toNode the node to fail to
254   * @return status code (0 for success)
255   * @throws IOException if failover does not succeed
256   */
257  private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
258      throws IOException {
259
260    int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
261    ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
262    try {
263      proxy.gracefulFailover();
264      out.println("Failover to " + toNode + " successful");
265    } catch (ServiceFailedException sfe) {
266      errOut.println("Failover failed: " + sfe.getLocalizedMessage());
267      return -1;
268    }
269
270    return 0;
271  }
272
273  private int checkHealth(final CommandLine cmd)
274      throws IOException, ServiceFailedException {
275    String[] argv = cmd.getArgs();
276    if (argv.length != 1) {
277      errOut.println("checkHealth: incorrect number of arguments");
278      printUsage(errOut, "-checkHealth");
279      return -1;
280    }
281    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
282        getConf(), rpcTimeoutForChecks);
283    try {
284      HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
285    } catch (HealthCheckFailedException e) {
286      errOut.println("Health check failed: " + e.getLocalizedMessage());
287      return -1;
288    }
289    return 0;
290  }
291
292  private int getServiceState(final CommandLine cmd)
293      throws IOException, ServiceFailedException {
294    String[] argv = cmd.getArgs();
295    if (argv.length != 1) {
296      errOut.println("getServiceState: incorrect number of arguments");
297      printUsage(errOut, "-getServiceState");
298      return -1;
299    }
300
301    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
302        getConf(), rpcTimeoutForChecks);
303    out.println(proto.getServiceStatus().getState());
304    return 0;
305  }
306
307  /**
308   * Return the serviceId as is, we are assuming it was
309   * given as a service address of form <host:ipcport>.
310   */
311  protected String getServiceAddr(String serviceId) {
312    return serviceId;
313  }
314
315  @Override
316  public void setConf(Configuration conf) {
317    super.setConf(conf);
318    if (conf != null) {
319      rpcTimeoutForChecks = conf.getInt(
320          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
321          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
322    }
323  }
324
325  @Override
326  public int run(String[] argv) throws Exception {
327    try {
328      return runCmd(argv);
329    } catch (IllegalArgumentException iae) {
330      errOut.println("Illegal argument: " + iae.getLocalizedMessage());
331      return -1;
332    } catch (IOException ioe) {
333      errOut.println("Operation failed: " + ioe.getLocalizedMessage());
334      if (LOG.isDebugEnabled()) {
335        LOG.debug("Operation failed", ioe);
336      }
337      return -1;
338    }
339  }
340  
341  protected int runCmd(String[] argv) throws Exception {
342    if (argv.length < 1) {
343      printUsage(errOut);
344      return -1;
345    }
346
347    String cmd = argv[0];
348
349    if (!cmd.startsWith("-")) {
350      errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
351      printUsage(errOut);
352      return -1;
353    }
354    
355    if (!USAGE.containsKey(cmd)) {
356      errOut.println(cmd.substring(1) + ": Unknown command");
357      printUsage(errOut);
358      return -1;
359    }
360    
361    Options opts = new Options();
362
363    // Add command-specific options
364    if ("-failover".equals(cmd)) {
365      addFailoverCliOpts(opts);
366    }
367    // Mutative commands take FORCEMANUAL option
368    if ("-transitionToActive".equals(cmd) ||
369        "-transitionToStandby".equals(cmd) ||
370        "-failover".equals(cmd)) {
371      opts.addOption(FORCEMANUAL, false,
372          "force manual control even if auto-failover is enabled");
373    }
374         
375    CommandLine cmdLine = parseOpts(cmd, opts, argv);
376    if (cmdLine == null) {
377      // error already printed
378      return -1;
379    }
380    
381    if (cmdLine.hasOption(FORCEMANUAL)) {
382      if (!confirmForceManual()) {
383        LOG.fatal("Aborted");
384        return -1;
385      }
386      // Instruct the NNs to honor this request even if they're
387      // configured for manual failover.
388      requestSource = RequestSource.REQUEST_BY_USER_FORCED;
389    }
390
391    if ("-transitionToActive".equals(cmd)) {
392      return transitionToActive(cmdLine);
393    } else if ("-transitionToStandby".equals(cmd)) {
394      return transitionToStandby(cmdLine);
395    } else if ("-failover".equals(cmd)) {
396      return failover(cmdLine);
397    } else if ("-getServiceState".equals(cmd)) {
398      return getServiceState(cmdLine);
399    } else if ("-checkHealth".equals(cmd)) {
400      return checkHealth(cmdLine);
401    } else if ("-help".equals(cmd)) {
402      return help(argv);
403    } else {
404      // we already checked command validity above, so getting here
405      // would be a coding error
406      throw new AssertionError("Should not get here, command: " + cmd);
407    } 
408  }
409  
410  private boolean confirmForceManual() throws IOException {
411     return ToolRunner.confirmPrompt(
412        "You have specified the " + FORCEMANUAL + " flag. This flag is " +
413        "dangerous, as it can induce a split-brain scenario that WILL " +
414        "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
415        "\n" +
416        "It is recommended not to use this flag, but instead to shut down the " +
417        "cluster and disable automatic failover if you prefer to manually " +
418        "manage your HA state.\n" +
419        "\n" +
420        "You may abort safely by answering 'n' or hitting ^C now.\n" +
421        "\n" +
422        "Are you sure you want to continue?");
423  }
424
425  /**
426   * Add CLI options which are specific to the failover command and no
427   * others.
428   */
429  private void addFailoverCliOpts(Options failoverOpts) {
430    failoverOpts.addOption(FORCEFENCE, false, "force fencing");
431    failoverOpts.addOption(FORCEACTIVE, false, "force failover");
432    // Don't add FORCEMANUAL, since that's added separately for all commands
433    // that change state.
434  }
435  
436  private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
437    try {
438      // Strip off the first arg, since that's just the command name
439      argv = Arrays.copyOfRange(argv, 1, argv.length); 
440      return new GnuParser().parse(opts, argv);
441    } catch (ParseException pe) {
442      errOut.println(cmdName.substring(1) +
443          ": incorrect arguments");
444      printUsage(errOut, cmdName);
445      return null;
446    }
447  }
448  
449  private int help(String[] argv) {
450    if (argv.length == 1) { // only -help
451      printUsage(out);
452      return 0;
453    } else if (argv.length != 2) {
454      printUsage(errOut, "-help");
455      return -1;
456    }
457    String cmd = argv[1];
458    if (!cmd.startsWith("-")) {
459      cmd = "-" + cmd;
460    }
461    UsageInfo usageInfo = USAGE.get(cmd);
462    if (usageInfo == null) {
463      errOut.println(cmd + ": Unknown command");
464      printUsage(errOut);
465      return -1;
466    }
467    
468    out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
469    return 0;
470  }
471  
472  protected static class UsageInfo {
473    public final String args;
474    public final String help;
475    
476    public UsageInfo(String args, String help) {
477      this.args = args;
478      this.help = help;
479    }
480  }
481}