001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.hdfs.server.namenode;
019
020 import static org.apache.hadoop.util.ExitUtil.terminate;
021
022 import java.io.IOException;
023 import java.util.ArrayList;
024 import java.util.Collection;
025 import java.util.Collections;
026 import java.util.Comparator;
027 import java.util.LinkedList;
028 import java.util.List;
029 import java.util.PriorityQueue;
030 import java.util.SortedSet;
031 import java.util.concurrent.CopyOnWriteArrayList;
032
033 import org.apache.commons.logging.Log;
034 import org.apache.commons.logging.LogFactory;
035 import org.apache.hadoop.classification.InterfaceAudience;
036 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
037 import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
038 import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
039
040 import static org.apache.hadoop.util.ExitUtil.terminate;
041
042 import com.google.common.base.Preconditions;
043 import com.google.common.collect.ComparisonChain;
044 import com.google.common.collect.ImmutableList;
045 import com.google.common.collect.ImmutableListMultimap;
046 import com.google.common.collect.Lists;
047 import com.google.common.collect.Multimaps;
048 import com.google.common.collect.Sets;
049
/**
 * Manages a collection of Journals. With the exception of
 * getEditLogManifest, the methods of this class are not synchronized; it is
 * assumed that the FSEditLog methods that use this class provide the proper
 * synchronization.
 */
055 public class JournalSet implements JournalManager {
056
057 static final Log LOG = LogFactory.getLog(FSEditLog.class);
058
059 static final public Comparator<EditLogInputStream>
060 EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() {
061 @Override
062 public int compare(EditLogInputStream a, EditLogInputStream b) {
063 return ComparisonChain.start().
064 compare(a.getFirstTxId(), b.getFirstTxId()).
065 compare(b.getLastTxId(), a.getLastTxId()).
066 result();
067 }
068 };
069
070 /**
071 * Container for a JournalManager paired with its currently
072 * active stream.
073 *
074 * If a Journal gets disabled due to an error writing to its
075 * stream, then the stream will be aborted and set to null.
076 */
077 static class JournalAndStream implements CheckableNameNodeResource {
078 private final JournalManager journal;
079 private boolean disabled = false;
080 private EditLogOutputStream stream;
081 private boolean required = false;
082
083 public JournalAndStream(JournalManager manager, boolean required) {
084 this.journal = manager;
085 this.required = required;
086 }
087
088 public void startLogSegment(long txId) throws IOException {
089 Preconditions.checkState(stream == null);
090 disabled = false;
091 stream = journal.startLogSegment(txId);
092 }
093
094 /**
095 * Closes the stream, also sets it to null.
096 */
097 public void closeStream() throws IOException {
098 if (stream == null) return;
099 stream.close();
100 stream = null;
101 }
102
103 /**
104 * Close the Journal and Stream
105 */
106 public void close() throws IOException {
107 closeStream();
108
109 journal.close();
110 }
111
112 /**
113 * Aborts the stream, also sets it to null.
114 */
115 public void abort() {
116 if (stream == null) return;
117 try {
118 stream.abort();
119 } catch (IOException ioe) {
120 LOG.error("Unable to abort stream " + stream, ioe);
121 }
122 stream = null;
123 }
124
125 boolean isActive() {
126 return stream != null;
127 }
128
129 /**
130 * Should be used outside JournalSet only for testing.
131 */
132 EditLogOutputStream getCurrentStream() {
133 return stream;
134 }
135
136 @Override
137 public String toString() {
138 return "JournalAndStream(mgr=" + journal +
139 ", " + "stream=" + stream + ")";
140 }
141
142 void setCurrentStreamForTests(EditLogOutputStream stream) {
143 this.stream = stream;
144 }
145
146 JournalManager getManager() {
147 return journal;
148 }
149
150 boolean isDisabled() {
151 return disabled;
152 }
153
154 private void setDisabled(boolean disabled) {
155 this.disabled = disabled;
156 }
157
158 @Override
159 public boolean isResourceAvailable() {
160 return !isDisabled();
161 }
162
163 @Override
164 public boolean isRequired() {
165 return required;
166 }
167 }
168
// A copy-on-write list is needed because some callers (e.g. the web UI)
// obtain the list through getAllJournalStreams() and then iterate over it.
// The list is rarely mutated, so this raises no performance concern.
private List<JournalAndStream> journals =
    new CopyOnWriteArrayList<JournalAndStream>();
final int minimumRedundantJournals;

/**
 * Creates an empty journal set.
 *
 * @param minimumRedundantResources value stored as
 *        minimumRedundantJournals and later handed to
 *        NameNodeResourcePolicy together with the journal list
 */
JournalSet(int minimumRedundantResources) {
  minimumRedundantJournals = minimumRedundantResources;
}
179
180 @Override
181 public void format(NamespaceInfo nsInfo) throws IOException {
182 // The iteration is done by FSEditLog itself
183 throw new UnsupportedOperationException();
184 }
185
186 @Override
187 public boolean hasSomeData() throws IOException {
188 // This is called individually on the underlying journals,
189 // not on the JournalSet.
190 throw new UnsupportedOperationException();
191 }
192
193
194 @Override
195 public EditLogOutputStream startLogSegment(final long txId) throws IOException {
196 mapJournalsAndReportErrors(new JournalClosure() {
197 @Override
198 public void apply(JournalAndStream jas) throws IOException {
199 jas.startLogSegment(txId);
200 }
201 }, "starting log segment " + txId);
202 return new JournalSetOutputStream();
203 }
204
205 @Override
206 public void finalizeLogSegment(final long firstTxId, final long lastTxId)
207 throws IOException {
208 mapJournalsAndReportErrors(new JournalClosure() {
209 @Override
210 public void apply(JournalAndStream jas) throws IOException {
211 if (jas.isActive()) {
212 jas.closeStream();
213 jas.getManager().finalizeLogSegment(firstTxId, lastTxId);
214 }
215 }
216 }, "finalize log segment " + firstTxId + ", " + lastTxId);
217 }
218
219 @Override
220 public void close() throws IOException {
221 mapJournalsAndReportErrors(new JournalClosure() {
222 @Override
223 public void apply(JournalAndStream jas) throws IOException {
224 jas.close();
225 }
226 }, "close journal");
227 }
228
229 /**
230 * In this function, we get a bunch of streams from all of our JournalManager
231 * objects. Then we add these to the collection one by one.
232 *
233 * @param streams The collection to add the streams to. It may or
234 * may not be sorted-- this is up to the caller.
235 * @param fromTxId The transaction ID to start looking for streams at
236 * @param inProgressOk Should we consider unfinalized streams?
237 */
238 @Override
239 public void selectInputStreams(Collection<EditLogInputStream> streams,
240 long fromTxId, boolean inProgressOk) throws IOException {
241 final PriorityQueue<EditLogInputStream> allStreams =
242 new PriorityQueue<EditLogInputStream>(64,
243 EDIT_LOG_INPUT_STREAM_COMPARATOR);
244 for (JournalAndStream jas : journals) {
245 if (jas.isDisabled()) {
246 LOG.info("Skipping jas " + jas + " since it's disabled");
247 continue;
248 }
249 try {
250 jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk);
251 } catch (IOException ioe) {
252 LOG.warn("Unable to determine input streams from " + jas.getManager() +
253 ". Skipping.", ioe);
254 }
255 }
256 chainAndMakeRedundantStreams(streams, allStreams, fromTxId);
257 }
258
259 public static void chainAndMakeRedundantStreams(
260 Collection<EditLogInputStream> outStreams,
261 PriorityQueue<EditLogInputStream> allStreams, long fromTxId) {
262 // We want to group together all the streams that start on the same start
263 // transaction ID. To do this, we maintain an accumulator (acc) of all
264 // the streams we've seen at a given start transaction ID. When we see a
265 // higher start transaction ID, we select a stream from the accumulator and
266 // clear it. Then we begin accumulating streams with the new, higher start
267 // transaction ID.
268 LinkedList<EditLogInputStream> acc =
269 new LinkedList<EditLogInputStream>();
270 EditLogInputStream elis;
271 while ((elis = allStreams.poll()) != null) {
272 if (acc.isEmpty()) {
273 acc.add(elis);
274 } else {
275 long accFirstTxId = acc.get(0).getFirstTxId();
276 if (accFirstTxId == elis.getFirstTxId()) {
277 acc.add(elis);
278 } else if (accFirstTxId < elis.getFirstTxId()) {
279 outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
280 acc.clear();
281 acc.add(elis);
282 } else if (accFirstTxId > elis.getFirstTxId()) {
283 throw new RuntimeException("sorted set invariants violated! " +
284 "Got stream with first txid " + elis.getFirstTxId() +
285 ", but the last firstTxId was " + accFirstTxId);
286 }
287 }
288 }
289 if (!acc.isEmpty()) {
290 outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
291 acc.clear();
292 }
293 }
294
295 /**
296 * Returns true if there are no journals, all redundant journals are disabled,
297 * or any required journals are disabled.
298 *
299 * @return True if there no journals, all redundant journals are disabled,
300 * or any required journals are disabled.
301 */
302 public boolean isEmpty() {
303 return !NameNodeResourcePolicy.areResourcesAvailable(journals,
304 minimumRedundantJournals);
305 }
306
307 /**
308 * Called when some journals experience an error in some operation.
309 */
310 private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
311 if (badJournals == null || badJournals.isEmpty()) {
312 return; // nothing to do
313 }
314
315 for (JournalAndStream j : badJournals) {
316 LOG.error("Disabling journal " + j);
317 j.abort();
318 j.setDisabled(true);
319 }
320 }
321
322 /**
323 * Implementations of this interface encapsulate operations that can be
324 * iteratively applied on all the journals. For example see
325 * {@link JournalSet#mapJournalsAndReportErrors}.
326 */
327 private interface JournalClosure {
328 /**
329 * The operation on JournalAndStream.
330 * @param jas Object on which operations are performed.
331 * @throws IOException
332 */
333 public void apply(JournalAndStream jas) throws IOException;
334 }
335
336 /**
337 * Apply the given operation across all of the journal managers, disabling
338 * any for which the closure throws an IOException.
339 * @param closure {@link JournalClosure} object encapsulating the operation.
340 * @param status message used for logging errors (e.g. "opening journal")
341 * @throws IOException If the operation fails on all the journals.
342 */
343 private void mapJournalsAndReportErrors(
344 JournalClosure closure, String status) throws IOException{
345
346 List<JournalAndStream> badJAS = Lists.newLinkedList();
347 for (JournalAndStream jas : journals) {
348 try {
349 closure.apply(jas);
350 } catch (Throwable t) {
351 if (jas.isRequired()) {
352 final String msg = "Error: " + status + " failed for required journal ("
353 + jas + ")";
354 LOG.fatal(msg, t);
355 // If we fail on *any* of the required journals, then we must not
356 // continue on any of the other journals. Abort them to ensure that
357 // retry behavior doesn't allow them to keep going in any way.
358 abortAllJournals();
359 // the current policy is to shutdown the NN on errors to shared edits
360 // dir. There are many code paths to shared edits failures - syncs,
361 // roll of edits etc. All of them go through this common function
362 // where the isRequired() check is made. Applying exit policy here
363 // to catch all code paths.
364 terminate(1, msg);
365 } else {
366 LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
367 badJAS.add(jas);
368 }
369 }
370 }
371 disableAndReportErrorOnJournals(badJAS);
372 if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
373 minimumRedundantJournals)) {
374 String message = status + " failed for too many journals";
375 LOG.error("Error: " + message);
376 throw new IOException(message);
377 }
378 }
379
380 /**
381 * Abort all of the underlying streams.
382 */
383 private void abortAllJournals() {
384 for (JournalAndStream jas : journals) {
385 if (jas.isActive()) {
386 jas.abort();
387 }
388 }
389 }
390
391 /**
392 * An implementation of EditLogOutputStream that applies a requested method on
393 * all the journals that are currently active.
394 */
395 private class JournalSetOutputStream extends EditLogOutputStream {
396
397 JournalSetOutputStream() throws IOException {
398 super();
399 }
400
401 @Override
402 public void write(final FSEditLogOp op)
403 throws IOException {
404 mapJournalsAndReportErrors(new JournalClosure() {
405 @Override
406 public void apply(JournalAndStream jas) throws IOException {
407 if (jas.isActive()) {
408 jas.getCurrentStream().write(op);
409 }
410 }
411 }, "write op");
412 }
413
414 @Override
415 public void writeRaw(final byte[] data, final int offset, final int length)
416 throws IOException {
417 mapJournalsAndReportErrors(new JournalClosure() {
418 @Override
419 public void apply(JournalAndStream jas) throws IOException {
420 if (jas.isActive()) {
421 jas.getCurrentStream().writeRaw(data, offset, length);
422 }
423 }
424 }, "write bytes");
425 }
426
427 @Override
428 public void create() throws IOException {
429 mapJournalsAndReportErrors(new JournalClosure() {
430 @Override
431 public void apply(JournalAndStream jas) throws IOException {
432 if (jas.isActive()) {
433 jas.getCurrentStream().create();
434 }
435 }
436 }, "create");
437 }
438
439 @Override
440 public void close() throws IOException {
441 mapJournalsAndReportErrors(new JournalClosure() {
442 @Override
443 public void apply(JournalAndStream jas) throws IOException {
444 jas.closeStream();
445 }
446 }, "close");
447 }
448
449 @Override
450 public void abort() throws IOException {
451 mapJournalsAndReportErrors(new JournalClosure() {
452 @Override
453 public void apply(JournalAndStream jas) throws IOException {
454 jas.abort();
455 }
456 }, "abort");
457 }
458
459 @Override
460 public void setReadyToFlush() throws IOException {
461 mapJournalsAndReportErrors(new JournalClosure() {
462 @Override
463 public void apply(JournalAndStream jas) throws IOException {
464 if (jas.isActive()) {
465 jas.getCurrentStream().setReadyToFlush();
466 }
467 }
468 }, "setReadyToFlush");
469 }
470
471 @Override
472 protected void flushAndSync(final boolean durable) throws IOException {
473 mapJournalsAndReportErrors(new JournalClosure() {
474 @Override
475 public void apply(JournalAndStream jas) throws IOException {
476 if (jas.isActive()) {
477 jas.getCurrentStream().flushAndSync(durable);
478 }
479 }
480 }, "flushAndSync");
481 }
482
483 @Override
484 public void flush() throws IOException {
485 mapJournalsAndReportErrors(new JournalClosure() {
486 @Override
487 public void apply(JournalAndStream jas) throws IOException {
488 if (jas.isActive()) {
489 jas.getCurrentStream().flush();
490 }
491 }
492 }, "flush");
493 }
494
495 @Override
496 public boolean shouldForceSync() {
497 for (JournalAndStream js : journals) {
498 if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
499 return true;
500 }
501 }
502 return false;
503 }
504
505 @Override
506 protected long getNumSync() {
507 for (JournalAndStream jas : journals) {
508 if (jas.isActive()) {
509 return jas.getCurrentStream().getNumSync();
510 }
511 }
512 return 0;
513 }
514 }
515
516 @Override
517 public void setOutputBufferCapacity(final int size) {
518 try {
519 mapJournalsAndReportErrors(new JournalClosure() {
520 @Override
521 public void apply(JournalAndStream jas) throws IOException {
522 jas.getManager().setOutputBufferCapacity(size);
523 }
524 }, "setOutputBufferCapacity");
525 } catch (IOException e) {
526 LOG.error("Error in setting outputbuffer capacity");
527 }
528 }
529
530 List<JournalAndStream> getAllJournalStreams() {
531 return journals;
532 }
533
534 List<JournalManager> getJournalManagers() {
535 List<JournalManager> jList = new ArrayList<JournalManager>();
536 for (JournalAndStream j : journals) {
537 jList.add(j.getManager());
538 }
539 return jList;
540 }
541
542 void add(JournalManager j, boolean required) {
543 JournalAndStream jas = new JournalAndStream(j, required);
544 journals.add(jas);
545 }
546
547 void remove(JournalManager j) {
548 JournalAndStream jasToRemove = null;
549 for (JournalAndStream jas: journals) {
550 if (jas.getManager().equals(j)) {
551 jasToRemove = jas;
552 break;
553 }
554 }
555 if (jasToRemove != null) {
556 jasToRemove.abort();
557 journals.remove(jasToRemove);
558 }
559 }
560
561 @Override
562 public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException {
563 mapJournalsAndReportErrors(new JournalClosure() {
564 @Override
565 public void apply(JournalAndStream jas) throws IOException {
566 jas.getManager().purgeLogsOlderThan(minTxIdToKeep);
567 }
568 }, "purgeLogsOlderThan " + minTxIdToKeep);
569 }
570
571 @Override
572 public void recoverUnfinalizedSegments() throws IOException {
573 mapJournalsAndReportErrors(new JournalClosure() {
574 @Override
575 public void apply(JournalAndStream jas) throws IOException {
576 jas.getManager().recoverUnfinalizedSegments();
577 }
578 }, "recoverUnfinalizedSegments");
579 }
580
581 /**
582 * Return a manifest of what finalized edit logs are available. All available
583 * edit logs are returned starting from the transaction id passed. If
584 * 'fromTxId' falls in the middle of a log, that log is returned as well.
585 *
586 * @param fromTxId Starting transaction id to read the logs.
587 * @return RemoteEditLogManifest object.
588 */
589 public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId) {
590 // Collect RemoteEditLogs available from each FileJournalManager
591 List<RemoteEditLog> allLogs = Lists.newArrayList();
592 for (JournalAndStream j : journals) {
593 if (j.getManager() instanceof FileJournalManager) {
594 FileJournalManager fjm = (FileJournalManager)j.getManager();
595 try {
596 allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, false));
597 } catch (Throwable t) {
598 LOG.warn("Cannot list edit logs in " + fjm, t);
599 }
600 }
601 }
602
603 // Group logs by their starting txid
604 ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId =
605 Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID);
606 long curStartTxId = fromTxId;
607
608 List<RemoteEditLog> logs = Lists.newArrayList();
609 while (true) {
610 ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId);
611 if (logGroup.isEmpty()) {
612 // we have a gap in logs - for example because we recovered some old
613 // storage directory with ancient logs. Clear out any logs we've
614 // accumulated so far, and then skip to the next segment of logs
615 // after the gap.
616 SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet());
617 startTxIds = startTxIds.tailSet(curStartTxId);
618 if (startTxIds.isEmpty()) {
619 break;
620 } else {
621 if (LOG.isDebugEnabled()) {
622 LOG.debug("Found gap in logs at " + curStartTxId + ": " +
623 "not returning previous logs in manifest.");
624 }
625 logs.clear();
626 curStartTxId = startTxIds.first();
627 continue;
628 }
629 }
630
631 // Find the one that extends the farthest forward
632 RemoteEditLog bestLog = Collections.max(logGroup);
633 logs.add(bestLog);
634 // And then start looking from after that point
635 curStartTxId = bestLog.getEndTxId() + 1;
636 }
637 RemoteEditLogManifest ret = new RemoteEditLogManifest(logs);
638
639 if (LOG.isDebugEnabled()) {
640 LOG.debug("Generated manifest for logs since " + fromTxId + ":"
641 + ret);
642 }
643 return ret;
644 }
645
646 /**
647 * Add sync times to the buffer.
648 */
649 String getSyncTimes() {
650 StringBuilder buf = new StringBuilder();
651 for (JournalAndStream jas : journals) {
652 if (jas.isActive()) {
653 buf.append(jas.getCurrentStream().getTotalSyncTime());
654 buf.append(" ");
655 }
656 }
657 return buf.toString();
658 }
659 }