001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.server.namenode;
019    
020    import static org.apache.hadoop.util.ExitUtil.terminate;
021    
022    import java.io.IOException;
023    import java.util.ArrayList;
024    import java.util.Collection;
025    import java.util.Collections;
026    import java.util.Comparator;
027    import java.util.LinkedList;
028    import java.util.List;
029    import java.util.PriorityQueue;
030    import java.util.SortedSet;
031    import java.util.concurrent.CopyOnWriteArrayList;
032    
033    import org.apache.commons.logging.Log;
034    import org.apache.commons.logging.LogFactory;
035    import org.apache.hadoop.classification.InterfaceAudience;
036    import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
037    import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
038    import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
039    
040    import static org.apache.hadoop.util.ExitUtil.terminate;
041    
042    import com.google.common.base.Preconditions;
043    import com.google.common.collect.ComparisonChain;
044    import com.google.common.collect.ImmutableList;
045    import com.google.common.collect.ImmutableListMultimap;
046    import com.google.common.collect.Lists;
047    import com.google.common.collect.Multimaps;
048    import com.google.common.collect.Sets;
049    
050    /**
051     * Manages a collection of Journals. None of the methods are synchronized, it is
052     * assumed that FSEditLog methods, that use this class, use proper
053     * synchronization.
054     */
055    public class JournalSet implements JournalManager {
056    
  // NOTE(review): deliberately obtains FSEditLog's logger rather than
  // JournalSet's own, so journal errors show up under the edit-log
  // category -- confirm this is intentional before "fixing" it.
  static final Log LOG = LogFactory.getLog(FSEditLog.class);
  
  /**
   * Orders edit log input streams by ascending first txid; among streams
   * with the same first txid, the one reaching the highest last txid
   * (i.e. covering the most transactions) sorts first.
   */
  static final public Comparator<EditLogInputStream>
    EDIT_LOG_INPUT_STREAM_COMPARATOR = new Comparator<EditLogInputStream>() {
      @Override
      public int compare(EditLogInputStream a, EditLogInputStream b) {
        // Ascending on first txid, descending on last txid (note the
        // swapped a/b in the second comparison).
        return ComparisonChain.start().
          compare(a.getFirstTxId(), b.getFirstTxId()).
          compare(b.getLastTxId(), a.getLastTxId()).
          result();
      }
    };
069      
070      /**
071       * Container for a JournalManager paired with its currently
072       * active stream.
073       * 
074       * If a Journal gets disabled due to an error writing to its
075       * stream, then the stream will be aborted and set to null.
076       */
077      static class JournalAndStream implements CheckableNameNodeResource {
078        private final JournalManager journal;
079        private boolean disabled = false;
080        private EditLogOutputStream stream;
081        private boolean required = false;
082        
083        public JournalAndStream(JournalManager manager, boolean required) {
084          this.journal = manager;
085          this.required = required;
086        }
087    
088        public void startLogSegment(long txId) throws IOException {
089          Preconditions.checkState(stream == null);
090          disabled = false;
091          stream = journal.startLogSegment(txId);
092        }
093    
094        /**
095         * Closes the stream, also sets it to null.
096         */
097        public void closeStream() throws IOException {
098          if (stream == null) return;
099          stream.close();
100          stream = null;
101        }
102    
103        /**
104         * Close the Journal and Stream
105         */
106        public void close() throws IOException {
107          closeStream();
108    
109          journal.close();
110        }
111        
112        /**
113         * Aborts the stream, also sets it to null.
114         */
115        public void abort() {
116          if (stream == null) return;
117          try {
118            stream.abort();
119          } catch (IOException ioe) {
120            LOG.error("Unable to abort stream " + stream, ioe);
121          }
122          stream = null;
123        }
124    
125        boolean isActive() {
126          return stream != null;
127        }
128        
129        /**
130         * Should be used outside JournalSet only for testing.
131         */
132        EditLogOutputStream getCurrentStream() {
133          return stream;
134        }
135        
136        @Override
137        public String toString() {
138          return "JournalAndStream(mgr=" + journal +
139            ", " + "stream=" + stream + ")";
140        }
141    
142        void setCurrentStreamForTests(EditLogOutputStream stream) {
143          this.stream = stream;
144        }
145        
146        JournalManager getManager() {
147          return journal;
148        }
149    
150        boolean isDisabled() {
151          return disabled;
152        }
153    
154        private void setDisabled(boolean disabled) {
155          this.disabled = disabled;
156        }
157        
158        @Override
159        public boolean isResourceAvailable() {
160          return !isDisabled();
161        }
162        
163        @Override
164        public boolean isRequired() {
165          return required;
166        }
167      }
168     
  // COW implementation is necessary since some users (eg the web ui) call
  // getAllJournalStreams() and then iterate. Since this is rarely
  // mutated, there is no performance concern.
  private List<JournalAndStream> journals =
      new CopyOnWriteArrayList<JournalSet.JournalAndStream>();
  // Minimum number of redundant (non-required) journals that must remain
  // available; checked via NameNodeResourcePolicy in isEmpty() and
  // mapJournalsAndReportErrors().
  final int minimumRedundantJournals;
  
  JournalSet(int minimumRedundantResources) {
    this.minimumRedundantJournals = minimumRedundantResources;
  }
179      
  /**
   * Not supported on the set as a whole; FSEditLog formats each
   * underlying journal individually.
   */
  @Override
  public void format(NamespaceInfo nsInfo) throws IOException {
    // The iteration is done by FSEditLog itself
    throw new UnsupportedOperationException();
  }
185    
  /**
   * Not supported on the set as a whole; callers query each underlying
   * journal individually.
   */
  @Override
  public boolean hasSomeData() throws IOException {
    // This is called individually on the underlying journals,
    // not on the JournalSet.
    throw new UnsupportedOperationException();
  }
192    
193      
194      @Override
195      public EditLogOutputStream startLogSegment(final long txId) throws IOException {
196        mapJournalsAndReportErrors(new JournalClosure() {
197          @Override
198          public void apply(JournalAndStream jas) throws IOException {
199            jas.startLogSegment(txId);
200          }
201        }, "starting log segment " + txId);
202        return new JournalSetOutputStream();
203      }
204      
205      @Override
206      public void finalizeLogSegment(final long firstTxId, final long lastTxId)
207          throws IOException {
208        mapJournalsAndReportErrors(new JournalClosure() {
209          @Override
210          public void apply(JournalAndStream jas) throws IOException {
211            if (jas.isActive()) {
212              jas.closeStream();
213              jas.getManager().finalizeLogSegment(firstTxId, lastTxId);
214            }
215          }
216        }, "finalize log segment " + firstTxId + ", " + lastTxId);
217      }
218       
219      @Override
220      public void close() throws IOException {
221        mapJournalsAndReportErrors(new JournalClosure() {
222          @Override
223          public void apply(JournalAndStream jas) throws IOException {
224            jas.close();
225          }
226        }, "close journal");
227      }
228    
229      /**
230       * In this function, we get a bunch of streams from all of our JournalManager
231       * objects.  Then we add these to the collection one by one.
232       * 
233       * @param streams          The collection to add the streams to.  It may or 
234       *                         may not be sorted-- this is up to the caller.
235       * @param fromTxId         The transaction ID to start looking for streams at
236       * @param inProgressOk     Should we consider unfinalized streams?
237       */
238      @Override
239      public void selectInputStreams(Collection<EditLogInputStream> streams,
240          long fromTxId, boolean inProgressOk) throws IOException {
241        final PriorityQueue<EditLogInputStream> allStreams = 
242            new PriorityQueue<EditLogInputStream>(64,
243                EDIT_LOG_INPUT_STREAM_COMPARATOR);
244        for (JournalAndStream jas : journals) {
245          if (jas.isDisabled()) {
246            LOG.info("Skipping jas " + jas + " since it's disabled");
247            continue;
248          }
249          try {
250            jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk);
251          } catch (IOException ioe) {
252            LOG.warn("Unable to determine input streams from " + jas.getManager() +
253                ". Skipping.", ioe);
254          }
255        }
256        chainAndMakeRedundantStreams(streams, allStreams, fromTxId);
257      }
258      
259      public static void chainAndMakeRedundantStreams(
260          Collection<EditLogInputStream> outStreams,
261          PriorityQueue<EditLogInputStream> allStreams, long fromTxId) {
262        // We want to group together all the streams that start on the same start
263        // transaction ID.  To do this, we maintain an accumulator (acc) of all
264        // the streams we've seen at a given start transaction ID.  When we see a
265        // higher start transaction ID, we select a stream from the accumulator and
266        // clear it.  Then we begin accumulating streams with the new, higher start
267        // transaction ID.
268        LinkedList<EditLogInputStream> acc =
269            new LinkedList<EditLogInputStream>();
270        EditLogInputStream elis;
271        while ((elis = allStreams.poll()) != null) {
272          if (acc.isEmpty()) {
273            acc.add(elis);
274          } else {
275            long accFirstTxId = acc.get(0).getFirstTxId();
276            if (accFirstTxId == elis.getFirstTxId()) {
277              acc.add(elis);
278            } else if (accFirstTxId < elis.getFirstTxId()) {
279              outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
280              acc.clear();
281              acc.add(elis);
282            } else if (accFirstTxId > elis.getFirstTxId()) {
283              throw new RuntimeException("sorted set invariants violated!  " +
284                  "Got stream with first txid " + elis.getFirstTxId() +
285                  ", but the last firstTxId was " + accFirstTxId);
286            }
287          }
288        }
289        if (!acc.isEmpty()) {
290          outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
291          acc.clear();
292        }
293      }
294    
295      /**
296       * Returns true if there are no journals, all redundant journals are disabled,
297       * or any required journals are disabled.
298       * 
299       * @return True if there no journals, all redundant journals are disabled,
300       * or any required journals are disabled.
301       */
302      public boolean isEmpty() {
303        return !NameNodeResourcePolicy.areResourcesAvailable(journals,
304            minimumRedundantJournals);
305      }
306      
307      /**
308       * Called when some journals experience an error in some operation.
309       */
310      private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
311        if (badJournals == null || badJournals.isEmpty()) {
312          return; // nothing to do
313        }
314     
315        for (JournalAndStream j : badJournals) {
316          LOG.error("Disabling journal " + j);
317          j.abort();
318          j.setDisabled(true);
319        }
320      }
321    
322      /**
323       * Implementations of this interface encapsulate operations that can be
324       * iteratively applied on all the journals. For example see
325       * {@link JournalSet#mapJournalsAndReportErrors}.
326       */
327      private interface JournalClosure {
328        /**
329         * The operation on JournalAndStream.
330         * @param jas Object on which operations are performed.
331         * @throws IOException
332         */
333        public void apply(JournalAndStream jas) throws IOException;
334      }
335      
336      /**
337       * Apply the given operation across all of the journal managers, disabling
338       * any for which the closure throws an IOException.
339       * @param closure {@link JournalClosure} object encapsulating the operation.
340       * @param status message used for logging errors (e.g. "opening journal")
341       * @throws IOException If the operation fails on all the journals.
342       */
343      private void mapJournalsAndReportErrors(
344          JournalClosure closure, String status) throws IOException{
345    
346        List<JournalAndStream> badJAS = Lists.newLinkedList();
347        for (JournalAndStream jas : journals) {
348          try {
349            closure.apply(jas);
350          } catch (Throwable t) {
351            if (jas.isRequired()) {
352              final String msg = "Error: " + status + " failed for required journal ("
353                + jas + ")";
354              LOG.fatal(msg, t);
355              // If we fail on *any* of the required journals, then we must not
356              // continue on any of the other journals. Abort them to ensure that
357              // retry behavior doesn't allow them to keep going in any way.
358              abortAllJournals();
359              // the current policy is to shutdown the NN on errors to shared edits
360              // dir. There are many code paths to shared edits failures - syncs,
361              // roll of edits etc. All of them go through this common function 
362              // where the isRequired() check is made. Applying exit policy here 
363              // to catch all code paths.
364              terminate(1, msg);
365            } else {
366              LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
367              badJAS.add(jas);          
368            }
369          }
370        }
371        disableAndReportErrorOnJournals(badJAS);
372        if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
373            minimumRedundantJournals)) {
374          String message = status + " failed for too many journals";
375          LOG.error("Error: " + message);
376          throw new IOException(message);
377        }
378      }
379      
380      /**
381       * Abort all of the underlying streams.
382       */
383      private void abortAllJournals() {
384        for (JournalAndStream jas : journals) {
385          if (jas.isActive()) {
386            jas.abort();
387          }
388        }
389      }
390    
391      /**
392       * An implementation of EditLogOutputStream that applies a requested method on
393       * all the journals that are currently active.
394       */
395      private class JournalSetOutputStream extends EditLogOutputStream {
396    
397        JournalSetOutputStream() throws IOException {
398          super();
399        }
400    
401        @Override
402        public void write(final FSEditLogOp op)
403            throws IOException {
404          mapJournalsAndReportErrors(new JournalClosure() {
405            @Override
406            public void apply(JournalAndStream jas) throws IOException {
407              if (jas.isActive()) {
408                jas.getCurrentStream().write(op);
409              }
410            }
411          }, "write op");
412        }
413    
414        @Override
415        public void writeRaw(final byte[] data, final int offset, final int length)
416            throws IOException {
417          mapJournalsAndReportErrors(new JournalClosure() {
418            @Override
419            public void apply(JournalAndStream jas) throws IOException {
420              if (jas.isActive()) {
421                jas.getCurrentStream().writeRaw(data, offset, length);
422              }
423            }
424          }, "write bytes");
425        }
426    
427        @Override
428        public void create() throws IOException {
429          mapJournalsAndReportErrors(new JournalClosure() {
430            @Override
431            public void apply(JournalAndStream jas) throws IOException {
432              if (jas.isActive()) {
433                jas.getCurrentStream().create();
434              }
435            }
436          }, "create");
437        }
438    
439        @Override
440        public void close() throws IOException {
441          mapJournalsAndReportErrors(new JournalClosure() {
442            @Override
443            public void apply(JournalAndStream jas) throws IOException {
444              jas.closeStream();
445            }
446          }, "close");
447        }
448    
449        @Override
450        public void abort() throws IOException {
451          mapJournalsAndReportErrors(new JournalClosure() {
452            @Override
453            public void apply(JournalAndStream jas) throws IOException {
454              jas.abort();
455            }
456          }, "abort");
457        }
458    
459        @Override
460        public void setReadyToFlush() throws IOException {
461          mapJournalsAndReportErrors(new JournalClosure() {
462            @Override
463            public void apply(JournalAndStream jas) throws IOException {
464              if (jas.isActive()) {
465                jas.getCurrentStream().setReadyToFlush();
466              }
467            }
468          }, "setReadyToFlush");
469        }
470    
471        @Override
472        protected void flushAndSync(final boolean durable) throws IOException {
473          mapJournalsAndReportErrors(new JournalClosure() {
474            @Override
475            public void apply(JournalAndStream jas) throws IOException {
476              if (jas.isActive()) {
477                jas.getCurrentStream().flushAndSync(durable);
478              }
479            }
480          }, "flushAndSync");
481        }
482        
483        @Override
484        public void flush() throws IOException {
485          mapJournalsAndReportErrors(new JournalClosure() {
486            @Override
487            public void apply(JournalAndStream jas) throws IOException {
488              if (jas.isActive()) {
489                jas.getCurrentStream().flush();
490              }
491            }
492          }, "flush");
493        }
494        
495        @Override
496        public boolean shouldForceSync() {
497          for (JournalAndStream js : journals) {
498            if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
499              return true;
500            }
501          }
502          return false;
503        }
504        
505        @Override
506        protected long getNumSync() {
507          for (JournalAndStream jas : journals) {
508            if (jas.isActive()) {
509              return jas.getCurrentStream().getNumSync();
510            }
511          }
512          return 0;
513        }
514      }
515    
516      @Override
517      public void setOutputBufferCapacity(final int size) {
518        try {
519          mapJournalsAndReportErrors(new JournalClosure() {
520            @Override
521            public void apply(JournalAndStream jas) throws IOException {
522                jas.getManager().setOutputBufferCapacity(size);
523            }
524          }, "setOutputBufferCapacity");
525        } catch (IOException e) {
526          LOG.error("Error in setting outputbuffer capacity");
527        }
528      }
529      
  /**
   * Returns the live internal journal list (a CopyOnWriteArrayList, so it
   * is safe to iterate), not a defensive copy; callers are expected to
   * only read from it.
   */
  List<JournalAndStream> getAllJournalStreams() {
    return journals;
  }
533    
534      List<JournalManager> getJournalManagers() {
535        List<JournalManager> jList = new ArrayList<JournalManager>();
536        for (JournalAndStream j : journals) {
537          jList.add(j.getManager());
538        }
539        return jList;
540      }
541    
542      void add(JournalManager j, boolean required) {
543        JournalAndStream jas = new JournalAndStream(j, required);
544        journals.add(jas);
545      }
546      
547      void remove(JournalManager j) {
548        JournalAndStream jasToRemove = null;
549        for (JournalAndStream jas: journals) {
550          if (jas.getManager().equals(j)) {
551            jasToRemove = jas;
552            break;
553          }
554        }
555        if (jasToRemove != null) {
556          jasToRemove.abort();
557          journals.remove(jasToRemove);
558        }
559      }
560    
561      @Override
562      public void purgeLogsOlderThan(final long minTxIdToKeep) throws IOException {
563        mapJournalsAndReportErrors(new JournalClosure() {
564          @Override
565          public void apply(JournalAndStream jas) throws IOException {
566            jas.getManager().purgeLogsOlderThan(minTxIdToKeep);
567          }
568        }, "purgeLogsOlderThan " + minTxIdToKeep);
569      }
570    
571      @Override
572      public void recoverUnfinalizedSegments() throws IOException {
573        mapJournalsAndReportErrors(new JournalClosure() {
574          @Override
575          public void apply(JournalAndStream jas) throws IOException {
576            jas.getManager().recoverUnfinalizedSegments();
577          }
578        }, "recoverUnfinalizedSegments");
579      }
580      
581      /**
582       * Return a manifest of what finalized edit logs are available. All available
583       * edit logs are returned starting from the transaction id passed. If
584       * 'fromTxId' falls in the middle of a log, that log is returned as well.
585       * 
586       * @param fromTxId Starting transaction id to read the logs.
587       * @return RemoteEditLogManifest object.
588       */
589      public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId) {
590        // Collect RemoteEditLogs available from each FileJournalManager
591        List<RemoteEditLog> allLogs = Lists.newArrayList();
592        for (JournalAndStream j : journals) {
593          if (j.getManager() instanceof FileJournalManager) {
594            FileJournalManager fjm = (FileJournalManager)j.getManager();
595            try {
596              allLogs.addAll(fjm.getRemoteEditLogs(fromTxId, false));
597            } catch (Throwable t) {
598              LOG.warn("Cannot list edit logs in " + fjm, t);
599            }
600          }
601        }
602        
603        // Group logs by their starting txid
604        ImmutableListMultimap<Long, RemoteEditLog> logsByStartTxId =
605          Multimaps.index(allLogs, RemoteEditLog.GET_START_TXID);
606        long curStartTxId = fromTxId;
607    
608        List<RemoteEditLog> logs = Lists.newArrayList();
609        while (true) {
610          ImmutableList<RemoteEditLog> logGroup = logsByStartTxId.get(curStartTxId);
611          if (logGroup.isEmpty()) {
612            // we have a gap in logs - for example because we recovered some old
613            // storage directory with ancient logs. Clear out any logs we've
614            // accumulated so far, and then skip to the next segment of logs
615            // after the gap.
616            SortedSet<Long> startTxIds = Sets.newTreeSet(logsByStartTxId.keySet());
617            startTxIds = startTxIds.tailSet(curStartTxId);
618            if (startTxIds.isEmpty()) {
619              break;
620            } else {
621              if (LOG.isDebugEnabled()) {
622                LOG.debug("Found gap in logs at " + curStartTxId + ": " +
623                    "not returning previous logs in manifest.");
624              }
625              logs.clear();
626              curStartTxId = startTxIds.first();
627              continue;
628            }
629          }
630    
631          // Find the one that extends the farthest forward
632          RemoteEditLog bestLog = Collections.max(logGroup);
633          logs.add(bestLog);
634          // And then start looking from after that point
635          curStartTxId = bestLog.getEndTxId() + 1;
636        }
637        RemoteEditLogManifest ret = new RemoteEditLogManifest(logs);
638        
639        if (LOG.isDebugEnabled()) {
640          LOG.debug("Generated manifest for logs since " + fromTxId + ":"
641              + ret);      
642        }
643        return ret;
644      }
645    
646      /**
647       * Add sync times to the buffer.
648       */
649      String getSyncTimes() {
650        StringBuilder buf = new StringBuilder();
651        for (JournalAndStream jas : journals) {
652          if (jas.isActive()) {
653            buf.append(jas.getCurrentStream().getTotalSyncTime());
654            buf.append(" ");
655          }
656        }
657        return buf.toString();
658      }
659    }