001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.io;
020
021 import java.io.*;
022 import java.util.*;
023 import java.rmi.server.UID;
024 import java.security.MessageDigest;
025
026 import org.apache.commons.logging.*;
027 import org.apache.hadoop.util.Options;
028 import org.apache.hadoop.fs.*;
029 import org.apache.hadoop.fs.FSDataInputStream.FadviseType;
030 import org.apache.hadoop.fs.Options.CreateOpts;
031 import org.apache.hadoop.io.compress.CodecPool;
032 import org.apache.hadoop.io.compress.CompressionCodec;
033 import org.apache.hadoop.io.compress.CompressionInputStream;
034 import org.apache.hadoop.io.compress.CompressionOutputStream;
035 import org.apache.hadoop.io.compress.Compressor;
036 import org.apache.hadoop.io.compress.Decompressor;
037 import org.apache.hadoop.io.compress.DefaultCodec;
038 import org.apache.hadoop.io.compress.GzipCodec;
039 import org.apache.hadoop.io.compress.zlib.ZlibFactory;
040 import org.apache.hadoop.io.serializer.Deserializer;
041 import org.apache.hadoop.io.serializer.Serializer;
042 import org.apache.hadoop.io.serializer.SerializationFactory;
043 import org.apache.hadoop.classification.InterfaceAudience;
044 import org.apache.hadoop.classification.InterfaceStability;
045 import org.apache.hadoop.conf.*;
046 import org.apache.hadoop.util.Progressable;
047 import org.apache.hadoop.util.Progress;
048 import org.apache.hadoop.util.ReflectionUtils;
049 import org.apache.hadoop.util.NativeCodeLoader;
050 import org.apache.hadoop.util.MergeSort;
051 import org.apache.hadoop.util.PriorityQueue;
052 import org.apache.hadoop.util.Time;
053
054 /**
055 * <code>SequenceFile</code>s are flat files consisting of binary key/value
056 * pairs.
057 *
058 * <p><code>SequenceFile</code> provides {@link Writer}, {@link Reader} and
059 * {@link Sorter} classes for writing, reading and sorting respectively.</p>
060 *
061 * There are three <code>SequenceFile</code> <code>Writer</code>s based on the
062 * {@link CompressionType} used to compress key/value pairs:
063 * <ol>
064 * <li>
065 * <code>Writer</code> : Uncompressed records.
066 * </li>
067 * <li>
068 * <code>RecordCompressWriter</code> : Record-compressed files, only compress
069 * values.
070 * </li>
071 * <li>
 *   <code>BlockCompressWriter</code> : Block-compressed files, both keys &amp;
073 * values are collected in 'blocks'
074 * separately and compressed. The size of
075 * the 'block' is configurable.
076 * </ol>
077 *
078 * <p>The actual compression algorithm used to compress key and/or values can be
079 * specified by using the appropriate {@link CompressionCodec}.</p>
080 *
081 * <p>The recommended way is to use the static <tt>createWriter</tt> methods
 * provided by the <code>SequenceFile</code> to choose the preferred format.</p>
083 *
084 * <p>The {@link Reader} acts as the bridge and can read any of the above
085 * <code>SequenceFile</code> formats.</p>
086 *
087 * <h4 id="Formats">SequenceFile Formats</h4>
088 *
089 * <p>Essentially there are 3 different formats for <code>SequenceFile</code>s
090 * depending on the <code>CompressionType</code> specified. All of them share a
091 * <a href="#Header">common header</a> described below.
092 *
093 * <h5 id="Header">SequenceFile Header</h5>
094 * <ul>
095 * <li>
096 * version - 3 bytes of magic header <b>SEQ</b>, followed by 1 byte of actual
097 * version number (e.g. SEQ4 or SEQ6)
098 * </li>
099 * <li>
 *   keyClassName - key class
101 * </li>
102 * <li>
103 * valueClassName - value class
104 * </li>
105 * <li>
106 * compression - A boolean which specifies if compression is turned on for
107 * keys/values in this file.
108 * </li>
109 * <li>
110 * blockCompression - A boolean which specifies if block-compression is
111 * turned on for keys/values in this file.
112 * </li>
113 * <li>
114 * compression codec - <code>CompressionCodec</code> class which is used for
115 * compression of keys and/or values (if compression is
116 * enabled).
117 * </li>
118 * <li>
119 * metadata - {@link Metadata} for this file.
120 * </li>
121 * <li>
122 * sync - A sync marker to denote end of the header.
123 * </li>
124 * </ul>
125 *
 * <h5 id="UncompressedFormat">Uncompressed SequenceFile Format</h5>
127 * <ul>
128 * <li>
129 * <a href="#Header">Header</a>
130 * </li>
131 * <li>
132 * Record
133 * <ul>
134 * <li>Record length</li>
135 * <li>Key length</li>
136 * <li>Key</li>
137 * <li>Value</li>
138 * </ul>
139 * </li>
140 * <li>
141 * A sync-marker every few <code>100</code> bytes or so.
142 * </li>
143 * </ul>
144 *
 * <h5 id="RecordCompressedFormat">Record-Compressed SequenceFile Format</h5>
146 * <ul>
147 * <li>
148 * <a href="#Header">Header</a>
149 * </li>
150 * <li>
151 * Record
152 * <ul>
153 * <li>Record length</li>
154 * <li>Key length</li>
155 * <li>Key</li>
156 * <li><i>Compressed</i> Value</li>
157 * </ul>
158 * </li>
159 * <li>
160 * A sync-marker every few <code>100</code> bytes or so.
161 * </li>
162 * </ul>
163 *
 * <h5 id="BlockCompressedFormat">Block-Compressed SequenceFile Format</h5>
165 * <ul>
166 * <li>
167 * <a href="#Header">Header</a>
168 * </li>
169 * <li>
170 * Record <i>Block</i>
171 * <ul>
172 * <li>Uncompressed number of records in the block</li>
173 * <li>Compressed key-lengths block-size</li>
174 * <li>Compressed key-lengths block</li>
175 * <li>Compressed keys block-size</li>
176 * <li>Compressed keys block</li>
177 * <li>Compressed value-lengths block-size</li>
178 * <li>Compressed value-lengths block</li>
179 * <li>Compressed values block-size</li>
180 * <li>Compressed values block</li>
181 * </ul>
182 * </li>
183 * <li>
184 * A sync-marker every block.
185 * </li>
186 * </ul>
187 *
188 * <p>The compressed blocks of key lengths and value lengths consist of the
189 * actual lengths of individual keys/values encoded in ZeroCompressedInteger
190 * format.</p>
191 *
192 * @see CompressionCodec
193 */
194 @InterfaceAudience.Public
195 @InterfaceStability.Stable
196 public class SequenceFile {
197 private static final Log LOG = LogFactory.getLog(SequenceFile.class);
198
199 private SequenceFile() {} // no public ctor
200
201 private static final byte BLOCK_COMPRESS_VERSION = (byte)4;
202 private static final byte CUSTOM_COMPRESS_VERSION = (byte)5;
203 private static final byte VERSION_WITH_METADATA = (byte)6;
204 private static byte[] VERSION = new byte[] {
205 (byte)'S', (byte)'E', (byte)'Q', VERSION_WITH_METADATA
206 };
207
208 private static final int SYNC_ESCAPE = -1; // "length" of sync entries
209 private static final int SYNC_HASH_SIZE = 16; // number of bytes in hash
210 private static final int SYNC_SIZE = 4+SYNC_HASH_SIZE; // escape + hash
211
212 /** The number of bytes between sync points.*/
213 public static final int SYNC_INTERVAL = 100*SYNC_SIZE;
214
  /**
   * The compression type used to compress key/value pairs in the
   * {@link SequenceFile}.
   *
   * Note: the enum constant names are persisted via toString()/valueOf()
   * in the "io.seqfile.compression.type" configuration key, so they must
   * not be renamed.
   *
   * @see SequenceFile.Writer
   */
  public static enum CompressionType {
    /** Do not compress records. */
    NONE,
    /** Compress values only, each separately. */
    RECORD,
    /** Compress sequences of records together in blocks. */
    BLOCK
  }
229
230 /**
231 * Get the compression type for the reduce outputs
232 * @param job the job config to look in
233 * @return the kind of compression to use
234 */
235 static public CompressionType getDefaultCompressionType(Configuration job) {
236 String name = job.get("io.seqfile.compression.type");
237 return name == null ? CompressionType.RECORD :
238 CompressionType.valueOf(name);
239 }
240
241 /**
242 * Set the default compression type for sequence files.
243 * @param job the configuration to modify
244 * @param val the new compression type (none, block, record)
245 */
246 static public void setDefaultCompressionType(Configuration job,
247 CompressionType val) {
248 job.set("io.seqfile.compression.type", val.toString());
249 }
250
251 /**
252 * Create a new Writer with the given options.
253 * @param conf the configuration to use
254 * @param opts the options to create the file with
255 * @return a new Writer
256 * @throws IOException
257 */
258 public static Writer createWriter(Configuration conf, Writer.Option... opts
259 ) throws IOException {
260 Writer.CompressionOption compressionOption =
261 Options.getOption(Writer.CompressionOption.class, opts);
262 CompressionType kind;
263 if (compressionOption != null) {
264 kind = compressionOption.getValue();
265 } else {
266 kind = getDefaultCompressionType(conf);
267 opts = Options.prependOptions(opts, Writer.compression(kind));
268 }
269 switch (kind) {
270 default:
271 case NONE:
272 return new Writer(conf, opts);
273 case RECORD:
274 return new RecordCompressWriter(conf, opts);
275 case BLOCK:
276 return new BlockCompressWriter(conf, opts);
277 }
278 }
279
280 /**
281 * Construct the preferred type of SequenceFile Writer.
282 * @param fs The configured filesystem.
283 * @param conf The configuration.
284 * @param name The name of the file.
285 * @param keyClass The 'key' type.
286 * @param valClass The 'value' type.
287 * @return Returns the handle to the constructed SequenceFile Writer.
288 * @throws IOException
289 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
290 * instead.
291 */
292 @Deprecated
293 public static Writer
294 createWriter(FileSystem fs, Configuration conf, Path name,
295 Class keyClass, Class valClass) throws IOException {
296 return createWriter(conf, Writer.filesystem(fs),
297 Writer.file(name), Writer.keyClass(keyClass),
298 Writer.valueClass(valClass));
299 }
300
301 /**
302 * Construct the preferred type of SequenceFile Writer.
303 * @param fs The configured filesystem.
304 * @param conf The configuration.
305 * @param name The name of the file.
306 * @param keyClass The 'key' type.
307 * @param valClass The 'value' type.
308 * @param compressionType The compression type.
309 * @return Returns the handle to the constructed SequenceFile Writer.
310 * @throws IOException
311 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
312 * instead.
313 */
314 @Deprecated
315 public static Writer
316 createWriter(FileSystem fs, Configuration conf, Path name,
317 Class keyClass, Class valClass,
318 CompressionType compressionType) throws IOException {
319 return createWriter(conf, Writer.filesystem(fs),
320 Writer.file(name), Writer.keyClass(keyClass),
321 Writer.valueClass(valClass),
322 Writer.compression(compressionType));
323 }
324
325 /**
326 * Construct the preferred type of SequenceFile Writer.
327 * @param fs The configured filesystem.
328 * @param conf The configuration.
329 * @param name The name of the file.
330 * @param keyClass The 'key' type.
331 * @param valClass The 'value' type.
332 * @param compressionType The compression type.
333 * @param progress The Progressable object to track progress.
334 * @return Returns the handle to the constructed SequenceFile Writer.
335 * @throws IOException
336 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
337 * instead.
338 */
339 @Deprecated
340 public static Writer
341 createWriter(FileSystem fs, Configuration conf, Path name,
342 Class keyClass, Class valClass, CompressionType compressionType,
343 Progressable progress) throws IOException {
344 return createWriter(conf, Writer.file(name),
345 Writer.filesystem(fs),
346 Writer.keyClass(keyClass),
347 Writer.valueClass(valClass),
348 Writer.compression(compressionType),
349 Writer.progressable(progress));
350 }
351
352 /**
353 * Construct the preferred type of SequenceFile Writer.
354 * @param fs The configured filesystem.
355 * @param conf The configuration.
356 * @param name The name of the file.
357 * @param keyClass The 'key' type.
358 * @param valClass The 'value' type.
359 * @param compressionType The compression type.
360 * @param codec The compression codec.
361 * @return Returns the handle to the constructed SequenceFile Writer.
362 * @throws IOException
363 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
364 * instead.
365 */
366 @Deprecated
367 public static Writer
368 createWriter(FileSystem fs, Configuration conf, Path name,
369 Class keyClass, Class valClass, CompressionType compressionType,
370 CompressionCodec codec) throws IOException {
371 return createWriter(conf, Writer.file(name),
372 Writer.filesystem(fs),
373 Writer.keyClass(keyClass),
374 Writer.valueClass(valClass),
375 Writer.compression(compressionType, codec));
376 }
377
378 /**
379 * Construct the preferred type of SequenceFile Writer.
380 * @param fs The configured filesystem.
381 * @param conf The configuration.
382 * @param name The name of the file.
383 * @param keyClass The 'key' type.
384 * @param valClass The 'value' type.
385 * @param compressionType The compression type.
386 * @param codec The compression codec.
387 * @param progress The Progressable object to track progress.
388 * @param metadata The metadata of the file.
389 * @return Returns the handle to the constructed SequenceFile Writer.
390 * @throws IOException
391 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
392 * instead.
393 */
394 @Deprecated
395 public static Writer
396 createWriter(FileSystem fs, Configuration conf, Path name,
397 Class keyClass, Class valClass,
398 CompressionType compressionType, CompressionCodec codec,
399 Progressable progress, Metadata metadata) throws IOException {
400 return createWriter(conf, Writer.file(name),
401 Writer.filesystem(fs),
402 Writer.keyClass(keyClass),
403 Writer.valueClass(valClass),
404 Writer.compression(compressionType, codec),
405 Writer.progressable(progress),
406 Writer.metadata(metadata));
407 }
408
409 /**
410 * Construct the preferred type of SequenceFile Writer.
411 * @param fs The configured filesystem.
412 * @param conf The configuration.
413 * @param name The name of the file.
414 * @param keyClass The 'key' type.
415 * @param valClass The 'value' type.
416 * @param bufferSize buffer size for the underlaying outputstream.
417 * @param replication replication factor for the file.
418 * @param blockSize block size for the file.
419 * @param compressionType The compression type.
420 * @param codec The compression codec.
421 * @param progress The Progressable object to track progress.
422 * @param metadata The metadata of the file.
423 * @return Returns the handle to the constructed SequenceFile Writer.
424 * @throws IOException
425 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
426 * instead.
427 */
428 @Deprecated
429 public static Writer
430 createWriter(FileSystem fs, Configuration conf, Path name,
431 Class keyClass, Class valClass, int bufferSize,
432 short replication, long blockSize,
433 CompressionType compressionType, CompressionCodec codec,
434 Progressable progress, Metadata metadata) throws IOException {
435 return createWriter(conf, Writer.file(name),
436 Writer.filesystem(fs),
437 Writer.keyClass(keyClass),
438 Writer.valueClass(valClass),
439 Writer.bufferSize(bufferSize),
440 Writer.replication(replication),
441 Writer.blockSize(blockSize),
442 Writer.compression(compressionType, codec),
443 Writer.progressable(progress),
444 Writer.metadata(metadata));
445 }
446
447 /**
448 * Construct the preferred type of SequenceFile Writer.
449 * @param fs The configured filesystem.
450 * @param conf The configuration.
451 * @param name The name of the file.
452 * @param keyClass The 'key' type.
453 * @param valClass The 'value' type.
454 * @param bufferSize buffer size for the underlaying outputstream.
455 * @param replication replication factor for the file.
456 * @param blockSize block size for the file.
457 * @param createParent create parent directory if non-existent
458 * @param compressionType The compression type.
459 * @param codec The compression codec.
460 * @param metadata The metadata of the file.
461 * @return Returns the handle to the constructed SequenceFile Writer.
462 * @throws IOException
463 */
464 @Deprecated
465 public static Writer
466 createWriter(FileSystem fs, Configuration conf, Path name,
467 Class keyClass, Class valClass, int bufferSize,
468 short replication, long blockSize, boolean createParent,
469 CompressionType compressionType, CompressionCodec codec,
470 Metadata metadata) throws IOException {
471 return createWriter(FileContext.getFileContext(fs.getUri(), conf),
472 conf, name, keyClass, valClass, compressionType, codec,
473 metadata, EnumSet.of(CreateFlag.CREATE,CreateFlag.OVERWRITE),
474 CreateOpts.bufferSize(bufferSize),
475 createParent ? CreateOpts.createParent()
476 : CreateOpts.donotCreateParent(),
477 CreateOpts.repFac(replication),
478 CreateOpts.blockSize(blockSize)
479 );
480 }
481
482 /**
483 * Construct the preferred type of SequenceFile Writer.
484 * @param fc The context for the specified file.
485 * @param conf The configuration.
486 * @param name The name of the file.
487 * @param keyClass The 'key' type.
488 * @param valClass The 'value' type.
489 * @param compressionType The compression type.
490 * @param codec The compression codec.
491 * @param metadata The metadata of the file.
492 * @param createFlag gives the semantics of create: overwrite, append etc.
493 * @param opts file creation options; see {@link CreateOpts}.
494 * @return Returns the handle to the constructed SequenceFile Writer.
495 * @throws IOException
496 */
497 public static Writer
498 createWriter(FileContext fc, Configuration conf, Path name,
499 Class keyClass, Class valClass,
500 CompressionType compressionType, CompressionCodec codec,
501 Metadata metadata,
502 final EnumSet<CreateFlag> createFlag, CreateOpts... opts)
503 throws IOException {
504 return createWriter(conf, fc.create(name, createFlag, opts),
505 keyClass, valClass, compressionType, codec, metadata).ownStream();
506 }
507
508 /**
509 * Construct the preferred type of SequenceFile Writer.
510 * @param fs The configured filesystem.
511 * @param conf The configuration.
512 * @param name The name of the file.
513 * @param keyClass The 'key' type.
514 * @param valClass The 'value' type.
515 * @param compressionType The compression type.
516 * @param codec The compression codec.
517 * @param progress The Progressable object to track progress.
518 * @return Returns the handle to the constructed SequenceFile Writer.
519 * @throws IOException
520 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
521 * instead.
522 */
523 @Deprecated
524 public static Writer
525 createWriter(FileSystem fs, Configuration conf, Path name,
526 Class keyClass, Class valClass,
527 CompressionType compressionType, CompressionCodec codec,
528 Progressable progress) throws IOException {
529 return createWriter(conf, Writer.file(name),
530 Writer.filesystem(fs),
531 Writer.keyClass(keyClass),
532 Writer.valueClass(valClass),
533 Writer.compression(compressionType, codec),
534 Writer.progressable(progress));
535 }
536
537 /**
538 * Construct the preferred type of 'raw' SequenceFile Writer.
539 * @param conf The configuration.
540 * @param out The stream on top which the writer is to be constructed.
541 * @param keyClass The 'key' type.
542 * @param valClass The 'value' type.
543 * @param compressionType The compression type.
544 * @param codec The compression codec.
545 * @param metadata The metadata of the file.
546 * @return Returns the handle to the constructed SequenceFile Writer.
547 * @throws IOException
548 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
549 * instead.
550 */
551 @Deprecated
552 public static Writer
553 createWriter(Configuration conf, FSDataOutputStream out,
554 Class keyClass, Class valClass,
555 CompressionType compressionType,
556 CompressionCodec codec, Metadata metadata) throws IOException {
557 return createWriter(conf, Writer.stream(out), Writer.keyClass(keyClass),
558 Writer.valueClass(valClass),
559 Writer.compression(compressionType, codec),
560 Writer.metadata(metadata));
561 }
562
563 /**
564 * Construct the preferred type of 'raw' SequenceFile Writer.
565 * @param conf The configuration.
566 * @param out The stream on top which the writer is to be constructed.
567 * @param keyClass The 'key' type.
568 * @param valClass The 'value' type.
569 * @param compressionType The compression type.
570 * @param codec The compression codec.
571 * @return Returns the handle to the constructed SequenceFile Writer.
572 * @throws IOException
573 * @deprecated Use {@link #createWriter(Configuration, Writer.Option...)}
574 * instead.
575 */
576 @Deprecated
577 public static Writer
578 createWriter(Configuration conf, FSDataOutputStream out,
579 Class keyClass, Class valClass, CompressionType compressionType,
580 CompressionCodec codec) throws IOException {
581 return createWriter(conf, Writer.stream(out), Writer.keyClass(keyClass),
582 Writer.valueClass(valClass),
583 Writer.compression(compressionType, codec));
584 }
585
586
  /** The interface to 'raw' values of SequenceFiles. */
  public static interface ValueBytes {

    /** Writes the uncompressed bytes to the outStream.
     * @param outStream : Stream to write uncompressed bytes into.
     * @throws IOException
     */
    public void writeUncompressedBytes(DataOutputStream outStream)
      throws IOException;

    /** Write compressed bytes to outStream.
     * Note: that it will NOT compress the bytes if they are not compressed.
     * @param outStream : Stream to write compressed bytes into.
     * @throws IllegalArgumentException if this instance does not hold
     *         already-compressed bytes (see UncompressedBytes).
     * @throws IOException
     */
    public void writeCompressedBytes(DataOutputStream outStream)
      throws IllegalArgumentException, IOException;

    /**
     * Size of stored data.
     * @return the number of valid bytes held by this instance.
     */
    public int getSize();
  }
609
610 private static class UncompressedBytes implements ValueBytes {
611 private int dataSize;
612 private byte[] data;
613
614 private UncompressedBytes() {
615 data = null;
616 dataSize = 0;
617 }
618
619 private void reset(DataInputStream in, int length) throws IOException {
620 if (data == null) {
621 data = new byte[length];
622 } else if (length > data.length) {
623 data = new byte[Math.max(length, data.length * 2)];
624 }
625 dataSize = -1;
626 in.readFully(data, 0, length);
627 dataSize = length;
628 }
629
630 @Override
631 public int getSize() {
632 return dataSize;
633 }
634
635 @Override
636 public void writeUncompressedBytes(DataOutputStream outStream)
637 throws IOException {
638 outStream.write(data, 0, dataSize);
639 }
640
641 @Override
642 public void writeCompressedBytes(DataOutputStream outStream)
643 throws IllegalArgumentException, IOException {
644 throw
645 new IllegalArgumentException("UncompressedBytes cannot be compressed!");
646 }
647
648 } // UncompressedBytes
649
  /** ValueBytes over a codec-compressed byte buffer, reused per record. */
  private static class CompressedBytes implements ValueBytes {
    // Number of valid bytes in 'data'; -1 transiently while refilling.
    private int dataSize;
    // Compressed record bytes, grown on demand and reused across records.
    private byte[] data;
    // Lazily-created decompression plumbing for writeUncompressedBytes();
    // created on first use and reused (reset) on subsequent calls.
    DataInputBuffer rawData = null;
    CompressionCodec codec = null;
    CompressionInputStream decompressedStream = null;

    private CompressedBytes(CompressionCodec codec) {
      data = null;
      dataSize = 0;
      this.codec = codec;
    }

    /** Read exactly 'length' compressed bytes from 'in' into the buffer. */
    private void reset(DataInputStream in, int length) throws IOException {
      if (data == null) {
        data = new byte[length];
      } else if (length > data.length) {
        // Grow geometrically (at least doubling) to amortize reallocation.
        data = new byte[Math.max(length, data.length * 2)];
      }
      dataSize = -1;
      in.readFully(data, 0, length);
      dataSize = length;
    }

    @Override
    public int getSize() {
      return dataSize;
    }

    @Override
    public void writeUncompressedBytes(DataOutputStream outStream)
      throws IOException {
      // First call: build the codec input stream over the reusable buffer.
      // Later calls: reset codec state so the stream can be reused.
      if (decompressedStream == null) {
        rawData = new DataInputBuffer();
        decompressedStream = codec.createInputStream(rawData);
      } else {
        decompressedStream.resetState();
      }
      // Point the raw buffer at the current compressed record.
      rawData.reset(data, 0, dataSize);

      // Stream the decompressed bytes out in 8KB chunks.
      byte[] buffer = new byte[8192];
      int bytesRead = 0;
      while ((bytesRead = decompressedStream.read(buffer, 0, 8192)) != -1) {
        outStream.write(buffer, 0, bytesRead);
      }
    }

    @Override
    public void writeCompressedBytes(DataOutputStream outStream)
      throws IllegalArgumentException, IOException {
      // Bytes are stored compressed, so this is a straight copy.
      outStream.write(data, 0, dataSize);
    }

  } // CompressedBytes
704
705 /**
706 * The class encapsulating with the metadata of a file.
707 * The metadata of a file is a list of attribute name/value
708 * pairs of Text type.
709 *
710 */
711 public static class Metadata implements Writable {
712
713 private TreeMap<Text, Text> theMetadata;
714
715 public Metadata() {
716 this(new TreeMap<Text, Text>());
717 }
718
719 public Metadata(TreeMap<Text, Text> arg) {
720 if (arg == null) {
721 this.theMetadata = new TreeMap<Text, Text>();
722 } else {
723 this.theMetadata = arg;
724 }
725 }
726
727 public Text get(Text name) {
728 return this.theMetadata.get(name);
729 }
730
731 public void set(Text name, Text value) {
732 this.theMetadata.put(name, value);
733 }
734
735 public TreeMap<Text, Text> getMetadata() {
736 return new TreeMap<Text, Text>(this.theMetadata);
737 }
738
739 @Override
740 public void write(DataOutput out) throws IOException {
741 out.writeInt(this.theMetadata.size());
742 Iterator<Map.Entry<Text, Text>> iter =
743 this.theMetadata.entrySet().iterator();
744 while (iter.hasNext()) {
745 Map.Entry<Text, Text> en = iter.next();
746 en.getKey().write(out);
747 en.getValue().write(out);
748 }
749 }
750
751 @Override
752 public void readFields(DataInput in) throws IOException {
753 int sz = in.readInt();
754 if (sz < 0) throw new IOException("Invalid size: " + sz + " for file metadata object");
755 this.theMetadata = new TreeMap<Text, Text>();
756 for (int i = 0; i < sz; i++) {
757 Text key = new Text();
758 Text val = new Text();
759 key.readFields(in);
760 val.readFields(in);
761 this.theMetadata.put(key, val);
762 }
763 }
764
765 @Override
766 public boolean equals(Object other) {
767 if (other == null) {
768 return false;
769 }
770 if (other.getClass() != this.getClass()) {
771 return false;
772 } else {
773 return equals((Metadata)other);
774 }
775 }
776
777 public boolean equals(Metadata other) {
778 if (other == null) return false;
779 if (this.theMetadata.size() != other.theMetadata.size()) {
780 return false;
781 }
782 Iterator<Map.Entry<Text, Text>> iter1 =
783 this.theMetadata.entrySet().iterator();
784 Iterator<Map.Entry<Text, Text>> iter2 =
785 other.theMetadata.entrySet().iterator();
786 while (iter1.hasNext() && iter2.hasNext()) {
787 Map.Entry<Text, Text> en1 = iter1.next();
788 Map.Entry<Text, Text> en2 = iter2.next();
789 if (!en1.getKey().equals(en2.getKey())) {
790 return false;
791 }
792 if (!en1.getValue().equals(en2.getValue())) {
793 return false;
794 }
795 }
796 if (iter1.hasNext() || iter2.hasNext()) {
797 return false;
798 }
799 return true;
800 }
801
802 @Override
803 public int hashCode() {
804 assert false : "hashCode not designed";
805 return 42; // any arbitrary constant will do
806 }
807
808 @Override
809 public String toString() {
810 StringBuilder sb = new StringBuilder();
811 sb.append("size: ").append(this.theMetadata.size()).append("\n");
812 Iterator<Map.Entry<Text, Text>> iter =
813 this.theMetadata.entrySet().iterator();
814 while (iter.hasNext()) {
815 Map.Entry<Text, Text> en = iter.next();
816 sb.append("\t").append(en.getKey().toString()).append("\t").append(en.getValue().toString());
817 sb.append("\n");
818 }
819 return sb.toString();
820 }
821 }
822
823 /** Write key/value pairs to a sequence-format file. */
824 public static class Writer implements java.io.Closeable, Syncable {
    private Configuration conf;
    // Destination stream; presumably closed by close() only when
    // ownOutputStream is true — confirm against close()/ownStream() below.
    FSDataOutputStream out;
    boolean ownOutputStream = true;
    // Scratch buffer for serializing keys/values before they hit 'out'.
    DataOutputBuffer buffer = new DataOutputBuffer();

    Class keyClass;
    Class valClass;

    private final CompressionType compress;
    CompressionCodec codec = null;
    CompressionOutputStream deflateFilter = null;
    DataOutputStream deflateOut = null;
    Metadata metadata = null;
    Compressor compressor = null;

    protected Serializer keySerializer;
    protected Serializer uncompressedValSerializer;
    protected Serializer compressedValSerializer;

    // Insert a globally unique 16-byte value every few entries, so that one
    // can seek into the middle of a file and then synchronize with record
    // starts and ends by scanning for this value.
    long lastSyncPos;                     // position of last sync
    byte[] sync;                          // 16 random bytes
    {
      // Derive a per-writer sync marker by MD5-hashing a JVM-unique UID
      // plus the current time; MD5 conveniently yields exactly 16 bytes
      // (SYNC_HASH_SIZE).
      try {
        MessageDigest digester = MessageDigest.getInstance("MD5");
        long time = Time.now();
        digester.update((new UID()+"@"+time).getBytes());
        sync = digester.digest();
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }
859
    /** Marker interface for options accepted by {@link SequenceFile.Writer}. */
    public static interface Option {}

    /** Option naming the path of the file to create. */
    static class FileOption extends Options.PathOption
                                    implements Option {
      FileOption(Path path) {
        super(path);
      }
    }

    /**
     * @deprecated only used for backwards-compatibility in the createWriter methods
     * that take FileSystem.
     */
    @Deprecated
    private static class FileSystemOption implements Option {
      private final FileSystem value;
      protected FileSystemOption(FileSystem value) {
        this.value = value;
      }
      public FileSystem getValue() {
        return value;
      }
    }

    /** Option supplying an already-open output stream to write to. */
    static class StreamOption extends Options.FSDataOutputStreamOption
                              implements Option {
      StreamOption(FSDataOutputStream stream) {
        super(stream);
      }
    }

    /** Option setting the buffer size for the underlying output stream. */
    static class BufferSizeOption extends Options.IntegerOption
                                  implements Option {
      BufferSizeOption(int value) {
        super(value);
      }
    }

    /** Option setting the block size of the created file. */
    static class BlockSizeOption extends Options.LongOption implements Option {
      BlockSizeOption(long value) {
        super(value);
      }
    }

    /** Option setting the replication factor of the created file. */
    static class ReplicationOption extends Options.IntegerOption
                                   implements Option {
      ReplicationOption(int value) {
        super(value);
      }
    }

    /** Option naming the key class of the records. */
    static class KeyClassOption extends Options.ClassOption implements Option {
      KeyClassOption(Class<?> value) {
        super(value);
      }
    }

    /** Option naming the value class of the records. */
    static class ValueClassOption extends Options.ClassOption
                                          implements Option {
      ValueClassOption(Class<?> value) {
        super(value);
      }
    }

    /** Option carrying user-supplied file {@link Metadata}. */
    static class MetadataOption implements Option {
      private final Metadata value;
      MetadataOption(Metadata value) {
        this.value = value;
      }
      Metadata getValue() {
        return value;
      }
    }

    /** Option carrying a {@link Progressable} for progress reporting. */
    static class ProgressableOption extends Options.ProgressableOption
                                    implements Option {
      ProgressableOption(Progressable value) {
        super(value);
      }
    }

    /** Option selecting the compression type and (optionally) the codec. */
    private static class CompressionOption implements Option {
      private final CompressionType value;
      private final CompressionCodec codec;
      CompressionOption(CompressionType value) {
        this(value, null);
      }
      CompressionOption(CompressionType value, CompressionCodec codec) {
        this.value = value;
        // A compressed type with no explicit codec defaults to DefaultCodec;
        // NONE keeps a null codec.
        this.codec = (CompressionType.NONE != value && null == codec)
          ? new DefaultCodec()
          : codec;
      }
      CompressionType getValue() {
        return value;
      }
      CompressionCodec getCodec() {
        return codec;
      }
    }
960
961 public static Option file(Path value) {
962 return new FileOption(value);
963 }
964
    /**
     * Create an option forcing a particular {@link FileSystem}.
     * @deprecated only used for backwards-compatibility in the createWriter
     *   methods that take FileSystem.
     */
    @Deprecated
    private static Option filesystem(FileSystem fs) {
      return new SequenceFile.Writer.FileSystemOption(fs);
    }
973
    /** Create an option to specify the I/O buffer size (file option only). */
    public static Option bufferSize(int value) {
      return new BufferSizeOption(value);
    }
977
    /** Create an option to write to an existing stream (mutually exclusive
     * with {@link #file}). */
    public static Option stream(FSDataOutputStream value) {
      return new StreamOption(value);
    }
981
    /** Create an option to specify the replication factor (file option
     * only; widened to int for storage). */
    public static Option replication(short value) {
      return new ReplicationOption(value);
    }
985
    /** Create an option to specify the filesystem block size (file option
     * only). */
    public static Option blockSize(long value) {
      return new BlockSizeOption(value);
    }
989
    /** Create an option to report write progress (file option only). */
    public static Option progressable(Progressable value) {
      return new ProgressableOption(value);
    }
993
    /** Create an option to specify the class of the keys. */
    public static Option keyClass(Class<?> value) {
      return new KeyClassOption(value);
    }
997
    /** Create an option to specify the class of the values. */
    public static Option valueClass(Class<?> value) {
      return new ValueClassOption(value);
    }
1001
    /** Create an option to attach user-defined metadata to the file
     * header. */
    public static Option metadata(Metadata value) {
      return new MetadataOption(value);
    }
1005
    /** Create an option to specify the compression type; a default codec
     * is chosen when the type is not NONE. */
    public static Option compression(CompressionType value) {
      return new CompressionOption(value);
    }
1009
    /** Create an option to specify both the compression type and the codec
     * to use. */
    public static Option compression(CompressionType value,
                                     CompressionCodec codec) {
      return new CompressionOption(value, codec);
    }
1014
1015 /**
1016 * Construct a uncompressed writer from a set of options.
1017 * @param conf the configuration to use
1018 * @param options the options used when creating the writer
1019 * @throws IOException if it fails
1020 */
1021 Writer(Configuration conf,
1022 Option... opts) throws IOException {
1023 BlockSizeOption blockSizeOption =
1024 Options.getOption(BlockSizeOption.class, opts);
1025 BufferSizeOption bufferSizeOption =
1026 Options.getOption(BufferSizeOption.class, opts);
1027 ReplicationOption replicationOption =
1028 Options.getOption(ReplicationOption.class, opts);
1029 ProgressableOption progressOption =
1030 Options.getOption(ProgressableOption.class, opts);
1031 FileOption fileOption = Options.getOption(FileOption.class, opts);
1032 FileSystemOption fsOption = Options.getOption(FileSystemOption.class, opts);
1033 StreamOption streamOption = Options.getOption(StreamOption.class, opts);
1034 KeyClassOption keyClassOption =
1035 Options.getOption(KeyClassOption.class, opts);
1036 ValueClassOption valueClassOption =
1037 Options.getOption(ValueClassOption.class, opts);
1038 MetadataOption metadataOption =
1039 Options.getOption(MetadataOption.class, opts);
1040 CompressionOption compressionTypeOption =
1041 Options.getOption(CompressionOption.class, opts);
1042 // check consistency of options
1043 if ((fileOption == null) == (streamOption == null)) {
1044 throw new IllegalArgumentException("file or stream must be specified");
1045 }
1046 if (fileOption == null && (blockSizeOption != null ||
1047 bufferSizeOption != null ||
1048 replicationOption != null ||
1049 progressOption != null)) {
1050 throw new IllegalArgumentException("file modifier options not " +
1051 "compatible with stream");
1052 }
1053
1054 FSDataOutputStream out;
1055 boolean ownStream = fileOption != null;
1056 if (ownStream) {
1057 Path p = fileOption.getValue();
1058 FileSystem fs;
1059 if (fsOption != null) {
1060 fs = fsOption.getValue();
1061 } else {
1062 fs = p.getFileSystem(conf);
1063 }
1064 int bufferSize = bufferSizeOption == null ? getBufferSize(conf) :
1065 bufferSizeOption.getValue();
1066 short replication = replicationOption == null ?
1067 fs.getDefaultReplication(p) :
1068 (short) replicationOption.getValue();
1069 long blockSize = blockSizeOption == null ? fs.getDefaultBlockSize(p) :
1070 blockSizeOption.getValue();
1071 Progressable progress = progressOption == null ? null :
1072 progressOption.getValue();
1073 out = fs.create(p, true, bufferSize, replication, blockSize, progress);
1074 } else {
1075 out = streamOption.getValue();
1076 }
1077 Class<?> keyClass = keyClassOption == null ?
1078 Object.class : keyClassOption.getValue();
1079 Class<?> valueClass = valueClassOption == null ?
1080 Object.class : valueClassOption.getValue();
1081 Metadata metadata = metadataOption == null ?
1082 new Metadata() : metadataOption.getValue();
1083 this.compress = compressionTypeOption.getValue();
1084 final CompressionCodec codec = compressionTypeOption.getCodec();
1085 if (codec != null &&
1086 (codec instanceof GzipCodec) &&
1087 !NativeCodeLoader.isNativeCodeLoaded() &&
1088 !ZlibFactory.isNativeZlibLoaded(conf)) {
1089 throw new IllegalArgumentException("SequenceFile doesn't work with " +
1090 "GzipCodec without native-hadoop " +
1091 "code!");
1092 }
1093 init(conf, out, ownStream, keyClass, valueClass, codec, metadata);
1094 }
1095
    /** Create the named file, uncompressed, owning (and later closing)
     * the created stream.
     * @deprecated Use
     *   {@link SequenceFile#createWriter(Configuration, Writer.Option...)}
     *   instead.
     */
    @Deprecated
    public Writer(FileSystem fs, Configuration conf, Path name,
                  Class keyClass, Class valClass) throws IOException {
      this.compress = CompressionType.NONE;
      init(conf, fs.create(name), true, keyClass, valClass, null,
           new Metadata());
    }
1108
    /** Create the named file, uncompressed, with a write-progress reporter
     * and user-supplied metadata.
     * @deprecated Use
     *   {@link SequenceFile#createWriter(Configuration, Writer.Option...)}
     *   instead.
     */
    @Deprecated
    public Writer(FileSystem fs, Configuration conf, Path name,
                  Class keyClass, Class valClass,
                  Progressable progress, Metadata metadata) throws IOException {
      this.compress = CompressionType.NONE;
      init(conf, fs.create(name, progress), true, keyClass, valClass,
           null, metadata);
    }
1122
    /** Create the named file, uncompressed, with explicit buffer size,
     * replication, block size, a write-progress reporter and metadata.
     * @deprecated Use
     *   {@link SequenceFile#createWriter(Configuration, Writer.Option...)}
     *   instead.
     */
    @Deprecated
    public Writer(FileSystem fs, Configuration conf, Path name,
                  Class keyClass, Class valClass,
                  int bufferSize, short replication, long blockSize,
                  Progressable progress, Metadata metadata) throws IOException {
      this.compress = CompressionType.NONE;
      init(conf,
           fs.create(name, true, bufferSize, replication, blockSize, progress),
           true, keyClass, valClass, null, metadata);
    }
1138
    /** True when any compression (RECORD or BLOCK) is enabled. */
    boolean isCompressed() { return compress != CompressionType.NONE; }
    /** True only for BLOCK compression. */
    boolean isBlockCompressed() { return compress == CompressionType.BLOCK; }
1141
    /** Mark this writer as owning its output stream (so {@link #close}
     * closes it); returns {@code this} for chaining. */
    Writer ownStream() { this.ownOutputStream = true; return this; }
1143
    /** Write and flush the file header: version, key/value class names,
     * compression flags, optional codec class name, metadata, and the
     * sync marker. The order here defines the on-disk header format. */
    private void writeFileHeader()
      throws IOException {
      out.write(VERSION);                        // 'SEQ' + version byte
      Text.writeString(out, keyClass.getName());
      Text.writeString(out, valClass.getName());

      out.writeBoolean(this.isCompressed());
      out.writeBoolean(this.isBlockCompressed());

      // codec class name only appears for compressed files
      if (this.isCompressed()) {
        Text.writeString(out, (codec.getClass()).getName());
      }
      this.metadata.write(out);
      out.write(sync);                          // write the sync bytes
      out.flush();                              // flush header
    }
1161
1162 /** Initialize. */
1163 @SuppressWarnings("unchecked")
1164 void init(Configuration conf, FSDataOutputStream out, boolean ownStream,
1165 Class keyClass, Class valClass,
1166 CompressionCodec codec, Metadata metadata)
1167 throws IOException {
1168 this.conf = conf;
1169 this.out = out;
1170 this.ownOutputStream = ownStream;
1171 this.keyClass = keyClass;
1172 this.valClass = valClass;
1173 this.codec = codec;
1174 this.metadata = metadata;
1175 SerializationFactory serializationFactory = new SerializationFactory(conf);
1176 this.keySerializer = serializationFactory.getSerializer(keyClass);
1177 if (this.keySerializer == null) {
1178 throw new IOException(
1179 "Could not find a serializer for the Key class: '"
1180 + keyClass.getCanonicalName() + "'. "
1181 + "Please ensure that the configuration '" +
1182 CommonConfigurationKeys.IO_SERIALIZATIONS_KEY + "' is "
1183 + "properly configured, if you're using"
1184 + "custom serialization.");
1185 }
1186 this.keySerializer.open(buffer);
1187 this.uncompressedValSerializer = serializationFactory.getSerializer(valClass);
1188 if (this.uncompressedValSerializer == null) {
1189 throw new IOException(
1190 "Could not find a serializer for the Value class: '"
1191 + valClass.getCanonicalName() + "'. "
1192 + "Please ensure that the configuration '" +
1193 CommonConfigurationKeys.IO_SERIALIZATIONS_KEY + "' is "
1194 + "properly configured, if you're using"
1195 + "custom serialization.");
1196 }
1197 this.uncompressedValSerializer.open(buffer);
1198 if (this.codec != null) {
1199 ReflectionUtils.setConf(this.codec, this.conf);
1200 this.compressor = CodecPool.getCompressor(this.codec);
1201 this.deflateFilter = this.codec.createOutputStream(buffer, compressor);
1202 this.deflateOut =
1203 new DataOutputStream(new BufferedOutputStream(deflateFilter));
1204 this.compressedValSerializer = serializationFactory.getSerializer(valClass);
1205 if (this.compressedValSerializer == null) {
1206 throw new IOException(
1207 "Could not find a serializer for the Value class: '"
1208 + valClass.getCanonicalName() + "'. "
1209 + "Please ensure that the configuration '" +
1210 CommonConfigurationKeys.IO_SERIALIZATIONS_KEY + "' is "
1211 + "properly configured, if you're using"
1212 + "custom serialization.");
1213 }
1214 this.compressedValSerializer.open(deflateOut);
1215 }
1216 writeFileHeader();
1217 }
1218
    /** Returns the class of keys in this file. */
    public Class getKeyClass() { return keyClass; }
1221
    /** Returns the class of values in this file. */
    public Class getValueClass() { return valClass; }
1224
    /** Returns the compression codec of data in this file, or null when
     * uncompressed. */
    public CompressionCodec getCompressionCodec() { return codec; }
1227
    /** Create a sync point: the SYNC_ESCAPE marker followed by this file's
     * sync bytes. Skipped when a sync was already emitted at the current
     * position, so back-to-back calls don't duplicate markers. */
    public void sync() throws IOException {
      if (sync != null && lastSyncPos != out.getPos()) {
        out.writeInt(SYNC_ESCAPE);                // mark the start of the sync
        out.write(sync);                          // write sync
        lastSyncPos = out.getPos();               // update lastSyncPos
      }
    }
1236
1237 /**
1238 * flush all currently written data to the file system
1239 * @deprecated Use {@link #hsync()} or {@link #hflush()} instead
1240 */
1241 @Deprecated
1242 public void syncFs() throws IOException {
1243 if (out != null) {
1244 out.sync(); // flush contents to file system
1245 }
1246 }
1247
1248 @Override
1249 public void hsync() throws IOException {
1250 if (out != null) {
1251 out.hsync();
1252 }
1253 }
1254
1255 @Override
1256 public void hflush() throws IOException {
1257 if (out != null) {
1258 out.hflush();
1259 }
1260 }
1261
    /** Returns the configuration of this file. */
    Configuration getConf() { return conf; }
1264
    /** Close the file: release serializers, return the compressor to the
     * pool, then close (if owned) or flush (if borrowed) the stream.
     * Safe to call twice — `out` is nulled after the first call.
     * NOTE(review): an exception from an early serializer close would skip
     * the stream cleanup below — confirm callers tolerate that. */
    @Override
    public synchronized void close() throws IOException {
      keySerializer.close();
      uncompressedValSerializer.close();
      if (compressedValSerializer != null) {
        compressedValSerializer.close();
      }

      // presumably returnCompressor tolerates null — verify in CodecPool
      CodecPool.returnCompressor(compressor);
      compressor = null;

      if (out != null) {

        // Close the underlying stream iff we own it...
        if (ownOutputStream) {
          out.close();
        } else {
          out.flush();
        }
        out = null;
      }
    }
1288
1289 synchronized void checkAndWriteSync() throws IOException {
1290 if (sync != null &&
1291 out.getPos() >= lastSyncPos+SYNC_INTERVAL) { // time to emit sync
1292 sync();
1293 }
1294 }
1295
    /** Append a key/value pair; delegates to the Object overload. */
    public void append(Writable key, Writable val)
      throws IOException {
      append((Object) key, (Object) val);
    }
1301
    /** Append a key/value pair.
     * Record layout: total record length, key length, then the serialized
     * key immediately followed by the (possibly record-compressed) value.
     * @throws IOException if the classes don't match those declared at
     *         creation time, or on write failure. */
    @SuppressWarnings("unchecked")
    public synchronized void append(Object key, Object val)
      throws IOException {
      if (key.getClass() != keyClass)
        throw new IOException("wrong key class: "+key.getClass().getName()
                              +" is not "+keyClass);
      if (val.getClass() != valClass)
        throw new IOException("wrong value class: "+val.getClass().getName()
                              +" is not "+valClass);

      buffer.reset();

      // Append the 'key'
      keySerializer.serialize(key);
      int keyLength = buffer.getLength();
      if (keyLength < 0)
        throw new IOException("negative length keys not allowed: " + key);

      // Append the 'value': for RECORD compression the value is serialized
      // through the deflate chain into the same buffer
      if (compress == CompressionType.RECORD) {
        deflateFilter.resetState();
        compressedValSerializer.serialize(val);
        deflateOut.flush();
        deflateFilter.finish();
      } else {
        uncompressedValSerializer.serialize(val);
      }

      // Write the record out
      checkAndWriteSync();                      // sync
      out.writeInt(buffer.getLength());         // total record length
      out.writeInt(keyLength);                  // key portion length
      out.write(buffer.getData(), 0, buffer.getLength()); // data
    }
1337
    /** Append a record whose key bytes and value are already serialized;
     * the value is written via {@code writeUncompressedBytes}. */
    public synchronized void appendRaw(byte[] keyData, int keyOffset,
        int keyLength, ValueBytes val) throws IOException {
      if (keyLength < 0)
        throw new IOException("negative length keys not allowed: " + keyLength);

      int valLength = val.getSize();

      checkAndWriteSync();

      out.writeInt(keyLength+valLength);          // total record length
      out.writeInt(keyLength);                    // key portion length
      out.write(keyData, keyOffset, keyLength);   // key
      val.writeUncompressedBytes(out);            // value
    }
1352
    /** Returns the current length of the output file.
     *
     * <p>This always returns a synchronized position. In other words,
     * immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
     * returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
     * the key may be earlier in the file than key last written when this
     * method was called (e.g., with block-compression, it may be the first key
     * in the block that was being written when this method was called).
     */
    public synchronized long getLength() throws IOException {
      return out.getPos();
    }
1365
1366 } // class Writer
1367
  /** Write key/compressed-value pairs to a sequence-format file: each
   * record's value is compressed individually through the codec's deflate
   * stream, while keys stay uncompressed. */
  static class RecordCompressWriter extends Writer {

    RecordCompressWriter(Configuration conf,
                         Option... options) throws IOException {
      super(conf, options);
    }

    /** Append a key/value pair, compressing the value before it is
     * written after the key in the shared record buffer. */
    @Override
    @SuppressWarnings("unchecked")
    public synchronized void append(Object key, Object val)
      throws IOException {
      if (key.getClass() != keyClass)
        throw new IOException("wrong key class: "+key.getClass().getName()
                              +" is not "+keyClass);
      if (val.getClass() != valClass)
        throw new IOException("wrong value class: "+val.getClass().getName()
                              +" is not "+valClass);

      buffer.reset();

      // Append the 'key'
      keySerializer.serialize(key);
      int keyLength = buffer.getLength();
      if (keyLength < 0)
        throw new IOException("negative length keys not allowed: " + key);

      // Compress 'value' and append it: reset per record, then flush and
      // finish so each value is an independently decompressible unit
      deflateFilter.resetState();
      compressedValSerializer.serialize(val);
      deflateOut.flush();
      deflateFilter.finish();

      // Write the record out
      checkAndWriteSync();                       // sync
      out.writeInt(buffer.getLength());          // total record length
      out.writeInt(keyLength);                   // key portion length
      out.write(buffer.getData(), 0, buffer.getLength()); // data
    }

    /** Append a raw key plus a value whose bytes are already compressed
     * (written via {@code writeCompressedBytes}). */
    @Override
    public synchronized void appendRaw(byte[] keyData, int keyOffset,
        int keyLength, ValueBytes val) throws IOException {

      if (keyLength < 0)
        throw new IOException("negative length keys not allowed: " + keyLength);

      int valLength = val.getSize();

      checkAndWriteSync();                        // sync
      out.writeInt(keyLength+valLength);          // total record length
      out.writeInt(keyLength);                    // key portion length
      out.write(keyData, keyOffset, keyLength);   // 'key' data
      val.writeCompressedBytes(out);              // 'value' data
    }

  } // RecordCompressWriter
1427
  /** Write compressed key/value blocks to a sequence-format file.
   * Records are buffered into four parallel buffers (key lengths, keys,
   * value lengths, values); once the accumulated key+value bytes reach
   * the configured block size, the whole block is compressed and flushed
   * by {@link #sync()}. */
  static class BlockCompressWriter extends Writer {

    // number of records currently buffered (not yet flushed as a block)
    private int noBufferedRecords = 0;

    private DataOutputBuffer keyLenBuffer = new DataOutputBuffer();
    private DataOutputBuffer keyBuffer = new DataOutputBuffer();

    private DataOutputBuffer valLenBuffer = new DataOutputBuffer();
    private DataOutputBuffer valBuffer = new DataOutputBuffer();

    // flush threshold in (uncompressed) key+value bytes
    private final int compressionBlockSize;

    BlockCompressWriter(Configuration conf,
                        Option... options) throws IOException {
      super(conf, options);
      compressionBlockSize =
        conf.getInt("io.seqfile.compress.blocksize", 1000000);
      // re-target the serializers from the record buffer (set up by the
      // superclass) to this writer's per-block buffers
      keySerializer.close();
      keySerializer.open(keyBuffer);
      uncompressedValSerializer.close();
      uncompressedValSerializer.open(valBuffer);
    }

    /** Workhorse to check and write out compressed data/lengths: compress
     * one buffer through the codec and emit vint-length + bytes. */
    private synchronized
      void writeBuffer(DataOutputBuffer uncompressedDataBuffer)
      throws IOException {
      deflateFilter.resetState();
      buffer.reset();
      deflateOut.write(uncompressedDataBuffer.getData(), 0,
                       uncompressedDataBuffer.getLength());
      deflateOut.flush();
      deflateFilter.finish();

      WritableUtils.writeVInt(out, buffer.getLength());
      out.write(buffer.getData(), 0, buffer.getLength());
    }

    /** Compress and flush the buffered block: sync marker, record count,
     * then the four compressed buffers, finally resetting all state. */
    @Override
    public synchronized void sync() throws IOException {
      if (noBufferedRecords > 0) {
        super.sync();

        // No. of records
        WritableUtils.writeVInt(out, noBufferedRecords);

        // Write 'keys' and lengths
        writeBuffer(keyLenBuffer);
        writeBuffer(keyBuffer);

        // Write 'values' and lengths
        writeBuffer(valLenBuffer);
        writeBuffer(valBuffer);

        // Flush the file-stream
        out.flush();

        // Reset internal states
        keyLenBuffer.reset();
        keyBuffer.reset();
        valLenBuffer.reset();
        valBuffer.reset();
        noBufferedRecords = 0;
      }

    }

    /** Close the file, flushing any partially-filled block first. */
    @Override
    public synchronized void close() throws IOException {
      if (out != null) {
        sync();
      }
      super.close();
    }

    /** Append a key/value pair into the block buffers, flushing the block
     * when it reaches the configured size. */
    @Override
    @SuppressWarnings("unchecked")
    public synchronized void append(Object key, Object val)
      throws IOException {
      if (key.getClass() != keyClass)
        throw new IOException("wrong key class: "+key+" is not "+keyClass);
      if (val.getClass() != valClass)
        throw new IOException("wrong value class: "+val+" is not "+valClass);

      // Save key/value into respective buffers
      int oldKeyLength = keyBuffer.getLength();
      keySerializer.serialize(key);
      int keyLength = keyBuffer.getLength() - oldKeyLength;
      if (keyLength < 0)
        throw new IOException("negative length keys not allowed: " + key);
      WritableUtils.writeVInt(keyLenBuffer, keyLength);

      int oldValLength = valBuffer.getLength();
      uncompressedValSerializer.serialize(val);
      int valLength = valBuffer.getLength() - oldValLength;
      WritableUtils.writeVInt(valLenBuffer, valLength);

      // Added another key/value pair
      ++noBufferedRecords;

      // Compress and flush?
      int currentBlockSize = keyBuffer.getLength() + valBuffer.getLength();
      if (currentBlockSize >= compressionBlockSize) {
        sync();
      }
    }

    /** Append a raw (already serialized, uncompressed) key/value pair into
     * the block buffers, flushing the block when it reaches the
     * configured size. */
    @Override
    public synchronized void appendRaw(byte[] keyData, int keyOffset,
        int keyLength, ValueBytes val) throws IOException {

      if (keyLength < 0)
        throw new IOException("negative length keys not allowed");

      int valLength = val.getSize();

      // Save key/value data in relevant buffers
      WritableUtils.writeVInt(keyLenBuffer, keyLength);
      keyBuffer.write(keyData, keyOffset, keyLength);
      WritableUtils.writeVInt(valLenBuffer, valLength);
      val.writeUncompressedBytes(valBuffer);

      // Added another key/value pair
      ++noBufferedRecords;

      // Compress and flush?
      int currentBlockSize = keyBuffer.getLength() + valBuffer.getLength();
      if (currentBlockSize >= compressionBlockSize) {
        sync();
      }
    }

  } // BlockCompressWriter
1566
1567 /** Get the configured buffer size */
1568 private static int getBufferSize(Configuration conf) {
1569 return conf.getInt("io.file.buffer.size", 4096);
1570 }
1571
1572 /** Reads key/value pairs from a sequence-format file. */
1573 public static class Reader implements java.io.Closeable {
1574 private String filename;
1575 private FSDataInputStream in;
1576 private DataOutputBuffer outBuf = new DataOutputBuffer();
1577
1578 private byte version;
1579
1580 private String keyClassName;
1581 private String valClassName;
1582 private Class keyClass;
1583 private Class valClass;
1584
1585 private CompressionCodec codec = null;
1586 private Metadata metadata = null;
1587
1588 private byte[] sync = new byte[SYNC_HASH_SIZE];
1589 private byte[] syncCheck = new byte[SYNC_HASH_SIZE];
1590 private boolean syncSeen;
1591
1592 private long headerEnd;
1593 private long start;
1594 private long end;
1595 private int keyLength;
1596 private int recordLength;
1597
1598 private boolean decompress;
1599 private boolean blockCompressed;
1600
1601 private Configuration conf;
1602
1603 private int noBufferedRecords = 0;
1604 private boolean lazyDecompress = true;
1605 private boolean valuesDecompressed = true;
1606
1607 private int noBufferedKeys = 0;
1608 private int noBufferedValues = 0;
1609
1610 private DataInputBuffer keyLenBuffer = null;
1611 private CompressionInputStream keyLenInFilter = null;
1612 private DataInputStream keyLenIn = null;
1613 private Decompressor keyLenDecompressor = null;
1614 private DataInputBuffer keyBuffer = null;
1615 private CompressionInputStream keyInFilter = null;
1616 private DataInputStream keyIn = null;
1617 private Decompressor keyDecompressor = null;
1618
1619 private DataInputBuffer valLenBuffer = null;
1620 private CompressionInputStream valLenInFilter = null;
1621 private DataInputStream valLenIn = null;
1622 private Decompressor valLenDecompressor = null;
1623 private DataInputBuffer valBuffer = null;
1624 private CompressionInputStream valInFilter = null;
1625 private DataInputStream valIn = null;
1626 private Decompressor valDecompressor = null;
1627
1628 private Deserializer keyDeserializer;
1629 private Deserializer valDeserializer;
1630
    /**
     * A tag interface for all of the Reader options.
     */
    public static interface Option {}
1635
    /**
     * Create an option to specify the path name of the sequence file
     * (mutually exclusive with {@link #stream}).
     * @param value the path to read
     * @return a new option
     */
    public static Option file(Path value) {
      return new FileOption(value);
    }
1644
    /**
     * Create an option to read from an already-open stream (mutually
     * exclusive with {@link #file}).
     * @param value the stream to read.
     * @return a new option
     */
    public static Option stream(FSDataInputStream value) {
      return new InputStreamOption(value);
    }
1653
    /**
     * Create an option to specify the starting byte to read from.
     * @param value the number of bytes to skip over
     * @return a new option
     */
    public static Option start(long value) {
      return new StartOption(value);
    }
1662
    /**
     * Create an option to specify the number of bytes to read.
     * @param value the number of bytes to read
     * @return a new option
     */
    public static Option length(long value) {
      return new LengthOption(value);
    }
1671
    /**
     * Create an option with the buffer size for reading the given pathname
     * (only valid together with {@link #file}).
     * @param value the number of bytes to buffer
     * @return a new option
     */
    public static Option bufferSize(int value) {
      return new BufferSizeOption(value);
    }
1680
    /** Option carrying the path of the file to read. */
    private static class FileOption extends Options.PathOption
                                    implements Option {
      private FileOption(Path value) {
        super(value);
      }
    }
1687
    /** Option carrying an already-open input stream to read from. */
    private static class InputStreamOption
        extends Options.FSDataInputStreamOption
        implements Option {
      private InputStreamOption(FSDataInputStream value) {
        super(value);
      }
    }
1695
    /** Option carrying the starting byte offset. */
    private static class StartOption extends Options.LongOption
                                     implements Option {
      private StartOption(long value) {
        super(value);
      }
    }
1702
    /** Option carrying the number of bytes to read. */
    private static class LengthOption extends Options.LongOption
                                      implements Option {
      private LengthOption(long value) {
        super(value);
      }
    }
1709
    /** Option carrying the read buffer size (file option only). */
    private static class BufferSizeOption extends Options.IntegerOption
                                          implements Option {
      private BufferSizeOption(int value) {
        super(value);
      }
    }
1716
    // only used directly (internal; no public factory method exposes it) —
    // requests that only the file header be parsed
    private static class OnlyHeaderOption extends Options.BooleanOption
                                          implements Option {
      private OnlyHeaderOption() {
        super(true);
      }
    }
1724
    /** Construct a reader from a set of options. Exactly one of
     * {@code file} or {@code stream} must be given; {@code bufferSize}
     * is only valid together with {@code file}.
     * @param conf the configuration to use
     * @param opts the reader options
     * @throws IOException if the file cannot be opened or is not a
     *         SequenceFile
     */
    public Reader(Configuration conf, Option... opts) throws IOException {
      // Look up the options, these are null if not set
      FileOption fileOpt = Options.getOption(FileOption.class, opts);
      InputStreamOption streamOpt =
        Options.getOption(InputStreamOption.class, opts);
      StartOption startOpt = Options.getOption(StartOption.class, opts);
      LengthOption lenOpt = Options.getOption(LengthOption.class, opts);
      BufferSizeOption bufOpt = Options.getOption(BufferSizeOption.class,opts);
      OnlyHeaderOption headerOnly =
        Options.getOption(OnlyHeaderOption.class, opts);
      // check for consistency: exactly one of file/stream
      if ((fileOpt == null) == (streamOpt == null)) {
        throw new
          IllegalArgumentException("File or stream option must be specified");
      }
      if (fileOpt == null && bufOpt != null) {
        throw new IllegalArgumentException("buffer size can only be set when" +
                                           " a file is specified.");
      }
      // figure out the real values
      Path filename = null;
      FSDataInputStream file;
      final long len;
      if (fileOpt != null) {
        filename = fileOpt.getValue();
        FileSystem fs = filename.getFileSystem(conf);
        int bufSize = bufOpt == null ? getBufferSize(conf): bufOpt.getValue();
        // default length: the whole file
        len = null == lenOpt
          ? fs.getFileStatus(filename).getLen()
          : lenOpt.getValue();
        file = openFile(fs, filename, bufSize, len);
      } else {
        len = null == lenOpt ? Long.MAX_VALUE : lenOpt.getValue();
        file = streamOpt.getValue();
      }
      long start = startOpt == null ? 0 : startOpt.getValue();
      // really set up
      initialize(filename, file, start, len, conf, headerOnly != null);
    }
1764
    /**
     * Construct a reader by opening a file from the given file system.
     * @param fs The file system used to open the file.
     * @param file The file being read.
     * @param conf Configuration
     * @throws IOException
     * @deprecated Use Reader(Configuration, Option...) instead.
     */
    @Deprecated
    public Reader(FileSystem fs, Path file,
                  Configuration conf) throws IOException {
      // qualify the path against fs so the right filesystem is used
      this(conf, file(file.makeQualified(fs)));
    }
1778
    /**
     * Construct a reader by the given input stream.
     * @param in An input stream.
     * @param buffersize unused
     * @param start The starting position.
     * @param length The length being read.
     * @param conf Configuration
     * @throws IOException
     * @deprecated Use Reader(Configuration, Reader.Option...) instead.
     */
    @Deprecated
    public Reader(FSDataInputStream in, int buffersize,
        long start, long length, Configuration conf) throws IOException {
      // buffersize is intentionally ignored: the stream is already open
      this(conf, stream(in), start(start), length(length));
    }
1794
    /** Common work of the constructors: record stream/filename state,
     * seek to the start, compute the end offset, and parse the header.
     * Closes the stream if any step fails. */
    private void initialize(Path filename, FSDataInputStream in,
                            long start, long length, Configuration conf,
                            boolean tempReader) throws IOException {
      if (in == null) {
        throw new IllegalArgumentException("in == null");
      }
      this.filename = filename == null ? "<unknown>" : filename.toString();
      this.in = in;
      this.conf = conf;
      boolean succeeded = false;
      this.start = start;
      try {
        seek(start);
        this.end = this.in.getPos() + length;
        // if it wrapped around (start + length overflowed), use the max
        if (end < length) {
          end = Long.MAX_VALUE;
        }
        init(tempReader);
        succeeded = true;
      } finally {
        // don't leak the stream when header parsing fails
        if (!succeeded) {
          IOUtils.cleanup(LOG, this.in);
        }
      }
    }
1822
    /**
     * Override this method to specialize the type of
     * {@link FSDataInputStream} returned.
     * @param fs The file system used to open the file.
     * @param file The file being read.
     * @param bufferSize The buffer size used to read the file.
     * @param length The length being read if it is &gt;= 0. Otherwise,
     *               the length is not available.
     * @return The opened stream.
     * @throws IOException
     */
    protected FSDataInputStream openFile(FileSystem fs, Path file,
        int bufferSize, long length) throws IOException {
      return fs.open(file, bufferSize);
    }
1838
1839 /**
1840 * Initialize the {@link Reader}
1841 * @param tmpReader <code>true</code> if we are constructing a temporary
1842 * reader {@link SequenceFile.Sorter.cloneFileAttributes},
1843 * and hence do not initialize every component;
1844 * <code>false</code> otherwise.
1845 * @throws IOException
1846 */
1847 private void init(boolean tempReader) throws IOException {
1848 byte[] versionBlock = new byte[VERSION.length];
1849 in.readFully(versionBlock);
1850
1851 if ((versionBlock[0] != VERSION[0]) ||
1852 (versionBlock[1] != VERSION[1]) ||
1853 (versionBlock[2] != VERSION[2]))
1854 throw new IOException(this + " not a SequenceFile");
1855
1856 // Set 'version'
1857 version = versionBlock[3];
1858 if (version > VERSION[3])
1859 throw new VersionMismatchException(VERSION[3], version);
1860
1861 if (version < BLOCK_COMPRESS_VERSION) {
1862 UTF8 className = new UTF8();
1863
1864 className.readFields(in);
1865 keyClassName = className.toStringChecked(); // key class name
1866
1867 className.readFields(in);
1868 valClassName = className.toStringChecked(); // val class name
1869 } else {
1870 keyClassName = Text.readString(in);
1871 valClassName = Text.readString(in);
1872 }
1873
1874 if (version > 2) { // if version > 2
1875 this.decompress = in.readBoolean(); // is compressed?
1876 } else {
1877 decompress = false;
1878 }
1879
1880 if (version >= BLOCK_COMPRESS_VERSION) { // if version >= 4
1881 this.blockCompressed = in.readBoolean(); // is block-compressed?
1882 } else {
1883 blockCompressed = false;
1884 }
1885
1886 // if version >= 5
1887 // setup the compression codec
1888 if (decompress) {
1889 if (version >= CUSTOM_COMPRESS_VERSION) {
1890 String codecClassname = Text.readString(in);
1891 try {
1892 Class<? extends CompressionCodec> codecClass
1893 = conf.getClassByName(codecClassname).asSubclass(CompressionCodec.class);
1894 this.codec = ReflectionUtils.newInstance(codecClass, conf);
1895 } catch (ClassNotFoundException cnfe) {
1896 throw new IllegalArgumentException("Unknown codec: " +
1897 codecClassname, cnfe);
1898 }
1899 } else {
1900 codec = new DefaultCodec();
1901 ((Configurable)codec).setConf(conf);
1902 }
1903 }
1904
1905 this.metadata = new Metadata();
1906 if (version >= VERSION_WITH_METADATA) { // if version >= 6
1907 this.metadata.readFields(in);
1908 }
1909
1910 if (version > 1) { // if version > 1
1911 in.readFully(sync); // read sync bytes
1912 headerEnd = in.getPos(); // record end of header
1913 }
1914
1915 // Initialize... *not* if this we are constructing a temporary Reader
1916 if (!tempReader) {
1917 valBuffer = new DataInputBuffer();
1918 if (decompress) {
1919 valDecompressor = CodecPool.getDecompressor(codec);
1920 valInFilter = codec.createInputStream(valBuffer, valDecompressor);
1921 valIn = new DataInputStream(valInFilter);
1922 } else {
1923 valIn = valBuffer;
1924 }
1925
1926 if (blockCompressed) {
1927 keyLenBuffer = new DataInputBuffer();
1928 keyBuffer = new DataInputBuffer();
1929 valLenBuffer = new DataInputBuffer();
1930
1931 keyLenDecompressor = CodecPool.getDecompressor(codec);
1932 keyLenInFilter = codec.createInputStream(keyLenBuffer,
1933 keyLenDecompressor);
1934 keyLenIn = new DataInputStream(keyLenInFilter);
1935
1936 keyDecompressor = CodecPool.getDecompressor(codec);
1937 keyInFilter = codec.createInputStream(keyBuffer, keyDecompressor);
1938 keyIn = new DataInputStream(keyInFilter);
1939
1940 valLenDecompressor = CodecPool.getDecompressor(codec);
1941 valLenInFilter = codec.createInputStream(valLenBuffer,
1942 valLenDecompressor);
1943 valLenIn = new DataInputStream(valLenInFilter);
1944 }
1945
1946 SerializationFactory serializationFactory =
1947 new SerializationFactory(conf);
1948 this.keyDeserializer =
1949 getDeserializer(serializationFactory, getKeyClass());
1950 if (this.keyDeserializer == null) {
1951 throw new IOException(
1952 "Could not find a deserializer for the Key class: '"
1953 + getKeyClass().getCanonicalName() + "'. "
1954 + "Please ensure that the configuration '" +
1955 CommonConfigurationKeys.IO_SERIALIZATIONS_KEY + "' is "
1956 + "properly configured, if you're using "
1957 + "custom serialization.");
1958 }
1959 if (!blockCompressed) {
1960 this.keyDeserializer.open(valBuffer);
1961 } else {
1962 this.keyDeserializer.open(keyIn);
1963 }
1964 this.valDeserializer =
1965 getDeserializer(serializationFactory, getValueClass());
1966 if (this.valDeserializer == null) {
1967 throw new IOException(
1968 "Could not find a deserializer for the Value class: '"
1969 + getValueClass().getCanonicalName() + "'. "
1970 + "Please ensure that the configuration '" +
1971 CommonConfigurationKeys.IO_SERIALIZATIONS_KEY + "' is "
1972 + "properly configured, if you're using "
1973 + "custom serialization.");
1974 }
1975 this.valDeserializer.open(valIn);
1976 }
1977 }
1978
    // Raw-typed helper: asks the configured SerializationFactory for a
    // Deserializer that can handle class c.  May return null when no
    // registered serialization matches (callers check for that).
    @SuppressWarnings("unchecked")
    private Deserializer getDeserializer(SerializationFactory sf, Class c) {
      return sf.getDeserializer(c);
    }
1983
1984 /** Close the file. */
1985 @Override
1986 public synchronized void close() throws IOException {
1987 // Return the decompressors to the pool
1988 CodecPool.returnDecompressor(keyLenDecompressor);
1989 CodecPool.returnDecompressor(keyDecompressor);
1990 CodecPool.returnDecompressor(valLenDecompressor);
1991 CodecPool.returnDecompressor(valDecompressor);
1992 keyLenDecompressor = keyDecompressor = null;
1993 valLenDecompressor = valDecompressor = null;
1994
1995 if (keyDeserializer != null) {
1996 keyDeserializer.close();
1997 }
1998 if (valDeserializer != null) {
1999 valDeserializer.close();
2000 }
2001
2002 try {
2003 in.adviseFile(FadviseType.FILE_DONTNEED, start, end);
2004 } catch (IOException ioe) {
2005 if (LOG.isInfoEnabled()) {
2006 LOG.info("Error in fadvise. Ignoring it.", ioe);
2007 }
2008 }
2009
2010 // Close the input-stream
2011 in.close();
2012 }
2013
    /** Returns the name of the key class, as recorded in the file header. */
    public String getKeyClassName() {
      return keyClassName;
    }
2018
2019 /** Returns the class of keys in this file. */
2020 public synchronized Class<?> getKeyClass() {
2021 if (null == keyClass) {
2022 try {
2023 keyClass = WritableName.getClass(getKeyClassName(), conf);
2024 } catch (IOException e) {
2025 throw new RuntimeException(e);
2026 }
2027 }
2028 return keyClass;
2029 }
2030
    /** Returns the name of the value class, as recorded in the file header. */
    public String getValueClassName() {
      return valClassName;
    }
2035
2036 /** Returns the class of values in this file. */
2037 public synchronized Class<?> getValueClass() {
2038 if (null == valClass) {
2039 try {
2040 valClass = WritableName.getClass(getValueClassName(), conf);
2041 } catch (IOException e) {
2042 throw new RuntimeException(e);
2043 }
2044 }
2045 return valClass;
2046 }
2047
    /** Returns true if values are compressed (record or block compression). */
    public boolean isCompressed() { return decompress; }
2050
    /** Returns true if records are block-compressed (whole blocks of
     *  keys/values compressed together rather than per record). */
    public boolean isBlockCompressed() { return blockCompressed; }
2053
    /** Returns the compression codec of data in this file, or null when
     *  the file is uncompressed. */
    public CompressionCodec getCompressionCodec() { return codec; }
2056
2057 /**
2058 * Get the compression type for this file.
2059 * @return the compression type
2060 */
2061 public CompressionType getCompressionType() {
2062 if (decompress) {
2063 return blockCompressed ? CompressionType.BLOCK : CompressionType.RECORD;
2064 } else {
2065 return CompressionType.NONE;
2066 }
2067 }
2068
    /** Returns the metadata object of the file (never null; empty when
     *  the file predates metadata support). */
    public Metadata getMetadata() {
      return this.metadata;
    }
2073
    /** Returns the configuration used for this file (package-private). */
    Configuration getConf() { return conf; }
2076
2077 /** Read a compressed buffer */
2078 private synchronized void readBuffer(DataInputBuffer buffer,
2079 CompressionInputStream filter) throws IOException {
2080 // Read data into a temporary buffer
2081 DataOutputBuffer dataBuffer = new DataOutputBuffer();
2082
2083 try {
2084 int dataBufferLength = WritableUtils.readVInt(in);
2085 dataBuffer.write(in, dataBufferLength);
2086
2087 // Set up 'buffer' connected to the input-stream
2088 buffer.reset(dataBuffer.getData(), 0, dataBuffer.getLength());
2089 } finally {
2090 dataBuffer.close();
2091 }
2092
2093 // Reset the codec
2094 filter.resetState();
2095 }
2096
    /** Read the next 'compressed' block: buffers the key-lengths and keys
     *  sections, and — unless decompressing lazily — the value-lengths and
     *  values sections too. */
    private synchronized void readBlock() throws IOException {
      // Check if we need to throw away a whole block of
      // 'values' due to 'lazy decompression'
      if (lazyDecompress && !valuesDecompressed) {
        // Skip the still-compressed value-length and value sections; each
        // is a vint byte-count followed by that many bytes.
        in.seek(WritableUtils.readVInt(in)+in.getPos());
        in.seek(WritableUtils.readVInt(in)+in.getPos());
      }

      // Reset internal states
      noBufferedKeys = 0; noBufferedValues = 0; noBufferedRecords = 0;
      valuesDecompressed = false;

      //Process sync
      if (sync != null) {
        in.readInt();                           // sync escape marker
        in.readFully(syncCheck);                // read syncCheck
        if (!Arrays.equals(sync, syncCheck))    // check it
          throw new IOException("File is corrupt!");
      }
      syncSeen = true;

      // Read number of records in this block
      noBufferedRecords = WritableUtils.readVInt(in);

      // Read key lengths and keys
      readBuffer(keyLenBuffer, keyLenInFilter);
      readBuffer(keyBuffer, keyInFilter);
      noBufferedKeys = noBufferedRecords;

      // Read value lengths and values
      if (!lazyDecompress) {
        readBuffer(valLenBuffer, valLenInFilter);
        readBuffer(valBuffer, valInFilter);
        noBufferedValues = noBufferedRecords;
        valuesDecompressed = true;
      }
    }
2135
    /**
     * Position valLenIn/valIn to the 'value'
     * corresponding to the 'current' key.
     * For block-compressed files this may trigger the deferred (lazy)
     * decompression of the value sections and skip past values of
     * already-consumed keys.
     */
    private synchronized void seekToCurrentValue() throws IOException {
      if (!blockCompressed) {
        if (decompress) {
          valInFilter.resetState();
        }
        valBuffer.reset();
      } else {
        // Check if this is the first value in the 'block' to be read
        if (lazyDecompress && !valuesDecompressed) {
          // Read the value lengths and values
          readBuffer(valLenBuffer, valLenInFilter);
          readBuffer(valBuffer, valInFilter);
          noBufferedValues = noBufferedRecords;
          valuesDecompressed = true;
        }

        // Calculate the no. of bytes to skip
        // Note: 'current' key has already been read (and noBufferedKeys
        // already decremented for it), hence the +1!
        int skipValBytes = 0;
        int currentKey = noBufferedKeys + 1;
        for (int i=noBufferedValues; i > currentKey; --i) {
          skipValBytes += WritableUtils.readVInt(valLenIn);
          --noBufferedValues;
        }

        // Skip to the 'val' corresponding to 'current' key
        if (skipValBytes > 0) {
          if (valIn.skipBytes(skipValBytes) != skipValBytes) {
            throw new IOException("Failed to seek to " + currentKey +
                                  "(th) value!");
          }
        }
      }
    }
2174
    /**
     * Get the 'value' corresponding to the last read 'key'.
     * @param val : The 'value' to be read; receives this reader's
     *              Configuration first if it is Configurable.
     * @throws IOException if the value occupies a different number of
     *         bytes than the record declares
     */
    public synchronized void getCurrentValue(Writable val)
      throws IOException {
      if (val instanceof Configurable) {
        ((Configurable) val).setConf(this.conf);
      }

      // Position stream to 'current' value
      seekToCurrentValue();

      if (!blockCompressed) {
        val.readFields(valIn);

        // A successful read() here means readFields() consumed fewer bytes
        // than the record holds for the value.
        if (valIn.read() > 0) {
          LOG.info("available bytes: " + valIn.available());
          throw new IOException(val+" read "+(valBuffer.getPosition()-keyLength)
                                + " bytes, should read " +
                                (valBuffer.getLength()-keyLength));
        }
      } else {
        // Get the value
        int valLength = WritableUtils.readVInt(valLenIn);
        val.readFields(valIn);

        // Read another compressed 'value'
        --noBufferedValues;

        // Sanity check
        // NOTE(review): guard tests valLength < 0 but the message says
        // "zero-length" — confirm which condition was intended.
        if ((valLength < 0) && LOG.isDebugEnabled()) {
          LOG.debug(val + " is a zero-length value");
        }
      }

    }
2213
    /**
     * Get the 'value' corresponding to the last read 'key', using the
     * configured value deserializer (serialization-framework variant of
     * {@link #getCurrentValue(Writable)}).
     * @param val : The 'value' to be read; may be reused or replaced by
     *              the deserializer, so use the returned object.
     * @throws IOException if the value occupies a different number of
     *         bytes than the record declares
     */
    public synchronized Object getCurrentValue(Object val)
      throws IOException {
      if (val instanceof Configurable) {
        ((Configurable) val).setConf(this.conf);
      }

      // Position stream to 'current' value
      seekToCurrentValue();

      if (!blockCompressed) {
        val = deserializeValue(val);

        // A successful read() here means deserialization consumed fewer
        // bytes than the record holds for the value.
        if (valIn.read() > 0) {
          LOG.info("available bytes: " + valIn.available());
          throw new IOException(val+" read "+(valBuffer.getPosition()-keyLength)
                                + " bytes, should read " +
                                (valBuffer.getLength()-keyLength));
        }
      } else {
        // Get the value
        int valLength = WritableUtils.readVInt(valLenIn);
        val = deserializeValue(val);

        // Read another compressed 'value'
        --noBufferedValues;

        // Sanity check
        // NOTE(review): guard tests valLength < 0 but the message says
        // "zero-length" — confirm which condition was intended.
        if ((valLength < 0) && LOG.isDebugEnabled()) {
          LOG.debug(val + " is a zero-length value");
        }
      }
      return val;

    }
2253
    // Raw-typed shim over the value deserializer; 'val' may be reused or
    // replaced depending on the serialization framework.
    @SuppressWarnings("unchecked")
    private Object deserializeValue(Object val) throws IOException {
      return valDeserializer.deserialize(val);
    }
2258
    /** Read the next key in the file into <code>key</code>, skipping its
     * value.  True if another entry exists, and false at end of file. */
    public synchronized boolean next(Writable key) throws IOException {
      if (key.getClass() != getKeyClass())
        throw new IOException("wrong key class: "+key.getClass().getName()
                              +" is not "+keyClass);

      if (!blockCompressed) {
        outBuf.reset();

        // next(outBuf) loads the whole record; the key occupies the first
        // keyLength bytes of it.
        keyLength = next(outBuf);
        if (keyLength < 0)
          return false;

        valBuffer.reset(outBuf.getData(), outBuf.getLength());

        key.readFields(valBuffer);
        valBuffer.mark(0);
        // The key must consume exactly keyLength bytes.
        if (valBuffer.getPosition() != keyLength)
          throw new IOException(key + " read " + valBuffer.getPosition()
                                + " bytes, should read " + keyLength);
      } else {
        //Reset syncSeen
        syncSeen = false;

        if (noBufferedKeys == 0) {
          try {
            readBlock();
          } catch (EOFException eof) {
            return false;
          }
        }

        int keyLength = WritableUtils.readVInt(keyLenIn);

        // Sanity check
        if (keyLength < 0) {
          return false;
        }

        //Read another compressed 'key'
        key.readFields(keyIn);
        --noBufferedKeys;
      }

      return true;
    }
2306
2307 /** Read the next key/value pair in the file into <code>key</code> and
2308 * <code>val</code>. Returns true if such a pair exists and false when at
2309 * end of file */
2310 public synchronized boolean next(Writable key, Writable val)
2311 throws IOException {
2312 if (val.getClass() != getValueClass())
2313 throw new IOException("wrong value class: "+val+" is not "+valClass);
2314
2315 boolean more = next(key);
2316
2317 if (more) {
2318 getCurrentValue(val);
2319 }
2320
2321 return more;
2322 }
2323
    /**
     * Read and return the next record length, potentially skipping over
     * a sync block.
     * @return the length of the next record or -1 if there is no next record
     * @throws IOException if the sync marker does not match the file's
     */
    private synchronized int readRecordLength() throws IOException {
      if (in.getPos() >= end) {
        return -1;
      }
      int length = in.readInt();
      // SYNC_ESCAPE in the length slot marks an interleaved sync entry.
      if (version > 1 && sync != null &&
          length == SYNC_ESCAPE) {              // process a sync entry
        in.readFully(syncCheck);                // read syncCheck
        if (!Arrays.equals(sync, syncCheck))    // check it
          throw new IOException("File is corrupt!");
        syncSeen = true;
        if (in.getPos() >= end) {
          return -1;
        }
        length = in.readInt();                  // re-read length
      } else {
        syncSeen = false;
      }

      return length;
    }
2351
    /** Read the next key/value pair in the file into <code>buffer</code>.
     * Returns the length of the key read, or -1 if at end of file.  The length
     * of the value may be computed by calling buffer.getLength() before and
     * after calls to this method. */
    /** @deprecated Call {@link #nextRaw(DataOutputBuffer,SequenceFile.ValueBytes)}. */
    @Deprecated
    synchronized int next(DataOutputBuffer buffer) throws IOException {
      // Unsupported for block-compressed sequence files
      if (blockCompressed) {
        throw new IOException("Unsupported call for block-compressed" +
                              " SequenceFiles - use SequenceFile.Reader.next(DataOutputStream, ValueBytes)");
      }
      try {
        int length = readRecordLength();
        if (length == -1) {
          return -1;
        }
        int keyLength = in.readInt();
        buffer.write(in, length);
        return keyLength;
      } catch (ChecksumException e) {           // checksum failure
        // handleChecksumException() either skips past the bad region or
        // rethrows; on skip, retry recursively from the next sync point.
        handleChecksumException(e);
        return next(buffer);
      }
    }
2377
2378 public ValueBytes createValueBytes() {
2379 ValueBytes val = null;
2380 if (!decompress || blockCompressed) {
2381 val = new UncompressedBytes();
2382 } else {
2383 val = new CompressedBytes(codec);
2384 }
2385 return val;
2386 }
2387
    /**
     * Read 'raw' records.
     * @param key - The buffer into which the key is read
     * @param val - The 'raw' value
     * @return Returns the total record length or -1 for end of file
     * @throws IOException
     */
    public synchronized int nextRaw(DataOutputBuffer key, ValueBytes val)
      throws IOException {
      if (!blockCompressed) {
        int length = readRecordLength();
        if (length == -1) {
          return -1;
        }
        int keyLength = in.readInt();
        int valLength = length - keyLength;
        key.write(in, keyLength);
        // Record-compressed values are captured still-compressed;
        // uncompressed ones are copied through verbatim.
        if (decompress) {
          CompressedBytes value = (CompressedBytes)val;
          value.reset(in, valLength);
        } else {
          UncompressedBytes value = (UncompressedBytes)val;
          value.reset(in, valLength);
        }

        return length;
      } else {
        //Reset syncSeen
        syncSeen = false;

        // Read 'key'
        if (noBufferedKeys == 0) {
          if (in.getPos() >= end)
            return -1;

          try {
            readBlock();
          } catch (EOFException eof) {
            return -1;
          }
        }
        // NOTE(review): guard is keyLength < 0 but the message says
        // "zero length" — confirm intended condition.
        int keyLength = WritableUtils.readVInt(keyLenIn);
        if (keyLength < 0) {
          throw new IOException("zero length key found!");
        }
        key.write(keyIn, keyLength);
        --noBufferedKeys;

        // Read raw 'value'
        seekToCurrentValue();
        int valLength = WritableUtils.readVInt(valLenIn);
        UncompressedBytes rawValue = (UncompressedBytes)val;
        rawValue.reset(valIn, valLength);
        --noBufferedValues;

        return (keyLength+valLength);
      }

    }
2447
    /**
     * Read 'raw' keys.
     * @param key - The buffer into which the key is read
     * @return Returns the key length or -1 for end of file
     * @throws IOException
     */
    public synchronized int nextRawKey(DataOutputBuffer key)
      throws IOException {
      if (!blockCompressed) {
        // Remember record/key lengths so nextRawValue() can derive the
        // value length later.
        recordLength = readRecordLength();
        if (recordLength == -1) {
          return -1;
        }
        keyLength = in.readInt();
        key.write(in, keyLength);
        return keyLength;
      } else {
        //Reset syncSeen
        syncSeen = false;

        // Read 'key'
        if (noBufferedKeys == 0) {
          if (in.getPos() >= end)
            return -1;

          try {
            readBlock();
          } catch (EOFException eof) {
            return -1;
          }
        }
        // NOTE(review): guard is keyLength < 0 but the message says
        // "zero length" — confirm intended condition.
        int keyLength = WritableUtils.readVInt(keyLenIn);
        if (keyLength < 0) {
          throw new IOException("zero length key found!");
        }
        key.write(keyIn, keyLength);
        --noBufferedKeys;

        return keyLength;
      }

    }
2490
    /** Read the next key in the file, skipping its value
     * (serialization-framework variant of {@link #next(Writable)}).
     * Return null at end of file. */
    public synchronized Object next(Object key) throws IOException {
      if (key != null && key.getClass() != getKeyClass()) {
        throw new IOException("wrong key class: "+key.getClass().getName()
                              +" is not "+keyClass);
      }

      if (!blockCompressed) {
        outBuf.reset();

        // next(outBuf) loads the whole record; the key occupies the first
        // keyLength bytes of it.
        keyLength = next(outBuf);
        if (keyLength < 0)
          return null;

        valBuffer.reset(outBuf.getData(), outBuf.getLength());

        key = deserializeKey(key);
        valBuffer.mark(0);
        // The key must consume exactly keyLength bytes.
        if (valBuffer.getPosition() != keyLength)
          throw new IOException(key + " read " + valBuffer.getPosition()
                                + " bytes, should read " + keyLength);
      } else {
        //Reset syncSeen
        syncSeen = false;

        if (noBufferedKeys == 0) {
          try {
            readBlock();
          } catch (EOFException eof) {
            return null;
          }
        }

        int keyLength = WritableUtils.readVInt(keyLenIn);

        // Sanity check
        if (keyLength < 0) {
          return null;
        }

        //Read another compressed 'key'
        key = deserializeKey(key);
        --noBufferedKeys;
      }

      return key;
    }
2539
    // Raw-typed shim over the key deserializer; 'key' may be reused or
    // replaced depending on the serialization framework.
    @SuppressWarnings("unchecked")
    private Object deserializeKey(Object key) throws IOException {
      return keyDeserializer.deserialize(key);
    }
2544
    /**
     * Read 'raw' values.  Must follow a call to {@link #nextRawKey}, which
     * records recordLength/keyLength for the non-block-compressed case.
     * @param val - The 'raw' value
     * @return Returns the value length
     * @throws IOException
     */
    public synchronized int nextRawValue(ValueBytes val)
      throws IOException {

      // Position stream to current value
      seekToCurrentValue();

      if (!blockCompressed) {
        int valLength = recordLength - keyLength;
        if (decompress) {
          CompressedBytes value = (CompressedBytes)val;
          value.reset(in, valLength);
        } else {
          UncompressedBytes value = (UncompressedBytes)val;
          value.reset(in, valLength);
        }

        return valLength;
      } else {
        int valLength = WritableUtils.readVInt(valLenIn);
        UncompressedBytes rawValue = (UncompressedBytes)val;
        rawValue.reset(valIn, valLength);
        --noBufferedValues;
        return valLength;
      }

    }
2577
2578 private void handleChecksumException(ChecksumException e)
2579 throws IOException {
2580 if (this.conf.getBoolean("io.skip.checksum.errors", false)) {
2581 LOG.warn("Bad checksum at "+getPosition()+". Skipping entries.");
2582 sync(getPosition()+this.conf.getInt("io.bytes.per.checksum", 512));
2583 } else {
2584 throw e;
2585 }
2586 }
2587
    /** Disables sync processing entirely. Often invoked for tmp files,
     *  which are written without sync markers. */
    synchronized void ignoreSync() {
      sync = null;
    }
2592
    /** Set the current byte position in the input file.
     *
     * <p>The position passed must be a position returned by {@link
     * SequenceFile.Writer#getLength()} when writing this file.  To seek to an arbitrary
     * position, use {@link SequenceFile.Reader#sync(long)}.
     */
    public synchronized void seek(long position) throws IOException {
      in.seek(position);
      if (blockCompressed) {                    // trigger block read
        // Discard buffered keys so the next read refills from 'position';
        // marking values decompressed prevents readBlock() from skipping
        // value sections that were never buffered.
        noBufferedKeys = 0;
        valuesDecompressed = true;
      }
    }
2606
    /** Seek to the next sync mark past a given position.*/
    public synchronized void sync(long position) throws IOException {
      // No room for a full sync marker before EOF: park at the end.
      if (position+SYNC_SIZE >= end) {
        seek(end);
        return;
      }

      if (position < headerEnd) {
        // seek directly to first record
        in.seek(headerEnd);
        // note the sync marker "seen" in the header
        syncSeen = true;
        return;
      }

      try {
        seek(position+4);                       // skip escape
        in.readFully(syncCheck);
        int syncLen = sync.length;
        // Slide forward one byte at a time, using syncCheck as a circular
        // buffer, until its contents match the file's sync marker.
        for (int i = 0; in.getPos() < end; i++) {
          int j = 0;
          for (; j < syncLen; j++) {
            if (sync[j] != syncCheck[(i+j)%syncLen])
              break;
          }
          if (j == syncLen) {
            in.seek(in.getPos() - SYNC_SIZE);   // position before sync
            return;
          }
          syncCheck[i%syncLen] = in.readByte();
        }
      } catch (ChecksumException e) {           // checksum failure
        handleChecksumException(e);
      }
    }
2642
    /** Returns true iff the previous call to next passed a sync mark.*/
    public synchronized boolean syncSeen() { return syncSeen; }
2645
    /** Return the current byte position in the input file. */
    public synchronized long getPosition() throws IOException {
      return in.getPos();
    }
2650
    /** Returns the name of the file being read. */
    @Override
    public String toString() {
      return filename;
    }
2656
2657 }
2658
2659 /** Sorts key/value pairs in a sequence-format file.
2660 *
2661 * <p>For best performance, applications should make sure that the {@link
2662 * Writable#readFields(DataInput)} implementation of their keys is
2663 * very efficient. In particular, it should avoid allocating memory.
2664 */
2665 public static class Sorter {
2666
    // Sort/merge configuration and per-run state.
    private RawComparator comparator;           // orders raw keys

    private MergeSort mergeSort; //the implementation of merge sort

    private Path[] inFiles;                     // when merging or sorting

    private Path outFile;

    private int memory; // bytes
    private int factor; // merged per pass

    private FileSystem fs = null;

    private Class keyClass;
    private Class valClass;

    private Configuration conf;
    private Metadata metadata;

    private Progressable progressable = null;
2687
    /** Sort and merge files containing the named classes, comparing keys
     *  with the default {@link WritableComparator} for the key class. */
    public Sorter(FileSystem fs, Class<? extends WritableComparable> keyClass,
                  Class valClass, Configuration conf)  {
      this(fs, WritableComparator.get(keyClass), keyClass, valClass, conf);
    }
2693
    /** Sort and merge using an arbitrary {@link RawComparator} and empty
     *  output metadata. */
    public Sorter(FileSystem fs, RawComparator comparator, Class keyClass,
                  Class valClass, Configuration conf) {
      this(fs, comparator, keyClass, valClass, conf, new Metadata());
    }
2699
    /** Sort and merge using an arbitrary {@link RawComparator}.
     *  Buffer memory defaults to io.sort.mb (100 MB) and the merge factor
     *  to io.sort.factor (100 streams per pass). */
    public Sorter(FileSystem fs, RawComparator comparator, Class keyClass,
                  Class valClass, Configuration conf, Metadata metadata) {
      this.fs = fs;
      this.comparator = comparator;
      this.keyClass = keyClass;
      this.valClass = valClass;
      this.memory = conf.getInt("io.sort.mb", 100) * 1024 * 1024;
      this.factor = conf.getInt("io.sort.factor", 100);
      this.conf = conf;
      this.metadata = metadata;
    }
2712
    /** Set the number of streams to merge at once.*/
    public void setFactor(int factor) { this.factor = factor; }
2715
    /** Get the number of streams to merge at once.*/
    public int getFactor() { return factor; }
2718
    /** Set the total amount of buffer memory, in bytes.*/
    public void setMemory(int memory) { this.memory = memory; }
2721
    /** Get the total amount of buffer memory, in bytes.*/
    public int getMemory() { return memory; }
2724
    /** Set the progressable object in order to report progress during
     *  long-running sort/merge passes. */
    public void setProgressable(Progressable progressable) {
      this.progressable = progressable;
    }
2729
    /**
     * Perform a file sort from a set of input files into an output file.
     * @param inFiles the files to be sorted
     * @param outFile the sorted output file; must not already exist
     * @param deleteInput should the input files be deleted as they are read?
     */
    public void sort(Path[] inFiles, Path outFile,
                     boolean deleteInput) throws IOException {
      if (fs.exists(outFile)) {
        throw new IOException("already exists: " + outFile);
      }

      this.inFiles = inFiles;
      this.outFile = outFile;

      // A single segment is already fully sorted into outFile; multiple
      // segments need a merge pass to combine them.
      int segments = sortPass(deleteInput);
      if (segments > 1) {
        mergePass(outFile.getParent());
      }
    }
2750
    /**
     * Perform a file sort from a set of input files and return an iterator.
     * @param inFiles the files to be sorted
     * @param tempDir the directory where temp files are created during sort
     * @param deleteInput should the input files be deleted as they are read?
     * @return iterator the RawKeyValueIterator, or null if the inputs were
     *         empty
     */
    public RawKeyValueIterator sortAndIterate(Path[] inFiles, Path tempDir,
                                              boolean deleteInput) throws IOException {
      // NOTE(review): builds the path by string concatenation;
      // new Path(tempDir, "all.2") would be the idiomatic equivalent.
      Path outFile = new Path(tempDir + Path.SEPARATOR + "all.2");
      if (fs.exists(outFile)) {
        throw new IOException("already exists: " + outFile);
      }
      this.inFiles = inFiles;
      //outFile will basically be used as prefix for temp files in the cases
      //where sort outputs multiple sorted segments. For the single segment
      //case, the outputFile itself will contain the sorted data for that
      //segment
      this.outFile = outFile;

      int segments = sortPass(deleteInput);
      if (segments > 1)
        return merge(outFile.suffix(".0"), outFile.suffix(".0.index"),
                     tempDir);
      else if (segments == 1)
        return merge(new Path[]{outFile}, true, tempDir);
      else return null;
    }
2779
    /**
     * The backwards compatible interface to sort.
     * @param inFile the input file to sort
     * @param outFile the sorted output file; the input is not deleted
     */
    public void sort(Path inFile, Path outFile) throws IOException {
      sort(new Path[]{inFile}, outFile, false);
    }
2788
    /** Run one sort pass over all inputs; returns the number of sorted
     *  segments written (see {@link SortPass#run}). */
    private int sortPass(boolean deleteInput) throws IOException {
      if(LOG.isDebugEnabled()) {
        LOG.debug("running sort pass");
      }
      SortPass sortPass = new SortPass();         // make the SortPass
      sortPass.setProgressable(progressable);
      mergeSort = new MergeSort(sortPass.new SeqFileComparator());
      try {
        return sortPass.run(deleteInput);         // run it
      } finally {
        sortPass.close();                         // close it
      }
    }
2802
2803 private class SortPass {
2804 private int memoryLimit = memory/4;
2805 private int recordLimit = 1000000;
2806
2807 private DataOutputBuffer rawKeys = new DataOutputBuffer();
2808 private byte[] rawBuffer;
2809
2810 private int[] keyOffsets = new int[1024];
2811 private int[] pointers = new int[keyOffsets.length];
2812 private int[] pointersCopy = new int[keyOffsets.length];
2813 private int[] keyLengths = new int[keyOffsets.length];
2814 private ValueBytes[] rawValues = new ValueBytes[keyOffsets.length];
2815
2816 private ArrayList segmentLengths = new ArrayList();
2817
2818 private Reader in = null;
2819 private FSDataOutputStream out = null;
2820 private FSDataOutputStream indexOut = null;
2821 private Path outName;
2822
2823 private Progressable progressable = null;
2824
2825 public int run(boolean deleteInput) throws IOException {
2826 int segments = 0;
2827 int currentFile = 0;
2828 boolean atEof = (currentFile >= inFiles.length);
2829 CompressionType compressionType;
2830 CompressionCodec codec = null;
2831 segmentLengths.clear();
2832 if (atEof) {
2833 return 0;
2834 }
2835
2836 // Initialize
2837 in = new Reader(fs, inFiles[currentFile], conf);
2838 compressionType = in.getCompressionType();
2839 codec = in.getCompressionCodec();
2840
2841 for (int i=0; i < rawValues.length; ++i) {
2842 rawValues[i] = null;
2843 }
2844
2845 while (!atEof) {
2846 int count = 0;
2847 int bytesProcessed = 0;
2848 rawKeys.reset();
2849 while (!atEof &&
2850 bytesProcessed < memoryLimit && count < recordLimit) {
2851
2852 // Read a record into buffer
2853 // Note: Attempt to re-use 'rawValue' as far as possible
2854 int keyOffset = rawKeys.getLength();
2855 ValueBytes rawValue =
2856 (count == keyOffsets.length || rawValues[count] == null) ?
2857 in.createValueBytes() :
2858 rawValues[count];
2859 int recordLength = in.nextRaw(rawKeys, rawValue);
2860 if (recordLength == -1) {
2861 in.close();
2862 if (deleteInput) {
2863 fs.delete(inFiles[currentFile], true);
2864 }
2865 currentFile += 1;
2866 atEof = currentFile >= inFiles.length;
2867 if (!atEof) {
2868 in = new Reader(fs, inFiles[currentFile], conf);
2869 } else {
2870 in = null;
2871 }
2872 continue;
2873 }
2874
2875 int keyLength = rawKeys.getLength() - keyOffset;
2876
2877 if (count == keyOffsets.length)
2878 grow();
2879
2880 keyOffsets[count] = keyOffset; // update pointers
2881 pointers[count] = count;
2882 keyLengths[count] = keyLength;
2883 rawValues[count] = rawValue;
2884
2885 bytesProcessed += recordLength;
2886 count++;
2887 }
2888
2889 // buffer is full -- sort & flush it
2890 if(LOG.isDebugEnabled()) {
2891 LOG.debug("flushing segment " + segments);
2892 }
2893 rawBuffer = rawKeys.getData();
2894 sort(count);
2895 // indicate we're making progress
2896 if (progressable != null) {
2897 progressable.progress();
2898 }
2899 flush(count, bytesProcessed, compressionType, codec,
2900 segments==0 && atEof);
2901 segments++;
2902 }
2903 return segments;
2904 }
2905
2906 public void close() throws IOException {
2907 if (in != null) {
2908 in.close();
2909 }
2910 if (out != null) {
2911 out.close();
2912 }
2913 if (indexOut != null) {
2914 indexOut.close();
2915 }
2916 }
2917
2918 private void grow() {
2919 int newLength = keyOffsets.length * 3 / 2;
2920 keyOffsets = grow(keyOffsets, newLength);
2921 pointers = grow(pointers, newLength);
2922 pointersCopy = new int[newLength];
2923 keyLengths = grow(keyLengths, newLength);
2924 rawValues = grow(rawValues, newLength);
2925 }
2926
2927 private int[] grow(int[] old, int newLength) {
2928 int[] result = new int[newLength];
2929 System.arraycopy(old, 0, result, 0, old.length);
2930 return result;
2931 }
2932
2933 private ValueBytes[] grow(ValueBytes[] old, int newLength) {
2934 ValueBytes[] result = new ValueBytes[newLength];
2935 System.arraycopy(old, 0, result, 0, old.length);
2936 for (int i=old.length; i < newLength; ++i) {
2937 result[i] = null;
2938 }
2939 return result;
2940 }
2941
2942 private void flush(int count, int bytesProcessed,
2943 CompressionType compressionType,
2944 CompressionCodec codec,
2945 boolean done) throws IOException {
2946 if (out == null) {
2947 outName = done ? outFile : outFile.suffix(".0");
2948 out = fs.create(outName);
2949 if (!done) {
2950 indexOut = fs.create(outName.suffix(".index"));
2951 }
2952 }
2953
2954 long segmentStart = out.getPos();
2955 Writer writer = createWriter(conf, Writer.stream(out),
2956 Writer.keyClass(keyClass), Writer.valueClass(valClass),
2957 Writer.compression(compressionType, codec),
2958 Writer.metadata(done ? metadata : new Metadata()));
2959
2960 if (!done) {
2961 writer.sync = null; // disable sync on temp files
2962 }
2963
2964 for (int i = 0; i < count; i++) { // write in sorted order
2965 int p = pointers[i];
2966 writer.appendRaw(rawBuffer, keyOffsets[p], keyLengths[p], rawValues[p]);
2967 }
2968 writer.close();
2969
2970 if (!done) {
2971 // Save the segment length
2972 WritableUtils.writeVLong(indexOut, segmentStart);
2973 WritableUtils.writeVLong(indexOut, (out.getPos()-segmentStart));
2974 indexOut.flush();
2975 }
2976 }
2977
2978 private void sort(int count) {
2979 System.arraycopy(pointers, 0, pointersCopy, 0, count);
2980 mergeSort.mergeSort(pointersCopy, pointers, 0, count);
2981 }
2982 class SeqFileComparator implements Comparator<IntWritable> {
2983 @Override
2984 public int compare(IntWritable I, IntWritable J) {
2985 return comparator.compare(rawBuffer, keyOffsets[I.get()],
2986 keyLengths[I.get()], rawBuffer,
2987 keyOffsets[J.get()], keyLengths[J.get()]);
2988 }
2989 }
2990
/** Set the progressable object in order to report progress.
 * @param progressable the object whose {@code progress()} is invoked
 *        during long-running sort/merge work
 */
public void setProgressable(Progressable progressable)
{
  this.progressable = progressable;
}
2996
2997 } // SequenceFile.Sorter.SortPass
2998
2999 /** The interface to iterate over raw keys/values of SequenceFiles. */
/** The interface to iterate over raw keys/values of SequenceFiles. */
public static interface RawKeyValueIterator {
  /** Gets the current raw key. Valid only after {@link #next()} has
   * returned true.
   * @return a DataOutputBuffer holding the current raw key bytes
   * @throws IOException
   */
  DataOutputBuffer getKey() throws IOException;
  /** Gets the current raw value. Valid only after {@link #next()} has
   * returned true.
   * @return a ValueBytes holding the current raw value
   * @throws IOException
   */
  ValueBytes getValue() throws IOException;
  /** Sets up the current key and value (for getKey and getValue).
   * @return true if there exists a key/value, false otherwise
   * @throws IOException
   */
  boolean next() throws IOException;
  /** Closes the iterator so that the underlying streams can be closed.
   * @throws IOException
   */
  void close() throws IOException;
  /** Gets the Progress object; this has a float (0.0 - 1.0)
   * indicating the bytes processed by the iterator so far
   */
  Progress getProgress();
}
3025
3026 /**
3027 * Merges the list of segments of type <code>SegmentDescriptor</code>
3028 * @param segments the list of SegmentDescriptors
3029 * @param tmpDir the directory to write temporary files into
3030 * @return RawKeyValueIterator
3031 * @throws IOException
3032 */
/**
 * Merges the list of segments of type <code>SegmentDescriptor</code>
 * @param segments the list of SegmentDescriptors
 * @param tmpDir the directory to write temporary files into
 * @return RawKeyValueIterator over the merged output
 * @throws IOException
 */
public RawKeyValueIterator merge(List <SegmentDescriptor> segments,
                                 Path tmpDir)
  throws IOException {
  // pass in object to report progress, if present
  MergeQueue mQueue = new MergeQueue(segments, tmpDir, progressable);
  return mQueue.merge();
}
3040
3041 /**
3042 * Merges the contents of files passed in Path[] using a max factor value
3043 * that is already set
3044 * @param inNames the array of path names
3045 * @param deleteInputs true if the input files should be deleted when
3046 * unnecessary
3047 * @param tmpDir the directory to write temporary files into
3048 * @return RawKeyValueIteratorMergeQueue
3049 * @throws IOException
3050 */
3051 public RawKeyValueIterator merge(Path [] inNames, boolean deleteInputs,
3052 Path tmpDir)
3053 throws IOException {
3054 return merge(inNames, deleteInputs,
3055 (inNames.length < factor) ? inNames.length : factor,
3056 tmpDir);
3057 }
3058
3059 /**
3060 * Merges the contents of files passed in Path[]
3061 * @param inNames the array of path names
3062 * @param deleteInputs true if the input files should be deleted when
3063 * unnecessary
3064 * @param factor the factor that will be used as the maximum merge fan-in
3065 * @param tmpDir the directory to write temporary files into
3066 * @return RawKeyValueIteratorMergeQueue
3067 * @throws IOException
3068 */
3069 public RawKeyValueIterator merge(Path [] inNames, boolean deleteInputs,
3070 int factor, Path tmpDir)
3071 throws IOException {
3072 //get the segments from inNames
3073 ArrayList <SegmentDescriptor> a = new ArrayList <SegmentDescriptor>();
3074 for (int i = 0; i < inNames.length; i++) {
3075 SegmentDescriptor s = new SegmentDescriptor(0,
3076 fs.getFileStatus(inNames[i]).getLen(), inNames[i]);
3077 s.preserveInput(!deleteInputs);
3078 s.doSync();
3079 a.add(s);
3080 }
3081 this.factor = factor;
3082 MergeQueue mQueue = new MergeQueue(a, tmpDir, progressable);
3083 return mQueue.merge();
3084 }
3085
3086 /**
3087 * Merges the contents of files passed in Path[]
3088 * @param inNames the array of path names
3089 * @param tempDir the directory for creating temp files during merge
3090 * @param deleteInputs true if the input files should be deleted when
3091 * unnecessary
3092 * @return RawKeyValueIteratorMergeQueue
3093 * @throws IOException
3094 */
3095 public RawKeyValueIterator merge(Path [] inNames, Path tempDir,
3096 boolean deleteInputs)
3097 throws IOException {
3098 //outFile will basically be used as prefix for temp files for the
3099 //intermediate merge outputs
3100 this.outFile = new Path(tempDir + Path.SEPARATOR + "merged");
3101 //get the segments from inNames
3102 ArrayList <SegmentDescriptor> a = new ArrayList <SegmentDescriptor>();
3103 for (int i = 0; i < inNames.length; i++) {
3104 SegmentDescriptor s = new SegmentDescriptor(0,
3105 fs.getFileStatus(inNames[i]).getLen(), inNames[i]);
3106 s.preserveInput(!deleteInputs);
3107 s.doSync();
3108 a.add(s);
3109 }
3110 factor = (inNames.length < factor) ? inNames.length : factor;
3111 // pass in object to report progress, if present
3112 MergeQueue mQueue = new MergeQueue(a, tempDir, progressable);
3113 return mQueue.merge();
3114 }
3115
3116 /**
3117 * Clones the attributes (like compression of the input file and creates a
3118 * corresponding Writer
3119 * @param inputFile the path of the input file whose attributes should be
3120 * cloned
3121 * @param outputFile the path of the output file
3122 * @param prog the Progressable to report status during the file write
3123 * @return Writer
3124 * @throws IOException
3125 */
3126 public Writer cloneFileAttributes(Path inputFile, Path outputFile,
3127 Progressable prog) throws IOException {
3128 Reader reader = new Reader(conf,
3129 Reader.file(inputFile),
3130 new Reader.OnlyHeaderOption());
3131 CompressionType compress = reader.getCompressionType();
3132 CompressionCodec codec = reader.getCompressionCodec();
3133 reader.close();
3134
3135 Writer writer = createWriter(conf,
3136 Writer.file(outputFile),
3137 Writer.keyClass(keyClass),
3138 Writer.valueClass(valClass),
3139 Writer.compression(compress, codec),
3140 Writer.progressable(prog));
3141 return writer;
3142 }
3143
3144 /**
3145 * Writes records from RawKeyValueIterator into a file represented by the
3146 * passed writer
3147 * @param records the RawKeyValueIterator
3148 * @param writer the Writer created earlier
3149 * @throws IOException
3150 */
/**
 * Writes records from RawKeyValueIterator into a file represented by the
 * passed writer.
 * @param records the RawKeyValueIterator supplying raw key/value pairs
 * @param writer the Writer created earlier (not closed here; the caller
 *        retains ownership)
 * @throws IOException
 */
public void writeFile(RawKeyValueIterator records, Writer writer)
  throws IOException {
  while(records.next()) {
    writer.appendRaw(records.getKey().getData(), 0,
                     records.getKey().getLength(), records.getValue());
  }
  // Flush a sync marker after the last record.
  writer.sync();
}
3159
3160 /** Merge the provided files.
3161 * @param inFiles the array of input path names
3162 * @param outFile the final output file
3163 * @throws IOException
3164 */
3165 public void merge(Path[] inFiles, Path outFile) throws IOException {
3166 if (fs.exists(outFile)) {
3167 throw new IOException("already exists: " + outFile);
3168 }
3169 RawKeyValueIterator r = merge(inFiles, false, outFile.getParent());
3170 Writer writer = cloneFileAttributes(inFiles[0], outFile, null);
3171
3172 writeFile(r, writer);
3173
3174 writer.close();
3175 }
3176
3177 /** sort calls this to generate the final merged output */
3178 private int mergePass(Path tmpDir) throws IOException {
3179 if(LOG.isDebugEnabled()) {
3180 LOG.debug("running merge pass");
3181 }
3182 Writer writer = cloneFileAttributes(
3183 outFile.suffix(".0"), outFile, null);
3184 RawKeyValueIterator r = merge(outFile.suffix(".0"),
3185 outFile.suffix(".0.index"), tmpDir);
3186 writeFile(r, writer);
3187
3188 writer.close();
3189 return 0;
3190 }
3191
3192 /** Used by mergePass to merge the output of the sort
3193 * @param inName the name of the input file containing sorted segments
3194 * @param indexIn the offsets of the sorted segments
3195 * @param tmpDir the relative directory to store intermediate results in
3196 * @return RawKeyValueIterator
3197 * @throws IOException
3198 */
/** Used by mergePass to merge the output of the sort
 * @param inName the name of the input file containing sorted segments
 * @param indexIn the offsets of the sorted segments
 * @param tmpDir the relative directory to store intermediate results in
 * @return RawKeyValueIterator over the merged segments
 * @throws IOException
 */
private RawKeyValueIterator merge(Path inName, Path indexIn, Path tmpDir)
  throws IOException {
  //get the segments from indexIn
  //we create a SegmentContainer so that we can track segments belonging to
  //inName and delete inName as soon as we see that we have looked at all
  //the contained segments during the merge process & hence don't need
  //them anymore
  SegmentContainer container = new SegmentContainer(inName, indexIn);
  MergeQueue mQueue = new MergeQueue(container.getSegmentList(), tmpDir, progressable);
  return mQueue.merge();
}
3210
3211 /** This class implements the core of the merge logic */
3212 private class MergeQueue extends PriorityQueue
3213 implements RawKeyValueIterator {
3214 private boolean compress;
3215 private boolean blockCompress;
3216 private DataOutputBuffer rawKey = new DataOutputBuffer();
3217 private ValueBytes rawValue;
3218 private long totalBytesProcessed;
3219 private float progPerByte;
3220 private Progress mergeProgress = new Progress();
3221 private Path tmpDir;
3222 private Progressable progress = null; //handle to the progress reporting object
3223 private SegmentDescriptor minSegment;
3224
3225 //a TreeMap used to store the segments sorted by size (segment offset and
3226 //segment path name is used to break ties between segments of same sizes)
3227 private Map<SegmentDescriptor, Void> sortedSegmentSizes =
3228 new TreeMap<SegmentDescriptor, Void>();
3229
3230 @SuppressWarnings("unchecked")
3231 public void put(SegmentDescriptor stream) throws IOException {
3232 if (size() == 0) {
3233 compress = stream.in.isCompressed();
3234 blockCompress = stream.in.isBlockCompressed();
3235 } else if (compress != stream.in.isCompressed() ||
3236 blockCompress != stream.in.isBlockCompressed()) {
3237 throw new IOException("All merged files must be compressed or not.");
3238 }
3239 super.put(stream);
3240 }
3241
3242 /**
3243 * A queue of file segments to merge
3244 * @param segments the file segments to merge
3245 * @param tmpDir a relative local directory to save intermediate files in
3246 * @param progress the reference to the Progressable object
3247 */
3248 public MergeQueue(List <SegmentDescriptor> segments,
3249 Path tmpDir, Progressable progress) {
3250 int size = segments.size();
3251 for (int i = 0; i < size; i++) {
3252 sortedSegmentSizes.put(segments.get(i), null);
3253 }
3254 this.tmpDir = tmpDir;
3255 this.progress = progress;
3256 }
3257 @Override
3258 protected boolean lessThan(Object a, Object b) {
3259 // indicate we're making progress
3260 if (progress != null) {
3261 progress.progress();
3262 }
3263 SegmentDescriptor msa = (SegmentDescriptor)a;
3264 SegmentDescriptor msb = (SegmentDescriptor)b;
3265 return comparator.compare(msa.getKey().getData(), 0,
3266 msa.getKey().getLength(), msb.getKey().getData(), 0,
3267 msb.getKey().getLength()) < 0;
3268 }
3269 @Override
3270 public void close() throws IOException {
3271 SegmentDescriptor ms; // close inputs
3272 while ((ms = (SegmentDescriptor)pop()) != null) {
3273 ms.cleanup();
3274 }
3275 minSegment = null;
3276 }
3277 @Override
3278 public DataOutputBuffer getKey() throws IOException {
3279 return rawKey;
3280 }
3281 @Override
3282 public ValueBytes getValue() throws IOException {
3283 return rawValue;
3284 }
3285 @Override
3286 public boolean next() throws IOException {
3287 if (size() == 0)
3288 return false;
3289 if (minSegment != null) {
3290 //minSegment is non-null for all invocations of next except the first
3291 //one. For the first invocation, the priority queue is ready for use
3292 //but for the subsequent invocations, first adjust the queue
3293 adjustPriorityQueue(minSegment);
3294 if (size() == 0) {
3295 minSegment = null;
3296 return false;
3297 }
3298 }
3299 minSegment = (SegmentDescriptor)top();
3300 long startPos = minSegment.in.getPosition(); // Current position in stream
3301 //save the raw key reference
3302 rawKey = minSegment.getKey();
3303 //load the raw value. Re-use the existing rawValue buffer
3304 if (rawValue == null) {
3305 rawValue = minSegment.in.createValueBytes();
3306 }
3307 minSegment.nextRawValue(rawValue);
3308 long endPos = minSegment.in.getPosition(); // End position after reading value
3309 updateProgress(endPos - startPos);
3310 return true;
3311 }
3312
3313 @Override
3314 public Progress getProgress() {
3315 return mergeProgress;
3316 }
3317
3318 private void adjustPriorityQueue(SegmentDescriptor ms) throws IOException{
3319 long startPos = ms.in.getPosition(); // Current position in stream
3320 boolean hasNext = ms.nextRawKey();
3321 long endPos = ms.in.getPosition(); // End position after reading key
3322 updateProgress(endPos - startPos);
3323 if (hasNext) {
3324 adjustTop();
3325 } else {
3326 pop();
3327 ms.cleanup();
3328 }
3329 }
3330
3331 private void updateProgress(long bytesProcessed) {
3332 totalBytesProcessed += bytesProcessed;
3333 if (progPerByte > 0) {
3334 mergeProgress.set(totalBytesProcessed * progPerByte);
3335 }
3336 }
3337
3338 /** This is the single level merge that is called multiple times
3339 * depending on the factor size and the number of segments
3340 * @return RawKeyValueIterator
3341 * @throws IOException
3342 */
3343 public RawKeyValueIterator merge() throws IOException {
3344 //create the MergeStreams from the sorted map created in the constructor
3345 //and dump the final output to a file
3346 int numSegments = sortedSegmentSizes.size();
3347 int origFactor = factor;
3348 int passNo = 1;
3349 LocalDirAllocator lDirAlloc = new LocalDirAllocator("io.seqfile.local.dir");
3350 do {
3351 //get the factor for this pass of merge
3352 factor = getPassFactor(passNo, numSegments);
3353 List<SegmentDescriptor> segmentsToMerge =
3354 new ArrayList<SegmentDescriptor>();
3355 int segmentsConsidered = 0;
3356 int numSegmentsToConsider = factor;
3357 while (true) {
3358 //extract the smallest 'factor' number of segment pointers from the
3359 //TreeMap. Call cleanup on the empty segments (no key/value data)
3360 SegmentDescriptor[] mStream =
3361 getSegmentDescriptors(numSegmentsToConsider);
3362 for (int i = 0; i < mStream.length; i++) {
3363 if (mStream[i].nextRawKey()) {
3364 segmentsToMerge.add(mStream[i]);
3365 segmentsConsidered++;
3366 // Count the fact that we read some bytes in calling nextRawKey()
3367 updateProgress(mStream[i].in.getPosition());
3368 }
3369 else {
3370 mStream[i].cleanup();
3371 numSegments--; //we ignore this segment for the merge
3372 }
3373 }
3374 //if we have the desired number of segments
3375 //or looked at all available segments, we break
3376 if (segmentsConsidered == factor ||
3377 sortedSegmentSizes.size() == 0) {
3378 break;
3379 }
3380
3381 numSegmentsToConsider = factor - segmentsConsidered;
3382 }
3383 //feed the streams to the priority queue
3384 initialize(segmentsToMerge.size()); clear();
3385 for (int i = 0; i < segmentsToMerge.size(); i++) {
3386 put(segmentsToMerge.get(i));
3387 }
3388 //if we have lesser number of segments remaining, then just return the
3389 //iterator, else do another single level merge
3390 if (numSegments <= factor) {
3391 //calculate the length of the remaining segments. Required for
3392 //calculating the merge progress
3393 long totalBytes = 0;
3394 for (int i = 0; i < segmentsToMerge.size(); i++) {
3395 totalBytes += segmentsToMerge.get(i).segmentLength;
3396 }
3397 if (totalBytes != 0) //being paranoid
3398 progPerByte = 1.0f / (float)totalBytes;
3399 //reset factor to what it originally was
3400 factor = origFactor;
3401 return this;
3402 } else {
3403 //we want to spread the creation of temp files on multiple disks if
3404 //available under the space constraints
3405 long approxOutputSize = 0;
3406 for (SegmentDescriptor s : segmentsToMerge) {
3407 approxOutputSize += s.segmentLength +
3408 ChecksumFileSystem.getApproxChkSumLength(
3409 s.segmentLength);
3410 }
3411 Path tmpFilename =
3412 new Path(tmpDir, "intermediate").suffix("." + passNo);
3413
3414 Path outputFile = lDirAlloc.getLocalPathForWrite(
3415 tmpFilename.toString(),
3416 approxOutputSize, conf);
3417 if(LOG.isDebugEnabled()) {
3418 LOG.debug("writing intermediate results to " + outputFile);
3419 }
3420 Writer writer = cloneFileAttributes(
3421 fs.makeQualified(segmentsToMerge.get(0).segmentPathName),
3422 fs.makeQualified(outputFile), null);
3423 writer.sync = null; //disable sync for temp files
3424 writeFile(this, writer);
3425 writer.close();
3426
3427 //we finished one single level merge; now clean up the priority
3428 //queue
3429 this.close();
3430
3431 SegmentDescriptor tempSegment =
3432 new SegmentDescriptor(0,
3433 fs.getFileStatus(outputFile).getLen(), outputFile);
3434 //put the segment back in the TreeMap
3435 sortedSegmentSizes.put(tempSegment, null);
3436 numSegments = sortedSegmentSizes.size();
3437 passNo++;
3438 }
3439 //we are worried about only the first pass merge factor. So reset the
3440 //factor to what it originally was
3441 factor = origFactor;
3442 } while(true);
3443 }
3444
3445 //Hadoop-591
3446 public int getPassFactor(int passNo, int numSegments) {
3447 if (passNo > 1 || numSegments <= factor || factor == 1)
3448 return factor;
3449 int mod = (numSegments - 1) % (factor - 1);
3450 if (mod == 0)
3451 return factor;
3452 return mod + 1;
3453 }
3454
3455 /** Return (& remove) the requested number of segment descriptors from the
3456 * sorted map.
3457 */
3458 public SegmentDescriptor[] getSegmentDescriptors(int numDescriptors) {
3459 if (numDescriptors > sortedSegmentSizes.size())
3460 numDescriptors = sortedSegmentSizes.size();
3461 SegmentDescriptor[] SegmentDescriptors =
3462 new SegmentDescriptor[numDescriptors];
3463 Iterator iter = sortedSegmentSizes.keySet().iterator();
3464 int i = 0;
3465 while (i < numDescriptors) {
3466 SegmentDescriptors[i++] = (SegmentDescriptor)iter.next();
3467 iter.remove();
3468 }
3469 return SegmentDescriptors;
3470 }
3471 } // SequenceFile.Sorter.MergeQueue
3472
3473 /** This class defines a merge segment. This class can be subclassed to
3474 * provide a customized cleanup method implementation. In this
3475 * implementation, cleanup closes the file handle and deletes the file
3476 */
/** This class defines a merge segment. This class can be subclassed to
 * provide a customized cleanup method implementation. In this
 * implementation, cleanup closes the file handle and deletes the file
 */
public class SegmentDescriptor implements Comparable {

  long segmentOffset; //the start of the segment in the file
  long segmentLength; //the length of the segment
  Path segmentPathName; //the path name of the file containing the segment
  boolean ignoreSync = true; //set to true for temp files
  private Reader in = null;  // lazily opened by nextRawKey()
  private DataOutputBuffer rawKey = null; //this will hold the current key
  private boolean preserveInput = false; //delete input segment files?

  /** Constructs a segment
   * @param segmentOffset the offset of the segment in the file
   * @param segmentLength the length of the segment
   * @param segmentPathName the path name of the file containing the segment
   */
  public SegmentDescriptor (long segmentOffset, long segmentLength,
                            Path segmentPathName) {
    this.segmentOffset = segmentOffset;
    this.segmentLength = segmentLength;
    this.segmentPathName = segmentPathName;
  }

  /** Do the sync checks */
  public void doSync() {ignoreSync = false;}

  /** Whether to delete the files when no longer needed */
  public void preserveInput(boolean preserve) {
    preserveInput = preserve;
  }

  public boolean shouldPreserveInput() {
    return preserveInput;
  }

  @Override
  public int compareTo(Object o) {
    // Order primarily by segment length (smallest first), breaking ties
    // with offset and then path name so that distinct segments never
    // compare equal — required since these are keys in a TreeMap.
    SegmentDescriptor that = (SegmentDescriptor)o;
    if (this.segmentLength != that.segmentLength) {
      return (this.segmentLength < that.segmentLength ? -1 : 1);
    }
    if (this.segmentOffset != that.segmentOffset) {
      return (this.segmentOffset < that.segmentOffset ? -1 : 1);
    }
    return (this.segmentPathName.toString()).
      compareTo(that.segmentPathName.toString());
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof SegmentDescriptor)) {
      return false;
    }
    SegmentDescriptor that = (SegmentDescriptor)o;
    if (this.segmentLength == that.segmentLength &&
        this.segmentOffset == that.segmentOffset &&
        this.segmentPathName.toString().equals(
            that.segmentPathName.toString())) {
      return true;
    }
    return false;
  }

  @Override
  public int hashCode() {
    // NOTE(review): only segmentOffset feeds the hash, while equals also
    // compares length and path. Legal (equal objects still hash equal)
    // but weak — confirm before relying on hash-based collections.
    return 37 * 17 + (int) (segmentOffset^(segmentOffset>>>32));
  }

  /** Fills up the rawKey object with the key returned by the Reader
   * @return true if there is a key returned; false, otherwise
   * @throws IOException
   */
  public boolean nextRawKey() throws IOException {
    if (in == null) {
      // First call: open the reader positioned on this segment and
      // validate that its key/value classes match the sorter's.
      int bufferSize = getBufferSize(conf);
      Reader reader = new Reader(conf,
                                 Reader.file(segmentPathName),
                                 Reader.bufferSize(bufferSize),
                                 Reader.start(segmentOffset),
                                 Reader.length(segmentLength));

      //sometimes we ignore syncs especially for temp merge files
      if (ignoreSync) reader.ignoreSync();

      if (reader.getKeyClass() != keyClass)
        throw new IOException("wrong key class: " + reader.getKeyClass() +
                              " is not " + keyClass);
      if (reader.getValueClass() != valClass)
        throw new IOException("wrong value class: "+reader.getValueClass()+
                              " is not " + valClass);
      this.in = reader;
      rawKey = new DataOutputBuffer();
    }
    rawKey.reset();
    int keyLength =
      in.nextRawKey(rawKey);
    return (keyLength >= 0);
  }

  /** Fills up the passed rawValue with the value corresponding to the key
   * read earlier
   * @param rawValue the buffer to fill
   * @return the length of the value
   * @throws IOException
   */
  public int nextRawValue(ValueBytes rawValue) throws IOException {
    int valLength = in.nextRawValue(rawValue);
    return valLength;
  }

  /** Returns the stored rawKey */
  public DataOutputBuffer getKey() {
    return rawKey;
  }

  /** closes the underlying reader */
  private void close() throws IOException {
    this.in.close();
    this.in = null;
  }

  /** The default cleanup. Subclasses can override this with a custom
   * cleanup
   */
  public void cleanup() throws IOException {
    close();
    if (!preserveInput) {
      fs.delete(segmentPathName, true);
    }
  }
} // SequenceFile.Sorter.SegmentDescriptor
3607
3608 /** This class provisions multiple segments contained within a single
3609 * file
3610 */
3611 private class LinkedSegmentsDescriptor extends SegmentDescriptor {
3612
3613 SegmentContainer parentContainer = null;
3614
3615 /** Constructs a segment
3616 * @param segmentOffset the offset of the segment in the file
3617 * @param segmentLength the length of the segment
3618 * @param segmentPathName the path name of the file containing the segment
3619 * @param parent the parent SegmentContainer that holds the segment
3620 */
3621 public LinkedSegmentsDescriptor (long segmentOffset, long segmentLength,
3622 Path segmentPathName, SegmentContainer parent) {
3623 super(segmentOffset, segmentLength, segmentPathName);
3624 this.parentContainer = parent;
3625 }
3626 /** The default cleanup. Subclasses can override this with a custom
3627 * cleanup
3628 */
3629 @Override
3630 public void cleanup() throws IOException {
3631 super.close();
3632 if (super.shouldPreserveInput()) return;
3633 parentContainer.cleanup();
3634 }
3635
3636 @Override
3637 public boolean equals(Object o) {
3638 if (!(o instanceof LinkedSegmentsDescriptor)) {
3639 return false;
3640 }
3641 return super.equals(o);
3642 }
3643 } //SequenceFile.Sorter.LinkedSegmentsDescriptor
3644
3645 /** The class that defines a container for segments to be merged. Primarily
3646 * required to delete temp files as soon as all the contained segments
3647 * have been looked at */
3648 private class SegmentContainer {
3649 private int numSegmentsCleanedUp = 0; //track the no. of segment cleanups
3650 private int numSegmentsContained; //# of segments contained
3651 private Path inName; //input file from where segments are created
3652
3653 //the list of segments read from the file
3654 private ArrayList <SegmentDescriptor> segments =
3655 new ArrayList <SegmentDescriptor>();
3656 /** This constructor is there primarily to serve the sort routine that
3657 * generates a single output file with an associated index file */
3658 public SegmentContainer(Path inName, Path indexIn) throws IOException {
3659 //get the segments from indexIn
3660 FSDataInputStream fsIndexIn = fs.open(indexIn);
3661 long end = fs.getFileStatus(indexIn).getLen();
3662 while (fsIndexIn.getPos() < end) {
3663 long segmentOffset = WritableUtils.readVLong(fsIndexIn);
3664 long segmentLength = WritableUtils.readVLong(fsIndexIn);
3665 Path segmentName = inName;
3666 segments.add(new LinkedSegmentsDescriptor(segmentOffset,
3667 segmentLength, segmentName, this));
3668 }
3669 fsIndexIn.close();
3670 fs.delete(indexIn, true);
3671 numSegmentsContained = segments.size();
3672 this.inName = inName;
3673 }
3674
3675 public List <SegmentDescriptor> getSegmentList() {
3676 return segments;
3677 }
3678 public void cleanup() throws IOException {
3679 numSegmentsCleanedUp++;
3680 if (numSegmentsCleanedUp == numSegmentsContained) {
3681 fs.delete(inName, true);
3682 }
3683 }
3684 } //SequenceFile.Sorter.SegmentContainer
3685
3686 } // SequenceFile.Sorter
3687
3688 } // SequenceFile