001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.io;
020
021 import java.io.IOException;
022 import java.io.DataInput;
023 import java.io.DataOutput;
024 import java.nio.ByteBuffer;
025 import java.nio.CharBuffer;
026 import java.nio.charset.CharacterCodingException;
027 import java.nio.charset.Charset;
028 import java.nio.charset.CharsetDecoder;
029 import java.nio.charset.CharsetEncoder;
030 import java.nio.charset.CodingErrorAction;
031 import java.nio.charset.MalformedInputException;
032 import java.text.CharacterIterator;
033 import java.text.StringCharacterIterator;
034 import java.util.Arrays;
035
036 import org.apache.avro.reflect.Stringable;
037 import org.apache.hadoop.classification.InterfaceAudience;
038 import org.apache.hadoop.classification.InterfaceStability;
039 import org.apache.hadoop.classification.MapRModified;
040
041 /** This class stores text using standard UTF8 encoding. It provides methods
042 * to serialize, deserialize, and compare texts at byte level. The type of
043 * length is integer and is serialized using zero-compressed format. <p>In
044 * addition, it provides methods for string traversal without converting the
045 * byte array to a string. <p>Also includes utilities for
 * serializing/deserializing a string, coding/decoding a string, checking if a
047 * byte array contains valid UTF8 code, calculating the length of an encoded
048 * string.
049 */
050 @Stringable
051 @InterfaceAudience.Public
052 @InterfaceStability.Stable
053 @MapRModified(summary = "Improve map task performance - bug 13086")
054 public class Text extends BinaryComparable
055 implements WritableComparable<BinaryComparable>, HasRawComparablePrefix {
056
057 private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
058 new ThreadLocal<CharsetEncoder>() {
059 @Override
060 protected CharsetEncoder initialValue() {
061 return Charset.forName("UTF-8").newEncoder().
062 onMalformedInput(CodingErrorAction.REPORT).
063 onUnmappableCharacter(CodingErrorAction.REPORT);
064 }
065 };
066
067 private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
068 new ThreadLocal<CharsetDecoder>() {
069 @Override
070 protected CharsetDecoder initialValue() {
071 return Charset.forName("UTF-8").newDecoder().
072 onMalformedInput(CodingErrorAction.REPORT).
073 onUnmappableCharacter(CodingErrorAction.REPORT);
074 }
075 };
076
077 private static final byte [] EMPTY_BYTES = new byte[0];
078
079 private byte[] bytes;
080 private int length;
081
/** Construct an empty text backed by the shared zero-length array. */
public Text() {
  this.bytes = EMPTY_BYTES;
}

/**
 * Construct from a string.
 * @param string the value to store, see {@link #set(String)}
 */
public Text(String string) {
  this.set(string);
}

/**
 * Construct from another text.
 * @param utf8 the text whose contents are copied, see {@link #set(Text)}
 */
public Text(Text utf8) {
  this.set(utf8);
}

/**
 * Construct from a byte array.
 * @param utf8 the bytes to copy, see {@link #set(byte[])}
 */
public Text(byte[] utf8) {
  this.set(utf8);
}
102
103 /**
104 * Get a copy of the bytes that is exactly the length of the data.
105 * See {@link #getBytes()} for faster access to the underlying array.
106 */
107 public byte[] copyBytes() {
108 byte[] result = new byte[length];
109 System.arraycopy(bytes, 0, result, 0, length);
110 return result;
111 }
112
113 /**
114 * Returns the raw bytes; however, only data up to {@link #getLength()} is
115 * valid. Please use {@link #copyBytes()} if you
116 * need the returned array to be precisely the length of the data.
117 */
118 @Override
119 public byte[] getBytes() {
120 return bytes;
121 }
122
123 /** Returns the number of bytes in the byte array */
124 @Override
125 public int getLength() {
126 return length;
127 }
128
129 /**
130 * Returns the Unicode Scalar Value (32-bit integer value)
131 * for the character at <code>position</code>. Note that this
132 * method avoids using the converter or doing String instantiation
133 * @return the Unicode scalar value at position or -1
134 * if the position is invalid or points to a
135 * trailing byte
136 */
137 public int charAt(int position) {
138 if (position > this.length) return -1; // too long
139 if (position < 0) return -1; // duh.
140
141 ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
142 return bytesToCodePoint(bb.slice());
143 }
144
145 public int find(String what) {
146 return find(what, 0);
147 }
148
149 /**
150 * Finds any occurence of <code>what</code> in the backing
151 * buffer, starting as position <code>start</code>. The starting
152 * position is measured in bytes and the return value is in
153 * terms of byte position in the buffer. The backing buffer is
154 * not converted to a string for this operation.
155 * @return byte position of the first occurence of the search
156 * string in the UTF-8 buffer or -1 if not found
157 */
158 public int find(String what, int start) {
159 try {
160 ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
161 ByteBuffer tgt = encode(what);
162 byte b = tgt.get();
163 src.position(start);
164
165 while (src.hasRemaining()) {
166 if (b == src.get()) { // matching first byte
167 src.mark(); // save position in loop
168 tgt.mark(); // save position in target
169 boolean found = true;
170 int pos = src.position()-1;
171 while (tgt.hasRemaining()) {
172 if (!src.hasRemaining()) { // src expired first
173 tgt.reset();
174 src.reset();
175 found = false;
176 break;
177 }
178 if (!(tgt.get() == src.get())) {
179 tgt.reset();
180 src.reset();
181 found = false;
182 break; // no match
183 }
184 }
185 if (found) return pos;
186 }
187 }
188 return -1; // not found
189 } catch (CharacterCodingException e) {
190 // can't get here
191 e.printStackTrace();
192 return -1;
193 }
194 }
195 /** Set to contain the contents of a string.
196 */
197 public void set(String string) {
198 try {
199 ByteBuffer bb = encode(string, true);
200 bytes = bb.array();
201 length = bb.limit();
202 }catch(CharacterCodingException e) {
203 throw new RuntimeException("Should not have happened ", e);
204 }
205 }
206
207 /** Set to a utf8 byte array
208 */
209 public void set(byte[] utf8) {
210 set(utf8, 0, utf8.length);
211 }
212
213 /** copy a text. */
214 public void set(Text other) {
215 set(other.getBytes(), 0, other.getLength());
216 }
217
218 /**
219 * Set the Text to range of bytes
220 * @param utf8 the data to copy from
221 * @param start the first position of the new string
222 * @param len the number of bytes of the new string
223 */
224 public void set(byte[] utf8, int start, int len) {
225 setCapacity(len, false);
226 System.arraycopy(utf8, start, bytes, 0, len);
227 this.length = len;
228 }
229
230 /**
231 * Append a range of bytes to the end of the given text
232 * @param utf8 the data to copy from
233 * @param start the first position to append from utf8
234 * @param len the number of bytes to append
235 */
236 public void append(byte[] utf8, int start, int len) {
237 setCapacity(length + len, true);
238 System.arraycopy(utf8, start, bytes, length, len);
239 length += len;
240 }
241
242 /**
243 * Clear the string to empty.
244 *
245 * <em>Note</em>: For performance reasons, this call does not clear the
246 * underlying byte array that is retrievable via {@link #getBytes()}.
247 * In order to free the byte-array memory, call {@link #set(byte[])}
248 * with an empty byte array (For example, <code>new byte[0]</code>).
249 */
250 public void clear() {
251 length = 0;
252 }
253
254 /*
255 * Sets the capacity of this Text object to <em>at least</em>
256 * <code>len</code> bytes. If the current buffer is longer,
257 * then the capacity and existing content of the buffer are
258 * unchanged. If <code>len</code> is larger
259 * than the current capacity, the Text object's capacity is
260 * increased to match.
261 * @param len the number of bytes we need
262 * @param keepData should the old data be kept
263 */
264 private void setCapacity(int len, boolean keepData) {
265 if (bytes == null || bytes.length < len) {
266 if (bytes != null && keepData) {
267 bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
268 } else {
269 bytes = new byte[len];
270 }
271 }
272 }
273
274 /**
275 * Convert text back to string
276 * @see java.lang.Object#toString()
277 */
278 @Override
279 public String toString() {
280 try {
281 return decode(bytes, 0, length);
282 } catch (CharacterCodingException e) {
283 throw new RuntimeException("Should not have happened " , e);
284 }
285 }
286
287 /** deserialize
288 */
289 @Override
290 public void readFields(DataInput in) throws IOException {
291 int newLength = WritableUtils.readVInt(in);
292 setCapacity(newLength, false);
293 in.readFully(bytes, 0, newLength);
294 length = newLength;
295 }
296
297 public void readFields(DataInput in, int maxLength) throws IOException {
298 int newLength = WritableUtils.readVInt(in);
299 if (newLength < 0) {
300 throw new IOException("tried to deserialize " + newLength +
301 " bytes of data! newLength must be non-negative.");
302 } else if (newLength >= maxLength) {
303 throw new IOException("tried to deserialize " + newLength +
304 " bytes of data, but maxLength = " + maxLength);
305 }
306 setCapacity(newLength, false);
307 in.readFully(bytes, 0, newLength);
308 length = newLength;
309 }
310
311 /** Skips over one Text in the input. */
312 public static void skip(DataInput in) throws IOException {
313 int length = WritableUtils.readVInt(in);
314 WritableUtils.skipFully(in, length);
315 }
316
317 /** serialize
318 * write this object to out
319 * length uses zero-compressed encoding
320 * @see Writable#write(DataOutput)
321 */
322 @Override
323 public void write(DataOutput out) throws IOException {
324 WritableUtils.writeVInt(out, length);
325 out.write(bytes, 0, length);
326 }
327
328 public void write(DataOutput out, int maxLength) throws IOException {
329 if (length > maxLength) {
330 throw new IOException("data was too long to write! Expected " +
331 "less than or equal to " + maxLength + " bytes, but got " +
332 length + " bytes.");
333 }
334 WritableUtils.writeVInt(out, length);
335 out.write(bytes, 0, length);
336 }
337
338 /** Returns true iff <code>o</code> is a Text with the same contents. */
339 @Override
340 public boolean equals(Object o) {
341 if (o instanceof Text)
342 return super.equals(o);
343 return false;
344 }
345
346 @Override
347 public int hashCode() {
348 return super.hashCode();
349 }
350
351 /** A WritableComparator optimized for Text keys. */
352 public static class Comparator extends WritableComparator {
353 public Comparator() {
354 super(Text.class);
355 }
356
357 @Override
358 public int compare(byte[] b1, int s1, int l1,
359 byte[] b2, int s2, int l2) {
360 int n1 = WritableUtils.decodeVIntSize(b1[s1]);
361 int n2 = WritableUtils.decodeVIntSize(b2[s2]);
362 return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
363 }
364 }
365
366 static {
367 // register this comparator
368 WritableComparator.define(Text.class, new Comparator());
369 }
370
371 /// STATIC UTILITIES FROM HERE DOWN
372 /**
373 * Converts the provided byte array to a String using the
374 * UTF-8 encoding. If the input is malformed,
375 * replace by a default value.
376 */
377 public static String decode(byte[] utf8) throws CharacterCodingException {
378 return decode(ByteBuffer.wrap(utf8), true);
379 }
380
381 public static String decode(byte[] utf8, int start, int length)
382 throws CharacterCodingException {
383 return decode(ByteBuffer.wrap(utf8, start, length), true);
384 }
385
386 /**
387 * Converts the provided byte array to a String using the
388 * UTF-8 encoding. If <code>replace</code> is true, then
389 * malformed input is replaced with the
390 * substitution character, which is U+FFFD. Otherwise the
391 * method throws a MalformedInputException.
392 */
393 public static String decode(byte[] utf8, int start, int length, boolean replace)
394 throws CharacterCodingException {
395 return decode(ByteBuffer.wrap(utf8, start, length), replace);
396 }
397
398 private static String decode(ByteBuffer utf8, boolean replace)
399 throws CharacterCodingException {
400 CharsetDecoder decoder = DECODER_FACTORY.get();
401 if (replace) {
402 decoder.onMalformedInput(
403 java.nio.charset.CodingErrorAction.REPLACE);
404 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
405 }
406 String str = decoder.decode(utf8).toString();
407 // set decoder back to its default value: REPORT
408 if (replace) {
409 decoder.onMalformedInput(CodingErrorAction.REPORT);
410 decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
411 }
412 return str;
413 }
414
415 /**
416 * Converts the provided String to bytes using the
417 * UTF-8 encoding. If the input is malformed,
418 * invalid chars are replaced by a default value.
419 * @return ByteBuffer: bytes stores at ByteBuffer.array()
420 * and length is ByteBuffer.limit()
421 */
422
423 public static ByteBuffer encode(String string)
424 throws CharacterCodingException {
425 return encode(string, true);
426 }
427
428 /**
429 * Converts the provided String to bytes using the
430 * UTF-8 encoding. If <code>replace</code> is true, then
431 * malformed input is replaced with the
432 * substitution character, which is U+FFFD. Otherwise the
433 * method throws a MalformedInputException.
434 * @return ByteBuffer: bytes stores at ByteBuffer.array()
435 * and length is ByteBuffer.limit()
436 */
437 public static ByteBuffer encode(String string, boolean replace)
438 throws CharacterCodingException {
439 CharsetEncoder encoder = ENCODER_FACTORY.get();
440 if (replace) {
441 encoder.onMalformedInput(CodingErrorAction.REPLACE);
442 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
443 }
444 ByteBuffer bytes =
445 encoder.encode(CharBuffer.wrap(string.toCharArray()));
446 if (replace) {
447 encoder.onMalformedInput(CodingErrorAction.REPORT);
448 encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
449 }
450 return bytes;
451 }
452
453 static final public int DEFAULT_MAX_LEN = 1024 * 1024;
454
455 /** Read a UTF8 encoded string from in
456 */
457 public static String readString(DataInput in) throws IOException {
458 int length = WritableUtils.readVInt(in);
459 byte [] bytes = new byte[length];
460 in.readFully(bytes, 0, length);
461 return decode(bytes);
462 }
463
464 /** Read a UTF8 encoded string with a maximum size
465 */
466 public static String readString(DataInput in, int maxLength)
467 throws IOException {
468 int length = WritableUtils.readVIntInRange(in, 0, maxLength);
469 byte [] bytes = new byte[length];
470 in.readFully(bytes, 0, length);
471 return decode(bytes);
472 }
473
474 /** Write a UTF8 encoded string to out
475 */
476 public static int writeString(DataOutput out, String s) throws IOException {
477 ByteBuffer bytes = encode(s);
478 int length = bytes.limit();
479 WritableUtils.writeVInt(out, length);
480 out.write(bytes.array(), 0, length);
481 return length;
482 }
483
484 /** Write a UTF8 encoded string with a maximum size to out
485 */
486 public static int writeString(DataOutput out, String s, int maxLength)
487 throws IOException {
488 ByteBuffer bytes = encode(s);
489 int length = bytes.limit();
490 if (length > maxLength) {
491 throw new IOException("string was too long to write! Expected " +
492 "less than or equal to " + maxLength + " bytes, but got " +
493 length + " bytes.");
494 }
495 WritableUtils.writeVInt(out, length);
496 out.write(bytes.array(), 0, length);
497 return length;
498 }
499
500 ////// states for validateUTF8
501
502 private static final int LEAD_BYTE = 0;
503
504 private static final int TRAIL_BYTE_1 = 1;
505
506 private static final int TRAIL_BYTE = 2;
507
508 /**
509 * Check if a byte array contains valid utf-8
510 * @param utf8 byte array
511 * @throws MalformedInputException if the byte array contains invalid utf-8
512 */
513 public static void validateUTF8(byte[] utf8) throws MalformedInputException {
514 validateUTF8(utf8, 0, utf8.length);
515 }
516
517 /**
518 * Check to see if a byte array is valid utf-8
519 * @param utf8 the array of bytes
520 * @param start the offset of the first byte in the array
521 * @param len the length of the byte sequence
522 * @throws MalformedInputException if the byte array contains invalid bytes
523 */
524 public static void validateUTF8(byte[] utf8, int start, int len)
525 throws MalformedInputException {
526 int count = start;
527 int leadByte = 0;
528 int length = 0;
529 int state = LEAD_BYTE;
530 while (count < start+len) {
531 int aByte = utf8[count] & 0xFF;
532
533 switch (state) {
534 case LEAD_BYTE:
535 leadByte = aByte;
536 length = bytesFromUTF8[aByte];
537
538 switch (length) {
539 case 0: // check for ASCII
540 if (leadByte > 0x7F)
541 throw new MalformedInputException(count);
542 break;
543 case 1:
544 if (leadByte < 0xC2 || leadByte > 0xDF)
545 throw new MalformedInputException(count);
546 state = TRAIL_BYTE_1;
547 break;
548 case 2:
549 if (leadByte < 0xE0 || leadByte > 0xEF)
550 throw new MalformedInputException(count);
551 state = TRAIL_BYTE_1;
552 break;
553 case 3:
554 if (leadByte < 0xF0 || leadByte > 0xF4)
555 throw new MalformedInputException(count);
556 state = TRAIL_BYTE_1;
557 break;
558 default:
559 // too long! Longest valid UTF-8 is 4 bytes (lead + three)
560 // or if < 0 we got a trail byte in the lead byte position
561 throw new MalformedInputException(count);
562 } // switch (length)
563 break;
564
565 case TRAIL_BYTE_1:
566 if (leadByte == 0xF0 && aByte < 0x90)
567 throw new MalformedInputException(count);
568 if (leadByte == 0xF4 && aByte > 0x8F)
569 throw new MalformedInputException(count);
570 if (leadByte == 0xE0 && aByte < 0xA0)
571 throw new MalformedInputException(count);
572 if (leadByte == 0xED && aByte > 0x9F)
573 throw new MalformedInputException(count);
574 // falls through to regular trail-byte test!!
575 case TRAIL_BYTE:
576 if (aByte < 0x80 || aByte > 0xBF)
577 throw new MalformedInputException(count);
578 if (--length == 0) {
579 state = LEAD_BYTE;
580 } else {
581 state = TRAIL_BYTE;
582 }
583 break;
584 } // switch (state)
585 count++;
586 }
587 }
588
589 /**
590 * Magic numbers for UTF-8. These are the number of bytes
591 * that <em>follow</em> a given lead byte. Trailing bytes
592 * have the value -1. The values 4 and 5 are presented in
593 * this table, even though valid UTF-8 cannot include the
594 * five and six byte sequences.
595 */
596 static final int[] bytesFromUTF8 =
597 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
598 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
599 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
600 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
601 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
602 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
603 0, 0, 0, 0, 0, 0, 0,
604 // trail bytes
605 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
606 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
607 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
608 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
609 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
610 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
611 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
612
613 /**
614 * Returns the next code point at the current position in
615 * the buffer. The buffer's position will be incremented.
616 * Any mark set on this buffer will be changed by this method!
617 */
618 public static int bytesToCodePoint(ByteBuffer bytes) {
619 bytes.mark();
620 byte b = bytes.get();
621 bytes.reset();
622 int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
623 if (extraBytesToRead < 0) return -1; // trailing byte!
624 int ch = 0;
625
626 switch (extraBytesToRead) {
627 case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
628 case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
629 case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
630 case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
631 case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
632 case 0: ch += (bytes.get() & 0xFF);
633 }
634 ch -= offsetsFromUTF8[extraBytesToRead];
635
636 return ch;
637 }
638
639
640 static final int offsetsFromUTF8[] =
641 { 0x00000000, 0x00003080,
642 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
643
644 /**
645 * For the given string, returns the number of UTF-8 bytes
646 * required to encode the string.
647 * @param string text to encode
648 * @return number of UTF-8 bytes required to encode
649 */
650 public static int utf8Length(String string) {
651 CharacterIterator iter = new StringCharacterIterator(string);
652 char ch = iter.first();
653 int size = 0;
654 while (ch != CharacterIterator.DONE) {
655 if ((ch >= 0xD800) && (ch < 0xDC00)) {
656 // surrogate pair?
657 char trail = iter.next();
658 if ((trail > 0xDBFF) && (trail < 0xE000)) {
659 // valid pair
660 size += 4;
661 } else {
662 // invalid pair
663 size += 3;
664 iter.previous(); // rewind one
665 }
666 } else if (ch < 0x80) {
667 size++;
668 } else if (ch < 0x800) {
669 size += 2;
670 } else {
671 // ch < 0x10000, that is, the largest char value
672 size += 3;
673 }
674 ch = iter.next();
675 }
676 return size;
677 }
678
679 @Override
680 public void getPrefix(byte[] dst, int off, int prefixLen) {
681 int copyLen = Math.min(prefixLen, length);
682 int i = 0;
683 while (i < copyLen) {
684 dst[off + i] = bytes[i];
685 i++;
686 }
687 while (i < prefixLen) {
688 dst[off + i] = 0;
689 i++;
690 }
691 }
692 }