/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;

import org.apache.avro.reflect.Stringable;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.classification.MapRModified;

/** This class stores text using standard UTF8 encoding.  It provides methods
 * to serialize, deserialize, and compare texts at byte level.  The type of
 * length is integer and is serialized using zero-compressed format.  <p>In
 * addition, it provides methods for string traversal without converting the
 * byte array to a string.  <p>Also includes utilities for
 * serializing/deserializing a string, coding/decoding a string, checking if a
 * byte array contains valid UTF8 code, calculating the length of an encoded
 * string.
 */
@Stringable
@InterfaceAudience.Public
@InterfaceStability.Stable
@MapRModified(summary = "Improve map task performance - bug 13086")
public class Text extends BinaryComparable
    implements WritableComparable<BinaryComparable>, HasRawComparablePrefix {

  /**
   * Per-thread UTF-8 encoder. Its error actions default to REPORT and are
   * temporarily switched to REPLACE by {@link #encode(String, boolean)}.
   */
  private static final ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
    new ThreadLocal<CharsetEncoder>() {
      @Override
      protected CharsetEncoder initialValue() {
        return Charset.forName("UTF-8").newEncoder().
               onMalformedInput(CodingErrorAction.REPORT).
               onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    };

  /**
   * Per-thread UTF-8 decoder. Its error actions default to REPORT and are
   * temporarily switched to REPLACE by the <code>decode</code> methods.
   */
  private static final ThreadLocal<CharsetDecoder> DECODER_FACTORY =
    new ThreadLocal<CharsetDecoder>() {
      @Override
      protected CharsetDecoder initialValue() {
        return Charset.forName("UTF-8").newDecoder().
               onMalformedInput(CodingErrorAction.REPORT).
               onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    };

  private static final byte[] EMPTY_BYTES = new byte[0];

  /** Backing buffer; only the first {@link #length} bytes are valid. */
  private byte[] bytes;
  /** Number of valid bytes in {@link #bytes}. */
  private int length;

  /** Construct an empty text. */
  public Text() {
    bytes = EMPTY_BYTES;
  }

  /** Construct from a string.
   */
  public Text(String string) {
    set(string);
  }

  /** Construct from another text. */
  public Text(Text utf8) {
    set(utf8);
  }

  /** Construct from a byte array.
   */
  public Text(byte[] utf8)  {
    set(utf8);
  }

  /**
   * Get a copy of the bytes that is exactly the length of the data.
   * See {@link #getBytes()} for faster access to the underlying array.
   */
  public byte[] copyBytes() {
    return Arrays.copyOf(bytes, length);
  }

  /**
   * Returns the raw bytes; however, only data up to {@link #getLength()} is
   * valid. Please use {@link #copyBytes()} if you
   * need the returned array to be precisely the length of the data.
   */
  @Override
  public byte[] getBytes() {
    return bytes;
  }

  /** Returns the number of bytes in the byte array */
  @Override
  public int getLength() {
    return length;
  }

  /**
   * Returns the Unicode Scalar Value (32-bit integer value)
   * for the character at <code>position</code>. Note that this
   * method avoids using the converter or doing String instantiation
   * @param position byte offset into the text
   * @return the Unicode scalar value at position or -1
   *          if the position is invalid or points to a
   *          trailing byte
   */
  public int charAt(int position) {
    // Valid offsets are 0 .. length-1; the byte at index "length" is either
    // stale data from an earlier, longer value or outside the array.
    if (position < 0 || position >= this.length) {
      return -1;
    }

    ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(bytes).position(position);
    return bytesToCodePoint(bb.slice());
  }

  /**
   * Finds <code>what</code> in the backing buffer, searching from the start.
   * @see #find(String, int)
   */
  public int find(String what) {
    return find(what, 0);
  }

  /**
   * Finds any occurrence of <code>what</code> in the backing
   * buffer, starting at position <code>start</code>. The starting
   * position is measured in bytes and the return value is in
   * terms of byte position in the buffer. The backing buffer is
   * not converted to a string for this operation.
   * @param what the string to search for
   * @param start byte offset at which the search begins
   * @return byte position of the first occurrence of the search
   *         string in the UTF-8 buffer or -1 if not found; -1 is also
   *         returned when <code>start</code> lies outside
   *         <code>[0, getLength()]</code>, and <code>start</code> itself
   *         is returned for an empty search string
   */
  public int find(String what, int start) {
    if (start < 0 || start > this.length) {
      return -1;                        // nothing can match out there
    }
    try {
      ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length);
      ByteBuffer tgt = encode(what);
      if (!tgt.hasRemaining()) {
        return start;                   // the empty string matches anywhere
      }
      byte b = tgt.get();
      src.position(start);

      while (src.hasRemaining()) {
        if (b == src.get()) {           // matching first byte
          src.mark();                   // save position in loop
          tgt.mark();                   // save position in target
          boolean found = true;
          int pos = src.position() - 1;
          while (tgt.hasRemaining()) {
            if (!src.hasRemaining()) {  // src expired first
              tgt.reset();
              src.reset();
              found = false;
              break;
            }
            if (tgt.get() != src.get()) {
              tgt.reset();
              src.reset();
              found = false;
              break;                    // no match
            }
          }
          if (found) {
            return pos;
          }
        }
      }
      return -1; // not found
    } catch (CharacterCodingException e) {
      // encode(String) runs in REPLACE mode, so this is not expected.
      throw new RuntimeException("Should not have happened ", e);
    }
  }

  /** Set to contain the contents of a string.
   */
  public void set(String string) {
    try {
      ByteBuffer bb = encode(string, true);
      bytes = bb.array();
      length = bb.limit();
    } catch (CharacterCodingException e) {
      throw new RuntimeException("Should not have happened ", e);
    }
  }

  /** Set to a utf8 byte array
   */
  public void set(byte[] utf8) {
    set(utf8, 0, utf8.length);
  }

  /** copy a text. */
  public void set(Text other) {
    set(other.getBytes(), 0, other.getLength());
  }

  /**
   * Set the Text to range of bytes
   * @param utf8 the data to copy from
   * @param start the first position of the new string
   * @param len the number of bytes of the new string
   */
  public void set(byte[] utf8, int start, int len) {
    setCapacity(len, false);
    System.arraycopy(utf8, start, bytes, 0, len);
    this.length = len;
  }

  /**
   * Append a range of bytes to the end of the given text
   * @param utf8 the data to copy from
   * @param start the first position to append from utf8
   * @param len the number of bytes to append
   */
  public void append(byte[] utf8, int start, int len) {
    setCapacity(length + len, true);
    System.arraycopy(utf8, start, bytes, length, len);
    length += len;
  }

  /**
   * Clear the string to empty.
   *
   * <em>Note</em>: For performance reasons, this call does not clear the
   * underlying byte array that is retrievable via {@link #getBytes()}.
   * In order to free the byte-array memory, call {@link #set(byte[])}
   * with an empty byte array (For example, <code>new byte[0]</code>).
   */
  public void clear() {
    length = 0;
  }

  /**
   * Sets the capacity of this Text object to <em>at least</em>
   * <code>len</code> bytes. If the current buffer is longer,
   * then the capacity and existing content of the buffer are
   * unchanged. If <code>len</code> is larger
   * than the current capacity, the Text object's capacity is
   * increased to match.
   * @param len the number of bytes we need
   * @param keepData should the old data be kept
   */
  private void setCapacity(int len, boolean keepData) {
    if (bytes == null || bytes.length < len) {
      if (bytes != null && keepData) {
        // Grow to at least double the current content length.
        bytes = Arrays.copyOf(bytes, Math.max(len, length << 1));
      } else {
        bytes = new byte[len];
      }
    }
  }

  /**
   * Convert text back to string
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    try {
      return decode(bytes, 0, length);
    } catch (CharacterCodingException e) {
      throw new RuntimeException("Should not have happened " , e);
    }
  }

  /** deserialize
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    int newLength = WritableUtils.readVInt(in);
    readWithKnownLength(in, newLength);
  }

  /**
   * Deserialize, rejecting payloads whose encoded length is negative or
   * is greater than or equal to <code>maxLength</code>. Note that this bound
   * is exclusive, whereas {@link #write(DataOutput, int)} accepts a length
   * equal to its <code>maxLength</code>.
   * @param in the input to read from
   * @param maxLength exclusive upper bound on the number of bytes to read
   * @throws IOException if the length is out of range or reading fails
   */
  public void readFields(DataInput in, int maxLength) throws IOException {
    int newLength = WritableUtils.readVInt(in);
    if (newLength < 0) {
      throw new IOException("tried to deserialize " + newLength +
          " bytes of data!  newLength must be non-negative.");
    } else if (newLength >= maxLength) {
      throw new IOException("tried to deserialize " + newLength +
          " bytes of data, but maxLength = " + maxLength);
    }
    readWithKnownLength(in, newLength);
  }

  /** Skips over one Text in the input. */
  public static void skip(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);
    WritableUtils.skipFully(in, length);
  }

  /**
   * Read a Text object whose length is already known.
   * This allows creating Text from a stream which uses a different serialization
   * format.
   */
  public void readWithKnownLength(DataInput in, int len) throws IOException {
    setCapacity(len, false);
    in.readFully(bytes, 0, len);
    length = len;
  }

  /** serialize
   * write this object to out
   * length uses zero-compressed encoding
   * @see Writable#write(DataOutput)
   */
  @Override
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, length);
    out.write(bytes, 0, length);
  }

  /**
   * Serialize, refusing to write more than <code>maxLength</code> bytes.
   * @param out the output to write to
   * @param maxLength inclusive upper bound on the number of bytes written
   * @throws IOException if the text is longer than <code>maxLength</code>
   *         or writing fails
   */
  public void write(DataOutput out, int maxLength) throws IOException {
    if (length > maxLength) {
      throw new IOException("data was too long to write!  Expected " +
          "less than or equal to " + maxLength + " bytes, but got " +
          length + " bytes.");
    }
    WritableUtils.writeVInt(out, length);
    out.write(bytes, 0, length);
  }

  /** Returns true iff <code>o</code> is a Text with the same contents.  */
  @Override
  public boolean equals(Object o) {
    if (o instanceof Text) {
      return super.equals(o);
    }
    return false;
  }

  @Override
  public int hashCode() {
    return super.hashCode();
  }

  /** A WritableComparator optimized for Text keys. */
  public static class Comparator extends WritableComparator {
    public Comparator() {
      super(Text.class);
    }

    /**
     * Compares two serialized texts without deserializing them: the size
     * of each leading length prefix (as reported by
     * <code>WritableUtils.decodeVIntSize</code>) is skipped and the
     * remaining bytes are compared.
     */
    @Override
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      int n1 = WritableUtils.decodeVIntSize(b1[s1]);
      int n2 = WritableUtils.decodeVIntSize(b2[s2]);
      return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
    }
  }

  static {
    // register this comparator
    WritableComparator.define(Text.class, new Comparator());
  }

  /// STATIC UTILITIES FROM HERE DOWN
  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If the input is malformed,
   * replace by a default value.
   */
  public static String decode(byte[] utf8) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8), true);
  }

  /**
   * Converts the given range of the byte array to a String using the
   * UTF-8 encoding, replacing malformed input.
   */
  public static String decode(byte[] utf8, int start, int length)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), true);
  }

  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the
   * method throws a MalformedInputException.
   */
  public static String decode(byte[] utf8, int start, int length, boolean replace)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), replace);
  }

  /**
   * Decodes <code>utf8</code> with this thread's decoder, switching it to
   * REPLACE mode for the duration of the call when <code>replace</code>
   * is true.
   */
  private static String decode(ByteBuffer utf8, boolean replace)
    throws CharacterCodingException {
    CharsetDecoder decoder = DECODER_FACTORY.get();
    if (replace) {
      decoder.onMalformedInput(CodingErrorAction.REPLACE);
      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    try {
      return decoder.decode(utf8).toString();
    } finally {
      // set decoder back to its default value: REPORT (even on failure,
      // because the decoder is shared by every later call on this thread)
      if (replace) {
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    }
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If the input is malformed,
   * invalid chars are replaced by a default value.
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   */
  public static ByteBuffer encode(String string)
    throws CharacterCodingException {
    return encode(string, true);
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the encoder's replacement
   * bytes. Otherwise the
   * method throws a MalformedInputException.
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   */
  public static ByteBuffer encode(String string, boolean replace)
    throws CharacterCodingException {
    CharsetEncoder encoder = ENCODER_FACTORY.get();
    if (replace) {
      encoder.onMalformedInput(CodingErrorAction.REPLACE);
      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    try {
      return encoder.encode(CharBuffer.wrap(string.toCharArray()));
    } finally {
      // restore the shared per-thread encoder to REPORT, even on failure
      if (replace) {
        encoder.onMalformedInput(CodingErrorAction.REPORT);
        encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    }
  }

  public static final int DEFAULT_MAX_LEN = 1024 * 1024;

  /** Read a UTF8 encoded string from in
   */
  public static String readString(DataInput in) throws IOException {
    return readString(in, Integer.MAX_VALUE);
  }

  /** Read a UTF8 encoded string with a maximum size
   */
  public static String readString(DataInput in, int maxLength)
      throws IOException {
    int length = WritableUtils.readVIntInRange(in, 0, maxLength);
    byte[] bytes = new byte[length];
    in.readFully(bytes, 0, length);
    return decode(bytes);
  }

  /** Write a UTF8 encoded string to out
   * @return the number of encoded bytes written after the length prefix
   */
  public static int writeString(DataOutput out, String s) throws IOException {
    ByteBuffer bytes = encode(s);
    int length = bytes.limit();
    WritableUtils.writeVInt(out, length);
    out.write(bytes.array(), 0, length);
    return length;
  }

  /** Write a UTF8 encoded string with a maximum size to out
   * @return the number of encoded bytes written after the length prefix
   * @throws IOException if the encoded string is longer than
   *         <code>maxLength</code> bytes or writing fails
   */
  public static int writeString(DataOutput out, String s, int maxLength)
      throws IOException {
    ByteBuffer bytes = encode(s);
    int length = bytes.limit();
    if (length > maxLength) {
      throw new IOException("string was too long to write!  Expected " +
          "less than or equal to " + maxLength + " bytes, but got " +
          length + " bytes.");
    }
    WritableUtils.writeVInt(out, length);
    out.write(bytes.array(), 0, length);
    return length;
  }

  ////// states for validateUTF8

  private static final int LEAD_BYTE = 0;

  private static final int TRAIL_BYTE_1 = 1;

  private static final int TRAIL_BYTE = 2;

  /**
   * Check if a byte array contains valid utf-8
   * @param utf8 byte array
   * @throws MalformedInputException if the byte array contains invalid utf-8
   */
  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
    validateUTF8(utf8, 0, utf8.length);
  }

  /**
   * Check to see if a byte array is valid utf-8
   * @param utf8 the array of bytes
   * @param start the offset of the first byte in the array
   * @param len the length of the byte sequence
   * @throws MalformedInputException if the byte array contains invalid bytes,
   *         or if the range ends in the middle of a multi-byte sequence
   */
  public static void validateUTF8(byte[] utf8, int start, int len)
    throws MalformedInputException {
    int count = start;
    int leadByte = 0;
    int length = 0;
    int state = LEAD_BYTE;
    while (count < start + len) {
      int aByte = utf8[count] & 0xFF;

      switch (state) {
      case LEAD_BYTE:
        leadByte = aByte;
        length = bytesFromUTF8[aByte];

        switch (length) {
        case 0: // check for ASCII
          if (leadByte > 0x7F) {
            throw new MalformedInputException(count);
          }
          break;
        case 1:
          if (leadByte < 0xC2 || leadByte > 0xDF) {
            throw new MalformedInputException(count);
          }
          state = TRAIL_BYTE_1;
          break;
        case 2:
          if (leadByte < 0xE0 || leadByte > 0xEF) {
            throw new MalformedInputException(count);
          }
          state = TRAIL_BYTE_1;
          break;
        case 3:
          if (leadByte < 0xF0 || leadByte > 0xF4) {
            throw new MalformedInputException(count);
          }
          state = TRAIL_BYTE_1;
          break;
        default:
          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
          // or if < 0 we got a trail byte in the lead byte position
          throw new MalformedInputException(count);
        } // switch (length)
        break;

      case TRAIL_BYTE_1:
        // The first trail byte has a narrower legal range for some leads.
        if (leadByte == 0xF0 && aByte < 0x90) {
          throw new MalformedInputException(count);
        }
        if (leadByte == 0xF4 && aByte > 0x8F) {
          throw new MalformedInputException(count);
        }
        if (leadByte == 0xE0 && aByte < 0xA0) {
          throw new MalformedInputException(count);
        }
        if (leadByte == 0xED && aByte > 0x9F) {
          throw new MalformedInputException(count);
        }
        // falls through to regular trail-byte test!!
      case TRAIL_BYTE:
        if (aByte < 0x80 || aByte > 0xBF) {
          throw new MalformedInputException(count);
        }
        if (--length == 0) {
          state = LEAD_BYTE;
        } else {
          state = TRAIL_BYTE;
        }
        break;
      default:
        break;
      } // switch (state)
      count++;
    }
    // Still waiting for trail bytes: the input stops mid-character.
    if (state != LEAD_BYTE) {
      throw new MalformedInputException(count);
    }
  }

  /**
   * Magic numbers for UTF-8. These are the number of bytes
   * that <em>follow</em> a given lead byte. Trailing bytes
   * have the value -1. The values 4 and 5 are presented in
   * this table, even though valid UTF-8 cannot include the
   * five and six byte sequences.
   */
  static final int[] bytesFromUTF8 = {
    // 0x00 - 0x7F: single-byte (ASCII)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x80 - 0xBF: trail bytes
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xC0 - 0xDF: one byte follows
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0xE0 - 0xEF: two bytes follow
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xF0 - 0xFF: three, four, or five bytes follow
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };

  /**
   * Returns the next code point at the current position in
   * the buffer. The buffer's position will be incremented.
   * Any mark set on this buffer will be changed by this method!
   * @return the decoded value, or -1 if the byte at the current position
   *         is a trailing byte
   */
  public static int bytesToCodePoint(ByteBuffer bytes) {
    // Peek at the lead byte without consuming it.
    bytes.mark();
    byte b = bytes.get();
    bytes.reset();
    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
    if (extraBytesToRead < 0) {
      return -1; // trailing byte!
    }
    int ch = 0;

    // Every case deliberately falls through: the lead byte and each
    // following byte are accumulated six bits apart.
    switch (extraBytesToRead) {
    case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
    case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
    case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
    case 0: ch += (bytes.get() & 0xFF);
    }
    // Remove the accumulated marker bits for this sequence length.
    ch -= offsetsFromUTF8[extraBytesToRead];

    return ch;
  }

  /**
   * Values subtracted by {@link #bytesToCodePoint(ByteBuffer)} after
   * accumulating the raw bytes, indexed by the number of bytes that
   * follow the lead byte.
   */
  static final int[] offsetsFromUTF8 =
  { 0x00000000, 0x00003080,
    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

  /**
   * For the given string, returns the number of UTF-8 bytes
   * required to encode the string. An unpaired surrogate is counted
   * as three bytes.
   * @param string text to encode
   * @return number of UTF-8 bytes required to encode
   */
  public static int utf8Length(String string) {
    // Iterate by index rather than with a CharacterIterator, whose DONE
    // sentinel ('\uFFFF') is indistinguishable from a real U+FFFF char.
    final int n = string.length();
    int size = 0;
    for (int i = 0; i < n; i++) {
      char ch = string.charAt(i);
      if ((ch >= 0xD800) && (ch < 0xDC00)) {
        // lead surrogate: is it followed by a trail surrogate?
        if (i + 1 < n) {
          char trail = string.charAt(i + 1);
          if ((trail > 0xDBFF) && (trail < 0xE000)) {
            // valid pair
            size += 4;
            i++; // the trail surrogate has been accounted for
            continue;
          }
        }
        // invalid pair; the following char is examined on its own
        size += 3;
      } else if (ch < 0x80) {
        size++;
      } else if (ch < 0x800) {
        size += 2;
      } else {
        // ch < 0x10000, that is, the largest char value
        size += 3;
      }
    }
    return size;
  }

  /**
   * Writes the first <code>prefixLen</code> bytes of this text into
   * <code>dst</code> starting at <code>off</code>. If the text is shorter
   * than <code>prefixLen</code>, the remainder is filled with zero bytes.
   * @param dst destination array
   * @param off offset in <code>dst</code> of the first byte written
   * @param prefixLen number of bytes to write
   */
  @Override
  public void getPrefix(byte[] dst, int off, int prefixLen) {
    int copyLen = Math.min(prefixLen, length);
    if (copyLen > 0) {
      System.arraycopy(bytes, 0, dst, off, copyLen);
    }
    int padFrom = Math.max(copyLen, 0);
    if (prefixLen > padFrom) {
      Arrays.fill(dst, off + padFrom, off + prefixLen, (byte) 0);
    }
  }
}