/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;

import org.apache.avro.reflect.Stringable;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.classification.MapRModified;

/** This class stores text using standard UTF8 encoding.  It provides methods
 * to serialize, deserialize, and compare texts at byte level.  The type of
 * length is integer and is serialized using zero-compressed format.  <p>In
 * addition, it provides methods for string traversal without converting the
 * byte array to a string.  <p>Also includes utilities for
 * serializing/deserializing a string, coding/decoding a string, checking if a
 * byte array contains valid UTF8 code, calculating the length of an encoded
 * string.
 */
@Stringable
@InterfaceAudience.Public
@InterfaceStability.Stable
@MapRModified(summary = "Improve map task performance - bug 13086")
public class Text extends BinaryComparable
    implements WritableComparable<BinaryComparable>, HasRawComparablePrefix {

  /** Per-thread UTF-8 encoder; its resting error action is REPORT. */
  private static final ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
    new ThreadLocal<CharsetEncoder>() {
      @Override
      protected CharsetEncoder initialValue() {
        return Charset.forName("UTF-8").newEncoder().
               onMalformedInput(CodingErrorAction.REPORT).
               onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    };

  /** Per-thread UTF-8 decoder; its resting error action is REPORT. */
  private static final ThreadLocal<CharsetDecoder> DECODER_FACTORY =
    new ThreadLocal<CharsetDecoder>() {
      @Override
      protected CharsetDecoder initialValue() {
        return Charset.forName("UTF-8").newDecoder().
               onMalformedInput(CodingErrorAction.REPORT).
               onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    };

  private static final byte[] EMPTY_BYTES = new byte[0];

  /** Backing buffer; only the first {@link #length} bytes are valid data. */
  private byte[] bytes;
  /** Number of valid bytes in {@link #bytes}. */
  private int length;

  /** Construct an empty text backed by a shared zero-length array. */
  public Text() {
    bytes = EMPTY_BYTES;
  }

  /** Construct from a string.
   */
  public Text(String string) {
    set(string);
  }

  /** Construct from another text. */
  public Text(Text utf8) {
    set(utf8);
  }

  /** Construct from a byte array.
   */
  public Text(byte[] utf8)  {
    set(utf8);
  }

  /**
   * Get a copy of the bytes that is exactly the length of the data.
   * See {@link #getBytes()} for faster access to the underlying array.
   */
  public byte[] copyBytes() {
    byte[] result = new byte[length];
    System.arraycopy(bytes, 0, result, 0, length);
    return result;
  }

  /**
   * Returns the raw bytes; however, only data up to {@link #getLength()} is
   * valid. Please use {@link #copyBytes()} if you
   * need the returned array to be precisely the length of the data.
   */
  @Override
  public byte[] getBytes() {
    return bytes;
  }

  /** Returns the number of bytes in the byte array */
  @Override
  public int getLength() {
    return length;
  }

  /**
   * Returns the Unicode Scalar Value (32-bit integer value)
   * for the character at <code>position</code>. Note that this
   * method avoids using the converter or doing String instantiation
   * @param position byte offset into the text
   * @return the Unicode scalar value at position or -1
   *          if the position is invalid or points to a
   *          trailing byte
   */
  public int charAt(int position) {
    // Valid offsets are 0 .. length-1. Offset == length is one past the data:
    // it would either read a stale byte left in the backing array or run off
    // the end of the buffer, so it is rejected like any other bad position.
    if (position < 0 || position >= this.length) {
      return -1;
    }

    ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
    return bytesToCodePoint(bb.slice());
  }

  /**
   * Finds <code>what</code> in the backing buffer, searching from the
   * first byte.
   * @param what the string to search for
   * @return byte position of the first occurrence, or -1 if not found
   * @see #find(String, int)
   */
  public int find(String what) {
    return find(what, 0);
  }

  /**
   * Finds any occurrence of <code>what</code> in the backing
   * buffer, starting as position <code>start</code>. The starting
   * position is measured in bytes and the return value is in
   * terms of byte position in the buffer. The backing buffer is
   * not converted to a string for this operation.
   * @param what the string to search for; an empty string matches
   *             at <code>start</code>
   * @param start byte offset at which the search begins; must lie within
   *              <code>0..getLength()</code>
   * @return byte position of the first occurrence of the search
   *         string in the UTF-8 buffer or -1 if not found
   */
  public int find(String what, int start) {
    try {
      ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length);
      ByteBuffer tgt = encode(what);
      src.position(start);

      // An empty pattern has no first byte to read; it trivially matches
      // at the starting offset (same convention as String.indexOf).
      if (!tgt.hasRemaining()) {
        return start;
      }
      byte b = tgt.get();

      while (src.hasRemaining()) {
        if (b == src.get()) { // matching first byte
          src.mark(); // save position in loop
          tgt.mark(); // save position in target
          boolean found = true;
          int pos = src.position() - 1;
          while (tgt.hasRemaining()) {
            // either the source ran out first or the bytes differ: rewind
            // both buffers to just after the first byte and keep scanning
            if (!src.hasRemaining() || tgt.get() != src.get()) {
              tgt.reset();
              src.reset();
              found = false;
              break;
            }
          }
          if (found) {
            return pos;
          }
        }
      }
      return -1; // not found
    } catch (CharacterCodingException e) {
      // encode(what) runs in REPLACE mode, so this is not expected; surface it
      // the same way set(String) and toString() do instead of hiding it.
      throw new RuntimeException("Should not have happened ", e);
    }
  }

  /** Set to contain the contents of a string.
   */
  public void set(String string) {
    try {
      ByteBuffer bb = encode(string, true);
      bytes = bb.array();
      length = bb.limit();
    } catch (CharacterCodingException e) {
      throw new RuntimeException("Should not have happened ", e);
    }
  }

  /** Set to a utf8 byte array
   */
  public void set(byte[] utf8) {
    set(utf8, 0, utf8.length);
  }

  /** copy a text. */
  public void set(Text other) {
    set(other.getBytes(), 0, other.getLength());
  }

  /**
   * Set the Text to range of bytes
   * @param utf8 the data to copy from
   * @param start the first position of the new string
   * @param len the number of bytes of the new string
   */
  public void set(byte[] utf8, int start, int len) {
    setCapacity(len, false);
    System.arraycopy(utf8, start, bytes, 0, len);
    this.length = len;
  }

  /**
   * Append a range of bytes to the end of the given text
   * @param utf8 the data to copy from
   * @param start the first position to append from utf8
   * @param len the number of bytes to append
   */
  public void append(byte[] utf8, int start, int len) {
    setCapacity(length + len, true);
    System.arraycopy(utf8, start, bytes, length, len);
    length += len;
  }

  /**
   * Clear the string to empty.
   *
   * <em>Note</em>: For performance reasons, this call does not clear the
   * underlying byte array that is retrievable via {@link #getBytes()}.
   * In order to free the byte-array memory, call {@link #set(byte[])}
   * with an empty byte array (For example, <code>new byte[0]</code>).
   */
  public void clear() {
    length = 0;
  }

  /**
   * Sets the capacity of this Text object to <em>at least</em>
   * <code>len</code> bytes. If the current buffer is longer,
   * then the capacity and existing content of the buffer are
   * unchanged. If <code>len</code> is larger
   * than the current capacity, the Text object's capacity is
   * increased to match.
   * @param len the number of bytes we need
   * @param keepData should the old data be kept
   */
  private void setCapacity(int len, boolean keepData) {
    if (bytes == null || bytes.length < len) {
      if (bytes != null && keepData) {
        // grow to the larger of the request and twice the current data length
        bytes = Arrays.copyOf(bytes, Math.max(len, length << 1));
      } else {
        bytes = new byte[len];
      }
    }
  }

  /**
   * Convert text back to string
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    try {
      return decode(bytes, 0, length);
    } catch (CharacterCodingException e) {
      throw new RuntimeException("Should not have happened " , e);
    }
  }

  /** deserialize
   * @param in source holding a zero-compressed length followed by that
   *           many bytes
   * @throws IOException if the decoded length is negative or reading fails
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    int newLength = WritableUtils.readVInt(in);
    // Same guard as readFields(DataInput, int): a negative length must not
    // reach readFully.
    if (newLength < 0) {
      throw new IOException("tried to deserialize " + newLength +
          " bytes of data! newLength must be non-negative.");
    }
    setCapacity(newLength, false);
    in.readFully(bytes, 0, newLength);
    length = newLength;
  }

  /**
   * Deserialize, rejecting payloads whose length is negative or is not
   * strictly below <code>maxLength</code>.
   * @param in source holding a zero-compressed length followed by that
   *           many bytes
   * @param maxLength exclusive upper bound on the accepted length
   * @throws IOException if the length is out of bounds or reading fails
   */
  public void readFields(DataInput in, int maxLength) throws IOException {
    int newLength = WritableUtils.readVInt(in);
    if (newLength < 0) {
      throw new IOException("tried to deserialize " + newLength +
          " bytes of data! newLength must be non-negative.");
    } else if (newLength >= maxLength) {
      throw new IOException("tried to deserialize " + newLength +
          " bytes of data, but maxLength = " + maxLength);
    }
    setCapacity(newLength, false);
    in.readFully(bytes, 0, newLength);
    length = newLength;
  }

  /** Skips over one Text in the input. */
  public static void skip(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);
    WritableUtils.skipFully(in, length);
  }

  /** serialize
   * write this object to out
   * length uses zero-compressed encoding
   * @see Writable#write(DataOutput)
   */
  @Override
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, length);
    out.write(bytes, 0, length);
  }

  /**
   * Serialize, refusing to write more than <code>maxLength</code> bytes
   * of data.
   * @param out destination for the zero-compressed length and the bytes
   * @param maxLength inclusive upper bound on the data length
   * @throws IOException if the data is longer than <code>maxLength</code>
   *                     or writing fails
   */
  public void write(DataOutput out, int maxLength) throws IOException {
    if (length > maxLength) {
      throw new IOException("data was too long to write! Expected " +
          "less than or equal to " + maxLength + " bytes, but got " +
          length + " bytes.");
    }
    WritableUtils.writeVInt(out, length);
    out.write(bytes, 0, length);
  }

  /** Returns true iff <code>o</code> is a Text with the same contents.  */
  @Override
  public boolean equals(Object o) {
    return (o instanceof Text) && super.equals(o);
  }

  @Override
  public int hashCode() {
    return super.hashCode();
  }

  /** A WritableComparator optimized for Text keys. */
  public static class Comparator extends WritableComparator {
    public Comparator() {
      super(Text.class);
    }

    /**
     * Compares two serialized texts by skipping each one's
     * variable-length size prefix and comparing the remaining bytes.
     */
    @Override
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      int n1 = WritableUtils.decodeVIntSize(b1[s1]);
      int n2 = WritableUtils.decodeVIntSize(b2[s2]);
      return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
    }
  }

  static {
    // register this comparator
    WritableComparator.define(Text.class, new Comparator());
  }

  /// STATIC UTILITIES FROM HERE DOWN
  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If the input is malformed,
   * replace by a default value.
   */
  public static String decode(byte[] utf8) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8), true);
  }

  /**
   * Converts a range of the provided byte array to a String using the
   * UTF-8 encoding. Malformed input is replaced by a default value.
   * @param utf8 the bytes to decode
   * @param start offset of the first byte to decode
   * @param length number of bytes to decode
   */
  public static String decode(byte[] utf8, int start, int length)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), true);
  }

  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the
   * method throws a MalformedInputException.
   */
  public static String decode(byte[] utf8, int start, int length, boolean replace)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), replace);
  }

  /**
   * Decodes with the calling thread's shared decoder. When
   * <code>replace</code> is set the decoder is temporarily switched to
   * REPLACE and always switched back to REPORT afterwards.
   */
  private static String decode(ByteBuffer utf8, boolean replace)
    throws CharacterCodingException {
    CharsetDecoder decoder = DECODER_FACTORY.get();
    if (replace) {
      decoder.onMalformedInput(
          java.nio.charset.CodingErrorAction.REPLACE);
      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    try {
      return decoder.decode(utf8).toString();
    } finally {
      // set decoder back to its default value: REPORT. Done in finally so a
      // failure cannot leave this thread's shared decoder in REPLACE mode.
      if (replace) {
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    }
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If the input is malformed,
   * invalid chars are replaced by a default value.
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   */
  public static ByteBuffer encode(String string)
    throws CharacterCodingException {
    return encode(string, true);
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the
   * method throws a MalformedInputException.
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   */
  public static ByteBuffer encode(String string, boolean replace)
    throws CharacterCodingException {
    CharsetEncoder encoder = ENCODER_FACTORY.get();
    if (replace) {
      encoder.onMalformedInput(CodingErrorAction.REPLACE);
      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    try {
      return encoder.encode(CharBuffer.wrap(string.toCharArray()));
    } finally {
      // Restore REPORT even when encoding throws (for example on a null
      // string); otherwise this thread's shared encoder would stay in REPLACE
      // mode and later replace=false calls would stop reporting errors.
      if (replace) {
        encoder.onMalformedInput(CodingErrorAction.REPORT);
        encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
      }
    }
  }

  public static final int DEFAULT_MAX_LEN = 1024 * 1024;

  /** Read a UTF8 encoded string from in
   */
  public static String readString(DataInput in) throws IOException {
    return readString(in, Integer.MAX_VALUE);
  }

  /** Read a UTF8 encoded string with a maximum size
   */
  public static String readString(DataInput in, int maxLength)
      throws IOException {
    int length = WritableUtils.readVIntInRange(in, 0, maxLength);
    byte[] bytes = new byte[length];
    in.readFully(bytes, 0, length);
    return decode(bytes);
  }

  /** Write a UTF8 encoded string to out
   * @return the number of encoded bytes written after the length prefix
   */
  public static int writeString(DataOutput out, String s) throws IOException {
    ByteBuffer bytes = encode(s);
    int length = bytes.limit();
    WritableUtils.writeVInt(out, length);
    out.write(bytes.array(), 0, length);
    return length;
  }

  /** Write a UTF8 encoded string with a maximum size to out
   * @return the number of encoded bytes written after the length prefix
   * @throws IOException if the encoded string is longer than
   *                     <code>maxLength</code> bytes or writing fails
   */
  public static int writeString(DataOutput out, String s, int maxLength)
      throws IOException {
    ByteBuffer bytes = encode(s);
    int length = bytes.limit();
    if (length > maxLength) {
      throw new IOException("string was too long to write! Expected " +
          "less than or equal to " + maxLength + " bytes, but got " +
          length + " bytes.");
    }
    WritableUtils.writeVInt(out, length);
    out.write(bytes.array(), 0, length);
    return length;
  }

  ////// states for validateUTF8

  private static final int LEAD_BYTE = 0;

  private static final int TRAIL_BYTE_1 = 1;

  private static final int TRAIL_BYTE = 2;

  /**
   * Check if a byte array contains valid utf-8
   * @param utf8 byte array
   * @throws MalformedInputException if the byte array contains invalid utf-8
   */
  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
    validateUTF8(utf8, 0, utf8.length);
  }

  /**
   * Check to see if a byte array is valid utf-8
   * @param utf8 the array of bytes
   * @param start the offset of the first byte in the array
   * @param len the length of the byte sequence
   * @throws MalformedInputException if the byte array contains invalid bytes
   */
  public static void validateUTF8(byte[] utf8, int start, int len)
    throws MalformedInputException {
    // NOTE(review): the loop ends without inspecting the final state, so a
    // multi-byte sequence cut off at the end of the range is not reported.
    int count = start;
    int leadByte = 0;
    int length = 0;
    int state = LEAD_BYTE;
    while (count < start + len) {
      int aByte = utf8[count] & 0xFF;

      switch (state) {
      case LEAD_BYTE:
        leadByte = aByte;
        length = bytesFromUTF8[aByte];

        switch (length) {
        case 0: // check for ASCII
          if (leadByte > 0x7F)
            throw new MalformedInputException(count);
          break;
        case 1:
          if (leadByte < 0xC2 || leadByte > 0xDF)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        case 2:
          if (leadByte < 0xE0 || leadByte > 0xEF)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        case 3:
          if (leadByte < 0xF0 || leadByte > 0xF4)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        default:
          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
          // or if < 0 we got a trail byte in the lead byte position
          throw new MalformedInputException(count);
        } // switch (length)
        break;

      case TRAIL_BYTE_1:
        // the first trail byte has extra range limits for these lead bytes
        if (leadByte == 0xF0 && aByte < 0x90)
          throw new MalformedInputException(count);
        if (leadByte == 0xF4 && aByte > 0x8F)
          throw new MalformedInputException(count);
        if (leadByte == 0xE0 && aByte < 0xA0)
          throw new MalformedInputException(count);
        if (leadByte == 0xED && aByte > 0x9F)
          throw new MalformedInputException(count);
        // falls through to regular trail-byte test!!
      case TRAIL_BYTE:
        if (aByte < 0x80 || aByte > 0xBF)
          throw new MalformedInputException(count);
        if (--length == 0) {
          state = LEAD_BYTE;
        } else {
          state = TRAIL_BYTE;
        }
        break;
      } // switch (state)
      count++;
    }
  }

  /**
   * Magic numbers for UTF-8. These are the number of bytes
   * that <em>follow</em> a given lead byte. Trailing bytes
   * have the value -1. The values 4 and 5 are presented in
   * this table, even though valid UTF-8 cannot include the
   * five and six byte sequences.
   */
  static final int[] bytesFromUTF8 = {
    // 0x00 - 0x7F: no bytes follow
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x80 - 0xBF: trail bytes
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xC0 - 0xDF: one byte follows
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0xE0 - 0xEF: two bytes follow
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };

  /**
   * Returns the next code point at the current position in
   * the buffer. The buffer's position will be incremented.
   * Any mark set on this buffer will be changed by this method!
   * @param bytes buffer positioned at the lead byte of a sequence
   * @return the decoded value, or -1 if the byte at the current
   *         position is a trailing byte
   */
  public static int bytesToCodePoint(ByteBuffer bytes) {
    // peek at the lead byte without consuming it (this overwrites the mark)
    bytes.mark();
    byte b = bytes.get();
    bytes.reset();
    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
    if (extraBytesToRead < 0) return -1; // trailing byte!
    int ch = 0;

    // every case deliberately falls through: one byte is consumed per step
    switch (extraBytesToRead) {
    case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
    case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
    case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
    case 0: ch += (bytes.get() & 0xFF);
    }
    ch -= offsetsFromUTF8[extraBytesToRead];

    return ch;
  }

  /**
   * Value subtracted in {@link #bytesToCodePoint(ByteBuffer)} after the raw
   * bytes are accumulated, indexed by the number of bytes that follow the
   * lead byte.
   */
  static final int[] offsetsFromUTF8 =
  { 0x00000000, 0x00003080,
    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

  /**
   * For the given string, returns the number of UTF-8 bytes
   * required to encode the string.
   * @param string text to encode
   * @return number of UTF-8 bytes required to encode
   */
  public static int utf8Length(String string) {
    // Walk by index instead of CharacterIterator: the iterator signals the end
    // with the char U+FFFF, so a string containing that character would
    // otherwise stop counting early.
    final int charCount = string.length();
    int size = 0;
    for (int i = 0; i < charCount; i++) {
      char ch = string.charAt(i);
      if ((ch >= 0xD800) && (ch < 0xDC00)) {
        // surrogate pair?
        boolean validPair = false;
        if (i + 1 < charCount) {
          char trail = string.charAt(i + 1);
          validPair = (trail > 0xDBFF) && (trail < 0xE000);
        }
        if (validPair) {
          size += 4;
          i++; // the trail char belongs to this pair
        } else {
          // invalid pair: count the lone lead, re-examine the next char
          size += 3;
        }
      } else if (ch < 0x80) {
        size++;
      } else if (ch < 0x800) {
        size += 2;
      } else {
        // ch < 0x10000, that is, the largest char value
        size += 3;
      }
    }
    return size;
  }

  /**
   * Writes a fixed-width prefix of this text into <code>dst</code>: the
   * first <code>min(prefixLen, getLength())</code> data bytes, followed by
   * zero bytes up to <code>prefixLen</code>.
   * @param dst destination array
   * @param off offset in <code>dst</code> at which the prefix starts
   * @param prefixLen number of bytes to write
   */
  @Override
  public void getPrefix(byte[] dst, int off, int prefixLen) {
    int copyLen = Math.min(prefixLen, length);
    int i = 0;
    for (; i < copyLen; i++) {
      dst[off + i] = bytes[i];
    }
    // zero-pad when the text is shorter than the requested prefix
    for (; i < prefixLen; i++) {
      dst[off + i] = 0;
    }
  }
}