001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.io;
020
021import java.io.IOException;
022import java.io.DataInput;
023import java.io.DataOutput;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.CharacterCodingException;
027import java.nio.charset.Charset;
028import java.nio.charset.CharsetDecoder;
029import java.nio.charset.CharsetEncoder;
030import java.nio.charset.CodingErrorAction;
031import java.nio.charset.MalformedInputException;
032import java.text.CharacterIterator;
033import java.text.StringCharacterIterator;
034import java.util.Arrays;
035
036import org.apache.avro.reflect.Stringable;
037import org.apache.hadoop.classification.InterfaceAudience;
038import org.apache.hadoop.classification.InterfaceStability;
039import org.apache.hadoop.classification.MapRModified;
040
/** This class stores text using standard UTF8 encoding.  It provides methods
 * to serialize, deserialize, and compare texts at byte level.  The type of
 * length is integer and is serialized using zero-compressed format.  <p>In
 * addition, it provides methods for string traversal without converting the
 * byte array to a string.  <p>Also includes utilities for
 * serializing/deserializing a string, coding/decoding a string, checking if a
 * byte array contains valid UTF8 code, and calculating the length of an
 * encoded string.
 */
050@Stringable
051@InterfaceAudience.Public
052@InterfaceStability.Stable
053@MapRModified(summary = "Improve map task performance - bug 13086")
054public class Text extends BinaryComparable
055    implements WritableComparable<BinaryComparable>, HasRawComparablePrefix {
056  
057  private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
058    new ThreadLocal<CharsetEncoder>() {
059      @Override
060      protected CharsetEncoder initialValue() {
061        return Charset.forName("UTF-8").newEncoder().
062               onMalformedInput(CodingErrorAction.REPORT).
063               onUnmappableCharacter(CodingErrorAction.REPORT);
064    }
065  };
066  
067  private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
068    new ThreadLocal<CharsetDecoder>() {
069    @Override
070    protected CharsetDecoder initialValue() {
071      return Charset.forName("UTF-8").newDecoder().
072             onMalformedInput(CodingErrorAction.REPORT).
073             onUnmappableCharacter(CodingErrorAction.REPORT);
074    }
075  };
076  
  /** Shared zero-length array used as the backing buffer of an empty Text. */
  private static final byte [] EMPTY_BYTES = new byte[0];
  
  /** Backing buffer; only the first <code>length</code> bytes hold valid data. */
  private byte[] bytes;
  /** Number of valid bytes at the start of <code>bytes</code>. */
  private int length;
081
082  public Text() {
083    bytes = EMPTY_BYTES;
084  }
085
086  /** Construct from a string. 
087   */
088  public Text(String string) {
089    set(string);
090  }
091
092  /** Construct from another text. */
093  public Text(Text utf8) {
094    set(utf8);
095  }
096
097  /** Construct from a byte array.
098   */
099  public Text(byte[] utf8)  {
100    set(utf8);
101  }
102  
103  /**
104   * Get a copy of the bytes that is exactly the length of the data.
105   * See {@link #getBytes()} for faster access to the underlying array.
106   */
107  public byte[] copyBytes() {
108    byte[] result = new byte[length];
109    System.arraycopy(bytes, 0, result, 0, length);
110    return result;
111  }
112  
113  /**
114   * Returns the raw bytes; however, only data up to {@link #getLength()} is
115   * valid. Please use {@link #copyBytes()} if you
116   * need the returned array to be precisely the length of the data.
117   */
118  @Override
119  public byte[] getBytes() {
120    return bytes;
121  }
122
123  /** Returns the number of bytes in the byte array */ 
124  @Override
125  public int getLength() {
126    return length;
127  }
128  
129  /**
130   * Returns the Unicode Scalar Value (32-bit integer value)
131   * for the character at <code>position</code>. Note that this
132   * method avoids using the converter or doing String instantiation
133   * @return the Unicode scalar value at position or -1
134   *          if the position is invalid or points to a
135   *          trailing byte
136   */
137  public int charAt(int position) {
138    if (position > this.length) return -1; // too long
139    if (position < 0) return -1; // duh.
140      
141    ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
142    return bytesToCodePoint(bb.slice());
143  }
144  
145  public int find(String what) {
146    return find(what, 0);
147  }
148  
149  /**
150   * Finds any occurence of <code>what</code> in the backing
151   * buffer, starting as position <code>start</code>. The starting
152   * position is measured in bytes and the return value is in
153   * terms of byte position in the buffer. The backing buffer is
154   * not converted to a string for this operation.
155   * @return byte position of the first occurence of the search
156   *         string in the UTF-8 buffer or -1 if not found
157   */
158  public int find(String what, int start) {
159    try {
160      ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
161      ByteBuffer tgt = encode(what);
162      byte b = tgt.get();
163      src.position(start);
164          
165      while (src.hasRemaining()) {
166        if (b == src.get()) { // matching first byte
167          src.mark(); // save position in loop
168          tgt.mark(); // save position in target
169          boolean found = true;
170          int pos = src.position()-1;
171          while (tgt.hasRemaining()) {
172            if (!src.hasRemaining()) { // src expired first
173              tgt.reset();
174              src.reset();
175              found = false;
176              break;
177            }
178            if (!(tgt.get() == src.get())) {
179              tgt.reset();
180              src.reset();
181              found = false;
182              break; // no match
183            }
184          }
185          if (found) return pos;
186        }
187      }
188      return -1; // not found
189    } catch (CharacterCodingException e) {
190      // can't get here
191      e.printStackTrace();
192      return -1;
193    }
194  }  
195  /** Set to contain the contents of a string. 
196   */
197  public void set(String string) {
198    try {
199      ByteBuffer bb = encode(string, true);
200      bytes = bb.array();
201      length = bb.limit();
202    }catch(CharacterCodingException e) {
203      throw new RuntimeException("Should not have happened ", e); 
204    }
205  }
206
207  /** Set to a utf8 byte array
208   */
209  public void set(byte[] utf8) {
210    set(utf8, 0, utf8.length);
211  }
212  
213  /** copy a text. */
214  public void set(Text other) {
215    set(other.getBytes(), 0, other.getLength());
216  }
217
218  /**
219   * Set the Text to range of bytes
220   * @param utf8 the data to copy from
221   * @param start the first position of the new string
222   * @param len the number of bytes of the new string
223   */
224  public void set(byte[] utf8, int start, int len) {
225    setCapacity(len, false);
226    System.arraycopy(utf8, start, bytes, 0, len);
227    this.length = len;
228  }
229
230  /**
231   * Append a range of bytes to the end of the given text
232   * @param utf8 the data to copy from
233   * @param start the first position to append from utf8
234   * @param len the number of bytes to append
235   */
236  public void append(byte[] utf8, int start, int len) {
237    setCapacity(length + len, true);
238    System.arraycopy(utf8, start, bytes, length, len);
239    length += len;
240  }
241
242  /**
243   * Clear the string to empty.
244   *
245   * <em>Note</em>: For performance reasons, this call does not clear the
246   * underlying byte array that is retrievable via {@link #getBytes()}.
247   * In order to free the byte-array memory, call {@link #set(byte[])}
248   * with an empty byte array (For example, <code>new byte[0]</code>).
249   */
250  public void clear() {
251    length = 0;
252  }
253
254  /*
255   * Sets the capacity of this Text object to <em>at least</em>
256   * <code>len</code> bytes. If the current buffer is longer,
257   * then the capacity and existing content of the buffer are
258   * unchanged. If <code>len</code> is larger
259   * than the current capacity, the Text object's capacity is
260   * increased to match.
261   * @param len the number of bytes we need
262   * @param keepData should the old data be kept
263   */
264  private void setCapacity(int len, boolean keepData) {
265    if (bytes == null || bytes.length < len) {
266      if (bytes != null && keepData) {
267        bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
268      } else {
269        bytes = new byte[len];
270      }
271    }
272  }
273   
274  /** 
275   * Convert text back to string
276   * @see java.lang.Object#toString()
277   */
278  @Override
279  public String toString() {
280    try {
281      return decode(bytes, 0, length);
282    } catch (CharacterCodingException e) { 
283      throw new RuntimeException("Should not have happened " , e); 
284    }
285  }
286  
287  /** deserialize 
288   */
289  @Override
290  public void readFields(DataInput in) throws IOException {
291    int newLength = WritableUtils.readVInt(in);
292    readWithKnownLength(in, newLength);
293  }
294  
295  public void readFields(DataInput in, int maxLength) throws IOException {
296    int newLength = WritableUtils.readVInt(in);
297    if (newLength < 0) {
298      throw new IOException("tried to deserialize " + newLength +
299          " bytes of data!  newLength must be non-negative.");
300    } else if (newLength >= maxLength) {
301      throw new IOException("tried to deserialize " + newLength +
302          " bytes of data, but maxLength = " + maxLength);
303    }
304    readWithKnownLength(in, newLength);
305  }
306
307  /** Skips over one Text in the input. */
308  public static void skip(DataInput in) throws IOException {
309    int length = WritableUtils.readVInt(in);
310    WritableUtils.skipFully(in, length);
311  }
312
313  /**
314   * Read a Text object whose length is already known.
315   * This allows creating Text from a stream which uses a different serialization
316   * format.
317   */
318  public void readWithKnownLength(DataInput in, int len) throws IOException {
319    setCapacity(len, false);
320    in.readFully(bytes, 0, len);
321    length = len;
322  }
323
324  /** serialize
325   * write this object to out
326   * length uses zero-compressed encoding
327   * @see Writable#write(DataOutput)
328   */
329  @Override
330  public void write(DataOutput out) throws IOException {
331    WritableUtils.writeVInt(out, length);
332    out.write(bytes, 0, length);
333  }
334
335  public void write(DataOutput out, int maxLength) throws IOException {
336    if (length > maxLength) {
337      throw new IOException("data was too long to write!  Expected " +
338          "less than or equal to " + maxLength + " bytes, but got " +
339          length + " bytes.");
340    }
341    WritableUtils.writeVInt(out, length);
342    out.write(bytes, 0, length);
343  }
344
345  /** Returns true iff <code>o</code> is a Text with the same contents.  */
346  @Override
347  public boolean equals(Object o) {
348    if (o instanceof Text)
349      return super.equals(o);
350    return false;
351  }
352
353  @Override
354  public int hashCode() {
355    return super.hashCode();
356  }
357
358  /** A WritableComparator optimized for Text keys. */
359  public static class Comparator extends WritableComparator {
360    public Comparator() {
361      super(Text.class);
362    }
363
364    @Override
365    public int compare(byte[] b1, int s1, int l1,
366                       byte[] b2, int s2, int l2) {
367      int n1 = WritableUtils.decodeVIntSize(b1[s1]);
368      int n2 = WritableUtils.decodeVIntSize(b2[s2]);
369      return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
370    }
371  }
372
  static {
    // Register the raw-bytes Comparator above as the comparator for
    // Text.class when this class is initialized.
    WritableComparator.define(Text.class, new Comparator());
  }
377
378  /// STATIC UTILITIES FROM HERE DOWN
379  /**
380   * Converts the provided byte array to a String using the
381   * UTF-8 encoding. If the input is malformed,
382   * replace by a default value.
383   */
384  public static String decode(byte[] utf8) throws CharacterCodingException {
385    return decode(ByteBuffer.wrap(utf8), true);
386  }
387  
388  public static String decode(byte[] utf8, int start, int length) 
389    throws CharacterCodingException {
390    return decode(ByteBuffer.wrap(utf8, start, length), true);
391  }
392  
393  /**
394   * Converts the provided byte array to a String using the
395   * UTF-8 encoding. If <code>replace</code> is true, then
396   * malformed input is replaced with the
397   * substitution character, which is U+FFFD. Otherwise the
398   * method throws a MalformedInputException.
399   */
400  public static String decode(byte[] utf8, int start, int length, boolean replace) 
401    throws CharacterCodingException {
402    return decode(ByteBuffer.wrap(utf8, start, length), replace);
403  }
404  
405  private static String decode(ByteBuffer utf8, boolean replace) 
406    throws CharacterCodingException {
407    CharsetDecoder decoder = DECODER_FACTORY.get();
408    if (replace) {
409      decoder.onMalformedInput(
410          java.nio.charset.CodingErrorAction.REPLACE);
411      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
412    }
413    String str = decoder.decode(utf8).toString();
414    // set decoder back to its default value: REPORT
415    if (replace) {
416      decoder.onMalformedInput(CodingErrorAction.REPORT);
417      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
418    }
419    return str;
420  }
421
422  /**
423   * Converts the provided String to bytes using the
424   * UTF-8 encoding. If the input is malformed,
425   * invalid chars are replaced by a default value.
426   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
427   *                     and length is ByteBuffer.limit()
428   */
429
430  public static ByteBuffer encode(String string)
431    throws CharacterCodingException {
432    return encode(string, true);
433  }
434
435  /**
436   * Converts the provided String to bytes using the
437   * UTF-8 encoding. If <code>replace</code> is true, then
438   * malformed input is replaced with the
439   * substitution character, which is U+FFFD. Otherwise the
440   * method throws a MalformedInputException.
441   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
442   *                     and length is ByteBuffer.limit()
443   */
444  public static ByteBuffer encode(String string, boolean replace)
445    throws CharacterCodingException {
446    CharsetEncoder encoder = ENCODER_FACTORY.get();
447    if (replace) {
448      encoder.onMalformedInput(CodingErrorAction.REPLACE);
449      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
450    }
451    ByteBuffer bytes = 
452      encoder.encode(CharBuffer.wrap(string.toCharArray()));
453    if (replace) {
454      encoder.onMalformedInput(CodingErrorAction.REPORT);
455      encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
456    }
457    return bytes;
458  }
459
  /**
   * 1024 * 1024 (1,048,576). Not referenced by any code visible in this
   * file; presumably a default maximum length in bytes for callers of the
   * length-limited read/write methods -- TODO confirm against callers.
   */
  static final public int DEFAULT_MAX_LEN = 1024 * 1024;
461
462  /** Read a UTF8 encoded string from in
463   */
464  public static String readString(DataInput in) throws IOException {
465    return readString(in, Integer.MAX_VALUE);
466  }
467  
468  /** Read a UTF8 encoded string with a maximum size
469   */
470  public static String readString(DataInput in, int maxLength)
471      throws IOException {
472    int length = WritableUtils.readVIntInRange(in, 0, maxLength);
473    byte [] bytes = new byte[length];
474    in.readFully(bytes, 0, length);
475    return decode(bytes);
476  }
477  
478  /** Write a UTF8 encoded string to out
479   */
480  public static int writeString(DataOutput out, String s) throws IOException {
481    ByteBuffer bytes = encode(s);
482    int length = bytes.limit();
483    WritableUtils.writeVInt(out, length);
484    out.write(bytes.array(), 0, length);
485    return length;
486  }
487
488  /** Write a UTF8 encoded string with a maximum size to out
489   */
490  public static int writeString(DataOutput out, String s, int maxLength)
491      throws IOException {
492    ByteBuffer bytes = encode(s);
493    int length = bytes.limit();
494    if (length > maxLength) {
495      throw new IOException("string was too long to write!  Expected " +
496          "less than or equal to " + maxLength + " bytes, but got " +
497          length + " bytes.");
498    }
499    WritableUtils.writeVInt(out, length);
500    out.write(bytes.array(), 0, length);
501    return length;
502  }
503
  ////// states for validateUTF8
  
  /** The next byte is expected to start a new character (a lead byte). */
  private static final int LEAD_BYTE = 0;

  /**
   * The next byte is the first trail byte after a lead byte; it gets extra
   * range checks that depend on the lead byte before the regular trail test.
   */
  private static final int TRAIL_BYTE_1 = 1;

  /** The next byte is a later trail byte and must lie in 0x80..0xBF. */
  private static final int TRAIL_BYTE = 2;
511
512  /** 
513   * Check if a byte array contains valid utf-8
514   * @param utf8 byte array
515   * @throws MalformedInputException if the byte array contains invalid utf-8
516   */
517  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
518    validateUTF8(utf8, 0, utf8.length);     
519  }
520  
521  /**
522   * Check to see if a byte array is valid utf-8
523   * @param utf8 the array of bytes
524   * @param start the offset of the first byte in the array
525   * @param len the length of the byte sequence
526   * @throws MalformedInputException if the byte array contains invalid bytes
527   */
528  public static void validateUTF8(byte[] utf8, int start, int len)
529    throws MalformedInputException {
530    int count = start;
531    int leadByte = 0;
532    int length = 0;
533    int state = LEAD_BYTE;
534    while (count < start+len) {
535      int aByte = utf8[count] & 0xFF;
536
537      switch (state) {
538      case LEAD_BYTE:
539        leadByte = aByte;
540        length = bytesFromUTF8[aByte];
541
542        switch (length) {
543        case 0: // check for ASCII
544          if (leadByte > 0x7F)
545            throw new MalformedInputException(count);
546          break;
547        case 1:
548          if (leadByte < 0xC2 || leadByte > 0xDF)
549            throw new MalformedInputException(count);
550          state = TRAIL_BYTE_1;
551          break;
552        case 2:
553          if (leadByte < 0xE0 || leadByte > 0xEF)
554            throw new MalformedInputException(count);
555          state = TRAIL_BYTE_1;
556          break;
557        case 3:
558          if (leadByte < 0xF0 || leadByte > 0xF4)
559            throw new MalformedInputException(count);
560          state = TRAIL_BYTE_1;
561          break;
562        default:
563          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
564          // or if < 0 we got a trail byte in the lead byte position
565          throw new MalformedInputException(count);
566        } // switch (length)
567        break;
568
569      case TRAIL_BYTE_1:
570        if (leadByte == 0xF0 && aByte < 0x90)
571          throw new MalformedInputException(count);
572        if (leadByte == 0xF4 && aByte > 0x8F)
573          throw new MalformedInputException(count);
574        if (leadByte == 0xE0 && aByte < 0xA0)
575          throw new MalformedInputException(count);
576        if (leadByte == 0xED && aByte > 0x9F)
577          throw new MalformedInputException(count);
578        // falls through to regular trail-byte test!!
579      case TRAIL_BYTE:
580        if (aByte < 0x80 || aByte > 0xBF)
581          throw new MalformedInputException(count);
582        if (--length == 0) {
583          state = LEAD_BYTE;
584        } else {
585          state = TRAIL_BYTE;
586        }
587        break;
588      default:
589        break;
590      } // switch (state)
591      count++;
592    }
593  }
594
  /**
   * Magic numbers for UTF-8. These are the number of bytes
   * that <em>follow</em> a given lead byte. The table has 256 entries and
   * is indexed by the unsigned value of the byte: 0x00-0x7F map to 0,
   * 0x80-0xBF (trailing bytes) map to -1, 0xC0-0xDF to 1, 0xE0-0xEF to 2,
   * 0xF0-0xF7 to 3, 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
   * The values 4 and 5 are presented in
   * this table, even though valid UTF-8 cannot include the
   * five and six byte sequences.
   */
  static final int[] bytesFromUTF8 =
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0,
    // trail bytes
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
618
619  /**
620   * Returns the next code point at the current position in
621   * the buffer. The buffer's position will be incremented.
622   * Any mark set on this buffer will be changed by this method!
623   */
624  public static int bytesToCodePoint(ByteBuffer bytes) {
625    bytes.mark();
626    byte b = bytes.get();
627    bytes.reset();
628    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
629    if (extraBytesToRead < 0) return -1; // trailing byte!
630    int ch = 0;
631
632    switch (extraBytesToRead) {
633    case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
634    case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
635    case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
636    case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
637    case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
638    case 0: ch += (bytes.get() & 0xFF);
639    }
640    ch -= offsetsFromUTF8[extraBytesToRead];
641
642    return ch;
643  }
644
645  
  /**
   * Constants subtracted by bytesToCodePoint after it has summed and
   * shifted the raw bytes of one sequence. Indexed by the number of bytes
   * that follow the lead byte (0 to 5). For example the entry for one
   * following byte, 0x3080, equals (0xC0 << 6) + 0x80, i.e. the marker bits
   * contributed by the lead and trail byte of a two-byte sequence.
   */
  static final int offsetsFromUTF8[] =
  { 0x00000000, 0x00003080,
    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
649
650  /**
651   * For the given string, returns the number of UTF-8 bytes
652   * required to encode the string.
653   * @param string text to encode
654   * @return number of UTF-8 bytes required to encode
655   */
656  public static int utf8Length(String string) {
657    CharacterIterator iter = new StringCharacterIterator(string);
658    char ch = iter.first();
659    int size = 0;
660    while (ch != CharacterIterator.DONE) {
661      if ((ch >= 0xD800) && (ch < 0xDC00)) {
662        // surrogate pair?
663        char trail = iter.next();
664        if ((trail > 0xDBFF) && (trail < 0xE000)) {
665          // valid pair
666          size += 4;
667        } else {
668          // invalid pair
669          size += 3;
670          iter.previous(); // rewind one
671        }
672      } else if (ch < 0x80) {
673        size++;
674      } else if (ch < 0x800) {
675        size += 2;
676      } else {
677        // ch < 0x10000, that is, the largest char value
678        size += 3;
679      }
680      ch = iter.next();
681    }
682    return size;
683  }
684
685  @Override
686  public void getPrefix(byte[] dst, int off, int prefixLen) {
687    int copyLen = Math.min(prefixLen, length);
688    int i = 0;
689    while (i < copyLen) {
690      dst[off + i] = bytes[i];
691      i++;
692    }
693    while (i < prefixLen) {
694      dst[off + i] = 0;
695      i++;
696    }
697  }
698}