001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.io;
020
021import java.io.IOException;
022import java.io.DataInput;
023import java.io.DataOutput;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.CharacterCodingException;
027import java.nio.charset.Charset;
028import java.nio.charset.CharsetDecoder;
029import java.nio.charset.CharsetEncoder;
030import java.nio.charset.CodingErrorAction;
031import java.nio.charset.MalformedInputException;
032import java.text.CharacterIterator;
033import java.text.StringCharacterIterator;
034import java.util.Arrays;
035
036import org.apache.avro.reflect.Stringable;
037import org.apache.hadoop.classification.InterfaceAudience;
038import org.apache.hadoop.classification.InterfaceStability;
039import org.apache.hadoop.classification.MapRModified;
040
/** This class stores text using standard UTF8 encoding.  It provides methods
 * to serialize, deserialize, and compare texts at byte level.  The type of
 * length is integer and is serialized using zero-compressed format.  <p>In
 * addition, it provides methods for string traversal without converting the
 * byte array to a string.  <p>Also includes utilities for
 * serializing/deserializing a string, coding/decoding a string, checking if a
 * byte array contains valid UTF8 code, and calculating the length of an
 * encoded string.
 */
050@Stringable
051@InterfaceAudience.Public
052@InterfaceStability.Stable
053@MapRModified(summary = "Improve map task performance - bug 13086")
054public class Text extends BinaryComparable
055    implements WritableComparable<BinaryComparable>, HasRawComparablePrefix {
056  
057  private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
058    new ThreadLocal<CharsetEncoder>() {
059      @Override
060      protected CharsetEncoder initialValue() {
061        return Charset.forName("UTF-8").newEncoder().
062               onMalformedInput(CodingErrorAction.REPORT).
063               onUnmappableCharacter(CodingErrorAction.REPORT);
064    }
065  };
066  
067  private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
068    new ThreadLocal<CharsetDecoder>() {
069    @Override
070    protected CharsetDecoder initialValue() {
071      return Charset.forName("UTF-8").newDecoder().
072             onMalformedInput(CodingErrorAction.REPORT).
073             onUnmappableCharacter(CodingErrorAction.REPORT);
074    }
075  };
076  
077  private static final byte [] EMPTY_BYTES = new byte[0];
078  
079  private byte[] bytes;
080  private int length;
081
082  public Text() {
083    bytes = EMPTY_BYTES;
084  }
085
086  /** Construct from a string. 
087   */
088  public Text(String string) {
089    set(string);
090  }
091
092  /** Construct from another text. */
093  public Text(Text utf8) {
094    set(utf8);
095  }
096
097  /** Construct from a byte array.
098   */
099  public Text(byte[] utf8)  {
100    set(utf8);
101  }
102  
103  /**
104   * Get a copy of the bytes that is exactly the length of the data.
105   * See {@link #getBytes()} for faster access to the underlying array.
106   */
107  public byte[] copyBytes() {
108    byte[] result = new byte[length];
109    System.arraycopy(bytes, 0, result, 0, length);
110    return result;
111  }
112  
113  /**
114   * Returns the raw bytes; however, only data up to {@link #getLength()} is
115   * valid. Please use {@link #copyBytes()} if you
116   * need the returned array to be precisely the length of the data.
117   */
118  @Override
119  public byte[] getBytes() {
120    return bytes;
121  }
122
123  /** Returns the number of bytes in the byte array */ 
124  @Override
125  public int getLength() {
126    return length;
127  }
128  
129  /**
130   * Returns the Unicode Scalar Value (32-bit integer value)
131   * for the character at <code>position</code>. Note that this
132   * method avoids using the converter or doing String instantiation
133   * @return the Unicode scalar value at position or -1
134   *          if the position is invalid or points to a
135   *          trailing byte
136   */
137  public int charAt(int position) {
138    if (position > this.length) return -1; // too long
139    if (position < 0) return -1; // duh.
140      
141    ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
142    return bytesToCodePoint(bb.slice());
143  }
144  
145  public int find(String what) {
146    return find(what, 0);
147  }
148  
149  /**
150   * Finds any occurence of <code>what</code> in the backing
151   * buffer, starting as position <code>start</code>. The starting
152   * position is measured in bytes and the return value is in
153   * terms of byte position in the buffer. The backing buffer is
154   * not converted to a string for this operation.
155   * @return byte position of the first occurence of the search
156   *         string in the UTF-8 buffer or -1 if not found
157   */
158  public int find(String what, int start) {
159    try {
160      ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
161      ByteBuffer tgt = encode(what);
162      byte b = tgt.get();
163      src.position(start);
164          
165      while (src.hasRemaining()) {
166        if (b == src.get()) { // matching first byte
167          src.mark(); // save position in loop
168          tgt.mark(); // save position in target
169          boolean found = true;
170          int pos = src.position()-1;
171          while (tgt.hasRemaining()) {
172            if (!src.hasRemaining()) { // src expired first
173              tgt.reset();
174              src.reset();
175              found = false;
176              break;
177            }
178            if (!(tgt.get() == src.get())) {
179              tgt.reset();
180              src.reset();
181              found = false;
182              break; // no match
183            }
184          }
185          if (found) return pos;
186        }
187      }
188      return -1; // not found
189    } catch (CharacterCodingException e) {
190      // can't get here
191      e.printStackTrace();
192      return -1;
193    }
194  }  
195  /** Set to contain the contents of a string. 
196   */
197  public void set(String string) {
198    try {
199      ByteBuffer bb = encode(string, true);
200      bytes = bb.array();
201      length = bb.limit();
202    }catch(CharacterCodingException e) {
203      throw new RuntimeException("Should not have happened ", e); 
204    }
205  }
206
207  /** Set to a utf8 byte array
208   */
209  public void set(byte[] utf8) {
210    set(utf8, 0, utf8.length);
211  }
212  
213  /** copy a text. */
214  public void set(Text other) {
215    set(other.getBytes(), 0, other.getLength());
216  }
217
218  /**
219   * Set the Text to range of bytes
220   * @param utf8 the data to copy from
221   * @param start the first position of the new string
222   * @param len the number of bytes of the new string
223   */
224  public void set(byte[] utf8, int start, int len) {
225    setCapacity(len, false);
226    System.arraycopy(utf8, start, bytes, 0, len);
227    this.length = len;
228  }
229
230  /**
231   * Append a range of bytes to the end of the given text
232   * @param utf8 the data to copy from
233   * @param start the first position to append from utf8
234   * @param len the number of bytes to append
235   */
236  public void append(byte[] utf8, int start, int len) {
237    setCapacity(length + len, true);
238    System.arraycopy(utf8, start, bytes, length, len);
239    length += len;
240  }
241
242  /**
243   * Clear the string to empty.
244   *
245   * <em>Note</em>: For performance reasons, this call does not clear the
246   * underlying byte array that is retrievable via {@link #getBytes()}.
247   * In order to free the byte-array memory, call {@link #set(byte[])}
248   * with an empty byte array (For example, <code>new byte[0]</code>).
249   */
250  public void clear() {
251    length = 0;
252  }
253
254  /*
255   * Sets the capacity of this Text object to <em>at least</em>
256   * <code>len</code> bytes. If the current buffer is longer,
257   * then the capacity and existing content of the buffer are
258   * unchanged. If <code>len</code> is larger
259   * than the current capacity, the Text object's capacity is
260   * increased to match.
261   * @param len the number of bytes we need
262   * @param keepData should the old data be kept
263   */
264  private void setCapacity(int len, boolean keepData) {
265    if (bytes == null || bytes.length < len) {
266      if (bytes != null && keepData) {
267        bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
268      } else {
269        bytes = new byte[len];
270      }
271    }
272  }
273   
274  /** 
275   * Convert text back to string
276   * @see java.lang.Object#toString()
277   */
278  @Override
279  public String toString() {
280    try {
281      return decode(bytes, 0, length);
282    } catch (CharacterCodingException e) { 
283      throw new RuntimeException("Should not have happened " , e); 
284    }
285  }
286  
287  /** deserialize 
288   */
289  @Override
290  public void readFields(DataInput in) throws IOException {
291    int newLength = WritableUtils.readVInt(in);
292    setCapacity(newLength, false);
293    in.readFully(bytes, 0, newLength);
294    length = newLength;
295  }
296  
297  public void readFields(DataInput in, int maxLength) throws IOException {
298    int newLength = WritableUtils.readVInt(in);
299    if (newLength < 0) {
300      throw new IOException("tried to deserialize " + newLength +
301          " bytes of data!  newLength must be non-negative.");
302    } else if (newLength >= maxLength) {
303      throw new IOException("tried to deserialize " + newLength +
304          " bytes of data, but maxLength = " + maxLength);
305    }
306    setCapacity(newLength, false);
307    in.readFully(bytes, 0, newLength);
308    length = newLength;
309  }
310
311  /** Skips over one Text in the input. */
312  public static void skip(DataInput in) throws IOException {
313    int length = WritableUtils.readVInt(in);
314    WritableUtils.skipFully(in, length);
315  }
316
317  /** serialize
318   * write this object to out
319   * length uses zero-compressed encoding
320   * @see Writable#write(DataOutput)
321   */
322  @Override
323  public void write(DataOutput out) throws IOException {
324    WritableUtils.writeVInt(out, length);
325    out.write(bytes, 0, length);
326  }
327
328  public void write(DataOutput out, int maxLength) throws IOException {
329    if (length > maxLength) {
330      throw new IOException("data was too long to write!  Expected " +
331          "less than or equal to " + maxLength + " bytes, but got " +
332          length + " bytes.");
333    }
334    WritableUtils.writeVInt(out, length);
335    out.write(bytes, 0, length);
336  }
337
338  /** Returns true iff <code>o</code> is a Text with the same contents.  */
339  @Override
340  public boolean equals(Object o) {
341    if (o instanceof Text)
342      return super.equals(o);
343    return false;
344  }
345
346  @Override
347  public int hashCode() {
348    return super.hashCode();
349  }
350
351  /** A WritableComparator optimized for Text keys. */
352  public static class Comparator extends WritableComparator {
353    public Comparator() {
354      super(Text.class);
355    }
356
357    @Override
358    public int compare(byte[] b1, int s1, int l1,
359                       byte[] b2, int s2, int l2) {
360      int n1 = WritableUtils.decodeVIntSize(b1[s1]);
361      int n2 = WritableUtils.decodeVIntSize(b2[s2]);
362      return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
363    }
364  }
365
366  static {
367    // register this comparator
368    WritableComparator.define(Text.class, new Comparator());
369  }
370
371  /// STATIC UTILITIES FROM HERE DOWN
372  /**
373   * Converts the provided byte array to a String using the
374   * UTF-8 encoding. If the input is malformed,
375   * replace by a default value.
376   */
377  public static String decode(byte[] utf8) throws CharacterCodingException {
378    return decode(ByteBuffer.wrap(utf8), true);
379  }
380  
381  public static String decode(byte[] utf8, int start, int length) 
382    throws CharacterCodingException {
383    return decode(ByteBuffer.wrap(utf8, start, length), true);
384  }
385  
386  /**
387   * Converts the provided byte array to a String using the
388   * UTF-8 encoding. If <code>replace</code> is true, then
389   * malformed input is replaced with the
390   * substitution character, which is U+FFFD. Otherwise the
391   * method throws a MalformedInputException.
392   */
393  public static String decode(byte[] utf8, int start, int length, boolean replace) 
394    throws CharacterCodingException {
395    return decode(ByteBuffer.wrap(utf8, start, length), replace);
396  }
397  
398  private static String decode(ByteBuffer utf8, boolean replace) 
399    throws CharacterCodingException {
400    CharsetDecoder decoder = DECODER_FACTORY.get();
401    if (replace) {
402      decoder.onMalformedInput(
403          java.nio.charset.CodingErrorAction.REPLACE);
404      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
405    }
406    String str = decoder.decode(utf8).toString();
407    // set decoder back to its default value: REPORT
408    if (replace) {
409      decoder.onMalformedInput(CodingErrorAction.REPORT);
410      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
411    }
412    return str;
413  }
414
415  /**
416   * Converts the provided String to bytes using the
417   * UTF-8 encoding. If the input is malformed,
418   * invalid chars are replaced by a default value.
419   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
420   *                     and length is ByteBuffer.limit()
421   */
422
423  public static ByteBuffer encode(String string)
424    throws CharacterCodingException {
425    return encode(string, true);
426  }
427
428  /**
429   * Converts the provided String to bytes using the
430   * UTF-8 encoding. If <code>replace</code> is true, then
431   * malformed input is replaced with the
432   * substitution character, which is U+FFFD. Otherwise the
433   * method throws a MalformedInputException.
434   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
435   *                     and length is ByteBuffer.limit()
436   */
437  public static ByteBuffer encode(String string, boolean replace)
438    throws CharacterCodingException {
439    CharsetEncoder encoder = ENCODER_FACTORY.get();
440    if (replace) {
441      encoder.onMalformedInput(CodingErrorAction.REPLACE);
442      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
443    }
444    ByteBuffer bytes = 
445      encoder.encode(CharBuffer.wrap(string.toCharArray()));
446    if (replace) {
447      encoder.onMalformedInput(CodingErrorAction.REPORT);
448      encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
449    }
450    return bytes;
451  }
452
453  static final public int DEFAULT_MAX_LEN = 1024 * 1024;
454
455  /** Read a UTF8 encoded string from in
456   */
457  public static String readString(DataInput in) throws IOException {
458    return readString(in, Integer.MAX_VALUE);
459  }
460  
461  /** Read a UTF8 encoded string with a maximum size
462   */
463  public static String readString(DataInput in, int maxLength)
464      throws IOException {
465    int length = WritableUtils.readVIntInRange(in, 0, maxLength);
466    byte [] bytes = new byte[length];
467    in.readFully(bytes, 0, length);
468    return decode(bytes);
469  }
470  
471  /** Write a UTF8 encoded string to out
472   */
473  public static int writeString(DataOutput out, String s) throws IOException {
474    ByteBuffer bytes = encode(s);
475    int length = bytes.limit();
476    WritableUtils.writeVInt(out, length);
477    out.write(bytes.array(), 0, length);
478    return length;
479  }
480
481  /** Write a UTF8 encoded string with a maximum size to out
482   */
483  public static int writeString(DataOutput out, String s, int maxLength)
484      throws IOException {
485    ByteBuffer bytes = encode(s);
486    int length = bytes.limit();
487    if (length > maxLength) {
488      throw new IOException("string was too long to write!  Expected " +
489          "less than or equal to " + maxLength + " bytes, but got " +
490          length + " bytes.");
491    }
492    WritableUtils.writeVInt(out, length);
493    out.write(bytes.array(), 0, length);
494    return length;
495  }
496
497  ////// states for validateUTF8
498  
499  private static final int LEAD_BYTE = 0;
500
501  private static final int TRAIL_BYTE_1 = 1;
502
503  private static final int TRAIL_BYTE = 2;
504
505  /** 
506   * Check if a byte array contains valid utf-8
507   * @param utf8 byte array
508   * @throws MalformedInputException if the byte array contains invalid utf-8
509   */
510  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
511    validateUTF8(utf8, 0, utf8.length);     
512  }
513  
514  /**
515   * Check to see if a byte array is valid utf-8
516   * @param utf8 the array of bytes
517   * @param start the offset of the first byte in the array
518   * @param len the length of the byte sequence
519   * @throws MalformedInputException if the byte array contains invalid bytes
520   */
521  public static void validateUTF8(byte[] utf8, int start, int len)
522    throws MalformedInputException {
523    int count = start;
524    int leadByte = 0;
525    int length = 0;
526    int state = LEAD_BYTE;
527    while (count < start+len) {
528      int aByte = utf8[count] & 0xFF;
529
530      switch (state) {
531      case LEAD_BYTE:
532        leadByte = aByte;
533        length = bytesFromUTF8[aByte];
534
535        switch (length) {
536        case 0: // check for ASCII
537          if (leadByte > 0x7F)
538            throw new MalformedInputException(count);
539          break;
540        case 1:
541          if (leadByte < 0xC2 || leadByte > 0xDF)
542            throw new MalformedInputException(count);
543          state = TRAIL_BYTE_1;
544          break;
545        case 2:
546          if (leadByte < 0xE0 || leadByte > 0xEF)
547            throw new MalformedInputException(count);
548          state = TRAIL_BYTE_1;
549          break;
550        case 3:
551          if (leadByte < 0xF0 || leadByte > 0xF4)
552            throw new MalformedInputException(count);
553          state = TRAIL_BYTE_1;
554          break;
555        default:
556          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
557          // or if < 0 we got a trail byte in the lead byte position
558          throw new MalformedInputException(count);
559        } // switch (length)
560        break;
561
562      case TRAIL_BYTE_1:
563        if (leadByte == 0xF0 && aByte < 0x90)
564          throw new MalformedInputException(count);
565        if (leadByte == 0xF4 && aByte > 0x8F)
566          throw new MalformedInputException(count);
567        if (leadByte == 0xE0 && aByte < 0xA0)
568          throw new MalformedInputException(count);
569        if (leadByte == 0xED && aByte > 0x9F)
570          throw new MalformedInputException(count);
571        // falls through to regular trail-byte test!!
572      case TRAIL_BYTE:
573        if (aByte < 0x80 || aByte > 0xBF)
574          throw new MalformedInputException(count);
575        if (--length == 0) {
576          state = LEAD_BYTE;
577        } else {
578          state = TRAIL_BYTE;
579        }
580        break;
581      } // switch (state)
582      count++;
583    }
584  }
585
586  /**
587   * Magic numbers for UTF-8. These are the number of bytes
588   * that <em>follow</em> a given lead byte. Trailing bytes
589   * have the value -1. The values 4 and 5 are presented in
590   * this table, even though valid UTF-8 cannot include the
591   * five and six byte sequences.
592   */
593  static final int[] bytesFromUTF8 =
594  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
595    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
596    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
597    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
598    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
599    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
600    0, 0, 0, 0, 0, 0, 0,
601    // trail bytes
602    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
603    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
604    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
605    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
606    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
607    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
608    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
609
610  /**
611   * Returns the next code point at the current position in
612   * the buffer. The buffer's position will be incremented.
613   * Any mark set on this buffer will be changed by this method!
614   */
615  public static int bytesToCodePoint(ByteBuffer bytes) {
616    bytes.mark();
617    byte b = bytes.get();
618    bytes.reset();
619    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
620    if (extraBytesToRead < 0) return -1; // trailing byte!
621    int ch = 0;
622
623    switch (extraBytesToRead) {
624    case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
625    case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
626    case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
627    case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
628    case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
629    case 0: ch += (bytes.get() & 0xFF);
630    }
631    ch -= offsetsFromUTF8[extraBytesToRead];
632
633    return ch;
634  }
635
636  
637  static final int offsetsFromUTF8[] =
638  { 0x00000000, 0x00003080,
639    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
640
641  /**
642   * For the given string, returns the number of UTF-8 bytes
643   * required to encode the string.
644   * @param string text to encode
645   * @return number of UTF-8 bytes required to encode
646   */
647  public static int utf8Length(String string) {
648    CharacterIterator iter = new StringCharacterIterator(string);
649    char ch = iter.first();
650    int size = 0;
651    while (ch != CharacterIterator.DONE) {
652      if ((ch >= 0xD800) && (ch < 0xDC00)) {
653        // surrogate pair?
654        char trail = iter.next();
655        if ((trail > 0xDBFF) && (trail < 0xE000)) {
656          // valid pair
657          size += 4;
658        } else {
659          // invalid pair
660          size += 3;
661          iter.previous(); // rewind one
662        }
663      } else if (ch < 0x80) {
664        size++;
665      } else if (ch < 0x800) {
666        size += 2;
667      } else {
668        // ch < 0x10000, that is, the largest char value
669        size += 3;
670      }
671      ch = iter.next();
672    }
673    return size;
674  }
675
676  @Override
677  public void getPrefix(byte[] dst, int off, int prefixLen) {
678    int copyLen = Math.min(prefixLen, length);
679    int i = 0;
680    while (i < copyLen) {
681      dst[off + i] = bytes[i];
682      i++;
683    }
684    while (i < prefixLen) {
685      dst[off + i] = 0;
686      i++;
687    }
688  }
689}