001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    import java.nio.ByteBuffer;
025    import java.nio.CharBuffer;
026    import java.nio.charset.CharacterCodingException;
027    import java.nio.charset.Charset;
028    import java.nio.charset.CharsetDecoder;
029    import java.nio.charset.CharsetEncoder;
030    import java.nio.charset.CodingErrorAction;
031    import java.nio.charset.MalformedInputException;
032    import java.text.CharacterIterator;
033    import java.text.StringCharacterIterator;
034    import java.util.Arrays;
035    
036    import org.apache.avro.reflect.Stringable;
037    import org.apache.hadoop.classification.InterfaceAudience;
038    import org.apache.hadoop.classification.InterfaceStability;
039    import org.apache.hadoop.classification.MapRModified;
040    
041    /** This class stores text using standard UTF8 encoding.  It provides methods
042     * to serialize, deserialize, and compare texts at byte level.  The type of
043     * length is integer and is serialized using zero-compressed format.  <p>In
044     * addition, it provides methods for string traversal without converting the
045     * byte array to a string.  <p>Also includes utilities for
046     * serializing/deserialing a string, coding/decoding a string, checking if a
047     * byte array contains valid UTF8 code, calculating the length of an encoded
048     * string.
049     */
050    @Stringable
051    @InterfaceAudience.Public
052    @InterfaceStability.Stable
053    @MapRModified(summary = "Improve map task performance - bug 13086")
054    public class Text extends BinaryComparable
055        implements WritableComparable<BinaryComparable>, HasRawComparablePrefix {
056      
057      private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
058        new ThreadLocal<CharsetEncoder>() {
059          @Override
060          protected CharsetEncoder initialValue() {
061            return Charset.forName("UTF-8").newEncoder().
062                   onMalformedInput(CodingErrorAction.REPORT).
063                   onUnmappableCharacter(CodingErrorAction.REPORT);
064        }
065      };
066      
067      private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
068        new ThreadLocal<CharsetDecoder>() {
069        @Override
070        protected CharsetDecoder initialValue() {
071          return Charset.forName("UTF-8").newDecoder().
072                 onMalformedInput(CodingErrorAction.REPORT).
073                 onUnmappableCharacter(CodingErrorAction.REPORT);
074        }
075      };
076      
077      private static final byte [] EMPTY_BYTES = new byte[0];
078      
079      private byte[] bytes;
080      private int length;
081    
082      public Text() {
083        bytes = EMPTY_BYTES;
084      }
085    
086      /** Construct from a string. 
087       */
088      public Text(String string) {
089        set(string);
090      }
091    
092      /** Construct from another text. */
093      public Text(Text utf8) {
094        set(utf8);
095      }
096    
097      /** Construct from a byte array.
098       */
099      public Text(byte[] utf8)  {
100        set(utf8);
101      }
102      
103      /**
104       * Get a copy of the bytes that is exactly the length of the data.
105       * See {@link #getBytes()} for faster access to the underlying array.
106       */
107      public byte[] copyBytes() {
108        byte[] result = new byte[length];
109        System.arraycopy(bytes, 0, result, 0, length);
110        return result;
111      }
112      
113      /**
114       * Returns the raw bytes; however, only data up to {@link #getLength()} is
115       * valid. Please use {@link #copyBytes()} if you
116       * need the returned array to be precisely the length of the data.
117       */
118      @Override
119      public byte[] getBytes() {
120        return bytes;
121      }
122    
123      /** Returns the number of bytes in the byte array */ 
124      @Override
125      public int getLength() {
126        return length;
127      }
128      
129      /**
130       * Returns the Unicode Scalar Value (32-bit integer value)
131       * for the character at <code>position</code>. Note that this
132       * method avoids using the converter or doing String instantiation
133       * @return the Unicode scalar value at position or -1
134       *          if the position is invalid or points to a
135       *          trailing byte
136       */
137      public int charAt(int position) {
138        if (position > this.length) return -1; // too long
139        if (position < 0) return -1; // duh.
140          
141        ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
142        return bytesToCodePoint(bb.slice());
143      }
144      
145      public int find(String what) {
146        return find(what, 0);
147      }
148      
149      /**
150       * Finds any occurence of <code>what</code> in the backing
151       * buffer, starting as position <code>start</code>. The starting
152       * position is measured in bytes and the return value is in
153       * terms of byte position in the buffer. The backing buffer is
154       * not converted to a string for this operation.
155       * @return byte position of the first occurence of the search
156       *         string in the UTF-8 buffer or -1 if not found
157       */
158      public int find(String what, int start) {
159        try {
160          ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
161          ByteBuffer tgt = encode(what);
162          byte b = tgt.get();
163          src.position(start);
164              
165          while (src.hasRemaining()) {
166            if (b == src.get()) { // matching first byte
167              src.mark(); // save position in loop
168              tgt.mark(); // save position in target
169              boolean found = true;
170              int pos = src.position()-1;
171              while (tgt.hasRemaining()) {
172                if (!src.hasRemaining()) { // src expired first
173                  tgt.reset();
174                  src.reset();
175                  found = false;
176                  break;
177                }
178                if (!(tgt.get() == src.get())) {
179                  tgt.reset();
180                  src.reset();
181                  found = false;
182                  break; // no match
183                }
184              }
185              if (found) return pos;
186            }
187          }
188          return -1; // not found
189        } catch (CharacterCodingException e) {
190          // can't get here
191          e.printStackTrace();
192          return -1;
193        }
194      }  
195      /** Set to contain the contents of a string. 
196       */
197      public void set(String string) {
198        try {
199          ByteBuffer bb = encode(string, true);
200          bytes = bb.array();
201          length = bb.limit();
202        }catch(CharacterCodingException e) {
203          throw new RuntimeException("Should not have happened ", e); 
204        }
205      }
206    
207      /** Set to a utf8 byte array
208       */
209      public void set(byte[] utf8) {
210        set(utf8, 0, utf8.length);
211      }
212      
213      /** copy a text. */
214      public void set(Text other) {
215        set(other.getBytes(), 0, other.getLength());
216      }
217    
218      /**
219       * Set the Text to range of bytes
220       * @param utf8 the data to copy from
221       * @param start the first position of the new string
222       * @param len the number of bytes of the new string
223       */
224      public void set(byte[] utf8, int start, int len) {
225        setCapacity(len, false);
226        System.arraycopy(utf8, start, bytes, 0, len);
227        this.length = len;
228      }
229    
230      /**
231       * Append a range of bytes to the end of the given text
232       * @param utf8 the data to copy from
233       * @param start the first position to append from utf8
234       * @param len the number of bytes to append
235       */
236      public void append(byte[] utf8, int start, int len) {
237        setCapacity(length + len, true);
238        System.arraycopy(utf8, start, bytes, length, len);
239        length += len;
240      }
241    
242      /**
243       * Clear the string to empty.
244       *
245       * <em>Note</em>: For performance reasons, this call does not clear the
246       * underlying byte array that is retrievable via {@link #getBytes()}.
247       * In order to free the byte-array memory, call {@link #set(byte[])}
248       * with an empty byte array (For example, <code>new byte[0]</code>).
249       */
250      public void clear() {
251        length = 0;
252      }
253    
254      /*
255       * Sets the capacity of this Text object to <em>at least</em>
256       * <code>len</code> bytes. If the current buffer is longer,
257       * then the capacity and existing content of the buffer are
258       * unchanged. If <code>len</code> is larger
259       * than the current capacity, the Text object's capacity is
260       * increased to match.
261       * @param len the number of bytes we need
262       * @param keepData should the old data be kept
263       */
264      private void setCapacity(int len, boolean keepData) {
265        if (bytes == null || bytes.length < len) {
266          if (bytes != null && keepData) {
267            bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
268          } else {
269            bytes = new byte[len];
270          }
271        }
272      }
273       
274      /** 
275       * Convert text back to string
276       * @see java.lang.Object#toString()
277       */
278      @Override
279      public String toString() {
280        try {
281          return decode(bytes, 0, length);
282        } catch (CharacterCodingException e) { 
283          throw new RuntimeException("Should not have happened " , e); 
284        }
285      }
286      
287      /** deserialize 
288       */
289      @Override
290      public void readFields(DataInput in) throws IOException {
291        int newLength = WritableUtils.readVInt(in);
292        setCapacity(newLength, false);
293        in.readFully(bytes, 0, newLength);
294        length = newLength;
295      }
296      
297      public void readFields(DataInput in, int maxLength) throws IOException {
298        int newLength = WritableUtils.readVInt(in);
299        if (newLength < 0) {
300          throw new IOException("tried to deserialize " + newLength +
301              " bytes of data!  newLength must be non-negative.");
302        } else if (newLength >= maxLength) {
303          throw new IOException("tried to deserialize " + newLength +
304              " bytes of data, but maxLength = " + maxLength);
305        }
306        setCapacity(newLength, false);
307        in.readFully(bytes, 0, newLength);
308        length = newLength;
309      }
310    
311      /** Skips over one Text in the input. */
312      public static void skip(DataInput in) throws IOException {
313        int length = WritableUtils.readVInt(in);
314        WritableUtils.skipFully(in, length);
315      }
316    
317      /** serialize
318       * write this object to out
319       * length uses zero-compressed encoding
320       * @see Writable#write(DataOutput)
321       */
322      @Override
323      public void write(DataOutput out) throws IOException {
324        WritableUtils.writeVInt(out, length);
325        out.write(bytes, 0, length);
326      }
327    
328      public void write(DataOutput out, int maxLength) throws IOException {
329        if (length > maxLength) {
330          throw new IOException("data was too long to write!  Expected " +
331              "less than or equal to " + maxLength + " bytes, but got " +
332              length + " bytes.");
333        }
334        WritableUtils.writeVInt(out, length);
335        out.write(bytes, 0, length);
336      }
337    
338      /** Returns true iff <code>o</code> is a Text with the same contents.  */
339      @Override
340      public boolean equals(Object o) {
341        if (o instanceof Text)
342          return super.equals(o);
343        return false;
344      }
345    
346      @Override
347      public int hashCode() {
348        return super.hashCode();
349      }
350    
351      /** A WritableComparator optimized for Text keys. */
352      public static class Comparator extends WritableComparator {
353        public Comparator() {
354          super(Text.class);
355        }
356    
357        @Override
358        public int compare(byte[] b1, int s1, int l1,
359                           byte[] b2, int s2, int l2) {
360          int n1 = WritableUtils.decodeVIntSize(b1[s1]);
361          int n2 = WritableUtils.decodeVIntSize(b2[s2]);
362          return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
363        }
364      }
365    
366      static {
367        // register this comparator
368        WritableComparator.define(Text.class, new Comparator());
369      }
370    
371      /// STATIC UTILITIES FROM HERE DOWN
372      /**
373       * Converts the provided byte array to a String using the
374       * UTF-8 encoding. If the input is malformed,
375       * replace by a default value.
376       */
377      public static String decode(byte[] utf8) throws CharacterCodingException {
378        return decode(ByteBuffer.wrap(utf8), true);
379      }
380      
381      public static String decode(byte[] utf8, int start, int length) 
382        throws CharacterCodingException {
383        return decode(ByteBuffer.wrap(utf8, start, length), true);
384      }
385      
386      /**
387       * Converts the provided byte array to a String using the
388       * UTF-8 encoding. If <code>replace</code> is true, then
389       * malformed input is replaced with the
390       * substitution character, which is U+FFFD. Otherwise the
391       * method throws a MalformedInputException.
392       */
393      public static String decode(byte[] utf8, int start, int length, boolean replace) 
394        throws CharacterCodingException {
395        return decode(ByteBuffer.wrap(utf8, start, length), replace);
396      }
397      
398      private static String decode(ByteBuffer utf8, boolean replace) 
399        throws CharacterCodingException {
400        CharsetDecoder decoder = DECODER_FACTORY.get();
401        if (replace) {
402          decoder.onMalformedInput(
403              java.nio.charset.CodingErrorAction.REPLACE);
404          decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
405        }
406        String str = decoder.decode(utf8).toString();
407        // set decoder back to its default value: REPORT
408        if (replace) {
409          decoder.onMalformedInput(CodingErrorAction.REPORT);
410          decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
411        }
412        return str;
413      }
414    
415      /**
416       * Converts the provided String to bytes using the
417       * UTF-8 encoding. If the input is malformed,
418       * invalid chars are replaced by a default value.
419       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
420       *                     and length is ByteBuffer.limit()
421       */
422    
423      public static ByteBuffer encode(String string)
424        throws CharacterCodingException {
425        return encode(string, true);
426      }
427    
428      /**
429       * Converts the provided String to bytes using the
430       * UTF-8 encoding. If <code>replace</code> is true, then
431       * malformed input is replaced with the
432       * substitution character, which is U+FFFD. Otherwise the
433       * method throws a MalformedInputException.
434       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
435       *                     and length is ByteBuffer.limit()
436       */
437      public static ByteBuffer encode(String string, boolean replace)
438        throws CharacterCodingException {
439        CharsetEncoder encoder = ENCODER_FACTORY.get();
440        if (replace) {
441          encoder.onMalformedInput(CodingErrorAction.REPLACE);
442          encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
443        }
444        ByteBuffer bytes = 
445          encoder.encode(CharBuffer.wrap(string.toCharArray()));
446        if (replace) {
447          encoder.onMalformedInput(CodingErrorAction.REPORT);
448          encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
449        }
450        return bytes;
451      }
452    
453      static final public int DEFAULT_MAX_LEN = 1024 * 1024;
454    
455      /** Read a UTF8 encoded string from in
456       */
457      public static String readString(DataInput in) throws IOException {
458        int length = WritableUtils.readVInt(in);
459        byte [] bytes = new byte[length];
460        in.readFully(bytes, 0, length);
461        return decode(bytes);
462      }
463      
464      /** Read a UTF8 encoded string with a maximum size
465       */
466      public static String readString(DataInput in, int maxLength)
467          throws IOException {
468        int length = WritableUtils.readVIntInRange(in, 0, maxLength);
469        byte [] bytes = new byte[length];
470        in.readFully(bytes, 0, length);
471        return decode(bytes);
472      }
473      
474      /** Write a UTF8 encoded string to out
475       */
476      public static int writeString(DataOutput out, String s) throws IOException {
477        ByteBuffer bytes = encode(s);
478        int length = bytes.limit();
479        WritableUtils.writeVInt(out, length);
480        out.write(bytes.array(), 0, length);
481        return length;
482      }
483    
484      /** Write a UTF8 encoded string with a maximum size to out
485       */
486      public static int writeString(DataOutput out, String s, int maxLength)
487          throws IOException {
488        ByteBuffer bytes = encode(s);
489        int length = bytes.limit();
490        if (length > maxLength) {
491          throw new IOException("string was too long to write!  Expected " +
492              "less than or equal to " + maxLength + " bytes, but got " +
493              length + " bytes.");
494        }
495        WritableUtils.writeVInt(out, length);
496        out.write(bytes.array(), 0, length);
497        return length;
498      }
499    
500      ////// states for validateUTF8
501      
502      private static final int LEAD_BYTE = 0;
503    
504      private static final int TRAIL_BYTE_1 = 1;
505    
506      private static final int TRAIL_BYTE = 2;
507    
508      /** 
509       * Check if a byte array contains valid utf-8
510       * @param utf8 byte array
511       * @throws MalformedInputException if the byte array contains invalid utf-8
512       */
513      public static void validateUTF8(byte[] utf8) throws MalformedInputException {
514        validateUTF8(utf8, 0, utf8.length);     
515      }
516      
517      /**
518       * Check to see if a byte array is valid utf-8
519       * @param utf8 the array of bytes
520       * @param start the offset of the first byte in the array
521       * @param len the length of the byte sequence
522       * @throws MalformedInputException if the byte array contains invalid bytes
523       */
524      public static void validateUTF8(byte[] utf8, int start, int len)
525        throws MalformedInputException {
526        int count = start;
527        int leadByte = 0;
528        int length = 0;
529        int state = LEAD_BYTE;
530        while (count < start+len) {
531          int aByte = utf8[count] & 0xFF;
532    
533          switch (state) {
534          case LEAD_BYTE:
535            leadByte = aByte;
536            length = bytesFromUTF8[aByte];
537    
538            switch (length) {
539            case 0: // check for ASCII
540              if (leadByte > 0x7F)
541                throw new MalformedInputException(count);
542              break;
543            case 1:
544              if (leadByte < 0xC2 || leadByte > 0xDF)
545                throw new MalformedInputException(count);
546              state = TRAIL_BYTE_1;
547              break;
548            case 2:
549              if (leadByte < 0xE0 || leadByte > 0xEF)
550                throw new MalformedInputException(count);
551              state = TRAIL_BYTE_1;
552              break;
553            case 3:
554              if (leadByte < 0xF0 || leadByte > 0xF4)
555                throw new MalformedInputException(count);
556              state = TRAIL_BYTE_1;
557              break;
558            default:
559              // too long! Longest valid UTF-8 is 4 bytes (lead + three)
560              // or if < 0 we got a trail byte in the lead byte position
561              throw new MalformedInputException(count);
562            } // switch (length)
563            break;
564    
565          case TRAIL_BYTE_1:
566            if (leadByte == 0xF0 && aByte < 0x90)
567              throw new MalformedInputException(count);
568            if (leadByte == 0xF4 && aByte > 0x8F)
569              throw new MalformedInputException(count);
570            if (leadByte == 0xE0 && aByte < 0xA0)
571              throw new MalformedInputException(count);
572            if (leadByte == 0xED && aByte > 0x9F)
573              throw new MalformedInputException(count);
574            // falls through to regular trail-byte test!!
575          case TRAIL_BYTE:
576            if (aByte < 0x80 || aByte > 0xBF)
577              throw new MalformedInputException(count);
578            if (--length == 0) {
579              state = LEAD_BYTE;
580            } else {
581              state = TRAIL_BYTE;
582            }
583            break;
584          } // switch (state)
585          count++;
586        }
587      }
588    
589      /**
590       * Magic numbers for UTF-8. These are the number of bytes
591       * that <em>follow</em> a given lead byte. Trailing bytes
592       * have the value -1. The values 4 and 5 are presented in
593       * this table, even though valid UTF-8 cannot include the
594       * five and six byte sequences.
595       */
596      static final int[] bytesFromUTF8 =
597      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
598        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
599        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
600        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
601        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
602        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
603        0, 0, 0, 0, 0, 0, 0,
604        // trail bytes
605        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
606        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
607        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
608        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
609        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
610        1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
611        3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
612    
613      /**
614       * Returns the next code point at the current position in
615       * the buffer. The buffer's position will be incremented.
616       * Any mark set on this buffer will be changed by this method!
617       */
618      public static int bytesToCodePoint(ByteBuffer bytes) {
619        bytes.mark();
620        byte b = bytes.get();
621        bytes.reset();
622        int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
623        if (extraBytesToRead < 0) return -1; // trailing byte!
624        int ch = 0;
625    
626        switch (extraBytesToRead) {
627        case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
628        case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
629        case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
630        case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
631        case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
632        case 0: ch += (bytes.get() & 0xFF);
633        }
634        ch -= offsetsFromUTF8[extraBytesToRead];
635    
636        return ch;
637      }
638    
639      
640      static final int offsetsFromUTF8[] =
641      { 0x00000000, 0x00003080,
642        0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
643    
644      /**
645       * For the given string, returns the number of UTF-8 bytes
646       * required to encode the string.
647       * @param string text to encode
648       * @return number of UTF-8 bytes required to encode
649       */
650      public static int utf8Length(String string) {
651        CharacterIterator iter = new StringCharacterIterator(string);
652        char ch = iter.first();
653        int size = 0;
654        while (ch != CharacterIterator.DONE) {
655          if ((ch >= 0xD800) && (ch < 0xDC00)) {
656            // surrogate pair?
657            char trail = iter.next();
658            if ((trail > 0xDBFF) && (trail < 0xE000)) {
659              // valid pair
660              size += 4;
661            } else {
662              // invalid pair
663              size += 3;
664              iter.previous(); // rewind one
665            }
666          } else if (ch < 0x80) {
667            size++;
668          } else if (ch < 0x800) {
669            size += 2;
670          } else {
671            // ch < 0x10000, that is, the largest char value
672            size += 3;
673          }
674          ch = iter.next();
675        }
676        return size;
677      }
678    
679      @Override
680      public void getPrefix(byte[] dst, int off, int prefixLen) {
681        int copyLen = Math.min(prefixLen, length);
682        int i = 0;
683        while (i < copyLen) {
684          dst[off + i] = bytes[i];
685          i++;
686        }
687        while (i < prefixLen) {
688          dst[off + i] = 0;
689          i++;
690        }
691      }
692    }