1/*
2 * reserved comment block
3 * DO NOT REMOVE OR ALTER!
4 */
5/*
6 * Licensed to the Apache Software Foundation (ASF) under one or more
7 * contributor license agreements.  See the NOTICE file distributed with
8 * this work for additional information regarding copyright ownership.
9 * The ASF licenses this file to You under the Apache License, Version 2.0
10 * (the "License"); you may not use this file except in compliance with
11 * the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 * Unless required by applicable law or agreed to in writing, software
16 * distributed under the License is distributed on an "AS IS" BASIS,
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 * See the License for the specific language governing permissions and
19 * limitations under the License.
20 */
21
22package com.sun.org.apache.xerces.internal.impl.io;
23
24import java.io.InputStream;
25import java.io.IOException;
26import java.io.Reader;
27
28import java.util.Locale;
29import com.sun.org.apache.xerces.internal.util.MessageFormatter;
30import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
31
32import com.sun.xml.internal.stream.util.BufferAllocator;
33import com.sun.xml.internal.stream.util.ThreadLocalBufferAllocator;
34
35/**
36 * <p>A UTF-8 reader.</p>
37 *
38 * @xerces.internal
39 *
40 * @author Andy Clark, IBM
41 *
42 */
43public class UTF8Reader
44    extends Reader {
45
46    //
47    // Constants
48    //
49
50    /** Default byte buffer size (2048). */
51    public static final int DEFAULT_BUFFER_SIZE = 2048;
52
53    // debugging
54
55    /** Debug read. */
56    private static final boolean DEBUG_READ = false;
57
58    //
59    // Data
60    //
61
62    /** Input stream. */
63    protected InputStream fInputStream;
64
65    /** Byte buffer. */
66    protected byte[] fBuffer;
67
68    /** Offset into buffer. */
69    protected int fOffset;
70
71    /** Surrogate character. */
72    private int fSurrogate = -1;
73
74    // message formatter; used to produce localized
75    // exception messages
76    private MessageFormatter fFormatter = null;
77
78    //Locale to use for messages
79    private Locale fLocale = null;
80
81    //
82    // Constructors
83    //
84
85    /**
86     * Constructs a UTF-8 reader from the specified input stream
87     * using the default buffer size.  Primarily for testing.
88     *
89     * @param inputStream The input stream.
90     */
91    public UTF8Reader(InputStream inputStream) {
92        this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
93    } // <init>(InputStream, MessageFormatter)
94
95    /**
96     * Constructs a UTF-8 reader from the specified input stream
97     * using the default buffer size and the given MessageFormatter.
98     *
99     * @param inputStream The input stream.
100     * @param messageFormatter  given MessageFormatter
101     * @param locale    Locale to use for messages
102     */
103    public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,
104            Locale locale) {
105        this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
106    } // <init>(InputStream, MessageFormatter, Locale)
107
108    /**
109     * Constructs a UTF-8 reader from the specified input stream,
110     * buffer size and MessageFormatter.
111     *
112     * @param inputStream The input stream.
113     * @param size        The initial buffer size.
114     * @param messageFormatter  the formatter for localizing/formatting errors.
115     * @param locale    the Locale to use for messages
116     */
117    public UTF8Reader(InputStream inputStream, int size,
118            MessageFormatter messageFormatter, Locale locale) {
119        fInputStream = inputStream;
120        BufferAllocator ba = ThreadLocalBufferAllocator.getBufferAllocator();
121        fBuffer = ba.getByteBuffer(size);
122        if (fBuffer == null) {
123            fBuffer = new byte[size];
124        }
125        fFormatter = messageFormatter;
126        fLocale = locale;
127    } // <init>(InputStream, int, MessageFormatter, Locale)
128
129    //
130    // Reader methods
131    //
132
133    /**
134     * Read a single character.  This method will block until a character is
135     * available, an I/O error occurs, or the end of the stream is reached.
136     *
137     * <p> Subclasses that intend to support efficient single-character input
138     * should override this method.
139     *
140     * @return     The character read, as an integer in the range 0 to 16383
141     *             (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
142     *             been reached
143     *
144     * @exception  IOException  If an I/O error occurs
145     */
146    public int read() throws IOException {
147
148        // decode character
149        int c = fSurrogate;
150        if (fSurrogate == -1) {
151            // NOTE: We use the index into the buffer if there are remaining
152            //       bytes from the last block read. -Ac
153            int index = 0;
154
155            // get first byte
156            int b0 = index == fOffset
157                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
158            if (b0 == -1) {
159                return -1;
160            }
161
162            // UTF-8:   [0xxx xxxx]
163            // Unicode: [0000 0000] [0xxx xxxx]
164            if (b0 < 0x80) {
165                c = (char)b0;
166            }
167
168            // UTF-8:   [110y yyyy] [10xx xxxx]
169            // Unicode: [0000 0yyy] [yyxx xxxx]
170            else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
171                int b1 = index == fOffset
172                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
173                if (b1 == -1) {
174                    expectedByte(2, 2);
175                }
176                if ((b1 & 0xC0) != 0x80) {
177                    invalidByte(2, 2, b1);
178                }
179                c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
180            }
181
182            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
183            // Unicode: [zzzz yyyy] [yyxx xxxx]
184            else if ((b0 & 0xF0) == 0xE0) {
185                int b1 = index == fOffset
186                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
187                if (b1 == -1) {
188                    expectedByte(2, 3);
189                }
190                if ((b1 & 0xC0) != 0x80
191                    || (b0 == 0xED && b1 >= 0xA0)
192                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
193                    invalidByte(2, 3, b1);
194                }
195                int b2 = index == fOffset
196                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
197                if (b2 == -1) {
198                    expectedByte(3, 3);
199                }
200                if ((b2 & 0xC0) != 0x80) {
201                    invalidByte(3, 3, b2);
202                }
203                c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
204                    (b2 & 0x003F);
205            }
206
207            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
208            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
209            //          [1101 11yy] [yyxx xxxx] (low surrogate)
210            //          * uuuuu = wwww + 1
211            else if ((b0 & 0xF8) == 0xF0) {
212                int b1 = index == fOffset
213                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
214                if (b1 == -1) {
215                    expectedByte(2, 4);
216                }
217                if ((b1 & 0xC0) != 0x80
218                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
219                    invalidByte(2, 3, b1);
220                }
221                int b2 = index == fOffset
222                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
223                if (b2 == -1) {
224                    expectedByte(3, 4);
225                }
226                if ((b2 & 0xC0) != 0x80) {
227                    invalidByte(3, 3, b2);
228                }
229                int b3 = index == fOffset
230                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
231                if (b3 == -1) {
232                    expectedByte(4, 4);
233                }
234                if ((b3 & 0xC0) != 0x80) {
235                    invalidByte(4, 4, b3);
236                }
237                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
238                if (uuuuu > 0x10) {
239                    invalidSurrogate(uuuuu);
240                }
241                int wwww = uuuuu - 1;
242                int hs = 0xD800 |
243                         ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
244                         ((b2 >> 4) & 0x0003);
245                int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
246                c = hs;
247                fSurrogate = ls;
248            }
249
250            // error
251            else {
252                invalidByte(1, 1, b0);
253            }
254        }
255
256        // use surrogate
257        else {
258            fSurrogate = -1;
259        }
260
261        // return character
262        if (DEBUG_READ) {
263            System.out.println("read(): 0x"+Integer.toHexString(c));
264        }
265        return c;
266
267    } // read():int
268
269    /**
270     * Read characters into a portion of an array.  This method will block
271     * until some input is available, an I/O error occurs, or the end of the
272     * stream is reached.
273     *
274     * @param      ch     Destination buffer
275     * @param      offset Offset at which to start storing characters
276     * @param      length Maximum number of characters to read
277     *
278     * @return     The number of characters read, or -1 if the end of the
279     *             stream has been reached
280     *
281     * @exception  IOException  If an I/O error occurs
282     */
283    public int read(char ch[], int offset, int length) throws IOException {
284
285        // handle surrogate
286        int out = offset;
287        if (fSurrogate != -1) {
288            ch[offset + 1] = (char)fSurrogate;
289            fSurrogate = -1;
290            length--;
291            out++;
292        }
293
294        // read bytes
295        int count = 0;
296        if (fOffset == 0) {
297            // adjust length to read
298            if (length > fBuffer.length) {
299                length = fBuffer.length;
300            }
301
302            // perform read operation
303            count = fInputStream.read(fBuffer, 0, length);
304            if (count == -1) {
305                return -1;
306            }
307            count += out - offset;
308        }
309
310        // skip read; last character was in error
311        // NOTE: Having an offset value other than zero means that there was
312        //       an error in the last character read. In this case, we have
313        //       skipped the read so we don't consume any bytes past the
314        //       error. By signalling the error on the next block read we
315        //       allow the method to return the most valid characters that
316        //       it can on the previous block read. -Ac
317        else {
318            count = fOffset;
319            fOffset = 0;
320        }
321
322        // convert bytes to characters
323        final int total = count;
324        int in;
325        byte byte1;
326        final byte byte0 = 0;
327        for (in = 0; in < total; in++) {
328            byte1 = fBuffer[in];
329            if (byte1 >= byte0) {
330                ch[out++] = (char)byte1;
331            }
332            else   {
333                break;
334            }
335        }
336        for ( ; in < total; in++) {
337            byte1 = fBuffer[in];
338
339            // UTF-8:   [0xxx xxxx]
340            // Unicode: [0000 0000] [0xxx xxxx]
341            if (byte1 >= byte0) {
342                ch[out++] = (char)byte1;
343                continue;
344            }
345
346            // UTF-8:   [110y yyyy] [10xx xxxx]
347            // Unicode: [0000 0yyy] [yyxx xxxx]
348            int b0 = byte1 & 0x0FF;
349            if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
350                int b1 = -1;
351                if (++in < total) {
352                    b1 = fBuffer[in] & 0x00FF;
353                }
354                else {
355                    b1 = fInputStream.read();
356                    if (b1 == -1) {
357                        if (out > offset) {
358                            fBuffer[0] = (byte)b0;
359                            fOffset = 1;
360                            return out - offset;
361                        }
362                        expectedByte(2, 2);
363                    }
364                    count++;
365                }
366                if ((b1 & 0xC0) != 0x80) {
367                    if (out > offset) {
368                        fBuffer[0] = (byte)b0;
369                        fBuffer[1] = (byte)b1;
370                        fOffset = 2;
371                        return out - offset;
372                    }
373                    invalidByte(2, 2, b1);
374                }
375                int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
376                ch[out++] = (char)c;
377                count -= 1;
378                continue;
379            }
380
381            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
382            // Unicode: [zzzz yyyy] [yyxx xxxx]
383            if ((b0 & 0xF0) == 0xE0) {
384                int b1 = -1;
385                if (++in < total) {
386                    b1 = fBuffer[in] & 0x00FF;
387                }
388                else {
389                    b1 = fInputStream.read();
390                    if (b1 == -1) {
391                        if (out > offset) {
392                            fBuffer[0] = (byte)b0;
393                            fOffset = 1;
394                            return out - offset;
395                        }
396                        expectedByte(2, 3);
397                    }
398                    count++;
399                }
400                if ((b1 & 0xC0) != 0x80
401                    || (b0 == 0xED && b1 >= 0xA0)
402                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
403                    if (out > offset) {
404                        fBuffer[0] = (byte)b0;
405                        fBuffer[1] = (byte)b1;
406                        fOffset = 2;
407                        return out - offset;
408                    }
409                    invalidByte(2, 3, b1);
410                }
411                int b2 = -1;
412                if (++in < total) {
413                    b2 = fBuffer[in] & 0x00FF;
414                }
415                else {
416                    b2 = fInputStream.read();
417                    if (b2 == -1) {
418                        if (out > offset) {
419                            fBuffer[0] = (byte)b0;
420                            fBuffer[1] = (byte)b1;
421                            fOffset = 2;
422                            return out - offset;
423                        }
424                        expectedByte(3, 3);
425                    }
426                    count++;
427                }
428                if ((b2 & 0xC0) != 0x80) {
429                    if (out > offset) {
430                        fBuffer[0] = (byte)b0;
431                        fBuffer[1] = (byte)b1;
432                        fBuffer[2] = (byte)b2;
433                        fOffset = 3;
434                        return out - offset;
435                    }
436                    invalidByte(3, 3, b2);
437                }
438                int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
439                        (b2 & 0x003F);
440                ch[out++] = (char)c;
441                count -= 2;
442                continue;
443            }
444
445            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
446            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
447            //          [1101 11yy] [yyxx xxxx] (low surrogate)
448            //          * uuuuu = wwww + 1
449            if ((b0 & 0xF8) == 0xF0) {
450                int b1 = -1;
451                if (++in < total) {
452                    b1 = fBuffer[in] & 0x00FF;
453                }
454                else {
455                    b1 = fInputStream.read();
456                    if (b1 == -1) {
457                        if (out > offset) {
458                            fBuffer[0] = (byte)b0;
459                            fOffset = 1;
460                            return out - offset;
461                        }
462                        expectedByte(2, 4);
463                    }
464                    count++;
465                }
466                if ((b1 & 0xC0) != 0x80
467                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
468                    if (out > offset) {
469                        fBuffer[0] = (byte)b0;
470                        fBuffer[1] = (byte)b1;
471                        fOffset = 2;
472                        return out - offset;
473                    }
474                    invalidByte(2, 4, b1);
475                }
476                int b2 = -1;
477                if (++in < total) {
478                    b2 = fBuffer[in] & 0x00FF;
479                }
480                else {
481                    b2 = fInputStream.read();
482                    if (b2 == -1) {
483                        if (out > offset) {
484                            fBuffer[0] = (byte)b0;
485                            fBuffer[1] = (byte)b1;
486                            fOffset = 2;
487                            return out - offset;
488                        }
489                        expectedByte(3, 4);
490                    }
491                    count++;
492                }
493                if ((b2 & 0xC0) != 0x80) {
494                    if (out > offset) {
495                        fBuffer[0] = (byte)b0;
496                        fBuffer[1] = (byte)b1;
497                        fBuffer[2] = (byte)b2;
498                        fOffset = 3;
499                        return out - offset;
500                    }
501                    invalidByte(3, 4, b2);
502                }
503                int b3 = -1;
504                if (++in < total) {
505                    b3 = fBuffer[in] & 0x00FF;
506                }
507                else {
508                    b3 = fInputStream.read();
509                    if (b3 == -1) {
510                        if (out > offset) {
511                            fBuffer[0] = (byte)b0;
512                            fBuffer[1] = (byte)b1;
513                            fBuffer[2] = (byte)b2;
514                            fOffset = 3;
515                            return out - offset;
516                        }
517                        expectedByte(4, 4);
518                    }
519                    count++;
520                }
521                if ((b3 & 0xC0) != 0x80) {
522                    if (out > offset) {
523                        fBuffer[0] = (byte)b0;
524                        fBuffer[1] = (byte)b1;
525                        fBuffer[2] = (byte)b2;
526                        fBuffer[3] = (byte)b3;
527                        fOffset = 4;
528                        return out - offset;
529                    }
530                    invalidByte(4, 4, b2);
531                }
532
533                // check if output buffer is large enough to hold 2 surrogate chars
534                if (out + 1 >= ch.length) {
535                    fBuffer[0] = (byte)b0;
536                    fBuffer[1] = (byte)b1;
537                    fBuffer[2] = (byte)b2;
538                    fBuffer[3] = (byte)b3;
539                    fOffset = 4;
540                    return out - offset;
541                }
542
543                // decode bytes into surrogate characters
544                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
545                if (uuuuu > 0x10) {
546                    invalidSurrogate(uuuuu);
547                }
548                int wwww = uuuuu - 1;
549                int zzzz = b1 & 0x000F;
550                int yyyyyy = b2 & 0x003F;
551                int xxxxxx = b3 & 0x003F;
552                int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
553                int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
554
555                // set characters
556                ch[out++] = (char)hs;
557                ch[out++] = (char)ls;
558                count -= 2;
559                continue;
560            }
561
562            // error
563            if (out > offset) {
564                fBuffer[0] = (byte)b0;
565                fOffset = 1;
566                return out - offset;
567            }
568            invalidByte(1, 1, b0);
569        }
570
571        // return number of characters converted
572        if (DEBUG_READ) {
573            System.out.println("read(char[],"+offset+','+length+"): count="+count);
574        }
575        return count;
576
577    } // read(char[],int,int)
578
579    /**
580     * Skip characters.  This method will block until some characters are
581     * available, an I/O error occurs, or the end of the stream is reached.
582     *
583     * @param  n  The number of characters to skip
584     *
585     * @return    The number of characters actually skipped
586     *
587     * @exception  IOException  If an I/O error occurs
588     */
589    public long skip(long n) throws IOException {
590
591        long remaining = n;
592        final char[] ch = new char[fBuffer.length];
593        do {
594            int length = ch.length < remaining ? ch.length : (int)remaining;
595            int count = read(ch, 0, length);
596            if (count > 0) {
597                remaining -= count;
598            }
599            else {
600                break;
601            }
602        } while (remaining > 0);
603
604        long skipped = n - remaining;
605        return skipped;
606
607    } // skip(long):long
608
609    /**
610     * Tell whether this stream is ready to be read.
611     *
612     * @return True if the next read() is guaranteed not to block for input,
613     * false otherwise.  Note that returning false does not guarantee that the
614     * next read will block.
615     *
616     * @exception  IOException  If an I/O error occurs
617     */
618    public boolean ready() throws IOException {
619        return false;
620    } // ready()
621
622    /**
623     * Tell whether this stream supports the mark() operation.
624     */
625    public boolean markSupported() {
626        return false;
627    } // markSupported()
628
629    /**
630     * Mark the present position in the stream.  Subsequent calls to reset()
631     * will attempt to reposition the stream to this point.  Not all
632     * character-input streams support the mark() operation.
633     *
634     * @param  readAheadLimit  Limit on the number of characters that may be
635     *                         read while still preserving the mark.  After
636     *                         reading this many characters, attempting to
637     *                         reset the stream may fail.
638     *
639     * @exception  IOException  If the stream does not support mark(),
640     *                          or if some other I/O error occurs
641     */
642    public void mark(int readAheadLimit) throws IOException {
643        throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
644    } // mark(int)
645
646    /**
647     * Reset the stream.  If the stream has been marked, then attempt to
648     * reposition it at the mark.  If the stream has not been marked, then
649     * attempt to reset it in some way appropriate to the particular stream,
650     * for example by repositioning it to its starting point.  Not all
651     * character-input streams support the reset() operation, and some support
652     * reset() without supporting mark().
653     *
654     * @exception  IOException  If the stream has not been marked,
655     *                          or if the mark has been invalidated,
656     *                          or if the stream does not support reset(),
657     *                          or if some other I/O error occurs
658     */
659    public void reset() throws IOException {
660        fOffset = 0;
661        fSurrogate = -1;
662    } // reset()
663
664    /**
665     * Close the stream.  Once a stream has been closed, further read(),
666     * ready(), mark(), or reset() invocations will throw an IOException.
667     * Closing a previously-closed stream, however, has no effect.
668     *
669     * @exception  IOException  If an I/O error occurs
670     */
671    public void close() throws IOException {
672        BufferAllocator ba = ThreadLocalBufferAllocator.getBufferAllocator();
673        ba.returnByteBuffer(fBuffer);
674        fBuffer = null;
675        fInputStream.close();
676    } // close()
677
678    //
679    // Private methods
680    //
681
682    /** Throws an exception for expected byte. */
683    private void expectedByte(int position, int count)
684        throws MalformedByteSequenceException {
685
686        throw new MalformedByteSequenceException(fFormatter,
687            fLocale,
688            XMLMessageFormatter.XML_DOMAIN,
689            "ExpectedByte",
690            new Object[] {Integer.toString(position), Integer.toString(count)});
691
692    } // expectedByte(int,int)
693
694    /** Throws an exception for invalid byte. */
695    private void invalidByte(int position, int count, int c)
696        throws MalformedByteSequenceException {
697
698        throw new MalformedByteSequenceException(fFormatter,
699            fLocale,
700            XMLMessageFormatter.XML_DOMAIN,
701            "InvalidByte",
702            new Object [] {Integer.toString(position), Integer.toString(count)});
703
704    } // invalidByte(int,int,int)
705
706    /** Throws an exception for invalid surrogate bits. */
707    private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {
708
709        throw new MalformedByteSequenceException(fFormatter,
710            fLocale,
711            XMLMessageFormatter.XML_DOMAIN,
712            "InvalidHighSurrogate",
713            new Object[] {Integer.toHexString(uuuuu)});
714
715    } // invalidSurrogate(int)
716
717} // class UTF8Reader
718