1/*
2 * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package jdk.internal.util.xml.impl;
27
28import java.io.Reader;
29import java.io.InputStream;
30import java.io.IOException;
31import java.io.UnsupportedEncodingException;
32
/**
 * UTF-8 transformed UCS-2 character stream reader.
 *
 * This reader converts UTF-8 transformed UCS-2 characters to Java characters.
 * The UCS-2 subset of UTF-8 transformation is described in RFC-2279 #2
 * "UTF-8 definition":
 *  0000 0000-0000 007F   0xxxxxxx
 *  0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *  0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * The decoder is deliberately lenient and performs no validation of
 * continuation bytes: only their low six bits are used. As a consequence
 * this reader will return an incorrect last character on a broken
 * (truncated) UTF-8 stream, and a byte in the range 0x80-0xBF that appears
 * where a lead byte is expected is returned unchanged as a character.
 */
public class ReaderUTF8 extends Reader {

    /** Message reported when a lead byte of the form 1111xxxx is found. */
    private static final String UCS4_NOT_SUPPORTED =
            "UTF-32 (or UCS-4) encoding not supported.";

    /** The underlying byte stream the characters are decoded from. */
    private final InputStream is;

    /**
     * Constructor.
     *
     * @param is A byte input stream.
     */
    public ReaderUTF8(InputStream is) {
        this.is = is;
    }

    /**
     * Reads characters into a portion of an array.
     *
     * The range is validated before any byte is consumed, so an invalid
     * request does not discard input from the underlying stream.
     *
     * @param cbuf Destination buffer.
     * @param off Offset at which to start storing characters.
     * @param len Maximum number of characters to read.
     * @return The number of characters read, or -1 if the end of the stream
     *  was reached before any character could be read.
     * @exception IndexOutOfBoundsException If off is negative, or len is
     *  negative, or len is greater than cbuf.length - off.
     * @exception IOException If any IO errors occur.
     * @exception UnsupportedEncodingException If UCS-4 character occur in the stream.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (off < 0 || len < 0 || len > cbuf.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length);
        }
        int num = 0;
        while (num < len) {
            int lead = is.read();
            if (lead < 0) {
                // End of stream: report what was decoded so far, or -1 if nothing.
                return (num != 0) ? num : -1;
            }
            cbuf[off + num] = (char) decode(lead);
            num++;
        }
        return num;
    }

    /**
     * Reads a single character.
     *
     * @return The character read, as an integer in the range 0 to 65535
     *  (0x00-0xffff), or -1 if the end of the stream has been reached.
     * @exception IOException If any IO errors occur.
     * @exception UnsupportedEncodingException If UCS-4 character occur in the stream.
     */
    @Override
    public int read() throws IOException {
        int lead = is.read();
        if (lead < 0) {
            return -1;
        }
        return decode(lead);
    }

    /**
     * Closes the stream.
     *
     * @exception IOException If any IO errors occur.
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Decodes one character whose first byte has already been read, pulling
     * any continuation bytes it needs from the underlying stream.
     *
     * @param lead The first byte of the sequence, in the range 0 to 255.
     * @return The decoded character value in the range 0 to 65535.
     * @exception IOException If any IO errors occur.
     * @exception UnsupportedEncodingException If the lead byte starts a
     *  sequence longer than three bytes (UCS-4 character).
     */
    private int decode(int lead) throws IOException {
        switch (lead & 0xf0) {
            case 0xc0:
            case 0xd0:
                // 110xxxxx 10xxxxxx
                return ((lead & 0x1f) << 6) | (is.read() & 0x3f);

            case 0xe0: {
                // 1110xxxx 10xxxxxx 10xxxxxx
                int high = (lead & 0x0f) << 12;
                int middle = (is.read() & 0x3f) << 6;
                // If the stream ends here, read() yields -1 which is masked
                // to 0x3f; this is the documented lenient behavior.
                return high | middle | (is.read() & 0x3f);
            }

            case 0xf0:      // UCS-4 character
                throw new UnsupportedEncodingException(UCS4_NOT_SUPPORTED);

            default:
                // Single byte character; stray continuation bytes also end
                // up here and are passed through unchanged.
                return lead;
        }
    }
}
139