1/*
2 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package sun.nio.cs.ext;
27
28import java.nio.ByteBuffer;
29import java.nio.CharBuffer;
30import java.nio.charset.Charset;
31import java.nio.charset.CharsetDecoder;
32import java.nio.charset.CharsetEncoder;
33import java.nio.charset.CoderResult;
34import java.nio.charset.CharacterCodingException;
35import java.nio.charset.MalformedInputException;
36import sun.nio.cs.DelegatableDecoder;
37import sun.nio.cs.HistoricallyNamedCharset;
38import java.security.AccessController;
39import java.security.PrivilegedAction;
40import sun.nio.cs.*;
41import static java.lang.Character.UnicodeBlock;
42
43
44public class JISAutoDetect
45    extends Charset
46    implements HistoricallyNamedCharset
47{
48
49    private final static int EUCJP_MASK       = 0x01;
50    private final static int SJIS2B_MASK      = 0x02;
51    private final static int SJIS1B_MASK      = 0x04;
52    private final static int EUCJP_KANA1_MASK = 0x08;
53    private final static int EUCJP_KANA2_MASK = 0x10;
54
55    public JISAutoDetect() {
56        super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect"));
57    }
58
59    public boolean contains(Charset cs) {
60        return ((cs.name().equals("US-ASCII"))
61                || (cs instanceof SJIS)
62                || (cs instanceof EUC_JP)
63                || (cs instanceof ISO2022_JP));
64    }
65
66    public boolean canEncode() {
67        return false;
68    }
69
70    public CharsetDecoder newDecoder() {
71        return new Decoder(this);
72    }
73
74    public String historicalName() {
75        return "JISAutoDetect";
76    }
77
78    public CharsetEncoder newEncoder() {
79        throw new UnsupportedOperationException();
80    }
81
82    // A heuristic algorithm for guessing if EUC-decoded text really
83    // might be Japanese text.  Better heuristics are possible...
84    private static boolean looksLikeJapanese(CharBuffer cb) {
85        int hiragana = 0;       // Fullwidth Hiragana
86        int katakana = 0;       // Halfwidth Katakana
87        while (cb.hasRemaining()) {
88            char c = cb.get();
89            if (0x3040 <= c && c <= 0x309f && ++hiragana > 1) return true;
90            if (0xff65 <= c && c <= 0xff9f && ++katakana > 1) return true;
91        }
92        return false;
93    }
94
95    private static class Decoder extends CharsetDecoder {
96        private final static String osName = AccessController.doPrivileged(
97            (PrivilegedAction<String>) () -> System.getProperty("os.name"));
98
99        private final static String SJISName = getSJISName();
100        private final static String EUCJPName = getEUCJPName();
101        private DelegatableDecoder detectedDecoder = null;
102
103        public Decoder(Charset cs) {
104            super(cs, 0.5f, 1.0f);
105        }
106
107        private static boolean isPlainASCII(byte b) {
108            return b >= 0 && b != 0x1b;
109        }
110
111        private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) {
112            int start = src.position();
113            int limit = start + Math.min(src.remaining(), dst.remaining());
114            int p;
115            byte b;
116            for (p = start; p < limit && isPlainASCII(b = src.get(p)); p++)
117                dst.put((char)(b & 0xff));
118            src.position(p);
119        }
120
121        private CoderResult decodeLoop(DelegatableDecoder decoder,
122                                       ByteBuffer src, CharBuffer dst) {
123            ((CharsetDecoder)decoder).reset();
124            detectedDecoder = decoder;
125            return detectedDecoder.decodeLoop(src, dst);
126        }
127
128        protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
129            if (detectedDecoder == null) {
130                copyLeadingASCII(src, dst);
131
132                // All ASCII?
133                if (! src.hasRemaining())
134                    return CoderResult.UNDERFLOW;
135                // Overflow only if there is still ascii but no out buffer.
136                if (!dst.hasRemaining() &&
137                    isPlainASCII(src.get(src.position())))
138                    return CoderResult.OVERFLOW;
139
140                // We need to perform double, not float, arithmetic; otherwise
141                // we lose low order bits when src is larger than 2**24.
142                int cbufsiz = (int)(src.limit() * (double)maxCharsPerByte());
143                CharBuffer sandbox = CharBuffer.allocate(cbufsiz);
144
145                // First try ISO-2022-JP, since there is no ambiguity
146                Charset cs2022 = Charset.forName("ISO-2022-JP");
147                DelegatableDecoder dd2022
148                    = (DelegatableDecoder) cs2022.newDecoder();
149                ByteBuffer src2022 = src.asReadOnlyBuffer();
150                CoderResult res2022 = dd2022.decodeLoop(src2022, sandbox);
151                if (! res2022.isError())
152                    return decodeLoop(dd2022, src, dst);
153
154                // We must choose between EUC and SJIS
155                Charset csEUCJ = Charset.forName(EUCJPName);
156                Charset csSJIS = Charset.forName(SJISName);
157
158                DelegatableDecoder ddEUCJ
159                    = (DelegatableDecoder) csEUCJ.newDecoder();
160                DelegatableDecoder ddSJIS
161                    = (DelegatableDecoder) csSJIS.newDecoder();
162
163                ByteBuffer srcEUCJ = src.asReadOnlyBuffer();
164                sandbox.clear();
165                CoderResult resEUCJ = ddEUCJ.decodeLoop(srcEUCJ, sandbox);
166                // If EUC decoding fails, must be SJIS
167                if (resEUCJ.isError())
168                    return decodeLoop(ddSJIS, src, dst);
169                ByteBuffer srcSJIS = src.asReadOnlyBuffer();
170                CharBuffer sandboxSJIS = CharBuffer.allocate(cbufsiz);
171                CoderResult resSJIS = ddSJIS.decodeLoop(srcSJIS, sandboxSJIS);
172                // If SJIS decoding fails, must be EUC
173                if (resSJIS.isError())
174                    return decodeLoop(ddEUCJ, src, dst);
175
176                // From here on, we have some ambiguity, and must guess.
177
178                // We prefer input that does not appear to end mid-character.
179                if (srcEUCJ.position() > srcSJIS.position())
180                    return decodeLoop(ddEUCJ, src, dst);
181
182                if (srcEUCJ.position() < srcSJIS.position())
183                    return decodeLoop(ddSJIS, src, dst);
184
185                // end-of-input is after the first byte of the first char?
186                if (src.position() == srcEUCJ.position())
187                    return CoderResult.UNDERFLOW;
188
189                // Use heuristic knowledge of typical Japanese text
190                sandbox.flip();
191                return decodeLoop(looksLikeJapanese(sandbox) ? ddEUCJ : ddSJIS,
192                                  src, dst);
193            }
194
195            return detectedDecoder.decodeLoop(src, dst);
196        }
197
198        protected void implReset() {
199            detectedDecoder = null;
200        }
201
202        protected CoderResult implFlush(CharBuffer out) {
203            if (detectedDecoder != null)
204                return detectedDecoder.implFlush(out);
205            else
206                return super.implFlush(out);
207        }
208
209        public boolean isAutoDetecting() {
210            return true;
211        }
212
213        public boolean isCharsetDetected() {
214            return detectedDecoder != null;
215        }
216
217        public Charset detectedCharset() {
218            if (detectedDecoder == null)
219                throw new IllegalStateException("charset not yet detected");
220            return ((CharsetDecoder) detectedDecoder).charset();
221        }
222
223
224        /**
225         * Returned Shift_JIS Charset name is OS dependent
226         */
227        private static String getSJISName() {
228            if (osName.equals("Solaris") || osName.equals("SunOS"))
229                return("PCK");
230            else if (osName.startsWith("Windows"))
231                return("windows-31J");
232            else
233                return("Shift_JIS");
234        }
235
236        /**
237         * Returned EUC-JP Charset name is OS dependent
238         */
239
240        private static String getEUCJPName() {
241            if (osName.equals("Solaris") || osName.equals("SunOS"))
242                return("x-eucjp-open");
243            else
244                return("EUC_JP");
245        }
246
247    }
248}
249