1/*
2 * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package sun.nio.cs;
27
28import java.nio.Buffer;
29import java.nio.ByteBuffer;
30import java.nio.CharBuffer;
31import java.nio.charset.Charset;
32import java.nio.charset.CharsetDecoder;
33import java.nio.charset.CharsetEncoder;
34import java.nio.charset.CoderResult;
35import java.nio.charset.CodingErrorAction;
36
37/* Legal CESU-8 Byte Sequences
38 *
39 * #    Code Points      Bits   Bit/Byte pattern
40 * 1                     7      0xxxxxxx
41 *      U+0000..U+007F          00..7F
42 *
43 * 2                     11     110xxxxx    10xxxxxx
44 *      U+0080..U+07FF          C2..DF      80..BF
45 *
46 * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
47 *      U+0800..U+0FFF          E0          A0..BF      80..BF
48 *      U+1000..U+FFFF          E1..EF      80..BF      80..BF
49 *
50 */
51
52class CESU_8 extends Unicode
53{
54    public CESU_8() {
55        super("CESU-8", StandardCharsets.aliases_CESU_8);
56    }
57
58    public String historicalName() {
59        return "CESU8";
60    }
61
62    public CharsetDecoder newDecoder() {
63        return new Decoder(this);
64    }
65
66    public CharsetEncoder newEncoder() {
67        return new Encoder(this);
68    }
69
70    private static final void updatePositions(Buffer src, int sp,
71                                              Buffer dst, int dp) {
72        src.position(sp - src.arrayOffset());
73        dst.position(dp - dst.arrayOffset());
74    }
75
76    private static class Decoder extends CharsetDecoder
77                                 implements ArrayDecoder {
78        private Decoder(Charset cs) {
79            super(cs, 1.0f, 1.0f);
80        }
81
82        private static boolean isNotContinuation(int b) {
83            return (b & 0xc0) != 0x80;
84        }
85
86        //  [E0]     [A0..BF] [80..BF]
87        //  [E1..EF] [80..BF] [80..BF]
88        private static boolean isMalformed3(int b1, int b2, int b3) {
89            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
90                   (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
91        }
92
93        // only used when there is only one byte left in src buffer
94        private static boolean isMalformed3_2(int b1, int b2) {
95            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
96                   (b2 & 0xc0) != 0x80;
97        }
98
99
100        //  [F0]     [90..BF] [80..BF] [80..BF]
101        //  [F1..F3] [80..BF] [80..BF] [80..BF]
102        //  [F4]     [80..8F] [80..BF] [80..BF]
103        //  only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...]
104        //  will be checked by Character.isSupplementaryCodePoint(uc)
105        private static boolean isMalformed4(int b2, int b3, int b4) {
106            return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
107                   (b4 & 0xc0) != 0x80;
108        }
109
110        // only used when there is less than 4 bytes left in src buffer
111        private static boolean isMalformed4_2(int b1, int b2) {
112            return (b1 == 0xf0 && b2 == 0x90) ||
113                   (b2 & 0xc0) != 0x80;
114        }
115
116        private static boolean isMalformed4_3(int b3) {
117            return (b3 & 0xc0) != 0x80;
118        }
119
120        private static CoderResult malformedN(ByteBuffer src, int nb) {
121            switch (nb) {
122            case 1:
123            case 2:                    // always 1
124                return CoderResult.malformedForLength(1);
125            case 3:
126                int b1 = src.get();
127                int b2 = src.get();    // no need to lookup b3
128                return CoderResult.malformedForLength(
129                    ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
130                     isNotContinuation(b2)) ? 1 : 2);
131            case 4:  // we don't care the speed here
132                b1 = src.get() & 0xff;
133                b2 = src.get() & 0xff;
134                if (b1 > 0xf4 ||
135                    (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
136                    (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
137                    isNotContinuation(b2))
138                    return CoderResult.malformedForLength(1);
139                if (isNotContinuation(src.get()))
140                    return CoderResult.malformedForLength(2);
141                return CoderResult.malformedForLength(3);
142            default:
143                assert false;
144                return null;
145            }
146        }
147
148        private static CoderResult malformed(ByteBuffer src, int sp,
149                                             CharBuffer dst, int dp,
150                                             int nb)
151        {
152            src.position(sp - src.arrayOffset());
153            CoderResult cr = malformedN(src, nb);
154            updatePositions(src, sp, dst, dp);
155            return cr;
156        }
157
158
159        private static CoderResult malformed(ByteBuffer src,
160                                             int mark, int nb)
161        {
162            src.position(mark);
163            CoderResult cr = malformedN(src, nb);
164            src.position(mark);
165            return cr;
166        }
167
168        private static CoderResult malformedForLength(ByteBuffer src,
169                                                      int sp,
170                                                      CharBuffer dst,
171                                                      int dp,
172                                                      int malformedNB)
173        {
174            updatePositions(src, sp, dst, dp);
175            return CoderResult.malformedForLength(malformedNB);
176        }
177
178        private static CoderResult malformedForLength(ByteBuffer src,
179                                                      int mark,
180                                                      int malformedNB)
181        {
182            src.position(mark);
183            return CoderResult.malformedForLength(malformedNB);
184        }
185
186
187        private static CoderResult xflow(Buffer src, int sp, int sl,
188                                         Buffer dst, int dp, int nb) {
189            updatePositions(src, sp, dst, dp);
190            return (nb == 0 || sl - sp < nb)
191                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
192        }
193
194        private static CoderResult xflow(Buffer src, int mark, int nb) {
195            src.position(mark);
196            return (nb == 0 || src.remaining() < nb)
197                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
198        }
199
200        private CoderResult decodeArrayLoop(ByteBuffer src,
201                                            CharBuffer dst)
202        {
203            // This method is optimized for ASCII input.
204            byte[] sa = src.array();
205            int sp = src.arrayOffset() + src.position();
206            int sl = src.arrayOffset() + src.limit();
207
208            char[] da = dst.array();
209            int dp = dst.arrayOffset() + dst.position();
210            int dl = dst.arrayOffset() + dst.limit();
211            int dlASCII = dp + Math.min(sl - sp, dl - dp);
212
213            // ASCII only loop
214            while (dp < dlASCII && sa[sp] >= 0)
215                da[dp++] = (char) sa[sp++];
216            while (sp < sl) {
217                int b1 = sa[sp];
218                if (b1 >= 0) {
219                    // 1 byte, 7 bits: 0xxxxxxx
220                    if (dp >= dl)
221                        return xflow(src, sp, sl, dst, dp, 1);
222                    da[dp++] = (char) b1;
223                    sp++;
224                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
225                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
226                    if (sl - sp < 2 || dp >= dl)
227                        return xflow(src, sp, sl, dst, dp, 2);
228                    int b2 = sa[sp + 1];
229                    if (isNotContinuation(b2))
230                        return malformedForLength(src, sp, dst, dp, 1);
231                    da[dp++] = (char) (((b1 << 6) ^ b2)
232                                       ^
233                                       (((byte) 0xC0 << 6) ^
234                                        ((byte) 0x80 << 0)));
235                    sp += 2;
236                } else if ((b1 >> 4) == -2) {
237                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
238                    int srcRemaining = sl - sp;
239                    if (srcRemaining < 3 || dp >= dl) {
240                        if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1]))
241                            return malformedForLength(src, sp, dst, dp, 1);
242                        return xflow(src, sp, sl, dst, dp, 3);
243                    }
244                    int b2 = sa[sp + 1];
245                    int b3 = sa[sp + 2];
246                    if (isMalformed3(b1, b2, b3))
247                        return malformed(src, sp, dst, dp, 3);
248                    da[dp++] = (char)
249                        ((b1 << 12) ^
250                         (b2 <<  6) ^
251                         (b3 ^
252                          (((byte) 0xE0 << 12) ^
253                           ((byte) 0x80 <<  6) ^
254                           ((byte) 0x80 <<  0))));
255                    sp += 3;
256                } else {
257                    return malformed(src, sp, dst, dp, 1);
258                }
259            }
260            return xflow(src, sp, sl, dst, dp, 0);
261        }
262
263        private CoderResult decodeBufferLoop(ByteBuffer src,
264                                             CharBuffer dst)
265        {
266            int mark = src.position();
267            int limit = src.limit();
268            while (mark < limit) {
269                int b1 = src.get();
270                if (b1 >= 0) {
271                    // 1 byte, 7 bits: 0xxxxxxx
272                    if (dst.remaining() < 1)
273                        return xflow(src, mark, 1); // overflow
274                    dst.put((char) b1);
275                    mark++;
276                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
277                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
278                    if (limit - mark < 2|| dst.remaining() < 1)
279                        return xflow(src, mark, 2);
280                    int b2 = src.get();
281                    if (isNotContinuation(b2))
282                        return malformedForLength(src, mark, 1);
283                    dst.put((char) (((b1 << 6) ^ b2)
284                                    ^
285                                    (((byte) 0xC0 << 6) ^
286                                     ((byte) 0x80 << 0))));
287                    mark += 2;
288                } else if ((b1 >> 4) == -2) {
289                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
290                    int srcRemaining = limit - mark;
291                    if (srcRemaining < 3 || dst.remaining() < 1) {
292                        if (srcRemaining > 1 && isMalformed3_2(b1, src.get()))
293                            return malformedForLength(src, mark, 1);
294                        return xflow(src, mark, 3);
295                    }
296                    int b2 = src.get();
297                    int b3 = src.get();
298                    if (isMalformed3(b1, b2, b3))
299                        return malformed(src, mark, 3);
300                    dst.put((char)
301                            ((b1 << 12) ^
302                             (b2 <<  6) ^
303                             (b3 ^
304                              (((byte) 0xE0 << 12) ^
305                               ((byte) 0x80 <<  6) ^
306                               ((byte) 0x80 <<  0)))));
307                    mark += 3;
308                } else {
309                    return malformed(src, mark, 1);
310                }
311            }
312            return xflow(src, mark, 0);
313        }
314
315        protected CoderResult decodeLoop(ByteBuffer src,
316                                         CharBuffer dst)
317        {
318            if (src.hasArray() && dst.hasArray())
319                return decodeArrayLoop(src, dst);
320            else
321                return decodeBufferLoop(src, dst);
322        }
323
324        private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp)
325        {
326            if (bb == null)
327                bb = ByteBuffer.wrap(ba);
328            bb.position(sp);
329            return bb;
330        }
331
332        // returns -1 if there is/are malformed byte(s) and the
333        // "action" for malformed input is not REPLACE.
334        public int decode(byte[] sa, int sp, int len, char[] da) {
335            final int sl = sp + len;
336            int dp = 0;
337            int dlASCII = Math.min(len, da.length);
338            ByteBuffer bb = null;  // only necessary if malformed
339
340            // ASCII only optimized loop
341            while (dp < dlASCII && sa[sp] >= 0)
342                da[dp++] = (char) sa[sp++];
343
344            while (sp < sl) {
345                int b1 = sa[sp++];
346                if (b1 >= 0) {
347                    // 1 byte, 7 bits: 0xxxxxxx
348                    da[dp++] = (char) b1;
349                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
350                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
351                    if (sp < sl) {
352                        int b2 = sa[sp++];
353                        if (isNotContinuation(b2)) {
354                            if (malformedInputAction() != CodingErrorAction.REPLACE)
355                                return -1;
356                            da[dp++] = replacement().charAt(0);
357                            sp--;            // malformedN(bb, 2) always returns 1
358                        } else {
359                            da[dp++] = (char) (((b1 << 6) ^ b2)^
360                                           (((byte) 0xC0 << 6) ^
361                                            ((byte) 0x80 << 0)));
362                        }
363                        continue;
364                    }
365                    if (malformedInputAction() != CodingErrorAction.REPLACE)
366                        return -1;
367                    da[dp++] = replacement().charAt(0);
368                    return dp;
369                } else if ((b1 >> 4) == -2) {
370                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
371                    if (sp + 1 < sl) {
372                        int b2 = sa[sp++];
373                        int b3 = sa[sp++];
374                        if (isMalformed3(b1, b2, b3)) {
375                            if (malformedInputAction() != CodingErrorAction.REPLACE)
376                                return -1;
377                            da[dp++] = replacement().charAt(0);
378                            sp -=3;
379                            bb = getByteBuffer(bb, sa, sp);
380                            sp += malformedN(bb, 3).length();
381                        } else {
382                            da[dp++] = (char)((b1 << 12) ^
383                                              (b2 <<  6) ^
384                                              (b3 ^
385                                              (((byte) 0xE0 << 12) ^
386                                              ((byte) 0x80 <<  6) ^
387                                              ((byte) 0x80 <<  0))));
388                        }
389                        continue;
390                    }
391                    if (malformedInputAction() != CodingErrorAction.REPLACE)
392                        return -1;
393                    if (sp  < sl && isMalformed3_2(b1, sa[sp])) {
394                        da[dp++] = replacement().charAt(0);
395                        continue;
396
397                    }
398                    da[dp++] = replacement().charAt(0);
399                    return dp;
400                } else {
401                    if (malformedInputAction() != CodingErrorAction.REPLACE)
402                        return -1;
403                    da[dp++] = replacement().charAt(0);
404                }
405            }
406            return dp;
407        }
408    }
409
410    private static class Encoder extends CharsetEncoder
411                                 implements ArrayEncoder {
412
413        private Encoder(Charset cs) {
414            super(cs, 1.1f, 3.0f);
415        }
416
417        public boolean canEncode(char c) {
418            return !Character.isSurrogate(c);
419        }
420
421        public boolean isLegalReplacement(byte[] repl) {
422            return ((repl.length == 1 && repl[0] >= 0) ||
423                    super.isLegalReplacement(repl));
424        }
425
426        private static CoderResult overflow(CharBuffer src, int sp,
427                                            ByteBuffer dst, int dp) {
428            updatePositions(src, sp, dst, dp);
429            return CoderResult.OVERFLOW;
430        }
431
432        private static CoderResult overflow(CharBuffer src, int mark) {
433            src.position(mark);
434            return CoderResult.OVERFLOW;
435        }
436
437        private static void to3Bytes(byte[] da, int dp, char c) {
438            da[dp] = (byte)(0xe0 | ((c >> 12)));
439            da[dp + 1] = (byte)(0x80 | ((c >>  6) & 0x3f));
440            da[dp + 2] = (byte)(0x80 | (c & 0x3f));
441        }
442
443        private static void to3Bytes(ByteBuffer dst, char c) {
444            dst.put((byte)(0xe0 | ((c >> 12))));
445            dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
446            dst.put((byte)(0x80 | (c & 0x3f)));
447        }
448
449        private Surrogate.Parser sgp;
450        private char[] c2;
451        private CoderResult encodeArrayLoop(CharBuffer src,
452                                            ByteBuffer dst)
453        {
454            char[] sa = src.array();
455            int sp = src.arrayOffset() + src.position();
456            int sl = src.arrayOffset() + src.limit();
457
458            byte[] da = dst.array();
459            int dp = dst.arrayOffset() + dst.position();
460            int dl = dst.arrayOffset() + dst.limit();
461            int dlASCII = dp + Math.min(sl - sp, dl - dp);
462
463            // ASCII only loop
464            while (dp < dlASCII && sa[sp] < '\u0080')
465                da[dp++] = (byte) sa[sp++];
466            while (sp < sl) {
467                char c = sa[sp];
468                if (c < 0x80) {
469                    // Have at most seven bits
470                    if (dp >= dl)
471                        return overflow(src, sp, dst, dp);
472                    da[dp++] = (byte)c;
473                } else if (c < 0x800) {
474                    // 2 bytes, 11 bits
475                    if (dl - dp < 2)
476                        return overflow(src, sp, dst, dp);
477                    da[dp++] = (byte)(0xc0 | (c >> 6));
478                    da[dp++] = (byte)(0x80 | (c & 0x3f));
479                } else if (Character.isSurrogate(c)) {
480                    // Have a surrogate pair
481                    if (sgp == null)
482                        sgp = new Surrogate.Parser();
483                    int uc = sgp.parse(c, sa, sp, sl);
484                    if (uc < 0) {
485                        updatePositions(src, sp, dst, dp);
486                        return sgp.error();
487                    }
488                    if (dl - dp < 6)
489                        return overflow(src, sp, dst, dp);
490                    to3Bytes(da, dp, Character.highSurrogate(uc));
491                    dp += 3;
492                    to3Bytes(da, dp, Character.lowSurrogate(uc));
493                    dp += 3;
494                    sp++;  // 2 chars
495                } else {
496                    // 3 bytes, 16 bits
497                    if (dl - dp < 3)
498                        return overflow(src, sp, dst, dp);
499                    to3Bytes(da, dp, c);
500                    dp += 3;
501                }
502                sp++;
503            }
504            updatePositions(src, sp, dst, dp);
505            return CoderResult.UNDERFLOW;
506        }
507
508        private CoderResult encodeBufferLoop(CharBuffer src,
509                                             ByteBuffer dst)
510        {
511            int mark = src.position();
512            while (src.hasRemaining()) {
513                char c = src.get();
514                if (c < 0x80) {
515                    // Have at most seven bits
516                    if (!dst.hasRemaining())
517                        return overflow(src, mark);
518                    dst.put((byte)c);
519                } else if (c < 0x800) {
520                    // 2 bytes, 11 bits
521                    if (dst.remaining() < 2)
522                        return overflow(src, mark);
523                    dst.put((byte)(0xc0 | (c >> 6)));
524                    dst.put((byte)(0x80 | (c & 0x3f)));
525                } else if (Character.isSurrogate(c)) {
526                    // Have a surrogate pair
527                    if (sgp == null)
528                        sgp = new Surrogate.Parser();
529                    int uc = sgp.parse(c, src);
530                    if (uc < 0) {
531                        src.position(mark);
532                        return sgp.error();
533                    }
534                    if (dst.remaining() < 6)
535                        return overflow(src, mark);
536                    to3Bytes(dst, Character.highSurrogate(uc));
537                    to3Bytes(dst, Character.lowSurrogate(uc));
538                    mark++;  // 2 chars
539                } else {
540                    // 3 bytes, 16 bits
541                    if (dst.remaining() < 3)
542                        return overflow(src, mark);
543                    to3Bytes(dst, c);
544                }
545                mark++;
546            }
547            src.position(mark);
548            return CoderResult.UNDERFLOW;
549        }
550
551        protected final CoderResult encodeLoop(CharBuffer src,
552                                               ByteBuffer dst)
553        {
554            if (src.hasArray() && dst.hasArray())
555                return encodeArrayLoop(src, dst);
556            else
557                return encodeBufferLoop(src, dst);
558        }
559
560        // returns -1 if there is malformed char(s) and the
561        // "action" for malformed input is not REPLACE.
562        public int encode(char[] sa, int sp, int len, byte[] da) {
563            int sl = sp + len;
564            int dp = 0;
565            int dlASCII = dp + Math.min(len, da.length);
566
567            // ASCII only optimized loop
568            while (dp < dlASCII && sa[sp] < '\u0080')
569                da[dp++] = (byte) sa[sp++];
570
571            while (sp < sl) {
572                char c = sa[sp++];
573                if (c < 0x80) {
574                    // Have at most seven bits
575                    da[dp++] = (byte)c;
576                } else if (c < 0x800) {
577                    // 2 bytes, 11 bits
578                    da[dp++] = (byte)(0xc0 | (c >> 6));
579                    da[dp++] = (byte)(0x80 | (c & 0x3f));
580                } else if (Character.isSurrogate(c)) {
581                    if (sgp == null)
582                        sgp = new Surrogate.Parser();
583                    int uc = sgp.parse(c, sa, sp - 1, sl);
584                    if (uc < 0) {
585                        if (malformedInputAction() != CodingErrorAction.REPLACE)
586                            return -1;
587                        da[dp++] = replacement()[0];
588                    } else {
589                        to3Bytes(da, dp, Character.highSurrogate(uc));
590                        dp += 3;
591                        to3Bytes(da, dp, Character.lowSurrogate(uc));
592                        dp += 3;
593                        sp++;  // 2 chars
594                    }
595                } else {
596                    // 3 bytes, 16 bits
597                    to3Bytes(da, dp, c);
598                    dp += 3;
599                }
600            }
601            return dp;
602        }
603    }
604}
605