1/*
2 * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 */
23
24/* @test
25 * @bug 8186801 8186751
26 * @summary Test the charset mappings
27 */
28
29import java.io.*;
30import java.nio.*;
31import java.nio.file.*;
32import java.nio.charset.*;
33import java.util.*;
34import java.util.function.*;
35import java.util.regex.*;
36import java.util.stream.*;
37
/**
 * Compares the charsets named in the {@code charsets} definition file of the
 * JDK's {@code make/data/charsetmapping} directory against the reference
 * mapping tables ({@code .map}, {@code .nr}, {@code .c2b}) found next to it.
 *
 * <p>For each charset the test checks the canonical name and aliases, the
 * {@code new String()}/{@code String.getBytes()} paths, and the
 * {@link CharsetDecoder}/{@link CharsetEncoder} paths with both heap and
 * direct buffers.
 */
public class TestCharsetMapping {

    private static final int BUFSIZ = 8192;     // Initial buffer size
    private static final int MAXERRS = 10;      // Errors reported per test

    private static final PrintStream log = System.out;

    // Set by -v on the command line
    private static boolean verbose = false;

    // Test modes
    private static final int ENCODE = 1;
    private static final int DECODE = 2;

    // Utilities

    /**
     * Returns a new heap buffer with twice the capacity of {@code bb} that
     * holds everything written to {@code bb} so far and is positioned for
     * further puts. {@code bb} itself is flipped as a side effect.
     */
    private static ByteBuffer expand(ByteBuffer bb) {
        ByteBuffer nbb = ByteBuffer.allocate(bb.capacity() * 2);
        bb.flip();
        nbb.put(bb);
        return nbb;
    }

    /**
     * Character-buffer counterpart of {@link #expand(ByteBuffer)}.
     */
    private static CharBuffer expand(CharBuffer cb) {
        CharBuffer ncb = CharBuffer.allocate(cb.capacity() * 2);
        cb.flip();
        ncb.put(cb);
        return ncb;
    }

    /**
     * Converts a string of hex digits into the bytes it spells, two digits
     * per byte.
     *
     * @param s hex digits without any prefix
     * @return the decoded bytes, {@code s.length() / 2} of them
     * @throws RuntimeException if {@code s} has an odd number of digits
     * @throws NumberFormatException if a digit pair is not valid hex
     */
    private static byte[] parseBytes(String s) {
        // An odd digit count cannot be split into whole bytes. The check
        // has to happen up front: inside the loop j + 2 never exceeds the
        // string length, so the trailing digit would be dropped silently.
        if ((s.length() & 1) != 0)
            throw new RuntimeException("Malformed byte string: " + s);
        int nb = s.length() / 2;
        byte[] bs = new byte[nb];
        for (int i = 0; i < nb; i++) {
            int j = i * 2;
            bs[i] = (byte)Integer.parseInt(s.substring(j, j + 2), 16);
        }
        return bs;
    }

    /**
     * Formats {@code bs} as lower-case hex, two digits per byte.
     */
    private static String printBytes(byte[] bs) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < bs.length; i++) {
            sb.append(Integer.toHexString((bs[i] >> 4) & 0xf));
            sb.append(Integer.toHexString((bs[i] >> 0) & 0xf));
        }
        return sb.toString();
    }

    /**
     * Formats a code point as {@code U+} followed by lower-case hex digits:
     * four for the BMP, five or six for supplementary code points.
     */
    private static String printCodePoint(int cp) {
        StringBuilder sb = new StringBuilder();
        sb.append("U+");
        // Plane 16 (U+100000..U+10FFFF) needs a sixth digit; without it
        // U+10FFFF would be printed as U+0ffff.
        if (cp > 0xfffff)
            sb.append(Integer.toHexString((cp >> 20) & 0xf));
        if (cp > 0xffff)
            sb.append(Integer.toHexString((cp >> 16) & 0xf));
        sb.append(Integer.toHexString((cp >> 12) & 0xf));
        sb.append(Integer.toHexString((cp >> 8) & 0xf));
        sb.append(Integer.toHexString((cp >> 4) & 0xf));
        sb.append(Integer.toHexString((cp >> 0) & 0xf));
        return sb.toString();
    }

    /**
     * Reads the next code point from {@code cb}, consuming two chars when
     * the first one is a high surrogate. The second char is not checked to
     * be a low surrogate.
     */
    private static int getCodePoint(CharBuffer cb) {
        char c = cb.get();
        if (Character.isHighSurrogate(c))
            return Character.toCodePoint(c, cb.get());
        else
            return c;
    }

    /**
     * Returns the plural suffix for a count: empty for 1, {@code "s"}
     * otherwise.
     */
    private static String plural(int n) {
        return (n == 1 ? "" : "s");
    }

    // TestCharsetMapping
    private CharsetInfo csinfo;
    private CharsetDecoder decoder = null;
    private CharsetEncoder encoder = null;

    // Stateful dbcs encoding has leading shift byte '0x0e'
    // and trailing shift byte '0x0f'.
    // The flag variable shiftHackDBCS is 'true' for stateful
    // EBCDIC encodings, which indicates the need of adding/
    // removing the shift bytes.
    private boolean shiftHackDBCS = false;

    /**
     * Creates a tester for one charset. Both coders are configured to
     * REPLACE on malformed and unmappable input, so a bad mapping shows up
     * as a wrong value in the output instead of as an exception.
     */
    private TestCharsetMapping(CharsetInfo csinfo) throws Exception {
        this.csinfo = csinfo;
        this.encoder = csinfo.cs.newEncoder()
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .onMalformedInput(CodingErrorAction.REPLACE);
        this.decoder = csinfo.cs.newDecoder()
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .onMalformedInput(CodingErrorAction.REPLACE);
    }

    private class Test {
        // An instance of this class tests all mappings for
        // a particular bytesPerChar value
        private int bytesPerChar;

        // Reference data from .map/nr/c2b files
        private ByteBuffer refBytes = ByteBuffer.allocate(BUFSIZ);
        private CharBuffer refChars = CharBuffer.allocate(BUFSIZ);

        // Direct-buffer copies of the reference data, filled in by run()
        private ByteBuffer dRefBytes = ByteBuffer.allocateDirect(BUFSIZ);
        private CharBuffer dRefChars = ByteBuffer.allocateDirect(BUFSIZ*2).asCharBuffer();

        private Test(int bpc) {
            bytesPerChar = bpc;
        }

        // Appends raw bytes that have no matching character; the outer
        // run() uses this to add the leading/trailing shift bytes when
        // shiftHackDBCS is set
        private void put(byte[] bs) {
            if (refBytes.remaining() < bytesPerChar)
                refBytes = expand(refBytes);
            refBytes.put(bs);
        }

        // Appends one mapping: bs must be exactly bytesPerChar bytes long,
        // cc holds the one or two chars of the mapped code point
        private void put(byte[] bs, char[] cc) {
            if (bs.length != bytesPerChar)
                throw new IllegalArgumentException(bs.length
                                                   + " != "
                                                   + bytesPerChar);
            if (refBytes.remaining() < bytesPerChar)
                refBytes = expand(refBytes);
            refBytes.put(bs);
            if (refChars.remaining() < cc.length)
                refChars = expand(refChars);
            refChars.put(cc);
        }

        // Decodes all of refBytes in one call and compares the result,
        // code point by code point, with refChars. Reporting stops after
        // MAXERRS mismatches. Both buffers are rewound before returning.
        private boolean decode(ByteBuffer refBytes, CharBuffer refChars)
            throws Exception {
            log.println("    decode" + (refBytes.isDirect()?" (direct)":""));
            CharBuffer out = decoder.decode(refBytes);

            refBytes.rewind();
            byte[] bs = new byte[bytesPerChar];
            int e = 0;

            // Step over the shift-out byte of the reference data
            if (shiftHackDBCS && bytesPerChar == 2 && refBytes.get() != (byte)0x0e) {
                log.println("Missing leading byte");
            }

            while (refChars.hasRemaining()) {
                refBytes.get(bs);
                int rcp = getCodePoint(refChars);
                int ocp = getCodePoint(out);
                if (rcp != ocp) {
                    log.println("      Error: "
                                + printBytes(bs)
                                + " --> "
                                + printCodePoint(ocp)
                                + ", expected "
                                + printCodePoint(rcp));
                    if (++e >= MAXERRS) {
                        log.println("      Too many errors, giving up");
                        break;
                    }
                }
                if (verbose) {
                    log.println("      "
                                + printBytes(bs)
                                + " --> "
                                + printCodePoint(rcp));
                }
            }

            // Step over the shift-in byte of the reference data
            if (shiftHackDBCS && bytesPerChar == 2 && refBytes.get() != (byte)0x0f) {
                log.println("Missing trailing byte");
            }

            if (e == 0 && (refChars.hasRemaining() || out.hasRemaining())) {
                // Paranoia: Didn't consume everything
                throw new IllegalStateException();
            }
            refBytes.rewind();
            refChars.rewind();
            return (e == 0);
        }

        // Encodes all of refChars in one call and compares the result, in
        // groups of bytesPerChar bytes, with refBytes. Reporting stops
        // after MAXERRS mismatches. Unlike decode(), a shift-byte mismatch
        // fails the test immediately.
        private boolean encode(ByteBuffer refBytes, CharBuffer refChars)
            throws Exception {
            log.println("    encode" + (refBytes.isDirect()?" (direct)":""));
            ByteBuffer out = encoder.encode(refChars);
            refChars.rewind();

            if (shiftHackDBCS && bytesPerChar == 2 && out.get() != refBytes.get()) {
                log.println("Missing leading byte");
                return false;
            }

            byte[] rbs = new byte[bytesPerChar];
            byte[] obs = new byte[bytesPerChar];
            int e = 0;
            while (refChars.hasRemaining()) {
                int cp = getCodePoint(refChars);
                refBytes.get(rbs);
                out.get(obs);
                boolean eq = true;
                for (int i = 0; i < bytesPerChar; i++)
                    eq &= rbs[i] == obs[i];
                if (!eq) {
                    log.println("      Error: "
                                + printCodePoint(cp)
                                + " --> "
                                + printBytes(obs)
                                + ", expected "
                                + printBytes(rbs));
                    if (++e >= MAXERRS) {
                        log.println("      Too many errors, giving up");
                        break;
                    }
                }
                if (verbose) {
                    log.println("      "
                                + printCodePoint(cp)
                                + " --> "
                                + printBytes(rbs));
                }
            }

            if (shiftHackDBCS && bytesPerChar == 2 && out.get() != refBytes.get()) {
                log.println("Missing trailing byte");
                return false;
            }

            if (e == 0 && (refBytes.hasRemaining() || out.hasRemaining())) {
                // Paranoia: Didn't consume everything
                throw new IllegalStateException();
            }

            refBytes.rewind();
            refChars.rewind();
            return (e == 0);
        }

        // Runs the decode and/or encode checks, each once with the heap
        // buffers and once with direct copies of them. mode == DECODE
        // skips encoding, mode == ENCODE skips decoding.
        private boolean run(int mode) throws Exception {
            log.println("  " + bytesPerChar
                        + " byte" + plural(bytesPerChar) + "/char");

            // The heap buffers may have been expanded while being filled
            if (dRefBytes.capacity() < refBytes.capacity()) {
                dRefBytes = ByteBuffer.allocateDirect(refBytes.capacity());
            }
            if (dRefChars.capacity() < refChars.capacity()) {
                dRefChars = ByteBuffer.allocateDirect(refChars.capacity()*2)
                                      .asCharBuffer();
            }
            refBytes.flip();
            refChars.flip();
            dRefBytes.clear();
            dRefChars.clear();

            dRefBytes.put(refBytes).flip();
            dRefChars.put(refChars).flip();
            // The copy left position == limit, so a second flip makes the
            // heap buffers readable from the start again
            refBytes.flip();
            refChars.flip();

            boolean rv = true;
            if (mode != ENCODE) {
                rv &= decode(refBytes, refChars);
                rv &= decode(dRefBytes, dRefChars);
            }
            if (mode != DECODE) {
                rv &= encode(refBytes, refChars);
                rv &= encode(dRefBytes, dRefChars);
            }
            return rv;
        }
    }

    // Maximum bytes/char being tested
    private int maxBytesPerChar = 0;

    // Tests, indexed by bytesPerChar - 1
    private Test[] tests;

    private void clearTests() {
        maxBytesPerChar = 0;
        tests = new Test[0];
    }

    // Find the test for the given bytes/char value,
    // expanding the test array if needed
    //
    private Test testFor(int bpc) {
        if (bpc > maxBytesPerChar) {
            Test[] ts = new Test[bpc];
            System.arraycopy(tests, 0, ts, 0, maxBytesPerChar);
            for (int i = maxBytesPerChar; i < bpc; i++)
                ts[i] = new Test(i + 1);
            tests = ts;
            maxBytesPerChar = bpc;
        }
        return tests[bpc - 1];
    }

    /**
     * Checks {@code new String(byte[], csName)} and
     * {@code String.getBytes(csName)} against the reference mappings.
     * Skipped when {@code shiftHackDBCS} is set.
     *
     * @return {@code true} if both conversions matched the reference data
     */
    private boolean testStringConv() throws Exception {
        if (shiftHackDBCS) {
            log.println("  string de/encoding   skipped for ebcdic");
            return true;
        }
        boolean rv = true;
        log.println("  string de/encoding");
        // for new String()
        ByteArrayOutputStream baosDec = new ByteArrayOutputStream();
        StringBuilder sbDec = new StringBuilder();
        // for String.getBytes()
        ByteArrayOutputStream baosEnc = new ByteArrayOutputStream();
        StringBuilder sbEnc = new StringBuilder();

        for (Entry e : csinfo.mappings) {
            baosDec.write(e.bs);
            sbDec.append(Character.toChars(e.cp));
            // appendCodePoint, not append(int): the latter would add the
            // decimal digits of the number instead of the character
            if (e.cp2 != 0)
                sbDec.appendCodePoint(e.cp2);

            // Skip entries that do not encode back to e.bs: non-roundtrip
            // b2c entries, and code points whose encoding comes from the
            // .c2b table. This is the same filter run() applies when it
            // builds the ENCODE reference data.
            if (csinfo.nr != null && csinfo.nr.containsKey(e.bb) ||
                csinfo.c2b != null && csinfo.c2b.containsKey(e.cp))
                continue;
            baosEnc.write(e.bs);
            sbEnc.append(Character.toChars(e.cp));
            if (e.cp2 != 0)
                sbEnc.appendCodePoint(e.cp2);
        }
        log.println("    new String()");
        if (!new String(baosDec.toByteArray(), csinfo.csName).equals(sbDec.toString())) {
            log.println("      Error: new String() failed");
            rv = false;
        }
        log.println("    String.getBytes()");
        if (!Arrays.equals(baosEnc.toByteArray(), sbEnc.toString().getBytes(csinfo.csName))) {
            log.println("      Error: String().getBytes() failed");
            rv = false;
        }
        return rv;
    }

    /**
     * Runs all checks for the charset given to the constructor: the String
     * based conversions, then the decoder and the encoder, each grouped by
     * the number of bytes per character.
     *
     * @return {@code true} if every check passed
     */
    private boolean run() throws Exception {
        boolean rv = true;
        // Constant-first comparison so that a charset definition without
        // a "type" line does not cause a NullPointerException
        shiftHackDBCS = "ebcdic".equals(csinfo.type);    // isStateful;

        // (1) new String()/String.getBytes()
        rv &= testStringConv();

        // (2) DECODE:
        clearTests();
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0e });
        }
        csinfo.mappings.forEach(e -> {
                if (e.cp2 != 0)
                    return;          // skip composite (base+cc) for now
                byte[] bs = e.bs;
                char[] cc = Character.toChars(e.cp);
                testFor(bs.length).put(bs, cc);
            });
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0f });
        }
        for (int i = 0; i < maxBytesPerChar; i++) {
            rv &= tests[i].run(DECODE);
        }

        // (3) ENCODE:
        clearTests();
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0e });
        }
        csinfo.mappings.forEach(e -> {
                if (e.cp2 != 0)
                    return;          // skip composite (base+cc) for now
                if (csinfo.nr != null && csinfo.nr.containsKey(e.bb))
                    return;          // non-roundtrip b2c
                if (csinfo.c2b != null && csinfo.c2b.containsKey(e.cp))
                    return;          // c2b only mapping
                byte[] bs = e.bs;
                char[] cc = Character.toChars(e.cp);
                testFor(bs.length).put(bs, cc);
            });
        // The code points skipped above are tested with their .c2b bytes
        if (csinfo.c2b != null)
            csinfo.c2b.values().forEach(e -> {
                    byte[] bs = e.bs;
                    char[] cc = Character.toChars(e.cp);
                    testFor(bs.length).put(bs, cc);
                });
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0f });
        }
        for (int i = 0; i < maxBytesPerChar; i++) {
            rv &= tests[i].run(ENCODE);
        }
        return rv;
    }

    private static class Entry {
        byte[] bs;   // byte sequence reps
        int cp;      // Unicode codepoint
        int cp2;     // CC of composite
        long bb;     // bs in "long" form for nr lookup;
    }

    // Entries mapped to this code point are dropped when a .map file is loaded
    private final static int  UNMAPPABLE = 0xFFFD;
    // One mapping line: hex bytes, whitespace, hex code point, optional
    // trailing "#" comment
    private static final Pattern ptn = Pattern.compile("(?:0x)?(\\p{XDigit}++)\\s++(?:U\\+|0x)?(\\p{XDigit}++)(?:\\s++#.*)?");
    private static final int G_BS  = 1;
    private static final int G_CP  = 2;
    // The pattern above has only two capturing groups, so parse() never
    // finds this group and always leaves Entry.cp2 at 0
    private static final int G_CP2 = 3;

    private static class CharsetInfo {
        Charset  cs;
        String   pkgName;
        String   clzName;
        String   csName;
        String   hisName;
        String   type;
        boolean  isInternal;
        Set<String> aliases = new HashSet<>();

        // mapping entries
        List<Entry> mappings;
        Map<Long, Entry> nr;       // bytes -> entry
        Map<Integer, Entry> c2b;   // cp -> entry

        CharsetInfo(String csName, String clzName) {
            this.csName = csName;
            this.clzName = clzName;
        }

        // Builds an Entry from a matcher that has just matched a mapping
        // line. Byte values below 0x100 become a single byte, everything
        // else is split into bytes by parseBytes().
        private Entry parse(Matcher m) {
            Entry e = new Entry();
            e.bb = Long.parseLong(m.group(G_BS), 16);
            if (e.bb < 0x100)
                e.bs = new byte[] { (byte)e.bb };
            else
                e.bs = parseBytes(m.group(G_BS));
            e.cp = Integer.parseInt(m.group(G_CP), 16);
            if (G_CP2 <= m.groupCount() && m.group(G_CP2) != null)
               e.cp2 = Integer.parseInt(m.group(G_CP2), 16);
            else
               e.cp2 = 0;
            return e;
        }

        /**
         * Loads {@code <clzName>.map} and, when present,
         * {@code <clzName>.nr} and {@code <clzName>.c2b} from {@code dir}.
         * Lines starting with {@code #} and lines that do not match the
         * mapping pattern are ignored.
         *
         * @return {@code false} if the {@code .map} file does not exist
         * @throws IOException if a file cannot be read
         * @throws IllegalStateException if a {@code .nr} or {@code .c2b}
         *         file contains a duplicate key (from {@code toMap})
         */
        boolean loadMappings(Path dir) throws IOException {
            // xxx.map
            Path path = dir.resolve(clzName + ".map");
            if (!Files.exists(path)) {
                return false;
            }
            // One matcher is shared by filter and map; this relies on the
            // streams being sequential. Files.lines keeps the file open
            // until the stream is closed, hence try-with-resources.
            Matcher m = ptn.matcher("");
            try (Stream<String> lines = Files.lines(path)) {
                mappings = lines
                    .filter(ln -> !ln.startsWith("#") && m.reset(ln).lookingAt())
                    .map(ln -> parse(m))
                    .filter(e -> e.cp != UNMAPPABLE)  // non-mapping
                    .collect(Collectors.toList());
            }
            // xxx.nr
            path = dir.resolve(clzName + ".nr");
            if (Files.exists(path)) {
                try (Stream<String> lines = Files.lines(path)) {
                    nr = lines
                        .filter(ln -> !ln.startsWith("#") && m.reset(ln).lookingAt())
                        .map(ln -> parse(m))
                        .collect(Collectors.toMap(e -> e.bb, Function.identity()));
                }
            }
            // xxx.c2b
            path = dir.resolve(clzName + ".c2b");
            if (Files.exists(path)) {
                try (Stream<String> lines = Files.lines(path)) {
                    c2b = lines
                        .filter(ln -> !ln.startsWith("#") && m.reset(ln).lookingAt())
                        .map(ln -> parse(m))
                        .collect(Collectors.toMap(e -> e.cp, Function.identity()));
                }
            }
            return true;
        }
    }

    /**
     * Verifies that a recognized property line carries a value and follows
     * a {@code charset} line.
     *
     * @throws RuntimeException if the line has fewer than three tokens or
     *         no charset definition has been started yet
     */
    private static void checkProperty(CharsetInfo cs, String[] tokens, String line) {
        if (tokens.length < 3) {
            throw new RuntimeException("Error: incorrect " + tokens[1]
                                       + " line [" + line + "]");
        }
        if (cs == null) {
            throw new RuntimeException("Error: " + tokens[1]
                                       + " line before any charset line ["
                                       + line + "]");
        }
    }

    /**
     * Parses the charset definition file. A line whose first token is
     * {@code charset} starts a new definition (name, class name); the
     * property lines {@code alias}, {@code package}, {@code type},
     * {@code hisname} and {@code internal} are applied to the definition
     * started last. Empty lines, {@code #} comments, lines with fewer than
     * two tokens and unknown properties are ignored.
     *
     * @param cslist the definition file
     * @return the definitions in file order
     * @throws IOException if the file cannot be read
     * @throws RuntimeException if a charset or recognized property line is
     *         incomplete, or a property precedes the first charset line
     */
    private static Set<CharsetInfo> charsets(Path cslist) throws IOException {
        Set<CharsetInfo> charsets = new LinkedHashSet<>();
        CharsetInfo cs = null;

        for (String line : Files.readAllLines(cslist)) {
            if (line.startsWith("#") || line.length() == 0) {
                continue;
            }
            String[] tokens = line.split("\\s+");
            if (tokens.length < 2) {
                continue;
            }
            if ("charset".equals(tokens[0])) {
                if (cs != null) {
                    charsets.add(cs);
                    cs = null;
                }
                if (tokens.length < 3) {
                    throw new RuntimeException("Error: incorrect charset line [" + line + "]");
                }
                cs = new CharsetInfo(tokens[1], tokens[2]);
            } else {
                String key = tokens[1];              // leading empty str
                switch (key) {
                    case "alias":
                        checkProperty(cs, tokens, line);
                        cs.aliases.add(tokens[2]);   // ALIAS_NAME
                        break;
                    case "package":
                        checkProperty(cs, tokens, line);
                        cs.pkgName = tokens[2];
                        break;
                    case "type":
                        checkProperty(cs, tokens, line);
                        cs.type = tokens[2];
                        break;
                    case "hisname":
                        checkProperty(cs, tokens, line);
                        cs.hisName = tokens[2];
                        break;
                    case "internal":
                        checkProperty(cs, tokens, line);
                        cs.isInternal = Boolean.parseBoolean(tokens[2]);
                        break;
                    default:  // ignore
                }
            }
        }
        if (cs != null) {
            charsets.add(cs);
        }
        return charsets;
    }

    /**
     * Test entry point. Does nothing when the mapping directory cannot be
     * found relative to the {@code test.src} property (default {@code .}).
     *
     * @param args {@code -v} as first argument turns on verbose output
     * @throws Exception if any charset failed, except for the charsets
     *         listed below as known nr/c2b issues
     */
    public static void main(String args[]) throws Exception {
        Path dir = Paths.get(System.getProperty("test.src", ".") +
                             "/../../../../make/data/charsetmapping");
        if (!Files.exists(dir)) {
            // not inside jdk repo, no mappings, exit silently
            log.println("Nothing done, not in a jdk repo: ");
            return;
        }
        if (args.length > 0 && "-v".equals(args[0])) {
            // For debugging: java TestCharsetMapping [-v]
            verbose = true;
        }

        int errors = 0;
        int tested = 0;
        int skipped = 0;
        int known = 0;

        for (CharsetInfo csinfo : charsets(dir.resolve("charsets"))) {
            String csname = csinfo.csName;

            if (csinfo.isInternal) {
                continue;
            }

            log.printf("%ntesting: %-16s", csname);

            if (!Charset.isSupported(csname)) {
                errors++;
                log.println("    [error: charset is not supported]");
                continue;
            }

            Charset cs = csinfo.cs = Charset.forName(csinfo.csName);
            // test name()
            if (!cs.name().equals(csinfo.csName)) {
                errors++;
                // print, not printf: the text is data, not a format string
                log.print("    [error: wrong csname: " + csinfo.csName
                          + " vs " + cs.name() + "]");
            }
            // test aliases()
            if (!cs.aliases().equals(csinfo.aliases)) {
                errors++;
                log.print("    [error wrong aliases]");
                if (verbose) {
                    log.println();
                    log.println("    expected: " + csinfo.aliases);
                    log.println("         got: " + cs.aliases());
                }
            }

            // Constant-first comparison: type is null when the charset
            // definition has no "type" line
            if ("source".equals(csinfo.type)) {
                log.println("    [skipped: source based]");
                skipped++;
                continue;
            }

            if (!csinfo.loadMappings(dir)) {
                log.println("    [error loading mappings failed]");
                errors++;
                continue;
            }

            tested++;
            log.println();
            if (!new TestCharsetMapping(csinfo).run()) {

                /////////////// known nr/c2b issues ////////////////
                if (csinfo.csName.equals("x-IBM948") ||
                    csinfo.csName.equals("x-IBM950") ||
                    csinfo.csName.equals("x-IBM937") ||
                    csinfo.csName.equals("x-IBM1383"))
                {
                    log.println("    [**** skipped, KNOWN nr/c2b mapping issue]");
                    known++;
                    continue;
                }

                errors++;
            }
        }

        log.println();
        log.println(tested + " charset" + plural(tested) + " tested, "
                    + skipped + " skipped, " + known + " known issue(s)");
        log.println();
        if (errors > 0)
            throw new Exception("Errors detected in "
                                + errors + " charset" + plural(errors));
    }
}
660