1/*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/**
27 * This is a tool to generate categoryNames and categoryMap which are used in
28 * CharSet.java.
29 */
30
31package build.tools.generatebreakiteratordata;
32
33import java.io.BufferedReader;
34import java.io.BufferedWriter;
35import java.io.FileReader;
36import java.io.FileWriter;
37import java.util.StringTokenizer;
38
/**
 * Reads a UnicodeData-style spec file and builds, for every general category
 * listed in {@link #categoryNames}, the list of code point ranges that belong
 * to that category.
 *
 * <p>Each per-category list is a flat sequence of {@code first, last} pairs
 * (both inclusive); a single character is stored as a pair whose two values
 * are equal. The generated test program consumes the data in that form.
 *
 * <p>All state is held in static fields and the class is not thread-safe.
 */
class CharacterCategory {

    /**
     * A list of Unicode category names.
     */
    static final String[] categoryNames = {
        "Ll",        /* Letter, Lowercase */
        "Lu",        /* Letter, Uppercase */
        "Lt",        /* Letter, Titlecase */
        "Lo",        /* Letter, Other */
        "Lm",        /* Letter, Modifier */
        "Nd",        /* Number, Decimal Digit */
        "Nl",        /* Number, Letter */
        "No",        /* Number, Other */
        "Ps",        /* Punctuation, Open */
        "Pe",        /* Punctuation, Close */
        "Pi",        /* Punctuation, Initial quote */
        "Pf",        /* Punctuation, Final quote */
        "Pd",        /* Punctuation, Dash */
        "Pc",        /* Punctuation, Connector */
        "Po",        /* Punctuation, Other */
        "Sc",        /* Symbol, Currency */
        "Sm",        /* Symbol, Math */
        "So",        /* Symbol, Other */
        "Mn",        /* Mark, Non-Spacing */
        "Mc",        /* Mark, Spacing Combining */
        "Me",        /* Mark, Enclosing */
        "Zl",        /* Separator, Line */
        "Zp",        /* Separator, Paragraph */
        "Zs",        /* Separator, Space */
        "Cc",        /* Other, Control */
        "Cf",        /* Other, Format */
        "--",        /* Dummy, ignored */
        // Don't add anything after the Dummy entry!!
    };

    /**
     * An array of Unicode code point ranges for each category (the dummy
     * entry is excluded). Populated by {@link #makeCategoryMap(String)}.
     */
    private static int[][] categoryMap;


    /**
     * Generates CategoryMap for GenerateBreakIteratorData.
     *
     * <p>May be called more than once; every call rebuilds the data from
     * scratch. If the spec file cannot be read or parsed, the error is
     * reported on stderr and the VM exits with status 1.
     *
     * @param filename path of the Unicode data file to read
     */
    static void makeCategoryMap(String filename) {
        /* Overwrite specfile name */
        specfile = filename;

        /* Generate data in current format (1.5.0) */
        generateNewData();

        /* Copy generated data to categoryMap, trimmed to the used length */
        int categories = categoryNames.length - 1;
        categoryMap = new int[categories][];
        for (int i = 0; i < categories; i++) {
            int len = newListCount[BMP][i] + newListCount[nonBMP][i];
            categoryMap[i] = new int[len];
            System.arraycopy(newList[i], 0, categoryMap[i], 0, len);
        }
    }

    /**
     * Returns categoryMap for the given category.
     *
     * @param category index into {@link #categoryNames} (dummy entry excluded)
     * @return the internal array of {@code first, last} code point pairs for
     *         that category; {@link #makeCategoryMap(String)} must have been
     *         called beforehand
     */
    static int[] getCategoryMap(int category) {
        return categoryMap[category];
    }


    /**
     * Only used for debugging and generating a test program.
     */
    public static void main(String[] args) {
        /* Parses command-line options */
        processArgs(args);

        /* Generates data in current format (1.5.0) */
        generateNewData();

        /*
         * Generates data in older format (1.4.X and earlier) and creates
         * the old CategoryMap if an old data filename was given.
         */
        if (!oldDatafile.equals("")) {
            generateOldData();
            generateOldDatafile();
        }

        /* Displays summary of generated data */
        showSummary();

        /*
         * Generates a test program which compares the new data with the
         * return values of Character.getType().
         */
        generateTestProgram();
    }


    /**
     * Spec (Unicode data file)
     */
    private static String specfile = "UnicodeData.txt";

    /**
     * Output directory for the generated test program; empty means the
     * current working directory.
     */
    private static String outputDir = "";

    /**
     * Old data filename; empty means old-format data is not generated.
     */
    private static String oldDatafile = "";

    /**
     * Command-line usage message.
     */
    private static final String USAGE =
        "Usage: java CharacterCategory [-spec specfile] [-old oldDatafile] [-o outputDir]";

    /**
     * Parses the specified arguments and sets up the variables. An unknown
     * option, or an option whose value is missing, prints the usage message
     * and exits with status 1.
     */
    private static void processArgs(String[] args) {
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            /* Every option takes a value, so one more argument must follow. */
            boolean hasValue = i + 1 < args.length;
            if (hasValue && arg.equals("-spec")) {
                specfile = args[++i];
            } else if (hasValue && arg.equals("-old")) {
                oldDatafile = args[++i];
            } else if (hasValue && arg.equals("-o")) {
                outputDir = args[++i];
            } else {
                System.err.println(USAGE);
                System.exit(1);
            }
        }
    }

    /**
     * Resolves a generated file name against the output directory.
     * With no output directory the name is returned unchanged.
     */
    private static String outputPath(String name) {
        if (outputDir.isEmpty()) {
            return name;
        }
        return new java.io.File(outputDir, name).getPath();
    }


    /**
     * Displays summary of generated data. The consistency checks between the
     * old and the new data are performed only when old data was requested;
     * otherwise the old counters are all zero and would always "mismatch".
     */
    private static void showSummary() {
        boolean compareWithOld = !oldDatafile.equals("");
        int oldSum = 0;
        int newSum = 0;
        int oldSuppSum = 0;
        int newSuppSum = 0;

        for (int i = 0; i < categoryNames.length - 1; i++) {
            int newNum = newListCount[BMP][i] + newListCount[nonBMP][i];

            if (compareWithOld) {
                if (oldTotalCount[i] != newNum) {
                    System.err.println("Error: The number of generated data is different between the new approach and the old approach.");
                }
                if (oldListCount[SURROGATE][i] != newListCount[nonBMP][i]) {
                    System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach.");
                }
            }

            System.out.println("    " + categoryNames[i] + ": " +
                               oldTotalCount[i] +
                               "(" + oldListCount[BEFORE][i] +
                               " + " + oldListCount[SURROGATE][i] +
                               " + " + oldListCount[AFTER][i] + ")" +
                               " --- " + newNum +
                               "(" + newListCount[BMP][i] +
                               " + " + newListCount[nonBMP][i] + ")");

            /* Old format: 2 bytes per BMP char, 4 per surrogate pair. */
            oldSum += oldListCount[BEFORE][i] * 2 +
                      oldListCount[SURROGATE][i] * 4 +
                      oldListCount[AFTER][i] * 2;
            /* New format: every entry is a 4-byte int. */
            newSum += newNum * 4;
            oldSuppSum += oldListCount[SURROGATE][i] * 4;
            newSuppSum += newListCount[nonBMP][i] * 4;
        }

        System.out.println("\nTotal buffer sizes are:\n    " +
                           oldSum + "bytes(Including " + oldSuppSum +
                           "bytes for supplementary characters)\n    " +
                           newSum + "bytes(Including " + newSuppSum +
                           "bytes for supplementary characters)");

        if (compareWithOld && !ignoredOld.toString().equals(ignoredNew.toString())) {
            System.err.println("Ignored categories: Error: List mismatch: " +
                                ignoredOld + " vs. " + ignoredNew);
        } else {
            /* When compared, both lists are equal here, so print the new one. */
            System.out.println("\nIgnored categories: " + ignoredNew);
            System.out.println("Please confirm that they aren't used in BreakIteratorRules.");
        }
    }


    private static final int HighSurrogate_CodeUnit_Start = 0xD800;
    private static final int LowSurrogate_CodeUnit_Start  = 0xDC00;
    private static final int Supplementary_CodePoint_Start    = 0x10000;


    /* Old-format state. The first index of the [3][] arrays is BEFORE/SURROGATE/AFTER. */
    private static StringBuilder ignoredOld = new StringBuilder();
    private static int[] oldTotalCount = new int[categoryNames.length];
    private static int[][] oldListCount = new int[3][categoryNames.length];
    private static int[][] oldListLen = new int[3][categoryNames.length];
    private static StringBuilder[][] oldList = new StringBuilder[3][categoryNames.length];

    private static final int BEFORE = 0;     /* 0x0000-0xD7FF */
    private static final int SURROGATE = 1;  /* 0x10000 and above */
    private static final int AFTER = 2;      /* 0xD800-0xFFFF */

    /**
     * Returns the index of the given category in {@link #categoryNames}, or
     * {@code categoryNames.length} if the category is not listed.
     */
    private static int categoryIndex(String category) {
        for (int index = 0; index < categoryNames.length; index++) {
            if (category.equals(categoryNames[index])) {
                return index;
            }
        }
        return categoryNames.length;
    }

    /**
     * Makes CategoryMap in older format which had been used by JDK 1.4.X and
     * earlier versions. All old-format state is reset first so the method can
     * be run repeatedly.
     */
    private static void generateOldData() {
        /* Initialize arrays. */
        for (int i = 0; i < categoryNames.length; i++) {
            oldTotalCount[i] = 0;
            for (int j = BEFORE; j <= AFTER; j++) {
                oldListCount[j][i] = 0;
                oldList[j][i] = new StringBuilder();
                oldListLen[j][i] = 17;
            }
        }
        ignoredOld.setLength(0);

        storeOldData();

        /* The dummy category must hold exactly the one initial sentinel. */
        if (oldTotalCount[categoryNames.length-1] != 1) {
            System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
            System.exit(1);
        }
    }

    /**
     * Reads the spec file and records the boundaries of every run of
     * characters sharing a category as old-format string data.
     */
    private static void storeOldData() {
        try (BufferedReader bin = new BufferedReader(new FileReader(specfile))) {
            String prevCode = "????";
            String line;
            /* Start in the dummy category so the first real entry opens a run. */
            int prevIndex = categoryNames.length - 1;
            int prevCodeValue = -1;
            boolean setFirst = false;

            while ((line = bin.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }

                StringTokenizer st = new StringTokenizer(line, ";");
                String code = st.nextToken();

                char c = code.charAt(0);
                if (c == '#' || c == '/') {
                    continue;
                }

                int curCodeValue = Integer.parseInt(code, 16);
                String characterName = st.nextToken();
                String category = st.nextToken();
                int index = categoryIndex(category);

                if (index != categoryNames.length) {
                    if (prevIndex != index) {
                        /* Category changed: close the previous run, open a new one. */
                        appendOldChar(prevIndex, prevCodeValue, prevCode);
                        appendOldChar(index, curCodeValue, code);
                        prevIndex = index;
                    } else if (prevCodeValue != curCodeValue - 1) {
                        if (setFirst && characterName.endsWith(" Last>")) {
                            /* "<..., First>"/"<..., Last>" lines bracket one run. */
                            setFirst = false;
                        } else {
                            /* Gap inside the same category: split the run. */
                            appendOldChar(prevIndex, prevCodeValue, prevCode);
                            appendOldChar(index, curCodeValue, code);
                        }
                    }
                    prevCodeValue = curCodeValue;
                    prevCode = code;
                    if (characterName.endsWith(" First>")) {
                        setFirst = true;
                    }
                } else {
                    if (ignoredOld.indexOf(category) == -1) {
                        ignoredOld.append(category);
                        ignoredOld.append(' ');
                    }
                }
            }
            /* Close the last open run. */
            appendOldChar(prevIndex, prevCodeValue, prevCode);
        }
        catch (Exception e) {
            /* Also covers malformed lines (bad hex value, missing fields). */
            System.err.println("Error occurred on accessing " + specfile);
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Appends one character to the old-format string of the given category,
     * written as Java source text (escaped where necessary).
     *
     * @param index category index
     * @param code  code point value
     * @param s     the code as written in the spec file, used for the
     *              {@code \}{@code uXXXX} escape of BMP characters
     */
    private static void appendOldChar(int index, int code, String s) {
        int range;
        if (code < HighSurrogate_CodeUnit_Start) {
            range = BEFORE;
        } else if (code < Supplementary_CodePoint_Start) {
            range = AFTER;
        } else {
            range = SURROGATE;
        }

        StringBuilder out = oldList[range][index];

        /* Wrap the generated string literal once the source line gets long. */
        if (oldListLen[range][index] > 64) {
            out.append("\"\n                + \"");
            oldListLen[range][index] = 19;
        }

        if (code == 0x22 || code == 0x5c) {
            /* Double quote and backslash must be escaped in a literal. */
            out.append('\\');
            out.append((char)code);
            oldListLen[range][index] += 2;
        } else if (code > 0x20 && code < 0x7F) {
            out.append((char)code);
            oldListLen[range][index]++;
        } else {
            if (range == SURROGATE) {// Need to convert code point to code unit
                out.append(toCodeUnit(code));
                oldListLen[range][index] += 12;
            } else {
                out.append("\\u");
                out.append(s);
                oldListLen[range][index] += 6;
            }
        }
        oldListCount[range][index]++;
        oldTotalCount[index]++;
    }

    /**
     * Converts a supplementary code point into the source text of its
     * surrogate pair, i.e. two upper-case hexadecimal unicode escapes.
     */
    private static String toCodeUnit(int i) {
        StringBuilder sb = new StringBuilder();
        sb.append("\\u");
        sb.append(Integer.toString((i - Supplementary_CodePoint_Start) / 0x400 + HighSurrogate_CodeUnit_Start, 16).toUpperCase());
        sb.append("\\u");
        /* 0x10000 is a multiple of 0x400, so no offset subtraction is needed. */
        sb.append(Integer.toString(i % 0x400 + LowSurrogate_CodeUnit_Start, 16).toUpperCase());
        return sb.toString();
    }

    /**
     * Returns the code point represented by a string holding either a single
     * char or exactly one surrogate pair; returns -1 if the string starts
     * with a high surrogate but is not a well-formed pair.
     */
    private static int toCodePoint(String s) {
        char c1 = s.charAt(0);

        if (s.length() == 1 || !Character.isHighSurrogate(c1)) {
            return (int)c1;
        } else {
            char c2 = s.charAt(1);
            if (s.length() != 2 || !Character.isLowSurrogate(c2)) {
                return -1;
            }
            return Character.toCodePoint(c1, c2);
        }
    }


    /* New-format state. The first index of newListCount is BMP/nonBMP. */
    private static StringBuilder ignoredNew = new StringBuilder();
    private static int[] newTotalCount = new int[categoryNames.length];
    private static int[][] newListCount = new int[2][categoryNames.length];
    private static int[][] newList = new int[categoryNames.length][];

    private static final int BMP = 0;
    private static final int nonBMP = 1;

    /**
     * Makes CategoryMap in newer format which is used by JDK 1.5.0. All
     * new-format state (buffers, counters and the ignored-category list) is
     * reset first, so the method can be run repeatedly.
     */
    private static void generateNewData() {
        /* Initialize arrays. */
        for (int i = 0; i < categoryNames.length; i++) {
            newList[i] = new int[10];
            newTotalCount[i] = 0;
            newListCount[BMP][i] = 0;
            newListCount[nonBMP][i] = 0;
        }
        ignoredNew.setLength(0);

        storeNewData();

        /* The dummy category must hold exactly the one initial sentinel. */
        if (newListCount[BMP][categoryNames.length-1] != 1) {
            System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
            System.exit(1);
        }
    }

    /**
     * Reads the spec file and records the first and last code point of every
     * run of characters sharing a category. Inconsistent "First>"/"Last>"
     * markers are reported on stderr but do not stop processing.
     */
    private static void storeNewData() {
        try (BufferedReader bin = new BufferedReader(new FileReader(specfile))) {
            String line;
            /* Start in the dummy category so the first real entry opens a run. */
            int prevIndex = categoryNames.length - 1;
            int prevCodeValue = -1;
            boolean setFirst = false;

            while ((line = bin.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }

                StringTokenizer st = new StringTokenizer(line, ";");
                String code = st.nextToken();

                char c = code.charAt(0);
                if (c == '#' || c == '/') {
                    continue;
                }

                int curCodeValue = Integer.parseInt(code, 16);
                String characterName = st.nextToken();
                String category = st.nextToken();
                int index = categoryIndex(category);

                if (index != categoryNames.length) {
                    if (prevIndex == index) {
                        if (setFirst) {
                            if (characterName.endsWith(" Last>")) {
                                setFirst = false;
                            } else {
                                System.err.println("*** Error 1 at " + code);
                            }
                        } else {
                            if (characterName.endsWith(" First>")) {
                                setFirst = true;
                            } else if (characterName.endsWith(" Last>")) {
                                System.err.println("*** Error 2 at " + code);
                            } else {
                                if (prevCodeValue != curCodeValue - 1) {
                                    /* Gap inside the same category: split the run. */
                                    appendNewChar(prevIndex, prevCodeValue);
                                    appendNewChar(index, curCodeValue);
                                }
                            }
                        }
                    } else {
                        if (setFirst) {
                            System.err.println("*** Error 3 at " + code);
                        } else if (characterName.endsWith(" First>")) {
                            setFirst = true;
                        } else if (characterName.endsWith(" Last>")) {
                            System.err.println("*** Error 4 at " + code);
                        }
                        /* Category changed: close the previous run, open a new one. */
                        appendNewChar(prevIndex, prevCodeValue);
                        appendNewChar(index, curCodeValue);
                        prevIndex = index;
                    }
                    prevCodeValue = curCodeValue;
                } else {
                    if (ignoredNew.indexOf(category) == -1) {
                        ignoredNew.append(category);
                        ignoredNew.append(' ');
                    }
                }
            }
            /* Close the last open run. */
            appendNewChar(prevIndex, prevCodeValue);
        }
        catch (Exception e) {
            /* Also covers malformed lines (bad hex value, missing fields). */
            System.err.println("Error occurred on accessing " + specfile);
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Appends a code point to the new-format list of the given category,
     * growing the backing array when it is full.
     */
    private static void appendNewChar(int index, int code) {
        int bufLen = newList[index].length;
        if (newTotalCount[index] == bufLen) {
            /* Double the capacity to keep appends amortized constant time. */
            int[] tmpBuf = new int[bufLen * 2];
            System.arraycopy(newList[index], 0, tmpBuf, 0, bufLen);
            newList[index] = tmpBuf;
        }

        newList[index][newTotalCount[index]++] = code;
        if (code < Supplementary_CodePoint_Start) {
            newListCount[BMP][index]++;
        } else {
            newListCount[nonBMP][index]++;
        }
    }


    /* Generates the old CategoryMap. */
    private static void generateOldDatafile() {
        try (BufferedWriter bout = new BufferedWriter(new FileWriter(oldDatafile))) {
            bout.write("\n    //\n    // The following String[][] can be used in CharSet.java as is.\n    //\n\n    private static final String[][] categoryMap = {\n");
            for (int i = 0; i < categoryNames.length - 1; i++) {
                if (oldTotalCount[i] != 0) {
                    bout.write("        { \"" + categoryNames[i] + "\",");

                    /* 0x0000-0xD7FF */
                    if (oldListCount[BEFORE][i] != 0) {
                        bout.write(" \"");

                        bout.write(oldList[BEFORE][i].toString() + "\"\n");
                    }

                    /* 0xD800-0xFFFF */
                    if (oldListCount[AFTER][i] != 0) {
                        if (oldListCount[BEFORE][i] != 0) {
                            bout.write("                + \"");
                        } else {
                            bout.write(" \"");
                        }
                        bout.write(oldList[AFTER][i].toString() + "\"\n");
                    }

                    /* 0xD800DC00(0x10000)-0xDBFF0xDFFFF(0x10FFFF) */
                    if (oldListCount[SURROGATE][i] != 0) {
                        if (oldListCount[BEFORE][i] != 0 || oldListCount[AFTER][i] != 0) {
                            bout.write("                + \"");
                        } else {
                            bout.write(" \"");
                        }
                        bout.write(oldList[SURROGATE][i].toString() + "\"\n");
                    }
                    bout.write("        },\n");

                }
            }
            bout.write("    };\n\n");
        }
        catch (Exception e) {
            System.err.println("Error occurred on accessing " + oldDatafile);
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println("\n" + oldDatafile + " has been generated.");
    }


    /**
     * Test program to be generated
     */
    private static final String outfile = "CharacterCategoryTest.java";

    /*
     * Generates a test program which compares the generated data (newer one)
     * with the return values of Character.getType(). The file is written
     * into the output directory when one was given with "-o".
     */
    private static void generateTestProgram() {
        String path = outputPath(outfile);
        try (BufferedWriter bout = new BufferedWriter(new FileWriter(path))) {
            bout.write(collationMethod);
            bout.write("\n    //\n    // The following arrays can be used in CharSet.java as is.\n    //\n\n");

            bout.write("    private static final String[] categoryNames = {");
            for (int i = 0; i < categoryNames.length - 1; i++) {
                if (i % 10 == 0) {
                    bout.write("\n        ");
                }
                bout.write("\"" + categoryNames[i] + "\", ");
            }
            bout.write("\n    };\n\n");

            bout.write("    private static final int[][] categoryMap = {\n");

            for (int i = 0; i < categoryNames.length - 1; i++) {
                StringBuilder sb = new StringBuilder("        { /*  Data for \"" + categoryNames[i] + "\" category */");

                for (int j = 0; j < newTotalCount[i]; j++) {
                    if (j % 8 == 0) {
                        sb.append("\n        ");
                    }
                    sb.append(" 0x");
                    sb.append(Integer.toString(newList[i][j], 16).toUpperCase());
                    sb.append(',');
                }
                sb.append("\n        },\n");
                bout.write(sb.toString());
            }

            bout.write("    };\n");

            bout.write("\n}\n");
        }
        catch (Exception e) {
            System.err.println("Error occurred on accessing " + path);
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println("\n" + path + " has been generated.");
    }

    /**
     * Leading part of the generated test program: the class header and a
     * main method that expands categoryMap and compares it, code point by
     * code point, with Character.getType(). The data arrays and the closing
     * brace are appended by generateTestProgram().
     */
    static String collationMethod =
        "public class CharacterCategoryTest {\n\n" +
        "    static final int SIZE = 0x110000;\n" +
        "    static final String[] category = {\n" +
        "       \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" +
        "       \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" +
        "       \"Cf\", \"\",   \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" +
        "       \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" +
        "    };\n\n" +
        "    public static void main(String[] args) {\n" +
        "        boolean err = false;\n" +
        "        byte[] b = new byte[SIZE];\n" +
        "        for (int i = 0; i < SIZE; i++) {\n" +
        "            b[i] = 0;\n" +
        "        }\n" +
        "        for (int i = 0; i < categoryMap.length; i++) {\n" +
        "            byte categoryNum = 0;\n" +
        "            String categoryName = categoryNames[i];\n" +
        "            for (int j = 0; j < category.length; j++) {\n" +
        "                if (categoryName.equals(category[j])) {\n" +
        "                    categoryNum = (byte)j;\n" +
        "                    break;\n" +
        "                }\n" +
        "            }\n" +
        "            int[] values = categoryMap[i];\n" +
        "            for (int j = 0; j < values.length;) {\n" +
        "                int firstChar = values[j++];\n" +
        "                int lastChar = values[j++];\n" +
        "                for (int k = firstChar; k <= lastChar; k++) {\n" +
        "                    b[k] = categoryNum;\n" +
        "                }\n" +
        "            }\n" +
        "        }\n" +
        "        for (int i = 0; i < SIZE; i++) {\n" +
        "            int characterType = Character.getType(i);\n" +
        "            if (b[i] != characterType) {\n" +
        "                /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" +
        "                if (characterType == Character.PRIVATE_USE ||\n" +
        "                    characterType == Character.SURROGATE ||\n" +
        "                    characterType == Character.MODIFIER_SYMBOL) {\n" +
        "                    continue;\n" +
        "                }\n" +
        "                err = true;\n" +
        "                System.err.println(\"Category conflict for a character(0x\" +\n" +
        "                                   Integer.toHexString(i) +\n" +
        "                                   \"). CharSet.categoryMap:\" +\n" +
        "                                   category[b[i]] +\n" +
        "                                   \"  Character.getType():\" +\n" +
        "                                   category[characterType]);\n" +
        "            }\n" +
        "        }\n\n" +
        "        if (err) {\n" +
        "            throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" +
        "        }\n" +
        "    }\n";

}
698