UnicodeSpec.java revision 8845:4be14673b9bf
1/*
2 * Copyright (c) 2002, 2011, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package build.tools.generatecharacter;
27
28import java.io.BufferedReader;
29import java.io.FileReader;
30import java.io.FileNotFoundException;
31import java.io.IOException;
32import java.util.StringTokenizer;
33import java.io.File;
34import java.util.regex.Pattern;
35import java.util.ArrayList;
36
/**
 * The UnicodeSpec class provides a way to read in Unicode character
 * properties from a Unicode data file.  One instance of class UnicodeSpec
 * holds a decoded version of one line of the data file.  The file may
 * be obtained from www.unicode.org.  The method {@link #readSpecFile}
 * returns an array of UnicodeSpec objects.
 *
 * @author  Guy Steele
 * @author  John O'Conner
 */
public class UnicodeSpec {

    /** Marker stored in a case-mapping field when no mapping is defined. */
    private static final int MAP_UNDEFINED = 0xFFFFFFFF;

    /**
     * Construct a default UnicodeSpec object, with a default
     * code point value 0xFFFF.
     */
    public UnicodeSpec() {
        this(0xffff);
    }

    /**
     * Construct a UnicodeSpec object for the given <code>codePoint</code>
     * argument. Provide default properties.
     *
     * @param codePoint a Unicode code point between 0x0000 and 0x10FFFF
     */
    public UnicodeSpec(int codePoint) {
        this.codePoint = codePoint;
        generalCategory = UNASSIGNED;
        bidiCategory = DIRECTIONALITY_UNDEFINED;
        mirrored = false;
        titleMap = MAP_UNDEFINED;
        upperMap = MAP_UNDEFINED;
        lowerMap = MAP_UNDEFINED;
        decimalValue = -1;
        digitValue = -1;
        numericValue = "";
        oldName = null;
        comment = null;
        name = null;
    }

    /**
     * Create a String representation of this UnicodeSpec object.
     * The string contains the code point (six hex digits) followed by
     * each simple case mapping that is defined.
     */
    @Override
    public String toString() {
        StringBuilder result = new StringBuilder(hex6(codePoint));
        if (hasUpperMap()) {
            result.append(", upper=").append(hex6(upperMap));
        }
        if (hasLowerMap()) {
            result.append(", lower=").append(hex6(lowerMap));
        }
        if (hasTitleMap()) {
            result.append(", title=").append(hex6(titleMap));
        }
        return result.toString();
    }

    /**
     * Format the low 16 bits of <code>n</code> as four upper-case hex digits.
     */
    static String hex4(int n) {
        String q = Integer.toHexString(n & 0xFFFF).toUpperCase();
        return "0000".substring(Math.min(4, q.length())) + q;
    }

    /**
     * Format the low 24 bits of <code>n</code> as six upper-case hex digits.
     */
    static String hex6(int n) {
        String str = Integer.toHexString(n & 0xFFFFFF).toUpperCase();
        return "000000".substring(Math.min(6, str.length())) + str;
    }

    /**
     * Given one line of a Unicode data file as a String, parse the line
     * and return a UnicodeSpec object that contains the same character
     * information.  A line that cannot be parsed is reported on standard
     * output and yields <code>null</code>.
     *
     * @param s a line of the Unicode data file to be parsed
     * @return a UnicodeSpec object, or null if the parsing process failed for some reason
     */
    public static UnicodeSpec parse(String s) {
        UnicodeSpec spec = null;

        try {
            // A positive limit keeps trailing empty fields, so a complete
            // line always produces REQUIRED_FIELDS tokens.
            String[] tokens = tokenSeparator.split(s, REQUIRED_FIELDS);
            spec = new UnicodeSpec();
            spec.setCodePoint(parseCodePoint(tokens[FIELD_VALUE]));
            spec.setName(parseName(tokens[FIELD_NAME]));
            spec.setGeneralCategory(parseGeneralCategory(tokens[FIELD_CATEGORY]));
            spec.setBidiCategory(parseBidiCategory(tokens[FIELD_BIDI]));
            spec.setCombiningClass(parseCombiningClass(tokens[FIELD_CLASS]));
            spec.setDecomposition(parseDecomposition(tokens[FIELD_DECOMPOSITION]));
            spec.setDecimalValue(parseDecimalValue(tokens[FIELD_DECIMAL]));
            spec.setDigitValue(parseDigitValue(tokens[FIELD_DIGIT]));
            spec.setNumericValue(parseNumericValue(tokens[FIELD_NUMERIC]));
            spec.setMirrored(parseMirrored(tokens[FIELD_MIRRORED]));
            spec.setOldName(parseOldName(tokens[FIELD_OLDNAME]));
            spec.setComment(parseComment(tokens[FIELD_COMMENT]));
            spec.setUpperMap(parseUpperMap(tokens[FIELD_UPPERCASE]));
            spec.setLowerMap(parseLowerMap(tokens[FIELD_LOWERCASE]));
            spec.setTitleMap(parseTitleMap(tokens[FIELD_TITLECASE]));
        } catch (Exception e) {
            // Deliberately best-effort: any failure (bad number, missing
            // field, unknown category) rejects the whole line.
            spec = null;
            System.out.println("Error parsing spec line.");
        }
        return spec;
    }

    /**
     * Parse the codePoint attribute for a Unicode character.
     *
     * The codePoint attribute should be a four to six digit hexadecimal integer.
     *
     * @param s   the codePoint attribute extracted from a line of the Unicode data file
     * @return   code point if successful
     * @exception NumberFormatException if unable to parse argument
     */
    public static int parseCodePoint(String s) throws NumberFormatException {
        return Integer.parseInt(s, 16);
    }

    /**
     * Parse the name attribute; the text is returned unchanged.
     *
     * @param s   the name attribute extracted from a line of the Unicode data file
     * @return   the name
     * @exception Exception if the argument is null
     */
    public static String parseName(String s) throws Exception {
        if (s == null) {
            throw new Exception("Cannot parse name.");
        }
        return s;
    }

    /**
     * Parse the general category attribute by looking up its short name
     * (for example "Lu") in {@link #generalCategoryList}.
     *
     * @param s   the general category attribute extracted from a line of the Unicode data file
     * @return   the matching general category constant
     * @exception Exception if the short name is not in the list
     */
    public static byte parseGeneralCategory(String s) throws Exception {
        for (byte x = 0; x < generalCategoryList.length; x++) {
            if (s.equals(generalCategoryList[x][SHORT])) {
                return x;
            }
        }
        throw new Exception("Could not parse general category.");
    }

    /**
     * Parse the bidirectional category attribute by looking up its short
     * name (for example "AL") in {@link #bidiCategoryList}.
     *
     * @param s   the bidi category attribute extracted from a line of the Unicode data file
     * @return   the matching directionality constant
     * @exception Exception if the short name is not in the list
     */
    public static byte parseBidiCategory(String s) throws Exception {
        for (byte x = 0; x < bidiCategoryList.length; x++) {
            if (s.equals(bidiCategoryList[x][SHORT])) {
                return x;
            }
        }
        throw new Exception("Could not parse bidi category.");
    }

    /**
     * Parse the combining class attribute for a Unicode character.
     *
     * The combining attribute, if any, should be a nonnegative decimal integer.
     *
     * @param s   the combining attribute extracted from a line of the Unicode data file
     * @return   the combining class value if any, -1 if property not defined
     * @exception Exception if can't parse the combining class
     */
    public static int parseCombiningClass(String s) throws Exception {
        int combining = -1;
        if (s.length() > 0) {
            combining = Integer.parseInt(s, 10);
        }
        return combining;
    }

    /**
     * Parse the decomposition attribute for a Unicode character.
     *
     * The decomposition attribute is complicated; for now, it is treated
     * as a string and returned unchanged.
     *
     * @param s   the decomposition attribute extracted from a line of the Unicode data file
     * @return   the decomposition text
     * @exception Exception if the argument is null
     */
    public static String parseDecomposition(String s) throws Exception {
        if (s == null) {
            throw new Exception("Cannot parse decomposition.");
        }
        return s;
    }

    /**
     * Parse the decimal value attribute for a Unicode character.
     *
     * The decimal value attribute, if any, should be a nonnegative decimal integer.
     *
     * @param s   the decimal value attribute extracted from a line of the Unicode data file
     * @return   the decimal value as an int, -1 if no decimal value defined
     * @exception NumberFormatException if the parse fails
     */
    public static int parseDecimalValue(String s) throws NumberFormatException {
        int value = -1;
        if (s.length() > 0) {
            value = Integer.parseInt(s, 10);
        }
        return value;
    }

    /**
     * Parse the digit value attribute for a Unicode character.
     *
     * The digit value attribute, if any, should be a nonnegative decimal integer.
     *
     * @param s   the digit value attribute extracted from a line of the Unicode data file
     * @return   the digit value as an non-negative int, or -1 if no digit property defined
     * @exception NumberFormatException if the parse fails
     */
    public static int parseDigitValue(String s) throws NumberFormatException {
        int value = -1;
        if (s.length() > 0) {
            value = Integer.parseInt(s, 10);
        }
        return value;
    }

    /**
     * Parse the numeric value attribute; the text is returned unchanged.
     *
     * @param s   the numeric value attribute extracted from a line of the Unicode data file
     * @return   the numeric value text (possibly empty)
     * @exception Exception if the argument is null
     */
    public static String parseNumericValue(String s) throws Exception {
        if (s == null) {
            throw new Exception("Cannot parse numeric value.");
        }
        return s;
    }

    /**
     * Parse the comment attribute; the text is returned unchanged.
     *
     * @param s   the comment attribute extracted from a line of the Unicode data file
     * @return   the comment text (possibly empty)
     * @exception Exception if the argument is null
     */
    public static String parseComment(String s) throws Exception {
        if (s == null) {
            throw new Exception("Cannot parse comment.");
        }
        return s;
    }

    /**
     * Parse the mirrored attribute, which must be exactly "Y" or "N".
     *
     * @param s   the mirrored attribute extracted from a line of the Unicode data file
     * @return   true for "Y", false for "N"
     * @exception Exception if the attribute is anything else
     */
    public static boolean parseMirrored(String s) throws Exception {
        if ("Y".equals(s)) {
            return true;
        }
        if ("N".equals(s)) {
            return false;
        }
        throw new Exception("Cannot parse mirrored property.");
    }

    /**
     * Parse the old (Unicode 1.0) name attribute; the text is returned unchanged.
     *
     * @param s   the old name attribute extracted from a line of the Unicode data file
     * @return   the old name text (possibly empty)
     * @exception Exception if the argument is null
     */
    public static String parseOldName(String s) throws Exception {
        if (s == null) {
            throw new Exception("Cannot parse old name");
        }
        return s;
    }

    /**
     * Parse the uppercase mapping attribute for a Unicode character.
     *
     * The uppercase mapping attribute should be empty or a four to six
     * digit hexadecimal integer.
     *
     * @param s   the uppercase mapping attribute extracted from a line of the Unicode data file
     * @return   simple uppercase character mapping if defined, MAP_UNDEFINED otherwise
     * @exception NumberFormatException if parse fails
     */
    public static int parseUpperMap(String s) throws NumberFormatException {
        return parseCaseMap(s);
    }

    /**
     * Parse the lowercase mapping attribute for a Unicode character.
     *
     * The lowercase mapping attribute should be empty or a four to six
     * digit hexadecimal integer.
     *
     * @param s   the lowercase mapping attribute extracted from a line of the Unicode data file
     * @return   simple lowercase character mapping if defined, MAP_UNDEFINED otherwise
     * @exception NumberFormatException if parse fails
     */
    public static int parseLowerMap(String s) throws NumberFormatException {
        return parseCaseMap(s);
    }

    /**
     * Parse the titlecase mapping attribute for a Unicode character.
     *
     * The titlecase mapping attribute should be empty or a four to six
     * digit hexadecimal integer.
     *
     * @param s   the titlecase mapping attribute extracted from a line of the Unicode data file
     * @return   simple title case char mapping if defined, MAP_UNDEFINED otherwise
     * @exception NumberFormatException if parse fails
     */
    public static int parseTitleMap(String s) throws NumberFormatException {
        return parseCaseMap(s);
    }

    /**
     * Shared implementation of the three simple case-mapping parsers:
     * empty means "no mapping", four to six characters are read as hex,
     * any other length is rejected.
     */
    private static int parseCaseMap(String s) throws NumberFormatException {
        int length = s.length();
        if (length == 0) {
            return MAP_UNDEFINED;
        }
        if (length < 4 || length > 6) {
            throw new NumberFormatException("Case mapping must be 4 to 6 hex digits: \"" + s + "\"");
        }
        return Integer.parseInt(s, 16);
    }

    /**
     * Read and parse a Unicode data file, keeping only the entries whose
     * code point lies in the requested plane (code point &gt;&gt;&gt; 16).
     * Reading stops at the first entry that belongs to a higher plane, so
     * the file is expected to be sorted by code point.  Lines that cannot
     * be parsed are skipped.  If an I/O error occurs while reading, the
     * entries collected so far are returned.
     *
     * @param file   a file specifying the Unicode data file to be read
     * @param plane  the Unicode plane number to extract
     * @return   an array of UnicodeSpec objects, one for each line of the
     *           Unicode data file that could be successfully parsed as
     *           specifying Unicode character attributes in that plane
     * @exception FileNotFoundException if the file cannot be opened
     */
    public static UnicodeSpec[] readSpecFile(File file, int plane) throws FileNotFoundException {
        ArrayList<UnicodeSpec> list = new ArrayList<>(3000);
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                UnicodeSpec item = parse(line.trim());
                if (item == null) {
                    // parse() already reported the bad line; skip it
                    // instead of dereferencing null.
                    continue;
                }
                int specPlane = item.getCodePoint() >>> 16;
                if (specPlane < plane) {
                    continue;
                }
                if (specPlane > plane) {
                    break;
                }
                list.add(item);
            }
        } catch (FileNotFoundException e) {
            // Opening failed: this is the one error callers are told about.
            throw e;
        } catch (IOException e) {
            // Best-effort, as before: a read (or close) failure ends the
            // scan and whatever was collected so far is returned.
        }
        return list.toArray(new UnicodeSpec[list.size()]);
    }

    void setCodePoint(int value) {
        codePoint = value;
    }

    /**
     * Return the code point in this Unicode specification
     * @return the char code point representing by the specification
     */
    public int getCodePoint() {
        return codePoint;
    }

    void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    void setGeneralCategory(byte category) {
        generalCategory = category;
    }

    public byte getGeneralCategory() {
        return generalCategory;
    }

    void setBidiCategory(byte category) {
        bidiCategory = category;
    }

    public byte getBidiCategory() {
        return bidiCategory;
    }

    void setCombiningClass(int combiningClass) {
        this.combiningClass = combiningClass;
    }

    public int getCombiningClass() {
        return combiningClass;
    }

    void setDecomposition(String decomposition) {
        this.decomposition = decomposition;
    }

    public String getDecomposition() {
        return decomposition;
    }

    void setDecimalValue(int value) {
        decimalValue = value;
    }

    public int getDecimalValue() {
        return decimalValue;
    }

    /** @return true if a decimal digit value is defined (stored value is not -1) */
    public boolean isDecimalValue() {
        return decimalValue != -1;
    }

    void setDigitValue(int value) {
        digitValue = value;
    }

    public int getDigitValue() {
        return digitValue;
    }

    /** @return true if a digit value is defined (stored value is not -1) */
    public boolean isDigitValue() {
        return digitValue != -1;
    }

    void setNumericValue(String value) {
        numericValue = value;
    }

    public String getNumericValue() {
        return numericValue;
    }

    /** @return true if the numeric value text is non-empty */
    public boolean isNumericValue() {
        return numericValue.length() > 0;
    }

    void setMirrored(boolean value) {
        mirrored = value;
    }

    public boolean isMirrored() {
        return mirrored;
    }

    void setOldName(String name) {
        oldName = name;
    }

    public String getOldName() {
        return oldName;
    }

    void setComment(String comment) {
        this.comment = comment;
    }

    public String getComment() {
        return comment;
    }

    void setUpperMap(int ch) {
        upperMap = ch;
    }

    public int getUpperMap() {
        return upperMap;
    }

    public boolean hasUpperMap() {
        return upperMap != MAP_UNDEFINED;
    }

    void setLowerMap(int ch) {
        lowerMap = ch;
    }

    public int getLowerMap() {
        return lowerMap;
    }

    public boolean hasLowerMap() {
        return lowerMap != MAP_UNDEFINED;
    }

    void setTitleMap(int ch) {
        titleMap = ch;
    }

    public int getTitleMap() {
        return titleMap;
    }

    public boolean hasTitleMap() {
        return titleMap != MAP_UNDEFINED;
    }

    int codePoint;          // the character's UTF-32 code value
    String name;            // the ASCII name
    byte generalCategory;   // general category, available via Character.getType()
    byte bidiCategory;      // available via Character.getBidiType()
    int combiningClass;     // not used in Character
    String decomposition;   // not used in Character
    int decimalValue;       // decimal digit value, -1 if none
    int digitValue;         // not all digits are decimal, -1 if none
    String numericValue;    // numeric value if digit or non-digit
    boolean mirrored;       // true if the mirrored field was "Y"
    String oldName;
    String comment;
    int upperMap;           // MAP_UNDEFINED if no simple uppercase mapping
    int lowerMap;           // MAP_UNDEFINED if no simple lowercase mapping
    int titleMap;           // MAP_UNDEFINED if no simple titlecase mapping

    // this is the number of fields in one line of the UnicodeData.txt file
    // each field is separated by a semicolon (a token)
    static final int REQUIRED_FIELDS = 15;

    /**
     * General category types
     * To preserve compatibility, these values cannot be changed
     */
    public static final byte
        UNASSIGNED                  =  0, // Cn normative
        UPPERCASE_LETTER            =  1, // Lu normative
        LOWERCASE_LETTER            =  2, // Ll normative
        TITLECASE_LETTER            =  3, // Lt normative
        MODIFIER_LETTER             =  4, // Lm normative
        OTHER_LETTER                =  5, // Lo normative
        NON_SPACING_MARK            =  6, // Mn informative
        ENCLOSING_MARK              =  7, // Me informative
        COMBINING_SPACING_MARK      =  8, // Mc normative
        DECIMAL_DIGIT_NUMBER        =  9, // Nd normative
        LETTER_NUMBER               = 10, // Nl normative
        OTHER_NUMBER                = 11, // No normative
        SPACE_SEPARATOR             = 12, // Zs normative
        LINE_SEPARATOR              = 13, // Zl normative
        PARAGRAPH_SEPARATOR         = 14, // Zp normative
        CONTROL                     = 15, // Cc normative
        FORMAT                      = 16, // Cf normative
        // 17 is unused for no apparent reason,
        // but must preserve forward compatibility
        PRIVATE_USE                 = 18, // Co normative
        SURROGATE                   = 19, // Cs normative
        DASH_PUNCTUATION            = 20, // Pd informative
        START_PUNCTUATION           = 21, // Ps informative
        END_PUNCTUATION             = 22, // Pe informative
        CONNECTOR_PUNCTUATION       = 23, // Pc informative
        OTHER_PUNCTUATION           = 24, // Po informative
        MATH_SYMBOL                 = 25, // Sm informative
        CURRENCY_SYMBOL             = 26, // Sc informative
        MODIFIER_SYMBOL             = 27, // Sk informative
        OTHER_SYMBOL                = 28, // So informative
        INITIAL_QUOTE_PUNCTUATION   = 29, // Pi informative
        FINAL_QUOTE_PUNCTUATION     = 30, // Pf informative

        // this value is only used in the character generation tool
        // it can change to accommodate the addition of new categories.
        GENERAL_CATEGORY_COUNT      = 31; // sentinel value

    static final byte SHORT = 0, LONG = 1;
    // general category type strings
    // NOTE: The order of this category array is dependent on the assignment of
    // category constants above. We want to access this array using constants above.
    // [][SHORT] is the SHORT name, [][LONG] is the LONG name
    static final String[][] generalCategoryList = {
        {"Cn", "UNASSIGNED"},
        {"Lu", "UPPERCASE_LETTER"},
        {"Ll", "LOWERCASE_LETTER"},
        {"Lt", "TITLECASE_LETTER"},
        {"Lm", "MODIFIER_LETTER"},
        {"Lo", "OTHER_LETTER"},
        {"Mn", "NON_SPACING_MARK"},
        {"Me", "ENCLOSING_MARK"},
        {"Mc", "COMBINING_SPACING_MARK"},
        {"Nd", "DECIMAL_DIGIT_NUMBER"},
        {"Nl", "LETTER_NUMBER"},
        {"No", "OTHER_NUMBER"},
        {"Zs", "SPACE_SEPARATOR"},
        {"Zl", "LINE_SEPARATOR"},
        {"Zp", "PARAGRAPH_SEPARATOR"},
        {"Cc", "CONTROL"},
        {"Cf", "FORMAT"},
        {"xx", "unused"},   // placeholder keeping index 17 unused
        {"Co", "PRIVATE_USE"},
        {"Cs", "SURROGATE"},
        {"Pd", "DASH_PUNCTUATION"},
        {"Ps", "START_PUNCTUATION"},
        {"Pe", "END_PUNCTUATION"},
        {"Pc", "CONNECTOR_PUNCTUATION"},
        {"Po", "OTHER_PUNCTUATION"},
        {"Sm", "MATH_SYMBOL"},
        {"Sc", "CURRENCY_SYMBOL"},
        {"Sk", "MODIFIER_SYMBOL"},
        {"So", "OTHER_SYMBOL"},
        {"Pi", "INITIAL_QUOTE_PUNCTUATION"},
        {"Pf", "FINAL_QUOTE_PUNCTUATION"}
    };

    /**
     * Bidirectional categories
     */
    public static final byte
        DIRECTIONALITY_UNDEFINED                  = -1,
        // Strong category
        DIRECTIONALITY_LEFT_TO_RIGHT              =  0, // L
        DIRECTIONALITY_RIGHT_TO_LEFT              =  1, // R
        DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC       =  2, // AL
        // Weak category
        DIRECTIONALITY_EUROPEAN_NUMBER            =  3, // EN
        DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR  =  4, // ES
        DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR =  5, // ET
        DIRECTIONALITY_ARABIC_NUMBER              =  6, // AN
        DIRECTIONALITY_COMMON_NUMBER_SEPARATOR    =  7, // CS
        DIRECTIONALITY_NONSPACING_MARK            =  8, // NSM
        DIRECTIONALITY_BOUNDARY_NEUTRAL           =  9, // BN
        // Neutral category
        DIRECTIONALITY_PARAGRAPH_SEPARATOR        = 10, // B
        DIRECTIONALITY_SEGMENT_SEPARATOR          = 11, // S
        DIRECTIONALITY_WHITESPACE                 = 12, // WS
        DIRECTIONALITY_OTHER_NEUTRALS             = 13, // ON

        DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING    = 14, // LRE
        DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE     = 15, // LRO
        DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING    = 16, // RLE
        DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE     = 17, // RLO
        DIRECTIONALITY_POP_DIRECTIONAL_FORMAT     = 18, // PDF

        DIRECTIONALITY_CATEGORY_COUNT             = 19; // sentinel value

    // If changes are made to the above bidi category assignments, this
    // list of bidi category names must be changed to keep their order in synch.
    // Access this list using the bidi category constants above.
    static final String[][] bidiCategoryList = {
        {"L", "DIRECTIONALITY_LEFT_TO_RIGHT"},
        {"R", "DIRECTIONALITY_RIGHT_TO_LEFT"},
        {"AL", "DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC"},
        {"EN", "DIRECTIONALITY_EUROPEAN_NUMBER"},
        {"ES", "DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR"},
        {"ET", "DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR"},
        {"AN", "DIRECTIONALITY_ARABIC_NUMBER"},
        {"CS", "DIRECTIONALITY_COMMON_NUMBER_SEPARATOR"},
        {"NSM", "DIRECTIONALITY_NONSPACING_MARK"},
        {"BN", "DIRECTIONALITY_BOUNDARY_NEUTRAL"},
        {"B", "DIRECTIONALITY_PARAGRAPH_SEPARATOR"},
        {"S", "DIRECTIONALITY_SEGMENT_SEPARATOR"},
        {"WS", "DIRECTIONALITY_WHITESPACE"},
        {"ON", "DIRECTIONALITY_OTHER_NEUTRALS"},
        {"LRE", "DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING"},
        {"LRO", "DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE"},
        {"RLE", "DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING"},
        {"RLO", "DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE"},
        {"PDF", "DIRECTIONALITY_POP_DIRECTIONAL_FORMAT"},

    };

    // Unicode specification lines have fields in this order.
    static final byte
        FIELD_VALUE         = 0,
        FIELD_NAME          = 1,
        FIELD_CATEGORY      = 2,
        FIELD_CLASS         = 3,
        FIELD_BIDI          = 4,
        FIELD_DECOMPOSITION = 5,
        FIELD_DECIMAL       = 6,
        FIELD_DIGIT         = 7,
        FIELD_NUMERIC       = 8,
        FIELD_MIRRORED      = 9,
        FIELD_OLDNAME       = 10,
        FIELD_COMMENT       = 11,
        FIELD_UPPERCASE     = 12,
        FIELD_LOWERCASE     = 13,
        FIELD_TITLECASE     = 14;

    // Compiled once; fields of a data line are separated by semicolons.
    static final Pattern tokenSeparator = Pattern.compile(";");

    /**
     * Command-line driver: reads the data file named by the first argument,
     * keeps the plane given by the second argument, and prints every entry.
     *
     * @param args  the data file path followed by the plane number
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("Usage: UnicodeSpec <UnicodeData file> <plane>");
            return;
        }
        try {
            File file = new File(args[0]);
            int plane = Integer.parseInt(args[1]);
            UnicodeSpec[] spec = UnicodeSpec.readSpecFile(file, plane);
            System.out.println("UnicodeSpec[" + spec.length + "]:");
            for (int x = 0; x < spec.length; x++) {
                System.out.println(spec[x].toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
768