1/*
2 * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package build.tools.generatecharacter;
27
28import java.io.IOException;
29import java.io.FileNotFoundException;
30import java.io.BufferedReader;
31import java.io.FileReader;
32import java.io.PrintWriter;
33import java.io.BufferedWriter;
34import java.io.FileWriter;
35import java.io.File;
36import java.util.List;
37
38import build.tools.generatecharacter.CharacterName;
39
40/**
41 * This program generates the source code for the class java.lang.Character.
42 * It also generates native C code that can perform the same operations.
43 * It requires two external input data files:
44 * <ul>
45 * <li> Unicode specification file
46 * <li> Character class template file
47 * </ul>
48 * The Unicode specification file is available from the Unicode consortium.
49 * It has character specification lines that look like this:
50 * <listing>
51 * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
52 * </listing>
53 * The Character class template file is filled in with additional
54 * information to produce the file Character.java, which can then be
55 * compiled by a Java compiler.  The template file contains certain
56 * markers consisting of an alphabetic name string preceded by "$$".
57 * Such markers are replaced with generated program text.  As a special
58 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
59 * alphabetic characters constituting a variable name.  The character "_"
60 * is considered alphabetic for these purposes.
61 *
62 * @author  Guy Steele
63 * @author  Alan Liu
64 * @author  John O'Conner
65 */
66
67public class GenerateCharacter {
68
69    final static boolean DEBUG = false;
70
71    final static String commandMarker = "$$";
72    static String ROOT                        = "";
73    static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
74    static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
75    static String DefaultPropListFileName     = ROOT + "PropList.txt";
76    static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
77    static String DefaultJavaOutputFileName   = ROOT + "Character.java";
78    static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
79    static String DefaultCOutputFileName      = ROOT + "Character.c";
80
81    static int plane = 0;
82
83    /* The overall idea is that, in the generated Character class source code,
84    most character property data is stored in a special multi-level table whose
85    structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
86    The integers must sum to 16 (the number of bits in a character).
87    The first table is indexed by the k1 high-order bits of the character code.
88    The result is concatenated to the next k2 bits of the character code to index
89    the second table, and so on.  Eventually the kn low-order bits of the character
90    code are concatenated and used to index one of two tables A and B; A contains
91    32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
92    can be thus obtained encode the properties for the character.
93
94    The default specification is [9, 4, 3, 0].  This particular table format was
95    designed by conducting an exhaustive search of table formats to minimize the
96    space consumed by the tables: the first and third tables need have only byte
97    values (the second table must have short values).  Another good choice is
98    [10, 6, 0], which produces a larger table but allows particularly fast table
99    lookup code.
100
101    In each case, where the word "concatenated" is used, this may imply
102    first a << and then a | operation, or perhaps just a | operation if
103    the values in the table can be preshifted (generally possible if the table
104    entries are short rather than byte).
105    */
106
    /* The character properties are currently encoded in two parts:
       A (32 bits) and B (16 bits).
109
110    A: the low 32 bits are defined  in the following manner:
111
112    1 bit Mirrored property.
113    4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
114    9 bits      A signed offset used for converting case .
115    1 bit       If 1, adding the signed offset converts the character to lowercase.
116    1 bit       If 1, subtracting the signed offset converts the character to uppercase.
117        Note: for a titlecase character, both of the preceding bits will be 1
118        and the signed offset will be 1.
119    1 bit   If 1, this character has a titlecase equivalent (possibly itself);
120        in this case, the two bits before this bit can be used to decide
121        whether this character is in fact uppercase, lowercase, or titlecase.
122    3 bits      This field provides a quick way to lex identifiers.
123        The eight possible values for this field are as follows:
124        0  May not be part of an identifier
125        1  Ignorable control; may continue a Unicode identifier or Java identifier
126        2  May continue a Java identifier but not a Unicode identifier (unused)
127        3  May continue a Unicode identifier or Java identifier
128        4  Is a Java whitespace character
129        5  May start or continue a Java identifier;
130           may continue but not start a Unicode identifier
131           (this value is used for connector punctuation such as _)
132        6  May start or continue a Java identifier;
133           may not occur in a Unicode identifier
134           (this value is used for currency symbols such as $)
135        7  May start or continue a Unicode identifier or Java identifier
136        Thus:
137           5, 6, 7 may start a Java identifier
138           1, 2, 3, 5, 6, 7 may continue a Java identifier
139           7 may start a Unicode identifier
140           1, 3, 5, 7 may continue a Unicode identifier
141           1 is ignorable within an identifier
142           4 is Java whitespace
143    2 bits      This field indicates whether the character has a numeric property.
144        The four possible values for this field are as follows:
145        0  This character has no numeric property.
146        1  Adding the digit offset to the character code and then
147           masking with 0x1F will produce the desired numeric value.
148        2  This character has a "strange" numeric value.
149        3  A Java supradecimal digit: adding the digit offset to the
150           character code, then masking with 0x1F, then adding 10
151           will produce the desired numeric value.
152    5 bits  The digit offset (see description of previous field)
153    5 bits      Character type (see below)
154
155    B: the high 16 bits are defined as:
156    1 bit Other_Lowercase property
157    1 bit Other_Uppercase property
158    1 bit Other_Alphabetic property
159    1 bit Other_Math property
160    1 bit Ideographic property
161    1 bit Noncharacter codepoint property
162    */
163
164
165    // bit masks identify each component of a 32-bit property field described
166    // above.
167    // shift* indicates how many shifts right must happen to get the
168    // indicated property value in the lowest bits of the 32-bit space.
169    private static final int
170        shiftType           = 0,        maskType            =       0x001F,
171        shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
172        shiftNumericType    = 10,       maskNumericType     =       0x0C00,
173        shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
174                                        maskUnicodePart     =       0x1000,
175        shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
176                                        maskLowerCase       =      0x20000,
177                                        maskUpperCase       =      0x10000,
178                                        maskTitleCase       =      0x08000,
179        shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
180        shiftCaseOffsetSign = 5,
181                                        // used only when calculating and
182                                        // storing digit offsets from char values
183                                        maskDigit               =   0x001F,
184                                        // case offset are 9 bits
185                                        maskCase                =   0x01FF,
186        shiftBidi           = 27,       maskBidi              = 0x78000000,
187        shiftMirrored       = 31,       //maskMirrored          = 0x80000000,
188        shiftPlane          = 16,       maskPlane = 0xFF0000;
189
190    // maskMirrored needs to be long, if up 16-bit
191    private static final long maskMirrored          = 0x80000000L;
192
    // Bit masks identifying the 16-bit property field described above (the B
    // table).
195    private static final long
196        maskOtherLowercase  = 0x100000000L,
197        maskOtherUppercase  = 0x200000000L,
198        maskOtherAlphabetic = 0x400000000L,
199        maskOtherMath       = 0x800000000L,
200        maskIdeographic     = 0x1000000000L,
201        maskNoncharacterCP  = 0x2000000000L;
202
203    // Can compare masked values with these to determine
204    // numeric or lexical types.
205    public static int
206        valueNotNumeric             = 0x0000,
207        valueDigit                  = 0x0400,
208        valueStrangeNumeric         = 0x0800,
209        valueJavaSupradecimal       = 0x0C00,
210        valueIgnorable              = 0x1000,
211        valueJavaOnlyPart           = 0x2000,
212        valueJavaUnicodePart        = 0x3000,
213        valueJavaWhitespace         = 0x4000,
214        valueJavaStartUnicodePart   = 0x5000,
215        valueJavaOnlyStart          = 0x6000,
216        valueJavaUnicodeStart       = 0x7000,
217        lowJavaStart                = 0x5000,
218        nonzeroJavaPart             = 0x3000,
219        valueUnicodeStart           = 0x7000;
220
221    // these values are used when only identifier properties are generated
222    // for use in verifier code. Shortens the property down to a single byte.
223    private static final int
224        bitJavaStart            = 0x02,
225        bitJavaPart             = 0x01,
226        maskIsJavaIdentifierPart = bitJavaPart,
227        maskIsJavaIdentifierStart = bitJavaStart;
228
229    static int maxOffset = maskCase/2 ;
230    static int minOffset = -maxOffset;
231
232    /* The following routines provide simple, concise formatting of long integer values.
233     The number in the name of the method indicates the desired number of characters
234     to be produced.  If the number of digits required to represent the integer value
235     is less than that number, then the output is padded on the left  with zeros
236     (for hex) or with spaces (for decimal).  If the number of digits required to
237     represent the integer value is greater than the desired number, then all the digits
238     that are required are actually produced.
239    */
240
241    static String hex(long n) { return Long.toHexString(n).toUpperCase(); }
242
243    static String hex2(long n) {
244        String q = Long.toHexString(n & 0xFF).toUpperCase();
245        return "00".substring(Math.min(2, q.length())) + q;
246    }
247
248    static String hex4(long n) {
249        String q = Long.toHexString(n & 0xFFFF).toUpperCase();
250        return "0000".substring(Math.min(4, q.length())) + q;
251    }
252
253    static String hex8(long n) {
254        String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
255        return "00000000".substring(Math.min(8, q.length())) + q;
256    }
257
258    static String hex16(long n) {
259        String q = Long.toHexString(n).toUpperCase();
260        return "0000000000000000".substring(Math.min(16, q.length())) + q;
261    }
262
263    static String dec3(long n) {
264        String q = Long.toString(n);
265        return "   ".substring(Math.min(3, q.length())) + q;
266    }
267
268    static String dec5(long n) {
269        String q = Long.toString(n);
270        return "     ".substring(Math.min(5, q.length())) + q;
271    }
272
273    /* This routine is called when some failure occurs. */
274
275    static void FAIL(String s) {
276        System.out.println("** " + s);
277    }
278
279    /**
280    * Given the data from the Unicode specification file, this routine builds a map.
281    *
282    * The specification file is assumed to contain its data in sorted order by
283    * character code; as a result, the array passed as an argument to this method
284    * has its components in the same sorted order, with one entry for each defined
285    * Unicode character or character range.  (A range is indicated by two consecutive
286    * entries, such that the name of the first entry begins with "<" and ends with
287    * "First>" and the second entry begins with "<" and ends with "Last>".)  This is
288    * therefore a sparse representation of the character property data.
289    *
    * The resulting map is a dense representation of the character data.  It contains
291    * 2^16 = 65536 entries, each of which is a long integer.  (Right now only 32 bits
292    * of this long value are used, but type long is used rather than int to facilitate
293    * future extensions of this source code generator that might require more than
294    * 32 bits to encode relevant character properties.)  Entry k holds the encoded
295    * properties for character k.
296    *
297    * Method buildMap manages the transformation from the sparse representation to
298    * the dense representation.  It calls method buildOne to handle the encoding
299    * of character property data from a single UnicodeSpec object into 32 bits.
    * For undefined characters, method buildOne is called with a default-constructed
    * UnicodeSpec in place of a record from the specification file.
302    *
303    * @param data       character property data from the Unicode specification file
304    * @return   an array of length 65536 with one entry for every possible char value
305    *
306    * @see GenerateCharacter#buildOne
307    */
308
309    static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
310    {
311        long[] result;
312        if (bLatin1 == true) {
313            result = new long[256];
314        } else {
315            result = new long[1<<16];
316        }
317        int k=0;
318        int codePoint = plane<<16;
319        UnicodeSpec nonCharSpec = new UnicodeSpec();
320        for (int j = 0; j < data.length && k < result.length; j++) {
321            if (data[j].codePoint == codePoint) {
322                result[k] = buildOne(codePoint, data[j], specialMaps);
323                ++k;
324                ++codePoint;
325            }
326            else if(data[j].codePoint > codePoint) {
327                if (data[j].name.endsWith("Last>")) {
328                    // build map data for all chars except last in range
329                    while (codePoint < data[j].codePoint && k < result.length) {
330                        result[k] = buildOne(codePoint, data[j], specialMaps);
331                        ++k;
332                        ++codePoint;
333                    }
334                }
335                else {
336                    // we have a few unassigned chars before data[j].codePoint
337                    while (codePoint < data[j].codePoint && k < result.length) {
338                        result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
339                        ++k;
340                        ++codePoint;
341                    }
342                }
343                k = data[j].codePoint & 0xFFFF;
344                codePoint = data[j].codePoint;
345                result[k] = buildOne(codePoint, data[j], specialMaps);
346                ++k;
347                ++codePoint;
348            }
349            else {
350                System.out.println("An error has occured during spec mapping.");
351                System.exit(0);
352            }
353        }
354        // if there are still unprocessed chars, process them
355        // as unassigned/undefined.
356        codePoint = (plane<<16) | k;
357        while (k < result.length) {
358            result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
359            ++k;
360            ++codePoint;
361        }
362        // now add all extra supported properties from PropList, to the
363        // upper 16-bit
364        addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
365        addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
366        addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
367        addExProp(result, propList, "Ideographic", maskIdeographic);
368        //addExProp(result, propList, "Other_Math", maskOtherMath);
369        //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
370
371        return result;
372    }
373
374    // The maximum and minimum offsets found while scanning the database
375    static int maxOffsetSeen = 0;
376    static int minOffsetSeen = 0;
377
378    /**
379     * Some Unicode separator characters are not considered Java whitespace.
380     * @param c character to test
     * @return true if c is an invalid Java whitespace character, false otherwise.
382     */
383    static boolean isInvalidJavaWhiteSpace(int c) {
384        int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
385        boolean retValue = false;
386        for(int x=0;x<exceptions.length;x++) {
387            if(c == exceptions[x]) {
388                retValue = true;
389                break;
390            }
391        }
392        return retValue;
393
394    }
395
396    /**
397    * Given the character property data for one Unicode character, encode the data
398    * of interest into a single long integer value.  (Right now only 32 bits
399    * of this long value are used, but type long is used rather than int to facilitate
400    * future extensions of this source code generator that might require more than
401    * 32 bits to encode relevant character properties.)
402    *
403    * @param c   the character code for which to encode property data
404    * @param us  property data record from the Unicode specification file
405    *            (its character code might not be equal to c if it specifies data
406    *            for a range of characters)
407    * @return   an encoded long value that contains the properties for a single char
408    *
409    * @see GenerateCharacter#buildMap
410    */
411
412    static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
413        long resultA = 0;
414        // record the general category
415        resultA |= us.generalCategory;
416
417        // record the numeric properties
418        NUMERIC: {
419        STRANGE: {
420            int val = 0;
421            // c is A-Z
422            if ((c >= 0x0041) && (c <= 0x005A)) {
423                val = c - 0x0041;
424                resultA |= valueJavaSupradecimal;
425            // c is a-z
426            } else if ((c >= 0x0061) && (c <= 0x007A)) {
427                val = c - 0x0061;
428                resultA |= valueJavaSupradecimal;
429            // c is a full-width A-Z
430            } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
431                val = c - 0xFF21;
432                resultA |= valueJavaSupradecimal;
433            // c is a full-width a-z
434            } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
435                val = c - 0xFF41;
436                resultA |= valueJavaSupradecimal;
437            } else if (us.isDecimalValue()) {
438                val = us.decimalValue;
439                resultA |= valueDigit;
440            } else if (us.isDigitValue()) {
441                val = us.digitValue;
442                resultA |= valueDigit;
443            } else {
444                if (us.numericValue.length() == 0) {
445                    break NUMERIC;                      // no numeric value at all
446                } else {
447                    try {
448                        val = Integer.parseInt(us.numericValue);
449                        if (val >= 32 || val < 0) break STRANGE;
450                        if (c == 0x215F) break STRANGE;
451                    } catch(NumberFormatException e) {
452                        break STRANGE;
453                    }
454                    resultA |= valueDigit;
455                }
456            }
457            if (val >= 32 || val < 0) break STRANGE;
458            resultA |= ((val - c & maskDigit) << shiftDigitOffset);
459            break NUMERIC;
460        } // end STRANGE
461        resultA |= valueStrangeNumeric;
462        } // end NUMERIC
463
464        // record case mapping
465        int offset = 0;
466        // might have a 1:M mapping
467        int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
468        boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
469        if (bHasUpper) {
470            resultA |= maskUpperCase;
471        }
472        if (specialMap != -1) {
473            // has mapping, but cannot record the
474            // proper offset; can only flag it and provide special case
475            // code in Character.java
476            offset = -1;
477        }
478        else if (us.hasUpperMap())  {
479            offset = c - us.upperMap;
480        }
481
482        if (us.hasLowerMap()) {
483            resultA |= maskLowerCase;
484            if (offset == 0)
485                offset = us.lowerMap - c;
486            else if (offset != (us.lowerMap - c)) {
487                if (DEBUG) {
488                FAIL("Character " + hex(c) +
489                " has incompatible lowercase and uppercase mappings");
490                }
491            }
492        }
493        if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
494            (bHasUpper && us.hasLowerMap())) {
495            resultA |= maskTitleCase;
496        }
497        if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
498            System.out.println("Warning: Character " + hex4(c) + " has upper but " +
499                               "no title case; Java won't know this");
500        }
501        if (offset < minOffsetSeen) minOffsetSeen = offset;
502        if (offset > maxOffsetSeen) maxOffsetSeen = offset;
503        if (offset > maxOffset || offset < minOffset) {
504            if (DEBUG) {
505            FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
506            }
507            offset = maskCase;
508        }
509        resultA |= ((offset & maskCase) << shiftCaseOffset);
510
511        // record lexical info about this character
512        if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
513                || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
514                || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
515                || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
516                || us.generalCategory == UnicodeSpec.OTHER_LETTER
517                || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
518            resultA |= valueJavaUnicodeStart;
519        }
520        else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
521                || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
522                || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
523            resultA |= valueJavaUnicodePart;
524        }
525        else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
526            resultA |= valueJavaStartUnicodePart;
527        }
528        else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
529            resultA |= valueJavaOnlyStart;
530        }
531        else if (((c >= 0x0000) && (c <= 0x0008))
532                || ((c >= 0x000E) && (c <= 0x001B))
533                || ((c >= 0x007F) && (c <= 0x009F))
534                || us.generalCategory == UnicodeSpec.FORMAT) {
535            resultA |= valueIgnorable;
536        }
537        else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
538                || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
539                || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
540            if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
541        }
542        else if (((c >= 0x0009) && (c <= 0x000D))
543                || ((c >= 0x001C) && (c <= 0x001F))) {
544            resultA |= valueJavaWhitespace;
545        }
546
547        // record bidi category
548        if (!nobidi) {
549            int tmpBidi =
550                (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
551                    us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
552            resultA |= tmpBidi;
553        }
554
555        // record mirrored property
556        if (!nomirror) {
557            resultA |= us.mirrored ? maskMirrored : 0;
558        }
559
560        if (identifiers) {
561            long replacement = 0;
562            if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
563                replacement |= bitJavaStart;
564            }
565            if ( ((resultA & nonzeroJavaPart) != 0)
566                    && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
567                replacement |= bitJavaPart;
568            }
569            resultA = replacement;
570        }
571        return resultA;
572    }
573
574    static void addExProp(long[] map, PropList propList, String prop, long mask) {
575        List<Integer> cps = propList.codepoints(prop);
576        if (cps != null) {
577            for (Integer cp : cps) {
578                if (cp < map.length)
579                    map[cp] |= mask;
580            }
581        }
582    }
583
584    /**
585    * This is the heart of the table compression strategy.  The inputs are a map
586    * and a number of bits (size).  The map is simply an array of long integer values;
587    * the number of bits indicates how index values for that map are to be split.
588    * The length of the given map must be a multiple of (1 << size).  The result is
589    * a new map z and a compressed table t such that for every valid index value k
590    * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
591    *
592    * In other words, the index k can be split into two parts, namely the "size"
593    * low-order bits and all the remaining high-order bits; the high-order bits are then
594    * remapped by map z to produce an index into table t.  In effect, the data of the
595    * original map m is broken up into blocks of size (1<<size); the compression relies
596    * on the expectation that many of these blocks will be identical and therefore need
597    * be represented only once in the compressed table t.
598    *
599    * This method is intended to be used iteratively.  The first map to be handed
600    * to it is the one constructed by method buildMap.  After that, the first of the
601    * two arrays returned by this method is fed back into it for further compression.
602    * At the end of the iteration, one has a starter map and a sequence of tables.
603    *
604    * The algorithm used to implement this computation is straightforward and not
605    * especially clever.  It uses brute-force linear search (the loop labeled MIDDLE)
606    * to locate identical blocks, so overall the time complexity of the algorithm
607    * is quadratic in the length of the input map.  Fortunately, speed is not crucial
608    * to this application.
609    *
610    * @param map                a map to be compressed
611    * @param size       the number of index bits to be split off by the compression
612    * @return   an array of length 2 containing two arrays; the first is a new map
613    *           and the second is a compressed data table
614    *
615    * @see GenerateCharacter#buildMap
616    */
617
618    static long[][] buildTable(long[] map, int size) {
619        int n = map.length;
620        if (((n >> size) << size) != n) {
621            FAIL("Length " + n + " is not a multiple of " + (1 << size));
622        }
623        int m = 1 << size;
624        // We know the final length of the new map up front.
625        long[] newmap = new long[n >> size];
626        // The buffer is used temporarily to hold data for the compressed table
627        // because we don't know its final length yet.
628        long[] buffer = new long[n];
629        int ptr = 0;
630OUTER:  for (int i = 0; i < n; i += m) {
631            // For every block of size m in the original map...
632    MIDDLE: for (int j = 0; j < ptr; j += m) {
633            // Find out whether there is already a block just like it in the buffer.
634                for (int k = 0; k < m; k++) {
635                    if (buffer[j+k] != map[i+k])
636                        continue MIDDLE;
637                }
638                // There is a block just like it at position j, so just
639                // put its index into the new map (thereby sharing it).
640                newmap[i >> size] = (j >> size);
641                continue OUTER;
642            } // end MIDDLE
643            // There is no block just like it already, so add it to
644            // the buffer and put its index into the new map.
645            for (int k = 0; k < m; k++) {
646                buffer[ptr+k] = map[i+k];
647            }
648            newmap[i >> size] = (ptr >> size);
649            ptr += m;
650        } // end OUTER
651        // Now we know how long the compressed table should be,
652        // so create a new array and copy data from the temporary buffer.
653        long[] newdata = new long[ptr];
654        for (int j = 0; j < ptr; j++) {
655            newdata[j] = buffer[j];
656        }
657        // Return the new map and the new data table.
658        long[][] result = { newmap, newdata };
659        return result;
660    }
661
662    /**
663    * Once the compressed tables have been computed, this method reads in a
664    * template file for the source code to be generated and writes out the final
665    * source code by acting as a sort of specialized macro processor.
666    *
667    * The first output line is a comment saying that the file was automatically
668    * generated; it includes a timestamp.  All other output is generated by
669    * reading a line from the template file, performing macro replacements,
670    * and then writing the resulting line or lines of code to the output file.
671    *
672    * This method handles the I/O, the timestamp comment, and the locating of
673    * macro calls within each input line.  The method replaceCommand is called
674    * to generate replacement text for each macro call.
675    *
676    * Macro calls to be replaced are indicated in the template file by
677    * occurrences of the commandMarker "$$".  The rest of the call may consist
678    * of Java letters (including the underscore "_") and also of balanced
679    * parentheses.
680    *
681    * @param theTemplateFileName
682    *           the file name for the template input file
683    * @param theOutputFileName
684    *           the file name for the source code output file
685    *
686    *     @see GenerateCharacter#replaceCommand
687    */
688
689    static void generateCharacterClass(String theTemplateFileName,
690                                       String theOutputFileName)
691        throws FileNotFoundException, IOException {
692        BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
693        PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
694        out.println(commentStart +
695            " This file was generated AUTOMATICALLY from a template file " +
696            new java.util.Date() + commentEnd);
697        int marklen = commandMarker.length();
698        LOOP: while(true) {
699            try {
700                String line = in.readLine();
701                if (line == null) break LOOP;
702                int pos = 0;
703                int depth = 0;
704                while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
705                    int newpos = pos + marklen;
706                    char ch = 'x';
707                    SCAN: while (newpos < line.length() &&
708                            (Character.isJavaIdentifierStart(ch = line.charAt(newpos))
709                            || ch == '(' || (ch == ')' && depth > 0))) {
710                        ++newpos;
711                        if (ch == '(') {
712                            ++depth;
713                        }
714                        else if (ch == ')') {
715                            --depth;
716                            if (depth == 0)
717                                break SCAN;
718                        }
719                    }
720                    String replacement = replaceCommand(line.substring(pos + marklen, newpos));
721                    line = line.substring(0, pos) + replacement + line.substring(newpos);
722                    pos += replacement.length();
723                }
724                out.println(line);
725            }
726            catch (IOException e) {
727                break LOOP;
728            }
729        }
730        in.close();
731        out.close();
732    }
733
734    /**
735    * The replaceCommand method takes a command (a macro call without the
736    * leading marker "$$") and computes replacement text for it.
737    *
738    * Most of the commands are simply names of integer constants that are defined
739    * in the source code of this GenerateCharacter class.  The replacement text is
740    * simply the value of the constant as an appropriately formatted integer literal.
741    *
742    * Two cases are more complicated, however.  The command "Tables" causes the
743    * final map and compressed tables to be emitted, with elaborate comments
744    * describing their contents.  (This is actually handled by method genTables.)
745    * The command "Lookup(xxx)", where "xxx" is the name of a variable, generates
746    * an expression that will return the character property data for the character
747    * whose code is the value of the variable "xxx".  (this is handled by method
748    * "genAccess".)
749    *
750    * @param x  a command from the template file to be replaced
751    * @return   the replacement text, as a String
752    *
753    * @see GenerateCharacter#genTables
754    * @see GenerateCharacter#genAccess
755    * @see GenerateCharacter#generateCharacterClass
756    */
757
758    static String replaceCommand(String x) {
759        if (x.equals("Tables")) return genTables();
760        if (x.equals("Initializers")) return genInitializers();
761        if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
762                x.substring(x.length()-1).equals(")") )
763            return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
764        if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
765                x.substring(x.length()-1).equals(")") )
766            return genAccess("B", x.substring(9, x.length()-1), 16);
767        if (x.equals("shiftType")) return Long.toString(shiftType);
768        if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
769        if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
770        if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
771        if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
772        if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
773        if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
774        if (x.equals("maskCase")) return "0x" + hex8(maskCase);
775        if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
776        if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
777        if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
778        if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
779        if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
780        if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
781        if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
782        if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
783        if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
784        if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
785        if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
786        if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
787        if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
788        if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
789        if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
790        if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
791        if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
792        if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
793        if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
794        if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
795        if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
796        if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
797        if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
798        if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
799        if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
800        if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
801        if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
802        if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
803        if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
804        if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
805        if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
806        if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
807        if (x.equals("maskType")) return "0x" + hex(maskType);
808        if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
809        if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
810        if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
811        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
812            return Integer.toString(UnicodeSpec.UNASSIGNED);
813        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
814            return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
815        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
816            return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
817        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
818            return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
819        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
820             return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
821        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
822             return Integer.toString(UnicodeSpec.OTHER_LETTER);
823        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
824             return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
825        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
826             return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
827        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
828             return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
829        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
830             return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
831        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
832             return Integer.toString(UnicodeSpec.OTHER_NUMBER);
833        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
834             return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
835        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
836             return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
837        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
838             return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
839        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
840            return Integer.toString(UnicodeSpec.CONTROL);
841        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
842            return Integer.toString(UnicodeSpec.FORMAT);
843        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
844            return Integer.toString(UnicodeSpec.PRIVATE_USE);
845        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
846            return Integer.toString(UnicodeSpec.SURROGATE);
847        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
848            return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
849        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
850            return Integer.toString(UnicodeSpec.START_PUNCTUATION);
851        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
852            return Integer.toString(UnicodeSpec.END_PUNCTUATION);
853        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
854            return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
855        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
856            return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
857        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
858            return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
859        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
860            return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
861        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
862            return Integer.toString(UnicodeSpec.LETTER_NUMBER);
863        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
864            return Integer.toString(UnicodeSpec.MATH_SYMBOL);
865        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
866            return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
867        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
868            return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
869        if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
870            return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
871        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
872            return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
873        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
874            return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
875        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
876            return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
877        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
878            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
879        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
880            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
881        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
882            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
883        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
884            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
885        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
886            return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
887        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
888            return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
889        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
890            return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
891        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
892            return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
893        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
894            return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
895        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
896            return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
897        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
898            return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
899         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
900            return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
901        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
902            return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
903        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
904            return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
905        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
906            return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
907        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
908            return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
909        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE][UnicodeSpec.LONG]))
910            return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE);
911        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE][UnicodeSpec.LONG]))
912            return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE);
913        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE][UnicodeSpec.LONG]))
914            return Integer.toString(UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE);
915        if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE][UnicodeSpec.LONG]))
916            return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE);
917        FAIL("Unknown text substitution marker " + commandMarker + x);
918        return commandMarker + x;
919    }
920
921    /**
922    * The genTables method generates source code for all the lookup tables
923    * needed to represent the various Unicode character properties.
924    * It simply calls the method genTable once for each table to be generated
925    * and then generates a summary comment.
926    *
927    * @return   the replacement text for the "Tables" command, as a String
928    *
929    * @see GenerateCharacter#genTable
930    * @see GenerateCharacter#replaceCommand
931    */
932    static String genTables() {
933        int n = sizes.length;
934        StringBuffer result = new StringBuffer();
935        // liu : Add a comment showing the source of this table
936        result.append(commentStart + " The following tables and code generated using:" +
937                  commentEnd + "\n  ");
938        result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n  ");
939
940                if (plane == 0 && bLatin1 == false) {
941            genCaseMapTableDeclaration(result);
942            genCaseMapTable(initializers, specialCaseMaps);
943                }
944        int totalBytes = 0;
945        for (int k = 0; k < n - 1; k++) {
946            genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
947                sizes[k+1], false, false, k==0);
948            int s = bytes[k];
949            if (s == 1 && useCharForByte) {
950                s = 2;
951            }
952            totalBytes += tables[k].length * s;
953        }
954        genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
955            sizes[n - 1], false, 0, true, !(identifiers), false);
956
957        // If we ever need more than 32 bits to represent the character properties,
958        // then a table "B" may be needed as well.
959        genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
960
961        totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
962        result.append(commentStart);
963        result.append(" In all, the character property tables require ");
964        result.append(totalBytes).append(" bytes.").append(commentEnd);
965        if (verbose) {
966            System.out.println("The character property tables require "
967                 + totalBytes + " bytes.");
968        }
969        return result.toString();
970    }
971
972    /**
973     * The genInitializers method generates the body of the
974     * ensureInitted() method, which enables lazy initialization of
975     * the case map table and other tables.
976     */
977    static String genInitializers() {
978        return initializers.toString();
979    }
980
981    /**
982     * Return the total number of bytes needed by all tables.  This is a stripped-
983     * down copy of genTables().
984     */
985    static int getTotalBytes() {
986        int n = sizes.length;
987        int totalBytes = 0;
988        for (int k = 0; k < n - 1; k++) {
989            totalBytes += tables[k].length * bytes[k];
990        }
991        totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
992                         + 31) >> 5) << 2);
993        return totalBytes;
994    }
995
996    static void appendEscapedStringFragment(StringBuffer result,
997                                            char[] line,
998                                            int length,
999                                            boolean lastFragment) {
1000        result.append("    \"");
1001        for (int k=0; k<length; ++k) {
1002            result.append("\\u");
1003            result.append(hex4(line[k]));
1004        }
1005        result.append("\"");
1006        result.append(lastFragment ? ";" : "+");
1007        result.append("\n");
1008    }
1009
1010    static String SMALL_INITIALIZER =
1011        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1012        // "            $$name = new $$type[$$size];\n"+
1013        "            int len = $$name_DATA.length();\n"+
1014        "            int j=0;\n"+
1015        "            for (int i=0; i<len; ++i) {\n"+
1016        "                int c = $$name_DATA.charAt(i);\n"+
1017        "                for (int k=0; k<$$entriesPerChar; ++k) {\n"+
1018        "                    $$name[j++] = ($$type)c;\n"+
1019        "                    c >>= $$bits;\n"+
1020        "                }\n"+
1021        "            }\n"+
1022        "            assert (j == $$size);\n"+
1023        "        }\n";
1024
1025    static String SAME_SIZE_INITIALIZER =
1026        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1027        "            assert ($$name_DATA.length() == $$size);\n"+
1028        // "            $$name = new $$type[$$size];\n"+
1029        "            for (int i=0; i<$$size; ++i)\n"+
1030        "                $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
1031        "        }\n";
1032
1033    static String BIG_INITIALIZER =
1034        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1035        // "            $$name = new $$type[$$size];\n"+
1036        "            int len = $$name_DATA.length();\n"+
1037        "            int j=0;\n"+
1038        "            int charsInEntry=0;\n"+
1039        "            $$type entry=0;\n"+
1040        "            for (int i=0; i<len; ++i) {\n"+
1041        "                entry |= $$name_DATA.charAt(i);\n"+
1042        "                if (++charsInEntry == $$charsPerEntry) {\n"+
1043        "                    $$name[j++] = entry;\n"+
1044        "                    entry = 0;\n"+
1045        "                    charsInEntry = 0;\n"+
1046        "                }\n"+
1047        "                else {\n"+
1048        "                    entry <<= 16;\n"+
1049        "                }\n"+
1050        "            }\n"+
1051        "            assert (j == $$size);\n"+
1052        "        }\n";
1053
1054    static String INT32_INITIALIZER =
1055        "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1056        "            char[] data = $$name_DATA.toCharArray();\n"+
1057        "            assert (data.length == ($$size * 2));\n"+
1058        "            int i = 0, j = 0;\n"+
1059        "            while (i < ($$size * 2)) {\n"+
1060        "                int entry = data[i++] << 16;\n"+
1061        "                $$name[j++] = entry | data[i++];\n"+
1062        "            }\n"+
1063        "        }\n";
1064
1065    static void addInitializer(String name, String type, int entriesPerChar,
1066                               int bits, int size) {
1067
1068        String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1069                          ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1070        if (entriesPerChar == -2) {
1071            template = INT32_INITIALIZER;
1072        }
1073        int marklen = commandMarker.length();
1074        int pos = 0;
1075        while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1076            int newpos = pos + marklen;
1077            char ch = 'x';
1078            while (newpos < template.length() &&
1079                   Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1080                   ch != '_') // Don't allow this in token names
1081                ++newpos;
1082            String token = template.substring(pos+marklen, newpos);
1083            String replacement = "ERROR";
1084
1085            if (token.equals("name")) replacement = name;
1086            else if (token.equals("type")) replacement = type;
1087            else if (token.equals("bits")) replacement = ""+bits;
1088            else if (token.equals("size")) replacement = ""+size;
1089            else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1090            else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1091            else FAIL("Unrecognized token: " + token);
1092
1093            template = template.substring(0, pos) + replacement + template.substring(newpos);
1094            pos += replacement.length();
1095        }
1096        initializers.append(template);
1097    }
1098
1099    /**
1100    * The genTable method generates source code for one lookup table.
1101    * Most of the complexity stems from handling various options as to
1102    * the type of the array components, the precise representation of the
1103    * values, the format in which to render each value, the number of values
1104    * to emit on each line of source code, and the kinds of useful comments
1105    * to be generated.
1106    *
1107    * @param result     a StringBuffer, to which the generated source code
1108    *                   text is to be appended
1109    * @param name       the name of the table
1110    * @param table      the table data (an array of long values)
1111    * @param extract    a distance, in bits, by which each entry of the table
1112    *                   is to be right-shifted before it is processed
1113    * @param bits       the number of bits (not bytes) to be used to represent
1114    *                   each table entry
1115    * @param size       the table data is divided up into blocks of size (1<<size);
1116    *                   in this method, this information is used only to affect
1117    *                   how many table values are to be generated per line
1118    * @param preshifted if this flag is true, then the table entries are to be
1119    *                   emitted in a preshifted form; that is, each value should
1120    *                   be left-shifted by the amount "shift", so that this work
1121    *                   is built into the table and need not be performed by an
1122    *                   explicit shift operator at run time
1123    * @param shift      this is the shift amount for preshifting of table entries
1124    * @param hexFormat  if this flag is true, table entries should be emitted as
1125    *                   hexadecimal literals; otherwise decimal literals are used
1126    * @param properties if this flag is true, the table entries are encoded
1127    *                   character properties rather than indexes into yet other tables;
1128    *                   therefore comments describing the encoded properties should
1129    *                   be generated
1130    * @param hexComment if this flag is true, each line of output is labelled with
1131    *                   a hexadecimal comment indicating the character values to
1132    *                   which that line applies; otherwise, decimal values indicating
1133    *                   table indices are generated
1134    *
1135    * @see GenerateCharacter#genTables
1136    * @see GenerateCharacter#replaceCommand
1137    */
1138
1139    static void genTable(StringBuffer result, String name,
1140                         long[] table, int extract, int bits, int size,
1141                         boolean preshifted, int shift, boolean hexFormat,
1142                         boolean properties, boolean hexComment) {
1143
1144        String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
1145            bits == 2 ? (Csyntax ? "unsigned long" : "int") :
1146            bits == 4 ? (Csyntax ? "unsigned long" : "int") :
1147            bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
1148            bits == 16 ? (Csyntax ? "unsigned short" : "char") :
1149            bits == 32 ? (Csyntax ? "unsigned long" : "int") :
1150            (Csyntax ? "int64" : "long");
1151        long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
1152            bits == 2 ? Integer.MAX_VALUE :
1153            bits == 4 ? Integer.MAX_VALUE :
1154            bits == 8 ? Byte.MAX_VALUE :
1155            bits == 16 ? Short.MAX_VALUE :
1156            bits == 32 ? Integer.MAX_VALUE :
1157            Long.MAX_VALUE;
1158        int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
1159        boolean shiftEntries = preshifted && shift != 0;
1160        if (bits == 8 && tableAsString && useCharForByte) {
1161            atype = "char";
1162            maxPosEntry = Character.MAX_VALUE;
1163            entriesPerChar = 1;
1164        }
1165        boolean noConversion = atype.equals("char");
1166
1167        result.append(commentStart);
1168        result.append(" The ").append(name).append(" table has ").append(table.length);
1169        result.append(" entries for a total of ");
1170        int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
1171        if (bits == 8 && useCharForByte) {
1172            sizeOfTable *= 2;
1173        }
1174        result.append(sizeOfTable);
1175        result.append(" bytes.").append(commentEnd).append("\n\n");
1176        if (Csyntax)
1177            result.append("  static ");
1178        else
1179            result.append("  static final ");
1180        result.append(atype);
1181        result.append(" ").append(name).append("[");
1182        if (Csyntax)
1183            result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
1184        if (tableAsString) {
1185            if (noConversion) {
1186                result.append("] = (\n");
1187            } else {
1188                result.append("] = new ").append(atype).append("["+table.length+"];\n  ");
1189                result.append("static final String ").append(name).append("_DATA =\n");
1190            }
1191            int CHARS_PER_LINE = 8;
1192            StringBuffer theString = new StringBuffer();
1193            int entriesInCharSoFar = 0;
1194            char ch = '\u0000';
1195            int charsPerEntry = -entriesPerChar;
1196            for (int j=0; j<table.length; ++j) {
1197                //long entry = table[j] >> extract;
1198                long entry;
1199                if ("A".equals(name))
1200                    entry = (table[j] & 0xffffffffL) >> extract;
1201                else
1202                    entry = (table[j] >> extract);
1203                if (shiftEntries) entry <<= shift;
1204                if (entry >= (1L << bits)) {
1205                    FAIL("Entry too big");
1206                }
1207                if (entriesPerChar > 0) {
1208                    // Pack multiple entries into a character
1209                    ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
1210                    ++entriesInCharSoFar;
1211                    if (entriesInCharSoFar == entriesPerChar) {
1212                        // Character is full
1213                        theString.append(ch);
1214                        entriesInCharSoFar = 0;
1215                        ch = '\u0000';
1216                    }
1217                }
1218                else {
1219                    // Use multiple characters per entry
1220                    for (int k=0; k<charsPerEntry; ++k) {
1221                        ch = (char)(entry >> ((charsPerEntry-1)*16));
1222                        entry <<= 16;
1223                        theString.append(ch);
1224                    }
1225                }
1226            }
1227            if (entriesInCharSoFar > 0) {
1228                while (entriesInCharSoFar < entriesPerChar) {
1229                    ch = (char)((int)ch >> bits);
1230                    ++entriesInCharSoFar;
1231                }
1232                theString.append(ch);
1233                entriesInCharSoFar = 0;
1234            }
1235            result.append(Utility.formatForSource(theString.toString(), "    "));
1236            if (noConversion) {
1237                result.append(").toCharArray()");
1238            }
1239            result.append(";\n\n  ");
1240
1241            if (!noConversion) {
1242                addInitializer(name, atype, entriesPerChar, bits, table.length);
1243            }
1244        }
1245        else {
1246            result.append("] = {");
1247            boolean castEntries = shiftEntries && (bits < 32);
1248            int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
1249                bits == 2 ? 16*4 :
1250                bits == 4 ? 8*4 :
1251                bits == 8 ? 8 :
1252                bits == 16 ? 8 :
1253                bits == 32 ? 4 : 2) :
1254                (bits == 8 ? 8 :
1255                bits == 16 ? 8 : 4);
1256            int printMask = properties ? 0 :
1257            Math.min(1 << size,
1258                printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
1259            int commentShift = ((1 << size) == table.length) ? 0 : size;
1260            int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
1261            long val = 0;
1262            for (int j = 0; j < table.length; j++) {
1263                if ((j & printMask) == 0) {
1264                    while (result.charAt(result.length() - 1) == ' ')
1265                        result.setLength(result.length() - 1);
1266                    result.append("\n    ");
1267                }
1268        PRINT:  {
1269                if (castEntries)
1270                    result.append("(").append(atype).append(")(");
1271                long entry = table[j] >> extract;
1272                int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
1273                int k = j & packMask;
1274                if (bits >= 8)
1275                    val = entry;
1276                else if (k == 0) {
1277                    val = entry;
1278                    break PRINT;
1279                }
1280                else {
1281                    val |= (entry << (k*bits));
1282                    if (k != packMask)
1283                        break PRINT;
1284                }
1285                if (val > maxPosEntry && !Csyntax) { // liu
1286                // For values that are out of range, convert them to in-range negative values.
1287                // Actually, output the '-' and convert them to the negative of the corresponding
1288                // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
1289                    result.append('-');
1290                    val = maxPosEntry + maxPosEntry + 2 - val;
1291                }
1292                if (hexFormat) {
1293                    result.append("0x");
1294                    if (bits == 8)
1295                        result.append(hex2((byte)val));
1296                    else if (bits == 16)
1297                        result.append(hex4((short)val));
1298                    else if (bits == 32 || bits < 8)
1299                        result.append(hex8((int)val));
1300                    else {
1301                        result.append(hex16(val));
1302                        if (!Csyntax)
1303                            result.append("L");
1304                    }
1305                }
1306                else {
1307                    if (bits == 8)
1308                        result.append(dec3(val));
1309                    else if (bits == 64) {
1310                        result.append(dec5(val));
1311                        if (!Csyntax)
1312                            result.append("L");
1313                    }
1314                    else
1315                        result.append(dec5(val));
1316                }
1317                if (shiftEntries)
1318                    result.append("<<").append(shift);
1319                if (castEntries) result.append(")");
1320                if (j < (table.length - 1))
1321                    result.append(", ");
1322                else
1323                    result.append("  ");
1324                if ((j & printMask) == printMask) {
1325                    result.append(" ").append(commentStart).append(" ");
1326                    if (hexComment)
1327                        result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
1328                    else
1329                        result.append(dec3((j & ~commentMask) >> commentShift));
1330                    if (properties) propertiesComments(result, val);
1331                    result.append(commentEnd);
1332                }
1333                } // end PRINT
1334            }
1335            result.append("\n  };\n\n  ");
1336        }
1337    }
1338
1339    static void genCaseMapTableDeclaration(StringBuffer result) {
1340        String myTab = "    ";
1341        result.append(myTab + "static final char[][][] charMap;\n");
1342    }
1343
1344    static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1345        String myTab = "    ";
1346        int ch;
1347        char[] map;
1348        result.append(myTab + "charMap = new char[][][] {\n");
1349        for (int x = 0; x < specialCaseMaps.length; x++) {
1350            ch = specialCaseMaps[x].getCharSource();
1351            map = specialCaseMaps[x].getUpperCaseMap();
1352            result.append(myTab + myTab);
1353            result.append("{ ");
1354            result.append("{\'\\u"+hex4(ch)+"\'}, {");
1355            for (int y = 0; y < map.length; y++) {
1356                result.append("\'\\u"+hex4(map[y])+"\', ");
1357            }
1358            result.append("} },\n");
1359        }
1360        result.append(myTab + "};\n");
1361
1362    }
1363
1364    /**
1365    * The propertiesComments method generates comments describing encoded
1366    * character properties.
1367    *
1368    * @param result     a StringBuffer, to which the generated source code
1369    *                   text is to be appended
1370    * @param val                encoded character properties
1371    *
1372    * @see GenerateCharacter#genTable
1373    */
1374
1375    static void propertiesComments(StringBuffer result, long val) {
1376        result.append("   ");
1377        switch ((int)(val & maskType)) {
1378            case UnicodeSpec.CONTROL:
1379                result.append("Cc");
1380                break;
1381            case UnicodeSpec.FORMAT:
1382                result.append("Cf");
1383                break;
1384            case UnicodeSpec.PRIVATE_USE:
1385                result.append("Co");
1386                break;
1387            case UnicodeSpec.SURROGATE:
1388                result.append("Cs");
1389                break;
1390            case UnicodeSpec.LOWERCASE_LETTER:
1391                result.append("Ll");
1392                break;
1393            case UnicodeSpec.MODIFIER_LETTER:
1394                result.append("Lm");
1395                break;
1396            case UnicodeSpec.OTHER_LETTER:
1397                result.append("Lo");
1398                break;
1399            case UnicodeSpec.TITLECASE_LETTER:
1400                result.append("Lt");
1401                break;
1402            case UnicodeSpec.UPPERCASE_LETTER:
1403                result.append("Lu");
1404                break;
1405            case UnicodeSpec.COMBINING_SPACING_MARK:
1406                result.append("Mc");
1407                break;
1408            case UnicodeSpec.ENCLOSING_MARK:
1409                result.append("Me");
1410                break;
1411            case UnicodeSpec.NON_SPACING_MARK:
1412                result.append("Mn");
1413                break;
1414            case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1415                result.append("Nd");
1416                break;
1417            case UnicodeSpec.LETTER_NUMBER:
1418                result.append("Nl");
1419                break;
1420            case UnicodeSpec.OTHER_NUMBER:
1421                result.append("No");
1422                break;
1423            case UnicodeSpec.CONNECTOR_PUNCTUATION:
1424                result.append("Pc");
1425                break;
1426            case UnicodeSpec.DASH_PUNCTUATION:
1427                result.append("Pd");
1428                break;
1429            case UnicodeSpec.END_PUNCTUATION:
1430                result.append("Pe");
1431                break;
1432            case UnicodeSpec.OTHER_PUNCTUATION:
1433                result.append("Po");
1434                break;
1435            case UnicodeSpec.START_PUNCTUATION:
1436                result.append("Ps");
1437                break;
1438            case UnicodeSpec.CURRENCY_SYMBOL:
1439                result.append("Sc");
1440                break;
1441            case UnicodeSpec.MODIFIER_SYMBOL:
1442                result.append("Sk");
1443                break;
1444            case UnicodeSpec.MATH_SYMBOL:
1445                result.append("Sm");
1446                break;
1447            case UnicodeSpec.OTHER_SYMBOL:
1448                result.append("So");
1449                break;
1450            case UnicodeSpec.LINE_SEPARATOR:
1451                result.append("Zl"); break;
1452            case UnicodeSpec.PARAGRAPH_SEPARATOR:
1453                result.append("Zp");
1454                break;
1455            case UnicodeSpec.SPACE_SEPARATOR:
1456                result.append("Zs");
1457                break;
1458            case UnicodeSpec.UNASSIGNED:
1459                result.append("unassigned");
1460                break;
1461        }
1462
1463        switch ((int)((val & maskBidi) >> shiftBidi)) {
1464            case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1465                result.append(", L");
1466                break;
1467            case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1468                result.append(", R");
1469                break;
1470            case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1471                result.append(", EN");
1472                break;
1473            case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1474                result.append(", ES");
1475                break;
1476            case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1477                result.append(", ET");
1478                break;
1479            case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1480                result.append(", AN");
1481                break;
1482            case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1483                result.append(", CS");
1484                break;
1485            case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1486                result.append(", B");
1487                break;
1488            case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1489                result.append(", S");
1490                break;
1491            case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1492                result.append(", WS");
1493                break;
1494            case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1495                result.append(", ON");
1496                break;
1497        }
1498        if ((val & maskUpperCase) != 0) {
1499            result.append(", hasUpper (subtract ");
1500            result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1501        }
1502        if ((val & maskLowerCase) != 0) {
1503            result.append(", hasLower (add ");
1504            result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1505        }
1506        if ((val & maskTitleCase) != 0) {
1507            result.append(", hasTitle");
1508        }
1509        if ((val & maskIdentifierInfo) == valueIgnorable) {
1510            result.append(", ignorable");
1511        }
1512        if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1513            result.append(", identifier part");
1514        }
1515        if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1516            result.append(", underscore");
1517        }
1518        if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1519            result.append(", whitespace");
1520        }
1521        if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1522            result.append(", currency");
1523        }
1524        if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1525            result.append(", identifier start");
1526        }
1527        if ((val & maskNumericType) == valueDigit) {
1528            result.append(", decimal ");
1529            result.append((val & maskDigitOffset) >> shiftDigitOffset);
1530        }
1531        if ((val & maskNumericType) == valueStrangeNumeric) {
1532            result.append(", strange");
1533        }
1534        if ((val & maskNumericType) == valueJavaSupradecimal) {
1535            result.append(", supradecimal ");
1536            result.append((val & maskDigitOffset) >> shiftDigitOffset);
1537        }
1538    }
1539
1540    static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
1541
1542    static String tableName(int j) { return tableNames[j]; }
1543
1544    /**
1545    * The genAccess method generates source code for one table access expression.
1546    *
1547    * Most of the complexity stems from handling various options as to
1548    * table representation, such as whether it contains values so large that
1549    * they are represented as negative values and whether the table values are
1550    * preshifted.  This method also avoids such "ugly" expressions as shifting
1551    * by distance zero, masking when no masking is necessary, and so on.
1552    * For clarity, it generates expressions that do not rely on operator
1553    * precedence, but otherwise it avoids generating redundant parentheses.
1554    *
1555    * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1556    * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1557    *
1558    * @param tbl                the name of the final table to be accessed
1559    * @param var                the variable name that appeared in parentheses in the
1560    *                           "Lookup" command
1561    * @param bits       the number of bits (not bytes) to be used to represent
1562    *                   the final table entry
1563    * @return   the replacement text for the "Lookup(xxx)" command, as a String
1564    *
1565    * @see GenerateCharacter#replaceCommand
1566    */
1567
1568    static String genAccess(String tbl, String var, int bits) {
1569        String access = null;
1570        int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1571        for (int k = 0; k < sizes.length; k++) {
1572            int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1573            int shift = shifts[k] + offset;
1574            String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1575            int mask = (1 << (sizes[k] - offset)) - 1;
1576            String masked = (k == 0) ? shifted :
1577              "(" + shifted + "&0x" + hex(mask) + ")";
1578            String index = (k == 0) ? masked :
1579             (mask == 0) ? access : "(" + access + "|" + masked + ")";
1580            String indexNoParens = (index.charAt(0) != '(') ? index :
1581                 index.substring(1, index.length() - 1);
1582            String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1583            String fetched = tblname + "[" + indexNoParens + "]";
1584            String zeroextended = (zeroextend[k] == 0) ? fetched :
1585                "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1586            int adjustment = preshifted[k] ? 0 :
1587               sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1588            String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1589                "(" + zeroextended + "<<" + adjustment + ")";
1590            String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1591                (bits == 2) ? "((" + var + "&0xF)<<1)" :
1592                (bits == 4) ? "((" + var + "&7)<<2)" : null;
1593            String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1594                "((" + adjusted + ">>" + bitshift + ")&" +
1595                (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1596            access = extracted;
1597        }
1598        return access;
1599    }
1600
1601    /* The command line arguments are decoded and used to set the following
1602     global variables.
1603     */
1604
1605    static boolean verbose = false;               // set by -verbose or -v
1606    static boolean nobidi = false;                // set by -nobidi
1607    static boolean nomirror = false;              // set by -nomirror
1608    static boolean identifiers = false;           // set by -identifiers
1609    static boolean Csyntax = false;               // set by -c
1610    static String TemplateFileName = null;        // -template; processArgs fills in a default when absent
1611    static String OutputFileName = null;          // -o; processArgs fills in a default when absent
1612    static String UnicodeSpecFileName = null; // liu; -spec; processArgs fills in a default when absent
1613    static String SpecialCasingFileName = null;   // -specialcasing; processArgs fills in a default when absent
1614    static String PropListFileName = null;        // -proplist; processArgs fills in a default when absent
1615    static boolean useCharForByte = false;        // set by -usecharforbyte
1616    static int[] sizes;                           // bit field widths, one per numeric argument (or the defaults)
1617    static int bins = 0; // liu; if > 0, then perform search
1618    static boolean tableAsString = false;         // set by -string
1619    static boolean bLatin1 = false;               // set by -latin1; cleared by -plane with a plane > 0
1620
1621    static String commandLineDescription;         // the command line as given, plus any defaults filled in
1622
1623    /* Other global variables, equal in length to the "sizes" array. */
1624
1625    static int[] shifts;                          // shifts[k] = sum of sizes[k+1..]; set by generateForSizes
1626    static int[] zeroextend;                      // per-table mask applied by genAccess after a fetch; 0 = none
1627    static int[] bytes;                           // 1, 2 or 4 from the next table's length; 0 for the last table
1628    static boolean[] preshifted;                  // false = genAccess must shift the fetched value left itself
1629    static long[][] tables;                       // the lookup tables; the last one derives from the property map
1630
1631
1632    /* Other global variables */
1633    static String commentStart;                   // "/*" for C output, "//" for Java output
1634    static String commentEnd;                     // " */" for C output, "" for Java output
1635
1636    // NOTE(review): presumably accumulates the code registered through
1637    // addInitializer for string-encoded tables -- confirm against that method.
1636    static StringBuffer initializers = new StringBuffer();
1637
1638    /* special casing rules for 1:M toUpperCase mappings */
1639    static SpecialCaseMap[] specialCaseMaps;
1640
1641    /**
1642    * Process the command line arguments.
1643    *
1644    * The allowed flags in command line are:
1645    * <dl>
1646    * <dt> -verbose             <dd> Emit comments to standard output describing
1647    *                                   what's going on during the processing.
1648    * <dt> -nobidi              <dd> Do not include bidi categories in the
1649    *                                   encoded character properties.
    * <dt> -nomirror            <dd> Do not include the mirror property in the
    *                                   encoded character properties.
1652    * <dt> -identifiers         <dd> Generate tables for scanning identifiers only.
1653    * <dt> -c                   <dd> Output code in C syntax instead of Java syntax.
1654    * <dt> -o filename          <dd> Specify output file name.
1655    * <dt> -template filename   <dd> Specify template input file name.
1656    * <dt> -spec filename        <dd> Specify Unicode spec file name.
1657    * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1658    * <dt> -search bins          <dd> Try different partitions into the specified
1659    *                                    number of bins.  E.g., for 2 bins, try
1660    *                                    16 0, 15 1,..., 0 16.
1661    * <dt> -string               <dd> Create table as string.  Only valid with Java
1662    *                                    syntax.
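    * <dt> -latin1               <dd> Create a latin 1 only property table.
    *                                    Also resets the plane number to 0.
    * <dt> -proplist filename    <dd> Specify property list file name.
    * <dt> -plane number         <dd> Specify the plane number.  A number
    *                                    greater than 0 cancels -latin1.
    * <dt> -usecharforbyte       <dd> With -string, emit 8-bit tables as char
    *                                    arrays.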
1664    * </dl>
1665    * In addition, decimal literals may appear as command line arguments;
1666    * each one represents the number of bits of the character to be broken
1667    * off at each lookup step.  If present, they must add up to 16 (the number
1668    * of bits in a char value).  For smaller tables, the last value should
1669    * be 0; values other than the last one may not be zero.  If no such
1670    * numeric values are provided, default values are used.
1671    *
1672    * @param args       the command line arguments, as an array of String
1673    *
1674    * @see GenerateCharacter#main
1675    */
1676
1677    static void processArgs(String[] args) {
1678        StringBuffer desc = new StringBuffer("java GenerateCharacter");
1679        for (int j=0; j<args.length; ++j) {
1680            desc.append(" " + args[j]);
1681        }
1682        for (int j = 0; j < args.length; j++) {
1683            if (args[j].equals("-verbose") || args[j].equals("-v"))
1684                verbose = true;
1685            else if (args[j].equals("-nobidi"))
1686                nobidi = true;
1687            else if (args[j].equals("-nomirror"))
1688                nomirror = true;
1689            else if (args[j].equals("-identifiers"))
1690                identifiers = true;
1691            else if (args[j].equals("-c"))
1692                Csyntax = true;
1693            else if (args[j].equals("-string"))
1694                tableAsString = true;
1695            else if (args[j].equals("-o")) {
1696                if (j == args.length - 1) {
1697                    FAIL("File name missing after -o");
1698                }
1699                else {
1700                    OutputFileName = args[++j];
1701                }
1702            }
1703            else if (args[j].equals("-search")) {
1704                if (j == args.length - 1)
1705                    FAIL("Bin count missing after -search");
1706                else {
1707                    bins = Integer.parseInt(args[++j]);
1708                    if (bins < 1 || bins > 10)
1709                        FAIL("Bin count must be >= 1 and <= 10");
1710                }
1711            }
1712            else if (args[j].equals("-template")) {
1713                if (j == args.length - 1)
1714                    FAIL("File name missing after -template");
1715                else
1716                    TemplateFileName = args[++j];
1717            }
1718            else if (args[j].equals("-spec")) { // liu
1719                if (j == args.length - 1) {
1720                    FAIL("File name missing after -spec");
1721                }
1722                else {
1723                    UnicodeSpecFileName = args[++j];
1724                }
1725            }
1726            else if (args[j].equals("-specialcasing")) {
1727                if (j == args.length -1) {
1728                    FAIL("File name missing after -specialcasing");
1729                }
1730                else {
1731                    SpecialCasingFileName = args[++j];
1732                }
1733            }
1734            else if (args[j].equals("-proplist")) {
1735                if (j == args.length -1) {
1736                    FAIL("File name missing after -proplist");
1737                }
1738                else {
1739                    PropListFileName = args[++j];
1740                }
1741            }
1742            else if (args[j].equals("-plane")) {
1743                if (j == args.length -1) {
1744                    FAIL("Plane number missing after -plane");
1745                }
1746                else {
1747                    plane = Integer.parseInt(args[++j]);
1748                }
1749                if (plane > 0) {
1750                    bLatin1 = false;
1751                }
1752            }
1753            else if ("-usecharforbyte".equals(args[j])) {
1754                useCharForByte = true;
1755            }
1756            else if (args[j].equals("-latin1")) {
1757                bLatin1 = true;
1758                plane = 0;
1759            }
1760            else {
1761                try {
1762                    int val = Integer.parseInt(args[j]);
1763                    if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1764                    if (sizes == null)
1765                        sizes = new int[1];
1766                    else {
1767                        int[] newsizes = new int[sizes.length + 1];
1768                        System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1769                        sizes = newsizes;
1770                    }
1771                    sizes[sizes.length - 1] = val;
1772                }
1773                catch(NumberFormatException e) {
1774                    FAIL("Unknown switch: " + args[j]);
1775                }
1776            }
1777        }
1778        if (Csyntax && tableAsString) {
1779            FAIL("Can't specify table as string with C syntax");
1780        }
1781        if (sizes == null) {
1782            desc.append(" [");
1783            if (identifiers) {
1784                int[] newsizes = { 8, 4, 4 };           // Good default values
1785                desc.append("8 4 4]");
1786                sizes = newsizes;
1787            }
1788            else {
1789                int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1790                desc.append("10 5 1]");
1791                sizes = newsizes;
1792            }
1793        }
1794        if (UnicodeSpecFileName == null) { // liu
1795            UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1796            desc.append(" [-spec " + UnicodeSpecFileName + ']');
1797        }
1798        if (SpecialCasingFileName == null) {
1799            SpecialCasingFileName = DefaultSpecialCasingFileName;
1800            desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1801        }
1802        if (PropListFileName == null) {
1803            PropListFileName = DefaultPropListFileName;
1804            desc.append(" [-proplist " + PropListFileName + ']');
1805        }
1806        if (TemplateFileName == null) {
1807            TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1808                  : DefaultJavaTemplateFileName);
1809            desc.append(" [-template " + TemplateFileName + ']');
1810        }
1811        if (OutputFileName == null) {
1812            OutputFileName = (Csyntax ? DefaultCOutputFileName
1813                    : DefaultJavaOutputFileName);
1814            desc.append(" [-o " + OutputFileName + ']');
1815        }
1816        commentStart = (Csyntax ? "/*" : "//");
1817        commentEnd = (Csyntax ? " */" : "");
1818        commandLineDescription = desc.toString();
1819    }
1820
1821    private static void searchBins(long[] map, int binsOccupied) throws Exception {
1822        int bitsFree = 16;
1823        for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1824        if (binsOccupied == (bins-1)) {
1825            sizes[binsOccupied] = bitsFree;
1826            generateForSizes(map);
1827        }
1828        else {
1829            for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1830                sizes[binsOccupied] = i;
1831                searchBins(map, binsOccupied+1);
1832            }
1833        }
1834    }
1835
1836    private static void generateForSizes(long[] map) throws Exception {
1837        int sum = 0;
1838        shifts = new int[sizes.length];
1839        for (int k = sizes.length - 1; k >= 0; k--) {
1840            shifts[k] = sum;
1841            sum += sizes[k];
1842        }
1843        if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1844            FAIL("Bit field widths total to " + sum +
1845             ": wrong total for map of size " + map.length);
1846        }
1847        // need a table for each set of lookup bits in char
1848        tables = new long[sizes.length][];
1849        // the last table is the map
1850        tables[sizes.length - 1] = map;
1851        for (int j = sizes.length - 1; j > 0; j--) {
1852            if (verbose && bins==0)
1853                System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1854            long[][] temp = buildTable(tables[j], sizes[j]);
1855            tables[j-1] = temp[0];
1856            tables[j] = temp[1];
1857        }
1858        preshifted = new boolean[sizes.length];
1859        zeroextend = new int[sizes.length];
1860        bytes = new int[sizes.length];
1861        for (int j = 0; j < sizes.length - 1; j++) {
1862            int len = tables[j+1].length;
1863            int size = sizes[j+1];
1864            if (len > 0x100 && (len >> size) <= 0x100) {
1865                len >>= size;
1866                preshifted[j] = false;
1867            }
1868            else if (len > 0x10000 && (len >> size) <= 0x10000) {
1869                len >>= size;
1870                preshifted[j] = false;
1871            }
1872            else preshifted[j] = true;
1873            if (Csyntax)
1874                zeroextend[j] = 0;
1875            else if (len > 0x7F && len <= 0xFF) {
1876                if (!useCharForByte) {
1877                    zeroextend[j] = 0xFF;
1878                }
1879            } else if (len > 0x7FFF && len <= 0xFFFF)
1880                zeroextend[j] = 0xFFFF;
1881            else zeroextend[j] = 0;
1882            if (len <= 0x100) bytes[j] = 1;
1883            else if (len <= 0x10000) bytes[j] = 2;
1884            else bytes[j] = 4;
1885        }
1886        preshifted[sizes.length - 1] = true;
1887        zeroextend[sizes.length - 1] = 0;
1888        bytes[sizes.length - 1] = 0;
1889        if (bins > 0) {
1890            int totalBytes = getTotalBytes();
1891            String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1892            int accessComplexity = 0;
1893            for (int j=0; j<access.length(); ++j) {
1894                char ch = access.charAt(j);
1895                if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1896                if (ch == '<' || ch == '>') ++j;
1897            }
1898            System.out.print("(");
1899            for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1900            System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1901            return;
1902        }
1903        if (verbose) {
1904            System.out.println("    n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1905            for (int j = 0; j < sizes.length; j++) {
1906                System.out.println(dec5(j) + "\t" +
1907                    dec5(sizes[j]) + "\t" +
1908                    dec5(tables[j].length) + "\t" +
1909                    dec5(shifts[j]) + "\t" +
1910                    dec5(zeroextend[j]) + "\t" +
1911                    dec5(bytes[j]) + "\t " +
1912                    preshifted[j]);
1913            }
1914        }
1915        if (verbose) {
1916            System.out.println("Generating source code for class Character");
1917            System.out.println("A table access looks like " +
1918                         genAccess("A", "ch", (identifiers ? 2 : 32)));
1919        }
1920        generateCharacterClass(TemplateFileName, OutputFileName);
1921    }
1922
1923    /**
1924    * The main program for generating source code for the Character class.
1925    * The basic outline of its operation is:
1926    * <ol>
1927    * <li> Process the command line arguments.  One result of this process
1928    *           is a list of sizes (measured in bits and summing to 16).
1929    * <li> Get the Unicode character property data from the specification file.
1930    * <li> From that, build a map that has, for each character code, its
1931    *           relevant properties encoded as a long integer value.
1932    * <li> Repeatedly compress the map, producing a compressed table and a
1933    *           new map.  This is done once for each size value in the list.
1934    *           When this is done, we have a set of tables.
1935    * <li> Make some decisions about table representation; record these
1936    *           decisions in arrays named preshifted, zeroextend, and bytes.
1937    * <li> Generate the source code for the class Character by performing
1938    *           macro processing on a template file.
1939    * </ol>
1940    *
1941    * @param args       the command line arguments, as an array of String
1942    *
1943    * @see GenerateCharacter#processArgs
    * @see UnicodeSpec#readSpecFile
1945    * @see GenerateCharacter#buildMap
1946    * @see GenerateCharacter#buildTable
1947    * @see GenerateCharacter#generateCharacterClass
1948    */
1949
1950    public static void main(String[] args) {
1951        processArgs(args);
1952        try {
1953
1954            UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1955            specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1956            PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1957
1958            if (verbose) {
1959                System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1960            }
1961            long[] map = buildMap(data, specialCaseMaps, propList);
1962            if (verbose) {
1963                System.err.println("Completed building of initial map");
1964            }
1965
1966            if (bins == 0) {
1967                generateForSizes(map);
1968            }
1969            else {
1970                while (bins > 0) {
1971                    sizes = new int[bins];
1972                    searchBins(map, 0);
1973                    --bins;
1974                }
1975            }
1976            if (verbose && false) {
1977                System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
1978                             hex8(maxOffsetSeen));
1979                System.out.println("          allowed: -" + hex8(-minOffset) + "..+" +
1980                             hex8(maxOffset));
1981            }
1982        }
1983        catch (FileNotFoundException e) { FAIL(e.toString()); }
1984        catch (IOException e) { FAIL(e.toString()); }
1985        catch (Throwable e) {
1986            System.out.println("Unexpected exception:");
1987            e.printStackTrace();
1988            FAIL("Unexpected exception!");
1989        }
1990        if (verbose) { System.out.println("Done!");}
1991    }
1992
1993}   // end class
1994