1/*
2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25/*
26 *******************************************************************************
27 * Copyright (C) 1996-2014, International Business Machines Corporation and
28 * others. All Rights Reserved.
29 *******************************************************************************
30 */
31
32package sun.text.normalizer;
33
34import java.io.IOException;
35import java.nio.ByteBuffer;
36import java.util.Iterator;
37import java.util.MissingResourceException;
38
39import sun.text.normalizer.UCharacter.HangulSyllableType;
40import sun.text.normalizer.UCharacter.NumericType;
41
42/**
43* <p>Internal class used for Unicode character property database.</p>
44* <p>This classes store binary data read from uprops.icu.
45* It does not have the capability to parse the data into more high-level
46* information. It only returns bytes of information when required.</p>
47* <p>Due to the form most commonly used for retrieval, array of char is used
48* to store the binary data.</p>
49* <p>UCharacterPropertyDB also contains information on accessing indexes to
50* significant points in the binary data.</p>
51* <p>Responsibility for molding the binary data into more meaning form lies on
52* <a href=UCharacter.html>UCharacter</a>.</p>
53* @author Syn Wee Quek
54* @since release 2.1, february 1st 2002
55*/
56
57final class UCharacterProperty
58{
59    // public data members -----------------------------------------------
60
61    /*
62     * public singleton instance
63     */
64    public static final UCharacterProperty INSTANCE;
65
66    /**
67    * Trie data
68    */
69    public Trie2_16 m_trie_;
70
71    /**
72    * Unicode version
73    */
74    public VersionInfo m_unicodeVersion_;
75
76    /**
77    * Character type mask
78    */
79    public static final int TYPE_MASK = 0x1F;
80
81    // uprops.h enum UPropertySource --------------------------------------- ***
82
83    /** From uchar.c/uprops.icu main trie */
84    public static final int SRC_CHAR=1;
85    /** From uchar.c/uprops.icu properties vectors trie */
86    public static final int SRC_PROPSVEC=2;
87    /** From ubidi_props.c/ubidi.icu */
88    public static final int SRC_BIDI=5;
89    /** From normalizer2impl.cpp/nfc.nrm */
90    public static final int SRC_NFC=8;
91    /** From normalizer2impl.cpp/nfkc.nrm */
92    public static final int SRC_NFKC=9;
93
94    // public methods ----------------------------------------------------
95
96    /**
97    * Gets the main property value for code point ch.
98    * @param ch code point whose property value is to be retrieved
99    * @return property value of code point
100    */
101    public final int getProperty(int ch)
102    {
103        return m_trie_.get(ch);
104    }
105
106    /**
107     * Gets the unicode additional properties.
108     * Java version of C u_getUnicodeProperties().
109     * @param codepoint codepoint whose additional properties is to be
110     *                  retrieved
111     * @param column The column index.
112     * @return unicode properties
113     */
114    public int getAdditional(int codepoint, int column) {
115        assert column >= 0;
116        if (column >= m_additionalColumnsCount_) {
117            return 0;
118        }
119        return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
120    }
121
122    /**
123     * <p>Get the "age" of the code point.</p>
124     * <p>The "age" is the Unicode version when the code point was first
125     * designated (as a non-character or for Private Use) or assigned a
126     * character.</p>
127     * <p>This can be useful to avoid emitting code points to receiving
128     * processes that do not accept newer characters.</p>
129     * <p>The data is from the UCD file DerivedAge.txt.</p>
130     * <p>This API does not check the validity of the codepoint.</p>
131     * @param codepoint The code point.
132     * @return the Unicode version number
133     */
134    public VersionInfo getAge(int codepoint)
135    {
136        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
137        return VersionInfo.getInstance(
138                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
139                           version & LAST_NIBBLE_MASK_, 0, 0);
140    }
141
142    // int-value and enumerated properties --------------------------------- ***
143
144    public int getType(int c) {
145        return getProperty(c)&TYPE_MASK;
146    }
147
148    /*
149     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
150     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
151     */
152    private static final int /* UHangulSyllableType */ gcbToHst[]={
153        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
154        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
155        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
156        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
157        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
158        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
159        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
160        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
161        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
162        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
163        /*
164         * Omit GCB values beyond what we need for hst.
165         * The code below checks for the array length.
166         */
167    };
168
169    private class IntProperty {
170        int column;  // SRC_PROPSVEC column, or "source" if mask==0
171        int mask;
172        int shift;
173
174        IntProperty(int column, int mask, int shift) {
175            this.column=column;
176            this.mask=mask;
177            this.shift=shift;
178        }
179
180        IntProperty(int source) {
181            this.column=source;
182            this.mask=0;
183        }
184
185        int getValue(int c) {
186            // systematic, directly stored properties
187            return (getAdditional(c, column)&mask)>>>shift;
188        }
189    }
190
191    private class BiDiIntProperty extends IntProperty {
192        BiDiIntProperty() {
193            super(SRC_BIDI);
194        }
195    }
196
197    private class CombiningClassIntProperty extends IntProperty {
198        CombiningClassIntProperty(int source) {
199            super(source);
200        }
201    }
202
203    private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
204        int which;
205        int max;
206
207        NormQuickCheckIntProperty(int source, int which, int max) {
208            super(source);
209            this.which=which;
210            this.max=max;
211        }
212    }
213
214    private IntProperty intProp =  new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
215        int getValue(int c) {
216            return UBiDiProps.INSTANCE.getPairedBracketType(c);
217        }
218    };
219
220    public int getIntPropertyValue(int c, int which) {
221        if (which == BIDI_PAIRED_BRACKET_TYPE) {
222            return intProp.getValue(c);
223        }
224        return 0; // undefined
225    }
226
227    /**
228    * Forms a supplementary code point from the argument character<br>
229    * Note this is for internal use hence no checks for the validity of the
230    * surrogate characters are done
231    * @param lead lead surrogate character
232    * @param trail trailing surrogate character
233    * @return code point of the supplementary character
234    */
235    public static int getRawSupplementary(char lead, char trail)
236    {
237        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
238    }
239
240    /**
241     * Gets the type mask
242     * @param type character type
243     * @return mask
244     */
245    public static final int getMask(int type)
246    {
247        return 1 << type;
248    }
249
250    /**
251     * Returns the digit values of characters like 'A' - 'Z', normal,
252     * half-width and full-width. This method assumes that the other digit
253     * characters are checked by the calling method.
254     * @param ch character to test
255     * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
256     *         its corresponding digit will be returned.
257     */
258    public static int getEuropeanDigit(int ch) {
259        if ((ch > 0x7a && ch < 0xff21)
260            || ch < 0x41 || (ch > 0x5a && ch < 0x61)
261            || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
262            return -1;
263        }
264        if (ch <= 0x7a) {
265            // ch >= 0x41 or ch < 0x61
266            return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
267        }
268        // ch >= 0xff21
269        if (ch <= 0xff3a) {
270            return ch + 10 - 0xff21;
271        }
272        // ch >= 0xff41 && ch <= 0xff5a
273        return ch + 10 - 0xff41;
274    }
275
276    public int digit(int c) {
277        int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
278        if(value<=9) {
279            return value;
280        } else {
281            return -1;
282        }
283    }
284
285    // protected variables -----------------------------------------------
286
287    /**
288     * Extra property trie
289     */
290    Trie2_16 m_additionalTrie_;
291    /**
292     * Extra property vectors, 1st column for age and second for binary
293     * properties.
294     */
295    int m_additionalVectors_[];
296    /**
297     * Number of additional columns
298     */
299    int m_additionalColumnsCount_;
300    /**
301     * Maximum values for block, bits used as in vector word
302     * 0
303     */
304    int m_maxBlockScriptValue_;
305    /**
306     * Maximum values for script, bits used as in vector word
307     * 0
308     */
309     int m_maxJTGValue_;
310    /**
311     * Script_Extensions data
312     */
313    public char[] m_scriptExtensions_;
314
315    // private variables -------------------------------------------------
316
317    /**
318    * Default name of the datafile
319    */
320    private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
321
322    /**
323    * Shift value for lead surrogate to form a supplementary character.
324    */
325    private static final int LEAD_SURROGATE_SHIFT_ = 10;
326    /**
327    * Offset to add to combined surrogate pair to avoid masking.
328    */
329    private static final int SURROGATE_OFFSET_ =
330                           UTF16.SUPPLEMENTARY_MIN_VALUE -
331                           (UTF16.SURROGATE_MIN_VALUE <<
332                           LEAD_SURROGATE_SHIFT_) -
333                           UTF16.TRAIL_SURROGATE_MIN_VALUE;
334
335
336    // property data constants -------------------------------------------------
337
338    /**
339     * Numeric types and values in the main properties words.
340     */
341    private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
342    private static final int getNumericTypeValue(int props) {
343        return props >> NUMERIC_TYPE_VALUE_SHIFT_;
344    }
345
346    /* constants for the storage form of numeric types and values */
347    /** No numeric value. */
348    private static final int NTV_NONE_ = 0;
349    /** Decimal digits: nv=0..9 */
350    private static final int NTV_DECIMAL_START_ = 1;
351    /** Other digits: nv=0..9 */
352    private static final int NTV_DIGIT_START_ = 11;
353    /** Small integers: nv=0..154 */
354    private static final int NTV_NUMERIC_START_ = 21;
355
356    private static final int ntvGetType(int ntv) {
357        return
358            (ntv==NTV_NONE_) ? NumericType.NONE :
359            (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
360            (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
361            NumericType.NUMERIC;
362    }
363
364    /*
365     * Properties in vector word 0
366     * Bits
367     * 31..24   DerivedAge version major/minor one nibble each
368     * 23..22   3..1: Bits 7..0 = Script_Extensions index
369     *             3: Script value from Script_Extensions
370     *             2: Script=Inherited
371     *             1: Script=Common
372     *             0: Script=bits 7..0
373     * 21..20   reserved
374     * 19..17   East Asian Width
375     * 16.. 8   UBlockCode
376     *  7.. 0   UScriptCode
377     */
378    /**
379     * Script_Extensions: mask includes Script
380     */
381    public static final int SCRIPT_X_MASK = 0x00c000ff;
382    //private static final int SCRIPT_X_SHIFT = 22;
383    /**
384     * Integer properties mask and shift values for East Asian cell width.
385     * Equivalent to icu4c UPROPS_EA_MASK
386     */
387    private static final int EAST_ASIAN_MASK_ = 0x000e0000;
388    /**
389     * Integer properties mask and shift values for East Asian cell width.
390     * Equivalent to icu4c UPROPS_EA_SHIFT
391     */
392    private static final int EAST_ASIAN_SHIFT_ = 17;
393    /**
394     * Integer properties mask and shift values for blocks.
395     * Equivalent to icu4c UPROPS_BLOCK_MASK
396     */
397    private static final int BLOCK_MASK_ = 0x0001ff00;
398    /**
399     * Integer properties mask and shift values for blocks.
400     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
401     */
402    private static final int BLOCK_SHIFT_ = 8;
403    /**
404     * Integer properties mask and shift values for scripts.
405     * Equivalent to icu4c UPROPS_SHIFT_MASK
406     */
407    public static final int SCRIPT_MASK_ = 0x000000ff;
408
409    /**
410     * Additional properties used in internal trie data
411     */
412    /*
413     * Properties in vector word 1
414     * Each bit encodes one binary property.
415     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
416     * UPROPS_BINARY_1_TOP<=32!
417     *
418     * Keep this list of property enums in sync with
419     * propListNames[] in icu/source/tools/genprops/props2.c!
420     *
421     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
422     */
423    private static final int WHITE_SPACE_PROPERTY_ = 0;
424    private static final int DASH_PROPERTY_ = 1;
425    private static final int HYPHEN_PROPERTY_ = 2;
426    private static final int QUOTATION_MARK_PROPERTY_ = 3;
427    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
428    private static final int MATH_PROPERTY_ = 5;
429    private static final int HEX_DIGIT_PROPERTY_ = 6;
430    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
431    private static final int ALPHABETIC_PROPERTY_ = 8;
432    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
433    private static final int DIACRITIC_PROPERTY_ = 10;
434    private static final int EXTENDER_PROPERTY_ = 11;
435    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
436    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
437    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
438    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
439    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
440    private static final int RADICAL_PROPERTY_ = 17;
441    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
442    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
443    private static final int DEPRECATED_PROPERTY_ = 20;
444    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
445    private static final int XID_START_PROPERTY_ = 22;
446    private static final int XID_CONTINUE_PROPERTY_ = 23;
447    private static final int ID_START_PROPERTY_    = 24;
448    private static final int ID_CONTINUE_PROPERTY_ = 25;
449    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
450    private static final int S_TERM_PROPERTY_ = 27;
451    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
452    private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
453    private static final int PATTERN_WHITE_SPACE = 30;
454
455    /*
456     * Properties in vector word 2
457     * Bits
458     * 31..26   reserved
459     * 25..20   Line Break
460     * 19..15   Sentence Break
461     * 14..10   Word Break
462     *  9.. 5   Grapheme Cluster Break
463     *  4.. 0   Decomposition Type
464     */
465    private static final int LB_MASK          = 0x03f00000;
466    private static final int LB_SHIFT         = 20;
467
468    private static final int SB_MASK          = 0x000f8000;
469    private static final int SB_SHIFT         = 15;
470
471    private static final int WB_MASK          = 0x00007c00;
472    private static final int WB_SHIFT         = 10;
473
474    private static final int GCB_MASK         = 0x000003e0;
475    private static final int GCB_SHIFT        = 5;
476
477    /**
478     * Integer properties mask for decomposition type.
479     * Equivalent to icu4c UPROPS_DT_MASK.
480     */
481    private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
482
483    /**
484     * First nibble shift
485     */
486    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
487    /**
488     * Second nibble mask
489     */
490    private static final int LAST_NIBBLE_MASK_ = 0xF;
491    /**
492     * Age value shift
493     */
494    private static final int AGE_SHIFT_ = 24;
495
496    // private constructors --------------------------------------------------
497
498    /**
499     * Constructor
500     * @exception IOException thrown when data reading fails or data corrupted
501     */
502    private UCharacterProperty() throws IOException
503    {
504        // jar access
505        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
506        m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
507        // Read or skip the 16 indexes.
508        int propertyOffset = bytes.getInt();
509        /* exceptionOffset = */ bytes.getInt();
510        /* caseOffset = */ bytes.getInt();
511        int additionalOffset = bytes.getInt();
512        int additionalVectorsOffset = bytes.getInt();
513        m_additionalColumnsCount_ = bytes.getInt();
514        int scriptExtensionsOffset = bytes.getInt();
515        int reservedOffset7 = bytes.getInt();
516        /* reservedOffset8 = */ bytes.getInt();
517        /* dataTopOffset = */ bytes.getInt();
518        m_maxBlockScriptValue_ = bytes.getInt();
519        m_maxJTGValue_ = bytes.getInt();
520        ICUBinary.skipBytes(bytes, (16 - 12) << 2);
521
522        // read the main properties trie
523        m_trie_ = Trie2_16.createFromSerialized(bytes);
524        int expectedTrieLength = (propertyOffset - 16) * 4;
525        int trieLength = m_trie_.getSerializedLength();
526        if(trieLength > expectedTrieLength) {
527            throw new IOException("uprops.icu: not enough bytes for main trie");
528        }
529        // skip padding after trie bytes
530        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
531
532        // skip unused intervening data structures
533        ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
534
535        if(m_additionalColumnsCount_ > 0) {
536            // reads the additional property block
537            m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
538            expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
539            trieLength = m_additionalTrie_.getSerializedLength();
540            if(trieLength > expectedTrieLength) {
541                throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
542            }
543            // skip padding after trie bytes
544            ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
545
546            // additional properties
547            int size = scriptExtensionsOffset - additionalVectorsOffset;
548            m_additionalVectors_ = new int[size];
549            for (int i = 0; i < size; i ++) {
550                m_additionalVectors_[i] = bytes.getInt();
551            }
552        }
553
554        // Script_Extensions
555        int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
556        if(numChars > 0) {
557            m_scriptExtensions_ = new char[numChars];
558            for(int i = 0; i < numChars; ++i) {
559                m_scriptExtensions_[i] = bytes.getChar();
560            }
561        }
562    }
563
564    private static final class IsAcceptable implements ICUBinary.Authenticate {
565        // @Override when we switch to Java 6
566        public boolean isDataVersionAcceptable(byte version[]) {
567            return version[0] == 7;
568        }
569    }
570
571    private static final int DATA_FORMAT = 0x5550726F;  // "UPro"
572
573    public void upropsvec_addPropertyStarts(UnicodeSet set) {
574        /* add the start code point of each same-value range of the properties vectors trie */
575        if(m_additionalColumnsCount_>0) {
576            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
577            Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
578            Trie2.Range range;
579            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
580                set.add(range.startCodePoint);
581            }
582        }
583    }
584
585    // This static initializer block must be placed after
586    // other static member initialization
587    static {
588        try {
589            INSTANCE = new UCharacterProperty();
590        }
591        catch (IOException e) {
592            throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"");
593        }
594    }
595
596
597    // Moved from UProperty.java
598    /**
599     * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
600     * Used in UAX #9: Unicode Bidirectional Algorithm
601     * (http://www.unicode.org/reports/tr9/)
602     * Returns UCharacter.BidiPairedBracketType values.
603     * @stable ICU 52
604     */
605    public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
606
607}
608