1/*
2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/*
27 ******************************************************************************
28 * Copyright (C) 1996-2014, International Business Machines Corporation and
29 * others. All Rights Reserved.
30 ******************************************************************************
31 */
32
33package sun.text.normalizer;
34
35import java.io.DataInputStream;
36import java.io.InputStream;
37import java.io.IOException;
38
39/**
40 * <p>A trie is a kind of compressed, serializable table of values
41 * associated with Unicode code points (0..0x10ffff).</p>
42 * <p>This class defines the basic structure of a trie and provides methods
43 * to <b>retrieve the offsets to the actual data</b>.</p>
44 * <p>Data will be the form of an array of basic types, char or int.</p>
45 * <p>The actual data format will have to be specified by the user in the
46 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
47 * <p>This trie implementation is optimized for getting offset while walking
48 * forward through a UTF-16 string.
49 * Therefore, the simplest and fastest access macros are the
50 * fromLead() and fromOffsetTrail() methods.
51 * The fromBMP() method are a little more complicated; they get offsets even
52 * for lead surrogate codepoints, while the fromLead() method get special
53 * "folded" offsets for lead surrogate code units if there is relevant data
54 * associated with them.
55 * From such a folded offsets, an offset needs to be extracted to supply
56 * to the fromOffsetTrail() methods.
57 * To handle such supplementary codepoints, some offset information are kept
58 * in the data.</p>
59 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
60 * that offset from the folded value for the lead surrogate unit.</p>
61 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
62 * com.ibm.icu.impl.IntTrie.</p>
63 * @author synwee
64 * @see com.ibm.icu.impl.CharTrie
65 * @see com.ibm.icu.impl.IntTrie
66 * @since release 2.1, Jan 01 2002
67 */
68public abstract class Trie
69{
70    // public class declaration ----------------------------------------
71
72    /**
73    * Character data in com.ibm.impl.Trie have different user-specified format
74    * for different purposes.
75    * This interface specifies methods to be implemented in order for
76    * com.ibm.impl.Trie, to surrogate offset information encapsulated within
77    * the data.
78    */
79    public static interface DataManipulate
80    {
81        /**
82        * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
83        * data
84        * the index array offset of the indexes for that lead surrogate.
85        * @param value data value for a surrogate from the trie, including the
86        *        folding offset
87        * @return data offset or 0 if there is no data for the lead surrogate
88        */
89        public int getFoldingOffset(int value);
90    }
91
92    // default implementation
93    private static class DefaultGetFoldingOffset implements DataManipulate {
94        public int getFoldingOffset(int value) {
95            return value;
96        }
97    }
98
99    // protected constructor -------------------------------------------
100
101    /**
102    * Trie constructor for CharTrie use.
103    * @param inputStream ICU data file input stream which contains the
104    *                        trie
105    * @param dataManipulate object containing the information to parse the
106    *                       trie data
107    * @throws IOException thrown when input stream does not have the
108    *                        right header.
109    */
110    protected Trie(InputStream inputStream,
111                   DataManipulate  dataManipulate) throws IOException
112    {
113        DataInputStream input = new DataInputStream(inputStream);
114        // Magic number to authenticate the data.
115        int signature = input.readInt();
116        m_options_    = input.readInt();
117
118        if (!checkHeader(signature)) {
119            throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
120        }
121
122        if(dataManipulate != null) {
123            m_dataManipulate_ = dataManipulate;
124        } else {
125            m_dataManipulate_ = new DefaultGetFoldingOffset();
126        }
127        m_isLatin1Linear_ = (m_options_ &
128                             HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
129        m_dataOffset_     = input.readInt();
130        m_dataLength_     = input.readInt();
131        unserialize(inputStream);
132    }
133
134    // protected data members ------------------------------------------
135
136    /**
137     * Lead surrogate code points' index displacement in the index array.
138     * <pre>{@code
139     * 0x10000-0xd800=0x2800
140     * 0x2800 >> INDEX_STAGE_1_SHIFT_
141     * }</pre>
142     */
143    protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
144    /**
145     * Shift size for shifting right the input index. 1..9
146     */
147    protected static final int INDEX_STAGE_1_SHIFT_ = 5;
148    /**
149     * Shift size for shifting left the index array values.
150     * Increases possible data size with 16-bit index values at the cost
151     * of compactability.
152     * This requires blocks of stage 2 data to be aligned by
153     * DATA_GRANULARITY.
154     * 0..INDEX_STAGE_1_SHIFT
155     */
156    protected static final int INDEX_STAGE_2_SHIFT_ = 2;
157    /**
158     * Number of data values in a stage 2 (data array) block.
159     */
160    protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
161    /**
162     * Mask for getting the lower bits from the input index.
163     * DATA_BLOCK_LENGTH - 1.
164     */
165    protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
166    /**
167     * Surrogate mask to use when shifting offset to retrieve supplementary
168     * values
169     */
170    protected static final int SURROGATE_MASK_ = 0x3FF;
171    /**
172     * Index or UTF16 characters
173     */
174    protected char m_index_[];
175    /**
176     * Internal TrieValue which handles the parsing of the data value.
177     * This class is to be implemented by the user
178     */
179    protected DataManipulate m_dataManipulate_;
180    /**
181     * Start index of the data portion of the trie. CharTrie combines
182     * index and data into a char array, so this is used to indicate the
183     * initial offset to the data portion.
184     * Note this index always points to the initial value.
185     */
186    protected int m_dataOffset_;
187    /**
188     * Length of the data array
189     */
190    protected int m_dataLength_;
191
192    // protected methods -----------------------------------------------
193
194    /**
195    * Gets the offset to the data which the surrogate pair points to.
196    * @param lead lead surrogate
197    * @param trail trailing surrogate
198    * @return offset to data
199    */
200    protected abstract int getSurrogateOffset(char lead, char trail);
201
202    /**
203    * Gets the offset to the data which the index ch after variable offset
204    * points to.
205    * Note for locating a non-supplementary character data offset, calling
206    * <p>
207    * getRawOffset(0, ch);
208    * </p>
209    * will do. Otherwise if it is a supplementary character formed by
210    * surrogates lead and trail. Then we would have to call getRawOffset()
211    * with getFoldingIndexOffset(). See getSurrogateOffset().
212    * @param offset index offset which ch is to start from
213    * @param ch index to be used after offset
214    * @return offset to the data
215    */
216    protected final int getRawOffset(int offset, char ch)
217    {
218        return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
219                << INDEX_STAGE_2_SHIFT_)
220                + (ch & INDEX_STAGE_3_MASK_);
221    }
222
223    /**
224    * Gets the offset to data which the BMP character points to
225    * Treats a lead surrogate as a normal code point.
226    * @param ch BMP character
227    * @return offset to data
228    */
229    protected final int getBMPOffset(char ch)
230    {
231        return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
232                && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
233                ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
234                : getRawOffset(0, ch);
235                // using a getRawOffset(ch) makes no diff
236    }
237
238    /**
239    * Gets the offset to the data which this lead surrogate character points
240    * to.
241    * Data at the returned offset may contain folding offset information for
242    * the next trailing surrogate character.
243    * @param ch lead surrogate character
244    * @return offset to data
245    */
246    protected final int getLeadOffset(char ch)
247    {
248       return getRawOffset(0, ch);
249    }
250
251    /**
252     * Internal trie getter from a code point.
253     * Could be faster(?) but longer with
254     * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
255     * Gets the offset to data which the codepoint points to
256     * @param ch codepoint
257     * @return offset to data
258     */
259    protected final int getCodePointOffset(int ch)
260    {
261        // if ((ch >> 16) == 0) slower
262        if (ch < 0) {
263            return -1;
264        } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
265            // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
266            return getRawOffset(0, (char)ch);
267        } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
268            // BMP codepoint
269            return getBMPOffset((char)ch);
270        } else if (ch <= UCharacter.MAX_VALUE) {
271            // look at the construction of supplementary characters
272            // trail forms the ends of it.
273            return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
274                                      (char)(ch & SURROGATE_MASK_));
275        } else {
276            // return -1 if there is an error, in this case we return
277            return -1;
278        }
279    }
280
281    /**
282    * <p>Parses the inputstream and creates the trie index with it.</p>
283    * <p>This is overwritten by the child classes.
284    * @param inputStream input stream containing the trie information
285    * @exception IOException thrown when data reading fails.
286    */
287    protected void unserialize(InputStream inputStream) throws IOException
288    {
289        //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
290        m_index_              = new char[m_dataOffset_];
291        DataInputStream input = new DataInputStream(inputStream);
292        for (int i = 0; i < m_dataOffset_; i ++) {
293             m_index_[i] = input.readChar();
294        }
295    }
296
297    /**
298    * Determines if this is a 16 bit trie
299    * @return true if this is a 16 bit trie
300    */
301    protected final boolean isCharTrie()
302    {
303        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
304    }
305
306    // private data members --------------------------------------------
307
308    /**
309     * Latin 1 option mask
310     */
311    protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
312    /**
313    * Constant number to authenticate the byte block
314    */
315    protected static final int HEADER_SIGNATURE_ = 0x54726965;
316    /**
317    * Header option formatting
318    */
319    private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
320    protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
321    protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
322
323    /**
324     * Flag indicator for Latin quick access data block
325     */
326    private boolean m_isLatin1Linear_;
327
328    /**
329     * <p>Trie options field.</p>
330     * <p>options bit field:<br>
331     * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
332     * 8  0 = 16-bit data, 1=32-bit data<br>
333     * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
334     * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
335     */
336    private int m_options_;
337
338    // private methods ---------------------------------------------------
339
340    /**
341     * Authenticates raw data header.
342     * Checking the header information, signature and options.
343     * @param signature This contains the options and type of a Trie
344     * @return true if the header is authenticated valid
345     */
346    private final boolean checkHeader(int signature)
347    {
348        // check the signature
349        // Trie in big-endian US-ASCII (0x54726965).
350        // Magic number to authenticate the data.
351        if (signature != HEADER_SIGNATURE_) {
352            return false;
353        }
354
355        if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
356                                                    INDEX_STAGE_1_SHIFT_ ||
357            ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
358                                                HEADER_OPTIONS_SHIFT_MASK_)
359                                                 != INDEX_STAGE_2_SHIFT_) {
360            return false;
361        }
362        return true;
363    }
364}
365