1/* 2 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26/* 27 ****************************************************************************** 28 * Copyright (C) 1996-2014, International Business Machines Corporation and 29 * others. All Rights Reserved. 30 ****************************************************************************** 31 */ 32 33package sun.text.normalizer; 34 35import java.io.DataInputStream; 36import java.io.InputStream; 37import java.io.IOException; 38 39/** 40 * <p>A trie is a kind of compressed, serializable table of values 41 * associated with Unicode code points (0..0x10ffff).</p> 42 * <p>This class defines the basic structure of a trie and provides methods 43 * to <b>retrieve the offsets to the actual data</b>.</p> 44 * <p>Data will be the form of an array of basic types, char or int.</p> 45 * <p>The actual data format will have to be specified by the user in the 46 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> 47 * <p>This trie implementation is optimized for getting offset while walking 48 * forward through a UTF-16 string. 49 * Therefore, the simplest and fastest access macros are the 50 * fromLead() and fromOffsetTrail() methods. 51 * The fromBMP() method are a little more complicated; they get offsets even 52 * for lead surrogate codepoints, while the fromLead() method get special 53 * "folded" offsets for lead surrogate code units if there is relevant data 54 * associated with them. 55 * From such a folded offsets, an offset needs to be extracted to supply 56 * to the fromOffsetTrail() methods. 57 * To handle such supplementary codepoints, some offset information are kept 58 * in the data.</p> 59 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve 60 * that offset from the folded value for the lead surrogate unit.</p> 61 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or 62 * com.ibm.icu.impl.IntTrie.</p> 63 * @author synwee 64 * @see com.ibm.icu.impl.CharTrie 65 * @see com.ibm.icu.impl.IntTrie 66 * @since release 2.1, Jan 01 2002 67 */ 68public abstract class Trie 69{ 70 // public class declaration ---------------------------------------- 71 72 /** 73 * Character data in com.ibm.impl.Trie have different user-specified format 74 * for different purposes. 75 * This interface specifies methods to be implemented in order for 76 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 77 * the data. 78 */ 79 public static interface DataManipulate 80 { 81 /** 82 * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's 83 * data 84 * the index array offset of the indexes for that lead surrogate. 85 * @param value data value for a surrogate from the trie, including the 86 * folding offset 87 * @return data offset or 0 if there is no data for the lead surrogate 88 */ 89 public int getFoldingOffset(int value); 90 } 91 92 // default implementation 93 private static class DefaultGetFoldingOffset implements DataManipulate { 94 public int getFoldingOffset(int value) { 95 return value; 96 } 97 } 98 99 // protected constructor ------------------------------------------- 100 101 /** 102 * Trie constructor for CharTrie use. 103 * @param inputStream ICU data file input stream which contains the 104 * trie 105 * @param dataManipulate object containing the information to parse the 106 * trie data 107 * @throws IOException thrown when input stream does not have the 108 * right header. 109 */ 110 protected Trie(InputStream inputStream, 111 DataManipulate dataManipulate) throws IOException 112 { 113 DataInputStream input = new DataInputStream(inputStream); 114 // Magic number to authenticate the data. 115 int signature = input.readInt(); 116 m_options_ = input.readInt(); 117 118 if (!checkHeader(signature)) { 119 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); 120 } 121 122 if(dataManipulate != null) { 123 m_dataManipulate_ = dataManipulate; 124 } else { 125 m_dataManipulate_ = new DefaultGetFoldingOffset(); 126 } 127 m_isLatin1Linear_ = (m_options_ & 128 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 129 m_dataOffset_ = input.readInt(); 130 m_dataLength_ = input.readInt(); 131 unserialize(inputStream); 132 } 133 134 // protected data members ------------------------------------------ 135 136 /** 137 * Lead surrogate code points' index displacement in the index array. 138 * <pre>{@code 139 * 0x10000-0xd800=0x2800 140 * 0x2800 >> INDEX_STAGE_1_SHIFT_ 141 * }</pre> 142 */ 143 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; 144 /** 145 * Shift size for shifting right the input index. 1..9 146 */ 147 protected static final int INDEX_STAGE_1_SHIFT_ = 5; 148 /** 149 * Shift size for shifting left the index array values. 150 * Increases possible data size with 16-bit index values at the cost 151 * of compactability. 152 * This requires blocks of stage 2 data to be aligned by 153 * DATA_GRANULARITY. 154 * 0..INDEX_STAGE_1_SHIFT 155 */ 156 protected static final int INDEX_STAGE_2_SHIFT_ = 2; 157 /** 158 * Number of data values in a stage 2 (data array) block. 159 */ 160 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; 161 /** 162 * Mask for getting the lower bits from the input index. 163 * DATA_BLOCK_LENGTH - 1. 164 */ 165 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; 166 /** 167 * Surrogate mask to use when shifting offset to retrieve supplementary 168 * values 169 */ 170 protected static final int SURROGATE_MASK_ = 0x3FF; 171 /** 172 * Index or UTF16 characters 173 */ 174 protected char m_index_[]; 175 /** 176 * Internal TrieValue which handles the parsing of the data value. 177 * This class is to be implemented by the user 178 */ 179 protected DataManipulate m_dataManipulate_; 180 /** 181 * Start index of the data portion of the trie. CharTrie combines 182 * index and data into a char array, so this is used to indicate the 183 * initial offset to the data portion. 184 * Note this index always points to the initial value. 185 */ 186 protected int m_dataOffset_; 187 /** 188 * Length of the data array 189 */ 190 protected int m_dataLength_; 191 192 // protected methods ----------------------------------------------- 193 194 /** 195 * Gets the offset to the data which the surrogate pair points to. 196 * @param lead lead surrogate 197 * @param trail trailing surrogate 198 * @return offset to data 199 */ 200 protected abstract int getSurrogateOffset(char lead, char trail); 201 202 /** 203 * Gets the offset to the data which the index ch after variable offset 204 * points to. 205 * Note for locating a non-supplementary character data offset, calling 206 * <p> 207 * getRawOffset(0, ch); 208 * </p> 209 * will do. Otherwise if it is a supplementary character formed by 210 * surrogates lead and trail. Then we would have to call getRawOffset() 211 * with getFoldingIndexOffset(). See getSurrogateOffset(). 212 * @param offset index offset which ch is to start from 213 * @param ch index to be used after offset 214 * @return offset to the data 215 */ 216 protected final int getRawOffset(int offset, char ch) 217 { 218 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] 219 << INDEX_STAGE_2_SHIFT_) 220 + (ch & INDEX_STAGE_3_MASK_); 221 } 222 223 /** 224 * Gets the offset to data which the BMP character points to 225 * Treats a lead surrogate as a normal code point. 226 * @param ch BMP character 227 * @return offset to data 228 */ 229 protected final int getBMPOffset(char ch) 230 { 231 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE 232 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) 233 ? getRawOffset(LEAD_INDEX_OFFSET_, ch) 234 : getRawOffset(0, ch); 235 // using a getRawOffset(ch) makes no diff 236 } 237 238 /** 239 * Gets the offset to the data which this lead surrogate character points 240 * to. 241 * Data at the returned offset may contain folding offset information for 242 * the next trailing surrogate character. 243 * @param ch lead surrogate character 244 * @return offset to data 245 */ 246 protected final int getLeadOffset(char ch) 247 { 248 return getRawOffset(0, ch); 249 } 250 251 /** 252 * Internal trie getter from a code point. 253 * Could be faster(?) but longer with 254 * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} 255 * Gets the offset to data which the codepoint points to 256 * @param ch codepoint 257 * @return offset to data 258 */ 259 protected final int getCodePointOffset(int ch) 260 { 261 // if ((ch >> 16) == 0) slower 262 if (ch < 0) { 263 return -1; 264 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { 265 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works 266 return getRawOffset(0, (char)ch); 267 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { 268 // BMP codepoint 269 return getBMPOffset((char)ch); 270 } else if (ch <= UCharacter.MAX_VALUE) { 271 // look at the construction of supplementary characters 272 // trail forms the ends of it. 273 return getSurrogateOffset(UTF16.getLeadSurrogate(ch), 274 (char)(ch & SURROGATE_MASK_)); 275 } else { 276 // return -1 if there is an error, in this case we return 277 return -1; 278 } 279 } 280 281 /** 282 * <p>Parses the inputstream and creates the trie index with it.</p> 283 * <p>This is overwritten by the child classes. 284 * @param inputStream input stream containing the trie information 285 * @exception IOException thrown when data reading fails. 286 */ 287 protected void unserialize(InputStream inputStream) throws IOException 288 { 289 //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ 290 m_index_ = new char[m_dataOffset_]; 291 DataInputStream input = new DataInputStream(inputStream); 292 for (int i = 0; i < m_dataOffset_; i ++) { 293 m_index_[i] = input.readChar(); 294 } 295 } 296 297 /** 298 * Determines if this is a 16 bit trie 299 * @return true if this is a 16 bit trie 300 */ 301 protected final boolean isCharTrie() 302 { 303 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; 304 } 305 306 // private data members -------------------------------------------- 307 308 /** 309 * Latin 1 option mask 310 */ 311 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; 312 /** 313 * Constant number to authenticate the byte block 314 */ 315 protected static final int HEADER_SIGNATURE_ = 0x54726965; 316 /** 317 * Header option formatting 318 */ 319 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; 320 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; 321 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; 322 323 /** 324 * Flag indicator for Latin quick access data block 325 */ 326 private boolean m_isLatin1Linear_; 327 328 /** 329 * <p>Trie options field.</p> 330 * <p>options bit field:<br> 331 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> 332 * 8 0 = 16-bit data, 1=32-bit data<br> 333 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> 334 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> 335 */ 336 private int m_options_; 337 338 // private methods --------------------------------------------------- 339 340 /** 341 * Authenticates raw data header. 342 * Checking the header information, signature and options. 343 * @param signature This contains the options and type of a Trie 344 * @return true if the header is authenticated valid 345 */ 346 private final boolean checkHeader(int signature) 347 { 348 // check the signature 349 // Trie in big-endian US-ASCII (0x54726965). 350 // Magic number to authenticate the data. 351 if (signature != HEADER_SIGNATURE_) { 352 return false; 353 } 354 355 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != 356 INDEX_STAGE_1_SHIFT_ || 357 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & 358 HEADER_OPTIONS_SHIFT_MASK_) 359 != INDEX_STAGE_2_SHIFT_) { 360 return false; 361 } 362 return true; 363 } 364} 365