RBTableBuilder.java revision 12745:f068a4ffddd2
1/* 2 * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26/* 27 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 28 * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved 29 * 30 * The original version of this source code and documentation is copyrighted 31 * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These 32 * materials are provided under terms of a License Agreement between Taligent 33 * and Sun. This technology is protected by multiple US and International 34 * patents. This notice and attribution to Taligent may not be removed. 35 * Taligent is a registered trademark of Taligent, Inc. 36 * 37 */ 38 39package java.text; 40 41import java.util.Vector; 42import sun.text.UCompactIntArray; 43import sun.text.IntHashtable; 44import sun.text.ComposedCharIter; 45import sun.text.CollatorUtilities; 46import sun.text.normalizer.NormalizerImpl; 47 48/** 49 * This class contains all the code to parse a RuleBasedCollator pattern 50 * and build a RBCollationTables object from it. A particular instance 51 * of tis class exists only during the actual build process-- once an 52 * RBCollationTables object has been built, the RBTableBuilder object 53 * goes away. This object carries all of the state which is only needed 54 * during the build process, plus a "shadow" copy of all of the state 55 * that will go into the tables object itself. This object communicates 56 * with RBCollationTables through a separate class, RBCollationTables.BuildAPI, 57 * this is an inner class of RBCollationTables and provides a separate 58 * private API for communication with RBTableBuilder. 59 * This class isn't just an inner class of RBCollationTables itself because 60 * of its large size. For source-code readability, it seemed better for the 61 * builder to have its own source file. 62 */ 63final class RBTableBuilder { 64 65 public RBTableBuilder(RBCollationTables.BuildAPI tables) { 66 this.tables = tables; 67 } 68 69 /** 70 * Create a table-based collation object with the given rules. 71 * This is the main function that actually builds the tables and 72 * stores them back in the RBCollationTables object. It is called 73 * ONLY by the RBCollationTables constructor. 74 * @see RuleBasedCollator#RuleBasedCollator 75 * @exception ParseException If the rules format is incorrect. 76 */ 77 78 public void build(String pattern, int decmp) throws ParseException 79 { 80 boolean isSource = true; 81 int i = 0; 82 String expChars; 83 String groupChars; 84 if (pattern.length() == 0) 85 throw new ParseException("Build rules empty.", 0); 86 87 // This array maps Unicode characters to their collation ordering 88 mapping = new UCompactIntArray(RBCollationTables.UNMAPPED); 89 // Normalize the build rules. Find occurances of all decomposed characters 90 // and normalize the rules before feeding into the builder. By "normalize", 91 // we mean that all precomposed Unicode characters must be converted into 92 // a base character and one or more combining characters (such as accents). 93 // When there are multiple combining characters attached to a base character, 94 // the combining characters must be in their canonical order 95 // 96 // sherman/Note: 97 //(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose 98 //hangual syllables to jamos, so we can actually just call decompose with 99 //normalizer's IGNORE_HANGUL option turned on 100 // 101 //(2)just call the "special version" in NormalizerImpl directly 102 //pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true); 103 // 104 //Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp); 105 //pattern = Normalizer.normalize(pattern, mode, 0, true); 106 107 pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern); 108 109 // Build the merged collation entries 110 // Since rules can be specified in any order in the string 111 // (e.g. "c , C < d , D < e , E .... C < CH") 112 // this splits all of the rules in the string out into separate 113 // objects and then sorts them. In the above example, it merges the 114 // "C < CH" rule in just before the "C < D" rule. 115 // 116 117 mPattern = new MergeCollation(pattern); 118 119 int order = 0; 120 121 // Now walk though each entry and add it to my own tables 122 for (i = 0; i < mPattern.getCount(); ++i) 123 { 124 PatternEntry entry = mPattern.getItemAt(i); 125 if (entry != null) { 126 groupChars = entry.getChars(); 127 if (groupChars.length() > 1) { 128 switch(groupChars.charAt(groupChars.length()-1)) { 129 case '@': 130 frenchSec = true; 131 groupChars = groupChars.substring(0, groupChars.length()-1); 132 break; 133 case '!': 134 seAsianSwapping = true; 135 groupChars = groupChars.substring(0, groupChars.length()-1); 136 break; 137 } 138 } 139 140 order = increment(entry.getStrength(), order); 141 expChars = entry.getExtension(); 142 143 if (expChars.length() != 0) { 144 addExpandOrder(groupChars, expChars, order); 145 } else if (groupChars.length() > 1) { 146 char ch = groupChars.charAt(0); 147 if (Character.isHighSurrogate(ch) && groupChars.length() == 2) { 148 addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order); 149 } else { 150 addContractOrder(groupChars, order); 151 } 152 } else { 153 char ch = groupChars.charAt(0); 154 addOrder(ch, order); 155 } 156 } 157 } 158 addComposedChars(); 159 160 commit(); 161 mapping.compact(); 162 /* 163 System.out.println("mappingSize=" + mapping.getKSize()); 164 for (int j = 0; j < 0xffff; j++) { 165 int value = mapping.elementAt(j); 166 if (value != RBCollationTables.UNMAPPED) 167 System.out.println("index=" + Integer.toString(j, 16) 168 + ", value=" + Integer.toString(value, 16)); 169 } 170 */ 171 tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable, 172 contractFlags, maxSecOrder, maxTerOrder); 173 } 174 175 /** Add expanding entries for pre-composed unicode characters so that this 176 * collator can be used reasonably well with decomposition turned off. 177 */ 178 private void addComposedChars() throws ParseException { 179 // Iterate through all of the pre-composed characters in Unicode 180 ComposedCharIter iter = new ComposedCharIter(); 181 int c; 182 while ((c = iter.next()) != ComposedCharIter.DONE) { 183 if (getCharOrder(c) == RBCollationTables.UNMAPPED) { 184 // 185 // We don't already have an ordering for this pre-composed character. 186 // 187 // First, see if the decomposed string is already in our 188 // tables as a single contracting-string ordering. 189 // If so, just map the precomposed character to that order. 190 // 191 // TODO: What we should really be doing here is trying to find the 192 // longest initial substring of the decomposition that is present 193 // in the tables as a contracting character sequence, and find its 194 // ordering. Then do this recursively with the remaining chars 195 // so that we build a list of orderings, and add that list to 196 // the expansion table. 197 // That would be more correct but also significantly slower, so 198 // I'm not totally sure it's worth doing. 199 // 200 String s = iter.decomposition(); 201 202 //sherman/Note: if this is 1 character decomposed string, the 203 //only thing need to do is to check if this decomposed character 204 //has an entry in our order table, this order is not necessary 205 //to be a contraction order, if it does have one, add an entry 206 //for the precomposed character by using the same order, the 207 //previous impl unnecessarily adds a single character expansion 208 //entry. 209 if (s.length() == 1) { 210 int order = getCharOrder(s.charAt(0)); 211 if (order != RBCollationTables.UNMAPPED) { 212 addOrder(c, order); 213 } 214 continue; 215 } else if (s.length() == 2) { 216 char ch0 = s.charAt(0); 217 if (Character.isHighSurrogate(ch0)) { 218 int order = getCharOrder(s.codePointAt(0)); 219 if (order != RBCollationTables.UNMAPPED) { 220 addOrder(c, order); 221 } 222 continue; 223 } 224 } 225 int contractOrder = getContractOrder(s); 226 if (contractOrder != RBCollationTables.UNMAPPED) { 227 addOrder(c, contractOrder); 228 } else { 229 // 230 // We don't have a contracting ordering for the entire string 231 // that results from the decomposition, but if we have orders 232 // for each individual character, we can add an expanding 233 // table entry for the pre-composed character 234 // 235 boolean allThere = true; 236 for (int i = 0; i < s.length(); i++) { 237 if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) { 238 allThere = false; 239 break; 240 } 241 } 242 if (allThere) { 243 addExpandOrder(c, s, RBCollationTables.UNMAPPED); 244 } 245 } 246 } 247 } 248 } 249 250 /** 251 * Look up for unmapped values in the expanded character table. 252 * 253 * When the expanding character tables are built by addExpandOrder, 254 * it doesn't know what the final ordering of each character 255 * in the expansion will be. Instead, it just puts the raw character 256 * code into the table, adding CHARINDEX as a flag. Now that we've 257 * finished building the mapping table, we can go back and look up 258 * that character to see what its real collation order is and 259 * stick that into the expansion table. That lets us avoid doing 260 * a two-stage lookup later. 261 */ 262 private final void commit() 263 { 264 if (expandTable != null) { 265 for (int i = 0; i < expandTable.size(); i++) { 266 int[] valueList = expandTable.elementAt(i); 267 for (int j = 0; j < valueList.length; j++) { 268 int order = valueList[j]; 269 if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) { 270 // found a expanding character that isn't filled in yet 271 int ch = order - CHARINDEX; 272 273 // Get the real values for the non-filled entry 274 int realValue = getCharOrder(ch); 275 276 if (realValue == RBCollationTables.UNMAPPED) { 277 // The real value is still unmapped, maybe it's ignorable 278 valueList[j] = IGNORABLEMASK & ch; 279 } else { 280 // just fill in the value 281 valueList[j] = realValue; 282 } 283 } 284 } 285 } 286 } 287 } 288 /** 289 * Increment of the last order based on the comparison level. 290 */ 291 private final int increment(int aStrength, int lastValue) 292 { 293 switch(aStrength) 294 { 295 case Collator.PRIMARY: 296 // increment priamry order and mask off secondary and tertiary difference 297 lastValue += PRIMARYORDERINCREMENT; 298 lastValue &= RBCollationTables.PRIMARYORDERMASK; 299 isOverIgnore = true; 300 break; 301 case Collator.SECONDARY: 302 // increment secondary order and mask off tertiary difference 303 lastValue += SECONDARYORDERINCREMENT; 304 lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY; 305 // record max # of ignorable chars with secondary difference 306 if (!isOverIgnore) 307 maxSecOrder++; 308 break; 309 case Collator.TERTIARY: 310 // increment tertiary order 311 lastValue += TERTIARYORDERINCREMENT; 312 // record max # of ignorable chars with tertiary difference 313 if (!isOverIgnore) 314 maxTerOrder++; 315 break; 316 } 317 return lastValue; 318 } 319 320 /** 321 * Adds a character and its designated order into the collation table. 322 */ 323 private final void addOrder(int ch, int anOrder) 324 { 325 // See if the char already has an order in the mapping table 326 int order = mapping.elementAt(ch); 327 328 if (order >= RBCollationTables.CONTRACTCHARINDEX) { 329 // There's already an entry for this character that points to a contracting 330 // character table. Instead of adding the character directly to the mapping 331 // table, we must add it to the contract table instead. 332 int length = 1; 333 if (Character.isSupplementaryCodePoint(ch)) { 334 length = Character.toChars(ch, keyBuf, 0); 335 } else { 336 keyBuf[0] = (char)ch; 337 } 338 addContractOrder(new String(keyBuf, 0, length), anOrder); 339 } else { 340 // add the entry to the mapping table, 341 // the same later entry replaces the previous one 342 mapping.setElementAt(ch, anOrder); 343 } 344 } 345 346 private final void addContractOrder(String groupChars, int anOrder) { 347 addContractOrder(groupChars, anOrder, true); 348 } 349 350 /** 351 * Adds the contracting string into the collation table. 352 */ 353 private final void addContractOrder(String groupChars, int anOrder, 354 boolean fwd) 355 { 356 if (contractTable == null) { 357 contractTable = new Vector<>(INITIALTABLESIZE); 358 } 359 360 //initial character 361 int ch = groupChars.codePointAt(0); 362 /* 363 char ch0 = groupChars.charAt(0); 364 int ch = Character.isHighSurrogate(ch0)? 365 Character.toCodePoint(ch0, groupChars.charAt(1)):ch0; 366 */ 367 // See if the initial character of the string already has a contract table. 368 int entry = mapping.elementAt(ch); 369 Vector<EntryPair> entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX); 370 371 if (entryTable == null) { 372 // We need to create a new table of contract entries for this base char 373 int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size(); 374 entryTable = new Vector<>(INITIALTABLESIZE); 375 contractTable.addElement(entryTable); 376 377 // Add the initial character's current ordering first. then 378 // update its mapping to point to this contract table 379 entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry)); 380 mapping.setElementAt(ch, tableIndex); 381 } 382 383 // Now add (or replace) this string in the table 384 int index = RBCollationTables.getEntry(entryTable, groupChars, fwd); 385 if (index != RBCollationTables.UNMAPPED) { 386 EntryPair pair = entryTable.elementAt(index); 387 pair.value = anOrder; 388 } else { 389 EntryPair pair = entryTable.lastElement(); 390 391 // NOTE: This little bit of logic is here to speed CollationElementIterator 392 // .nextContractChar(). This code ensures that the longest sequence in 393 // this list is always the _last_ one in the list. This keeps 394 // nextContractChar() from having to search the entire list for the longest 395 // sequence. 396 if (groupChars.length() > pair.entryName.length()) { 397 entryTable.addElement(new EntryPair(groupChars, anOrder, fwd)); 398 } else { 399 entryTable.insertElementAt(new EntryPair(groupChars, anOrder, 400 fwd), entryTable.size() - 1); 401 } 402 } 403 404 // If this was a forward mapping for a contracting string, also add a 405 // reverse mapping for it, so that CollationElementIterator.previous 406 // can work right 407 if (fwd && groupChars.length() > 1) { 408 addContractFlags(groupChars); 409 addContractOrder(new StringBuffer(groupChars).reverse().toString(), 410 anOrder, false); 411 } 412 } 413 414 /** 415 * If the given string has been specified as a contracting string 416 * in this collation table, return its ordering. 417 * Otherwise return UNMAPPED. 418 */ 419 private int getContractOrder(String groupChars) 420 { 421 int result = RBCollationTables.UNMAPPED; 422 if (contractTable != null) { 423 int ch = groupChars.codePointAt(0); 424 /* 425 char ch0 = groupChars.charAt(0); 426 int ch = Character.isHighSurrogate(ch0)? 427 Character.toCodePoint(ch0, groupChars.charAt(1)):ch0; 428 */ 429 Vector<EntryPair> entryTable = getContractValues(ch); 430 if (entryTable != null) { 431 int index = RBCollationTables.getEntry(entryTable, groupChars, true); 432 if (index != RBCollationTables.UNMAPPED) { 433 EntryPair pair = entryTable.elementAt(index); 434 result = pair.value; 435 } 436 } 437 } 438 return result; 439 } 440 441 private final int getCharOrder(int ch) { 442 int order = mapping.elementAt(ch); 443 444 if (order >= RBCollationTables.CONTRACTCHARINDEX) { 445 Vector<EntryPair> groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX); 446 EntryPair pair = groupList.firstElement(); 447 order = pair.value; 448 } 449 return order; 450 } 451 452 /** 453 * Get the entry of hash table of the contracting string in the collation 454 * table. 455 * @param ch the starting character of the contracting string 456 */ 457 private Vector<EntryPair> getContractValues(int ch) 458 { 459 int index = mapping.elementAt(ch); 460 return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX); 461 } 462 463 private Vector<EntryPair> getContractValuesImpl(int index) 464 { 465 if (index >= 0) 466 { 467 return contractTable.elementAt(index); 468 } 469 else // not found 470 { 471 return null; 472 } 473 } 474 475 /** 476 * Adds the expanding string into the collation table. 477 */ 478 private final void addExpandOrder(String contractChars, 479 String expandChars, 480 int anOrder) throws ParseException 481 { 482 // Create an expansion table entry 483 int tableIndex = addExpansion(anOrder, expandChars); 484 485 // And add its index into the main mapping table 486 if (contractChars.length() > 1) { 487 char ch = contractChars.charAt(0); 488 if (Character.isHighSurrogate(ch) && contractChars.length() == 2) { 489 char ch2 = contractChars.charAt(1); 490 if (Character.isLowSurrogate(ch2)) { 491 //only add into table when it is a legal surrogate 492 addOrder(Character.toCodePoint(ch, ch2), tableIndex); 493 } 494 } else { 495 addContractOrder(contractChars, tableIndex); 496 } 497 } else { 498 addOrder(contractChars.charAt(0), tableIndex); 499 } 500 } 501 502 private final void addExpandOrder(int ch, String expandChars, int anOrder) 503 throws ParseException 504 { 505 int tableIndex = addExpansion(anOrder, expandChars); 506 addOrder(ch, tableIndex); 507 } 508 509 /** 510 * Create a new entry in the expansion table that contains the orderings 511 * for the given characers. If anOrder is valid, it is added to the 512 * beginning of the expanded list of orders. 513 */ 514 private int addExpansion(int anOrder, String expandChars) { 515 if (expandTable == null) { 516 expandTable = new Vector<>(INITIALTABLESIZE); 517 } 518 519 // If anOrder is valid, we want to add it at the beginning of the list 520 int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1; 521 522 int[] valueList = new int[expandChars.length() + offset]; 523 if (offset == 1) { 524 valueList[0] = anOrder; 525 } 526 527 int j = offset; 528 for (int i = 0; i < expandChars.length(); i++) { 529 char ch0 = expandChars.charAt(i); 530 char ch1; 531 int ch; 532 if (Character.isHighSurrogate(ch0)) { 533 if (++i == expandChars.length() || 534 !Character.isLowSurrogate(ch1=expandChars.charAt(i))) { 535 //ether we are missing the low surrogate or the next char 536 //is not a legal low surrogate, so stop loop 537 break; 538 } 539 ch = Character.toCodePoint(ch0, ch1); 540 541 } else { 542 ch = ch0; 543 } 544 545 int mapValue = getCharOrder(ch); 546 547 if (mapValue != RBCollationTables.UNMAPPED) { 548 valueList[j++] = mapValue; 549 } else { 550 // can't find it in the table, will be filled in by commit(). 551 valueList[j++] = CHARINDEX + ch; 552 } 553 } 554 if (j < valueList.length) { 555 //we had at least one supplementary character, the size of valueList 556 //is bigger than it really needs... 557 int[] tmpBuf = new int[j]; 558 while (--j >= 0) { 559 tmpBuf[j] = valueList[j]; 560 } 561 valueList = tmpBuf; 562 } 563 // Add the expanding char list into the expansion table. 564 int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size(); 565 expandTable.addElement(valueList); 566 567 return tableIndex; 568 } 569 570 private void addContractFlags(String chars) { 571 char c0; 572 int c; 573 int len = chars.length(); 574 for (int i = 0; i < len; i++) { 575 c0 = chars.charAt(i); 576 c = Character.isHighSurrogate(c0) 577 ?Character.toCodePoint(c0, chars.charAt(++i)) 578 :c0; 579 contractFlags.put(c, 1); 580 } 581 } 582 583 // ============================================================== 584 // constants 585 // ============================================================== 586 static final int CHARINDEX = 0x70000000; // need look up in .commit() 587 588 private static final int IGNORABLEMASK = 0x0000ffff; 589 private static final int PRIMARYORDERINCREMENT = 0x00010000; 590 private static final int SECONDARYORDERINCREMENT = 0x00000100; 591 private static final int TERTIARYORDERINCREMENT = 0x00000001; 592 private static final int INITIALTABLESIZE = 20; 593 private static final int MAXKEYSIZE = 5; 594 595 // ============================================================== 596 // instance variables 597 // ============================================================== 598 599 // variables used by the build process 600 private RBCollationTables.BuildAPI tables = null; 601 private MergeCollation mPattern = null; 602 private boolean isOverIgnore = false; 603 private char[] keyBuf = new char[MAXKEYSIZE]; 604 private IntHashtable contractFlags = new IntHashtable(100); 605 606 // "shadow" copies of the instance variables in RBCollationTables 607 // (the values in these variables are copied back into RBCollationTables 608 // at the end of the build process) 609 private boolean frenchSec = false; 610 private boolean seAsianSwapping = false; 611 612 private UCompactIntArray mapping = null; 613 private Vector<Vector<EntryPair>> contractTable = null; 614 private Vector<int[]> expandTable = null; 615 616 private short maxSecOrder = 0; 617 private short maxTerOrder = 0; 618} 619