RuleBasedBreakIteratorBuilder.java revision 9330:8b1f1c2a400f
1/* 2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26package build.tools.generatebreakiteratordata; 27 28import java.io.*; 29import java.util.Enumeration; 30import java.util.Hashtable; 31import java.util.Stack; 32import java.util.Vector; 33import java.util.zip.CRC32; 34import sun.text.CompactByteArray; 35 36/** 37 * This class has the job of constructing a RuleBasedBreakIterator from a 38 * textual description. A Builder is constructed by GenerateBreakIteratorData, 39 * which uses it to construct the iterator itself and then throws it away. 40 * <p>The construction logic is separated out into its own class for two primary 41 * reasons: 42 * <ul> 43 * <li>The construction logic is quite sophisticated and large. Separating 44 * it out into its own class means the code must only be loaded into memory 45 * while a RuleBasedBreakIterator is being constructed, and can be purged after 46 * that. 47 * <li>There is a fair amount of state that must be maintained throughout the 48 * construction process that is not needed by the iterator after construction. 49 * Separating this state out into another class prevents all of the functions 50 * that construct the iterator from having to have really long parameter lists, 51 * (hopefully) contributing to readability and maintainability. 52 * </ul> 53 * <p> 54 * It'd be really nice if this could be an independent class rather than an 55 * inner class, because that would shorten the source file considerably, but 56 * making Builder an inner class of RuleBasedBreakIterator allows it direct 57 * access to RuleBasedBreakIterator's private members, which saves us from 58 * having to provide some kind of "back door" to the Builder class that could 59 * then also be used by other classes. 60 */ 61class RuleBasedBreakIteratorBuilder { 62 63 /** 64 * A token used as a character-category value to identify ignore characters 65 */ 66 protected static final byte IGNORE = -1; 67 68 /** 69 * Tables that indexes from character values to character category numbers 70 */ 71 private CompactByteArray charCategoryTable = null; 72 private SupplementaryCharacterData supplementaryCharCategoryTable = null; 73 74 /** 75 * The table of state transitions used for forward iteration 76 */ 77 private short[] stateTable = null; 78 79 /** 80 * The table of state transitions used to sync up the iterator with the 81 * text in backwards and random-access iteration 82 */ 83 private short[] backwardsStateTable = null; 84 85 /** 86 * A list of flags indicating which states in the state table are accepting 87 * ("end") states 88 */ 89 private boolean[] endStates = null; 90 91 /** 92 * A list of flags indicating which states in the state table are 93 * lookahead states (states which turn lookahead on and off) 94 */ 95 private boolean[] lookaheadStates = null; 96 97 /** 98 * A table for additional data. May be used by a subclass of 99 * RuleBasedBreakIterator. 100 */ 101 private byte[] additionalData = null; 102 103 /** 104 * The number of character categories (and, thus, the number of columns in 105 * the state tables) 106 */ 107 private int numCategories; 108 109 /** 110 * A temporary holding place used for calculating the character categories. 111 * This object contains CharSet objects. 112 */ 113 protected Vector<CharSet> categories = null; 114 115 /** 116 * A table used to map parts of regexp text to lists of character 117 * categories, rather than having to figure them out from scratch each time 118 */ 119 protected Hashtable<String, Object> expressions = null; 120 121 /** 122 * A temporary holding place for the list of ignore characters 123 */ 124 protected CharSet ignoreChars = null; 125 126 /** 127 * A temporary holding place where the forward state table is built 128 */ 129 protected Vector<short[]> tempStateTable = null; 130 131 /** 132 * A list of all the states that have to be filled in with transitions to 133 * the next state that is created. Used when building the state table from 134 * the regular expressions. 135 */ 136 protected Vector<Integer> decisionPointList = null; 137 138 /** 139 * A stack for holding decision point lists. This is used to handle nested 140 * parentheses and braces in regexps. 141 */ 142 protected Stack<Vector<Integer>> decisionPointStack = null; 143 144 /** 145 * A list of states that loop back on themselves. Used to handle .*? 146 */ 147 protected Vector<Integer> loopingStates = null; 148 149 /** 150 * Looping states actually have to be backfilled later in the process 151 * than everything else. This is where a the list of states to backfill 152 * is accumulated. This is also used to handle .*? 153 */ 154 protected Vector<Integer> statesToBackfill = null; 155 156 /** 157 * A list mapping pairs of state numbers for states that are to be combined 158 * to the state number of the state representing their combination. Used 159 * in the process of making the state table deterministic to prevent 160 * infinite recursion. 161 */ 162 protected Vector<int[]> mergeList = null; 163 164 /** 165 * A flag that is used to indicate when the list of looping states can 166 * be reset. 167 */ 168 protected boolean clearLoopingStates = false; 169 170 /** 171 * A bit mask used to indicate a bit in the table's flags column that marks 172 * a state as an accepting state. 173 */ 174 protected static final int END_STATE_FLAG = 0x8000; 175 176 /** 177 * A bit mask used to indicate a bit in the table's flags column that marks 178 * a state as one the builder shouldn't loop to any looping states 179 */ 180 protected static final int DONT_LOOP_FLAG = 0x4000; 181 182 /** 183 * A bit mask used to indicate a bit in the table's flags column that marks 184 * a state as a lookahead state. 185 */ 186 protected static final int LOOKAHEAD_STATE_FLAG = 0x2000; 187 188 /** 189 * A bit mask representing the union of the mask values listed above. 190 * Used for clearing or masking off the flag bits. 191 */ 192 protected static final int ALL_FLAGS = END_STATE_FLAG 193 | LOOKAHEAD_STATE_FLAG 194 | DONT_LOOP_FLAG; 195 196 /** 197 * This is the main function for setting up the BreakIterator's tables. It 198 * just vectors different parts of the job off to other functions. 199 */ 200 public RuleBasedBreakIteratorBuilder(String description) { 201 Vector<String> tempRuleList = buildRuleList(description); 202 buildCharCategories(tempRuleList); 203 buildStateTable(tempRuleList); 204 buildBackwardsStateTable(tempRuleList); 205 } 206 207 /** 208 * Thus function has three main purposes: 209 * <ul><li>Perform general syntax checking on the description, so the rest 210 * of the build code can assume that it's parsing a legal description. 211 * <li>Split the description into separate rules 212 * <li>Perform variable-name substitutions (so that no one else sees 213 * variable names) 214 * </ul> 215 */ 216 private Vector<String> buildRuleList(String description) { 217 // invariants: 218 // - parentheses must be balanced: ()[]{}<> 219 // - nothing can be nested inside <> 220 // - nothing can be nested inside [] except more []s 221 // - pairs of ()[]{}<> must not be empty 222 // - ; can only occur at the outer level 223 // - | can only appear inside () 224 // - only one = or / can occur in a single rule 225 // - = and / cannot both occur in the same rule 226 // - <> can only occur on the left side of a = expression 227 // (because we'll perform substitutions to eliminate them other places) 228 // - the left-hand side of a = expression can only be a single character 229 // (possibly with \) or text inside <> 230 // - the right-hand side of a = expression must be enclosed in [] or () 231 // - * may not occur at the beginning of a rule, nor may it follow 232 // =, /, (, (, |, }, ;, or * 233 // - ? may only follow * 234 // - the rule list must contain at least one / rule 235 // - no rule may be empty 236 // - all printing characters in the ASCII range except letters and digits 237 // are reserved and must be preceded by \ 238 // - ! may only occur at the beginning of a rule 239 240 // set up a vector to contain the broken-up description (each entry in the 241 // vector is a separate rule) and a stack for keeping track of opening 242 // punctuation 243 Vector<String> tempRuleList = new Vector<>(); 244 Stack<Character> parenStack = new Stack<>(); 245 246 int p = 0; 247 int ruleStart = 0; 248 int c = '\u0000'; 249 int lastC = '\u0000'; 250 int lastOpen = '\u0000'; 251 boolean haveEquals = false; 252 boolean havePipe = false; 253 boolean sawVarName = false; 254 final String charsThatCantPrecedeAsterisk = "=/{(|}*;\u0000"; 255 256 // if the description doesn't end with a semicolon, tack a semicolon onto the end 257 if (description.length() != 0 && 258 description.codePointAt(description.length() - 1) != ';') { 259 description = description + ";"; 260 } 261 262 // for each character, do... 263 while (p < description.length()) { 264 c = description.codePointAt(p); 265 266 switch (c) { 267 // if the character is a backslash, skip the character that follows it 268 // (it'll get treated as a literal character) 269 case '\\': 270 ++p; 271 break; 272 273 // if the character is opening punctuation, verify that no nesting 274 // rules are broken, and push the character onto the stack 275 case '{': 276 case '<': 277 case '[': 278 case '(': 279 if (lastOpen == '<') { 280 error("Can't nest brackets inside <>", p, description); 281 } 282 if (lastOpen == '[' && c != '[') { 283 error("Can't nest anything in [] but []", p, description); 284 } 285 286 // if we see < anywhere except on the left-hand side of =, 287 // we must be seeing a variable name that was never defined 288 if (c == '<' && (haveEquals || havePipe)) { 289 error("Unknown variable name", p, description); 290 } 291 292 lastOpen = c; 293 parenStack.push(new Character((char)c)); 294 if (c == '<') { 295 sawVarName = true; 296 } 297 break; 298 299 // if the character is closing punctuation, verify that it matches the 300 // last opening punctuation we saw, and that the brackets contain 301 // something, then pop the stack 302 case '}': 303 case '>': 304 case ']': 305 case ')': 306 char expectedClose = '\u0000'; 307 switch (lastOpen) { 308 case '{': 309 expectedClose = '}'; 310 break; 311 case '[': 312 expectedClose = ']'; 313 break; 314 case '(': 315 expectedClose = ')'; 316 break; 317 case '<': 318 expectedClose = '>'; 319 break; 320 } 321 if (c != expectedClose) { 322 error("Unbalanced parentheses", p, description); 323 } 324 if (lastC == lastOpen) { 325 error("Parens don't contain anything", p, description); 326 } 327 parenStack.pop(); 328 if (!parenStack.empty()) { 329 lastOpen = parenStack.peek().charValue(); 330 } 331 else { 332 lastOpen = '\u0000'; 333 } 334 335 break; 336 337 // if the character is an asterisk, make sure it occurs in a place 338 // where an asterisk can legally go 339 case '*': 340 if (charsThatCantPrecedeAsterisk.indexOf(lastC) != -1) { 341 error("Misplaced asterisk", p, description); 342 } 343 break; 344 345 // if the character is a question mark, make sure it follows an asterisk 346 case '?': 347 if (lastC != '*') { 348 error("Misplaced ?", p, description); 349 } 350 break; 351 352 // if the character is an equals sign, make sure we haven't seen another 353 // equals sign or a slash yet 354 case '=': 355 if (haveEquals || havePipe) { 356 error("More than one = or / in rule", p, description); 357 } 358 haveEquals = true; 359 break; 360 361 // if the character is a slash, make sure we haven't seen another slash 362 // or an equals sign yet 363 case '/': 364 if (haveEquals || havePipe) { 365 error("More than one = or / in rule", p, description); 366 } 367 if (sawVarName) { 368 error("Unknown variable name", p, description); 369 } 370 havePipe = true; 371 break; 372 373 // if the character is an exclamation point, make sure it occurs only 374 // at the beginning of a rule 375 case '!': 376 if (lastC != ';' && lastC != '\u0000') { 377 error("! can only occur at the beginning of a rule", p, description); 378 } 379 break; 380 381 // we don't have to do anything special on a period 382 case '.': 383 break; 384 385 // if the character is a syntax character that can only occur 386 // inside [], make sure that it does in fact only occur inside []. 387 case '^': 388 case '-': 389 case ':': 390 if (lastOpen != '[' && lastOpen != '<') { 391 error("Illegal character", p, description); 392 } 393 break; 394 395 // if the character is a semicolon, do the following... 396 case ';': 397 // make sure the rule contains something and that there are no 398 // unbalanced parentheses or brackets 399 if (lastC == ';' || lastC == '\u0000') { 400 error("Empty rule", p, description); 401 } 402 if (!parenStack.empty()) { 403 error("Unbalanced parenheses", p, description); 404 } 405 406 if (parenStack.empty()) { 407 // if the rule contained an = sign, call processSubstitution() 408 // to replace the substitution name with the substitution text 409 // wherever it appears in the description 410 if (haveEquals) { 411 description = processSubstitution(description.substring(ruleStart, 412 p), description, p + 1); 413 } 414 else { 415 // otherwise, check to make sure the rule doesn't reference 416 // any undefined substitutions 417 if (sawVarName) { 418 error("Unknown variable name", p, description); 419 } 420 421 // then add it to tempRuleList 422 tempRuleList.addElement(description.substring(ruleStart, p)); 423 } 424 425 // and reset everything to process the next rule 426 ruleStart = p + 1; 427 haveEquals = havePipe = sawVarName = false; 428 } 429 break; 430 431 // if the character is a vertical bar, check to make sure that it 432 // occurs inside a () expression and that the character that precedes 433 // it isn't also a vertical bar 434 case '|': 435 if (lastC == '|') { 436 error("Empty alternative", p, description); 437 } 438 if (parenStack.empty() || lastOpen != '(') { 439 error("Misplaced |", p, description); 440 } 441 break; 442 443 // if the character is anything else (escaped characters are 444 // skipped and don't make it here), it's an error 445 default: 446 if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c) 447 && !Character.isDigit((char)c)) { 448 error("Illegal character", p, description); 449 } 450 if (c >= Character.MIN_SUPPLEMENTARY_CODE_POINT) { 451 ++p; 452 } 453 break; 454 } 455 lastC = c; 456 ++p; 457 } 458 if (tempRuleList.size() == 0) { 459 error("No valid rules in description", p, description); 460 } 461 return tempRuleList; 462 } 463 464 /** 465 * This function performs variable-name substitutions. First it does syntax 466 * checking on the variable-name definition. If it's syntactically valid, it 467 * then goes through the remainder of the description and does a simple 468 * find-and-replace of the variable name with its text. (The variable text 469 * must be enclosed in either [] or () for this to work.) 470 */ 471 protected String processSubstitution(String substitutionRule, String description, 472 int startPos) { 473 // isolate out the text on either side of the equals sign 474 String replace; 475 String replaceWith; 476 int equalPos = substitutionRule.indexOf('='); 477 replace = substitutionRule.substring(0, equalPos); 478 replaceWith = substitutionRule.substring(equalPos + 1); 479 480 // check to see whether the substitution name is something we've declared 481 // to be "special". For RuleBasedBreakIterator itself, this is "<ignore>". 482 // This function takes care of any extra processing that has to be done 483 // with "special" substitution names. 484 handleSpecialSubstitution(replace, replaceWith, startPos, description); 485 486 // perform various other syntax checks on the rule 487 if (replaceWith.length() == 0) { 488 error("Nothing on right-hand side of =", startPos, description); 489 } 490 if (replace.length() == 0) { 491 error("Nothing on left-hand side of =", startPos, description); 492 } 493 if (replace.length() == 2 && replace.charAt(0) != '\\') { 494 error("Illegal left-hand side for =", startPos, description); 495 } 496 if (replace.length() >= 3 && replace.charAt(0) != '<' && 497 replace.codePointBefore(equalPos) != '>') { 498 error("Illegal left-hand side for =", startPos, description); 499 } 500 if (!(replaceWith.charAt(0) == '[' && 501 replaceWith.charAt(replaceWith.length() - 1) == ']') && 502 !(replaceWith.charAt(0) == '(' && 503 replaceWith.charAt(replaceWith.length() - 1) == ')')) { 504 error("Illegal right-hand side for =", startPos, description); 505 } 506 507 // now go through the rest of the description (which hasn't been broken up 508 // into separate rules yet) and replace every occurrence of the 509 // substitution name with the substitution body 510 StringBuffer result = new StringBuffer(); 511 result.append(description.substring(0, startPos)); 512 int lastPos = startPos; 513 int pos = description.indexOf(replace, startPos); 514 while (pos != -1) { 515 result.append(description.substring(lastPos, pos)); 516 result.append(replaceWith); 517 lastPos = pos + replace.length(); 518 pos = description.indexOf(replace, lastPos); 519 } 520 result.append(description.substring(lastPos)); 521 return result.toString(); 522 } 523 524 /** 525 * This function defines a protocol for handling substitution names that 526 * are "special," i.e., that have some property beyond just being 527 * substitutions. At the RuleBasedBreakIterator level, we have one 528 * special substitution name, "<ignore>". Subclasses can override this 529 * function to add more. Any special processing that has to go on beyond 530 * that which is done by the normal substitution-processing code is done 531 * here. 532 */ 533 protected void handleSpecialSubstitution(String replace, String replaceWith, 534 int startPos, String description) { 535 // if we get a definition for a substitution called "ignore", it defines 536 // the ignore characters for the iterator. Check to make sure the expression 537 // is a [] expression, and if it is, parse it and store the characters off 538 // to the side. 539 if (replace.equals("<ignore>")) { 540 if (replaceWith.charAt(0) == '(') { 541 error("Ignore group can't be enclosed in (", startPos, description); 542 } 543 ignoreChars = CharSet.parseString(replaceWith); 544 } 545 } 546 547 /** 548 * This function builds the character category table. On entry, 549 * tempRuleList is a vector of break rules that has had variable names substituted. 550 * On exit, the charCategoryTable data member has been initialized to hold the 551 * character category table, and tempRuleList's rules have been munged to contain 552 * character category numbers everywhere a literal character or a [] expression 553 * originally occurred. 554 */ 555 @SuppressWarnings("fallthrough") 556 protected void buildCharCategories(Vector<String> tempRuleList) { 557 int bracketLevel = 0; 558 int p = 0; 559 int lineNum = 0; 560 561 // build hash table of every literal character or [] expression in the rule list 562 // and use CharSet.parseString() to derive a CharSet object representing the 563 // characters each refers to 564 expressions = new Hashtable<>(); 565 while (lineNum < tempRuleList.size()) { 566 String line = tempRuleList.elementAt(lineNum); 567 p = 0; 568 while (p < line.length()) { 569 int c = line.codePointAt(p); 570 switch (c) { 571 // skip over all syntax characters except [ 572 case '{': case '}': case '(': case ')': case '*': case '.': 573 case '/': case '|': case ';': case '?': case '!': 574 break; 575 576 // for [, find the matching ] (taking nested [] pairs into account) 577 // and add the whole expression to the expression list 578 case '[': 579 int q = p + 1; 580 ++bracketLevel; 581 while (q < line.length() && bracketLevel != 0) { 582 c = line.codePointAt(q); 583 switch (c) { 584 case '\\': 585 q++; 586 break; 587 case '[': 588 ++bracketLevel; 589 break; 590 case ']': 591 --bracketLevel; 592 break; 593 } 594 q = q + Character.charCount(c); 595 } 596 if (expressions.get(line.substring(p, q)) == null) { 597 expressions.put(line.substring(p, q), CharSet.parseString(line.substring(p, q))); 598 } 599 p = q - 1; 600 break; 601 602 // for \ sequences, just move to the next character and treat 603 // it as a single character 604 case '\\': 605 ++p; 606 c = line.codePointAt(p); 607 // DON'T break; fall through into "default" clause 608 609 // for an isolated single character, add it to the expression list 610 default: 611 expressions.put(line.substring(p, p + 1), CharSet.parseString(line.substring(p, p + 1))); 612 break; 613 } 614 p += Character.charCount(line.codePointAt(p)); 615 } 616 ++lineNum; 617 } 618 // dump CharSet's internal expression cache 619 CharSet.releaseExpressionCache(); 620 621 // create the temporary category table (which is a vector of CharSet objects) 622 categories = new Vector<>(); 623 if (ignoreChars != null) { 624 categories.addElement(ignoreChars); 625 } 626 else { 627 categories.addElement(new CharSet()); 628 } 629 ignoreChars = null; 630 631 // this is a hook to allow subclasses to add categories on their own 632 mungeExpressionList(expressions); 633 634 // Derive the character categories. Go through the existing character categories 635 // looking for overlap. Any time there's overlap, we create a new character 636 // category for the characters that overlapped and remove them from their original 637 // category. At the end, any characters that are left in the expression haven't 638 // been mentioned in any category, so another new category is created for them. 639 // For example, if the first expression is [abc], then a, b, and c will be placed 640 // into a single character category. If the next expression is [bcd], we will first 641 // remove b and c from their existing category (leaving a behind), create a new 642 // category for b and c, and then create another new category for d (which hadn't 643 // been mentioned in the previous expression). 644 // At no time should a character ever occur in more than one character category. 645 646 // for each expression in the expressions list, do... 647 for (Enumeration<Object> iter = expressions.elements(); iter.hasMoreElements(); ) { 648 // initialize the working char set to the chars in the current expression 649 CharSet e = (CharSet)iter.nextElement(); 650 651 // for each category in the category list, do... 652 for (int j = categories.size() - 1; !e.empty() && j > 0; j--) { 653 654 // if there's overlap between the current working set of chars 655 // and the current category... 656 CharSet that = categories.elementAt(j); 657 if (!that.intersection(e).empty()) { 658 659 // add a new category for the characters that were in the 660 // current category but not in the working char set 661 CharSet temp = that.difference(e); 662 if (!temp.empty()) { 663 categories.addElement(temp); 664 } 665 666 // remove those characters from the working char set and replace 667 // the current category with the characters that it did 668 // have in common with the current working char set 669 temp = e.intersection(that); 670 e = e.difference(that); 671 if (!temp.equals(that)) { 672 categories.setElementAt(temp, j); 673 } 674 } 675 } 676 677 // if there are still characters left in the working char set, 678 // add a new category containing them 679 if (!e.empty()) { 680 categories.addElement(e); 681 } 682 } 683 684 // we have the ignore characters stored in position 0. Make an extra pass through 685 // the character category list and remove anything from the ignore list that shows 686 // up in some other category 687 CharSet allChars = new CharSet(); 688 for (int i = 1; i < categories.size(); i++) { 689 allChars = allChars.union(categories.elementAt(i)); 690 } 691 CharSet ignoreChars = categories.elementAt(0); 692 ignoreChars = ignoreChars.difference(allChars); 693 categories.setElementAt(ignoreChars, 0); 694 695 // now that we've derived the character categories, go back through the expression 696 // list and replace each CharSet object with a String that represents the 697 // character categories that expression refers to. The String is encoded: each 698 // character is a character category number (plus 0x100 to avoid confusing them 699 // with syntax characters in the rule grammar) 700 701 for (Enumeration<String> iter = expressions.keys(); iter.hasMoreElements(); ) { 702 String key = iter.nextElement(); 703 CharSet cs = (CharSet)expressions.get(key); 704 StringBuffer cats = new StringBuffer(); 705 706 // for each category... 707 for (int j = 0; j < categories.size(); j++) { 708 709 // if the current expression contains characters in that category... 710 CharSet temp = cs.intersection(categories.elementAt(j)); 711 if (!temp.empty()) { 712 713 // then add the encoded category number to the String for this 714 // expression 715 cats.append((char)(0x100 + j)); 716 if (temp.equals(cs)) { 717 break; 718 } 719 } 720 } 721 722 // once we've finished building the encoded String for this expression, 723 // replace the CharSet object with it 724 expressions.put(key, cats.toString()); 725 } 726 727 // and finally, we turn the temporary category table into a permanent category 728 // table, which is a CompactByteArray. (we skip category 0, which by definition 729 // refers to all characters not mentioned specifically in the rules) 730 charCategoryTable = new CompactByteArray((byte)0); 731 supplementaryCharCategoryTable = new SupplementaryCharacterData((byte)0); 732 733 // for each category... 734 for (int i = 0; i < categories.size(); i++) { 735 CharSet chars = categories.elementAt(i); 736 737 // go through the character ranges in the category one by one... 738 Enumeration<int[]> enum_ = chars.getChars(); 739 while (enum_.hasMoreElements()) { 740 int[] range = enum_.nextElement(); 741 742 // and set the corresponding elements in the CompactArray accordingly 743 if (i != 0) { 744 if (range[0] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 745 if (range[1] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 746 charCategoryTable.setElementAt((char)range[0], (char)range[1], (byte)i); 747 } else { 748 charCategoryTable.setElementAt((char)range[0], (char)0xFFFF, (byte)i); 749 supplementaryCharCategoryTable.appendElement(Character.MIN_SUPPLEMENTARY_CODE_POINT, range[1], (byte)i); 750 } 751 } else { 752 supplementaryCharCategoryTable.appendElement(range[0], range[1], (byte)i); 753 } 754 } 755 756 // (category 0 is special-- it's the hiding place for the ignore 757 // characters, whose real category number in the CompactArray is 758 // -1 [this is because category 0 contains all characters not 759 // specifically mentioned anywhere in the rules] ) 760 else { 761 if (range[0] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 762 if (range[1] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 763 charCategoryTable.setElementAt((char)range[0], (char)range[1], IGNORE); 764 } else { 765 charCategoryTable.setElementAt((char)range[0], (char)0xFFFF, IGNORE); 766 supplementaryCharCategoryTable.appendElement(Character.MIN_SUPPLEMENTARY_CODE_POINT, range[1], IGNORE); 767 } 768 } else { 769 supplementaryCharCategoryTable.appendElement(range[0], range[1], IGNORE); 770 } 771 } 772 } 773 } 774 775 // once we've populated the CompactArray, compact it 776 charCategoryTable.compact(); 777 778 // And, complete the category table for supplementary characters 779 supplementaryCharCategoryTable.complete(); 780 781 // initialize numCategories 782 numCategories = categories.size(); 783 } 784 785 protected void mungeExpressionList(Hashtable<String, Object> expressions) { 786 // empty in the parent class. This function provides a hook for subclasses 787 // to mess with the character category table. 788 } 789 790 /** 791 * This is the function that builds the forward state table. Most of the real 792 * work is done in parseRule(), which is called once for each rule in the 793 * description. 794 */ 795 private void buildStateTable(Vector<String> tempRuleList) { 796 // initialize our temporary state table, and fill it with two states: 797 // state 0 is a dummy state that allows state 1 to be the starting state 798 // and 0 to represent "stop". State 1 is added here to seed things 799 // before we start parsing 800 tempStateTable = new Vector<>(); 801 tempStateTable.addElement(new short[numCategories + 1]); 802 tempStateTable.addElement(new short[numCategories + 1]); 803 804 // call parseRule() for every rule in the rule list (except those which 805 // start with !, which are actually backwards-iteration rules) 806 for (int i = 0; i < tempRuleList.size(); i++) { 807 String rule = tempRuleList.elementAt(i); 808 if (rule.charAt(0) != '!') { 809 parseRule(rule, true); 810 } 811 } 812 813 // finally, use finishBuildingStateTable() to minimize the number of 814 // states in the table and perform some other cleanup work 815 finishBuildingStateTable(true); 816 } 817 818 /** 819 * This is where most of the work really happens. This routine parses a single 820 * rule in the rule description, adding and modifying states in the state 821 * table according to the new expression. The state table is kept deterministic 822 * throughout the whole operation, although some ugly postprocessing is needed 823 * to handle the *? token. 824 */ 825 private void parseRule(String rule, boolean forward) { 826 // algorithm notes: 827 // - The basic idea here is to read successive character-category groups 828 // from the input string. For each group, you create a state and point 829 // the appropriate entries in the previous state to it. This produces a 830 // straight line from the start state to the end state. The {}, *, and (|) 831 // idioms produce branches in this straight line. These branches (states 832 // that can transition to more than one other state) are called "decision 833 // points." A list of decision points is kept. This contains a list of 834 // all states that can transition to the next state to be created. For a 835 // straight line progression, the only thing in the decision-point list is 836 // the current state. But if there's a branch, the decision-point list 837 // will contain all of the beginning points of the branch when the next 838 // state to be created represents the end point of the branch. A stack is 839 // used to save decision point lists in the presence of nested parentheses 840 // and the like. For example, when a { is encountered, the current decision 841 // point list is saved on the stack and restored when the corresponding } 842 // is encountered. This way, after the } is read, the decision point list 843 // will contain both the state right before the } _and_ the state before 844 // the whole {} expression. Both of these states can transition to the next 845 // state after the {} expression. 846 // - one complication arises when we have to stamp a transition value into 847 // an array cell that already contains one. The updateStateTable() and 848 // mergeStates() functions handle this case. Their basic approach is to 849 // create a new state that combines the two states that conflict and point 850 // at it when necessary. This happens recursively, so if the merged states 851 // also conflict, they're resolved in the same way, and so on. There are 852 // a number of tests aimed at preventing infinite recursion. 853 // - another complication arises with repeating characters. It's somewhat 854 // ambiguous whether the user wants a greedy or non-greedy match in these cases. 855 // (e.g., whether "[a-z]*abc" means the SHORTEST sequence of letters ending in 856 // "abc" or the LONGEST sequence of letters ending in "abc". We've adopted 857 // the *? to mean "shortest" and * by itself to mean "longest". (You get the 858 // same result with both if there's no overlap between the repeating character 859 // group and the group immediately following it.) Handling the *? token is 860 // rather complicated and involves keeping track of whether a state needs to 861 // be merged (as described above) or merely overwritten when you update one of 862 // its cells, and copying the contents of a state that loops with a *? token 863 // into some of the states that follow it after the rest of the table-building 864 // process is complete ("backfilling"). 865 // implementation notes: 866 // - This function assumes syntax checking has been performed on the input string 867 // prior to its being passed in here. It assumes that parentheses are 868 // balanced, all literal characters are enclosed in [] and turned into category 869 // numbers, that there are no illegal characters or character sequences, and so 870 // on. Violation of these invariants will lead to undefined behavior. 871 // - It'd probably be better to use linked lists rather than Vector and Stack 872 // to maintain the decision point list and stack. I went for simplicity in 873 // this initial implementation. If performance is critical enough, we can go 874 // back and fix this later. 875 // -There are a number of important limitations on the *? token. It does not work 876 // right when followed by a repeating character sequence (e.g., ".*?(abc)*") 877 // (although it does work right when followed by a single repeating character). 878 // It will not always work right when nested in parentheses or braces (although 879 // sometimes it will). It also will not work right if the group of repeating 880 // characters and the group of characters that follows overlap partially 881 // (e.g., "[a-g]*?[e-j]"). None of these capabilites was deemed necessary for 882 // describing breaking rules we know about, so we left them out for 883 // expeditiousness. 884 // - Rules such as "[a-z]*?abc;" will be treated the same as "[a-z]*?aa*bc;"-- 885 // that is, if the string ends in "aaaabc", the break will go before the first 886 // "a" rather than the last one. Both of these are limitations in the design 887 // of RuleBasedBreakIterator and not limitations of the rule parser. 888 889 int p = 0; 890 int currentState = 1; // don't use state number 0; 0 means "stop" 891 int lastState = currentState; 892 String pendingChars = ""; 893 894 decisionPointStack = new Stack<>(); 895 decisionPointList = new Vector<>(); 896 loopingStates = new Vector<>(); 897 statesToBackfill = new Vector<>(); 898 899 short[] state; 900 boolean sawEarlyBreak = false; 901 902 // if we're adding rules to the backward state table, mark the initial state 903 // as a looping state 904 if (!forward) { 905 loopingStates.addElement(new Integer(1)); 906 } 907 908 // put the current state on the decision point list before we start 909 decisionPointList.addElement(new Integer(currentState)); // we want currentState to 910 // be 1 here... 911 currentState = tempStateTable.size() - 1; // but after that, we want it to be 912 // 1 less than the state number of the next state 913 while (p < rule.length()) { 914 int c = rule.codePointAt(p); 915 clearLoopingStates = false; 916 917 // this section handles literal characters, escaped characters (which are 918 // effectively literal characters too), the . token, and [] expressions 919 if (c == '[' 920 || c == '\\' 921 || Character.isLetter(c) 922 || Character.isDigit(c) 923 || c < ' ' 924 || c == '.' 925 || c >= '\u007f') { 926 927 // if we're not on a period, isolate the expression and look up 928 // the corresponding category list 929 if (c != '.') { 930 int q = p; 931 932 // if we're on a backslash, the expression is the character 933 // after the backslash 934 if (c == '\\') { 935 q = p + 2; 936 ++p; 937 } 938 939 // if we're on an opening bracket, scan to the closing bracket 940 // to isolate the expression 941 else if (c == '[') { 942 int bracketLevel = 1; 943 944 q += Character.charCount(rule.codePointAt(q)); 945 while (bracketLevel > 0) { 946 c = rule.codePointAt(q); 947 if (c == '[') { 948 ++bracketLevel; 949 } 950 else if (c == ']') { 951 --bracketLevel; 952 } 953 else if (c == '\\') { 954 c = rule.codePointAt(++q); 955 } 956 q += Character.charCount(c); 957 } 958 } 959 960 // otherwise, the expression is just the character itself 961 else { 962 q = p + Character.charCount(c); 963 } 964 965 // look up the category list for the expression and store it 966 // in pendingChars 967 pendingChars = (String)expressions.get(rule.substring(p, q)); 968 969 // advance the current position past the expression 970 p = q - Character.charCount(rule.codePointBefore(q)); 971 } 972 973 // if the character we're on is a period, we end up down here 974 else { 975 int rowNum = decisionPointList.lastElement().intValue(); 976 state = tempStateTable.elementAt(rowNum); 977 978 // if the period is followed by an asterisk, then just set the current 979 // state to loop back on itself 980 if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) { 981 decisionPointList.addElement(new Integer(state[0])); 982 pendingChars = ""; 983 ++p; 984 } 985 986 // otherwise, fabricate a category list ("pendingChars") with 987 // every category in it 988 else { 989 StringBuffer temp = new StringBuffer(); 990 for (int i = 0; i < numCategories; i++) 991 temp.append((char)(i + 0x100)); 992 pendingChars = temp.toString(); 993 } 994 } 995 996 // we'll end up in here for all expressions except for .*, which is 997 // special-cased above 998 if (pendingChars.length() != 0) { 999 1000 // if the expression is followed by an asterisk, then push a copy 1001 // of the current desicion point list onto the stack (this is 1002 // the same thing we do on an opening brace) 1003 if (p + 1 < rule.length() && rule.charAt(p + 1) == '*') { 1004 @SuppressWarnings("unchecked") 1005 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); 1006 decisionPointStack.push(clone); 1007 } 1008 1009 // create a new state, add it to the list of states to backfill 1010 // if we have looping states to worry about, set its "don't make 1011 // me an accepting state" flag if we've seen a slash, and add 1012 // it to the end of the state table 1013 int newState = tempStateTable.size(); 1014 if (loopingStates.size() != 0) { 1015 statesToBackfill.addElement(new Integer(newState)); 1016 } 1017 state = new short[numCategories + 1]; 1018 if (sawEarlyBreak) { 1019 state[numCategories] = DONT_LOOP_FLAG; 1020 } 1021 tempStateTable.addElement(state); 1022 1023 // update everybody in the decision point list to point to 1024 // the new state (this also performs all the reconciliation 1025 // needed to make the table deterministic), then clear the 1026 // decision point list 1027 updateStateTable(decisionPointList, pendingChars, (short)newState); 1028 decisionPointList.removeAllElements(); 1029 1030 // add all states created since the last literal character we've 1031 // seen to the decision point list 1032 lastState = currentState; 1033 do { 1034 ++currentState; 1035 decisionPointList.addElement(new Integer(currentState)); 1036 } while (currentState + 1 < tempStateTable.size()); 1037 } 1038 } 1039 1040 // a { marks the beginning of an optional run of characters. Push a 1041 // copy of the current decision point list onto the stack. This saves 1042 // it, preventing it from being affected by whatever's inside the parentheses. 1043 // This decision point list is restored when a } is encountered. 1044 else if (c == '{') { 1045 @SuppressWarnings("unchecked") 1046 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); 1047 decisionPointStack.push(clone); 1048 } 1049 1050 // a } marks the end of an optional run of characters. Pop the last decision 1051 // point list off the stack and merge it with the current decision point list. 1052 // a * denotes a repeating character or group (* after () is handled separately 1053 // below). In addition to restoring the decision point list, modify the 1054 // current state to point to itself on the appropriate character categories. 1055 else if (c == '}' || c == '*') { 1056 // when there's a *, update the current state to loop back on itself 1057 // on the character categories that caused us to enter this state 1058 if (c == '*') { 1059 for (int i = lastState + 1; i < tempStateTable.size(); i++) { 1060 Vector<Integer> temp = new Vector<>(); 1061 temp.addElement(new Integer(i)); 1062 updateStateTable(temp, pendingChars, (short)(lastState + 1)); 1063 } 1064 } 1065 1066 // pop the top element off the decision point stack and merge 1067 // it with the current decision point list (this causes the divergent 1068 // paths through the state table to come together again on the next 1069 // new state) 1070 Vector<Integer> temp = decisionPointStack.pop(); 1071 for (int i = 0; i < decisionPointList.size(); i++) 1072 temp.addElement(decisionPointList.elementAt(i)); 1073 decisionPointList = temp; 1074 } 1075 1076 // a ? after a * modifies the behavior of * in cases where there is overlap 1077 // between the set of characters that repeat and the characters which follow. 1078 // Without the ?, all states following the repeating state, up to a state which 1079 // is reached by a character that doesn't overlap, will loop back into the 1080 // repeating state. With the ?, the mark states following the *? DON'T loop 1081 // back into the repeating state. Thus, "[a-z]*xyz" will match the longest 1082 // sequence of letters that ends in "xyz," while "[a-z]*? will match the 1083 // _shortest_ sequence of letters that ends in "xyz". 1084 // We use extra bookkeeping to achieve this effect, since everything else works 1085 // according to the "longest possible match" principle. The basic principle 1086 // is that transitions out of a looping state are written in over the looping 1087 // value instead of being reconciled, and that we copy the contents of the 1088 // looping state into empty cells of all non-terminal states that follow the 1089 // looping state. 1090 else if (c == '?') { 1091 setLoopingStates(decisionPointList, decisionPointList); 1092 } 1093 1094 // a ( marks the beginning of a sequence of characters. Parentheses can either 1095 // contain several alternative character sequences (i.e., "(ab|cd|ef)"), or 1096 // they can contain a sequence of characters that can repeat (i.e., "(abc)*"). Thus, 1097 // A () group can have multiple entry and exit points. To keep track of this, 1098 // we reserve TWO spots on the decision-point stack. The top of the stack is 1099 // the list of exit points, which becomes the current decision point list when 1100 // the ) is reached. The next entry down is the decision point list at the 1101 // beginning of the (), which becomes the current decision point list at every 1102 // entry point. 1103 // In addition to keeping track of the exit points and the active decision 1104 // points before the ( (i.e., the places from which the () can be entered), 1105 // we need to keep track of the entry points in case the expression loops 1106 // (i.e., is followed by *). We do that by creating a dummy state in the 1107 // state table and adding it to the decision point list (BEFORE it's duplicated 1108 // on the stack). Nobody points to this state, so it'll get optimized out 1109 // at the end. It exists only to hold the entry points in case the () 1110 // expression loops. 1111 else if (c == '(') { 1112 1113 // add a new state to the state table to hold the entry points into 1114 // the () expression 1115 tempStateTable.addElement(new short[numCategories + 1]); 1116 1117 // we have to adjust lastState and currentState to account for the 1118 // new dummy state 1119 lastState = currentState; 1120 ++currentState; 1121 1122 // add the current state to the decision point list (add it at the 1123 // BEGINNING so we can find it later) 1124 decisionPointList.insertElementAt(new Integer(currentState), 0); 1125 1126 // finally, push a copy of the current decision point list onto the 1127 // stack (this keeps track of the active decision point list before 1128 // the () expression), followed by an empty decision point list 1129 // (this will hold the exit points) 1130 @SuppressWarnings("unchecked") 1131 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); 1132 decisionPointStack.push(clone); 1133 decisionPointStack.push(new Vector<Integer>()); 1134 } 1135 1136 // a | separates alternative character sequences in a () expression. When 1137 // a | is encountered, we add the current decision point list to the exit-point 1138 // list, and restore the decision point list to its state prior to the (. 1139 else if (c == '|') { 1140 1141 // pick out the top two decision point lists on the stack 1142 Vector<Integer> oneDown = decisionPointStack.pop(); 1143 Vector<Integer> twoDown = decisionPointStack.peek(); 1144 decisionPointStack.push(oneDown); 1145 1146 // append the current decision point list to the list below it 1147 // on the stack (the list of exit points), and restore the 1148 // current decision point list to its state before the () expression 1149 for (int i = 0; i < decisionPointList.size(); i++) 1150 oneDown.addElement(decisionPointList.elementAt(i)); 1151 @SuppressWarnings("unchecked") 1152 Vector<Integer> clone = (Vector<Integer>)twoDown.clone(); 1153 decisionPointList = clone; 1154 } 1155 1156 // a ) marks the end of a sequence of characters. We do one of two things 1157 // depending on whether the sequence repeats (i.e., whether the ) is followed 1158 // by *): If the sequence doesn't repeat, then the exit-point list is merged 1159 // with the current decision point list and the decision point list from before 1160 // the () is thrown away. If the sequence does repeat, then we fish out the 1161 // state we were in before the ( and copy all of its forward transitions 1162 // (i.e., every transition added by the () expression) into every state in the 1163 // exit-point list and the current decision point list. The current decision 1164 // point list is then merged with both the exit-point list AND the saved version 1165 // of the decision point list from before the (). Then we throw out the *. 1166 else if (c == ')') { 1167 1168 // pull the exit point list off the stack, merge it with the current 1169 // decision point list, and make the merged version the current 1170 // decision point list 1171 Vector<Integer> exitPoints = decisionPointStack.pop(); 1172 for (int i = 0; i < decisionPointList.size(); i++) 1173 exitPoints.addElement(decisionPointList.elementAt(i)); 1174 decisionPointList = exitPoints; 1175 1176 // if the ) isn't followed by a *, then all we have to do is throw 1177 // away the other list on the decision point stack, and we're done 1178 if (p + 1 >= rule.length() || rule.charAt(p + 1) != '*') { 1179 decisionPointStack.pop(); 1180 } 1181 1182 // but if the sequence repeats, we have a lot more work to do... 1183 else { 1184 1185 // now exitPoints and decisionPointList have to point to equivalent 1186 // vectors, but not the SAME vector 1187 @SuppressWarnings("unchecked") 1188 Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); 1189 exitPoints = clone; 1190 1191 // pop the original decision point list off the stack 1192 Vector<Integer> temp = decisionPointStack.pop(); 1193 1194 // we squirreled away the row number of our entry point list 1195 // at the beginning of the original decision point list. Fish 1196 // that state number out and retrieve the entry point list 1197 int tempStateNum = temp.firstElement().intValue(); 1198 short[] tempState = tempStateTable.elementAt(tempStateNum); 1199 1200 // merge the original decision point list with the current 1201 // decision point list 1202 for (int i = 0; i < decisionPointList.size(); i++) 1203 temp.addElement(decisionPointList.elementAt(i)); 1204 decisionPointList = temp; 1205 1206 // finally, copy every forward reference from the entry point 1207 // list into every state in the new decision point list 1208 for (int i = 0; i < tempState.length; i++) { 1209 if (tempState[i] > tempStateNum) { 1210 updateStateTable(exitPoints, 1211 new Character((char)(i + 0x100)).toString(), 1212 tempState[i]); 1213 } 1214 } 1215 1216 // update lastState and currentState, and throw away the * 1217 lastState = currentState; 1218 currentState = tempStateTable.size() - 1; 1219 ++p; 1220 } 1221 } 1222 1223 // a / marks the position where the break is to go if the character sequence 1224 // matches this rule. We update the flag word of every state on the decision 1225 // point list to mark them as ending states, and take note of the fact that 1226 // we've seen the slash 1227 else if (c == '/') { 1228 sawEarlyBreak = true; 1229 for (int i = 0; i < decisionPointList.size(); i++) { 1230 state = tempStateTable.elementAt(decisionPointList. 1231 elementAt(i).intValue()); 1232 state[numCategories] |= LOOKAHEAD_STATE_FLAG; 1233 } 1234 } 1235 1236 // if we get here without executing any of the above clauses, we have a 1237 // syntax error. However, for now we just ignore the offending character 1238 // and move on 1239 1240 // clearLoopingStates is a signal back from updateStateTable() that we've 1241 // transitioned to a state that won't loop back to the current looping 1242 // state. (In other words, we've gotten to a point where we can no longer 1243 // go back into a *? we saw earlier.) Clear out the list of looping states 1244 // and backfill any states that need to be backfilled. 1245 if (clearLoopingStates) { 1246 setLoopingStates(null, decisionPointList); 1247 } 1248 1249 // advance to the next character, now that we've processed the current 1250 // character 1251 p += Character.charCount(c); 1252 } 1253 1254 // this takes care of backfilling any states that still need to be backfilled 1255 setLoopingStates(null, decisionPointList); 1256 1257 // when we reach the end of the string, we do a postprocessing step to mark the 1258 // end states. The decision point list contains every state that can transition 1259 // to the end state-- that is, every state that is the last state in a sequence 1260 // that matches the rule. All of these states are considered "mark states" 1261 // or "accepting states"-- that is, states that cause the position returned from 1262 // next() to be updated. A mark state represents a possible break position. 1263 // This allows us to look ahead and remember how far the rule matched 1264 // before following the new branch (see next() for more information). 1265 // The temporary state table has an extra "flag column" at the end where this 1266 // information is stored. We mark the end states by setting a flag in their 1267 // flag column. 1268 // Now if we saw the / in the rule, then everything after it is lookahead 1269 // material and the break really goes where the slash is. In this case, 1270 // we mark these states as BOTH accepting states and lookahead states. This 1271 // signals that these states cause the break position to be updated to the 1272 // position of the slash rather than the current break position. 1273 for (int i = 0; i < decisionPointList.size(); i++) { 1274 int rowNum = decisionPointList.elementAt(i).intValue(); 1275 state = tempStateTable.elementAt(rowNum); 1276 state[numCategories] |= END_STATE_FLAG; 1277 if (sawEarlyBreak) { 1278 state[numCategories] |= LOOKAHEAD_STATE_FLAG; 1279 } 1280 } 1281 } 1282 1283 1284 /** 1285 * Update entries in the state table, and merge states when necessary to keep 1286 * the table deterministic. 1287 * @param rows The list of rows that need updating (the decision point list) 1288 * @param pendingChars A character category list, encoded in a String. This is the 1289 * list of the columns that need updating. 1290 * @param newValue Update the cells specfied above to contain this value 1291 */ 1292 private void updateStateTable(Vector<Integer> rows, 1293 String pendingChars, 1294 short newValue) { 1295 // create a dummy state that has the specified row number (newValue) in 1296 // the cells that need to be updated (those specified by pendingChars) 1297 // and 0 in the other cells 1298 short[] newValues = new short[numCategories + 1]; 1299 for (int i = 0; i < pendingChars.length(); i++) 1300 newValues[(int)(pendingChars.charAt(i)) - 0x100] = newValue; 1301 1302 // go through the list of rows to update, and update them by calling 1303 // mergeStates() to merge them the the dummy state we created 1304 for (int i = 0; i < rows.size(); i++) { 1305 mergeStates(rows.elementAt(i).intValue(), newValues, rows); 1306 } 1307 } 1308 1309 /** 1310 * The real work of making the state table deterministic happens here. This function 1311 * merges a state in the state table (specified by rowNum) with a state that is 1312 * passed in (newValues). The basic process is to copy the nonzero cells in newStates 1313 * into the state in the state table (we'll call that oldValues). If there's a 1314 * collision (i.e., if the same cell has a nonzero value in both states, and it's 1315 * not the SAME value), then we have to reconcile the collision. We do this by 1316 * creating a new state, adding it to the end of the state table, and using this 1317 * function recursively to merge the original two states into a single, combined 1318 * state. This process may happen recursively (i.e., each successive level may 1319 * involve collisions). To prevent infinite recursion, we keep a log of merge 1320 * operations. Any time we're merging two states we've merged before, we can just 1321 * supply the row number for the result of that merge operation rather than creating 1322 * a new state just like it. 1323 * @param rowNum The row number in the state table of the state to be updated 1324 * @param newValues The state to merge it with. 1325 * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable() 1326 * (itself a copy of the decision point list from parseRule()). Newly-created 1327 * states get added to the decision point list if their "parents" were on it. 1328 */ 1329 private void mergeStates(int rowNum, 1330 short[] newValues, 1331 Vector<Integer> rowsBeingUpdated) { 1332 short[] oldValues = tempStateTable.elementAt(rowNum); 1333 boolean isLoopingState = loopingStates.contains(new Integer(rowNum)); 1334 1335 // for each of the cells in the rows we're reconciling, do... 1336 for (int i = 0; i < oldValues.length; i++) { 1337 1338 // if they contain the same value, we don't have to do anything 1339 if (oldValues[i] == newValues[i]) { 1340 continue; 1341 } 1342 1343 // if oldValues is a looping state and the state the current cell points to 1344 // is too, then we can just stomp over the current value of that cell (and 1345 // set the clear-looping-states flag if necessary) 1346 else if (isLoopingState && loopingStates.contains(new Integer(oldValues[i]))) { 1347 if (newValues[i] != 0) { 1348 if (oldValues[i] == 0) { 1349 clearLoopingStates = true; 1350 } 1351 oldValues[i] = newValues[i]; 1352 } 1353 } 1354 1355 // if the current cell in oldValues is 0, copy in the corresponding value 1356 // from newValues 1357 else if (oldValues[i] == 0) { 1358 oldValues[i] = newValues[i]; 1359 } 1360 1361 // the last column of each row is the flag column. Take care to merge the 1362 // flag words correctly 1363 else if (i == numCategories) { 1364 oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]); 1365 } 1366 1367 // if both newValues and oldValues have a nonzero value in the current 1368 // cell, and it isn't the same value both places... 1369 else if (oldValues[i] != 0 && newValues[i] != 0) { 1370 1371 // look up this pair of cell values in the merge list. If it's 1372 // found, update the cell in oldValues to point to the merged state 1373 int combinedRowNum = searchMergeList(oldValues[i], newValues[i]); 1374 if (combinedRowNum != 0) { 1375 oldValues[i] = (short)combinedRowNum; 1376 } 1377 1378 // otherwise, we have to reconcile them... 1379 else { 1380 // copy our row numbers into variables to make things easier 1381 int oldRowNum = oldValues[i]; 1382 int newRowNum = newValues[i]; 1383 combinedRowNum = tempStateTable.size(); 1384 1385 // add this pair of row numbers to the merge list (create it first 1386 // if we haven't created the merge list yet) 1387 if (mergeList == null) { 1388 mergeList = new Vector<>(); 1389 } 1390 mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum }); 1391 1392 // create a new row to represent the merged state, and copy the 1393 // contents of oldRow into it, then add it to the end of the 1394 // state table and update the original row (oldValues) to point 1395 // to the new, merged, state 1396 short[] newRow = new short[numCategories + 1]; 1397 short[] oldRow = tempStateTable.elementAt(oldRowNum); 1398 System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1); 1399 tempStateTable.addElement(newRow); 1400 oldValues[i] = (short)combinedRowNum; 1401 1402 // if the decision point list contains either of the parent rows, 1403 // update it to include the new row as well 1404 if ((decisionPointList.contains(new Integer(oldRowNum)) 1405 || decisionPointList.contains(new Integer(newRowNum))) 1406 && !decisionPointList.contains(new Integer(combinedRowNum)) 1407 ) { 1408 decisionPointList.addElement(new Integer(combinedRowNum)); 1409 } 1410 1411 // do the same thing with the list of rows being updated 1412 if ((rowsBeingUpdated.contains(new Integer(oldRowNum)) 1413 || rowsBeingUpdated.contains(new Integer(newRowNum))) 1414 && !rowsBeingUpdated.contains(new Integer(combinedRowNum)) 1415 ) { 1416 decisionPointList.addElement(new Integer(combinedRowNum)); 1417 } 1418 // now (groan) do the same thing for all the entries on the 1419 // decision point stack 1420 for (int k = 0; k < decisionPointStack.size(); k++) { 1421 Vector<Integer> dpl = decisionPointStack.elementAt(k); 1422 if ((dpl.contains(new Integer(oldRowNum)) 1423 || dpl.contains(new Integer(newRowNum))) 1424 && !dpl.contains(new Integer(combinedRowNum)) 1425 ) { 1426 dpl.addElement(new Integer(combinedRowNum)); 1427 } 1428 } 1429 1430 // FINALLY (puff puff puff), call mergeStates() recursively to copy 1431 // the row referred to by newValues into the new row and resolve any 1432 // conflicts that come up at that level 1433 mergeStates(combinedRowNum, tempStateTable.elementAt( 1434 newValues[i]), rowsBeingUpdated); 1435 } 1436 } 1437 } 1438 return; 1439 } 1440 1441 /** 1442 * The merge list is a list of pairs of rows that have been merged somewhere in 1443 * the process of building this state table, along with the row number of the 1444 * row containing the merged state. This function looks up a pair of row numbers 1445 * and returns the row number of the row they combine into. (It returns 0 if 1446 * this pair of rows isn't in the merge list.) 1447 */ 1448 private int searchMergeList(int a, int b) { 1449 // if there is no merge list, there obviously isn't anything in it 1450 if (mergeList == null) { 1451 return 0; 1452 } 1453 1454 // otherwise, for each element in the merge list... 1455 else { 1456 int[] entry; 1457 for (int i = 0; i < mergeList.size(); i++) { 1458 entry = mergeList.elementAt(i); 1459 1460 // we have a hit if the two row numbers match the two row numbers 1461 // in the beginning of the entry (the two that combine), in either 1462 // order 1463 if ((entry[0] == a && entry[1] == b) || (entry[0] == b && entry[1] == a)) { 1464 return entry[2]; 1465 } 1466 1467 // we also have a hit if one of the two row numbers matches the marged 1468 // row number and the other one matches one of the original row numbers 1469 if ((entry[2] == a && (entry[0] == b || entry[1] == b))) { 1470 return entry[2]; 1471 } 1472 if ((entry[2] == b && (entry[0] == a || entry[1] == a))) { 1473 return entry[2]; 1474 } 1475 } 1476 return 0; 1477 } 1478 } 1479 1480 /** 1481 * This function is used to update the list of current loooping states (i.e., 1482 * states that are controlled by a *? construct). It backfills values from 1483 * the looping states into unpopulated cells of the states that are currently 1484 * marked for backfilling, and then updates the list of looping states to be 1485 * the new list 1486 * @param newLoopingStates The list of new looping states 1487 * @param endStates The list of states to treat as end states (states that 1488 * can exit the loop). 1489 */ 1490 private void setLoopingStates(Vector<Integer> newLoopingStates, 1491 Vector<Integer> endStates) { 1492 1493 // if the current list of looping states isn't empty, we have to backfill 1494 // values from the looping states into the states that are waiting to be 1495 // backfilled 1496 if (!loopingStates.isEmpty()) { 1497 int loopingState = loopingStates.lastElement().intValue(); 1498 int rowNum; 1499 1500 // don't backfill into an end state OR any state reachable from an end state 1501 // (since the search for reachable states is recursive, it's split out into 1502 // a separate function, eliminateBackfillStates(), below) 1503 for (int i = 0; i < endStates.size(); i++) { 1504 eliminateBackfillStates(endStates.elementAt(i).intValue()); 1505 } 1506 1507 // we DON'T actually backfill the states that need to be backfilled here. 1508 // Instead, we MARK them for backfilling. The reason for this is that if 1509 // there are multiple rules in the state-table description, the looping 1510 // states may have some of their values changed by a succeeding rule, and 1511 // this wouldn't be reflected in the backfilled states. We mark a state 1512 // for backfilling by putting the row number of the state to copy from 1513 // into the flag cell at the end of the row 1514 for (int i = 0; i < statesToBackfill.size(); i++) { 1515 rowNum = statesToBackfill.elementAt(i).intValue(); 1516 short[] state = tempStateTable.elementAt(rowNum); 1517 state[numCategories] = 1518 (short)((state[numCategories] & ALL_FLAGS) | loopingState); 1519 } 1520 statesToBackfill.removeAllElements(); 1521 loopingStates.removeAllElements(); 1522 } 1523 1524 if (newLoopingStates != null) { 1525 @SuppressWarnings("unchecked") 1526 Vector<Integer> clone = (Vector<Integer>)newLoopingStates.clone(); 1527 loopingStates = clone; 1528 } 1529 } 1530 1531 /** 1532 * This removes "ending states" and states reachable from them from the 1533 * list of states to backfill. 1534 * @param The row number of the state to remove from the backfill list 1535 */ 1536 private void eliminateBackfillStates(int baseState) { 1537 1538 // don't do anything unless this state is actually in the backfill list... 1539 if (statesToBackfill.contains(new Integer(baseState))) { 1540 1541 // if it is, take it out 1542 statesToBackfill.removeElement(new Integer(baseState)); 1543 1544 // then go through and recursively call this function for every 1545 // state that the base state points to 1546 short[] state = tempStateTable.elementAt(baseState); 1547 for (int i = 0; i < numCategories; i++) { 1548 if (state[i] != 0) { 1549 eliminateBackfillStates(state[i]); 1550 } 1551 } 1552 } 1553 } 1554 1555 /** 1556 * This function completes the backfilling process by actually doing the 1557 * backfilling on the states that are marked for it 1558 */ 1559 private void backfillLoopingStates() { 1560 short[] state; 1561 short[] loopingState = null; 1562 int loopingStateRowNum = 0; 1563 int fromState; 1564 1565 // for each state in the state table... 1566 for (int i = 0; i < tempStateTable.size(); i++) { 1567 state = tempStateTable.elementAt(i); 1568 1569 // check the state's flag word to see if it's marked for backfilling 1570 // (it's marked for backfilling if any bits other than the two high-order 1571 // bits are set-- if they are, then the flag word, minus the two high bits, 1572 // is the row number to copy from) 1573 fromState = state[numCategories] & ~ALL_FLAGS; 1574 if (fromState > 0) { 1575 1576 // load up the state to copy from (if we haven't already) 1577 if (fromState != loopingStateRowNum) { 1578 loopingStateRowNum = fromState; 1579 loopingState = tempStateTable.elementAt(loopingStateRowNum); 1580 } 1581 1582 // clear out the backfill part of the flag word 1583 state[numCategories] &= ALL_FLAGS; 1584 1585 // then fill all zero cells in the current state with values 1586 // from the corresponding cells of the fromState 1587 for (int j = 0; j < state.length; j++) { 1588 if (state[j] == 0) { 1589 state[j] = loopingState[j]; 1590 } 1591 else if (state[j] == DONT_LOOP_FLAG) { 1592 state[j] = 0; 1593 } 1594 } 1595 } 1596 } 1597 } 1598 1599 /** 1600 * This function completes the state-table-building process by doing several 1601 * postprocessing steps and copying everything into its final resting place 1602 * in the iterator itself 1603 * @param forward True if we're working on the forward state table 1604 */ 1605 private void finishBuildingStateTable(boolean forward) { 1606 // start by backfilling the looping states 1607 backfillLoopingStates(); 1608 1609 int[] rowNumMap = new int[tempStateTable.size()]; 1610 Stack<Integer> rowsToFollow = new Stack<>(); 1611 rowsToFollow.push(new Integer(1)); 1612 rowNumMap[1] = 1; 1613 1614 // determine which states are no longer reachable from the start state 1615 // (the reachable states will have their row numbers in the row number 1616 // map, and the nonreachable states will have zero in the row number map) 1617 while (rowsToFollow.size() != 0) { 1618 int rowNum = rowsToFollow.pop().intValue(); 1619 short[] row = tempStateTable.elementAt(rowNum); 1620 1621 for (int i = 0; i < numCategories; i++) { 1622 if (row[i] != 0) { 1623 if (rowNumMap[row[i]] == 0) { 1624 rowNumMap[row[i]] = row[i]; 1625 rowsToFollow.push(new Integer(row[i])); 1626 } 1627 } 1628 } 1629 } 1630 1631 boolean madeChange; 1632 int newRowNum; 1633 1634 // algorithm for minimizing the number of states in the table adapted from 1635 // Aho & Ullman, "Principles of Compiler Design" 1636 // The basic idea here is to organize the states into classes. When we're done, 1637 // all states in the same class can be considered identical and all but one eliminated. 1638 1639 // initially assign states to classes based on the number of populated cells they 1640 // contain (the class number is the number of populated cells) 1641 int[] stateClasses = new int[tempStateTable.size()]; 1642 int nextClass = numCategories + 1; 1643 short[] state1, state2; 1644 for (int i = 1; i < stateClasses.length; i++) { 1645 if (rowNumMap[i] == 0) { 1646 continue; 1647 } 1648 state1 = tempStateTable.elementAt(i); 1649 for (int j = 0; j < numCategories; j++) { 1650 if (state1[j] != 0) { 1651 ++stateClasses[i]; 1652 } 1653 } 1654 if (stateClasses[i] == 0) { 1655 stateClasses[i] = nextClass; 1656 } 1657 } 1658 ++nextClass; 1659 1660 // then, for each class, elect the first member of that class as that class's 1661 // "representative". For each member of the class, compare it to the "representative." 1662 // If there's a column position where the state being tested transitions to a 1663 // state in a DIFFERENT class from the class where the "representative" transitions, 1664 // then move the state into a new class. Repeat this process until no new classes 1665 // are created. 1666 int currentClass; 1667 int lastClass; 1668 boolean split; 1669 1670 do { 1671 currentClass = 1; 1672 lastClass = nextClass; 1673 while (currentClass < nextClass) { 1674 split = false; 1675 state1 = state2 = null; 1676 for (int i = 0; i < stateClasses.length; i++) { 1677 if (stateClasses[i] == currentClass) { 1678 if (state1 == null) { 1679 state1 = tempStateTable.elementAt(i); 1680 } 1681 else { 1682 state2 = tempStateTable.elementAt(i); 1683 for (int j = 0; j < state2.length; j++) { 1684 if ((j == numCategories && state1[j] != state2[j] && forward) 1685 || (j != numCategories && stateClasses[state1[j]] 1686 != stateClasses[state2[j]])) { 1687 stateClasses[i] = nextClass; 1688 split = true; 1689 break; 1690 } 1691 } 1692 } 1693 } 1694 } 1695 if (split) { 1696 ++nextClass; 1697 } 1698 ++currentClass; 1699 } 1700 } while (lastClass != nextClass); 1701 1702 // at this point, all of the states in a class except the first one (the 1703 //"representative") can be eliminated, so update the row-number map accordingly 1704 int[] representatives = new int[nextClass]; 1705 for (int i = 1; i < stateClasses.length; i++) 1706 if (representatives[stateClasses[i]] == 0) { 1707 representatives[stateClasses[i]] = i; 1708 } 1709 else { 1710 rowNumMap[i] = representatives[stateClasses[i]]; 1711 } 1712 1713 // renumber all remaining rows... 1714 // first drop all that are either unreferenced or not a class representative 1715 for (int i = 1; i < rowNumMap.length; i++) { 1716 if (rowNumMap[i] != i) { 1717 tempStateTable.setElementAt(null, i); 1718 } 1719 } 1720 1721 // then calculate everybody's new row number and update the row 1722 // number map appropriately (the first pass updates the row numbers 1723 // of all the class representatives [the rows we're keeping], and the 1724 // second pass updates the cross references for all the rows that 1725 // are being deleted) 1726 newRowNum = 1; 1727 for (int i = 1; i < rowNumMap.length; i++) { 1728 if (tempStateTable.elementAt(i) != null) { 1729 rowNumMap[i] = newRowNum++; 1730 } 1731 } 1732 for (int i = 1; i < rowNumMap.length; i++) { 1733 if (tempStateTable.elementAt(i) == null) { 1734 rowNumMap[i] = rowNumMap[rowNumMap[i]]; 1735 } 1736 } 1737 1738 // allocate the permanent state table, and copy the remaining rows into it 1739 // (adjusting all the cell values, of course) 1740 1741 // this section does that for the forward state table 1742 if (forward) { 1743 endStates = new boolean[newRowNum]; 1744 lookaheadStates = new boolean[newRowNum]; 1745 stateTable = new short[newRowNum * numCategories]; 1746 int p = 0; 1747 int p2 = 0; 1748 for (int i = 0; i < tempStateTable.size(); i++) { 1749 short[] row = tempStateTable.elementAt(i); 1750 if (row == null) { 1751 continue; 1752 } 1753 for (int j = 0; j < numCategories; j++) { 1754 stateTable[p] = (short)(rowNumMap[row[j]]); 1755 ++p; 1756 } 1757 endStates[p2] = ((row[numCategories] & END_STATE_FLAG) != 0); 1758 lookaheadStates[p2] = ((row[numCategories] & LOOKAHEAD_STATE_FLAG) != 0); 1759 ++p2; 1760 } 1761 } 1762 1763 // and this section does it for the backward state table 1764 else { 1765 backwardsStateTable = new short[newRowNum * numCategories]; 1766 int p = 0; 1767 for (int i = 0; i < tempStateTable.size(); i++) { 1768 short[] row = tempStateTable.elementAt(i); 1769 if (row == null) { 1770 continue; 1771 } 1772 for (int j = 0; j < numCategories; j++) { 1773 backwardsStateTable[p] = (short)(rowNumMap[row[j]]); 1774 ++p; 1775 } 1776 } 1777 } 1778 } 1779 1780 /** 1781 * This function builds the backward state table from the forward state 1782 * table and any additional rules (identified by the ! on the front) 1783 * supplied in the description 1784 */ 1785 private void buildBackwardsStateTable(Vector<String> tempRuleList) { 1786 1787 // create the temporary state table and seed it with two rows (row 0 1788 // isn't used for anything, and we have to create row 1 (the initial 1789 // state) before we can do anything else 1790 tempStateTable = new Vector<>(); 1791 tempStateTable.addElement(new short[numCategories + 1]); 1792 tempStateTable.addElement(new short[numCategories + 1]); 1793 1794 // although the backwards state table is built automatically from the forward 1795 // state table, there are some situations (the default sentence-break rules, 1796 // for example) where this doesn't yield enough stop states, causing a dramatic 1797 // drop in performance. To help with these cases, the user may supply 1798 // supplemental rules that are added to the backward state table. These have 1799 // the same syntax as the normal break rules, but begin with '!' to distinguish 1800 // them from normal break rules 1801 for (int i = 0; i < tempRuleList.size(); i++) { 1802 String rule = tempRuleList.elementAt(i); 1803 if (rule.charAt(0) == '!') { 1804 parseRule(rule.substring(1), false); 1805 } 1806 } 1807 backfillLoopingStates(); 1808 1809 // Backwards iteration is qualitatively different from forwards iteration. 1810 // This is because backwards iteration has to be made to operate from no context 1811 // at all-- the user should be able to ask BreakIterator for the break position 1812 // immediately on either side of some arbitrary offset in the text. The 1813 // forward iteration table doesn't let us do that-- it assumes complete 1814 // information on the context, which means starting from the beginning of the 1815 // document. 1816 // The way we do backward and random-access iteration is to back up from the 1817 // current (or user-specified) position until we see something we're sure is 1818 // a break position (it may not be the last break position immediately 1819 // preceding our starting point, however). Then we roll forward from there to 1820 // locate the actual break position we're after. 1821 // This means that the backwards state table doesn't have to identify every 1822 // break position, allowing the building algorithm to be much simpler. Here, 1823 // we use a "pairs" approach, scanning the forward-iteration state table for 1824 // pairs of character categories we ALWAYS break between, and building a state 1825 // table from that information. No context is required-- all this state table 1826 // looks at is a pair of adjacent characters. 1827 1828 // It's possible that the user has supplied supplementary rules (see above). 1829 // This has to be done first to keep parseRule() and friends from becoming 1830 // EVEN MORE complicated. The automatically-generated states are appended 1831 // onto the end of the state table, and then the two sets of rules are 1832 // stitched together at the end. Take note of the row number of the 1833 // first row of the auromatically-generated part. 1834 int backTableOffset = tempStateTable.size(); 1835 if (backTableOffset > 2) { 1836 ++backTableOffset; 1837 } 1838 1839 // the automatically-generated part of the table models a two-dimensional 1840 // array where the two dimensions represent the two characters we're currently 1841 // looking at. To model this as a state table, we actually need one additional 1842 // row to represent the initial state. It gets populated with the row numbers 1843 // of the other rows (in order). 1844 for (int i = 0; i < numCategories + 1; i++) 1845 tempStateTable.addElement(new short[numCategories + 1]); 1846 1847 short[] state = tempStateTable.elementAt(backTableOffset - 1); 1848 for (int i = 0; i < numCategories; i++) 1849 state[i] = (short)(i + backTableOffset); 1850 1851 // scavenge the forward state table for pairs of character categories 1852 // that always have a break between them. The algorithm is as follows: 1853 // Look down each column in the state table. For each nonzero cell in 1854 // that column, look up the row it points to. For each nonzero cell in 1855 // that row, populate a cell in the backwards state table: the row number 1856 // of that cell is the number of the column we were scanning (plus the 1857 // offset that locates this sub-table), and the column number of that cell 1858 // is the column number of the nonzero cell we just found. This cell is 1859 // populated with its own column number (adjusted according to the actual 1860 // location of the sub-table). This process will produce a state table 1861 // whose behavior is the same as looking up successive pairs of characters 1862 // in an array of Booleans to determine whether there is a break. 1863 int numRows = stateTable.length / numCategories; 1864 for (int column = 0; column < numCategories; column++) { 1865 for (int row = 0; row < numRows; row++) { 1866 int nextRow = lookupState(row, column); 1867 if (nextRow != 0) { 1868 for (int nextColumn = 0; nextColumn < numCategories; nextColumn++) { 1869 int cellValue = lookupState(nextRow, nextColumn); 1870 if (cellValue != 0) { 1871 state = tempStateTable.elementAt(nextColumn + 1872 backTableOffset); 1873 state[column] = (short)(column + backTableOffset); 1874 } 1875 } 1876 } 1877 } 1878 } 1879 1880 // if the user specified some backward-iteration rules with the ! token, 1881 // we have to merge the resulting state table with the auto-generated one 1882 // above. First copy the populated cells from row 1 over the populated 1883 // cells in the auto-generated table. Then copy values from row 1 of the 1884 // auto-generated table into all of the the unpopulated cells of the 1885 // rule-based table. 1886 if (backTableOffset > 1) { 1887 1888 // for every row in the auto-generated sub-table, if a cell is 1889 // populated that is also populated in row 1 of the rule-based 1890 // sub-table, copy the value from row 1 over the value in the 1891 // auto-generated sub-table 1892 state = tempStateTable.elementAt(1); 1893 for (int i = backTableOffset - 1; i < tempStateTable.size(); i++) { 1894 short[] state2 = tempStateTable.elementAt(i); 1895 for (int j = 0; j < numCategories; j++) { 1896 if (state[j] != 0 && state2[j] != 0) { 1897 state2[j] = state[j]; 1898 } 1899 } 1900 } 1901 1902 // now, for every row in the rule-based sub-table that is not 1903 // an end state, fill in all unpopulated cells with the values 1904 // of the corresponding cells in the first row of the auto- 1905 // generated sub-table. 1906 state = tempStateTable.elementAt(backTableOffset - 1); 1907 for (int i = 1; i < backTableOffset - 1; i++) { 1908 short[] state2 = tempStateTable.elementAt(i); 1909 if ((state2[numCategories] & END_STATE_FLAG) == 0) { 1910 for (int j = 0; j < numCategories; j++) { 1911 if (state2[j] == 0) { 1912 state2[j] = state[j]; 1913 } 1914 } 1915 } 1916 } 1917 } 1918 1919 // finally, clean everything up and copy it into the actual BreakIterator 1920 // by calling finishBuildingStateTable() 1921 finishBuildingStateTable(false); 1922 } 1923 1924 /** 1925 * Given a current state and a character category, looks up the 1926 * next state to transition to in the state table. 1927 */ 1928 protected int lookupState(int state, int category) { 1929 return stateTable[state * numCategories + category]; 1930 } 1931 1932 /** 1933 * Throws an IllegalArgumentException representing a syntax error in the rule 1934 * description. The exception's message contains some debugging information. 1935 * @param message A message describing the problem 1936 * @param position The position in the description where the problem was 1937 * discovered 1938 * @param context The string containing the error 1939 */ 1940 protected void error(String message, int position, String context) { 1941 throw new IllegalArgumentException("Parse error at position (" + position + "): " + message + "\n" + 1942 context.substring(0, position) + " -here- " + context.substring(position)); 1943 } 1944 1945 void makeFile(String filename) { 1946 writeTables(filename); 1947 } 1948 1949 /** 1950 * Magic number for the BreakIterator data file format. 1951 */ 1952 private static final byte[] LABEL = { 1953 (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a', 1954 (byte)'\0' 1955 }; 1956 1957 /** 1958 * Version number of the dictionary that was read in. 1959 */ 1960 private static final byte[] supportedVersion = { (byte)1 }; 1961 1962 /** 1963 * Header size in byte count 1964 */ 1965 private static final int HEADER_LENGTH = 36; 1966 1967 /** 1968 * Array length of indices for BMP characters 1969 */ 1970 private static final int BMP_INDICES_LENGTH = 512; 1971 1972 /** 1973 * Read datafile. The datafile's format is as follows: 1974 * <pre> 1975 * BreakIteratorData { 1976 * u1 magic[7]; 1977 * u1 version; 1978 * u4 totalDataSize; 1979 * header_info header; 1980 * body value; 1981 * } 1982 * </pre> 1983 * <code>totalDataSize</code> is the summation of the size of 1984 * <code>header_info</code> and <code>body</code> in byte count. 1985 * <p> 1986 * In <code>header</code>, each field except for checksum implies the 1987 * length of each field. Since <code>BMPdataLength</code> is a fixed-length 1988 * data(512 entries), its length isn't included in <code>header</code>. 1989 * <code>checksum</code> is a CRC32 value of all in <code>body</code>. 1990 * <pre> 1991 * header_info { 1992 * u4 stateTableLength; 1993 * u4 backwardsStateTableLength; 1994 * u4 endStatesLength; 1995 * u4 lookaheadStatesLength; 1996 * u4 BMPdataLength; 1997 * u4 nonBMPdataLength; 1998 * u4 additionalDataLength; 1999 * u8 checksum; 2000 * } 2001 * </pre> 2002 * <p> 2003 * 2004 * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to 2005 * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to 2006 * <code>supplementaryCharCategoryTable</code>. 2007 * <pre> 2008 * body { 2009 * u2 stateTable[stateTableLength]; 2010 * u2 backwardsStateTable[backwardsStateTableLength]; 2011 * u1 endStates[endStatesLength]; 2012 * u1 lookaheadStates[lookaheadStatesLength]; 2013 * u2 BMPindices[512]; 2014 * u1 BMPdata[BMPdataLength]; 2015 * u4 nonBMPdata[numNonBMPdataLength]; 2016 * u1 additionalData[additionalDataLength]; 2017 * } 2018 * </pre> 2019 */ 2020 protected void writeTables(String datafile) { 2021 final String filename; 2022 final String outputDir; 2023 String tmpbuf = GenerateBreakIteratorData.getOutputDirectory(); 2024 2025 if (tmpbuf.equals("")) { 2026 filename = datafile; 2027 outputDir = ""; 2028 } else { 2029 char sep = File.separatorChar; 2030 if (sep == '/') { 2031 outputDir = tmpbuf; 2032 } else if (sep == '\\') { 2033 outputDir = tmpbuf.replaceAll("/", "\\\\"); 2034 } else { 2035 outputDir = tmpbuf.replaceAll("/", String.valueOf(sep)); 2036 } 2037 2038 filename = outputDir + sep + datafile; 2039 } 2040 2041 try { 2042 if (!outputDir.equals("")) { 2043 new File(outputDir).mkdirs(); 2044 } 2045 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filename)); 2046 2047 byte[] BMPdata = charCategoryTable.getStringArray(); 2048 short[] BMPindices = charCategoryTable.getIndexArray(); 2049 int[] nonBMPdata = supplementaryCharCategoryTable.getArray(); 2050 2051 if (BMPdata.length <= 0) { 2052 throw new InternalError("Wrong BMP data length(" + BMPdata.length + ")"); 2053 } 2054 if (BMPindices.length != BMP_INDICES_LENGTH) { 2055 throw new InternalError("Wrong BMP indices length(" + BMPindices.length + ")"); 2056 } 2057 if (nonBMPdata.length <= 0) { 2058 throw new InternalError("Wrong non-BMP data length(" + nonBMPdata.length + ")"); 2059 } 2060 2061 int len; 2062 2063 /* Compute checksum */ 2064 CRC32 crc32 = new CRC32(); 2065 len = stateTable.length; 2066 for (int i = 0; i < len; i++) { 2067 crc32.update(stateTable[i]); 2068 } 2069 len = backwardsStateTable.length; 2070 for (int i = 0; i < len; i++) { 2071 crc32.update(backwardsStateTable[i]); 2072 } 2073 crc32.update(toByteArray(endStates)); 2074 crc32.update(toByteArray(lookaheadStates)); 2075 for (int i = 0; i < BMP_INDICES_LENGTH; i++) { 2076 crc32.update(BMPindices[i]); 2077 } 2078 crc32.update(BMPdata); 2079 len = nonBMPdata.length; 2080 for (int i = 0; i < len; i++) { 2081 crc32.update(nonBMPdata[i]); 2082 } 2083 if (additionalData != null) { 2084 len = additionalData.length; 2085 for (int i = 0; i < len; i++) { 2086 crc32.update(additionalData[i]); 2087 } 2088 } 2089 2090 /* First, write magic, version, and totalDataSize. */ 2091 len = HEADER_LENGTH + 2092 (stateTable.length + backwardsStateTable.length) * 2 + 2093 endStates.length + lookaheadStates.length + 1024 + 2094 BMPdata.length + nonBMPdata.length * 4 + 2095 ((additionalData == null) ? 0 : additionalData.length); 2096 out.write(LABEL); 2097 out.write(supportedVersion); 2098 out.write(toByteArray(len)); 2099 2100 /* Write header_info. */ 2101 out.write(toByteArray(stateTable.length)); 2102 out.write(toByteArray(backwardsStateTable.length)); 2103 out.write(toByteArray(endStates.length)); 2104 out.write(toByteArray(lookaheadStates.length)); 2105 out.write(toByteArray(BMPdata.length)); 2106 out.write(toByteArray(nonBMPdata.length)); 2107 if (additionalData == null) { 2108 out.write(toByteArray(0)); 2109 } else { 2110 out.write(toByteArray(additionalData.length)); 2111 } 2112 out.write(toByteArray(crc32.getValue())); 2113 2114 /* Write stateTable[numCategories * numRows] */ 2115 len = stateTable.length; 2116 for (int i = 0; i < len; i++) { 2117 out.write(toByteArray(stateTable[i])); 2118 } 2119 2120 /* Write backwardsStateTable[numCategories * numRows] */ 2121 len = backwardsStateTable.length; 2122 for (int i = 0; i < len; i++) { 2123 out.write(toByteArray(backwardsStateTable[i])); 2124 } 2125 2126 /* Write endStates[numRows] */ 2127 out.write(toByteArray(endStates)); 2128 2129 /* Write lookaheadStates[numRows] */ 2130 out.write(toByteArray(lookaheadStates)); 2131 2132 for (int i = 0; i < BMP_INDICES_LENGTH; i++) { 2133 out.write(toByteArray(BMPindices[i])); 2134 } 2135 BMPindices = null; 2136 out.write(BMPdata); 2137 BMPdata = null; 2138 2139 /* Write a category table for non-BMP characters. */ 2140 len = nonBMPdata.length; 2141 for (int i = 0; i < len; i++) { 2142 out.write(toByteArray(nonBMPdata[i])); 2143 } 2144 nonBMPdata = null; 2145 2146 /* Write additional data */ 2147 if (additionalData != null) { 2148 out.write(additionalData); 2149 } 2150 2151 out.close(); 2152 } 2153 catch (Exception e) { 2154 throw new InternalError(e.toString()); 2155 } 2156 } 2157 2158 byte[] toByteArray(short val) { 2159 byte[] buf = new byte[2]; 2160 buf[0] = (byte)((val>>>8) & 0xFF); 2161 buf[1] = (byte)(val & 0xFF); 2162 return buf; 2163 } 2164 2165 byte[] toByteArray(int val) { 2166 byte[] buf = new byte[4]; 2167 buf[0] = (byte)((val>>>24) & 0xFF); 2168 buf[1] = (byte)((val>>>16) & 0xFF); 2169 buf[2] = (byte)((val>>>8) & 0xFF); 2170 buf[3] = (byte)(val & 0xFF); 2171 return buf; 2172 } 2173 2174 byte[] toByteArray(long val) { 2175 byte[] buf = new byte[8]; 2176 buf[0] = (byte)((val>>>56) & 0xff); 2177 buf[1] = (byte)((val>>>48) & 0xff); 2178 buf[2] = (byte)((val>>>40) & 0xff); 2179 buf[3] = (byte)((val>>>32) & 0xff); 2180 buf[4] = (byte)((val>>>24) & 0xff); 2181 buf[5] = (byte)((val>>>16) & 0xff); 2182 buf[6] = (byte)((val>>>8) & 0xff); 2183 buf[7] = (byte)(val & 0xff); 2184 return buf; 2185 } 2186 2187 byte[] toByteArray(boolean[] data) { 2188 byte[] buf = new byte[data.length]; 2189 for (int i = 0; i < data.length; i++) { 2190 buf[i] = data[i] ? (byte)1 : (byte)0; 2191 } 2192 return buf; 2193 } 2194 2195 void setAdditionalData(byte[] data) { 2196 additionalData = data; 2197 } 2198} 2199