1/* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26/* 27 ****************************************************************************** 28 * 29 * Copyright (C) 2009-2014, International Business Machines 30 * Corporation and others. All Rights Reserved. 31 * 32 ****************************************************************************** 33 */ 34 35package sun.text.normalizer; 36 37import java.util.ArrayList; 38 39import sun.text.normalizer.UnicodeSet.SpanCondition; 40 41/* 42 * Implement span() etc. for a set with strings. 43 * Avoid recursion because of its exponential complexity. 44 * Instead, try multiple paths at once and track them with an IndexList. 45 */ 46class UnicodeSetStringSpan { 47 48 /* 49 * Which span() variant will be used? The object is either built for one variant and used once, 50 * or built for all and may be used many times. 51 */ 52 public static final int WITH_COUNT = 0x40; // spanAndCount() may be called 53 public static final int FWD = 0x20; 54 public static final int BACK = 0x10; 55 // public static final int UTF16 = 8; 56 public static final int CONTAINED = 2; 57 public static final int NOT_CONTAINED = 1; 58 59 public static final int ALL = 0x7f; 60 61 public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; 62 public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; 63 public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; 64 public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; 65 66 /** 67 * Special spanLength short values. (since Java has not unsigned byte type) 68 * All code points in the string are contained in the parent set. 69 */ 70 static final short ALL_CP_CONTAINED = 0xff; 71 72 /** The spanLength is >=0xfe. */ 73 static final short LONG_SPAN = ALL_CP_CONTAINED - 1; 74 75 /** Set for span(). Same as parent but without strings. */ 76 private UnicodeSet spanSet; 77 78 /** 79 * Set for span(not contained). 80 * Same as spanSet, plus characters that start or end strings. 81 */ 82 private UnicodeSet spanNotSet; 83 84 /** The strings of the parent set. */ 85 private ArrayList<String> strings; 86 87 /** The lengths of span(), spanBack() etc. for each string. */ 88 private short[] spanLengths; 89 90 /** Maximum lengths of relevant strings. */ 91 private int maxLength16; 92 93 /** Are there strings that are not fully contained in the code point set? */ 94 private boolean someRelevant; 95 96 /** Set up for all variants of span()? */ 97 private boolean all; 98 99 /** Span helper */ 100 private OffsetList offsets; 101 102 /** 103 * Constructs for all variants of span(), or only for any one variant. 104 * Initializes as little as possible, for single use. 105 */ 106 public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { 107 spanSet = new UnicodeSet(0, 0x10ffff); 108 // TODO: With Java 6, just take the parent set's strings as is, 109 // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. 110 // Then iterate via the first() and higher() methods. 111 // (We do not want to create multiple Iterator objects in each span().) 112 // See ICU ticket #7454. 113 strings = setStrings; 114 all = (which == ALL); 115 spanSet.retainAll(set); 116 if (0 != (which & NOT_CONTAINED)) { 117 // Default to the same sets. 118 // addToSpanNotSet() will create a separate set if necessary. 119 spanNotSet = spanSet; 120 } 121 offsets = new OffsetList(); 122 123 // Determine if the strings even need to be taken into account at all for span() etc. 124 // If any string is relevant, then all strings need to be used for 125 // span(longest match) but only the relevant ones for span(while contained). 126 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 127 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 128 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 129 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 130 int stringsLength = strings.size(); 131 132 int i, spanLength; 133 someRelevant = false; 134 for (i = 0; i < stringsLength; ++i) { 135 String string = strings.get(i); 136 int length16 = string.length(); 137 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 138 if (spanLength < length16) { // Relevant string. 139 someRelevant = true; 140 } 141 if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { 142 maxLength16 = length16; 143 } 144 } 145 if (!someRelevant && (which & WITH_COUNT) == 0) { 146 return; 147 } 148 149 // Freeze after checking for the need to use strings at all because freezing 150 // a set takes some time and memory which are wasted if there are no relevant strings. 151 if (all) { 152 spanSet.freeze(); 153 } 154 155 int spanBackLengthsOffset; 156 157 // Allocate a block of meta data. 158 int allocSize; 159 if (all) { 160 // 2 sets of span lengths 161 allocSize = stringsLength * (2); 162 } else { 163 allocSize = stringsLength; // One set of span lengths. 164 } 165 spanLengths = new short[allocSize]; 166 167 if (all) { 168 // Store span lengths for all span() variants. 169 spanBackLengthsOffset = stringsLength; 170 } else { 171 // Store span lengths for only one span() variant. 172 spanBackLengthsOffset = 0; 173 } 174 175 // Set the meta data and spanNotSet and write the UTF-8 strings. 176 177 for (i = 0; i < stringsLength; ++i) { 178 String string = strings.get(i); 179 int length16 = string.length(); 180 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 181 if (spanLength < length16) { // Relevant string. 182 if (true /* 0 != (which & UTF16) */) { 183 if (0 != (which & CONTAINED)) { 184 if (0 != (which & FWD)) { 185 spanLengths[i] = makeSpanLengthByte(spanLength); 186 } 187 if (0 != (which & BACK)) { 188 spanLength = length16 189 - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); 190 spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); 191 } 192 } else /* not CONTAINED, not all, but NOT_CONTAINED */{ 193 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant 194 // flag. 195 } 196 } 197 if (0 != (which & NOT_CONTAINED)) { 198 // Add string start and end code points to the spanNotSet so that 199 // a span(while not contained) stops before any string. 200 int c; 201 if (0 != (which & FWD)) { 202 c = string.codePointAt(0); 203 addToSpanNotSet(c); 204 } 205 if (0 != (which & BACK)) { 206 c = string.codePointBefore(length16); 207 addToSpanNotSet(c); 208 } 209 } 210 } else { // Irrelevant string. 211 if (all) { 212 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; 213 } else { 214 // All spanXYZLengths pointers contain the same address. 215 spanLengths[i] = ALL_CP_CONTAINED; 216 } 217 } 218 } 219 220 // Finish. 221 if (all) { 222 spanNotSet.freeze(); 223 } 224 } 225 226 /** 227 * Do the strings need to be checked in span() etc.? 228 * 229 * @return true if strings need to be checked (call span() here), 230 * false if not (use a BMPSet for best performance). 231 */ 232 public boolean needsStringSpanUTF16() { 233 return someRelevant; 234 } 235 236 /** For fast UnicodeSet::contains(c). */ 237 public boolean contains(int c) { 238 return spanSet.contains(c); 239 } 240 241 /** 242 * Adds a starting or ending string character to the spanNotSet 243 * so that a character span ends before any string. 244 */ 245 private void addToSpanNotSet(int c) { 246 if (spanNotSet == null || spanNotSet == spanSet) { 247 if (spanSet.contains(c)) { 248 return; // Nothing to do. 249 } 250 spanNotSet = spanSet.cloneAsThawed(); 251 } 252 spanNotSet.add(c); 253 } 254 255 /* 256 * Note: In span() when spanLength==0 257 * (after a string match, or at the beginning after an empty code point span) 258 * and in spanNot() and spanNotUTF8(), 259 * string matching could use a binary search because all string matches are done 260 * from the same start index. 261 * 262 * For UTF-8, this would require a comparison function that returns UTF-16 order. 263 * 264 * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets 265 * with strings have very few very short strings. For cases with many strings, it might be better to use a different 266 * API and implementation with a DFA (state machine). 267 */ 268 269 /* 270 * Algorithm for span(SpanCondition.CONTAINED) 271 * 272 * Theoretical algorithm: 273 * - Iterate through the string, and at each code point boundary: 274 * + If the code point there is in the set, then remember to continue after it. 275 * + If a set string matches at the current position, then remember to continue after it. 276 * + Either recursively span for each code point or string match, or recursively span 277 * for all but the shortest one and iteratively continue the span with the shortest local match. 278 * + Remember the longest recursive span (the farthest end point). 279 * + If there is no match at the current position, 280 * neither for the code point there nor for any set string, 281 * then stop and return the longest recursive span length. 282 * 283 * Optimized implementation: 284 * 285 * (We assume that most sets will have very few very short strings. 286 * A span using a string-less set is extremely fast.) 287 * 288 * Create and cache a spanSet which contains all of the single code points of the original set 289 * but none of its strings. 290 * 291 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 292 * - Loop: 293 * + Try to match each set string at the end of the spanLength. 294 * ~ Set strings that start with set-contained code points 295 * must be matched with a partial overlap 296 * because the recursive algorithm would have tried to match them at every position. 297 * ~ Set strings that entirely consist of set-contained code points 298 * are irrelevant for span(SpanCondition.CONTAINED) 299 * because the recursive algorithm would continue after them anyway and 300 * find the longest recursive match from their end. 301 * ~ Rather than recursing, note each end point of a set string match. 302 * + If no set string matched after spanSet.span(), 303 * then return with where the spanSet.span() ended. 304 * + If at least one set string matched after spanSet.span(), 305 * then pop the shortest string match end point and continue the loop, 306 * trying to match all set strings from there. 307 * + If at least one more set string matched after a previous string match, then test if the 308 * code point after the previous string match is also contained in the set. 309 * Continue the loop with the shortest end point of 310 * either this code point or a matching set string. 311 * + If no more set string matched after a previous string match, 312 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 313 * Stop if spanLength==0, otherwise continue the loop. 314 * 315 * By noting each end point of a set string match, the function visits each string position at most once and 316 * finishes in linear time. 317 * 318 * The recursive algorithm may visit the same string position many times 319 * if multiple paths lead to it and finishes in exponential time. 320 */ 321 322 /* 323 * Algorithm for span(SIMPLE) 324 * 325 * Theoretical algorithm: 326 * - Iterate through the string, and at each code point boundary: 327 * + If the code point there is in the set, then remember to continue after it. 328 * + If a set string matches at the current position, then remember to continue after it. 329 * + Continue from the farthest match position and ignore all others. 330 * + If there is no match at the current position, then stop and return the current position. 331 * 332 * Optimized implementation: 333 * 334 * (Same assumption and spanSet as above.) 335 * 336 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 337 * - Loop: 338 * + Try to match each set string at the end of the spanLength. 339 * ~ Set strings that start with set-contained code points 340 * must be matched with a partial overlap 341 * because the standard algorithm would have tried to match them earlier. 342 * ~ Set strings that entirely consist of set-contained code points 343 * must be matched with a full overlap because the longest-match algorithm 344 * would hide set string matches that end earlier. 345 * Such set strings need not be matched earlier inside the code point span 346 * because the standard algorithm would then have 347 * continued after the set string match anyway. 348 * ~ Remember the longest set string match (farthest end point) 349 * from the earliest starting point. 350 * + If no set string matched after spanSet.span(), 351 * then return with where the spanSet.span() ended. 352 * + If at least one set string matched, 353 * then continue the loop after the longest match from the earliest position. 354 * + If no more set string matched after a previous string match, 355 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 356 * Stop if spanLength==0, otherwise continue the loop. 357 */ 358 /** 359 * Spans a string. 360 * 361 * @param s The string to be spanned 362 * @param start The start index that the span begins 363 * @param spanCondition The span condition 364 * @return the limit (exclusive end) of the span 365 */ 366 public int span(CharSequence s, int start, SpanCondition spanCondition) { 367 if (spanCondition == SpanCondition.NOT_CONTAINED) { 368 return spanNot(s, start, null); 369 } 370 int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); 371 if (spanLimit == s.length()) { 372 return spanLimit; 373 } 374 return spanWithStrings(s, start, spanLimit, spanCondition); 375 } 376 377 /** 378 * Synchronized method for complicated spans using the offsets. 379 * Avoids synchronization for simple cases. 380 * 381 * @param spanLimit = spanSet.span(s, start, CONTAINED) 382 */ 383 private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, 384 SpanCondition spanCondition) { 385 // Consider strings; they may overlap with the span. 386 int initSize = 0; 387 if (spanCondition == SpanCondition.CONTAINED) { 388 // Use offset list to try all possibilities. 389 initSize = maxLength16; 390 } 391 offsets.setMaxLength(initSize); 392 int length = s.length(); 393 int pos = spanLimit, rest = length - spanLimit; 394 int spanLength = spanLimit - start; 395 int i, stringsLength = strings.size(); 396 for (;;) { 397 if (spanCondition == SpanCondition.CONTAINED) { 398 for (i = 0; i < stringsLength; ++i) { 399 int overlap = spanLengths[i]; 400 if (overlap == ALL_CP_CONTAINED) { 401 continue; // Irrelevant string. 402 } 403 String string = strings.get(i); 404 405 int length16 = string.length(); 406 407 // Try to match this string at pos-overlap..pos. 408 if (overlap >= LONG_SPAN) { 409 overlap = length16; 410 // While contained: No point matching fully inside the code point span. 411 overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code 412 // point. 413 } 414 if (overlap > spanLength) { 415 overlap = spanLength; 416 } 417 int inc = length16 - overlap; // Keep overlap+inc==length16. 418 for (;;) { 419 if (inc > rest) { 420 break; 421 } 422 // Try to match if the increment is not listed already. 423 if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { 424 if (inc == rest) { 425 return length; // Reached the end of the string. 426 } 427 offsets.addOffset(inc); 428 } 429 if (overlap == 0) { 430 break; 431 } 432 --overlap; 433 ++inc; 434 } 435 } 436 } else /* SIMPLE */{ 437 int maxInc = 0, maxOverlap = 0; 438 for (i = 0; i < stringsLength; ++i) { 439 int overlap = spanLengths[i]; 440 // For longest match, we do need to try to match even an all-contained string 441 // to find the match from the earliest start. 442 443 String string = strings.get(i); 444 445 int length16 = string.length(); 446 447 // Try to match this string at pos-overlap..pos. 448 if (overlap >= LONG_SPAN) { 449 overlap = length16; 450 // Longest match: Need to match fully inside the code point span 451 // to find the match from the earliest start. 452 } 453 if (overlap > spanLength) { 454 overlap = spanLength; 455 } 456 int inc = length16 - overlap; // Keep overlap+inc==length16. 457 for (;;) { 458 if (inc > rest || overlap < maxOverlap) { 459 break; 460 } 461 // Try to match if the string is longer or starts earlier. 462 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) 463 && matches16CPB(s, pos - overlap, length, string, length16)) { 464 maxInc = inc; // Longest match from earliest start. 465 maxOverlap = overlap; 466 break; 467 } 468 --overlap; 469 ++inc; 470 } 471 } 472 473 if (maxInc != 0 || maxOverlap != 0) { 474 // Longest-match algorithm, and there was a string match. 475 // Simply continue after it. 476 pos += maxInc; 477 rest -= maxInc; 478 if (rest == 0) { 479 return length; // Reached the end of the string. 480 } 481 spanLength = 0; // Match strings from after a string match. 482 continue; 483 } 484 } 485 // Finished trying to match all strings at pos. 486 487 if (spanLength != 0 || pos == 0) { 488 // The position is after an unlimited code point span (spanLength!=0), 489 // not after a string match. 490 // The only position where spanLength==0 after a span is pos==0. 491 // Otherwise, an unlimited code point span is only tried again when no 492 // strings match, and if such a non-initial span fails we stop. 493 if (offsets.isEmpty()) { 494 return pos; // No strings matched after a span. 495 } 496 // Match strings from after the next string match. 497 } else { 498 // The position is after a string match (or a single code point). 499 if (offsets.isEmpty()) { 500 // No more strings matched after a previous string match. 501 // Try another code point span from after the last string match. 502 spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); 503 spanLength = spanLimit - pos; 504 if (spanLength == rest || // Reached the end of the string, or 505 spanLength == 0 // neither strings nor span progressed. 506 ) { 507 return spanLimit; 508 } 509 pos += spanLength; 510 rest -= spanLength; 511 continue; // spanLength>0: Match strings from after a span. 512 } else { 513 // Try to match only one code point from after a string match if some 514 // string matched beyond it, so that we try all possible positions 515 // and don't overshoot. 516 spanLength = spanOne(spanSet, s, pos, rest); 517 if (spanLength > 0) { 518 if (spanLength == rest) { 519 return length; // Reached the end of the string. 520 } 521 // Match strings after this code point. 522 // There cannot be any increments below it because UnicodeSet strings 523 // contain multiple code points. 524 pos += spanLength; 525 rest -= spanLength; 526 offsets.shift(spanLength); 527 spanLength = 0; 528 continue; // Match strings from after a single code point. 529 } 530 // Match strings from after the next string match. 531 } 532 } 533 int minOffset = offsets.popMinimum(null); 534 pos += minOffset; 535 rest -= minOffset; 536 spanLength = 0; // Match strings from after a string match. 537 } 538 } 539 540 /** 541 * Spans a string and counts the smallest number of set elements on any path across the span. 542 * 543 * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. 544 * 545 * <p>If the set does not have any fully-contained strings, then we could optimize this 546 * like span(), but such sets are likely rare, and this is at least still linear. 547 * 548 * @param s The string to be spanned 549 * @param start The start index that the span begins 550 * @param spanCondition The span condition 551 * @param outCount The count 552 * @return the limit (exclusive end) of the span 553 */ 554 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, 555 OutputInt outCount) { 556 if (spanCondition == SpanCondition.NOT_CONTAINED) { 557 return spanNot(s, start, outCount); 558 } 559 // Consider strings; they may overlap with the span, 560 // and they may result in a smaller count that with just code points. 561 if (spanCondition == SpanCondition.CONTAINED) { 562 return spanContainedAndCount(s, start, outCount); 563 } 564 // SIMPLE (not synchronized, does not use offsets) 565 int stringsLength = strings.size(); 566 int length = s.length(); 567 int pos = start; 568 int rest = length - start; 569 int count = 0; 570 while (rest != 0) { 571 // Try to match the next code point. 572 int cpLength = spanOne(spanSet, s, pos, rest); 573 int maxInc = (cpLength > 0) ? cpLength : 0; 574 // Try to match all of the strings. 575 for (int i = 0; i < stringsLength; ++i) { 576 String string = strings.get(i); 577 int length16 = string.length(); 578 if (maxInc < length16 && length16 <= rest && 579 matches16CPB(s, pos, length, string, length16)) { 580 maxInc = length16; 581 } 582 } 583 // We are done if there is no match beyond pos. 584 if (maxInc == 0) { 585 outCount.value = count; 586 return pos; 587 } 588 // Continue from the longest match. 589 ++count; 590 pos += maxInc; 591 rest -= maxInc; 592 } 593 outCount.value = count; 594 return pos; 595 } 596 597 private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { 598 // Use offset list to try all possibilities. 599 offsets.setMaxLength(maxLength16); 600 int stringsLength = strings.size(); 601 int length = s.length(); 602 int pos = start; 603 int rest = length - start; 604 int count = 0; 605 while (rest != 0) { 606 // Try to match the next code point. 607 int cpLength = spanOne(spanSet, s, pos, rest); 608 if (cpLength > 0) { 609 offsets.addOffsetAndCount(cpLength, count + 1); 610 } 611 // Try to match all of the strings. 612 for (int i = 0; i < stringsLength; ++i) { 613 String string = strings.get(i); 614 int length16 = string.length(); 615 // Note: If the strings were sorted by length, then we could also 616 // avoid trying to match if there is already a match of the same length. 617 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && 618 matches16CPB(s, pos, length, string, length16)) { 619 offsets.addOffsetAndCount(length16, count + 1); 620 } 621 } 622 // We are done if there is no match beyond pos. 623 if (offsets.isEmpty()) { 624 outCount.value = count; 625 return pos; 626 } 627 // Continue from the nearest match. 628 int minOffset = offsets.popMinimum(outCount); 629 count = outCount.value; 630 pos += minOffset; 631 rest -= minOffset; 632 } 633 outCount.value = count; 634 return pos; 635 } 636 637 /** 638 * Span a string backwards. 639 * 640 * @param s The string to be spanned 641 * @param spanCondition The span condition 642 * @return The string index which starts the span (i.e. inclusive). 643 */ 644 public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { 645 if (spanCondition == SpanCondition.NOT_CONTAINED) { 646 return spanNotBack(s, length); 647 } 648 int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); 649 if (pos == 0) { 650 return 0; 651 } 652 int spanLength = length - pos; 653 654 // Consider strings; they may overlap with the span. 655 int initSize = 0; 656 if (spanCondition == SpanCondition.CONTAINED) { 657 // Use offset list to try all possibilities. 658 initSize = maxLength16; 659 } 660 offsets.setMaxLength(initSize); 661 int i, stringsLength = strings.size(); 662 int spanBackLengthsOffset = 0; 663 if (all) { 664 spanBackLengthsOffset = stringsLength; 665 } 666 for (;;) { 667 if (spanCondition == SpanCondition.CONTAINED) { 668 for (i = 0; i < stringsLength; ++i) { 669 int overlap = spanLengths[spanBackLengthsOffset + i]; 670 if (overlap == ALL_CP_CONTAINED) { 671 continue; // Irrelevant string. 672 } 673 String string = strings.get(i); 674 675 int length16 = string.length(); 676 677 // Try to match this string at pos-(length16-overlap)..pos-length16. 678 if (overlap >= LONG_SPAN) { 679 overlap = length16; 680 // While contained: No point matching fully inside the code point span. 681 int len1 = 0; 682 len1 = string.offsetByCodePoints(0, 1); 683 overlap -= len1; // Length of the string minus the first code point. 684 } 685 if (overlap > spanLength) { 686 overlap = spanLength; 687 } 688 int dec = length16 - overlap; // Keep dec+overlap==length16. 689 for (;;) { 690 if (dec > pos) { 691 break; 692 } 693 // Try to match if the decrement is not listed already. 694 if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { 695 if (dec == pos) { 696 return 0; // Reached the start of the string. 697 } 698 offsets.addOffset(dec); 699 } 700 if (overlap == 0) { 701 break; 702 } 703 --overlap; 704 ++dec; 705 } 706 } 707 } else /* SIMPLE */{ 708 int maxDec = 0, maxOverlap = 0; 709 for (i = 0; i < stringsLength; ++i) { 710 int overlap = spanLengths[spanBackLengthsOffset + i]; 711 // For longest match, we do need to try to match even an all-contained string 712 // to find the match from the latest end. 713 714 String string = strings.get(i); 715 716 int length16 = string.length(); 717 718 // Try to match this string at pos-(length16-overlap)..pos-length16. 719 if (overlap >= LONG_SPAN) { 720 overlap = length16; 721 // Longest match: Need to match fully inside the code point span 722 // to find the match from the latest end. 723 } 724 if (overlap > spanLength) { 725 overlap = spanLength; 726 } 727 int dec = length16 - overlap; // Keep dec+overlap==length16. 728 for (;;) { 729 if (dec > pos || overlap < maxOverlap) { 730 break; 731 } 732 // Try to match if the string is longer or ends later. 733 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) 734 && matches16CPB(s, pos - dec, length, string, length16)) { 735 maxDec = dec; // Longest match from latest end. 736 maxOverlap = overlap; 737 break; 738 } 739 --overlap; 740 ++dec; 741 } 742 } 743 744 if (maxDec != 0 || maxOverlap != 0) { 745 // Longest-match algorithm, and there was a string match. 746 // Simply continue before it. 747 pos -= maxDec; 748 if (pos == 0) { 749 return 0; // Reached the start of the string. 750 } 751 spanLength = 0; // Match strings from before a string match. 752 continue; 753 } 754 } 755 // Finished trying to match all strings at pos. 756 757 if (spanLength != 0 || pos == length) { 758 // The position is before an unlimited code point span (spanLength!=0), 759 // not before a string match. 760 // The only position where spanLength==0 before a span is pos==length. 761 // Otherwise, an unlimited code point span is only tried again when no 762 // strings match, and if such a non-initial span fails we stop. 763 if (offsets.isEmpty()) { 764 return pos; // No strings matched before a span. 765 } 766 // Match strings from before the next string match. 767 } else { 768 // The position is before a string match (or a single code point). 769 if (offsets.isEmpty()) { 770 // No more strings matched before a previous string match. 771 // Try another code point span from before the last string match. 772 int oldPos = pos; 773 pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); 774 spanLength = oldPos - pos; 775 if (pos == 0 || // Reached the start of the string, or 776 spanLength == 0 // neither strings nor span progressed. 777 ) { 778 return pos; 779 } 780 continue; // spanLength>0: Match strings from before a span. 781 } else { 782 // Try to match only one code point from before a string match if some 783 // string matched beyond it, so that we try all possible positions 784 // and don't overshoot. 785 spanLength = spanOneBack(spanSet, s, pos); 786 if (spanLength > 0) { 787 if (spanLength == pos) { 788 return 0; // Reached the start of the string. 789 } 790 // Match strings before this code point. 791 // There cannot be any decrements below it because UnicodeSet strings 792 // contain multiple code points. 793 pos -= spanLength; 794 offsets.shift(spanLength); 795 spanLength = 0; 796 continue; // Match strings from before a single code point. 797 } 798 // Match strings from before the next string match. 799 } 800 } 801 pos -= offsets.popMinimum(null); 802 spanLength = 0; // Match strings from before a string match. 803 } 804 } 805 806 /** 807 * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) 808 * 809 * Theoretical algorithm: 810 * - Iterate through the string, and at each code point boundary: 811 * + If the code point there is in the set, then return with the current position. 812 * + If a set string matches at the current position, then return with the current position. 813 * 814 * Optimized implementation: 815 * 816 * (Same assumption as for span() above.) 817 * 818 * Create and cache a spanNotSet which contains 819 * all of the single code points of the original set but none of its strings. 820 * For each set string add its initial code point to the spanNotSet. 821 * (Also add its final code point for spanNotBack().) 822 * 823 * - Loop: 824 * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). 825 * + If the current code point is in the original set, then return the current position. 826 * + If any set string matches at the current position, then return the current position. 827 * + If there is no match at the current position, neither for the code point 828 * there nor for any set string, then skip this code point and continue the loop. 829 * This happens for set-string-initial code points that were added to spanNotSet 830 * when there is not actually a match for such a set string. 831 * 832 * @param s The string to be spanned 833 * @param start The start index that the span begins 834 * @param outCount If not null: Receives the number of code points across the span. 835 * @return the limit (exclusive end) of the span 836 */ 837 private int spanNot(CharSequence s, int start, OutputInt outCount) { 838 int length = s.length(); 839 int pos = start, rest = length - start; 840 int stringsLength = strings.size(); 841 int count = 0; 842 do { 843 // Span until we find a code point from the set, 844 // or a code point that starts or ends some string. 845 int spanLimit; 846 if (outCount == null) { 847 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); 848 } else { 849 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); 850 outCount.value = count = count + outCount.value; 851 } 852 if (spanLimit == length) { 853 return length; // Reached the end of the string. 854 } 855 pos = spanLimit; 856 rest = length - spanLimit; 857 858 // Check whether the current code point is in the original set, 859 // without the string starts and ends. 860 int cpLength = spanOne(spanSet, s, pos, rest); 861 if (cpLength > 0) { 862 return pos; // There is a set element at pos. 863 } 864 865 // Try to match the strings at pos. 866 for (int i = 0; i < stringsLength; ++i) { 867 if (spanLengths[i] == ALL_CP_CONTAINED) { 868 continue; // Irrelevant string. 869 } 870 String string = strings.get(i); 871 872 int length16 = string.length(); 873 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { 874 return pos; // There is a set element at pos. 875 } 876 } 877 878 // The span(while not contained) ended on a string start/end which is 879 // not in the original set. Skip this code point and continue. 880 // cpLength<0 881 pos -= cpLength; 882 rest += cpLength; 883 ++count; 884 } while (rest != 0); 885 if (outCount != null) { 886 outCount.value = count; 887 } 888 return length; // Reached the end of the string. 889 } 890 891 private int spanNotBack(CharSequence s, int length) { 892 int pos = length; 893 int i, stringsLength = strings.size(); 894 do { 895 // Span until we find a code point from the set, 896 // or a code point that starts or ends some string. 897 pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); 898 if (pos == 0) { 899 return 0; // Reached the start of the string. 900 } 901 902 // Check whether the current code point is in the original set, 903 // without the string starts and ends. 904 int cpLength = spanOneBack(spanSet, s, pos); 905 if (cpLength > 0) { 906 return pos; // There is a set element at pos. 907 } 908 909 // Try to match the strings at pos. 910 for (i = 0; i < stringsLength; ++i) { 911 // Use spanLengths rather than a spanLengths pointer because 912 // it is easier and we only need to know whether the string is irrelevant 913 // which is the same in either array. 914 if (spanLengths[i] == ALL_CP_CONTAINED) { 915 continue; // Irrelevant string. 916 } 917 String string = strings.get(i); 918 919 int length16 = string.length(); 920 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { 921 return pos; // There is a set element at pos. 922 } 923 } 924 925 // The span(while not contained) ended on a string start/end which is 926 // not in the original set. Skip this code point and continue. 927 // cpLength<0 928 pos += cpLength; 929 } while (pos != 0); 930 return 0; // Reached the start of the string. 931 } 932 933 static short makeSpanLengthByte(int spanLength) { 934 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 935 return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; 936 } 937 938 // Compare strings without any argument checks. Requires length>0. 939 private static boolean matches16(CharSequence s, int start, final String t, int length) { 940 int end = start + length; 941 while (length-- > 0) { 942 if (s.charAt(--end) != t.charAt(length)) { 943 return false; 944 } 945 } 946 return true; 947 } 948 949 /** 950 * Compare 16-bit Unicode strings (which may be malformed UTF-16) 951 * at code point boundaries. 952 * That is, each edge of a match must not be in the middle of a surrogate pair. 953 * @param s The string to match in. 954 * @param start The start index of s. 955 * @param limit The limit of the subsequence of s being spanned. 956 * @param t The substring to be matched in s. 957 * @param tlength The length of t. 958 */ 959 static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { 960 return matches16(s, start, t, tlength) 961 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && 962 Character.isLowSurrogate(s.charAt(start))) 963 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && 964 Character.isLowSurrogate(s.charAt(start + tlength))); 965 } 966 967 /** 968 * Does the set contain the next code point? 969 * If so, return its length; otherwise return its negative length. 970 */ 971 static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { 972 char c = s.charAt(start); 973 if (c >= 0xd800 && c <= 0xdbff && length >= 2) { 974 char c2 = s.charAt(start + 1); 975 if (UTF16.isTrailSurrogate(c2)) { 976 int supplementary = UCharacterProperty.getRawSupplementary(c, c2); 977 return set.contains(supplementary) ? 2 : -2; 978 } 979 } 980 return set.contains(c) ? 1 : -1; 981 } 982 983 static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { 984 char c = s.charAt(length - 1); 985 if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { 986 char c2 = s.charAt(length - 2); 987 if (UTF16.isLeadSurrogate(c2)) { 988 int supplementary = UCharacterProperty.getRawSupplementary(c2, c); 989 return set.contains(supplementary) ? 2 : -2; 990 } 991 } 992 return set.contains(c) ? 1 : -1; 993 } 994 995 /** 996 * Helper class for UnicodeSetStringSpan. 997 * 998 * <p>List of offsets from the current position from where to try matching 999 * a code point or a string. 1000 * Stores offsets rather than indexes to simplify the code and use the same list 1001 * for both increments (in span()) and decrements (in spanBack()). 1002 * 1003 * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time 1004 * are relatively dense, that is, 1005 * there are normally no gaps of hundreds or thousands of offset values. 1006 * 1007 * <p>This class optionally also tracks the minimum non-negative count for each position, 1008 * intended to count the smallest number of elements of any path leading to that position. 1009 * 1010 * <p>The implementation uses a circular buffer of count integers, 1011 * each indicating whether the corresponding offset is in the list, 1012 * and its path element count. 1013 * This avoids inserting into a sorted list of offsets (or absolute indexes) 1014 * and physically moving part of the list. 1015 * 1016 * <p>Note: In principle, the caller should setMaxLength() to 1017 * the maximum of the max string length and U16_LENGTH/U8_LENGTH 1018 * to account for "long" single code points. 1019 * 1020 * <p>Note: An earlier version did not track counts and stored only byte flags. 1021 * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, 1022 * the list could be stored as bit flags in a single integer. 1023 * Rather than handling a circular buffer with a start list index, 1024 * the integer would simply be shifted when lower offsets are removed. 1025 * UnicodeSet does not have a limit on the lengths of strings. 1026 */ 1027 private static final class OffsetList { 1028 private int[] list; 1029 private int length; 1030 private int start; 1031 1032 public OffsetList() { 1033 list = new int[16]; // default size 1034 } 1035 1036 public void setMaxLength(int maxLength) { 1037 if (maxLength > list.length) { 1038 list = new int[maxLength]; 1039 } 1040 clear(); 1041 } 1042 1043 public void clear() { 1044 for (int i = list.length; i-- > 0;) { 1045 list[i] = 0; 1046 } 1047 start = length = 0; 1048 } 1049 1050 public boolean isEmpty() { 1051 return (length == 0); 1052 } 1053 1054 /** 1055 * Reduces all stored offsets by delta, used when the current position moves by delta. 1056 * There must not be any offsets lower than delta. 1057 * If there is an offset equal to delta, it is removed. 1058 * 1059 * @param delta [1..maxLength] 1060 */ 1061 public void shift(int delta) { 1062 int i = start + delta; 1063 if (i >= list.length) { 1064 i -= list.length; 1065 } 1066 if (list[i] != 0) { 1067 list[i] = 0; 1068 --length; 1069 } 1070 start = i; 1071 } 1072 1073 /** 1074 * Adds an offset. The list must not contain it yet. 1075 * @param offset [1..maxLength] 1076 */ 1077 public void addOffset(int offset) { 1078 int i = start + offset; 1079 if (i >= list.length) { 1080 i -= list.length; 1081 } 1082 assert list[i] == 0; 1083 list[i] = 1; 1084 ++length; 1085 } 1086 1087 /** 1088 * Adds an offset and updates its count. 1089 * The list may already contain the offset. 1090 * @param offset [1..maxLength] 1091 */ 1092 public void addOffsetAndCount(int offset, int count) { 1093 assert count > 0; 1094 int i = start + offset; 1095 if (i >= list.length) { 1096 i -= list.length; 1097 } 1098 if (list[i] == 0) { 1099 list[i] = count; 1100 ++length; 1101 } else if (count < list[i]) { 1102 list[i] = count; 1103 } 1104 } 1105 1106 /** 1107 * @param offset [1..maxLength] 1108 */ 1109 public boolean containsOffset(int offset) { 1110 int i = start + offset; 1111 if (i >= list.length) { 1112 i -= list.length; 1113 } 1114 return list[i] != 0; 1115 } 1116 1117 /** 1118 * @param offset [1..maxLength] 1119 */ 1120 public boolean hasCountAtOffset(int offset, int count) { 1121 int i = start + offset; 1122 if (i >= list.length) { 1123 i -= list.length; 1124 } 1125 int oldCount = list[i]; 1126 return oldCount != 0 && oldCount <= count; 1127 } 1128 1129 /** 1130 * Finds the lowest stored offset from a non-empty list, removes it, 1131 * and reduces all other offsets by this minimum. 1132 * @return min=[1..maxLength] 1133 */ 1134 public int popMinimum(OutputInt outCount) { 1135 // Look for the next offset in list[start+1..list.length-1]. 1136 int i = start, result; 1137 while (++i < list.length) { 1138 int count = list[i]; 1139 if (count != 0) { 1140 list[i] = 0; 1141 --length; 1142 result = i - start; 1143 start = i; 1144 if (outCount != null) { outCount.value = count; } 1145 return result; 1146 } 1147 } 1148 // i==list.length 1149 1150 // Wrap around and look for the next offset in list[0..start]. 1151 // Since the list is not empty, there will be one. 1152 result = list.length - start; 1153 i = 0; 1154 int count; 1155 while ((count = list[i]) == 0) { 1156 ++i; 1157 } 1158 list[i] = 0; 1159 --length; 1160 start = i; 1161 if (outCount != null) { outCount.value = count; } 1162 return result + i; 1163 } 1164 } 1165} 1166