1/*
2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/*
27 ******************************************************************************
28 *
29 *   Copyright (C) 2009-2014, International Business Machines
30 *   Corporation and others.  All Rights Reserved.
31 *
32 ******************************************************************************
33 */
34
35package sun.text.normalizer;
36
37import java.util.ArrayList;
38
39import sun.text.normalizer.UnicodeSet.SpanCondition;
40
41/*
42 * Implement span() etc. for a set with strings.
43 * Avoid recursion because of its exponential complexity.
44 * Instead, try multiple paths at once and track them with an IndexList.
45 */
46class UnicodeSetStringSpan {
47
48    /*
49     * Which span() variant will be used? The object is either built for one variant and used once,
50     * or built for all and may be used many times.
51     */
52    public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
53    public static final int FWD           = 0x20;
54    public static final int BACK          = 0x10;
55    // public static final int UTF16      = 8;
56    public static final int CONTAINED     = 2;
57    public static final int NOT_CONTAINED = 1;
58
59    public static final int ALL = 0x7f;
60
61    public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
62    public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
63    public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
64    public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
65
66    /**
67     * Special spanLength short values. (since Java has not unsigned byte type)
68     * All code points in the string are contained in the parent set.
69     */
70    static final short ALL_CP_CONTAINED = 0xff;
71
72    /** The spanLength is >=0xfe. */
73    static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
74
75    /** Set for span(). Same as parent but without strings. */
76    private UnicodeSet spanSet;
77
78    /**
79     * Set for span(not contained).
80     * Same as spanSet, plus characters that start or end strings.
81     */
82    private UnicodeSet spanNotSet;
83
84    /** The strings of the parent set. */
85    private ArrayList<String> strings;
86
87    /** The lengths of span(), spanBack() etc. for each string. */
88    private short[] spanLengths;
89
90    /** Maximum lengths of relevant strings. */
91    private int maxLength16;
92
93    /** Are there strings that are not fully contained in the code point set? */
94    private boolean someRelevant;
95
96    /** Set up for all variants of span()? */
97    private boolean all;
98
99    /** Span helper */
100    private OffsetList offsets;
101
102    /**
103     * Constructs for all variants of span(), or only for any one variant.
104     * Initializes as little as possible, for single use.
105     */
106    public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
107        spanSet = new UnicodeSet(0, 0x10ffff);
108        // TODO: With Java 6, just take the parent set's strings as is,
109        // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
110        // Then iterate via the first() and higher() methods.
111        // (We do not want to create multiple Iterator objects in each span().)
112        // See ICU ticket #7454.
113        strings = setStrings;
114        all = (which == ALL);
115        spanSet.retainAll(set);
116        if (0 != (which & NOT_CONTAINED)) {
117            // Default to the same sets.
118            // addToSpanNotSet() will create a separate set if necessary.
119            spanNotSet = spanSet;
120        }
121        offsets = new OffsetList();
122
123        // Determine if the strings even need to be taken into account at all for span() etc.
124        // If any string is relevant, then all strings need to be used for
125        // span(longest match) but only the relevant ones for span(while contained).
126        // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
127        // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
128        // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
129        // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
130        int stringsLength = strings.size();
131
132        int i, spanLength;
133        someRelevant = false;
134        for (i = 0; i < stringsLength; ++i) {
135            String string = strings.get(i);
136            int length16 = string.length();
137            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
138            if (spanLength < length16) { // Relevant string.
139                someRelevant = true;
140            }
141            if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
142                maxLength16 = length16;
143            }
144        }
145        if (!someRelevant && (which & WITH_COUNT) == 0) {
146            return;
147        }
148
149        // Freeze after checking for the need to use strings at all because freezing
150        // a set takes some time and memory which are wasted if there are no relevant strings.
151        if (all) {
152            spanSet.freeze();
153        }
154
155        int spanBackLengthsOffset;
156
157        // Allocate a block of meta data.
158        int allocSize;
159        if (all) {
160            // 2 sets of span lengths
161            allocSize = stringsLength * (2);
162        } else {
163            allocSize = stringsLength; // One set of span lengths.
164        }
165        spanLengths = new short[allocSize];
166
167        if (all) {
168            // Store span lengths for all span() variants.
169            spanBackLengthsOffset = stringsLength;
170        } else {
171            // Store span lengths for only one span() variant.
172            spanBackLengthsOffset = 0;
173        }
174
175        // Set the meta data and spanNotSet and write the UTF-8 strings.
176
177        for (i = 0; i < stringsLength; ++i) {
178            String string = strings.get(i);
179            int length16 = string.length();
180            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
181            if (spanLength < length16) { // Relevant string.
182                if (true /* 0 != (which & UTF16) */) {
183                    if (0 != (which & CONTAINED)) {
184                        if (0 != (which & FWD)) {
185                            spanLengths[i] = makeSpanLengthByte(spanLength);
186                        }
187                        if (0 != (which & BACK)) {
188                            spanLength = length16
189                                    - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
190                            spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
191                        }
192                    } else /* not CONTAINED, not all, but NOT_CONTAINED */{
193                        spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
194                                                                                     // flag.
195                    }
196                }
197                if (0 != (which & NOT_CONTAINED)) {
198                    // Add string start and end code points to the spanNotSet so that
199                    // a span(while not contained) stops before any string.
200                    int c;
201                    if (0 != (which & FWD)) {
202                        c = string.codePointAt(0);
203                        addToSpanNotSet(c);
204                    }
205                    if (0 != (which & BACK)) {
206                        c = string.codePointBefore(length16);
207                        addToSpanNotSet(c);
208                    }
209                }
210            } else { // Irrelevant string.
211                if (all) {
212                    spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
213                } else {
214                    // All spanXYZLengths pointers contain the same address.
215                    spanLengths[i] = ALL_CP_CONTAINED;
216                }
217            }
218        }
219
220        // Finish.
221        if (all) {
222            spanNotSet.freeze();
223        }
224    }
225
226    /**
227     * Do the strings need to be checked in span() etc.?
228     *
229     * @return true if strings need to be checked (call span() here),
230     *         false if not (use a BMPSet for best performance).
231     */
232    public boolean needsStringSpanUTF16() {
233        return someRelevant;
234    }
235
236    /** For fast UnicodeSet::contains(c). */
237    public boolean contains(int c) {
238        return spanSet.contains(c);
239    }
240
241    /**
242     * Adds a starting or ending string character to the spanNotSet
243     * so that a character span ends before any string.
244     */
245    private void addToSpanNotSet(int c) {
246        if (spanNotSet == null || spanNotSet == spanSet) {
247            if (spanSet.contains(c)) {
248                return; // Nothing to do.
249            }
250            spanNotSet = spanSet.cloneAsThawed();
251        }
252        spanNotSet.add(c);
253    }
254
255    /*
256     * Note: In span() when spanLength==0
257     * (after a string match, or at the beginning after an empty code point span)
258     * and in spanNot() and spanNotUTF8(),
259     * string matching could use a binary search because all string matches are done
260     * from the same start index.
261     *
262     * For UTF-8, this would require a comparison function that returns UTF-16 order.
263     *
264     * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
265     * with strings have very few very short strings. For cases with many strings, it might be better to use a different
266     * API and implementation with a DFA (state machine).
267     */
268
269    /*
270     * Algorithm for span(SpanCondition.CONTAINED)
271     *
272     * Theoretical algorithm:
273     * - Iterate through the string, and at each code point boundary:
274     *   + If the code point there is in the set, then remember to continue after it.
275     *   + If a set string matches at the current position, then remember to continue after it.
276     *   + Either recursively span for each code point or string match, or recursively span
277     *     for all but the shortest one and iteratively continue the span with the shortest local match.
278     *   + Remember the longest recursive span (the farthest end point).
279     *   + If there is no match at the current position,
280     *     neither for the code point there nor for any set string,
281     *     then stop and return the longest recursive span length.
282     *
283     * Optimized implementation:
284     *
285     * (We assume that most sets will have very few very short strings.
286     * A span using a string-less set is extremely fast.)
287     *
288     * Create and cache a spanSet which contains all of the single code points of the original set
289     * but none of its strings.
290     *
291     * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
292     * - Loop:
293     *   + Try to match each set string at the end of the spanLength.
294     *     ~ Set strings that start with set-contained code points
295     *       must be matched with a partial overlap
296     *       because the recursive algorithm would have tried to match them at every position.
297     *     ~ Set strings that entirely consist of set-contained code points
298     *       are irrelevant for span(SpanCondition.CONTAINED)
299     *       because the recursive algorithm would continue after them anyway and
300     *       find the longest recursive match from their end.
301     *     ~ Rather than recursing, note each end point of a set string match.
302     *   + If no set string matched after spanSet.span(),
303     *     then return with where the spanSet.span() ended.
304     *   + If at least one set string matched after spanSet.span(),
305     *     then pop the shortest string match end point and continue the loop,
306     *     trying to match all set strings from there.
307     *   + If at least one more set string matched after a previous string match, then test if the
308     *     code point after the previous string match is also contained in the set.
309     *     Continue the loop with the shortest end point of
310     *     either this code point or a matching set string.
311     *   + If no more set string matched after a previous string match,
312     *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
313     *     Stop if spanLength==0, otherwise continue the loop.
314     *
315     * By noting each end point of a set string match, the function visits each string position at most once and
316     * finishes in linear time.
317     *
318     * The recursive algorithm may visit the same string position many times
319     * if multiple paths lead to it and finishes in exponential time.
320     */
321
322    /*
323     * Algorithm for span(SIMPLE)
324     *
325     * Theoretical algorithm:
326     * - Iterate through the string, and at each code point boundary:
327     *   + If the code point there is in the set, then remember to continue after it.
328     *   + If a set string matches at the current position, then remember to continue after it.
329     *   + Continue from the farthest match position and ignore all others.
330     *   + If there is no match at the current position, then stop and return the current position.
331     *
332     * Optimized implementation:
333     *
334     * (Same assumption and spanSet as above.)
335     *
336     * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
337     * - Loop:
338     *   + Try to match each set string at the end of the spanLength.
339     *     ~ Set strings that start with set-contained code points
340     *       must be matched with a partial overlap
341     *       because the standard algorithm would have tried to match them earlier.
342     *     ~ Set strings that entirely consist of set-contained code points
343     *       must be matched with a full overlap because the longest-match algorithm
344     *       would hide set string matches that end earlier.
345     *       Such set strings need not be matched earlier inside the code point span
346     *       because the standard algorithm would then have
347     *       continued after the set string match anyway.
348     *     ~ Remember the longest set string match (farthest end point)
349     *       from the earliest starting point.
350     *   + If no set string matched after spanSet.span(),
351     *     then return with where the spanSet.span() ended.
352     *   + If at least one set string matched,
353     *     then continue the loop after the longest match from the earliest position.
354     *   + If no more set string matched after a previous string match,
355     *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
356     *     Stop if spanLength==0, otherwise continue the loop.
357     */
358    /**
359     * Spans a string.
360     *
361     * @param s The string to be spanned
362     * @param start The start index that the span begins
363     * @param spanCondition The span condition
364     * @return the limit (exclusive end) of the span
365     */
366    public int span(CharSequence s, int start, SpanCondition spanCondition) {
367        if (spanCondition == SpanCondition.NOT_CONTAINED) {
368            return spanNot(s, start, null);
369        }
370        int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
371        if (spanLimit == s.length()) {
372            return spanLimit;
373        }
374        return spanWithStrings(s, start, spanLimit, spanCondition);
375    }
376
377    /**
378     * Synchronized method for complicated spans using the offsets.
379     * Avoids synchronization for simple cases.
380     *
381     * @param spanLimit = spanSet.span(s, start, CONTAINED)
382     */
383    private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
384            SpanCondition spanCondition) {
385        // Consider strings; they may overlap with the span.
386        int initSize = 0;
387        if (spanCondition == SpanCondition.CONTAINED) {
388            // Use offset list to try all possibilities.
389            initSize = maxLength16;
390        }
391        offsets.setMaxLength(initSize);
392        int length = s.length();
393        int pos = spanLimit, rest = length - spanLimit;
394        int spanLength = spanLimit - start;
395        int i, stringsLength = strings.size();
396        for (;;) {
397            if (spanCondition == SpanCondition.CONTAINED) {
398                for (i = 0; i < stringsLength; ++i) {
399                    int overlap = spanLengths[i];
400                    if (overlap == ALL_CP_CONTAINED) {
401                        continue; // Irrelevant string.
402                    }
403                    String string = strings.get(i);
404
405                    int length16 = string.length();
406
407                    // Try to match this string at pos-overlap..pos.
408                    if (overlap >= LONG_SPAN) {
409                        overlap = length16;
410                        // While contained: No point matching fully inside the code point span.
411                        overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
412                                                                          // point.
413                    }
414                    if (overlap > spanLength) {
415                        overlap = spanLength;
416                    }
417                    int inc = length16 - overlap; // Keep overlap+inc==length16.
418                    for (;;) {
419                        if (inc > rest) {
420                            break;
421                        }
422                        // Try to match if the increment is not listed already.
423                        if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
424                            if (inc == rest) {
425                                return length; // Reached the end of the string.
426                            }
427                            offsets.addOffset(inc);
428                        }
429                        if (overlap == 0) {
430                            break;
431                        }
432                        --overlap;
433                        ++inc;
434                    }
435                }
436            } else /* SIMPLE */{
437                int maxInc = 0, maxOverlap = 0;
438                for (i = 0; i < stringsLength; ++i) {
439                    int overlap = spanLengths[i];
440                    // For longest match, we do need to try to match even an all-contained string
441                    // to find the match from the earliest start.
442
443                    String string = strings.get(i);
444
445                    int length16 = string.length();
446
447                    // Try to match this string at pos-overlap..pos.
448                    if (overlap >= LONG_SPAN) {
449                        overlap = length16;
450                        // Longest match: Need to match fully inside the code point span
451                        // to find the match from the earliest start.
452                    }
453                    if (overlap > spanLength) {
454                        overlap = spanLength;
455                    }
456                    int inc = length16 - overlap; // Keep overlap+inc==length16.
457                    for (;;) {
458                        if (inc > rest || overlap < maxOverlap) {
459                            break;
460                        }
461                        // Try to match if the string is longer or starts earlier.
462                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
463                                && matches16CPB(s, pos - overlap, length, string, length16)) {
464                            maxInc = inc; // Longest match from earliest start.
465                            maxOverlap = overlap;
466                            break;
467                        }
468                        --overlap;
469                        ++inc;
470                    }
471                }
472
473                if (maxInc != 0 || maxOverlap != 0) {
474                    // Longest-match algorithm, and there was a string match.
475                    // Simply continue after it.
476                    pos += maxInc;
477                    rest -= maxInc;
478                    if (rest == 0) {
479                        return length; // Reached the end of the string.
480                    }
481                    spanLength = 0; // Match strings from after a string match.
482                    continue;
483                }
484            }
485            // Finished trying to match all strings at pos.
486
487            if (spanLength != 0 || pos == 0) {
488                // The position is after an unlimited code point span (spanLength!=0),
489                // not after a string match.
490                // The only position where spanLength==0 after a span is pos==0.
491                // Otherwise, an unlimited code point span is only tried again when no
492                // strings match, and if such a non-initial span fails we stop.
493                if (offsets.isEmpty()) {
494                    return pos; // No strings matched after a span.
495                }
496                // Match strings from after the next string match.
497            } else {
498                // The position is after a string match (or a single code point).
499                if (offsets.isEmpty()) {
500                    // No more strings matched after a previous string match.
501                    // Try another code point span from after the last string match.
502                    spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
503                    spanLength = spanLimit - pos;
504                    if (spanLength == rest || // Reached the end of the string, or
505                            spanLength == 0 // neither strings nor span progressed.
506                    ) {
507                        return spanLimit;
508                    }
509                    pos += spanLength;
510                    rest -= spanLength;
511                    continue; // spanLength>0: Match strings from after a span.
512                } else {
513                    // Try to match only one code point from after a string match if some
514                    // string matched beyond it, so that we try all possible positions
515                    // and don't overshoot.
516                    spanLength = spanOne(spanSet, s, pos, rest);
517                    if (spanLength > 0) {
518                        if (spanLength == rest) {
519                            return length; // Reached the end of the string.
520                        }
521                        // Match strings after this code point.
522                        // There cannot be any increments below it because UnicodeSet strings
523                        // contain multiple code points.
524                        pos += spanLength;
525                        rest -= spanLength;
526                        offsets.shift(spanLength);
527                        spanLength = 0;
528                        continue; // Match strings from after a single code point.
529                    }
530                    // Match strings from after the next string match.
531                }
532            }
533            int minOffset = offsets.popMinimum(null);
534            pos += minOffset;
535            rest -= minOffset;
536            spanLength = 0; // Match strings from after a string match.
537        }
538    }
539
540    /**
541     * Spans a string and counts the smallest number of set elements on any path across the span.
542     *
543     * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
544     *
545     * <p>If the set does not have any fully-contained strings, then we could optimize this
546     * like span(), but such sets are likely rare, and this is at least still linear.
547     *
548     * @param s The string to be spanned
549     * @param start The start index that the span begins
550     * @param spanCondition The span condition
551     * @param outCount The count
552     * @return the limit (exclusive end) of the span
553     */
554    public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
555            OutputInt outCount) {
556        if (spanCondition == SpanCondition.NOT_CONTAINED) {
557            return spanNot(s, start, outCount);
558        }
559        // Consider strings; they may overlap with the span,
560        // and they may result in a smaller count that with just code points.
561        if (spanCondition == SpanCondition.CONTAINED) {
562            return spanContainedAndCount(s, start, outCount);
563        }
564        // SIMPLE (not synchronized, does not use offsets)
565        int stringsLength = strings.size();
566        int length = s.length();
567        int pos = start;
568        int rest = length - start;
569        int count = 0;
570        while (rest != 0) {
571            // Try to match the next code point.
572            int cpLength = spanOne(spanSet, s, pos, rest);
573            int maxInc = (cpLength > 0) ? cpLength : 0;
574            // Try to match all of the strings.
575            for (int i = 0; i < stringsLength; ++i) {
576                String string = strings.get(i);
577                int length16 = string.length();
578                if (maxInc < length16 && length16 <= rest &&
579                        matches16CPB(s, pos, length, string, length16)) {
580                    maxInc = length16;
581                }
582            }
583            // We are done if there is no match beyond pos.
584            if (maxInc == 0) {
585                outCount.value = count;
586                return pos;
587            }
588            // Continue from the longest match.
589            ++count;
590            pos += maxInc;
591            rest -= maxInc;
592        }
593        outCount.value = count;
594        return pos;
595    }
596
597    private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
598        // Use offset list to try all possibilities.
599        offsets.setMaxLength(maxLength16);
600        int stringsLength = strings.size();
601        int length = s.length();
602        int pos = start;
603        int rest = length - start;
604        int count = 0;
605        while (rest != 0) {
606            // Try to match the next code point.
607            int cpLength = spanOne(spanSet, s, pos, rest);
608            if (cpLength > 0) {
609                offsets.addOffsetAndCount(cpLength, count + 1);
610            }
611            // Try to match all of the strings.
612            for (int i = 0; i < stringsLength; ++i) {
613                String string = strings.get(i);
614                int length16 = string.length();
615                // Note: If the strings were sorted by length, then we could also
616                // avoid trying to match if there is already a match of the same length.
617                if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
618                        matches16CPB(s, pos, length, string, length16)) {
619                    offsets.addOffsetAndCount(length16, count + 1);
620                }
621            }
622            // We are done if there is no match beyond pos.
623            if (offsets.isEmpty()) {
624                outCount.value = count;
625                return pos;
626            }
627            // Continue from the nearest match.
628            int minOffset = offsets.popMinimum(outCount);
629            count = outCount.value;
630            pos += minOffset;
631            rest -= minOffset;
632        }
633        outCount.value = count;
634        return pos;
635    }
636
637    /**
638     * Span a string backwards.
639     *
640     * @param s The string to be spanned
641     * @param spanCondition The span condition
642     * @return The string index which starts the span (i.e. inclusive).
643     */
644    public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
645        if (spanCondition == SpanCondition.NOT_CONTAINED) {
646            return spanNotBack(s, length);
647        }
648        int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
649        if (pos == 0) {
650            return 0;
651        }
652        int spanLength = length - pos;
653
654        // Consider strings; they may overlap with the span.
655        int initSize = 0;
656        if (spanCondition == SpanCondition.CONTAINED) {
657            // Use offset list to try all possibilities.
658            initSize = maxLength16;
659        }
660        offsets.setMaxLength(initSize);
661        int i, stringsLength = strings.size();
662        int spanBackLengthsOffset = 0;
663        if (all) {
664            spanBackLengthsOffset = stringsLength;
665        }
666        for (;;) {
667            if (spanCondition == SpanCondition.CONTAINED) {
668                for (i = 0; i < stringsLength; ++i) {
669                    int overlap = spanLengths[spanBackLengthsOffset + i];
670                    if (overlap == ALL_CP_CONTAINED) {
671                        continue; // Irrelevant string.
672                    }
673                    String string = strings.get(i);
674
675                    int length16 = string.length();
676
677                    // Try to match this string at pos-(length16-overlap)..pos-length16.
678                    if (overlap >= LONG_SPAN) {
679                        overlap = length16;
680                        // While contained: No point matching fully inside the code point span.
681                        int len1 = 0;
682                        len1 = string.offsetByCodePoints(0, 1);
683                        overlap -= len1; // Length of the string minus the first code point.
684                    }
685                    if (overlap > spanLength) {
686                        overlap = spanLength;
687                    }
688                    int dec = length16 - overlap; // Keep dec+overlap==length16.
689                    for (;;) {
690                        if (dec > pos) {
691                            break;
692                        }
693                        // Try to match if the decrement is not listed already.
694                        if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
695                            if (dec == pos) {
696                                return 0; // Reached the start of the string.
697                            }
698                            offsets.addOffset(dec);
699                        }
700                        if (overlap == 0) {
701                            break;
702                        }
703                        --overlap;
704                        ++dec;
705                    }
706                }
707            } else /* SIMPLE */{
708                int maxDec = 0, maxOverlap = 0;
709                for (i = 0; i < stringsLength; ++i) {
710                    int overlap = spanLengths[spanBackLengthsOffset + i];
711                    // For longest match, we do need to try to match even an all-contained string
712                    // to find the match from the latest end.
713
714                    String string = strings.get(i);
715
716                    int length16 = string.length();
717
718                    // Try to match this string at pos-(length16-overlap)..pos-length16.
719                    if (overlap >= LONG_SPAN) {
720                        overlap = length16;
721                        // Longest match: Need to match fully inside the code point span
722                        // to find the match from the latest end.
723                    }
724                    if (overlap > spanLength) {
725                      overlap = spanLength;
726                    }
727                    int dec = length16 - overlap; // Keep dec+overlap==length16.
728                    for (;;) {
729                        if (dec > pos || overlap < maxOverlap) {
730                            break;
731                        }
732                        // Try to match if the string is longer or ends later.
733                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
734                                && matches16CPB(s, pos - dec, length, string, length16)) {
735                            maxDec = dec; // Longest match from latest end.
736                            maxOverlap = overlap;
737                            break;
738                        }
739                        --overlap;
740                        ++dec;
741                    }
742                }
743
744                if (maxDec != 0 || maxOverlap != 0) {
745                    // Longest-match algorithm, and there was a string match.
746                    // Simply continue before it.
747                    pos -= maxDec;
748                    if (pos == 0) {
749                        return 0; // Reached the start of the string.
750                    }
751                    spanLength = 0; // Match strings from before a string match.
752                    continue;
753                }
754            }
755            // Finished trying to match all strings at pos.
756
757            if (spanLength != 0 || pos == length) {
758                // The position is before an unlimited code point span (spanLength!=0),
759                // not before a string match.
760                // The only position where spanLength==0 before a span is pos==length.
761                // Otherwise, an unlimited code point span is only tried again when no
762                // strings match, and if such a non-initial span fails we stop.
763                if (offsets.isEmpty()) {
764                    return pos; // No strings matched before a span.
765                }
766                // Match strings from before the next string match.
767            } else {
768                // The position is before a string match (or a single code point).
769                if (offsets.isEmpty()) {
770                    // No more strings matched before a previous string match.
771                    // Try another code point span from before the last string match.
772                    int oldPos = pos;
773                    pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
774                    spanLength = oldPos - pos;
775                    if (pos == 0 || // Reached the start of the string, or
776                            spanLength == 0 // neither strings nor span progressed.
777                    ) {
778                        return pos;
779                    }
780                    continue; // spanLength>0: Match strings from before a span.
781                } else {
782                    // Try to match only one code point from before a string match if some
783                    // string matched beyond it, so that we try all possible positions
784                    // and don't overshoot.
785                    spanLength = spanOneBack(spanSet, s, pos);
786                    if (spanLength > 0) {
787                        if (spanLength == pos) {
788                            return 0; // Reached the start of the string.
789                        }
790                        // Match strings before this code point.
791                        // There cannot be any decrements below it because UnicodeSet strings
792                        // contain multiple code points.
793                        pos -= spanLength;
794                        offsets.shift(spanLength);
795                        spanLength = 0;
796                        continue; // Match strings from before a single code point.
797                    }
798                    // Match strings from before the next string match.
799                }
800            }
801            pos -= offsets.popMinimum(null);
802            spanLength = 0; // Match strings from before a string match.
803        }
804    }
805
806    /**
807     * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
808     *
809     * Theoretical algorithm:
810     * - Iterate through the string, and at each code point boundary:
811     *   + If the code point there is in the set, then return with the current position.
812     *   + If a set string matches at the current position, then return with the current position.
813     *
814     * Optimized implementation:
815     *
816     * (Same assumption as for span() above.)
817     *
818     * Create and cache a spanNotSet which contains
819     * all of the single code points of the original set but none of its strings.
820     * For each set string add its initial code point to the spanNotSet.
821     * (Also add its final code point for spanNotBack().)
822     *
823     * - Loop:
824     *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
825     *   + If the current code point is in the original set, then return the current position.
826     *   + If any set string matches at the current position, then return the current position.
827     *   + If there is no match at the current position, neither for the code point
828     *     there nor for any set string, then skip this code point and continue the loop.
829     *     This happens for set-string-initial code points that were added to spanNotSet
830     *     when there is not actually a match for such a set string.
831     *
832     * @param s The string to be spanned
833     * @param start The start index that the span begins
834     * @param outCount If not null: Receives the number of code points across the span.
835     * @return the limit (exclusive end) of the span
836     */
837    private int spanNot(CharSequence s, int start, OutputInt outCount) {
838        int length = s.length();
839        int pos = start, rest = length - start;
840        int stringsLength = strings.size();
841        int count = 0;
842        do {
843            // Span until we find a code point from the set,
844            // or a code point that starts or ends some string.
845            int spanLimit;
846            if (outCount == null) {
847                spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
848            } else {
849                spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
850                outCount.value = count = count + outCount.value;
851            }
852            if (spanLimit == length) {
853                return length; // Reached the end of the string.
854            }
855            pos = spanLimit;
856            rest = length - spanLimit;
857
858            // Check whether the current code point is in the original set,
859            // without the string starts and ends.
860            int cpLength = spanOne(spanSet, s, pos, rest);
861            if (cpLength > 0) {
862                return pos; // There is a set element at pos.
863            }
864
865            // Try to match the strings at pos.
866            for (int i = 0; i < stringsLength; ++i) {
867                if (spanLengths[i] == ALL_CP_CONTAINED) {
868                    continue; // Irrelevant string.
869                }
870                String string = strings.get(i);
871
872                int length16 = string.length();
873                if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
874                    return pos; // There is a set element at pos.
875                }
876            }
877
878            // The span(while not contained) ended on a string start/end which is
879            // not in the original set. Skip this code point and continue.
880            // cpLength<0
881            pos -= cpLength;
882            rest += cpLength;
883            ++count;
884        } while (rest != 0);
885        if (outCount != null) {
886            outCount.value = count;
887        }
888        return length; // Reached the end of the string.
889    }
890
891    private int spanNotBack(CharSequence s, int length) {
892        int pos = length;
893        int i, stringsLength = strings.size();
894        do {
895            // Span until we find a code point from the set,
896            // or a code point that starts or ends some string.
897            pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
898            if (pos == 0) {
899                return 0; // Reached the start of the string.
900            }
901
902            // Check whether the current code point is in the original set,
903            // without the string starts and ends.
904            int cpLength = spanOneBack(spanSet, s, pos);
905            if (cpLength > 0) {
906                return pos; // There is a set element at pos.
907            }
908
909            // Try to match the strings at pos.
910            for (i = 0; i < stringsLength; ++i) {
911                // Use spanLengths rather than a spanLengths pointer because
912                // it is easier and we only need to know whether the string is irrelevant
913                // which is the same in either array.
914                if (spanLengths[i] == ALL_CP_CONTAINED) {
915                    continue; // Irrelevant string.
916                }
917                String string = strings.get(i);
918
919                int length16 = string.length();
920                if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
921                    return pos; // There is a set element at pos.
922                }
923            }
924
925            // The span(while not contained) ended on a string start/end which is
926            // not in the original set. Skip this code point and continue.
927            // cpLength<0
928            pos += cpLength;
929        } while (pos != 0);
930        return 0; // Reached the start of the string.
931    }
932
933    static short makeSpanLengthByte(int spanLength) {
934        // 0xfe==UnicodeSetStringSpan::LONG_SPAN
935        return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
936    }
937
938    // Compare strings without any argument checks. Requires length>0.
939    private static boolean matches16(CharSequence s, int start, final String t, int length) {
940        int end = start + length;
941        while (length-- > 0) {
942            if (s.charAt(--end) != t.charAt(length)) {
943                return false;
944            }
945        }
946        return true;
947    }
948
949    /**
950     * Compare 16-bit Unicode strings (which may be malformed UTF-16)
951     * at code point boundaries.
952     * That is, each edge of a match must not be in the middle of a surrogate pair.
953     * @param s       The string to match in.
954     * @param start   The start index of s.
955     * @param limit   The limit of the subsequence of s being spanned.
956     * @param t       The substring to be matched in s.
957     * @param tlength The length of t.
958     */
959    static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
960        return matches16(s, start, t, tlength)
961                && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
962                        Character.isLowSurrogate(s.charAt(start)))
963                && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
964                        Character.isLowSurrogate(s.charAt(start + tlength)));
965    }
966
967    /**
968     * Does the set contain the next code point?
969     * If so, return its length; otherwise return its negative length.
970     */
971    static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
972        char c = s.charAt(start);
973        if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
974            char c2 = s.charAt(start + 1);
975            if (UTF16.isTrailSurrogate(c2)) {
976                int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
977                return set.contains(supplementary) ? 2 : -2;
978            }
979        }
980        return set.contains(c) ? 1 : -1;
981    }
982
983    static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
984        char c = s.charAt(length - 1);
985        if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
986            char c2 = s.charAt(length - 2);
987            if (UTF16.isLeadSurrogate(c2)) {
988                int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
989                return set.contains(supplementary) ? 2 : -2;
990            }
991        }
992        return set.contains(c) ? 1 : -1;
993    }
994
995    /**
996     * Helper class for UnicodeSetStringSpan.
997     *
998     * <p>List of offsets from the current position from where to try matching
999     * a code point or a string.
1000     * Stores offsets rather than indexes to simplify the code and use the same list
1001     * for both increments (in span()) and decrements (in spanBack()).
1002     *
1003     * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1004     * are relatively dense, that is,
1005     * there are normally no gaps of hundreds or thousands of offset values.
1006     *
1007     * <p>This class optionally also tracks the minimum non-negative count for each position,
1008     * intended to count the smallest number of elements of any path leading to that position.
1009     *
1010     * <p>The implementation uses a circular buffer of count integers,
1011     * each indicating whether the corresponding offset is in the list,
1012     * and its path element count.
1013     * This avoids inserting into a sorted list of offsets (or absolute indexes)
1014     * and physically moving part of the list.
1015     *
1016     * <p>Note: In principle, the caller should setMaxLength() to
1017     * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1018     * to account for "long" single code points.
1019     *
1020     * <p>Note: An earlier version did not track counts and stored only byte flags.
1021     * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1022     * the list could be stored as bit flags in a single integer.
1023     * Rather than handling a circular buffer with a start list index,
1024     * the integer would simply be shifted when lower offsets are removed.
1025     * UnicodeSet does not have a limit on the lengths of strings.
1026     */
1027    private static final class OffsetList {
1028        private int[] list;
1029        private int length;
1030        private int start;
1031
1032        public OffsetList() {
1033            list = new int[16];  // default size
1034        }
1035
1036        public void setMaxLength(int maxLength) {
1037            if (maxLength > list.length) {
1038                list = new int[maxLength];
1039            }
1040            clear();
1041        }
1042
1043        public void clear() {
1044            for (int i = list.length; i-- > 0;) {
1045                list[i] = 0;
1046            }
1047            start = length = 0;
1048        }
1049
1050        public boolean isEmpty() {
1051            return (length == 0);
1052        }
1053
1054        /**
1055         * Reduces all stored offsets by delta, used when the current position moves by delta.
1056         * There must not be any offsets lower than delta.
1057         * If there is an offset equal to delta, it is removed.
1058         *
1059         * @param delta [1..maxLength]
1060         */
1061        public void shift(int delta) {
1062            int i = start + delta;
1063            if (i >= list.length) {
1064                i -= list.length;
1065            }
1066            if (list[i] != 0) {
1067                list[i] = 0;
1068                --length;
1069            }
1070            start = i;
1071        }
1072
1073        /**
1074         * Adds an offset. The list must not contain it yet.
1075         * @param offset [1..maxLength]
1076         */
1077        public void addOffset(int offset) {
1078            int i = start + offset;
1079            if (i >= list.length) {
1080                i -= list.length;
1081            }
1082            assert list[i] == 0;
1083            list[i] = 1;
1084            ++length;
1085        }
1086
1087        /**
1088         * Adds an offset and updates its count.
1089         * The list may already contain the offset.
1090         * @param offset [1..maxLength]
1091         */
1092        public void addOffsetAndCount(int offset, int count) {
1093            assert count > 0;
1094            int i = start + offset;
1095            if (i >= list.length) {
1096                i -= list.length;
1097            }
1098            if (list[i] == 0) {
1099                list[i] = count;
1100                ++length;
1101            } else if (count < list[i]) {
1102                list[i] = count;
1103            }
1104        }
1105
1106        /**
1107         * @param offset [1..maxLength]
1108         */
1109        public boolean containsOffset(int offset) {
1110            int i = start + offset;
1111            if (i >= list.length) {
1112                i -= list.length;
1113            }
1114            return list[i] != 0;
1115        }
1116
1117        /**
1118         * @param offset [1..maxLength]
1119         */
1120        public boolean hasCountAtOffset(int offset, int count) {
1121            int i = start + offset;
1122            if (i >= list.length) {
1123                i -= list.length;
1124            }
1125            int oldCount = list[i];
1126            return oldCount != 0 && oldCount <= count;
1127        }
1128
1129        /**
1130         * Finds the lowest stored offset from a non-empty list, removes it,
1131         * and reduces all other offsets by this minimum.
1132         * @return min=[1..maxLength]
1133         */
1134        public int popMinimum(OutputInt outCount) {
1135            // Look for the next offset in list[start+1..list.length-1].
1136            int i = start, result;
1137            while (++i < list.length) {
1138                int count = list[i];
1139                if (count != 0) {
1140                    list[i] = 0;
1141                    --length;
1142                    result = i - start;
1143                    start = i;
1144                    if (outCount != null) { outCount.value = count; }
1145                    return result;
1146                }
1147            }
1148            // i==list.length
1149
1150            // Wrap around and look for the next offset in list[0..start].
1151            // Since the list is not empty, there will be one.
1152            result = list.length - start;
1153            i = 0;
1154            int count;
1155            while ((count = list[i]) == 0) {
1156                ++i;
1157            }
1158            list[i] = 0;
1159            --length;
1160            start = i;
1161            if (outCount != null) { outCount.value = count; }
1162            return result + i;
1163        }
1164    }
1165}
1166