1/*
2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/*
27 ******************************************************************************
28 *
29 *   Copyright (C) 2009-2014, International Business Machines
30 *   Corporation and others.  All Rights Reserved.
31 *
32 ******************************************************************************
33 */
34
35package sun.text.normalizer;
36
37import sun.text.normalizer.UnicodeSet.SpanCondition;
38
39/**
40 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
41 *
42 * Latin-1: Look up bytes.
43 * 2-byte characters: Bits organized vertically.
44 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
45 * Supplementary characters: Call contains() on the parent set.
46 */
47final class BMPSet {
48
49    /**
50     * One boolean ('true' or 'false') per Latin-1 character.
51     */
52    private boolean[] latin1Contains;
53
54    /**
55     * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
56     * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
57     * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
58     *
59     * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
60     * runtime.
61     */
62    private int[] table7FF;
63
64    /**
65     * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
66     * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
67     * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
68     * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
69     * and set.contains(c) must be called.
70     *
71     * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
72     * validity checking at runtime.
73     */
74    private int[] bmpBlockBits;
75
76    /**
77     * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
78     * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
79     * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
80     */
81    private int[] list4kStarts;
82
83    /**
84     * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
85     * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
86     */
87    private final int[] list;
88    private final int listLength; // length used; list may be longer to minimize reallocs
89
90    public BMPSet(final int[] parentList, int parentListLength) {
91        list = parentList;
92        listLength = parentListLength;
93        latin1Contains = new boolean[0x100];
94        table7FF = new int[64];
95        bmpBlockBits = new int[64];
96        list4kStarts = new int[18];
97
98        /*
99         * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
100         * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
101         * indexes is for finding supplementary code points.
102         */
103        list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
104        int i;
105        for (i = 1; i <= 0x10; ++i) {
106            list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
107        }
108        list4kStarts[0x11] = listLength - 1;
109
110        initBits();
111    }
112
113    public boolean contains(int c) {
114        if (c <= 0xff) {
115            return (latin1Contains[c]);
116        } else if (c <= 0x7ff) {
117            return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
118        } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
119            int lead = c >> 12;
120            int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
121            if (twoBits <= 1) {
122                // All 64 code points with the same bits 15..6
123                // are either in the set or not.
124                return (0 != twoBits);
125            } else {
126                // Look up the code point in its 4k block of code points.
127                return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
128            }
129        } else if (c <= 0x10ffff) {
130            // surrogate or supplementary code point
131            return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
132        } else {
133            // Out-of-range code points get false, consistent with long-standing
134            // behavior of UnicodeSet.contains(c).
135            return false;
136        }
137    }
138
139    /**
140     * Span the initial substring for which each character c has spanCondition==contains(c). It must be
141     * spanCondition==0 or 1.
142     *
143     * @param start The start index
144     * @param outCount If not null: Receives the number of code points in the span.
145     * @return the limit (exclusive end) of the span
146     *
147     * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
148     * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
149     * as usual in ICU.
150     */
151    public final int span(CharSequence s, int start, SpanCondition spanCondition,
152            OutputInt outCount) {
153        char c, c2;
154        int i = start;
155        int limit = s.length();
156        int numSupplementary = 0;
157        if (SpanCondition.NOT_CONTAINED != spanCondition) {
158            // span
159            while (i < limit) {
160                c = s.charAt(i);
161                if (c <= 0xff) {
162                    if (!latin1Contains[c]) {
163                        break;
164                    }
165                } else if (c <= 0x7ff) {
166                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
167                        break;
168                    }
169                } else if (c < 0xd800 ||
170                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
171                    int lead = c >> 12;
172                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
173                    if (twoBits <= 1) {
174                        // All 64 code points with the same bits 15..6
175                        // are either in the set or not.
176                        if (twoBits == 0) {
177                            break;
178                        }
179                    } else {
180                        // Look up the code point in its 4k block of code points.
181                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
182                            break;
183                        }
184                    }
185                } else {
186                    // surrogate pair
187                    int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
188                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
189                        break;
190                    }
191                    ++numSupplementary;
192                    ++i;
193                }
194                ++i;
195            }
196        } else {
197            // span not
198            while (i < limit) {
199                c = s.charAt(i);
200                if (c <= 0xff) {
201                    if (latin1Contains[c]) {
202                        break;
203                    }
204                } else if (c <= 0x7ff) {
205                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
206                        break;
207                    }
208                } else if (c < 0xd800 ||
209                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
210                    int lead = c >> 12;
211                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
212                    if (twoBits <= 1) {
213                        // All 64 code points with the same bits 15..6
214                        // are either in the set or not.
215                        if (twoBits != 0) {
216                            break;
217                        }
218                    } else {
219                        // Look up the code point in its 4k block of code points.
220                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
221                            break;
222                        }
223                    }
224                } else {
225                    // surrogate pair
226                    int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
227                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
228                        break;
229                    }
230                    ++numSupplementary;
231                    ++i;
232                }
233                ++i;
234            }
235        }
236        if (outCount != null) {
237            int spanLength = i - start;
238            outCount.value = spanLength - numSupplementary;  // number of code points
239        }
240        return i;
241    }
242
243    /**
244     * Symmetrical with span().
245     * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
246     * limit and spanCondition==0 or 1.
247     *
248     * @return The string index which starts the span (i.e. inclusive).
249     */
250    public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
251        char c, c2;
252
253        if (SpanCondition.NOT_CONTAINED != spanCondition) {
254            // span
255            for (;;) {
256                c = s.charAt(--limit);
257                if (c <= 0xff) {
258                    if (!latin1Contains[c]) {
259                        break;
260                    }
261                } else if (c <= 0x7ff) {
262                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
263                        break;
264                    }
265                } else if (c < 0xd800 ||
266                           c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
267                    int lead = c >> 12;
268                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
269                    if (twoBits <= 1) {
270                        // All 64 code points with the same bits 15..6
271                        // are either in the set or not.
272                        if (twoBits == 0) {
273                            break;
274                        }
275                    } else {
276                        // Look up the code point in its 4k block of code points.
277                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
278                            break;
279                        }
280                    }
281                } else {
282                    // surrogate pair
283                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
284                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
285                        break;
286                    }
287                    --limit;
288                }
289                if (0 == limit) {
290                    return 0;
291                }
292            }
293        } else {
294            // span not
295            for (;;) {
296                c = s.charAt(--limit);
297                if (c <= 0xff) {
298                    if (latin1Contains[c]) {
299                        break;
300                    }
301                } else if (c <= 0x7ff) {
302                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
303                        break;
304                    }
305                } else if (c < 0xd800 ||
306                           c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
307                    int lead = c >> 12;
308                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
309                    if (twoBits <= 1) {
310                        // All 64 code points with the same bits 15..6
311                        // are either in the set or not.
312                        if (twoBits != 0) {
313                            break;
314                        }
315                    } else {
316                        // Look up the code point in its 4k block of code points.
317                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
318                            break;
319                        }
320                    }
321                } else {
322                    // surrogate pair
323                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
324                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
325                        break;
326                    }
327                    --limit;
328                }
329                if (0 == limit) {
330                    return 0;
331                }
332            }
333        }
334        return limit + 1;
335    }
336
337    /**
338     * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
339     */
340    private static void set32x64Bits(int[] table, int start, int limit) {
341        assert (64 == table.length);
342        int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
343        int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
344
345        // Set one bit indicating an all-one block.
346        int bits = 1 << lead;
347        if ((start + 1) == limit) { // Single-character shortcut.
348            table[trail] |= bits;
349            return;
350        }
351
352        int limitLead = limit >> 6;
353        int limitTrail = limit & 0x3f;
354
355        if (lead == limitLead) {
356            // Partial vertical bit column.
357            while (trail < limitTrail) {
358                table[trail++] |= bits;
359            }
360        } else {
361            // Partial vertical bit column,
362            // followed by a bit rectangle,
363            // followed by another partial vertical bit column.
364            if (trail > 0) {
365                do {
366                    table[trail++] |= bits;
367                } while (trail < 64);
368                ++lead;
369            }
370            if (lead < limitLead) {
371                bits = ~((1 << lead) - 1);
372                if (limitLead < 0x20) {
373                    bits &= (1 << limitLead) - 1;
374                }
375                for (trail = 0; trail < 64; ++trail) {
376                    table[trail] |= bits;
377                }
378            }
379            // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
380            // In that case, bits=1<<limitLead == 1<<0 == 1
381            // (because Java << uses only the lower 5 bits of the shift operand)
382            // but the bits value is not used because trail<limitTrail is already false.
383            bits = 1 << limitLead;
384            for (trail = 0; trail < limitTrail; ++trail) {
385                table[trail] |= bits;
386            }
387        }
388    }
389
390    private void initBits() {
391        int start, limit;
392        int listIndex = 0;
393
394        // Set latin1Contains[].
395        do {
396            start = list[listIndex++];
397            if (listIndex < listLength) {
398                limit = list[listIndex++];
399            } else {
400                limit = 0x110000;
401            }
402            if (start >= 0x100) {
403                break;
404            }
405            do {
406                latin1Contains[start++] = true;
407            } while (start < limit && start < 0x100);
408        } while (limit <= 0x100);
409
410        // Set table7FF[].
411        while (start < 0x800) {
412            set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
413            if (limit > 0x800) {
414                start = 0x800;
415                break;
416            }
417
418            start = list[listIndex++];
419            if (listIndex < listLength) {
420                limit = list[listIndex++];
421            } else {
422                limit = 0x110000;
423            }
424        }
425
426        // Set bmpBlockBits[].
427        int minStart = 0x800;
428        while (start < 0x10000) {
429            if (limit > 0x10000) {
430                limit = 0x10000;
431            }
432
433            if (start < minStart) {
434                start = minStart;
435            }
436            if (start < limit) { // Else: Another range entirely in a known mixed-value block.
437                if (0 != (start & 0x3f)) {
438                    // Mixed-value block of 64 code points.
439                    start >>= 6;
440                    bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
441                    start = (start + 1) << 6; // Round up to the next block boundary.
442                    minStart = start; // Ignore further ranges in this block.
443                }
444                if (start < limit) {
445                    if (start < (limit & ~0x3f)) {
446                        // Multiple all-ones blocks of 64 code points each.
447                        set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
448                    }
449
450                    if (0 != (limit & 0x3f)) {
451                        // Mixed-value block of 64 code points.
452                        limit >>= 6;
453                        bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
454                      limit = (limit + 1) << 6; // Round up to the next block boundary.
455                        minStart = limit; // Ignore further ranges in this block.
456                    }
457                }
458            }
459
460            if (limit == 0x10000) {
461                break;
462          }
463
464            start = list[listIndex++];
465            if (listIndex < listLength) {
466                limit = list[listIndex++];
467            } else {
468                limit = 0x110000;
469            }
470        }
471    }
472
473    /**
474     * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
475     * points in a certain range.
476     *
477     * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
478     * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
479     *
480     * @param c
481     *            a character in a subrange of MIN_VALUE..MAX_VALUE
482     * @param lo
483     *            The lowest index to be returned.
484     * @param hi
485     *            The highest index to be returned.
486     * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
487     */
488    private int findCodePoint(int c, int lo, int hi) {
489        /* Examples:
490                                           findCodePoint(c)
491           set              list[]         c=0 1 3 4 7 8
492           ===              ==============   ===========
493           []               [110000]         0 0 0 0 0 0
494           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
495           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
496           [:Any:]          [0, 110000]      1 1 1 1 1 1
497         */
498
499        // Return the smallest i such that c < list[i]. Assume
500        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
501        if (c < list[lo])
502            return lo;
503        // High runner test. c is often after the last range, so an
504        // initial check for this condition pays off.
505        if (lo >= hi || c >= list[hi - 1])
506            return hi;
507        // invariant: c >= list[lo]
508        // invariant: c < list[hi]
509        for (;;) {
510            int i = (lo + hi) >>> 1;
511            if (i == lo) {
512                break; // Found!
513            } else if (c < list[i]) {
514                hi = i;
515            } else {
516                lo = i;
517            }
518        }
519        return hi;
520    }
521
522    private final boolean containsSlow(int c, int lo, int hi) {
523        return (0 != (findCodePoint(c, lo, hi) & 1));
524    }
525}
526
527