1/*
2 * Permission is hereby granted, free of charge, to any person obtaining a copy of
3 * this software and associated documentation files (the "Software"), to deal in
4 * the Software without restriction, including without limitation the rights to
5 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
6 * of the Software, and to permit persons to whom the Software is furnished to do
7 * so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in all
10 * copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
18 * SOFTWARE.
19 */
20package jdk.nashorn.internal.runtime.regexp.joni;
21
22import java.util.Arrays;
23import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
24import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
25
26@SuppressWarnings("javadoc")
27public final class EncodingHelper {
28
29    final static int NEW_LINE            = 0x000a;
30    final static int RETURN              = 0x000d;
31    final static int LINE_SEPARATOR      = 0x2028;
32    final static int PARAGRAPH_SEPARATOR = 0x2029;
33
34    final static char[] EMPTYCHARS = new char[0];
35    final static int[][] codeRanges = new int[15][];
36
37    public static int digitVal(final int code) {
38        return code - '0';
39    }
40
41    public static int odigitVal(final int code) {
42        return digitVal(code);
43    }
44
45    public static boolean isXDigit(final int code) {
46        return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
47    }
48
49    public static int xdigitVal(final int code) {
50        if (Character.isDigit(code)) {
51            return code - '0';
52        } else if (code >= 'a' && code <= 'f') {
53            return code - 'a' + 10;
54        } else {
55            return code - 'A' + 10;
56        }
57    }
58
59    public static boolean isDigit(final int code) {
60        return code >= '0' && code <= '9';
61    }
62
63    public static boolean isWord(final int code) {
64        // letter, digit, or '_'
65        return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
66    }
67
68    public static boolean isNewLine(final int code) {
69        return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
70    }
71
72    public static boolean isNewLine(final char[] chars, final int p, final int end) {
73        return p < end && isNewLine(chars[p]);
74    }
75
76    // Encoding.prevCharHead
77    public static int prevCharHead(final int p, final int s) {
78        return s <= p ? -1 : s - 1;
79    }
80
81    /* onigenc_get_right_adjust_char_head_with_prev */
82    public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
83        if (prev != null) {
84            prev.value = -1; /* Sorry */
85        }
86        return s;
87    }
88
89    // Encoding.stepBack
90    public static int stepBack(final int p, final int sp, final int np) {
91        int s = sp, n = np;
92        while (s != -1 && n-- > 0) {
93           if (s <= p) {
94            return -1;
95        }
96           s--;
97       }
98       return s;
99    }
100
101    public static int mbcodeStartPosition() {
102        return 0x80;
103    }
104
105    public static char[] caseFoldCodesByString(final int flag, final char c) {
106        char[] codes = EMPTYCHARS;
107        final char upper = toUpperCase(c);
108
109        if (upper != toLowerCase(upper)) {
110            int count = 0;
111            char ch = 0;
112
113            do {
114                final char u = toUpperCase(ch);
115                if (u == upper && ch != c) {
116                    // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
117                    codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
118                    codes[count++] = ch;
119                }
120            } while (ch++ < 0xffff);
121        }
122        return codes;
123    }
124
125    public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
126        for (int c = 0; c < 0xffff; c++) {
127            if (Character.isLowerCase(c)) {
128                final int upper = toUpperCase(c);
129
130                if (upper != c) {
131                    ApplyCaseFold.apply(c, upper, arg);
132                }
133            }
134        }
135
136        // Some characters have multiple lower case variants, hence we need to do a second run
137        for (int c = 0; c < 0xffff; c++) {
138            if (Character.isLowerCase(c)) {
139                final int upper = toUpperCase(c);
140
141                if (upper != c) {
142                    ApplyCaseFold.apply(upper, c, arg);
143                }
144            }
145        }
146    }
147
148    public static char toLowerCase(final char c) {
149        return (char)toLowerCase((int)c);
150    }
151
152    public static int toLowerCase(final int c) {
153        if (c < 128) {
154            return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
155        }
156        // Do not convert non-ASCII upper case character to ASCII lower case.
157        final int lower = Character.toLowerCase(c);
158        return (lower < 128) ? c : lower;
159
160    }
161
162    public static char toUpperCase(final char c) {
163        return (char)toUpperCase((int)c);
164    }
165
166    public static int toUpperCase(final int c) {
167        if (c < 128) {
168            return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
169        }
170        // Do not convert non-ASCII lower case character to ASCII upper case.
171        final int upper = Character.toUpperCase(c);
172        return (upper < 128) ? c : upper;
173    }
174
175    public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
176        sbOut.value = 0x100; // use bitset for codes smaller than 256
177        int[] range = null;
178
179        if (ctype < codeRanges.length) {
180            range = codeRanges[ctype];
181
182            if (range == null) {
183                // format: [numberOfRanges, rangeStart, rangeEnd, ...]
184                range = new int[16];
185                int rangeCount = 0;
186                int lastCode = -2;
187
188                for (int code = 0; code <= 0xffff; code++) {
189                    if (isCodeCType(code, ctype)) {
190                        if (lastCode < code -1) {
191                            if (rangeCount * 2 + 2 >= range.length) {
192                                range = Arrays.copyOf(range, range.length * 2);
193                            }
194                            range[rangeCount * 2 + 1] = code;
195                            rangeCount++;
196                        }
197                        range[rangeCount * 2] = lastCode = code;
198                    }
199                }
200
201                if (rangeCount * 2 + 1 < range.length) {
202                    range = Arrays.copyOf(range, rangeCount * 2 + 1);
203                }
204
205                range[0] = rangeCount;
206                codeRanges[ctype] = range;
207            }
208        }
209
210        return range;
211    }
212
213    // CodeRange.isInCodeRange
214    public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
215        int low = 0;
216        final int n = p[offset];
217        int high = n ;
218
219        while (low < high) {
220            final int x = (low + high) >> 1;
221            if (code > p[(x << 1) + 2 + offset]) {
222                low = x + 1;
223            } else {
224                high = x;
225            }
226        }
227        return low < n && code >= p[(low << 1) + 1 + offset];
228    }
229
230    /**
231     * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
232     *
233     * @param code code
234     * @param ctype ctype
235     *
236     * @return isCodeCType
237     */
238    public static boolean isCodeCType(final int code, final int ctype) {
239        int type;
240        switch (ctype) {
241            case CharacterType.NEWLINE:
242                return isNewLine(code);
243            case CharacterType.ALPHA:
244                return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
245            case CharacterType.BLANK:
246                return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
247            case CharacterType.CNTRL:
248                type = Character.getType(code);
249                return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
250            case CharacterType.DIGIT:
251                return EncodingHelper.isDigit(code);
252            case CharacterType.GRAPH:
253                switch (code) {
254                    case 0x09:
255                    case 0x0a:
256                    case 0x0b:
257                    case 0x0c:
258                    case 0x0d:
259                        return false;
260                    default:
261                        type = Character.getType(code);
262                        return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
263                }
264            case CharacterType.LOWER:
265                return Character.isLowerCase(code);
266            case CharacterType.PRINT:
267                type = Character.getType(code);
268                return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
269            case CharacterType.PUNCT:
270                return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
271            case CharacterType.SPACE:
272                // ECMA 7.2 and 7.3
273                switch (code) {
274                    case 0x09:
275                    case 0x0a:
276                    case 0x0b:
277                    case 0x0c:
278                    case 0x0d:
279                        return true;
280                    default:
281                        // true if Unicode separator or BOM or U+180E (see JDK-8138758)
282                        return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0
283                                || code == 0xfeff || code == 0x180e;
284                }
285            case CharacterType.UPPER:
286                return Character.isUpperCase(code);
287            case CharacterType.XDIGIT:
288                return EncodingHelper.isXDigit(code);
289            case CharacterType.WORD:
290                return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
291            case CharacterType.ALNUM:
292                return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
293            case CharacterType.ASCII:
294                return code < 0x80;
295            default:
296                throw new RuntimeException("illegal character type: " + ctype);
297        }
298    }
299}
300
301