EncodingHelper.java revision 953:221a84ef44c0
1249259Sdim/*
2249259Sdim * Permission is hereby granted, free of charge, to any person obtaining a copy of
3249259Sdim * this software and associated documentation files (the "Software"), to deal in
4249259Sdim * the Software without restriction, including without limitation the rights to
5249259Sdim * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
6249259Sdim * of the Software, and to permit persons to whom the Software is furnished to do
7249259Sdim * so, subject to the following conditions:
8249259Sdim *
9249259Sdim * The above copyright notice and this permission notice shall be included in all
10249259Sdim * copies or substantial portions of the Software.
11249259Sdim *
12249259Sdim * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13249259Sdim * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14249259Sdim * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15249259Sdim * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16249259Sdim * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17249259Sdim * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
18249259Sdim * SOFTWARE.
19249259Sdim */
20249259Sdimpackage jdk.nashorn.internal.runtime.regexp.joni;
21249259Sdim
22249259Sdimimport java.util.Arrays;
23249259Sdimimport jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
24249259Sdimimport jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
25249259Sdim
26249259Sdimpublic final class EncodingHelper {
27249259Sdim
28249259Sdim    final static int NEW_LINE            = 0x000a;
29249259Sdim    final static int RETURN              = 0x000d;
30249259Sdim    final static int LINE_SEPARATOR      = 0x2028;
31249259Sdim    final static int PARAGRAPH_SEPARATOR = 0x2029;
32249259Sdim
33249259Sdim    final static char[] EMPTYCHARS = new char[0];
34249259Sdim    final static int[][] codeRanges = new int[15][];
35249259Sdim
36249259Sdim    public static int digitVal(final int code) {
37249259Sdim        return code - '0';
38249259Sdim    }
39249259Sdim
40249259Sdim    public static int odigitVal(final int code) {
41249259Sdim        return digitVal(code);
42249259Sdim    }
43249259Sdim
44249259Sdim    public static boolean isXDigit(final int code) {
45249259Sdim        return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
46249259Sdim    }
47249259Sdim
48249259Sdim    public static int xdigitVal(final int code) {
49249259Sdim        if (Character.isDigit(code)) {
50249259Sdim            return code - '0';
51249259Sdim        } else if (code >= 'a' && code <= 'f') {
52249259Sdim            return code - 'a' + 10;
53249259Sdim        } else {
54249259Sdim            return code - 'A' + 10;
55249259Sdim        }
56249259Sdim    }
57249259Sdim
58249259Sdim    public static boolean isDigit(final int code) {
59249259Sdim        return code >= '0' && code <= '9';
60249259Sdim    }
61249259Sdim
62249259Sdim    public static boolean isWord(final int code) {
63249259Sdim        // letter, digit, or '_'
64249259Sdim        return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
65249259Sdim    }
66249259Sdim
67249259Sdim    public static boolean isNewLine(final int code) {
68249259Sdim        return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
69249259Sdim    }
70249259Sdim
71249259Sdim    public static boolean isNewLine(final char[] chars, final int p, final int end) {
72249259Sdim        return p < end && isNewLine(chars[p]);
73249259Sdim    }
74249259Sdim
75249259Sdim    // Encoding.prevCharHead
76249259Sdim    public static int prevCharHead(final int p, final int s) {
77249259Sdim        return s <= p ? -1 : s - 1;
78249259Sdim    }
79249259Sdim
80249259Sdim    /* onigenc_get_right_adjust_char_head_with_prev */
81249259Sdim    public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
82249259Sdim        if (prev != null) prev.value = -1; /* Sorry */
83249259Sdim        return s;
84249259Sdim    }
85249259Sdim
86249259Sdim    // Encoding.stepBack
87249259Sdim    public static int stepBack(final int p, int s, int n) {
88249259Sdim       while (s != -1 && n-- > 0) {
89249259Sdim           if (s <= p) return -1;
90249259Sdim           s--;
91249259Sdim       }
92249259Sdim       return s;
93249259Sdim    }
94249259Sdim
95249259Sdim    public static int mbcodeStartPosition() {
96249259Sdim        return 0x80;
97249259Sdim    }
98249259Sdim
99249259Sdim    public static char[] caseFoldCodesByString(final int flag, final char c) {
100249259Sdim        char[] codes = EMPTYCHARS;
101249259Sdim        final char upper = toUpperCase(c);
102249259Sdim
103249259Sdim        if (upper != toLowerCase(upper)) {
104249259Sdim            int count = 0;
105249259Sdim            char ch = 0;
106249259Sdim
107249259Sdim            do {
108249259Sdim                final char u = toUpperCase(ch);
109249259Sdim                if (u == upper && ch != c) {
110249259Sdim                    // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
111249259Sdim                    codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
112249259Sdim                    codes[count++] = ch;
113249259Sdim                }
114249259Sdim            } while (ch++ < 0xffff);
115249259Sdim        }
116249259Sdim        return codes;
117249259Sdim    }
118249259Sdim
119249259Sdim    public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
120249259Sdim        for (int c = 0; c < 0xffff; c++) {
121249259Sdim            if (Character.isLowerCase(c)) {
122249259Sdim                final int upper = toUpperCase(c);
123249259Sdim
124249259Sdim                if (upper != c) {
125249259Sdim                    fun.apply(c, upper, arg);
126249259Sdim                }
127249259Sdim            }
128249259Sdim        }
129249259Sdim
130249259Sdim        // Some characters have multiple lower case variants, hence we need to do a second run
131249259Sdim        for (int c = 0; c < 0xffff; c++) {
132249259Sdim            if (Character.isLowerCase(c)) {
133249259Sdim                final int upper = toUpperCase(c);
134249259Sdim
135249259Sdim                if (upper != c) {
136249259Sdim                    fun.apply(upper, c, arg);
137249259Sdim                }
138249259Sdim            }
139249259Sdim        }
140249259Sdim    }
141249259Sdim
142249259Sdim    public static char toLowerCase(final char c) {
143249259Sdim        return (char)toLowerCase((int)c);
144249259Sdim    }
145249259Sdim
146249259Sdim    public static int toLowerCase(final int c) {
147249259Sdim        if (c < 128) {
148249259Sdim            return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
149249259Sdim        }
150249259Sdim        // Do not convert non-ASCII upper case character to ASCII lower case.
151249259Sdim        final int lower = Character.toLowerCase(c);
152249259Sdim        return (lower < 128) ? c : lower;
153249259Sdim
154249259Sdim    }
155249259Sdim
156249259Sdim    public static char toUpperCase(final char c) {
157249259Sdim        return (char)toUpperCase((int)c);
158249259Sdim    }
159249259Sdim
160249259Sdim    public static int toUpperCase(final int c) {
161249259Sdim        if (c < 128) {
162249259Sdim            return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
163249259Sdim        }
164249259Sdim        // Do not convert non-ASCII lower case character to ASCII upper case.
165249259Sdim        final int upper = Character.toUpperCase(c);
166249259Sdim        return (upper < 128) ? c : upper;
167249259Sdim    }
168249259Sdim
169249259Sdim    public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
170249259Sdim        sbOut.value = 0x100; // use bitset for codes smaller than 256
171249259Sdim        int[] range = null;
172249259Sdim
173249259Sdim        if (ctype < codeRanges.length) {
174249259Sdim            range = codeRanges[ctype];
175249259Sdim
176249259Sdim            if (range == null) {
177249259Sdim                // format: [numberOfRanges, rangeStart, rangeEnd, ...]
178249259Sdim                range = new int[16];
179249259Sdim                int rangeCount = 0;
180249259Sdim                int lastCode = -2;
181249259Sdim
182249259Sdim                for (int code = 0; code <= 0xffff; code++) {
183249259Sdim                    if (isCodeCType(code, ctype)) {
184249259Sdim                        if (lastCode < code -1) {
185249259Sdim                            if (rangeCount * 2 + 2 >= range.length) {
186249259Sdim                                range = Arrays.copyOf(range, range.length * 2);
187249259Sdim                            }
188249259Sdim                            range[rangeCount * 2 + 1] = code;
189249259Sdim                            rangeCount++;
190249259Sdim                        }
191249259Sdim                        range[rangeCount * 2] = lastCode = code;
192249259Sdim                    }
193249259Sdim                }
194249259Sdim
195249259Sdim                if (rangeCount * 2 + 1 < range.length) {
196249259Sdim                    range = Arrays.copyOf(range, rangeCount * 2 + 1);
197249259Sdim                }
198249259Sdim
199249259Sdim                range[0] = rangeCount;
200249259Sdim                codeRanges[ctype] = range;
201249259Sdim            }
202249259Sdim        }
203249259Sdim
204249259Sdim        return range;
205249259Sdim    }
206249259Sdim
207249259Sdim    // CodeRange.isInCodeRange
208249259Sdim    public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
209249259Sdim        int low = 0;
210249259Sdim        final int n = p[offset];
211249259Sdim        int high = n ;
212249259Sdim
213249259Sdim        while (low < high) {
214249259Sdim            final int x = (low + high) >> 1;
215249259Sdim            if (code > p[(x << 1) + 2 + offset]) {
216249259Sdim                low = x + 1;
217249259Sdim            } else {
218249259Sdim                high = x;
219249259Sdim            }
220249259Sdim        }
221249259Sdim        return low < n && code >= p[(low << 1) + 1 + offset];
222249259Sdim    }
223249259Sdim
224249259Sdim    /**
225249259Sdim     * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
226249259Sdim     */
227249259Sdim    public static boolean isCodeCType(final int code, final int ctype) {
228249259Sdim        int type;
229249259Sdim        switch (ctype) {
230249259Sdim            case CharacterType.NEWLINE:
231249259Sdim                return isNewLine(code);
232249259Sdim            case CharacterType.ALPHA:
233249259Sdim                return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
234249259Sdim            case CharacterType.BLANK:
235249259Sdim                return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
236249259Sdim            case CharacterType.CNTRL:
237249259Sdim                type = Character.getType(code);
238249259Sdim                return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
239249259Sdim            case CharacterType.DIGIT:
240249259Sdim                return EncodingHelper.isDigit(code);
241249259Sdim            case CharacterType.GRAPH:
242249259Sdim                switch (code) {
243249259Sdim                    case 0x09:
244249259Sdim                    case 0x0a:
245249259Sdim                    case 0x0b:
246249259Sdim                    case 0x0c:
247249259Sdim                    case 0x0d:
248249259Sdim                        return false;
249249259Sdim                    default:
250249259Sdim                        type = Character.getType(code);
251249259Sdim                        return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
252249259Sdim                }
253249259Sdim            case CharacterType.LOWER:
254249259Sdim                return Character.isLowerCase(code);
255249259Sdim            case CharacterType.PRINT:
256249259Sdim                type = Character.getType(code);
257249259Sdim                return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
258249259Sdim            case CharacterType.PUNCT:
259249259Sdim                return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
260249259Sdim            case CharacterType.SPACE:
261249259Sdim                // ECMA 7.2 and 7.3
262249259Sdim                switch (code) {
263249259Sdim                    case 0x09:
264249259Sdim                    case 0x0a:
265249259Sdim                    case 0x0b:
266249259Sdim                    case 0x0c:
267249259Sdim                    case 0x0d:
268249259Sdim                        return true;
269249259Sdim                    default:
270249259Sdim                        // true if Unicode separator or BOM
271249259Sdim                        return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff;
272249259Sdim                }
273249259Sdim            case CharacterType.UPPER:
274249259Sdim                return Character.isUpperCase(code);
275249259Sdim            case CharacterType.XDIGIT:
276249259Sdim                return EncodingHelper.isXDigit(code);
277249259Sdim            case CharacterType.WORD:
278249259Sdim                return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
279249259Sdim            case CharacterType.ALNUM:
280249259Sdim                return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
281249259Sdim            case CharacterType.ASCII:
282249259Sdim                return code < 0x80;
283249259Sdim            default:
284249259Sdim                throw new RuntimeException("illegal character type: " + ctype);
285249259Sdim        }
286249259Sdim    }
287249259Sdim}
288249259Sdim
289249259Sdim