// EncodingHelper.java revision 953:221a84ef44c0
1249259Sdim/* 2249259Sdim * Permission is hereby granted, free of charge, to any person obtaining a copy of 3249259Sdim * this software and associated documentation files (the "Software"), to deal in 4249259Sdim * the Software without restriction, including without limitation the rights to 5249259Sdim * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 6249259Sdim * of the Software, and to permit persons to whom the Software is furnished to do 7249259Sdim * so, subject to the following conditions: 8249259Sdim * 9249259Sdim * The above copyright notice and this permission notice shall be included in all 10249259Sdim * copies or substantial portions of the Software. 11249259Sdim * 12249259Sdim * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13249259Sdim * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14249259Sdim * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15249259Sdim * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16249259Sdim * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17249259Sdim * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18249259Sdim * SOFTWARE. 
19249259Sdim */ 20249259Sdimpackage jdk.nashorn.internal.runtime.regexp.joni; 21249259Sdim 22249259Sdimimport java.util.Arrays; 23249259Sdimimport jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType; 24249259Sdimimport jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder; 25249259Sdim 26249259Sdimpublic final class EncodingHelper { 27249259Sdim 28249259Sdim final static int NEW_LINE = 0x000a; 29249259Sdim final static int RETURN = 0x000d; 30249259Sdim final static int LINE_SEPARATOR = 0x2028; 31249259Sdim final static int PARAGRAPH_SEPARATOR = 0x2029; 32249259Sdim 33249259Sdim final static char[] EMPTYCHARS = new char[0]; 34249259Sdim final static int[][] codeRanges = new int[15][]; 35249259Sdim 36249259Sdim public static int digitVal(final int code) { 37249259Sdim return code - '0'; 38249259Sdim } 39249259Sdim 40249259Sdim public static int odigitVal(final int code) { 41249259Sdim return digitVal(code); 42249259Sdim } 43249259Sdim 44249259Sdim public static boolean isXDigit(final int code) { 45249259Sdim return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F'); 46249259Sdim } 47249259Sdim 48249259Sdim public static int xdigitVal(final int code) { 49249259Sdim if (Character.isDigit(code)) { 50249259Sdim return code - '0'; 51249259Sdim } else if (code >= 'a' && code <= 'f') { 52249259Sdim return code - 'a' + 10; 53249259Sdim } else { 54249259Sdim return code - 'A' + 10; 55249259Sdim } 56249259Sdim } 57249259Sdim 58249259Sdim public static boolean isDigit(final int code) { 59249259Sdim return code >= '0' && code <= '9'; 60249259Sdim } 61249259Sdim 62249259Sdim public static boolean isWord(final int code) { 63249259Sdim // letter, digit, or '_' 64249259Sdim return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 65249259Sdim } 66249259Sdim 67249259Sdim public static boolean isNewLine(final int code) { 68249259Sdim return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == 
PARAGRAPH_SEPARATOR; 69249259Sdim } 70249259Sdim 71249259Sdim public static boolean isNewLine(final char[] chars, final int p, final int end) { 72249259Sdim return p < end && isNewLine(chars[p]); 73249259Sdim } 74249259Sdim 75249259Sdim // Encoding.prevCharHead 76249259Sdim public static int prevCharHead(final int p, final int s) { 77249259Sdim return s <= p ? -1 : s - 1; 78249259Sdim } 79249259Sdim 80249259Sdim /* onigenc_get_right_adjust_char_head_with_prev */ 81249259Sdim public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) { 82249259Sdim if (prev != null) prev.value = -1; /* Sorry */ 83249259Sdim return s; 84249259Sdim } 85249259Sdim 86249259Sdim // Encoding.stepBack 87249259Sdim public static int stepBack(final int p, int s, int n) { 88249259Sdim while (s != -1 && n-- > 0) { 89249259Sdim if (s <= p) return -1; 90249259Sdim s--; 91249259Sdim } 92249259Sdim return s; 93249259Sdim } 94249259Sdim 95249259Sdim public static int mbcodeStartPosition() { 96249259Sdim return 0x80; 97249259Sdim } 98249259Sdim 99249259Sdim public static char[] caseFoldCodesByString(final int flag, final char c) { 100249259Sdim char[] codes = EMPTYCHARS; 101249259Sdim final char upper = toUpperCase(c); 102249259Sdim 103249259Sdim if (upper != toLowerCase(upper)) { 104249259Sdim int count = 0; 105249259Sdim char ch = 0; 106249259Sdim 107249259Sdim do { 108249259Sdim final char u = toUpperCase(ch); 109249259Sdim if (u == upper && ch != c) { 110249259Sdim // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine. 111249259Sdim codes = count == 0 ? 
new char[1] : Arrays.copyOf(codes, count + 1); 112249259Sdim codes[count++] = ch; 113249259Sdim } 114249259Sdim } while (ch++ < 0xffff); 115249259Sdim } 116249259Sdim return codes; 117249259Sdim } 118249259Sdim 119249259Sdim public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) { 120249259Sdim for (int c = 0; c < 0xffff; c++) { 121249259Sdim if (Character.isLowerCase(c)) { 122249259Sdim final int upper = toUpperCase(c); 123249259Sdim 124249259Sdim if (upper != c) { 125249259Sdim fun.apply(c, upper, arg); 126249259Sdim } 127249259Sdim } 128249259Sdim } 129249259Sdim 130249259Sdim // Some characters have multiple lower case variants, hence we need to do a second run 131249259Sdim for (int c = 0; c < 0xffff; c++) { 132249259Sdim if (Character.isLowerCase(c)) { 133249259Sdim final int upper = toUpperCase(c); 134249259Sdim 135249259Sdim if (upper != c) { 136249259Sdim fun.apply(upper, c, arg); 137249259Sdim } 138249259Sdim } 139249259Sdim } 140249259Sdim } 141249259Sdim 142249259Sdim public static char toLowerCase(final char c) { 143249259Sdim return (char)toLowerCase((int)c); 144249259Sdim } 145249259Sdim 146249259Sdim public static int toLowerCase(final int c) { 147249259Sdim if (c < 128) { 148249259Sdim return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c; 149249259Sdim } 150249259Sdim // Do not convert non-ASCII upper case character to ASCII lower case. 151249259Sdim final int lower = Character.toLowerCase(c); 152249259Sdim return (lower < 128) ? c : lower; 153249259Sdim 154249259Sdim } 155249259Sdim 156249259Sdim public static char toUpperCase(final char c) { 157249259Sdim return (char)toUpperCase((int)c); 158249259Sdim } 159249259Sdim 160249259Sdim public static int toUpperCase(final int c) { 161249259Sdim if (c < 128) { 162249259Sdim return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c; 163249259Sdim } 164249259Sdim // Do not convert non-ASCII lower case character to ASCII upper case. 
165249259Sdim final int upper = Character.toUpperCase(c); 166249259Sdim return (upper < 128) ? c : upper; 167249259Sdim } 168249259Sdim 169249259Sdim public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) { 170249259Sdim sbOut.value = 0x100; // use bitset for codes smaller than 256 171249259Sdim int[] range = null; 172249259Sdim 173249259Sdim if (ctype < codeRanges.length) { 174249259Sdim range = codeRanges[ctype]; 175249259Sdim 176249259Sdim if (range == null) { 177249259Sdim // format: [numberOfRanges, rangeStart, rangeEnd, ...] 178249259Sdim range = new int[16]; 179249259Sdim int rangeCount = 0; 180249259Sdim int lastCode = -2; 181249259Sdim 182249259Sdim for (int code = 0; code <= 0xffff; code++) { 183249259Sdim if (isCodeCType(code, ctype)) { 184249259Sdim if (lastCode < code -1) { 185249259Sdim if (rangeCount * 2 + 2 >= range.length) { 186249259Sdim range = Arrays.copyOf(range, range.length * 2); 187249259Sdim } 188249259Sdim range[rangeCount * 2 + 1] = code; 189249259Sdim rangeCount++; 190249259Sdim } 191249259Sdim range[rangeCount * 2] = lastCode = code; 192249259Sdim } 193249259Sdim } 194249259Sdim 195249259Sdim if (rangeCount * 2 + 1 < range.length) { 196249259Sdim range = Arrays.copyOf(range, rangeCount * 2 + 1); 197249259Sdim } 198249259Sdim 199249259Sdim range[0] = rangeCount; 200249259Sdim codeRanges[ctype] = range; 201249259Sdim } 202249259Sdim } 203249259Sdim 204249259Sdim return range; 205249259Sdim } 206249259Sdim 207249259Sdim // CodeRange.isInCodeRange 208249259Sdim public static boolean isInCodeRange(final int[] p, final int offset, final int code) { 209249259Sdim int low = 0; 210249259Sdim final int n = p[offset]; 211249259Sdim int high = n ; 212249259Sdim 213249259Sdim while (low < high) { 214249259Sdim final int x = (low + high) >> 1; 215249259Sdim if (code > p[(x << 1) + 2 + offset]) { 216249259Sdim low = x + 1; 217249259Sdim } else { 218249259Sdim high = x; 219249259Sdim } 220249259Sdim } 221249259Sdim return low < n && 
code >= p[(low << 1) + 1 + offset]; 222249259Sdim } 223249259Sdim 224249259Sdim /** 225249259Sdim * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a> 226249259Sdim */ 227249259Sdim public static boolean isCodeCType(final int code, final int ctype) { 228249259Sdim int type; 229249259Sdim switch (ctype) { 230249259Sdim case CharacterType.NEWLINE: 231249259Sdim return isNewLine(code); 232249259Sdim case CharacterType.ALPHA: 233249259Sdim return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0; 234249259Sdim case CharacterType.BLANK: 235249259Sdim return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR; 236249259Sdim case CharacterType.CNTRL: 237249259Sdim type = Character.getType(code); 238249259Sdim return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED; 239249259Sdim case CharacterType.DIGIT: 240249259Sdim return EncodingHelper.isDigit(code); 241249259Sdim case CharacterType.GRAPH: 242249259Sdim switch (code) { 243249259Sdim case 0x09: 244249259Sdim case 0x0a: 245249259Sdim case 0x0b: 246249259Sdim case 0x0c: 247249259Sdim case 0x0d: 248249259Sdim return false; 249249259Sdim default: 250249259Sdim type = Character.getType(code); 251249259Sdim return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED; 252249259Sdim } 253249259Sdim case CharacterType.LOWER: 254249259Sdim return Character.isLowerCase(code); 255249259Sdim case CharacterType.PRINT: 256249259Sdim type = Character.getType(code); 257249259Sdim return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED; 258249259Sdim case CharacterType.PUNCT: 259249259Sdim return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0; 260249259Sdim case CharacterType.SPACE: 261249259Sdim // ECMA 7.2 and 7.3 262249259Sdim switch (code) { 263249259Sdim case 0x09: 264249259Sdim case 0x0a: 265249259Sdim case 0x0b: 266249259Sdim case 0x0c: 
267249259Sdim case 0x0d: 268249259Sdim return true; 269249259Sdim default: 270249259Sdim // true if Unicode separator or BOM 271249259Sdim return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff; 272249259Sdim } 273249259Sdim case CharacterType.UPPER: 274249259Sdim return Character.isUpperCase(code); 275249259Sdim case CharacterType.XDIGIT: 276249259Sdim return EncodingHelper.isXDigit(code); 277249259Sdim case CharacterType.WORD: 278249259Sdim return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 279249259Sdim case CharacterType.ALNUM: 280249259Sdim return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0; 281249259Sdim case CharacterType.ASCII: 282249259Sdim return code < 0x80; 283249259Sdim default: 284249259Sdim throw new RuntimeException("illegal character type: " + ctype); 285249259Sdim } 286249259Sdim } 287249259Sdim} 288249259Sdim 289249259Sdim