EncodingHelper.java revision 1453:a261947d1e0e
1/* 2 * Permission is hereby granted, free of charge, to any person obtaining a copy of 3 * this software and associated documentation files (the "Software"), to deal in 4 * the Software without restriction, including without limitation the rights to 5 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 6 * of the Software, and to permit persons to whom the Software is furnished to do 7 * so, subject to the following conditions: 8 * 9 * The above copyright notice and this permission notice shall be included in all 10 * copies or substantial portions of the Software. 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 * SOFTWARE. 19 */ 20package jdk.nashorn.internal.runtime.regexp.joni; 21 22import java.util.Arrays; 23import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType; 24import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder; 25 26@SuppressWarnings("javadoc") 27public final class EncodingHelper { 28 29 final static int NEW_LINE = 0x000a; 30 final static int RETURN = 0x000d; 31 final static int LINE_SEPARATOR = 0x2028; 32 final static int PARAGRAPH_SEPARATOR = 0x2029; 33 34 final static char[] EMPTYCHARS = new char[0]; 35 final static int[][] codeRanges = new int[15][]; 36 37 public static int digitVal(final int code) { 38 return code - '0'; 39 } 40 41 public static int odigitVal(final int code) { 42 return digitVal(code); 43 } 44 45 public static boolean isXDigit(final int code) { 46 return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F'); 47 } 48 49 public static int xdigitVal(final int code) { 50 if (Character.isDigit(code)) { 51 return code - '0'; 52 } else if (code >= 'a' && code <= 'f') { 53 return code - 'a' + 10; 54 } else { 55 return code - 'A' + 10; 56 } 57 } 58 59 public static boolean isDigit(final int code) { 60 return code >= '0' && code <= '9'; 61 } 62 63 public static boolean isWord(final int code) { 64 // letter, digit, or '_' 65 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 66 } 67 68 public static boolean isNewLine(final int code) { 69 return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR; 70 } 71 72 public static boolean isNewLine(final char[] chars, final int p, final int end) { 73 return p < end && isNewLine(chars[p]); 74 } 75 76 // Encoding.prevCharHead 77 public static int prevCharHead(final int p, final int s) { 78 return s <= p ? -1 : s - 1; 79 } 80 81 /* onigenc_get_right_adjust_char_head_with_prev */ 82 public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) { 83 if (prev != null) { 84 prev.value = -1; /* Sorry */ 85 } 86 return s; 87 } 88 89 // Encoding.stepBack 90 public static int stepBack(final int p, final int sp, final int np) { 91 int s = sp, n = np; 92 while (s != -1 && n-- > 0) { 93 if (s <= p) { 94 return -1; 95 } 96 s--; 97 } 98 return s; 99 } 100 101 public static int mbcodeStartPosition() { 102 return 0x80; 103 } 104 105 public static char[] caseFoldCodesByString(final int flag, final char c) { 106 char[] codes = EMPTYCHARS; 107 final char upper = toUpperCase(c); 108 109 if (upper != toLowerCase(upper)) { 110 int count = 0; 111 char ch = 0; 112 113 do { 114 final char u = toUpperCase(ch); 115 if (u == upper && ch != c) { 116 // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine. 117 codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1); 118 codes[count++] = ch; 119 } 120 } while (ch++ < 0xffff); 121 } 122 return codes; 123 } 124 125 public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) { 126 for (int c = 0; c < 0xffff; c++) { 127 if (Character.isLowerCase(c)) { 128 final int upper = toUpperCase(c); 129 130 if (upper != c) { 131 ApplyCaseFold.apply(c, upper, arg); 132 } 133 } 134 } 135 136 // Some characters have multiple lower case variants, hence we need to do a second run 137 for (int c = 0; c < 0xffff; c++) { 138 if (Character.isLowerCase(c)) { 139 final int upper = toUpperCase(c); 140 141 if (upper != c) { 142 ApplyCaseFold.apply(upper, c, arg); 143 } 144 } 145 } 146 } 147 148 public static char toLowerCase(final char c) { 149 return (char)toLowerCase((int)c); 150 } 151 152 public static int toLowerCase(final int c) { 153 if (c < 128) { 154 return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c; 155 } 156 // Do not convert non-ASCII upper case character to ASCII lower case. 157 final int lower = Character.toLowerCase(c); 158 return (lower < 128) ? c : lower; 159 160 } 161 162 public static char toUpperCase(final char c) { 163 return (char)toUpperCase((int)c); 164 } 165 166 public static int toUpperCase(final int c) { 167 if (c < 128) { 168 return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c; 169 } 170 // Do not convert non-ASCII lower case character to ASCII upper case. 171 final int upper = Character.toUpperCase(c); 172 return (upper < 128) ? c : upper; 173 } 174 175 public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) { 176 sbOut.value = 0x100; // use bitset for codes smaller than 256 177 int[] range = null; 178 179 if (ctype < codeRanges.length) { 180 range = codeRanges[ctype]; 181 182 if (range == null) { 183 // format: [numberOfRanges, rangeStart, rangeEnd, ...] 184 range = new int[16]; 185 int rangeCount = 0; 186 int lastCode = -2; 187 188 for (int code = 0; code <= 0xffff; code++) { 189 if (isCodeCType(code, ctype)) { 190 if (lastCode < code -1) { 191 if (rangeCount * 2 + 2 >= range.length) { 192 range = Arrays.copyOf(range, range.length * 2); 193 } 194 range[rangeCount * 2 + 1] = code; 195 rangeCount++; 196 } 197 range[rangeCount * 2] = lastCode = code; 198 } 199 } 200 201 if (rangeCount * 2 + 1 < range.length) { 202 range = Arrays.copyOf(range, rangeCount * 2 + 1); 203 } 204 205 range[0] = rangeCount; 206 codeRanges[ctype] = range; 207 } 208 } 209 210 return range; 211 } 212 213 // CodeRange.isInCodeRange 214 public static boolean isInCodeRange(final int[] p, final int offset, final int code) { 215 int low = 0; 216 final int n = p[offset]; 217 int high = n ; 218 219 while (low < high) { 220 final int x = (low + high) >> 1; 221 if (code > p[(x << 1) + 2 + offset]) { 222 low = x + 1; 223 } else { 224 high = x; 225 } 226 } 227 return low < n && code >= p[(low << 1) + 1 + offset]; 228 } 229 230 /** 231 * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a> 232 * 233 * @param code code 234 * @param ctype ctype 235 * 236 * @return isCodeCType 237 */ 238 public static boolean isCodeCType(final int code, final int ctype) { 239 int type; 240 switch (ctype) { 241 case CharacterType.NEWLINE: 242 return isNewLine(code); 243 case CharacterType.ALPHA: 244 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0; 245 case CharacterType.BLANK: 246 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR; 247 case CharacterType.CNTRL: 248 type = Character.getType(code); 249 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED; 250 case CharacterType.DIGIT: 251 return EncodingHelper.isDigit(code); 252 case CharacterType.GRAPH: 253 switch (code) { 254 case 0x09: 255 case 0x0a: 256 case 0x0b: 257 case 0x0c: 258 case 0x0d: 259 return false; 260 default: 261 type = Character.getType(code); 262 return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED; 263 } 264 case CharacterType.LOWER: 265 return Character.isLowerCase(code); 266 case CharacterType.PRINT: 267 type = Character.getType(code); 268 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED; 269 case CharacterType.PUNCT: 270 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0; 271 case CharacterType.SPACE: 272 // ECMA 7.2 and 7.3 273 switch (code) { 274 case 0x09: 275 case 0x0a: 276 case 0x0b: 277 case 0x0c: 278 case 0x0d: 279 return true; 280 default: 281 // true if Unicode separator or BOM or U+180E (see JDK-8138758) 282 return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 283 || code == 0xfeff || code == 0x180e; 284 } 285 case CharacterType.UPPER: 286 return Character.isUpperCase(code); 287 case CharacterType.XDIGIT: 288 return EncodingHelper.isXDigit(code); 289 case CharacterType.WORD: 290 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; 291 case CharacterType.ALNUM: 292 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0; 293 case CharacterType.ASCII: 294 return code < 0x80; 295 default: 296 throw new RuntimeException("illegal character type: " + ctype); 297 } 298 } 299} 300 301