1/* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 */ 4/* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23import com.sun.org.apache.xerces.internal.utils.SecuritySupport; 24import java.util.Locale; 25import java.util.MissingResourceException; 26import java.util.ResourceBundle; 27import java.util.ArrayList; 28 29/** 30 * A Regular Expression Parser. 31 * 32 * @xerces.internal 33 * 34 */ 35class RegexParser { 36 static final int T_CHAR = 0; 37 static final int T_EOF = 1; 38 static final int T_OR = 2; // '|' 39 static final int T_STAR = 3; // '*' 40 static final int T_PLUS = 4; // '+' 41 static final int T_QUESTION = 5; // '?' 42 static final int T_LPAREN = 6; // '(' 43 static final int T_RPAREN = 7; // ')' 44 static final int T_DOT = 8; // '.' 45 static final int T_LBRACKET = 9; // '[' 46 static final int T_BACKSOLIDUS = 10; // '\' 47 static final int T_CARET = 11; // '^' 48 static final int T_DOLLAR = 12; // '$' 49 static final int T_LPAREN2 = 13; // '(?:' 50 static final int T_LOOKAHEAD = 14; // '(?=' 51 static final int T_NEGATIVELOOKAHEAD = 15; // '(?!' 52 static final int T_LOOKBEHIND = 16; // '(?<=' 53 static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!' 54 static final int T_INDEPENDENT = 18; // '(?>' 55 static final int T_SET_OPERATIONS = 19; // '(?[' 56 static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class 57 static final int T_COMMENT = 21; // '(?#' 58 static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z] 59 static final int T_CONDITION = 23; // '(?(' 60 static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class 61 62 static class ReferencePosition { 63 int refNumber; 64 int position; 65 ReferencePosition(int n, int pos) { 66 this.refNumber = n; 67 this.position = pos; 68 } 69 } 70 71 int offset; 72 String regex; 73 int regexlen; 74 int options; 75 ResourceBundle resources; 76 int chardata; 77 int nexttoken; 78 static protected final int S_NORMAL = 0; 79 static protected final int S_INBRACKETS = 1; 80 static protected final int S_INXBRACKETS = 2; 81 int context = S_NORMAL; 82 int parenOpened = 1; 83 int parennumber = 1; 84 boolean hasBackReferences; 85 ArrayList<ReferencePosition> references = null; 86 87 public RegexParser() { 88 this.setLocale(Locale.getDefault()); 89 } 90 public RegexParser(Locale locale) { 91 this.setLocale(locale); 92 } 93 94 public void setLocale(Locale locale) { 95 try { 96 if (locale != null) { 97 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale); 98 } 99 else { 100 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message"); 101 } 102 } 103 catch (MissingResourceException mre) { 104 throw new RuntimeException("Installation Problem??? Couldn't load messages: " 105 + mre.getMessage()); 106 } 107 } 108 109 final ParseException ex(String key, int loc) { 110 return new ParseException(this.resources.getString(key), loc); 111 } 112 113 protected final boolean isSet(int flag) { 114 return (this.options & flag) == flag; 115 } 116 117 Token parse(String regex, int options) throws ParseException { 118 this.options = options; 119 this.offset = 0; 120 this.setContext(S_NORMAL); 121 this.parennumber = 1; 122 this.parenOpened = 1; 123 this.hasBackReferences = false; 124 this.regex = regex; 125 if (this.isSet(RegularExpression.EXTENDED_COMMENT)) 126 this.regex = REUtil.stripExtendedComment(this.regex); 127 this.regexlen = this.regex.length(); 128 129 130 this.next(); 131 Token ret = this.parseRegex(); 132 if (this.offset != this.regexlen) 133 throw ex("parser.parse.1", this.offset); 134 if (this.read() != T_EOF) { 135 throw ex("parser.parse.1", this.offset-1); 136 } 137 if (this.references != null) { 138 for (int i = 0; i < this.references.size(); i ++) { 139 ReferencePosition position = this.references.get(i); 140 if (this.parennumber <= position.refNumber) 141 throw ex("parser.parse.2", position.position); 142 } 143 this.references.clear(); 144 } 145 return ret; 146 } 147 148 /* 149 public RegularExpression createRegex(String regex, int options) throws ParseException { 150 Token tok = this.parse(regex, options); 151 return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options); 152 } 153 */ 154 155 protected final void setContext(int con) { 156 this.context = con; 157 } 158 159 final int read() { 160 return this.nexttoken; 161 } 162 163 @SuppressWarnings("fallthrough") 164 final void next() { 165 if (this.offset >= this.regexlen) { 166 this.chardata = -1; 167 this.nexttoken = T_EOF; 168 return; 169 } 170 171 int ret; 172 int ch = this.regex.charAt(this.offset++); 173 this.chardata = ch; 174 175 if (this.context == S_INBRACKETS) { 176 // In a character class, this.chardata has one character, that is to say, 177 // a pair of surrogates is composed and stored to this.chardata. 178 switch (ch) { 179 case '\\': 180 ret = T_BACKSOLIDUS; 181 if (this.offset >= this.regexlen) 182 throw ex("parser.next.1", this.offset-1); 183 this.chardata = this.regex.charAt(this.offset++); 184 break; 185 186 case '-': 187 // Allow character class subtraction (regardless of whether we are in 188 // XML Schema mode or not) 189 if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { 190 this.offset++; 191 ret = T_XMLSCHEMA_CC_SUBTRACTION; 192 } else 193 ret = T_CHAR; 194 break; 195 196 case '[': 197 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) 198 && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { 199 this.offset++; 200 ret = T_POSIX_CHARCLASS_START; 201 break; 202 } // Through down 203 default: 204 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { 205 int low = this.regex.charAt(this.offset); 206 if (REUtil.isLowSurrogate(low)) { 207 this.chardata = REUtil.composeFromSurrogates(ch, low); 208 this.offset ++; 209 } 210 } 211 ret = T_CHAR; 212 } 213 this.nexttoken = ret; 214 return; 215 } 216 217 switch (ch) { 218 case '|': ret = T_OR; break; 219 case '*': ret = T_STAR; break; 220 case '+': ret = T_PLUS; break; 221 case '?': ret = T_QUESTION; break; 222 case ')': ret = T_RPAREN; break; 223 case '.': ret = T_DOT; break; 224 case '[': ret = T_LBRACKET; break; 225 case '^': 226 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { 227 ret = T_CHAR; 228 } 229 else { 230 ret = T_CARET; 231 } 232 break; 233 case '$': 234 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { 235 ret = T_CHAR; 236 } 237 else { 238 ret = T_DOLLAR; 239 } 240 break; 241 case '(': 242 ret = T_LPAREN; 243 if (this.offset >= this.regexlen) 244 break; 245 if (this.regex.charAt(this.offset) != '?') 246 break; 247 if (++this.offset >= this.regexlen) 248 throw ex("parser.next.2", this.offset-1); 249 ch = this.regex.charAt(this.offset++); 250 switch (ch) { 251 case ':': ret = T_LPAREN2; break; 252 case '=': ret = T_LOOKAHEAD; break; 253 case '!': ret = T_NEGATIVELOOKAHEAD; break; 254 case '[': ret = T_SET_OPERATIONS; break; 255 case '>': ret = T_INDEPENDENT; break; 256 case '<': 257 if (this.offset >= this.regexlen) 258 throw ex("parser.next.2", this.offset-3); 259 ch = this.regex.charAt(this.offset++); 260 if (ch == '=') { 261 ret = T_LOOKBEHIND; 262 } else if (ch == '!') { 263 ret = T_NEGATIVELOOKBEHIND; 264 } else 265 throw ex("parser.next.3", this.offset-3); 266 break; 267 case '#': 268 while (this.offset < this.regexlen) { 269 ch = this.regex.charAt(this.offset++); 270 if (ch == ')') break; 271 } 272 if (ch != ')') 273 throw ex("parser.next.4", this.offset-1); 274 ret = T_COMMENT; 275 break; 276 default: 277 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options 278 this.offset --; 279 ret = T_MODIFIERS; 280 break; 281 } else if (ch == '(') { // conditional 282 ret = T_CONDITION; // this.offsets points the next of '('. 283 break; 284 } 285 throw ex("parser.next.2", this.offset-2); 286 } 287 break; 288 289 case '\\': 290 ret = T_BACKSOLIDUS; 291 if (this.offset >= this.regexlen) 292 throw ex("parser.next.1", this.offset-1); 293 this.chardata = this.regex.charAt(this.offset++); 294 break; 295 296 default: 297 ret = T_CHAR; 298 } 299 this.nexttoken = ret; 300 } 301 302 /** 303 * regex ::= term (`|` term)* 304 * term ::= factor+ 305 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 306 * | atom (('*' | '+' | '?' | minmax ) '?'? )?) 307 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' 308 * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 309 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 310 */ 311 Token parseRegex() throws ParseException { 312 Token tok = this.parseTerm(); 313 Token parent = null; 314 while (this.read() == T_OR) { 315 this.next(); // '|' 316 if (parent == null) { 317 parent = Token.createUnion(); 318 parent.addChild(tok); 319 tok = parent; 320 } 321 tok.addChild(this.parseTerm()); 322 } 323 return tok; 324 } 325 326 /** 327 * term ::= factor+ 328 */ 329 Token parseTerm() throws ParseException { 330 int ch = this.read(); 331 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { 332 return Token.createEmpty(); 333 } else { 334 Token tok = this.parseFactor(); 335 Token concat = null; 336 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { 337 if (concat == null) { 338 concat = Token.createConcat(); 339 concat.addChild(tok); 340 tok = concat; 341 } 342 concat.addChild(this.parseFactor()); 343 //tok = Token.createConcat(tok, this.parseFactor()); 344 } 345 return tok; 346 } 347 } 348 349 // ---------------------------------------------------------------- 350 351 Token processCaret() throws ParseException { 352 this.next(); 353 return Token.token_linebeginning; 354 } 355 Token processDollar() throws ParseException { 356 this.next(); 357 return Token.token_lineend; 358 } 359 Token processLookahead() throws ParseException { 360 this.next(); 361 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); 362 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 363 this.next(); // ')' 364 return tok; 365 } 366 Token processNegativelookahead() throws ParseException { 367 this.next(); 368 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); 369 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 370 this.next(); // ')' 371 return tok; 372 } 373 Token processLookbehind() throws ParseException { 374 this.next(); 375 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); 376 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 377 this.next(); // ')' 378 return tok; 379 } 380 Token processNegativelookbehind() throws ParseException { 381 this.next(); 382 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); 383 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 384 this.next(); // ')' 385 return tok; 386 } 387 Token processBacksolidus_A() throws ParseException { 388 this.next(); 389 return Token.token_stringbeginning; 390 } 391 Token processBacksolidus_Z() throws ParseException { 392 this.next(); 393 return Token.token_stringend2; 394 } 395 Token processBacksolidus_z() throws ParseException { 396 this.next(); 397 return Token.token_stringend; 398 } 399 Token processBacksolidus_b() throws ParseException { 400 this.next(); 401 return Token.token_wordedge; 402 } 403 Token processBacksolidus_B() throws ParseException { 404 this.next(); 405 return Token.token_not_wordedge; 406 } 407 Token processBacksolidus_lt() throws ParseException { 408 this.next(); 409 return Token.token_wordbeginning; 410 } 411 Token processBacksolidus_gt() throws ParseException { 412 this.next(); 413 return Token.token_wordend; 414 } 415 Token processStar(Token tok) throws ParseException { 416 this.next(); 417 if (this.read() == T_QUESTION) { 418 this.next(); 419 return Token.createNGClosure(tok); 420 } else 421 return Token.createClosure(tok); 422 } 423 Token processPlus(Token tok) throws ParseException { 424 // X+ -> XX* 425 this.next(); 426 if (this.read() == T_QUESTION) { 427 this.next(); 428 return Token.createConcat(tok, Token.createNGClosure(tok)); 429 } else 430 return Token.createConcat(tok, Token.createClosure(tok)); 431 } 432 Token processQuestion(Token tok) throws ParseException { 433 // X? -> X| 434 this.next(); 435 Token par = Token.createUnion(); 436 if (this.read() == T_QUESTION) { 437 this.next(); 438 par.addChild(Token.createEmpty()); 439 par.addChild(tok); 440 } else { 441 par.addChild(tok); 442 par.addChild(Token.createEmpty()); 443 } 444 return par; 445 } 446 boolean checkQuestion(int off) { 447 return off < this.regexlen && this.regex.charAt(off) == '?'; 448 } 449 Token processParen() throws ParseException { 450 this.next(); 451 int p = this.parenOpened++; 452 Token tok = Token.createParen(this.parseRegex(), p); 453 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 454 this.parennumber++; 455 this.next(); // Skips ')' 456 return tok; 457 } 458 Token processParen2() throws ParseException { 459 this.next(); 460 Token tok = Token.createParen(this.parseRegex(), 0); 461 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 462 this.next(); // Skips ')' 463 return tok; 464 } 465 Token processCondition() throws ParseException { 466 // this.offset points the next of '(' 467 if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); 468 // Parses a condition. 469 int refno = -1; 470 Token condition = null; 471 int ch = this.regex.charAt(this.offset); 472 if ('1' <= ch && ch <= '9') { 473 refno = ch-'0'; 474 int finalRefno = refno; 475 476 if (this.parennumber <= refno) 477 throw ex("parser.parse.2", this.offset); 478 479 while (this.offset + 1 < this.regexlen) { 480 ch = this.regex.charAt(this.offset + 1); 481 if ('0' <= ch && ch <= '9') { 482 refno = (refno * 10) + (ch - '0'); 483 if (refno < this.parennumber) { 484 finalRefno= refno; 485 ++this.offset; 486 } 487 else { 488 break; 489 } 490 } 491 else { 492 break; 493 } 494 } 495 496 this.hasBackReferences = true; 497 if (this.references == null) this.references = new ArrayList<>(); 498 this.references.add(new ReferencePosition(finalRefno, this.offset)); 499 this.offset ++; 500 if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); 501 this.offset ++; 502 } else { 503 if (ch == '?') this.offset --; // Points '('. 504 this.next(); 505 condition = this.parseFactor(); 506 switch (condition.type) { 507 case Token.LOOKAHEAD: 508 case Token.NEGATIVELOOKAHEAD: 509 case Token.LOOKBEHIND: 510 case Token.NEGATIVELOOKBEHIND: 511 break; 512 case Token.ANCHOR: 513 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 514 break; 515 default: 516 throw ex("parser.factor.5", this.offset); 517 } 518 } 519 // Parses yes/no-patterns. 520 this.next(); 521 Token yesPattern = this.parseRegex(); 522 Token noPattern = null; 523 if (yesPattern.type == Token.UNION) { 524 if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); 525 noPattern = yesPattern.getChild(1); 526 yesPattern = yesPattern.getChild(0); 527 } 528 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 529 this.next(); 530 return Token.createCondition(refno, condition, yesPattern, noPattern); 531 } 532 Token processModifiers() throws ParseException { 533 // this.offset points the next of '?'. 534 // modifiers ::= [imsw]* ('-' [imsw]*)? ':' 535 int add = 0, mask = 0, ch = -1; 536 while (this.offset < this.regexlen) { 537 ch = this.regex.charAt(this.offset); 538 int v = REUtil.getOptionValue(ch); 539 if (v == 0) break; // '-' or ':'? 540 add |= v; 541 this.offset ++; 542 } 543 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 544 if (ch == '-') { 545 this.offset ++; 546 while (this.offset < this.regexlen) { 547 ch = this.regex.charAt(this.offset); 548 int v = REUtil.getOptionValue(ch); 549 if (v == 0) break; // ':'? 550 mask |= v; 551 this.offset ++; 552 } 553 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); 554 } 555 Token tok; 556 if (ch == ':') { 557 this.offset ++; 558 this.next(); 559 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 560 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 561 this.next(); 562 } else if (ch == ')') { // such as (?-i) 563 this.offset ++; 564 this.next(); 565 tok = Token.createModifierGroup(this.parseRegex(), add, mask); 566 } else 567 throw ex("parser.factor.3", this.offset); 568 569 return tok; 570 } 571 Token processIndependent() throws ParseException { 572 this.next(); 573 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); 574 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); 575 this.next(); // Skips ')' 576 return tok; 577 } 578 Token processBacksolidus_c() throws ParseException { 579 int ch2; // Must be in 0x0040-0x005f 580 if (this.offset >= this.regexlen 581 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) 582 throw ex("parser.atom.1", this.offset-1); 583 this.next(); 584 return Token.createChar(ch2-0x40); 585 } 586 Token processBacksolidus_C() throws ParseException { 587 throw ex("parser.process.1", this.offset); 588 } 589 Token processBacksolidus_i() throws ParseException { 590 Token tok = Token.createChar('i'); 591 this.next(); 592 return tok; 593 } 594 Token processBacksolidus_I() throws ParseException { 595 throw ex("parser.process.1", this.offset); 596 } 597 Token processBacksolidus_g() throws ParseException { 598 this.next(); 599 return Token.getGraphemePattern(); 600 } 601 Token processBacksolidus_X() throws ParseException { 602 this.next(); 603 return Token.getCombiningCharacterSequence(); 604 } 605 Token processBackreference() throws ParseException { 606 int refnum = this.chardata-'0'; 607 int finalRefnum = refnum; 608 609 if (this.parennumber <= refnum) 610 throw ex("parser.parse.2", this.offset-2); 611 612 while (this.offset < this.regexlen) { 613 final int ch = this.regex.charAt(this.offset); 614 if ('0' <= ch && ch <= '9') { 615 refnum = (refnum * 10) + (ch - '0'); 616 if (refnum < this.parennumber) { 617 ++this.offset; 618 finalRefnum = refnum; 619 this.chardata = ch; 620 } 621 else { 622 break; 623 } 624 } 625 else { 626 break; 627 } 628 } 629 630 Token tok = Token.createBackReference(finalRefnum); 631 this.hasBackReferences = true; 632 if (this.references == null) this.references = new ArrayList<>(); 633 this.references.add(new ReferencePosition(finalRefnum, this.offset-2)); 634 this.next(); 635 return tok; 636 } 637 638 // ---------------------------------------------------------------- 639 640 /** 641 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 642 * | atom (('*' | '+' | '?' | minmax ) '?'? )?) 643 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' 644 * | '(?#' [^)]* ')' 645 * minmax ::= '{' min (',' max?)? '}' 646 * min ::= [0-9]+ 647 * max ::= [0-9]+ 648 */ 649 Token parseFactor() throws ParseException { 650 int ch = this.read(); 651 Token tok; 652 switch (ch) { 653 case T_CARET: return this.processCaret(); 654 case T_DOLLAR: return this.processDollar(); 655 case T_LOOKAHEAD: return this.processLookahead(); 656 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); 657 case T_LOOKBEHIND: return this.processLookbehind(); 658 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); 659 660 case T_COMMENT: 661 this.next(); 662 return Token.createEmpty(); 663 664 case T_BACKSOLIDUS: 665 switch (this.chardata) { 666 case 'A': return this.processBacksolidus_A(); 667 case 'Z': return this.processBacksolidus_Z(); 668 case 'z': return this.processBacksolidus_z(); 669 case 'b': return this.processBacksolidus_b(); 670 case 'B': return this.processBacksolidus_B(); 671 case '<': return this.processBacksolidus_lt(); 672 case '>': return this.processBacksolidus_gt(); 673 } 674 // through down 675 } 676 tok = this.parseAtom(); 677 ch = this.read(); 678 switch (ch) { 679 case T_STAR: return this.processStar(tok); 680 case T_PLUS: return this.processPlus(tok); 681 case T_QUESTION: return this.processQuestion(tok); 682 case T_CHAR: 683 if (this.chardata == '{' && this.offset < this.regexlen) { 684 685 int off = this.offset; // this.offset -> next of '{' 686 int min = 0, max = -1; 687 688 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 689 690 min = ch -'0'; 691 while (off < this.regexlen 692 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 693 min = min*10 +ch-'0'; 694 if (min < 0) 695 throw ex("parser.quantifier.5", this.offset); 696 } 697 } 698 else { 699 throw ex("parser.quantifier.1", this.offset); 700 } 701 702 max = min; 703 if (ch == ',') { 704 705 if (off >= this.regexlen) { 706 throw ex("parser.quantifier.3", this.offset); 707 } 708 else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { 709 710 max = ch -'0'; // {min,max} 711 while (off < this.regexlen 712 && (ch = this.regex.charAt(off++)) >= '0' 713 && ch <= '9') { 714 max = max*10 +ch-'0'; 715 if (max < 0) 716 throw ex("parser.quantifier.5", this.offset); 717 } 718 719 if (min > max) 720 throw ex("parser.quantifier.4", this.offset); 721 } 722 else { // assume {min,} 723 max = -1; 724 } 725 } 726 727 if (ch != '}') 728 throw ex("parser.quantifier.2", this.offset); 729 730 if (this.checkQuestion(off)) { // off -> next of '}' 731 tok = Token.createNGClosure(tok); 732 this.offset = off+1; 733 } else { 734 tok = Token.createClosure(tok); 735 this.offset = off; 736 } 737 738 tok.setMin(min); 739 tok.setMax(max); 740 //System.err.println("CLOSURE: "+min+", "+max); 741 this.next(); 742 } 743 } 744 return tok; 745 } 746 747 /** 748 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 749 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 750 * | '(?>' regex ')' 751 * char ::= '\\' | '\' [efnrt] | bmp-code | character-1 752 */ 753 Token parseAtom() throws ParseException { 754 int ch = this.read(); 755 Token tok = null; 756 switch (ch) { 757 case T_LPAREN: return this.processParen(); 758 case T_LPAREN2: return this.processParen2(); // '(?:' 759 case T_CONDITION: return this.processCondition(); // '(?(' 760 case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) 761 case T_INDEPENDENT: return this.processIndependent(); 762 case T_DOT: 763 this.next(); // Skips '.' 764 tok = Token.token_dot; 765 break; 766 767 /** 768 * char-class ::= '[' ( '^'? range ','?)+ ']' 769 * range ::= '\d' | '\w' | '\s' | category-block | range-char 770 * | range-char '-' range-char 771 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 772 * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 773 */ 774 case T_LBRACKET: return this.parseCharacterClass(true); 775 case T_SET_OPERATIONS: return this.parseSetOperations(); 776 777 case T_BACKSOLIDUS: 778 switch (this.chardata) { 779 case 'd': case 'D': 780 case 'w': case 'W': 781 case 's': case 'S': 782 tok = this.getTokenForShorthand(this.chardata); 783 this.next(); 784 return tok; 785 786 case 'e': case 'f': case 'n': case 'r': 787 case 't': case 'u': case 'v': case 'x': 788 { 789 int ch2 = this.decodeEscaped(); 790 if (ch2 < 0x10000) { 791 tok = Token.createChar(ch2); 792 } else { 793 tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); 794 } 795 } 796 break; 797 798 case 'c': return this.processBacksolidus_c(); 799 case 'C': return this.processBacksolidus_C(); 800 case 'i': return this.processBacksolidus_i(); 801 case 'I': return this.processBacksolidus_I(); 802 case 'g': return this.processBacksolidus_g(); 803 case 'X': return this.processBacksolidus_X(); 804 case '1': case '2': case '3': case '4': 805 case '5': case '6': case '7': case '8': case '9': 806 return this.processBackreference(); 807 808 case 'P': 809 case 'p': 810 int pstart = this.offset; 811 tok = processBacksolidus_pP(this.chardata); 812 if (tok == null) throw this.ex("parser.atom.5", pstart); 813 break; 814 815 default: 816 tok = Token.createChar(this.chardata); 817 } 818 this.next(); 819 break; 820 821 case T_CHAR: 822 if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}') 823 throw this.ex("parser.atom.4", this.offset-1); 824 tok = Token.createChar(this.chardata); 825 int high = this.chardata; 826 this.next(); 827 if (REUtil.isHighSurrogate(high) 828 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { 829 char[] sur = new char[2]; 830 sur[0] = (char)high; 831 sur[1] = (char)this.chardata; 832 tok = Token.createParen(Token.createString(new String(sur)), 0); 833 this.next(); 834 } 835 break; 836 837 default: 838 throw this.ex("parser.atom.4", this.offset-1); 839 } 840 return tok; 841 } 842 843 protected RangeToken processBacksolidus_pP(int c) throws ParseException { 844 845 this.next(); 846 if (this.read() != T_CHAR || this.chardata != '{') 847 throw this.ex("parser.atom.2", this.offset-1); 848 849 // handle category escape 850 boolean positive = c == 'p'; 851 int namestart = this.offset; 852 int nameend = this.regex.indexOf('}', namestart); 853 854 if (nameend < 0) 855 throw this.ex("parser.atom.3", this.offset); 856 857 String pname = this.regex.substring(namestart, nameend); 858 this.offset = nameend+1; 859 860 return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); 861 } 862 863 int processCIinCharacterClass(RangeToken tok, int c) { 864 return this.decodeEscaped(); 865 } 866 867 /** 868 * char-class ::= '[' ( '^'? range ','?)+ ']' 869 * range ::= '\d' | '\w' | '\s' | category-block | range-char 870 * | range-char '-' range-char 871 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 872 * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 873 */ 874 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { 875 this.setContext(S_INBRACKETS); 876 this.next(); // '[' 877 boolean nrange = false; 878 RangeToken base = null; 879 RangeToken tok; 880 if (this.read() == T_CHAR && this.chardata == '^') { 881 nrange = true; 882 this.next(); // '^' 883 if (useNrange) { 884 tok = Token.createNRange(); 885 } else { 886 base = Token.createRange(); 887 base.addRange(0, Token.UTF16_MAX); 888 tok = Token.createRange(); 889 } 890 } else { 891 tok = Token.createRange(); 892 } 893 int type; 894 boolean firstloop = true; 895 while ((type = this.read()) != T_EOF) { 896 if (type == T_CHAR && this.chardata == ']' && !firstloop) 897 break; 898 int c = this.chardata; 899 boolean end = false; 900 if (type == T_BACKSOLIDUS) { 901 switch (c) { 902 case 'd': case 'D': 903 case 'w': case 'W': 904 case 's': case 'S': 905 tok.mergeRanges(this.getTokenForShorthand(c)); 906 end = true; 907 break; 908 909 case 'i': case 'I': 910 case 'c': case 'C': 911 c = this.processCIinCharacterClass(tok, c); 912 if (c < 0) end = true; 913 break; 914 915 case 'p': 916 case 'P': 917 int pstart = this.offset; 918 RangeToken tok2 = this.processBacksolidus_pP(c); 919 if (tok2 == null) throw this.ex("parser.atom.5", pstart); 920 tok.mergeRanges(tok2); 921 end = true; 922 break; 923 924 default: 925 c = this.decodeEscaped(); 926 } // \ + c 927 } // backsolidus 928 // POSIX Character class such as [:alnum:] 929 else if (type == T_POSIX_CHARCLASS_START) { 930 int nameend = this.regex.indexOf(':', this.offset); 931 if (nameend < 0) throw this.ex("parser.cc.1", this.offset); 932 boolean positive = true; 933 if (this.regex.charAt(this.offset) == '^') { 934 this.offset ++; 935 positive = false; 936 } 937 String name = this.regex.substring(this.offset, nameend); 938 RangeToken range = Token.getRange(name, positive, 939 this.isSet(RegularExpression.XMLSCHEMA_MODE)); 940 if (range == null) throw this.ex("parser.cc.3", this.offset); 941 tok.mergeRanges(range); 942 end = true; 943 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') 944 throw this.ex("parser.cc.1", nameend); 945 this.offset = nameend+2; 946 } 947 else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) { 948 if (nrange) { 949 nrange = false; 950 if (useNrange) { 951 tok = (RangeToken) Token.complementRanges(tok); 952 } 953 else { 954 base.subtractRanges(tok); 955 tok = base; 956 } 957 } 958 RangeToken range2 = this.parseCharacterClass(false); 959 tok.subtractRanges(range2); 960 if (this.read() != T_CHAR || this.chardata != ']') { 961 throw this.ex("parser.cc.5", this.offset); 962 } 963 break; // Exit this loop 964 } 965 this.next(); 966 if (!end) { // if not shorthands... 967 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'. 968 if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) { 969 tok.addRange(c, c); 970 } 971 else { 972 addCaseInsensitiveChar(tok, c); 973 } 974 } 975 else if (type == T_XMLSCHEMA_CC_SUBTRACTION) { 976 throw this.ex("parser.cc.8", this.offset-1); 977 } 978 else { 979 this.next(); // Skips '-' 980 if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); 981 if (type == T_CHAR && this.chardata == ']') { 982 if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) { 983 tok.addRange(c, c); 984 } 985 else { 986 addCaseInsensitiveChar(tok, c); 987 } 988 tok.addRange('-', '-'); 989 } else { 990 int rangeend = this.chardata; 991 if (type == T_BACKSOLIDUS) { 992 rangeend = this.decodeEscaped(); 993 } 994 this.next(); 995 if (c > rangeend) { 996 throw this.ex("parser.ope.3", this.offset-1); 997 } 998 if (!this.isSet(RegularExpression.IGNORE_CASE) || 999 (c > 0xffff && rangeend > 0xffff)) { 1000 tok.addRange(c, rangeend); 1001 } 1002 else { 1003 addCaseInsensitiveCharRange(tok, c, rangeend); 1004 } 1005 } 1006 } 1007 } 1008 if (this.isSet(RegularExpression.SPECIAL_COMMA) 1009 && this.read() == T_CHAR && this.chardata == ',') { 1010 this.next(); 1011 } 1012 firstloop = false; 1013 } 1014 if (this.read() == T_EOF) { 1015 throw this.ex("parser.cc.2", this.offset); 1016 } 1017 1018 if (!useNrange && nrange) { 1019 base.subtractRanges(tok); 1020 tok = base; 1021 } 1022 tok.sortRanges(); 1023 tok.compactRanges(); 1024 this.setContext(S_NORMAL); 1025 this.next(); // Skips ']' 1026 1027 return tok; 1028 } 1029 1030 /** 1031 * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')' 1032 */ 1033 protected RangeToken parseSetOperations() throws ParseException { 1034 RangeToken tok = this.parseCharacterClass(false); 1035 int type; 1036 while ((type = this.read()) != T_RPAREN) { 1037 int ch = this.chardata; 1038 if (type == T_CHAR && (ch == '-' || ch == '&') 1039 || type == T_PLUS) { 1040 this.next(); 1041 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); 1042 RangeToken t2 = this.parseCharacterClass(false); 1043 if (type == T_PLUS) 1044 tok.mergeRanges(t2); 1045 else if (ch == '-') 1046 tok.subtractRanges(t2); 1047 else if (ch == '&') 1048 tok.intersectRanges(t2); 1049 else 1050 throw new RuntimeException("ASSERT"); 1051 } else { 1052 throw ex("parser.ope.2", this.offset-1); 1053 } 1054 } 1055 this.next(); 1056 return tok; 1057 } 1058 1059 Token getTokenForShorthand(int ch) { 1060 Token tok; 1061 switch (ch) { 1062 case 'd': 1063 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1064 ? Token.getRange("Nd", true) : Token.token_0to9; 1065 break; 1066 case 'D': 1067 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1068 ? Token.getRange("Nd", false) : Token.token_not_0to9; 1069 break; 1070 case 'w': 1071 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1072 ? Token.getRange("IsWord", true) : Token.token_wordchars; 1073 break; 1074 case 'W': 1075 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1076 ? Token.getRange("IsWord", false) : Token.token_not_wordchars; 1077 break; 1078 case 's': 1079 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1080 ? Token.getRange("IsSpace", true) : Token.token_spaces; 1081 break; 1082 case 'S': 1083 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) 1084 ? Token.getRange("IsSpace", false) : Token.token_not_spaces; 1085 break; 1086 1087 default: 1088 throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); 1089 } 1090 return tok; 1091 } 1092 1093 /** 1094 */ 1095 int decodeEscaped() throws ParseException { 1096 if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); 1097 int c = this.chardata; 1098 switch (c) { 1099 case 'e': c = 0x1b; break; // ESCAPE U+001B 1100 case 'f': c = '\f'; break; // FORM FEED U+000C 1101 case 'n': c = '\n'; break; // LINE FEED U+000A 1102 case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D 1103 case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009 1104 //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B 1105 case 'x': 1106 this.next(); 1107 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1108 if (this.chardata == '{') { 1109 int v1 = 0; 1110 int uv = 0; 1111 do { 1112 this.next(); 1113 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); 1114 if ((v1 = hexChar(this.chardata)) < 0) 1115 break; 1116 if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); 1117 uv = uv*16+v1; 1118 } while (true); 1119 if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); 1120 if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); 1121 c = uv; 1122 } else { 1123 int v1 = 0; 1124 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1125 throw ex("parser.descape.1", this.offset-1); 1126 int uv = v1; 1127 this.next(); 1128 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1129 throw ex("parser.descape.1", this.offset-1); 1130 uv = uv*16+v1; 1131 c = uv; 1132 } 1133 break; 1134 1135 case 'u': 1136 int v1 = 0; 1137 this.next(); 1138 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1139 throw ex("parser.descape.1", this.offset-1); 1140 int uv = v1; 1141 this.next(); 1142 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1143 throw ex("parser.descape.1", this.offset-1); 1144 uv = uv*16+v1; 1145 this.next(); 1146 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1147 throw ex("parser.descape.1", this.offset-1); 1148 uv = uv*16+v1; 1149 this.next(); 1150 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1151 throw ex("parser.descape.1", this.offset-1); 1152 uv = uv*16+v1; 1153 c = uv; 1154 break; 1155 1156 case 'v': 1157 this.next(); 1158 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1159 throw ex("parser.descape.1", this.offset-1); 1160 uv = v1; 1161 this.next(); 1162 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1163 throw ex("parser.descape.1", this.offset-1); 1164 uv = uv*16+v1; 1165 this.next(); 1166 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1167 throw ex("parser.descape.1", this.offset-1); 1168 uv = uv*16+v1; 1169 this.next(); 1170 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1171 throw ex("parser.descape.1", this.offset-1); 1172 uv = uv*16+v1; 1173 this.next(); 1174 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1175 throw ex("parser.descape.1", this.offset-1); 1176 uv = uv*16+v1; 1177 this.next(); 1178 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) 1179 throw ex("parser.descape.1", this.offset-1); 1180 uv = uv*16+v1; 1181 if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); 1182 c = uv; 1183 break; 1184 case 'A': 1185 case 'Z': 1186 case 'z': 1187 throw ex("parser.descape.5", this.offset-2); 1188 default: 1189 } 1190 return c; 1191 } 1192 1193 static private final int hexChar(int ch) { 1194 if (ch < '0') return -1; 1195 if (ch > 'f') return -1; 1196 if (ch <= '9') return ch-'0'; 1197 if (ch < 'A') return -1; 1198 if (ch <= 'F') return ch-'A'+10; 1199 if (ch < 'a') return -1; 1200 return ch-'a'+10; 1201 } 1202 1203 static protected final void addCaseInsensitiveChar(RangeToken tok, int c) { 1204 final int[] caseMap = CaseInsensitiveMap.get(c); 1205 tok.addRange(c, c); 1206 1207 if (caseMap != null) { 1208 for (int i=0; i<caseMap.length; i+=2) { 1209 tok.addRange(caseMap[i], caseMap[i]); 1210 } 1211 } 1212 1213 } 1214 1215 static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) { 1216 int[] caseMap; 1217 int r1, r2; 1218 if (start <= end) { 1219 r1 = start; 1220 r2 = end; 1221 } else { 1222 r1 = end; 1223 r2 = start; 1224 } 1225 1226 tok.addRange(r1, r2); 1227 for (int ch = r1; ch <= r2; ch++) { 1228 caseMap = CaseInsensitiveMap.get(ch); 1229 if (caseMap != null) { 1230 for (int i=0; i<caseMap.length; i+=2) { 1231 tok.addRange(caseMap[i], caseMap[i]); 1232 } 1233 } 1234 } 1235 } 1236} 1237