1/* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 */ 4/* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23import java.io.IOException; 24import java.io.ObjectInputStream; 25import java.io.ObjectOutputStream; 26import java.io.ObjectStreamField; 27import java.util.ArrayList; 28import java.util.Collections; 29import java.util.HashMap; 30import java.util.HashSet; 31import java.util.List; 32import java.util.Map; 33import java.util.Set; 34import java.util.Vector; 35 36/** 37 * This class represents a node in parse tree. 38 * 39 * @xerces.internal 40 * 41 */ 42class Token implements java.io.Serializable { 43 44 private static final long serialVersionUID = 8484976002585487481L; 45 46 static final boolean COUNTTOKENS = true; 47 static int tokens = 0; 48 49 static final int CHAR = 0; // Literal char 50 static final int DOT = 11; // . 51 static final int CONCAT = 1; // XY 52 static final int UNION = 2; // X|Y|Z 53 static final int CLOSURE = 3; // X* 54 static final int RANGE = 4; // [a-zA-Z] etc. 55 static final int NRANGE = 5; // [^a-zA-Z] etc. 56 static final int PAREN = 6; // (X) or (?:X) 57 static final int EMPTY = 7; // 58 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z 59 static final int NONGREEDYCLOSURE = 9; // *? +? 60 static final int STRING = 10; // strings 61 static final int BACKREFERENCE = 12; // back references 62 static final int LOOKAHEAD = 20; // (?=...) 63 static final int NEGATIVELOOKAHEAD = 21; // (?!...) 64 static final int LOOKBEHIND = 22; // (?<=...) 65 static final int NEGATIVELOOKBEHIND = 23; // (?<!...) 66 static final int INDEPENDENT = 24; // (?>...) 67 static final int MODIFIERGROUP = 25; // (?ims-ims:...) 68 static final int CONDITION = 26; // (?(...)yes|no) 69 70 static final int UTF16_MAX = 0x10ffff; 71 72 final int type; 73 74 static Token token_dot; 75 static Token token_0to9; 76 static Token token_wordchars; 77 static Token token_not_0to9; 78 static Token token_not_wordchars; 79 static Token token_spaces; 80 static Token token_not_spaces; 81 static Token token_empty; 82 static Token token_linebeginning; 83 static Token token_linebeginning2; 84 static Token token_lineend; 85 static Token token_stringbeginning; 86 static Token token_stringend; 87 static Token token_stringend2; 88 static Token token_wordedge; 89 static Token token_not_wordedge; 90 static Token token_wordbeginning; 91 static Token token_wordend; 92 static { 93 Token.token_empty = new Token(Token.EMPTY); 94 95 Token.token_linebeginning = Token.createAnchor('^'); 96 Token.token_linebeginning2 = Token.createAnchor('@'); 97 Token.token_lineend = Token.createAnchor('$'); 98 Token.token_stringbeginning = Token.createAnchor('A'); 99 Token.token_stringend = Token.createAnchor('z'); 100 Token.token_stringend2 = Token.createAnchor('Z'); 101 Token.token_wordedge = Token.createAnchor('b'); 102 Token.token_not_wordedge = Token.createAnchor('B'); 103 Token.token_wordbeginning = Token.createAnchor('<'); 104 Token.token_wordend = Token.createAnchor('>'); 105 106 Token.token_dot = new Token(Token.DOT); 107 108 Token.token_0to9 = Token.createRange(); 109 Token.token_0to9.addRange('0', '9'); 110 Token.token_wordchars = Token.createRange(); 111 Token.token_wordchars.addRange('0', '9'); 112 Token.token_wordchars.addRange('A', 'Z'); 113 Token.token_wordchars.addRange('_', '_'); 114 Token.token_wordchars.addRange('a', 'z'); 115 Token.token_spaces = Token.createRange(); 116 Token.token_spaces.addRange('\t', '\t'); 117 Token.token_spaces.addRange('\n', '\n'); 118 Token.token_spaces.addRange('\f', '\f'); 119 Token.token_spaces.addRange('\r', '\r'); 120 Token.token_spaces.addRange(' ', ' '); 121 122 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9); 123 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars); 124 Token.token_not_spaces = Token.complementRanges(Token.token_spaces); 125 } 126 127 static Token.ParenToken createLook(int type, Token child) { 128 if (COUNTTOKENS) Token.tokens ++; 129 return new Token.ParenToken(type, child, 0); 130 } 131 static Token.ParenToken createParen(Token child, int pnumber) { 132 if (COUNTTOKENS) Token.tokens ++; 133 return new Token.ParenToken(Token.PAREN, child, pnumber); 134 } 135 static Token.ClosureToken createClosure(Token tok) { 136 if (COUNTTOKENS) Token.tokens ++; 137 return new Token.ClosureToken(Token.CLOSURE, tok); 138 } 139 static Token.ClosureToken createNGClosure(Token tok) { 140 if (COUNTTOKENS) Token.tokens ++; 141 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok); 142 } 143 static Token.ConcatToken createConcat(Token tok1, Token tok2) { 144 if (COUNTTOKENS) Token.tokens ++; 145 return new Token.ConcatToken(tok1, tok2); 146 } 147 static Token.UnionToken createConcat() { 148 if (COUNTTOKENS) Token.tokens ++; 149 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug. 150 } 151 static Token.UnionToken createUnion() { 152 if (COUNTTOKENS) Token.tokens ++; 153 return new Token.UnionToken(Token.UNION); 154 } 155 static Token createEmpty() { 156 return Token.token_empty; 157 } 158 static RangeToken createRange() { 159 if (COUNTTOKENS) Token.tokens ++; 160 return new RangeToken(Token.RANGE); 161 } 162 static RangeToken createNRange() { 163 if (COUNTTOKENS) Token.tokens ++; 164 return new RangeToken(Token.NRANGE); 165 } 166 static Token.CharToken createChar(int ch) { 167 if (COUNTTOKENS) Token.tokens ++; 168 return new Token.CharToken(Token.CHAR, ch); 169 } 170 static private Token.CharToken createAnchor(int ch) { 171 if (COUNTTOKENS) Token.tokens ++; 172 return new Token.CharToken(Token.ANCHOR, ch); 173 } 174 static Token.StringToken createBackReference(int refno) { 175 if (COUNTTOKENS) Token.tokens ++; 176 return new Token.StringToken(Token.BACKREFERENCE, null, refno); 177 } 178 static Token.StringToken createString(String str) { 179 if (COUNTTOKENS) Token.tokens ++; 180 return new Token.StringToken(Token.STRING, str, 0); 181 } 182 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) { 183 if (COUNTTOKENS) Token.tokens ++; 184 return new Token.ModifierToken(child, add, mask); 185 } 186 static Token.ConditionToken createCondition(int refno, Token condition, 187 Token yespat, Token nopat) { 188 if (COUNTTOKENS) Token.tokens ++; 189 return new Token.ConditionToken(refno, condition, yespat, nopat); 190 } 191 192 protected Token(int type) { 193 this.type = type; 194 } 195 196 /** 197 * A number of children. 198 */ 199 int size() { 200 return 0; 201 } 202 Token getChild(int index) { 203 return null; 204 } 205 void addChild(Token tok) { 206 throw new RuntimeException("Not supported."); 207 } 208 209 // for RANGE or NRANGE 210 protected void addRange(int start, int end) { 211 throw new RuntimeException("Not supported."); 212 } 213 protected void sortRanges() { 214 throw new RuntimeException("Not supported."); 215 } 216 protected void compactRanges() { 217 throw new RuntimeException("Not supported."); 218 } 219 protected void mergeRanges(Token tok) { 220 throw new RuntimeException("Not supported."); 221 } 222 protected void subtractRanges(Token tok) { 223 throw new RuntimeException("Not supported."); 224 } 225 protected void intersectRanges(Token tok) { 226 throw new RuntimeException("Not supported."); 227 } 228 static Token complementRanges(Token tok) { 229 return RangeToken.complementRanges(tok); 230 } 231 232 233 void setMin(int min) { // for CLOSURE 234 } 235 void setMax(int max) { // for CLOSURE 236 } 237 int getMin() { // for CLOSURE 238 return -1; 239 } 240 int getMax() { // for CLOSURE 241 return -1; 242 } 243 int getReferenceNumber() { // for STRING 244 return 0; 245 } 246 String getString() { // for STRING 247 return null; 248 } 249 250 int getParenNumber() { 251 return 0; 252 } 253 int getChar() { 254 return -1; 255 } 256 257 public String toString() { 258 return this.toString(0); 259 } 260 public String toString(int options) { 261 return this.type == Token.DOT ? "." : ""; 262 } 263 264 /** 265 * How many characters are needed? 266 */ 267 final int getMinLength() { 268 switch (this.type) { 269 case CONCAT: 270 int sum = 0; 271 for (int i = 0; i < this.size(); i ++) 272 sum += this.getChild(i).getMinLength(); 273 return sum; 274 275 case CONDITION: 276 case UNION: 277 if (this.size() == 0) 278 return 0; 279 int ret = this.getChild(0).getMinLength(); 280 for (int i = 1; i < this.size(); i ++) { 281 int min = this.getChild(i).getMinLength(); 282 if (min < ret) ret = min; 283 } 284 return ret; 285 286 case CLOSURE: 287 case NONGREEDYCLOSURE: 288 if (this.getMin() >= 0) 289 return this.getMin() * this.getChild(0).getMinLength(); 290 return 0; 291 292 case EMPTY: 293 case ANCHOR: 294 return 0; 295 296 case DOT: 297 case CHAR: 298 case RANGE: 299 case NRANGE: 300 return 1; 301 302 case INDEPENDENT: 303 case PAREN: 304 case MODIFIERGROUP: 305 return this.getChild(0).getMinLength(); 306 307 case BACKREFERENCE: 308 return 0; // ******* 309 310 case STRING: 311 return this.getString().length(); 312 313 case LOOKAHEAD: 314 case NEGATIVELOOKAHEAD: 315 case LOOKBEHIND: 316 case NEGATIVELOOKBEHIND: 317 return 0; // ***** Really? 318 319 default: 320 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type); 321 } 322 } 323 324 final int getMaxLength() { 325 switch (this.type) { 326 case CONCAT: 327 int sum = 0; 328 for (int i = 0; i < this.size(); i ++) { 329 int d = this.getChild(i).getMaxLength(); 330 if (d < 0) return -1; 331 sum += d; 332 } 333 return sum; 334 335 case CONDITION: 336 case UNION: 337 if (this.size() == 0) 338 return 0; 339 int ret = this.getChild(0).getMaxLength(); 340 for (int i = 1; ret >= 0 && i < this.size(); i ++) { 341 int max = this.getChild(i).getMaxLength(); 342 if (max < 0) { // infinity 343 ret = -1; 344 break; 345 } 346 if (max > ret) ret = max; 347 } 348 return ret; 349 350 case CLOSURE: 351 case NONGREEDYCLOSURE: 352 if (this.getMax() >= 0) 353 // When this.child.getMaxLength() < 0, 354 // this returns minus value 355 return this.getMax() * this.getChild(0).getMaxLength(); 356 return -1; 357 358 case EMPTY: 359 case ANCHOR: 360 return 0; 361 362 case CHAR: 363 return 1; 364 case DOT: 365 case RANGE: 366 case NRANGE: 367 return 2; 368 369 case INDEPENDENT: 370 case PAREN: 371 case MODIFIERGROUP: 372 return this.getChild(0).getMaxLength(); 373 374 case BACKREFERENCE: 375 return -1; // ****** 376 377 case STRING: 378 return this.getString().length(); 379 380 case LOOKAHEAD: 381 case NEGATIVELOOKAHEAD: 382 case LOOKBEHIND: 383 case NEGATIVELOOKBEHIND: 384 return 0; // ***** Really? 385 386 default: 387 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type); 388 } 389 } 390 391 static final int FC_CONTINUE = 0; 392 static final int FC_TERMINAL = 1; 393 static final int FC_ANY = 2; 394 private static final boolean isSet(int options, int flag) { 395 return (options & flag) == flag; 396 } 397 final int analyzeFirstCharacter(RangeToken result, int options) { 398 switch (this.type) { 399 case CONCAT: 400 int ret = FC_CONTINUE; 401 for (int i = 0; i < this.size(); i ++) 402 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) 403 break; 404 return ret; 405 406 case UNION: 407 if (this.size() == 0) 408 return FC_CONTINUE; 409 /* 410 * a|b|c -> FC_TERMINAL 411 * a|.|c -> FC_ANY 412 * a|b| -> FC_CONTINUE 413 */ 414 int ret2 = FC_CONTINUE; 415 boolean hasEmpty = false; 416 for (int i = 0; i < this.size(); i ++) { 417 ret2 = this.getChild(i).analyzeFirstCharacter(result, options); 418 if (ret2 == FC_ANY) 419 break; 420 else if (ret2 == FC_CONTINUE) 421 hasEmpty = true; 422 } 423 return hasEmpty ? FC_CONTINUE : ret2; 424 425 case CONDITION: 426 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); 427 if (this.size() == 1) return FC_CONTINUE; 428 if (ret3 == FC_ANY) return ret3; 429 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); 430 if (ret4 == FC_ANY) return ret4; 431 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; 432 433 case CLOSURE: 434 case NONGREEDYCLOSURE: 435 this.getChild(0).analyzeFirstCharacter(result, options); 436 return FC_CONTINUE; 437 438 case EMPTY: 439 case ANCHOR: 440 return FC_CONTINUE; 441 442 case CHAR: 443 int ch = this.getChar(); 444 result.addRange(ch, ch); 445 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 446 ch = Character.toUpperCase((char)ch); 447 result.addRange(ch, ch); 448 ch = Character.toLowerCase((char)ch); 449 result.addRange(ch, ch); 450 } 451 return FC_TERMINAL; 452 453 case DOT: 454 return FC_ANY; 455 456 case RANGE: 457 result.mergeRanges(this); 458 return FC_TERMINAL; 459 460 case NRANGE: // **** 461 result.mergeRanges(Token.complementRanges(this)); 462 return FC_TERMINAL; 463 464 case INDEPENDENT: 465 case PAREN: 466 return this.getChild(0).analyzeFirstCharacter(result, options); 467 468 case MODIFIERGROUP: 469 options |= ((ModifierToken)this).getOptions(); 470 options &= ~((ModifierToken)this).getOptionsMask(); 471 return this.getChild(0).analyzeFirstCharacter(result, options); 472 473 case BACKREFERENCE: 474 result.addRange(0, UTF16_MAX); // **** We can not optimize. 475 return FC_ANY; 476 477 case STRING: 478 int cha = this.getString().charAt(0); 479 int ch2; 480 if (REUtil.isHighSurrogate(cha) 481 && this.getString().length() >= 2 482 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) 483 cha = REUtil.composeFromSurrogates(cha, ch2); 484 result.addRange(cha, cha); 485 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 486 cha = Character.toUpperCase((char)cha); 487 result.addRange(cha, cha); 488 cha = Character.toLowerCase((char)cha); 489 result.addRange(cha, cha); 490 } 491 return FC_TERMINAL; 492 493 case LOOKAHEAD: 494 case NEGATIVELOOKAHEAD: 495 case LOOKBEHIND: 496 case NEGATIVELOOKBEHIND: 497 return FC_CONTINUE; 498 499 default: 500 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); 501 } 502 } 503 504 private final boolean isShorterThan(Token tok) { 505 if (tok == null) return false; 506 /* 507 int mylength; 508 if (this.type == STRING) mylength = this.getString().length(); 509 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1; 510 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 511 int otherlength; 512 if (tok.type == STRING) otherlength = tok.getString().length(); 513 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1; 514 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 515 */ 516 int mylength; 517 if (this.type == STRING) mylength = this.getString().length(); 518 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 519 int otherlength; 520 if (tok.type == STRING) otherlength = tok.getString().length(); 521 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 522 return mylength < otherlength; 523 } 524 525 static class FixedStringContainer { 526 Token token = null; 527 int options = 0; 528 FixedStringContainer() { 529 } 530 } 531 532 final void findFixedString(FixedStringContainer container, int options) { 533 switch (this.type) { 534 case CONCAT: 535 Token prevToken = null; 536 int prevOptions = 0; 537 for (int i = 0; i < this.size(); i ++) { 538 this.getChild(i).findFixedString(container, options); 539 if (prevToken == null || prevToken.isShorterThan(container.token)) { 540 prevToken = container.token; 541 prevOptions = container.options; 542 } 543 } 544 container.token = prevToken; 545 container.options = prevOptions; 546 return; 547 548 case UNION: 549 case CLOSURE: 550 case NONGREEDYCLOSURE: 551 case EMPTY: 552 case ANCHOR: 553 case RANGE: 554 case DOT: 555 case NRANGE: 556 case BACKREFERENCE: 557 case LOOKAHEAD: 558 case NEGATIVELOOKAHEAD: 559 case LOOKBEHIND: 560 case NEGATIVELOOKBEHIND: 561 case CONDITION: 562 container.token = null; 563 return; 564 565 case CHAR: // Ignore CHAR tokens. 566 container.token = null; // ** 567 return; // ** 568 569 case STRING: 570 container.token = this; 571 container.options = options; 572 return; 573 574 case INDEPENDENT: 575 case PAREN: 576 this.getChild(0).findFixedString(container, options); 577 return; 578 579 case MODIFIERGROUP: 580 options |= ((ModifierToken)this).getOptions(); 581 options &= ~((ModifierToken)this).getOptionsMask(); 582 this.getChild(0).findFixedString(container, options); 583 return; 584 585 default: 586 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type); 587 } 588 } 589 590 boolean match(int ch) { 591 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 592 } 593 594 // ------------------------------------------------------ 595 private final static Map<String, Token> categories = new HashMap<>(); 596 private final static Map<String, Token> categories2 = new HashMap<>(); 597 private static final String[] categoryNames = { 598 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", 599 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", 600 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28 601 "Pi", "Pf", // 29, 30 602 "L", "M", "N", "Z", "C", "P", "S", // 31-37 603 }; 604 605 // Schema Rec. {Datatypes} - Punctuation 606 static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote 607 static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote 608 static final int CHAR_LETTER = 31; 609 static final int CHAR_MARK = 32; 610 static final int CHAR_NUMBER = 33; 611 static final int CHAR_SEPARATOR = 34; 612 static final int CHAR_OTHER = 35; 613 static final int CHAR_PUNCTUATION = 36; 614 static final int CHAR_SYMBOL = 37; 615 616 //blockNames in UNICODE 3.1 that supported by XML Schema REC 617 private static final String[] blockNames = { 618 /*0000..007F;*/ "Basic Latin", 619 /*0080..00FF;*/ "Latin-1 Supplement", 620 /*0100..017F;*/ "Latin Extended-A", 621 /*0180..024F;*/ "Latin Extended-B", 622 /*0250..02AF;*/ "IPA Extensions", 623 /*02B0..02FF;*/ "Spacing Modifier Letters", 624 /*0300..036F;*/ "Combining Diacritical Marks", 625 /*0370..03FF;*/ "Greek", 626 /*0400..04FF;*/ "Cyrillic", 627 /*0530..058F;*/ "Armenian", 628 /*0590..05FF;*/ "Hebrew", 629 /*0600..06FF;*/ "Arabic", 630 /*0700..074F;*/ "Syriac", 631 /*0780..07BF;*/ "Thaana", 632 /*0900..097F;*/ "Devanagari", 633 /*0980..09FF;*/ "Bengali", 634 /*0A00..0A7F;*/ "Gurmukhi", 635 /*0A80..0AFF;*/ "Gujarati", 636 /*0B00..0B7F;*/ "Oriya", 637 /*0B80..0BFF;*/ "Tamil", 638 /*0C00..0C7F;*/ "Telugu", 639 /*0C80..0CFF;*/ "Kannada", 640 /*0D00..0D7F;*/ "Malayalam", 641 /*0D80..0DFF;*/ "Sinhala", 642 /*0E00..0E7F;*/ "Thai", 643 /*0E80..0EFF;*/ "Lao", 644 /*0F00..0FFF;*/ "Tibetan", 645 /*1000..109F;*/ "Myanmar", 646 /*10A0..10FF;*/ "Georgian", 647 /*1100..11FF;*/ "Hangul Jamo", 648 /*1200..137F;*/ "Ethiopic", 649 /*13A0..13FF;*/ "Cherokee", 650 /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics", 651 /*1680..169F;*/ "Ogham", 652 /*16A0..16FF;*/ "Runic", 653 /*1780..17FF;*/ "Khmer", 654 /*1800..18AF;*/ "Mongolian", 655 /*1E00..1EFF;*/ "Latin Extended Additional", 656 /*1F00..1FFF;*/ "Greek Extended", 657 /*2000..206F;*/ "General Punctuation", 658 /*2070..209F;*/ "Superscripts and Subscripts", 659 /*20A0..20CF;*/ "Currency Symbols", 660 /*20D0..20FF;*/ "Combining Marks for Symbols", 661 /*2100..214F;*/ "Letterlike Symbols", 662 /*2150..218F;*/ "Number Forms", 663 /*2190..21FF;*/ "Arrows", 664 /*2200..22FF;*/ "Mathematical Operators", 665 /*2300..23FF;*/ "Miscellaneous Technical", 666 /*2400..243F;*/ "Control Pictures", 667 /*2440..245F;*/ "Optical Character Recognition", 668 /*2460..24FF;*/ "Enclosed Alphanumerics", 669 /*2500..257F;*/ "Box Drawing", 670 /*2580..259F;*/ "Block Elements", 671 /*25A0..25FF;*/ "Geometric Shapes", 672 /*2600..26FF;*/ "Miscellaneous Symbols", 673 /*2700..27BF;*/ "Dingbats", 674 /*2800..28FF;*/ "Braille Patterns", 675 /*2E80..2EFF;*/ "CJK Radicals Supplement", 676 /*2F00..2FDF;*/ "Kangxi Radicals", 677 /*2FF0..2FFF;*/ "Ideographic Description Characters", 678 /*3000..303F;*/ "CJK Symbols and Punctuation", 679 /*3040..309F;*/ "Hiragana", 680 /*30A0..30FF;*/ "Katakana", 681 /*3100..312F;*/ "Bopomofo", 682 /*3130..318F;*/ "Hangul Compatibility Jamo", 683 /*3190..319F;*/ "Kanbun", 684 /*31A0..31BF;*/ "Bopomofo Extended", 685 /*3200..32FF;*/ "Enclosed CJK Letters and Months", 686 /*3300..33FF;*/ "CJK Compatibility", 687 /*3400..4DB5;*/ "CJK Unified Ideographs Extension A", 688 /*4E00..9FFF;*/ "CJK Unified Ideographs", 689 /*A000..A48F;*/ "Yi Syllables", 690 /*A490..A4CF;*/ "Yi Radicals", 691 /*AC00..D7A3;*/ "Hangul Syllables", 692 /*E000..F8FF;*/ "Private Use", 693 /*F900..FAFF;*/ "CJK Compatibility Ideographs", 694 /*FB00..FB4F;*/ "Alphabetic Presentation Forms", 695 /*FB50..FDFF;*/ "Arabic Presentation Forms-A", 696 /*FE20..FE2F;*/ "Combining Half Marks", 697 /*FE30..FE4F;*/ "CJK Compatibility Forms", 698 /*FE50..FE6F;*/ "Small Form Variants", 699 /*FE70..FEFE;*/ "Arabic Presentation Forms-B", 700 /*FEFF..FEFF;*/ "Specials", 701 /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms", 702 //missing Specials add manually 703 /*10300..1032F;*/ "Old Italic", // 84 704 /*10330..1034F;*/ "Gothic", 705 /*10400..1044F;*/ "Deseret", 706 /*1D000..1D0FF;*/ "Byzantine Musical Symbols", 707 /*1D100..1D1FF;*/ "Musical Symbols", 708 /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols", 709 /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B", 710 /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement", 711 /*E0000..E007F;*/ "Tags", 712 //missing 2 private use add manually 713 714 }; 715 //ADD THOSE MANUALLY 716 //F0000..FFFFD; "Private Use", 717 //100000..10FFFD; "Private Use" 718 //FFF0..FFFD; "Specials", 719 static final String blockRanges = 720 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" 721 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" 722 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" 723 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" 724 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" 725 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" 726 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" 727 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" 728 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" 729 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" 730 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; 731 static final int[] nonBMPBlockRanges = { 732 0x10300, 0x1032F, // 84 733 0x10330, 0x1034F, 734 0x10400, 0x1044F, 735 0x1D000, 0x1D0FF, 736 0x1D100, 0x1D1FF, 737 0x1D400, 0x1D7FF, 738 0x20000, 0x2A6D6, 739 0x2F800, 0x2FA1F, 740 0xE0000, 0xE007F 741 }; 742 private static final int NONBMP_BLOCK_START = 84; 743 744 static protected RangeToken getRange(String name, boolean positive) { 745 if (Token.categories.size() == 0) { 746 synchronized (Token.categories) { 747 Token[] ranges = new Token[Token.categoryNames.length]; 748 for (int i = 0; i < ranges.length; i ++) { 749 ranges[i] = Token.createRange(); 750 } 751 int type; 752 for (int i = 0; i < 0x10000; i ++) { 753 type = Character.getType((char)i); 754 if (type == Character.START_PUNCTUATION || 755 type == Character.END_PUNCTUATION) { 756 //build table of Pi values 757 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || 758 i == 0x201F || i == 0x2039) { 759 type = CHAR_INIT_QUOTE; 760 } 761 //build table of Pf values 762 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { 763 type = CHAR_FINAL_QUOTE; 764 } 765 } 766 ranges[type].addRange(i, i); 767 switch (type) { 768 case Character.UPPERCASE_LETTER: 769 case Character.LOWERCASE_LETTER: 770 case Character.TITLECASE_LETTER: 771 case Character.MODIFIER_LETTER: 772 case Character.OTHER_LETTER: 773 type = CHAR_LETTER; 774 break; 775 case Character.NON_SPACING_MARK: 776 case Character.COMBINING_SPACING_MARK: 777 case Character.ENCLOSING_MARK: 778 type = CHAR_MARK; 779 break; 780 case Character.DECIMAL_DIGIT_NUMBER: 781 case Character.LETTER_NUMBER: 782 case Character.OTHER_NUMBER: 783 type = CHAR_NUMBER; 784 break; 785 case Character.SPACE_SEPARATOR: 786 case Character.LINE_SEPARATOR: 787 case Character.PARAGRAPH_SEPARATOR: 788 type = CHAR_SEPARATOR; 789 break; 790 case Character.CONTROL: 791 case Character.FORMAT: 792 case Character.SURROGATE: 793 case Character.PRIVATE_USE: 794 case Character.UNASSIGNED: 795 type = CHAR_OTHER; 796 break; 797 case Character.CONNECTOR_PUNCTUATION: 798 case Character.DASH_PUNCTUATION: 799 case Character.START_PUNCTUATION: 800 case Character.END_PUNCTUATION: 801 case CHAR_INIT_QUOTE: 802 case CHAR_FINAL_QUOTE: 803 case Character.OTHER_PUNCTUATION: 804 type = CHAR_PUNCTUATION; 805 break; 806 case Character.MATH_SYMBOL: 807 case Character.CURRENCY_SYMBOL: 808 case Character.MODIFIER_SYMBOL: 809 case Character.OTHER_SYMBOL: 810 type = CHAR_SYMBOL; 811 break; 812 default: 813 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); 814 } 815 ranges[type].addRange(i, i); 816 } // for all characters 817 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); 818 819 for (int i = 0; i < ranges.length; i ++) { 820 if (Token.categoryNames[i] != null) { 821 if (i == Character.UNASSIGNED) { // Unassigned 822 ranges[i].addRange(0x10000, Token.UTF16_MAX); 823 } 824 Token.categories.put(Token.categoryNames[i], ranges[i]); 825 Token.categories2.put(Token.categoryNames[i], 826 Token.complementRanges(ranges[i])); 827 } 828 } 829 //REVISIT: do we really need to support block names as in Unicode 3.1 830 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? 831 // 832 StringBuilder buffer = new StringBuilder(50); 833 for (int i = 0; i < Token.blockNames.length; i ++) { 834 Token r1 = Token.createRange(); 835 int location; 836 if (i < NONBMP_BLOCK_START) { 837 location = i*2; 838 int rstart = Token.blockRanges.charAt(location); 839 int rend = Token.blockRanges.charAt(location+1); 840 //DEBUGING 841 //System.out.println(n+" " +Integer.toHexString(rstart) 842 // +"-"+ Integer.toHexString(rend)); 843 r1.addRange(rstart, rend); 844 } else { 845 location = (i - NONBMP_BLOCK_START) * 2; 846 r1.addRange(Token.nonBMPBlockRanges[location], 847 Token.nonBMPBlockRanges[location + 1]); 848 } 849 String n = Token.blockNames[i]; 850 if (n.equals("Specials")) 851 r1.addRange(0xfff0, 0xfffd); 852 if (n.equals("Private Use")) { 853 r1.addRange(0xF0000,0xFFFFD); 854 r1.addRange(0x100000,0x10FFFD); 855 } 856 Token.categories.put(n, r1); 857 Token.categories2.put(n, Token.complementRanges(r1)); 858 buffer.setLength(0); 859 buffer.append("Is"); 860 if (n.indexOf(' ') >= 0) { 861 for (int ci = 0; ci < n.length(); ci ++) 862 if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci)); 863 } 864 else { 865 buffer.append(n); 866 } 867 Token.setAlias(buffer.toString(), n, true); 868 } 869 870 // TR#18 1.2 871 Token.setAlias("ASSIGNED", "Cn", false); 872 Token.setAlias("UNASSIGNED", "Cn", true); 873 Token all = Token.createRange(); 874 all.addRange(0, Token.UTF16_MAX); 875 Token.categories.put("ALL", all); 876 Token.categories2.put("ALL", Token.complementRanges(all)); 877 Token.registerNonXS("ASSIGNED"); 878 Token.registerNonXS("UNASSIGNED"); 879 Token.registerNonXS("ALL"); 880 881 Token isalpha = Token.createRange(); 882 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu 883 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll 884 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo 885 Token.categories.put("IsAlpha", isalpha); 886 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); 887 Token.registerNonXS("IsAlpha"); 888 889 Token isalnum = Token.createRange(); 890 isalnum.mergeRanges(isalpha); // Lu Ll Lo 891 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd 892 Token.categories.put("IsAlnum", isalnum); 893 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); 894 Token.registerNonXS("IsAlnum"); 895 896 Token isspace = Token.createRange(); 897 isspace.mergeRanges(Token.token_spaces); 898 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z 899 Token.categories.put("IsSpace", isspace); 900 Token.categories2.put("IsSpace", Token.complementRanges(isspace)); 901 Token.registerNonXS("IsSpace"); 902 903 Token isword = Token.createRange(); 904 isword.mergeRanges(isalnum); // Lu Ll Lo Nd 905 isword.addRange('_', '_'); 906 Token.categories.put("IsWord", isword); 907 Token.categories2.put("IsWord", Token.complementRanges(isword)); 908 Token.registerNonXS("IsWord"); 909 910 Token isascii = Token.createRange(); 911 isascii.addRange(0, 127); 912 Token.categories.put("IsASCII", isascii); 913 Token.categories2.put("IsASCII", Token.complementRanges(isascii)); 914 Token.registerNonXS("IsASCII"); 915 916 Token isnotgraph = Token.createRange(); 917 isnotgraph.mergeRanges(ranges[CHAR_OTHER]); 918 isnotgraph.addRange(' ', ' '); 919 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); 920 Token.categories2.put("IsGraph", isnotgraph); 921 Token.registerNonXS("IsGraph"); 922 923 Token isxdigit = Token.createRange(); 924 isxdigit.addRange('0', '9'); 925 isxdigit.addRange('A', 'F'); 926 isxdigit.addRange('a', 'f'); 927 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); 928 Token.categories2.put("IsXDigit", isxdigit); 929 Token.registerNonXS("IsXDigit"); 930 931 Token.setAlias("IsDigit", "Nd", true); 932 Token.setAlias("IsUpper", "Lu", true); 933 Token.setAlias("IsLower", "Ll", true); 934 Token.setAlias("IsCntrl", "C", true); 935 Token.setAlias("IsPrint", "C", false); 936 Token.setAlias("IsPunct", "P", true); 937 Token.registerNonXS("IsDigit"); 938 Token.registerNonXS("IsUpper"); 939 Token.registerNonXS("IsLower"); 940 Token.registerNonXS("IsCntrl"); 941 Token.registerNonXS("IsPrint"); 942 Token.registerNonXS("IsPunct"); 943 944 Token.setAlias("alpha", "IsAlpha", true); 945 Token.setAlias("alnum", "IsAlnum", true); 946 Token.setAlias("ascii", "IsASCII", true); 947 Token.setAlias("cntrl", "IsCntrl", true); 948 Token.setAlias("digit", "IsDigit", true); 949 Token.setAlias("graph", "IsGraph", true); 950 Token.setAlias("lower", "IsLower", true); 951 Token.setAlias("print", "IsPrint", true); 952 Token.setAlias("punct", "IsPunct", true); 953 Token.setAlias("space", "IsSpace", true); 954 Token.setAlias("upper", "IsUpper", true); 955 Token.setAlias("word", "IsWord", true); // Perl extension 956 Token.setAlias("xdigit", "IsXDigit", true); 957 Token.registerNonXS("alpha"); 958 Token.registerNonXS("alnum"); 959 Token.registerNonXS("ascii"); 960 Token.registerNonXS("cntrl"); 961 Token.registerNonXS("digit"); 962 Token.registerNonXS("graph"); 963 Token.registerNonXS("lower"); 964 Token.registerNonXS("print"); 965 Token.registerNonXS("punct"); 966 Token.registerNonXS("space"); 967 Token.registerNonXS("upper"); 968 Token.registerNonXS("word"); 969 Token.registerNonXS("xdigit"); 970 } // synchronized 971 } // if null 972 RangeToken tok = positive ? (RangeToken)Token.categories.get(name) 973 : (RangeToken)Token.categories2.get(name); 974 //if (tok == null) System.out.println(name); 975 return tok; 976 } 977 static protected RangeToken getRange(String name, boolean positive, boolean xs) { 978 RangeToken range = Token.getRange(name, positive); 979 if (xs && range != null && Token.isRegisterNonXS(name)) 980 range = null; 981 return range; 982 } 983 984 static final Set<String> nonxs = Collections.synchronizedSet(new HashSet<>()); 985 /** 986 * This method is called by only getRange(). 987 * So this method need not MT-safe. 988 */ 989 static protected void registerNonXS(String name) { 990 Token.nonxs.add(name); 991 } 992 993 static protected boolean isRegisterNonXS(String name) { 994 return Token.nonxs.contains(name); 995 } 996 997 private static void setAlias(String newName, String name, boolean positive) { 998 Token t1 = Token.categories.get(name); 999 Token t2 = Token.categories2.get(name); 1000 if (positive) { 1001 Token.categories.put(newName, t1); 1002 Token.categories2.put(newName, t2); 1003 } else { 1004 Token.categories2.put(newName, t1); 1005 Token.categories.put(newName, t2); 1006 } 1007 } 1008 1009 // ------------------------------------------------------ 1010 1011 static final String viramaString = 1012 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1013 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1014 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1015 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1016 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1017 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1018 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1019 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1020 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1021 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;; 1022 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;; 1023 1024 static private Token token_grapheme = null; 1025 static synchronized Token getGraphemePattern() { 1026 if (Token.token_grapheme != null) 1027 return Token.token_grapheme; 1028 1029 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}] 1030 base_char.mergeRanges(Token.getRange("ASSIGNED", true)); 1031 base_char.subtractRanges(Token.getRange("M", true)); 1032 base_char.subtractRanges(Token.getRange("C", true)); 1033 1034 Token virama = Token.createRange(); 1035 for (int i = 0; i < Token.viramaString.length(); i++) { 1036 virama.addRange(i, i); 1037 } 1038 1039 Token combiner_wo_virama = Token.createRange(); 1040 combiner_wo_virama.mergeRanges(Token.getRange("M", true)); 1041 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final 1042 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras 1043 1044 Token left = Token.createUnion(); // base_char? 1045 left.addChild(base_char); 1046 left.addChild(Token.token_empty); 1047 1048 Token foo = Token.createUnion(); 1049 foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); 1050 foo.addChild(combiner_wo_virama); 1051 1052 foo = Token.createClosure(foo); 1053 1054 foo = Token.createConcat(left, foo); 1055 1056 Token.token_grapheme = foo; 1057 return Token.token_grapheme; 1058 } 1059 1060 /** 1061 * Combing Character Sequence in Perl 5.6. 1062 */ 1063 static private Token token_ccs = null; 1064 static synchronized Token getCombiningCharacterSequence() { 1065 if (Token.token_ccs != null) 1066 return Token.token_ccs; 1067 1068 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM* 1069 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM* 1070 Token.token_ccs = foo; 1071 return Token.token_ccs; 1072 } 1073 1074 // ------------------------------------------------------ 1075 1076 // ------------------------------------------------------ 1077 /** 1078 * This class represents a node in parse tree. 1079 */ 1080 static class StringToken extends Token implements java.io.Serializable { 1081 1082 private static final long serialVersionUID = -4614366944218504172L; 1083 1084 String string; 1085 final int refNumber; 1086 1087 StringToken(int type, String str, int n) { 1088 super(type); 1089 this.string = str; 1090 this.refNumber = n; 1091 } 1092 1093 int getReferenceNumber() { // for STRING 1094 return this.refNumber; 1095 } 1096 String getString() { // for STRING 1097 return this.string; 1098 } 1099 1100 public String toString(int options) { 1101 if (this.type == BACKREFERENCE) 1102 return "\\"+this.refNumber; 1103 else 1104 return REUtil.quoteMeta(this.string); 1105 } 1106 } 1107 1108 /** 1109 * This class represents a node in parse tree. 1110 */ 1111 static class ConcatToken extends Token implements java.io.Serializable { 1112 1113 private static final long serialVersionUID = 8717321425541346381L; 1114 1115 final Token child; 1116 final Token child2; 1117 1118 ConcatToken(Token t1, Token t2) { 1119 super(Token.CONCAT); 1120 this.child = t1; 1121 this.child2 = t2; 1122 } 1123 1124 int size() { 1125 return 2; 1126 } 1127 Token getChild(int index) { 1128 return index == 0 ? this.child : this.child2; 1129 } 1130 1131 public String toString(int options) { 1132 String ret; 1133 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { 1134 ret = this.child.toString(options)+"+"; 1135 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { 1136 ret = this.child.toString(options)+"+?"; 1137 } else 1138 ret = this.child.toString(options)+this.child2.toString(options); 1139 return ret; 1140 } 1141 } 1142 1143 /** 1144 * This class represents a node in parse tree. 1145 */ 1146 static class CharToken extends Token implements java.io.Serializable { 1147 1148 private static final long serialVersionUID = -4394272816279496989L; 1149 1150 final int chardata; 1151 1152 CharToken(int type, int ch) { 1153 super(type); 1154 this.chardata = ch; 1155 } 1156 1157 int getChar() { 1158 return this.chardata; 1159 } 1160 1161 public String toString(int options) { 1162 String ret; 1163 switch (this.type) { 1164 case CHAR: 1165 switch (this.chardata) { 1166 case '|': case '*': case '+': case '?': 1167 case '(': case ')': case '.': case '[': 1168 case '{': case '\\': 1169 ret = "\\"+(char)this.chardata; 1170 break; 1171 case '\f': ret = "\\f"; break; 1172 case '\n': ret = "\\n"; break; 1173 case '\r': ret = "\\r"; break; 1174 case '\t': ret = "\\t"; break; 1175 case 0x1b: ret = "\\e"; break; 1176 //case 0x0b: ret = "\\v"; break; 1177 default: 1178 if (this.chardata >= 0x10000) { 1179 String pre = "0"+Integer.toHexString(this.chardata); 1180 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 1181 } else 1182 ret = ""+(char)this.chardata; 1183 } 1184 break; 1185 1186 case ANCHOR: 1187 if (this == Token.token_linebeginning || this == Token.token_lineend) 1188 ret = ""+(char)this.chardata; 1189 else 1190 ret = "\\"+(char)this.chardata; 1191 break; 1192 1193 default: 1194 ret = null; 1195 } 1196 return ret; 1197 } 1198 1199 boolean match(int ch) { 1200 if (this.type == CHAR) { 1201 return ch == this.chardata; 1202 } else 1203 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 1204 } 1205 } 1206 1207 /** 1208 * This class represents a node in parse tree. 1209 */ 1210 static class ClosureToken extends Token implements java.io.Serializable { 1211 1212 private static final long serialVersionUID = 1308971930673997452L; 1213 1214 int min; 1215 int max; 1216 final Token child; 1217 1218 ClosureToken(int type, Token tok) { 1219 super(type); 1220 this.child = tok; 1221 this.setMin(-1); 1222 this.setMax(-1); 1223 } 1224 1225 int size() { 1226 return 1; 1227 } 1228 Token getChild(int index) { 1229 return this.child; 1230 } 1231 1232 final void setMin(int min) { 1233 this.min = min; 1234 } 1235 final void setMax(int max) { 1236 this.max = max; 1237 } 1238 final int getMin() { 1239 return this.min; 1240 } 1241 final int getMax() { 1242 return this.max; 1243 } 1244 1245 public String toString(int options) { 1246 String ret; 1247 if (this.type == CLOSURE) { 1248 if (this.getMin() < 0 && this.getMax() < 0) { 1249 ret = this.child.toString(options)+"*"; 1250 } else if (this.getMin() == this.getMax()) { 1251 ret = this.child.toString(options)+"{"+this.getMin()+"}"; 1252 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1253 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; 1254 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1255 ret = this.child.toString(options)+"{"+this.getMin()+",}"; 1256 } else 1257 throw new RuntimeException("Token#toString(): CLOSURE " 1258 +this.getMin()+", "+this.getMax()); 1259 } else { 1260 if (this.getMin() < 0 && this.getMax() < 0) { 1261 ret = this.child.toString(options)+"*?"; 1262 } else if (this.getMin() == this.getMax()) { 1263 ret = this.child.toString(options)+"{"+this.getMin()+"}?"; 1264 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1265 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; 1266 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1267 ret = this.child.toString(options)+"{"+this.getMin()+",}?"; 1268 } else 1269 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE " 1270 +this.getMin()+", "+this.getMax()); 1271 } 1272 return ret; 1273 } 1274 } 1275 1276 /** 1277 * This class represents a node in parse tree. 1278 */ 1279 static class ParenToken extends Token implements java.io.Serializable { 1280 1281 private static final long serialVersionUID = -5938014719827987704L; 1282 1283 final Token child; 1284 final int parennumber; 1285 1286 ParenToken(int type, Token tok, int paren) { 1287 super(type); 1288 this.child = tok; 1289 this.parennumber = paren; 1290 } 1291 1292 int size() { 1293 return 1; 1294 } 1295 Token getChild(int index) { 1296 return this.child; 1297 } 1298 1299 int getParenNumber() { 1300 return this.parennumber; 1301 } 1302 1303 public String toString(int options) { 1304 String ret = null; 1305 switch (this.type) { 1306 case PAREN: 1307 if (this.parennumber == 0) { 1308 ret = "(?:"+this.child.toString(options)+")"; 1309 } else { 1310 ret = "("+this.child.toString(options)+")"; 1311 } 1312 break; 1313 1314 case LOOKAHEAD: 1315 ret = "(?="+this.child.toString(options)+")"; 1316 break; 1317 case NEGATIVELOOKAHEAD: 1318 ret = "(?!"+this.child.toString(options)+")"; 1319 break; 1320 case LOOKBEHIND: 1321 ret = "(?<="+this.child.toString(options)+")"; 1322 break; 1323 case NEGATIVELOOKBEHIND: 1324 ret = "(?<!"+this.child.toString(options)+")"; 1325 break; 1326 case INDEPENDENT: 1327 ret = "(?>"+this.child.toString(options)+")"; 1328 break; 1329 } 1330 return ret; 1331 } 1332 } 1333 1334 /** 1335 * (?(condition)yes-pattern|no-pattern) 1336 */ 1337 static class ConditionToken extends Token implements java.io.Serializable { 1338 1339 private static final long serialVersionUID = 4353765277910594411L; 1340 1341 final int refNumber; 1342 final Token condition; 1343 final Token yes; 1344 final Token no; 1345 ConditionToken(int refno, Token cond, Token yespat, Token nopat) { 1346 super(Token.CONDITION); 1347 this.refNumber = refno; 1348 this.condition = cond; 1349 this.yes = yespat; 1350 this.no = nopat; 1351 } 1352 int size() { 1353 return this.no == null ? 1 : 2; 1354 } 1355 Token getChild(int index) { 1356 if (index == 0) return this.yes; 1357 if (index == 1) return this.no; 1358 throw new RuntimeException("Internal Error: "+index); 1359 } 1360 1361 public String toString(int options) { 1362 String ret; 1363 if (refNumber > 0) { 1364 ret = "(?("+refNumber+")"; 1365 } else if (this.condition.type == Token.ANCHOR) { 1366 ret = "(?("+this.condition+")"; 1367 } else { 1368 ret = "(?"+this.condition; 1369 } 1370 1371 if (this.no == null) { 1372 ret += this.yes+")"; 1373 } else { 1374 ret += this.yes+"|"+this.no+")"; 1375 } 1376 return ret; 1377 } 1378 } 1379 1380 /** 1381 * (ims-ims: .... ) 1382 */ 1383 static class ModifierToken extends Token implements java.io.Serializable { 1384 1385 private static final long serialVersionUID = -9114536559696480356L; 1386 1387 final Token child; 1388 final int add; 1389 final int mask; 1390 1391 ModifierToken(Token tok, int add, int mask) { 1392 super(Token.MODIFIERGROUP); 1393 this.child = tok; 1394 this.add = add; 1395 this.mask = mask; 1396 } 1397 1398 int size() { 1399 return 1; 1400 } 1401 Token getChild(int index) { 1402 return this.child; 1403 } 1404 1405 int getOptions() { 1406 return this.add; 1407 } 1408 int getOptionsMask() { 1409 return this.mask; 1410 } 1411 1412 public String toString(int options) { 1413 return "(?" 1414 +(this.add == 0 ? "" : REUtil.createOptionString(this.add)) 1415 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask)) 1416 +":" 1417 +this.child.toString(options) 1418 +")"; 1419 } 1420 } 1421 1422 /** 1423 * This class represents a node in parse tree. 1424 * for UNION or CONCAT. 1425 */ 1426 static class UnionToken extends Token implements java.io.Serializable { 1427 1428 private static final long serialVersionUID = -2568843945989489861L; 1429 1430 List<Token> children; 1431 1432 /** 1433 * @serialField children Vector children 1434 */ 1435 private static final ObjectStreamField[] serialPersistentFields = 1436 new ObjectStreamField[] { 1437 new ObjectStreamField("children", Vector.class), 1438 }; 1439 1440 UnionToken(int type) { 1441 super(type); 1442 } 1443 1444 @Override 1445 void addChild(Token tok) { 1446 if (tok == null) return; 1447 if (this.children == null) this.children = new ArrayList<>(); 1448 if (this.type == UNION) { 1449 this.children.add(tok); 1450 return; 1451 } 1452 // This is CONCAT, and new child is CONCAT. 1453 if (tok.type == CONCAT) { 1454 for (int i = 0; i < tok.size(); i ++) 1455 this.addChild(tok.getChild(i)); // Recursion 1456 return; 1457 } 1458 int size = this.children.size(); 1459 if (size == 0) { 1460 this.children.add(tok); 1461 return; 1462 } 1463 Token previous = this.children.get(size - 1); 1464 if (!((previous.type == CHAR || previous.type == STRING) 1465 && (tok.type == CHAR || tok.type == STRING))) { 1466 this.children.add(tok); 1467 return; 1468 } 1469 1470 //System.err.println("Merge '"+previous+"' and '"+tok+"'."); 1471 1472 StringBuilder buffer; 1473 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length()); 1474 if (previous.type == CHAR) { // Replace previous token by STRING 1475 buffer = new StringBuilder(2 + nextMaxLength); 1476 int ch = previous.getChar(); 1477 if (ch >= 0x10000) 1478 buffer.append(REUtil.decomposeToSurrogates(ch)); 1479 else 1480 buffer.append((char)ch); 1481 previous = Token.createString(null); 1482 this.children.set(size - 1, previous); 1483 } else { // STRING 1484 buffer = new StringBuilder(previous.getString().length() + nextMaxLength); 1485 buffer.append(previous.getString()); 1486 } 1487 1488 if (tok.type == CHAR) { 1489 int ch = tok.getChar(); 1490 if (ch >= 0x10000) 1491 buffer.append(REUtil.decomposeToSurrogates(ch)); 1492 else 1493 buffer.append((char)ch); 1494 } else { 1495 buffer.append(tok.getString()); 1496 } 1497 1498 ((StringToken)previous).string = new String(buffer); 1499 } 1500 1501 @Override 1502 int size() { 1503 return this.children == null ? 0 : this.children.size(); 1504 } 1505 @Override 1506 Token getChild(int index) { 1507 return this.children.get(index); 1508 } 1509 1510 @Override 1511 public String toString(int options) { 1512 String ret; 1513 if (this.type == CONCAT) { 1514 if (this.children.size() == 2) { 1515 Token ch = this.getChild(0); 1516 Token ch2 = this.getChild(1); 1517 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) { 1518 ret = ch.toString(options)+"+"; 1519 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) { 1520 ret = ch.toString(options)+"+?"; 1521 } else 1522 ret = ch.toString(options)+ch2.toString(options); 1523 } else { 1524 StringBuilder sb = new StringBuilder(); 1525 this.children.stream().forEach((children1) -> { 1526 sb.append((children1).toString(options)); 1527 }); 1528 ret = sb.toString(); 1529 } 1530 return ret; 1531 } 1532 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) { 1533 ret = this.getChild(0).toString(options)+"?"; 1534 } else if (this.children.size() == 2 1535 && this.getChild(0).type == EMPTY) { 1536 ret = this.getChild(1).toString(options)+"??"; 1537 } else { 1538 StringBuilder sb = new StringBuilder(); 1539 sb.append((this.children.get(0)).toString(options)); 1540 for (int i = 1; i < this.children.size(); i ++) { 1541 sb.append('|'); 1542 sb.append((this.children.get(i)).toString(options)); 1543 } 1544 ret = sb.toString(); 1545 } 1546 return ret; 1547 } 1548 1549 /** 1550 * @serialData Serialized fields. Convert the List to Vector for backward compatibility. 1551 */ 1552 private void writeObject(ObjectOutputStream out) throws IOException { 1553 // Convert List to Vector 1554 Vector<Token> vChildren = (children == null)? null : new Vector<>(children); 1555 1556 // Write serialized fields 1557 ObjectOutputStream.PutField pf = out.putFields(); 1558 pf.put("children", vChildren); 1559 out.writeFields(); 1560 } 1561 1562 @SuppressWarnings("unchecked") 1563 private void readObject(ObjectInputStream in) 1564 throws IOException, ClassNotFoundException { 1565 // We have to read serialized fields first. 1566 ObjectInputStream.GetField gf = in.readFields(); 1567 Vector<Token> vChildren = (Vector<Token>)gf.get("children", null); 1568 1569 //convert Vector back to List 1570 if (vChildren != null) children = new ArrayList<>(vChildren); 1571 } 1572 } 1573} 1574