1/*
2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3 */
4/*
5 * Licensed to the Apache Software Foundation (ASF) under one or more
6 * contributor license agreements.  See the NOTICE file distributed with
7 * this work for additional information regarding copyright ownership.
8 * The ASF licenses this file to You under the Apache License, Version 2.0
9 * (the "License"); you may not use this file except in compliance with
10 * the License.  You may obtain a copy of the License at
11 *
12 *      http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22
23import java.io.IOException;
24import java.io.ObjectInputStream;
25import java.io.ObjectOutputStream;
26import java.io.ObjectStreamField;
27import java.util.ArrayList;
28import java.util.Collections;
29import java.util.HashMap;
30import java.util.HashSet;
31import java.util.List;
32import java.util.Map;
33import java.util.Set;
34import java.util.Vector;
35
36/**
37 * This class represents a node in parse tree.
38 *
39 * @xerces.internal
40 *
41 */
42class Token implements java.io.Serializable {
43
44    private static final long serialVersionUID = 8484976002585487481L;
45
46    static final boolean COUNTTOKENS = true;
47    static int tokens = 0;
48
49    static final int CHAR = 0;                  // Literal char
50    static final int DOT = 11;                  // .
51    static final int CONCAT = 1;                // XY
52    static final int UNION = 2;                 // X|Y|Z
53    static final int CLOSURE = 3;               // X*
54    static final int RANGE = 4;                 // [a-zA-Z] etc.
55    static final int NRANGE = 5;                // [^a-zA-Z] etc.
56    static final int PAREN = 6;                 // (X) or (?:X)
57    static final int EMPTY = 7;                 //
58    static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
59    static final int NONGREEDYCLOSURE = 9;      // *? +?
60    static final int STRING = 10;               // strings
61    static final int BACKREFERENCE = 12;        // back references
62    static final int LOOKAHEAD = 20;            // (?=...)
63    static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
64    static final int LOOKBEHIND = 22;           // (?<=...)
65    static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
66    static final int INDEPENDENT = 24;          // (?>...)
67    static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
68    static final int CONDITION = 26;            // (?(...)yes|no)
69
70    static final int UTF16_MAX = 0x10ffff;
71
72    final int type;
73
74    static Token token_dot;
75    static Token token_0to9;
76    static Token token_wordchars;
77    static Token token_not_0to9;
78    static Token token_not_wordchars;
79    static Token token_spaces;
80    static Token token_not_spaces;
81    static Token token_empty;
82    static Token token_linebeginning;
83    static Token token_linebeginning2;
84    static Token token_lineend;
85    static Token token_stringbeginning;
86    static Token token_stringend;
87    static Token token_stringend2;
88    static Token token_wordedge;
89    static Token token_not_wordedge;
90    static Token token_wordbeginning;
91    static Token token_wordend;
92    static {
93        Token.token_empty = new Token(Token.EMPTY);
94
95        Token.token_linebeginning = Token.createAnchor('^');
96        Token.token_linebeginning2 = Token.createAnchor('@');
97        Token.token_lineend = Token.createAnchor('$');
98        Token.token_stringbeginning = Token.createAnchor('A');
99        Token.token_stringend = Token.createAnchor('z');
100        Token.token_stringend2 = Token.createAnchor('Z');
101        Token.token_wordedge = Token.createAnchor('b');
102        Token.token_not_wordedge = Token.createAnchor('B');
103        Token.token_wordbeginning = Token.createAnchor('<');
104        Token.token_wordend = Token.createAnchor('>');
105
106        Token.token_dot = new Token(Token.DOT);
107
108        Token.token_0to9 = Token.createRange();
109        Token.token_0to9.addRange('0', '9');
110        Token.token_wordchars = Token.createRange();
111        Token.token_wordchars.addRange('0', '9');
112        Token.token_wordchars.addRange('A', 'Z');
113        Token.token_wordchars.addRange('_', '_');
114        Token.token_wordchars.addRange('a', 'z');
115        Token.token_spaces = Token.createRange();
116        Token.token_spaces.addRange('\t', '\t');
117        Token.token_spaces.addRange('\n', '\n');
118        Token.token_spaces.addRange('\f', '\f');
119        Token.token_spaces.addRange('\r', '\r');
120        Token.token_spaces.addRange(' ', ' ');
121
122        Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
123        Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
124        Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
125    }
126
127    static Token.ParenToken createLook(int type, Token child) {
128        if (COUNTTOKENS)  Token.tokens ++;
129        return new Token.ParenToken(type, child, 0);
130    }
131    static Token.ParenToken createParen(Token child, int pnumber) {
132        if (COUNTTOKENS)  Token.tokens ++;
133        return new Token.ParenToken(Token.PAREN, child, pnumber);
134    }
135    static Token.ClosureToken createClosure(Token tok) {
136        if (COUNTTOKENS)  Token.tokens ++;
137        return new Token.ClosureToken(Token.CLOSURE, tok);
138    }
139    static Token.ClosureToken createNGClosure(Token tok) {
140        if (COUNTTOKENS)  Token.tokens ++;
141        return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
142    }
143    static Token.ConcatToken createConcat(Token tok1, Token tok2) {
144        if (COUNTTOKENS)  Token.tokens ++;
145        return new Token.ConcatToken(tok1, tok2);
146    }
147    static Token.UnionToken createConcat() {
148        if (COUNTTOKENS)  Token.tokens ++;
149        return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
150    }
151    static Token.UnionToken createUnion() {
152        if (COUNTTOKENS)  Token.tokens ++;
153        return new Token.UnionToken(Token.UNION);
154    }
155    static Token createEmpty() {
156        return Token.token_empty;
157    }
158    static RangeToken createRange() {
159        if (COUNTTOKENS)  Token.tokens ++;
160        return new RangeToken(Token.RANGE);
161    }
162    static RangeToken createNRange() {
163        if (COUNTTOKENS)  Token.tokens ++;
164        return new RangeToken(Token.NRANGE);
165    }
166    static Token.CharToken createChar(int ch) {
167        if (COUNTTOKENS)  Token.tokens ++;
168        return new Token.CharToken(Token.CHAR, ch);
169    }
170    static private Token.CharToken createAnchor(int ch) {
171        if (COUNTTOKENS)  Token.tokens ++;
172        return new Token.CharToken(Token.ANCHOR, ch);
173    }
174    static Token.StringToken createBackReference(int refno) {
175        if (COUNTTOKENS)  Token.tokens ++;
176        return new Token.StringToken(Token.BACKREFERENCE, null, refno);
177    }
178    static Token.StringToken createString(String str) {
179        if (COUNTTOKENS)  Token.tokens ++;
180        return new Token.StringToken(Token.STRING, str, 0);
181    }
182    static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
183        if (COUNTTOKENS)  Token.tokens ++;
184        return new Token.ModifierToken(child, add, mask);
185    }
186    static Token.ConditionToken createCondition(int refno, Token condition,
187                                                Token yespat, Token nopat) {
188        if (COUNTTOKENS)  Token.tokens ++;
189        return new Token.ConditionToken(refno, condition, yespat, nopat);
190    }
191
192    protected Token(int type) {
193        this.type = type;
194    }
195
196    /**
197     * A number of children.
198     */
199    int size() {
200        return 0;
201    }
202    Token getChild(int index) {
203        return null;
204    }
205    void addChild(Token tok) {
206        throw new RuntimeException("Not supported.");
207    }
208
209                                                // for RANGE or NRANGE
210    protected void addRange(int start, int end) {
211        throw new RuntimeException("Not supported.");
212    }
213    protected void sortRanges() {
214        throw new RuntimeException("Not supported.");
215    }
216    protected void compactRanges() {
217        throw new RuntimeException("Not supported.");
218    }
219    protected void mergeRanges(Token tok) {
220        throw new RuntimeException("Not supported.");
221    }
222    protected void subtractRanges(Token tok) {
223        throw new RuntimeException("Not supported.");
224    }
225    protected void intersectRanges(Token tok) {
226        throw new RuntimeException("Not supported.");
227    }
228    static Token complementRanges(Token tok) {
229        return RangeToken.complementRanges(tok);
230    }
231
232
233    void setMin(int min) {                      // for CLOSURE
234    }
235    void setMax(int max) {                      // for CLOSURE
236    }
237    int getMin() {                              // for CLOSURE
238        return -1;
239    }
240    int getMax() {                              // for CLOSURE
241        return -1;
242    }
243    int getReferenceNumber() {                  // for STRING
244        return 0;
245    }
246    String getString() {                        // for STRING
247        return null;
248    }
249
250    int getParenNumber() {
251        return 0;
252    }
253    int getChar() {
254        return -1;
255    }
256
257    public String toString() {
258        return this.toString(0);
259    }
260    public String toString(int options) {
261        return this.type == Token.DOT ? "." : "";
262    }
263
264    /**
265     * How many characters are needed?
266     */
267    final int getMinLength() {
268        switch (this.type) {
269          case CONCAT:
270            int sum = 0;
271            for (int i = 0;  i < this.size();  i ++)
272                sum += this.getChild(i).getMinLength();
273            return sum;
274
275          case CONDITION:
276          case UNION:
277            if (this.size() == 0)
278                return 0;
279            int ret = this.getChild(0).getMinLength();
280            for (int i = 1;  i < this.size();  i ++) {
281                int min = this.getChild(i).getMinLength();
282                if (min < ret)  ret = min;
283            }
284            return ret;
285
286          case CLOSURE:
287          case NONGREEDYCLOSURE:
288            if (this.getMin() >= 0)
289                return this.getMin() * this.getChild(0).getMinLength();
290            return 0;
291
292          case EMPTY:
293          case ANCHOR:
294            return 0;
295
296          case DOT:
297          case CHAR:
298          case RANGE:
299          case NRANGE:
300            return 1;
301
302          case INDEPENDENT:
303          case PAREN:
304          case MODIFIERGROUP:
305            return this.getChild(0).getMinLength();
306
307          case BACKREFERENCE:
308            return 0;                           // *******
309
310          case STRING:
311            return this.getString().length();
312
313          case LOOKAHEAD:
314          case NEGATIVELOOKAHEAD:
315          case LOOKBEHIND:
316          case NEGATIVELOOKBEHIND:
317            return 0;                           // ***** Really?
318
319          default:
320            throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
321        }
322    }
323
324    final int getMaxLength() {
325        switch (this.type) {
326          case CONCAT:
327            int sum = 0;
328            for (int i = 0;  i < this.size();  i ++) {
329                int d = this.getChild(i).getMaxLength();
330                if (d < 0)  return -1;
331                sum += d;
332            }
333            return sum;
334
335          case CONDITION:
336          case UNION:
337            if (this.size() == 0)
338                return 0;
339            int ret = this.getChild(0).getMaxLength();
340            for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
341                int max = this.getChild(i).getMaxLength();
342                if (max < 0) {                  // infinity
343                    ret = -1;
344                    break;
345                }
346                if (max > ret)  ret = max;
347            }
348            return ret;
349
350          case CLOSURE:
351          case NONGREEDYCLOSURE:
352            if (this.getMax() >= 0)
353                                                // When this.child.getMaxLength() < 0,
354                                                // this returns minus value
355                return this.getMax() * this.getChild(0).getMaxLength();
356            return -1;
357
358          case EMPTY:
359          case ANCHOR:
360            return 0;
361
362          case CHAR:
363            return 1;
364          case DOT:
365          case RANGE:
366          case NRANGE:
367            return 2;
368
369          case INDEPENDENT:
370          case PAREN:
371          case MODIFIERGROUP:
372            return this.getChild(0).getMaxLength();
373
374          case BACKREFERENCE:
375            return -1;                          // ******
376
377          case STRING:
378            return this.getString().length();
379
380          case LOOKAHEAD:
381          case NEGATIVELOOKAHEAD:
382          case LOOKBEHIND:
383          case NEGATIVELOOKBEHIND:
384            return 0;                           // ***** Really?
385
386          default:
387            throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
388        }
389    }
390
391    static final int FC_CONTINUE = 0;
392    static final int FC_TERMINAL = 1;
393    static final int FC_ANY = 2;
394    private static final boolean isSet(int options, int flag) {
395        return (options & flag) == flag;
396    }
397    final int analyzeFirstCharacter(RangeToken result, int options) {
398        switch (this.type) {
399          case CONCAT:
400            int ret = FC_CONTINUE;
401            for (int i = 0;  i < this.size();  i ++)
402                if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
403                    break;
404            return ret;
405
406          case UNION:
407            if (this.size() == 0)
408                return FC_CONTINUE;
409            /*
410             *  a|b|c -> FC_TERMINAL
411             *  a|.|c -> FC_ANY
412             *  a|b|  -> FC_CONTINUE
413             */
414            int ret2 = FC_CONTINUE;
415            boolean hasEmpty = false;
416            for (int i = 0;  i < this.size();  i ++) {
417                ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
418                if (ret2 == FC_ANY)
419                    break;
420                else if (ret2 == FC_CONTINUE)
421                    hasEmpty = true;
422            }
423            return hasEmpty ? FC_CONTINUE : ret2;
424
425          case CONDITION:
426            int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
427            if (this.size() == 1)  return FC_CONTINUE;
428            if (ret3 == FC_ANY)  return ret3;
429            int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
430            if (ret4 == FC_ANY)  return ret4;
431            return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
432
433          case CLOSURE:
434          case NONGREEDYCLOSURE:
435            this.getChild(0).analyzeFirstCharacter(result, options);
436            return FC_CONTINUE;
437
438          case EMPTY:
439          case ANCHOR:
440            return FC_CONTINUE;
441
442          case CHAR:
443            int ch = this.getChar();
444            result.addRange(ch, ch);
445            if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
446                ch = Character.toUpperCase((char)ch);
447                result.addRange(ch, ch);
448                ch = Character.toLowerCase((char)ch);
449                result.addRange(ch, ch);
450            }
451            return FC_TERMINAL;
452
453          case DOT:
454              return FC_ANY;
455
456          case RANGE:
457            result.mergeRanges(this);
458            return FC_TERMINAL;
459
460          case NRANGE:                          // ****
461            result.mergeRanges(Token.complementRanges(this));
462            return FC_TERMINAL;
463
464          case INDEPENDENT:
465          case PAREN:
466            return this.getChild(0).analyzeFirstCharacter(result, options);
467
468          case MODIFIERGROUP:
469            options |= ((ModifierToken)this).getOptions();
470            options &= ~((ModifierToken)this).getOptionsMask();
471            return this.getChild(0).analyzeFirstCharacter(result, options);
472
473          case BACKREFERENCE:
474            result.addRange(0, UTF16_MAX);  // **** We can not optimize.
475            return FC_ANY;
476
477          case STRING:
478            int cha = this.getString().charAt(0);
479            int ch2;
480            if (REUtil.isHighSurrogate(cha)
481                && this.getString().length() >= 2
482                && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
483                cha = REUtil.composeFromSurrogates(cha, ch2);
484            result.addRange(cha, cha);
485            if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
486                cha = Character.toUpperCase((char)cha);
487                result.addRange(cha, cha);
488                cha = Character.toLowerCase((char)cha);
489                result.addRange(cha, cha);
490            }
491            return FC_TERMINAL;
492
493          case LOOKAHEAD:
494          case NEGATIVELOOKAHEAD:
495          case LOOKBEHIND:
496          case NEGATIVELOOKBEHIND:
497            return FC_CONTINUE;
498
499          default:
500            throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
501        }
502    }
503
504    private final boolean isShorterThan(Token tok) {
505        if (tok == null)  return false;
506        /*
507        int mylength;
508        if (this.type == STRING)  mylength = this.getString().length();
509        else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
510        else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
511        int otherlength;
512        if (tok.type == STRING)  otherlength = tok.getString().length();
513        else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
514        else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
515        */
516        int mylength;
517        if (this.type == STRING)  mylength = this.getString().length();
518        else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
519        int otherlength;
520        if (tok.type == STRING)  otherlength = tok.getString().length();
521        else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
522        return mylength < otherlength;
523    }
524
525    static class FixedStringContainer {
526        Token token = null;
527        int options = 0;
528        FixedStringContainer() {
529        }
530    }
531
532    final void findFixedString(FixedStringContainer container, int options) {
533        switch (this.type) {
534          case CONCAT:
535            Token prevToken = null;
536            int prevOptions = 0;
537            for (int i = 0;  i < this.size();  i ++) {
538                this.getChild(i).findFixedString(container, options);
539                if (prevToken == null || prevToken.isShorterThan(container.token)) {
540                    prevToken = container.token;
541                    prevOptions = container.options;
542                }
543            }
544            container.token = prevToken;
545            container.options = prevOptions;
546            return;
547
548          case UNION:
549          case CLOSURE:
550          case NONGREEDYCLOSURE:
551          case EMPTY:
552          case ANCHOR:
553          case RANGE:
554          case DOT:
555          case NRANGE:
556          case BACKREFERENCE:
557          case LOOKAHEAD:
558          case NEGATIVELOOKAHEAD:
559          case LOOKBEHIND:
560          case NEGATIVELOOKBEHIND:
561          case CONDITION:
562            container.token = null;
563            return;
564
565          case CHAR:                            // Ignore CHAR tokens.
566            container.token = null;             // **
567            return;                             // **
568
569          case STRING:
570            container.token = this;
571            container.options = options;
572            return;
573
574          case INDEPENDENT:
575          case PAREN:
576            this.getChild(0).findFixedString(container, options);
577            return;
578
579          case MODIFIERGROUP:
580            options |= ((ModifierToken)this).getOptions();
581            options &= ~((ModifierToken)this).getOptionsMask();
582            this.getChild(0).findFixedString(container, options);
583            return;
584
585          default:
586            throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
587        }
588    }
589
590    boolean match(int ch) {
591        throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
592    }
593
594    // ------------------------------------------------------
595    private final static Map<String, Token> categories = new HashMap<>();
596    private final static Map<String, Token> categories2 = new HashMap<>();
597    private static final String[] categoryNames = {
598        "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
599        "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
600        "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
601        "Pi", "Pf",  // 29, 30
602        "L", "M", "N", "Z", "C", "P", "S",      // 31-37
603    };
604
605    // Schema Rec. {Datatypes} - Punctuation
606    static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
607    static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
608    static final int CHAR_LETTER = 31;
609    static final int CHAR_MARK = 32;
610    static final int CHAR_NUMBER = 33;
611    static final int CHAR_SEPARATOR = 34;
612    static final int CHAR_OTHER = 35;
613    static final int CHAR_PUNCTUATION = 36;
614    static final int CHAR_SYMBOL = 37;
615
616    //blockNames in UNICODE 3.1 that supported by XML Schema REC
617    private static final String[] blockNames = {
618        /*0000..007F;*/ "Basic Latin",
619        /*0080..00FF;*/ "Latin-1 Supplement",
620        /*0100..017F;*/ "Latin Extended-A",
621        /*0180..024F;*/ "Latin Extended-B",
622        /*0250..02AF;*/ "IPA Extensions",
623        /*02B0..02FF;*/ "Spacing Modifier Letters",
624        /*0300..036F;*/ "Combining Diacritical Marks",
625        /*0370..03FF;*/ "Greek",
626        /*0400..04FF;*/ "Cyrillic",
627        /*0530..058F;*/ "Armenian",
628        /*0590..05FF;*/ "Hebrew",
629        /*0600..06FF;*/ "Arabic",
630        /*0700..074F;*/ "Syriac",
631        /*0780..07BF;*/ "Thaana",
632        /*0900..097F;*/ "Devanagari",
633        /*0980..09FF;*/ "Bengali",
634        /*0A00..0A7F;*/ "Gurmukhi",
635        /*0A80..0AFF;*/ "Gujarati",
636        /*0B00..0B7F;*/ "Oriya",
637        /*0B80..0BFF;*/ "Tamil",
638        /*0C00..0C7F;*/ "Telugu",
639        /*0C80..0CFF;*/ "Kannada",
640        /*0D00..0D7F;*/ "Malayalam",
641        /*0D80..0DFF;*/ "Sinhala",
642        /*0E00..0E7F;*/ "Thai",
643        /*0E80..0EFF;*/ "Lao",
644        /*0F00..0FFF;*/ "Tibetan",
645        /*1000..109F;*/ "Myanmar",
646        /*10A0..10FF;*/ "Georgian",
647        /*1100..11FF;*/ "Hangul Jamo",
648        /*1200..137F;*/ "Ethiopic",
649        /*13A0..13FF;*/ "Cherokee",
650        /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
651        /*1680..169F;*/ "Ogham",
652        /*16A0..16FF;*/ "Runic",
653        /*1780..17FF;*/ "Khmer",
654        /*1800..18AF;*/ "Mongolian",
655        /*1E00..1EFF;*/ "Latin Extended Additional",
656        /*1F00..1FFF;*/ "Greek Extended",
657        /*2000..206F;*/ "General Punctuation",
658        /*2070..209F;*/ "Superscripts and Subscripts",
659        /*20A0..20CF;*/ "Currency Symbols",
660        /*20D0..20FF;*/ "Combining Marks for Symbols",
661        /*2100..214F;*/ "Letterlike Symbols",
662        /*2150..218F;*/ "Number Forms",
663        /*2190..21FF;*/ "Arrows",
664        /*2200..22FF;*/ "Mathematical Operators",
665        /*2300..23FF;*/ "Miscellaneous Technical",
666        /*2400..243F;*/ "Control Pictures",
667        /*2440..245F;*/ "Optical Character Recognition",
668        /*2460..24FF;*/ "Enclosed Alphanumerics",
669        /*2500..257F;*/ "Box Drawing",
670        /*2580..259F;*/ "Block Elements",
671        /*25A0..25FF;*/ "Geometric Shapes",
672        /*2600..26FF;*/ "Miscellaneous Symbols",
673        /*2700..27BF;*/ "Dingbats",
674        /*2800..28FF;*/ "Braille Patterns",
675        /*2E80..2EFF;*/ "CJK Radicals Supplement",
676        /*2F00..2FDF;*/ "Kangxi Radicals",
677        /*2FF0..2FFF;*/ "Ideographic Description Characters",
678        /*3000..303F;*/ "CJK Symbols and Punctuation",
679        /*3040..309F;*/ "Hiragana",
680        /*30A0..30FF;*/ "Katakana",
681        /*3100..312F;*/ "Bopomofo",
682        /*3130..318F;*/ "Hangul Compatibility Jamo",
683        /*3190..319F;*/ "Kanbun",
684        /*31A0..31BF;*/ "Bopomofo Extended",
685        /*3200..32FF;*/ "Enclosed CJK Letters and Months",
686        /*3300..33FF;*/ "CJK Compatibility",
687        /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
688        /*4E00..9FFF;*/ "CJK Unified Ideographs",
689        /*A000..A48F;*/ "Yi Syllables",
690        /*A490..A4CF;*/ "Yi Radicals",
691        /*AC00..D7A3;*/ "Hangul Syllables",
692        /*E000..F8FF;*/ "Private Use",
693        /*F900..FAFF;*/ "CJK Compatibility Ideographs",
694        /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
695        /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
696        /*FE20..FE2F;*/ "Combining Half Marks",
697        /*FE30..FE4F;*/ "CJK Compatibility Forms",
698        /*FE50..FE6F;*/ "Small Form Variants",
699        /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
700        /*FEFF..FEFF;*/ "Specials",
701        /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
702         //missing Specials add manually
703        /*10300..1032F;*/ "Old Italic",         // 84
704        /*10330..1034F;*/ "Gothic",
705        /*10400..1044F;*/ "Deseret",
706        /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
707        /*1D100..1D1FF;*/ "Musical Symbols",
708        /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
709        /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
710        /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
711        /*E0000..E007F;*/ "Tags",
712        //missing 2 private use add manually
713
714    };
715    //ADD THOSE MANUALLY
716    //F0000..FFFFD; "Private Use",
717    //100000..10FFFD; "Private Use"
718    //FFF0..FFFD; "Specials",
719    static final String blockRanges =
720       "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
721        +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
722        +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
723        +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
724        +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
725        +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
726        +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
727        +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
728        +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
729        +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
730        +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
731    static final int[] nonBMPBlockRanges = {
732        0x10300, 0x1032F,       // 84
733        0x10330, 0x1034F,
734        0x10400, 0x1044F,
735        0x1D000, 0x1D0FF,
736        0x1D100, 0x1D1FF,
737        0x1D400, 0x1D7FF,
738        0x20000, 0x2A6D6,
739        0x2F800, 0x2FA1F,
740        0xE0000, 0xE007F
741    };
742    private static final int NONBMP_BLOCK_START = 84;
743
744    static protected RangeToken getRange(String name, boolean positive) {
745        if (Token.categories.size() == 0) {
746            synchronized (Token.categories) {
747                Token[] ranges = new Token[Token.categoryNames.length];
748                for (int i = 0;  i < ranges.length;  i ++) {
749                    ranges[i] = Token.createRange();
750                }
751                int type;
752                for (int i = 0;  i < 0x10000;  i ++) {
753                    type = Character.getType((char)i);
754                    if (type == Character.START_PUNCTUATION ||
755                        type == Character.END_PUNCTUATION) {
756                        //build table of Pi values
757                        if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
758                            i == 0x201F || i == 0x2039) {
759                            type = CHAR_INIT_QUOTE;
760                        }
761                        //build table of Pf values
762                        if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
763                            type = CHAR_FINAL_QUOTE;
764                        }
765                    }
766                    ranges[type].addRange(i, i);
767                    switch (type) {
768                      case Character.UPPERCASE_LETTER:
769                      case Character.LOWERCASE_LETTER:
770                      case Character.TITLECASE_LETTER:
771                      case Character.MODIFIER_LETTER:
772                      case Character.OTHER_LETTER:
773                        type = CHAR_LETTER;
774                        break;
775                      case Character.NON_SPACING_MARK:
776                      case Character.COMBINING_SPACING_MARK:
777                      case Character.ENCLOSING_MARK:
778                        type = CHAR_MARK;
779                        break;
780                      case Character.DECIMAL_DIGIT_NUMBER:
781                      case Character.LETTER_NUMBER:
782                      case Character.OTHER_NUMBER:
783                        type = CHAR_NUMBER;
784                        break;
785                      case Character.SPACE_SEPARATOR:
786                      case Character.LINE_SEPARATOR:
787                      case Character.PARAGRAPH_SEPARATOR:
788                        type = CHAR_SEPARATOR;
789                        break;
790                      case Character.CONTROL:
791                      case Character.FORMAT:
792                      case Character.SURROGATE:
793                      case Character.PRIVATE_USE:
794                      case Character.UNASSIGNED:
795                        type = CHAR_OTHER;
796                        break;
797                      case Character.CONNECTOR_PUNCTUATION:
798                      case Character.DASH_PUNCTUATION:
799                      case Character.START_PUNCTUATION:
800                      case Character.END_PUNCTUATION:
801                      case CHAR_INIT_QUOTE:
802                      case CHAR_FINAL_QUOTE:
803                      case Character.OTHER_PUNCTUATION:
804                        type = CHAR_PUNCTUATION;
805                        break;
806                      case Character.MATH_SYMBOL:
807                      case Character.CURRENCY_SYMBOL:
808                      case Character.MODIFIER_SYMBOL:
809                      case Character.OTHER_SYMBOL:
810                        type = CHAR_SYMBOL;
811                        break;
812                      default:
813                        throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
814                    }
815                    ranges[type].addRange(i, i);
816                } // for all characters
817                ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
818
819                for (int i = 0;  i < ranges.length;  i ++) {
820                    if (Token.categoryNames[i] != null) {
821                        if (i == Character.UNASSIGNED) { // Unassigned
822                            ranges[i].addRange(0x10000, Token.UTF16_MAX);
823                        }
824                        Token.categories.put(Token.categoryNames[i], ranges[i]);
825                        Token.categories2.put(Token.categoryNames[i],
826                                              Token.complementRanges(ranges[i]));
827                    }
828                }
829                //REVISIT: do we really need to support block names as in Unicode 3.1
830                //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
831                //
832                StringBuilder buffer = new StringBuilder(50);
833                for (int i = 0;  i < Token.blockNames.length;  i ++) {
834                    Token r1 = Token.createRange();
835                    int location;
836                    if (i < NONBMP_BLOCK_START) {
837                        location = i*2;
838                        int rstart = Token.blockRanges.charAt(location);
839                        int rend = Token.blockRanges.charAt(location+1);
840                        //DEBUGING
841                        //System.out.println(n+" " +Integer.toHexString(rstart)
842                        //                     +"-"+ Integer.toHexString(rend));
843                        r1.addRange(rstart, rend);
844                    } else {
845                        location = (i - NONBMP_BLOCK_START) * 2;
846                        r1.addRange(Token.nonBMPBlockRanges[location],
847                                    Token.nonBMPBlockRanges[location + 1]);
848                    }
849                    String n = Token.blockNames[i];
850                    if (n.equals("Specials"))
851                        r1.addRange(0xfff0, 0xfffd);
852                    if (n.equals("Private Use")) {
853                        r1.addRange(0xF0000,0xFFFFD);
854                        r1.addRange(0x100000,0x10FFFD);
855                    }
856                    Token.categories.put(n, r1);
857                    Token.categories2.put(n, Token.complementRanges(r1));
858                    buffer.setLength(0);
859                    buffer.append("Is");
860                    if (n.indexOf(' ') >= 0) {
861                        for (int ci = 0;  ci < n.length();  ci ++)
862                            if (n.charAt(ci) != ' ')  buffer.append(n.charAt(ci));
863                    }
864                    else {
865                        buffer.append(n);
866                    }
867                    Token.setAlias(buffer.toString(), n, true);
868                }
869
870                // TR#18 1.2
871                Token.setAlias("ASSIGNED", "Cn", false);
872                Token.setAlias("UNASSIGNED", "Cn", true);
873                Token all = Token.createRange();
874                all.addRange(0, Token.UTF16_MAX);
875                Token.categories.put("ALL", all);
876                Token.categories2.put("ALL", Token.complementRanges(all));
877                Token.registerNonXS("ASSIGNED");
878                Token.registerNonXS("UNASSIGNED");
879                Token.registerNonXS("ALL");
880
881                Token isalpha = Token.createRange();
882                isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
883                isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
884                isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
885                Token.categories.put("IsAlpha", isalpha);
886                Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
887                Token.registerNonXS("IsAlpha");
888
889                Token isalnum = Token.createRange();
890                isalnum.mergeRanges(isalpha);   // Lu Ll Lo
891                isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
892                Token.categories.put("IsAlnum", isalnum);
893                Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
894                Token.registerNonXS("IsAlnum");
895
896                Token isspace = Token.createRange();
897                isspace.mergeRanges(Token.token_spaces);
898                isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
899                Token.categories.put("IsSpace", isspace);
900                Token.categories2.put("IsSpace", Token.complementRanges(isspace));
901                Token.registerNonXS("IsSpace");
902
903                Token isword = Token.createRange();
904                isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
905                isword.addRange('_', '_');
906                Token.categories.put("IsWord", isword);
907                Token.categories2.put("IsWord", Token.complementRanges(isword));
908                Token.registerNonXS("IsWord");
909
910                Token isascii = Token.createRange();
911                isascii.addRange(0, 127);
912                Token.categories.put("IsASCII", isascii);
913                Token.categories2.put("IsASCII", Token.complementRanges(isascii));
914                Token.registerNonXS("IsASCII");
915
916                Token isnotgraph = Token.createRange();
917                isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
918                isnotgraph.addRange(' ', ' ');
919                Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
920                Token.categories2.put("IsGraph", isnotgraph);
921                Token.registerNonXS("IsGraph");
922
923                Token isxdigit = Token.createRange();
924                isxdigit.addRange('0', '9');
925                isxdigit.addRange('A', 'F');
926                isxdigit.addRange('a', 'f');
927                Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
928                Token.categories2.put("IsXDigit", isxdigit);
929                Token.registerNonXS("IsXDigit");
930
931                Token.setAlias("IsDigit", "Nd", true);
932                Token.setAlias("IsUpper", "Lu", true);
933                Token.setAlias("IsLower", "Ll", true);
934                Token.setAlias("IsCntrl", "C", true);
935                Token.setAlias("IsPrint", "C", false);
936                Token.setAlias("IsPunct", "P", true);
937                Token.registerNonXS("IsDigit");
938                Token.registerNonXS("IsUpper");
939                Token.registerNonXS("IsLower");
940                Token.registerNonXS("IsCntrl");
941                Token.registerNonXS("IsPrint");
942                Token.registerNonXS("IsPunct");
943
944                Token.setAlias("alpha", "IsAlpha", true);
945                Token.setAlias("alnum", "IsAlnum", true);
946                Token.setAlias("ascii", "IsASCII", true);
947                Token.setAlias("cntrl", "IsCntrl", true);
948                Token.setAlias("digit", "IsDigit", true);
949                Token.setAlias("graph", "IsGraph", true);
950                Token.setAlias("lower", "IsLower", true);
951                Token.setAlias("print", "IsPrint", true);
952                Token.setAlias("punct", "IsPunct", true);
953                Token.setAlias("space", "IsSpace", true);
954                Token.setAlias("upper", "IsUpper", true);
955                Token.setAlias("word", "IsWord", true); // Perl extension
956                Token.setAlias("xdigit", "IsXDigit", true);
957                Token.registerNonXS("alpha");
958                Token.registerNonXS("alnum");
959                Token.registerNonXS("ascii");
960                Token.registerNonXS("cntrl");
961                Token.registerNonXS("digit");
962                Token.registerNonXS("graph");
963                Token.registerNonXS("lower");
964                Token.registerNonXS("print");
965                Token.registerNonXS("punct");
966                Token.registerNonXS("space");
967                Token.registerNonXS("upper");
968                Token.registerNonXS("word");
969                Token.registerNonXS("xdigit");
970            } // synchronized
971        } // if null
972        RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
973            : (RangeToken)Token.categories2.get(name);
974        //if (tok == null) System.out.println(name);
975        return tok;
976    }
977    static protected RangeToken getRange(String name, boolean positive, boolean xs) {
978        RangeToken range = Token.getRange(name, positive);
979        if (xs && range != null && Token.isRegisterNonXS(name))
980            range = null;
981        return range;
982    }
983
984    static final Set<String> nonxs = Collections.synchronizedSet(new HashSet<>());
985    /**
986     * This method is called by only getRange().
987     * So this method need not MT-safe.
988     */
989    static protected void registerNonXS(String name) {
990        Token.nonxs.add(name);
991    }
992
993    static protected boolean isRegisterNonXS(String name) {
994        return Token.nonxs.contains(name);
995    }
996
997    private static void setAlias(String newName, String name, boolean positive) {
998        Token t1 = Token.categories.get(name);
999        Token t2 = Token.categories2.get(name);
1000        if (positive) {
1001            Token.categories.put(newName, t1);
1002            Token.categories2.put(newName, t2);
1003        } else {
1004            Token.categories2.put(newName, t1);
1005            Token.categories.put(newName, t2);
1006        }
1007    }
1008
1009    // ------------------------------------------------------
1010
1011    static final String viramaString =
1012    "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1013    +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1014    +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1015    +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1016    +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1017    +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1018    +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1019    +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1020    +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1021    +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1022    +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1023
1024    static private Token token_grapheme = null;
1025    static synchronized Token getGraphemePattern() {
1026        if (Token.token_grapheme != null)
1027            return Token.token_grapheme;
1028
1029        Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
1030        base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1031        base_char.subtractRanges(Token.getRange("M", true));
1032        base_char.subtractRanges(Token.getRange("C", true));
1033
1034        Token virama = Token.createRange();
1035        for (int i = 0;  i < Token.viramaString.length(); i++) {
1036            virama.addRange(i, i);
1037        }
1038
1039        Token combiner_wo_virama = Token.createRange();
1040        combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1041        combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1042        combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1043
1044        Token left = Token.createUnion();       // base_char?
1045        left.addChild(base_char);
1046        left.addChild(Token.token_empty);
1047
1048        Token foo = Token.createUnion();
1049        foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1050        foo.addChild(combiner_wo_virama);
1051
1052        foo = Token.createClosure(foo);
1053
1054        foo = Token.createConcat(left, foo);
1055
1056        Token.token_grapheme = foo;
1057        return Token.token_grapheme;
1058    }
1059
1060    /**
1061     * Combing Character Sequence in Perl 5.6.
1062     */
1063    static private Token token_ccs = null;
1064    static synchronized Token getCombiningCharacterSequence() {
1065        if (Token.token_ccs != null)
1066            return Token.token_ccs;
1067
1068        Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1069        foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1070        Token.token_ccs = foo;
1071        return Token.token_ccs;
1072    }
1073
1074    // ------------------------------------------------------
1075
1076    // ------------------------------------------------------
1077    /**
1078     * This class represents a node in parse tree.
1079     */
1080    static class StringToken extends Token implements java.io.Serializable {
1081
1082        private static final long serialVersionUID = -4614366944218504172L;
1083
1084        String string;
1085        final int refNumber;
1086
1087        StringToken(int type, String str, int n) {
1088            super(type);
1089            this.string = str;
1090            this.refNumber = n;
1091        }
1092
1093        int getReferenceNumber() {              // for STRING
1094            return this.refNumber;
1095        }
1096        String getString() {                    // for STRING
1097            return this.string;
1098        }
1099
1100        public String toString(int options) {
1101            if (this.type == BACKREFERENCE)
1102                return "\\"+this.refNumber;
1103            else
1104                return REUtil.quoteMeta(this.string);
1105        }
1106    }
1107
1108    /**
1109     * This class represents a node in parse tree.
1110     */
1111    static class ConcatToken extends Token implements java.io.Serializable {
1112
1113        private static final long serialVersionUID = 8717321425541346381L;
1114
1115        final Token child;
1116        final Token child2;
1117
1118        ConcatToken(Token t1, Token t2) {
1119            super(Token.CONCAT);
1120            this.child = t1;
1121            this.child2 = t2;
1122        }
1123
1124        int size() {
1125            return 2;
1126        }
1127        Token getChild(int index) {
1128            return index == 0 ? this.child : this.child2;
1129        }
1130
1131        public String toString(int options) {
1132            String ret;
1133            if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1134                ret = this.child.toString(options)+"+";
1135            } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1136                ret = this.child.toString(options)+"+?";
1137            } else
1138                ret = this.child.toString(options)+this.child2.toString(options);
1139            return ret;
1140        }
1141    }
1142
1143    /**
1144     * This class represents a node in parse tree.
1145     */
1146    static class CharToken extends Token implements java.io.Serializable {
1147
1148        private static final long serialVersionUID = -4394272816279496989L;
1149
1150        final int chardata;
1151
1152        CharToken(int type, int ch) {
1153            super(type);
1154            this.chardata = ch;
1155        }
1156
1157        int getChar() {
1158            return this.chardata;
1159        }
1160
1161        public String toString(int options) {
1162            String ret;
1163            switch (this.type) {
1164              case CHAR:
1165                switch (this.chardata) {
1166                  case '|':  case '*':  case '+':  case '?':
1167                  case '(':  case ')':  case '.':  case '[':
1168                  case '{':  case '\\':
1169                    ret = "\\"+(char)this.chardata;
1170                    break;
1171                  case '\f':  ret = "\\f";  break;
1172                  case '\n':  ret = "\\n";  break;
1173                  case '\r':  ret = "\\r";  break;
1174                  case '\t':  ret = "\\t";  break;
1175                  case 0x1b:  ret = "\\e";  break;
1176                    //case 0x0b:  ret = "\\v";  break;
1177                  default:
1178                    if (this.chardata >= 0x10000) {
1179                        String pre = "0"+Integer.toHexString(this.chardata);
1180                        ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1181                    } else
1182                        ret = ""+(char)this.chardata;
1183                }
1184                break;
1185
1186              case ANCHOR:
1187                if (this == Token.token_linebeginning || this == Token.token_lineend)
1188                    ret = ""+(char)this.chardata;
1189                else
1190                    ret = "\\"+(char)this.chardata;
1191                break;
1192
1193              default:
1194                ret = null;
1195            }
1196            return ret;
1197        }
1198
1199        boolean match(int ch) {
1200            if (this.type == CHAR) {
1201                return ch == this.chardata;
1202            } else
1203                throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1204        }
1205    }
1206
1207    /**
1208     * This class represents a node in parse tree.
1209     */
1210    static class ClosureToken extends Token implements java.io.Serializable {
1211
1212        private static final long serialVersionUID = 1308971930673997452L;
1213
1214        int min;
1215        int max;
1216        final Token child;
1217
1218        ClosureToken(int type, Token tok) {
1219            super(type);
1220            this.child = tok;
1221            this.setMin(-1);
1222            this.setMax(-1);
1223        }
1224
1225        int size() {
1226            return 1;
1227        }
1228        Token getChild(int index) {
1229            return this.child;
1230        }
1231
1232        final void setMin(int min) {
1233            this.min = min;
1234        }
1235        final void setMax(int max) {
1236            this.max = max;
1237        }
1238        final int getMin() {
1239            return this.min;
1240        }
1241        final int getMax() {
1242            return this.max;
1243        }
1244
1245        public String toString(int options) {
1246            String ret;
1247            if (this.type == CLOSURE) {
1248                if (this.getMin() < 0 && this.getMax() < 0) {
1249                    ret = this.child.toString(options)+"*";
1250                } else if (this.getMin() == this.getMax()) {
1251                    ret = this.child.toString(options)+"{"+this.getMin()+"}";
1252                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1253                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1254                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1255                    ret = this.child.toString(options)+"{"+this.getMin()+",}";
1256                } else
1257                    throw new RuntimeException("Token#toString(): CLOSURE "
1258                                               +this.getMin()+", "+this.getMax());
1259            } else {
1260                if (this.getMin() < 0 && this.getMax() < 0) {
1261                    ret = this.child.toString(options)+"*?";
1262                } else if (this.getMin() == this.getMax()) {
1263                    ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1264                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1265                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1266                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1267                    ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1268                } else
1269                    throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1270                                               +this.getMin()+", "+this.getMax());
1271            }
1272            return ret;
1273        }
1274    }
1275
1276    /**
1277     * This class represents a node in parse tree.
1278     */
1279    static class ParenToken extends Token implements java.io.Serializable {
1280
1281        private static final long serialVersionUID = -5938014719827987704L;
1282
1283        final Token child;
1284        final int parennumber;
1285
1286        ParenToken(int type, Token tok, int paren) {
1287            super(type);
1288            this.child = tok;
1289            this.parennumber = paren;
1290        }
1291
1292        int size() {
1293            return 1;
1294        }
1295        Token getChild(int index) {
1296            return this.child;
1297        }
1298
1299        int getParenNumber() {
1300            return this.parennumber;
1301        }
1302
1303        public String toString(int options) {
1304            String ret = null;
1305            switch (this.type) {
1306              case PAREN:
1307                if (this.parennumber == 0) {
1308                    ret = "(?:"+this.child.toString(options)+")";
1309                } else {
1310                    ret = "("+this.child.toString(options)+")";
1311                }
1312                break;
1313
1314              case LOOKAHEAD:
1315                ret = "(?="+this.child.toString(options)+")";
1316                break;
1317              case NEGATIVELOOKAHEAD:
1318                ret = "(?!"+this.child.toString(options)+")";
1319                break;
1320              case LOOKBEHIND:
1321                ret = "(?<="+this.child.toString(options)+")";
1322                break;
1323              case NEGATIVELOOKBEHIND:
1324                ret = "(?<!"+this.child.toString(options)+")";
1325                break;
1326              case INDEPENDENT:
1327                ret = "(?>"+this.child.toString(options)+")";
1328                break;
1329            }
1330            return ret;
1331        }
1332    }
1333
1334    /**
1335     * (?(condition)yes-pattern|no-pattern)
1336     */
1337    static class ConditionToken extends Token implements java.io.Serializable {
1338
1339        private static final long serialVersionUID = 4353765277910594411L;
1340
1341        final int refNumber;
1342        final Token condition;
1343        final Token yes;
1344        final Token no;
1345        ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1346            super(Token.CONDITION);
1347            this.refNumber = refno;
1348            this.condition = cond;
1349            this.yes = yespat;
1350            this.no = nopat;
1351        }
1352        int size() {
1353            return this.no == null ? 1 : 2;
1354        }
1355        Token getChild(int index) {
1356            if (index == 0)  return this.yes;
1357            if (index == 1)  return this.no;
1358            throw new RuntimeException("Internal Error: "+index);
1359        }
1360
1361        public String toString(int options) {
1362            String ret;
1363            if (refNumber > 0) {
1364                ret = "(?("+refNumber+")";
1365            } else if (this.condition.type == Token.ANCHOR) {
1366                ret = "(?("+this.condition+")";
1367            } else {
1368                ret = "(?"+this.condition;
1369            }
1370
1371            if (this.no == null) {
1372                ret += this.yes+")";
1373            } else {
1374                ret += this.yes+"|"+this.no+")";
1375            }
1376            return ret;
1377        }
1378    }
1379
1380    /**
1381     * (ims-ims: .... )
1382     */
1383    static class ModifierToken extends Token implements java.io.Serializable {
1384
1385        private static final long serialVersionUID = -9114536559696480356L;
1386
1387        final Token child;
1388        final int add;
1389        final int mask;
1390
1391        ModifierToken(Token tok, int add, int mask) {
1392            super(Token.MODIFIERGROUP);
1393            this.child = tok;
1394            this.add = add;
1395            this.mask = mask;
1396        }
1397
1398        int size() {
1399            return 1;
1400        }
1401        Token getChild(int index) {
1402            return this.child;
1403        }
1404
1405        int getOptions() {
1406            return this.add;
1407        }
1408        int getOptionsMask() {
1409            return this.mask;
1410        }
1411
1412        public String toString(int options) {
1413            return "(?"
1414                +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1415                +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1416                +":"
1417                +this.child.toString(options)
1418                +")";
1419        }
1420    }
1421
1422    /**
1423     * This class represents a node in parse tree.
1424     * for UNION or CONCAT.
1425     */
1426    static class UnionToken extends Token implements java.io.Serializable {
1427
1428        private static final long serialVersionUID = -2568843945989489861L;
1429
1430        List<Token> children;
1431
1432        /**
1433         * @serialField children Vector children
1434         */
1435        private static final ObjectStreamField[] serialPersistentFields =
1436            new ObjectStreamField[] {
1437                new ObjectStreamField("children", Vector.class),
1438            };
1439
1440        UnionToken(int type) {
1441            super(type);
1442        }
1443
1444        @Override
1445        void addChild(Token tok) {
1446            if (tok == null)  return;
1447            if (this.children == null)  this.children = new ArrayList<>();
1448            if (this.type == UNION) {
1449                this.children.add(tok);
1450                return;
1451            }
1452                                                // This is CONCAT, and new child is CONCAT.
1453            if (tok.type == CONCAT) {
1454                for (int i = 0;  i < tok.size();  i ++)
1455                    this.addChild(tok.getChild(i)); // Recursion
1456                return;
1457            }
1458            int size = this.children.size();
1459            if (size == 0) {
1460                this.children.add(tok);
1461                return;
1462            }
1463            Token previous = this.children.get(size - 1);
1464            if (!((previous.type == CHAR || previous.type == STRING)
1465                  && (tok.type == CHAR || tok.type == STRING))) {
1466                this.children.add(tok);
1467                return;
1468            }
1469
1470            //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1471
1472            StringBuilder buffer;
1473            int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1474            if (previous.type == CHAR) {        // Replace previous token by STRING
1475                buffer = new StringBuilder(2 + nextMaxLength);
1476                int ch = previous.getChar();
1477                if (ch >= 0x10000)
1478                    buffer.append(REUtil.decomposeToSurrogates(ch));
1479                else
1480                    buffer.append((char)ch);
1481                previous = Token.createString(null);
1482                this.children.set(size - 1, previous);
1483            } else {                            // STRING
1484                buffer = new StringBuilder(previous.getString().length() + nextMaxLength);
1485                buffer.append(previous.getString());
1486            }
1487
1488            if (tok.type == CHAR) {
1489                int ch = tok.getChar();
1490                if (ch >= 0x10000)
1491                    buffer.append(REUtil.decomposeToSurrogates(ch));
1492                else
1493                    buffer.append((char)ch);
1494            } else {
1495                buffer.append(tok.getString());
1496            }
1497
1498            ((StringToken)previous).string = new String(buffer);
1499        }
1500
1501        @Override
1502        int size() {
1503            return this.children == null ? 0 : this.children.size();
1504        }
1505        @Override
1506        Token getChild(int index) {
1507            return this.children.get(index);
1508        }
1509
1510        @Override
1511        public String toString(int options) {
1512            String ret;
1513            if (this.type == CONCAT) {
1514                if (this.children.size() == 2) {
1515                    Token ch = this.getChild(0);
1516                    Token ch2 = this.getChild(1);
1517                    if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1518                        ret = ch.toString(options)+"+";
1519                    } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1520                        ret = ch.toString(options)+"+?";
1521                    } else
1522                        ret = ch.toString(options)+ch2.toString(options);
1523                } else {
1524                    StringBuilder sb = new StringBuilder();
1525                    this.children.stream().forEach((children1) -> {
1526                        sb.append((children1).toString(options));
1527                    });
1528                    ret = sb.toString();
1529                }
1530                return ret;
1531            }
1532            if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1533                ret = this.getChild(0).toString(options)+"?";
1534            } else if (this.children.size() == 2
1535                       && this.getChild(0).type == EMPTY) {
1536                ret = this.getChild(1).toString(options)+"??";
1537            } else {
1538                StringBuilder sb = new StringBuilder();
1539                sb.append((this.children.get(0)).toString(options));
1540                for (int i = 1;  i < this.children.size();  i ++) {
1541                    sb.append('|');
1542                    sb.append((this.children.get(i)).toString(options));
1543                }
1544                ret = sb.toString();
1545            }
1546            return ret;
1547        }
1548
1549        /**
1550         * @serialData Serialized fields. Convert the List to Vector for backward compatibility.
1551         */
1552        private void writeObject(ObjectOutputStream out) throws IOException {
1553            // Convert List to Vector
1554            Vector<Token> vChildren = (children == null)? null : new Vector<>(children);
1555
1556            // Write serialized fields
1557            ObjectOutputStream.PutField pf = out.putFields();
1558            pf.put("children", vChildren);
1559            out.writeFields();
1560    }
1561
1562        @SuppressWarnings("unchecked")
1563        private void readObject(ObjectInputStream in)
1564                            throws IOException, ClassNotFoundException {
1565            // We have to read serialized fields first.
1566            ObjectInputStream.GetField gf = in.readFields();
1567            Vector<Token> vChildren = (Vector<Token>)gf.get("children", null);
1568
1569            //convert Vector back to List
1570            if (vChildren != null) children = new ArrayList<>(vChildren);
1571        }
1572    }
1573}
1574