1/*
2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3 */
4/*
5 * Licensed to the Apache Software Foundation (ASF) under one or more
6 * contributor license agreements.  See the NOTICE file distributed with
7 * this work for additional information regarding copyright ownership.
8 * The ASF licenses this file to You under the Apache License, Version 2.0
9 * (the "License"); you may not use this file except in compliance with
10 * the License.  You may obtain a copy of the License at
11 *
12 *      http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22
23import com.sun.org.apache.xerces.internal.utils.SecuritySupport;
24import java.util.Locale;
25import java.util.MissingResourceException;
26import java.util.ResourceBundle;
27import java.util.ArrayList;
28
29/**
30 * A Regular Expression Parser.
31 *
32 * @xerces.internal
33 *
34 */
35class RegexParser {
36    static final int T_CHAR = 0;
37    static final int T_EOF = 1;
38    static final int T_OR = 2;                  // '|'
39    static final int T_STAR = 3;                // '*'
40    static final int T_PLUS = 4;                // '+'
41    static final int T_QUESTION = 5;            // '?'
42    static final int T_LPAREN = 6;              // '('
43    static final int T_RPAREN = 7;              // ')'
44    static final int T_DOT = 8;                 // '.'
45    static final int T_LBRACKET = 9;            // '['
46    static final int T_BACKSOLIDUS = 10;        // '\'
47    static final int T_CARET = 11;              // '^'
48    static final int T_DOLLAR = 12;             // '$'
49    static final int T_LPAREN2 = 13;            // '(?:'
50    static final int T_LOOKAHEAD = 14;          // '(?='
51    static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
52    static final int T_LOOKBEHIND = 16;         // '(?<='
53    static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
54    static final int T_INDEPENDENT = 18;        // '(?>'
55    static final int T_SET_OPERATIONS = 19;     // '(?['
56    static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
57    static final int T_COMMENT = 21;            // '(?#'
58    static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
59    static final int T_CONDITION = 23;          // '(?('
60    static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
61
62    static class ReferencePosition {
63        int refNumber;
64        int position;
65        ReferencePosition(int n, int pos) {
66            this.refNumber = n;
67            this.position = pos;
68        }
69    }
70
71    int offset;
72    String regex;
73    int regexlen;
74    int options;
75    ResourceBundle resources;
76    int chardata;
77    int nexttoken;
78    static protected final int S_NORMAL = 0;
79    static protected final int S_INBRACKETS = 1;
80    static protected final int S_INXBRACKETS = 2;
81    int context = S_NORMAL;
82    int parenOpened = 1;
83    int parennumber = 1;
84    boolean hasBackReferences;
85    ArrayList<ReferencePosition> references = null;
86
87    public RegexParser() {
88        this.setLocale(Locale.getDefault());
89    }
90    public RegexParser(Locale locale) {
91        this.setLocale(locale);
92    }
93
94    public void setLocale(Locale locale) {
95        try {
96            if (locale != null) {
97                this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale);
98            }
99            else {
100                this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message");
101            }
102        }
103        catch (MissingResourceException mre) {
104            throw new RuntimeException("Installation Problem???  Couldn't load messages: "
105                                       + mre.getMessage());
106        }
107    }
108
109    final ParseException ex(String key, int loc) {
110        return new ParseException(this.resources.getString(key), loc);
111    }
112
113    protected final boolean isSet(int flag) {
114        return (this.options & flag) == flag;
115    }
116
117    Token parse(String regex, int options) throws ParseException {
118        this.options = options;
119        this.offset = 0;
120        this.setContext(S_NORMAL);
121        this.parennumber = 1;
122        this.parenOpened = 1;
123        this.hasBackReferences = false;
124        this.regex = regex;
125        if (this.isSet(RegularExpression.EXTENDED_COMMENT))
126            this.regex = REUtil.stripExtendedComment(this.regex);
127        this.regexlen = this.regex.length();
128
129
130        this.next();
131        Token ret = this.parseRegex();
132        if (this.offset != this.regexlen)
133            throw ex("parser.parse.1", this.offset);
134        if (this.read() != T_EOF) {
135            throw ex("parser.parse.1", this.offset-1);
136        }
137        if (this.references != null) {
138            for (int i = 0;  i < this.references.size();  i ++) {
139                ReferencePosition position = this.references.get(i);
140                if (this.parennumber <= position.refNumber)
141                    throw ex("parser.parse.2", position.position);
142            }
143            this.references.clear();
144        }
145        return ret;
146    }
147
148    /*
149    public RegularExpression createRegex(String regex, int options) throws ParseException {
150        Token tok = this.parse(regex, options);
151        return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
152    }
153    */
154
155    protected final void setContext(int con) {
156        this.context = con;
157    }
158
159    final int read() {
160        return this.nexttoken;
161    }
162
163    @SuppressWarnings("fallthrough")
164    final void next() {
165        if (this.offset >= this.regexlen) {
166            this.chardata = -1;
167            this.nexttoken = T_EOF;
168            return;
169        }
170
171        int ret;
172        int ch = this.regex.charAt(this.offset++);
173        this.chardata = ch;
174
175        if (this.context == S_INBRACKETS) {
176            // In a character class, this.chardata has one character, that is to say,
177            // a pair of surrogates is composed and stored to this.chardata.
178            switch (ch) {
179              case '\\':
180                ret = T_BACKSOLIDUS;
181                if (this.offset >= this.regexlen)
182                    throw ex("parser.next.1", this.offset-1);
183                this.chardata = this.regex.charAt(this.offset++);
184                break;
185
186              case '-':
187                // Allow character class subtraction (regardless of whether we are in
188                // XML Schema mode or not)
189                if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
190                    this.offset++;
191                    ret = T_XMLSCHEMA_CC_SUBTRACTION;
192                } else
193                    ret = T_CHAR;
194                break;
195
196              case '[':
197                if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
198                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
199                    this.offset++;
200                    ret = T_POSIX_CHARCLASS_START;
201                    break;
202                } // Through down
203              default:
204                if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
205                    int low = this.regex.charAt(this.offset);
206                    if (REUtil.isLowSurrogate(low)) {
207                        this.chardata = REUtil.composeFromSurrogates(ch, low);
208                        this.offset ++;
209                    }
210                }
211                ret = T_CHAR;
212            }
213            this.nexttoken = ret;
214            return;
215        }
216
217        switch (ch) {
218          case '|': ret = T_OR;             break;
219          case '*': ret = T_STAR;           break;
220          case '+': ret = T_PLUS;           break;
221          case '?': ret = T_QUESTION;       break;
222          case ')': ret = T_RPAREN;         break;
223          case '.': ret = T_DOT;            break;
224          case '[': ret = T_LBRACKET;       break;
225          case '^':
226              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
227                  ret = T_CHAR;
228              }
229              else {
230                  ret = T_CARET;
231              }
232              break;
233          case '$':
234              if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
235                  ret = T_CHAR;
236              }
237              else {
238                  ret = T_DOLLAR;
239              }
240              break;
241          case '(':
242            ret = T_LPAREN;
243            if (this.offset >= this.regexlen)
244                break;
245            if (this.regex.charAt(this.offset) != '?')
246                break;
247            if (++this.offset >= this.regexlen)
248                throw ex("parser.next.2", this.offset-1);
249            ch = this.regex.charAt(this.offset++);
250            switch (ch) {
251              case ':':  ret = T_LPAREN2;            break;
252              case '=':  ret = T_LOOKAHEAD;          break;
253              case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
254              case '[':  ret = T_SET_OPERATIONS;     break;
255              case '>':  ret = T_INDEPENDENT;        break;
256              case '<':
257                if (this.offset >= this.regexlen)
258                    throw ex("parser.next.2", this.offset-3);
259                ch = this.regex.charAt(this.offset++);
260                if (ch == '=') {
261                    ret = T_LOOKBEHIND;
262                } else if (ch == '!') {
263                    ret = T_NEGATIVELOOKBEHIND;
264                } else
265                    throw ex("parser.next.3", this.offset-3);
266                break;
267              case '#':
268                while (this.offset < this.regexlen) {
269                    ch = this.regex.charAt(this.offset++);
270                    if (ch == ')')  break;
271                }
272                if (ch != ')')
273                    throw ex("parser.next.4", this.offset-1);
274                ret = T_COMMENT;
275                break;
276              default:
277                if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
278                    this.offset --;
279                    ret = T_MODIFIERS;
280                    break;
281                } else if (ch == '(') {         // conditional
282                    ret = T_CONDITION;          // this.offsets points the next of '('.
283                    break;
284                }
285                throw ex("parser.next.2", this.offset-2);
286            }
287            break;
288
289          case '\\':
290            ret = T_BACKSOLIDUS;
291            if (this.offset >= this.regexlen)
292                throw ex("parser.next.1", this.offset-1);
293            this.chardata = this.regex.charAt(this.offset++);
294            break;
295
296          default:
297            ret = T_CHAR;
298        }
299        this.nexttoken = ret;
300    }
301
302    /**
303     * regex ::= term (`|` term)*
304     * term ::= factor+
305     * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
306     *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
307     *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
308     * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
309     *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
310     */
311    Token parseRegex() throws ParseException {
312        Token tok = this.parseTerm();
313        Token parent = null;
314        while (this.read() == T_OR) {
315            this.next();                    // '|'
316            if (parent == null) {
317                parent = Token.createUnion();
318                parent.addChild(tok);
319                tok = parent;
320            }
321            tok.addChild(this.parseTerm());
322        }
323        return tok;
324    }
325
326    /**
327     * term ::= factor+
328     */
329    Token parseTerm() throws ParseException {
330        int ch = this.read();
331        if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
332            return Token.createEmpty();
333        } else {
334            Token tok = this.parseFactor();
335            Token concat = null;
336            while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
337                if (concat == null) {
338                    concat = Token.createConcat();
339                    concat.addChild(tok);
340                    tok = concat;
341                }
342                concat.addChild(this.parseFactor());
343                //tok = Token.createConcat(tok, this.parseFactor());
344            }
345            return tok;
346        }
347    }
348
349    // ----------------------------------------------------------------
350
351    Token processCaret() throws ParseException {
352        this.next();
353        return Token.token_linebeginning;
354    }
355    Token processDollar() throws ParseException {
356        this.next();
357        return Token.token_lineend;
358    }
359    Token processLookahead() throws ParseException {
360        this.next();
361        Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
362        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
363        this.next();                            // ')'
364        return tok;
365    }
366    Token processNegativelookahead() throws ParseException {
367        this.next();
368        Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
369        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
370        this.next();                            // ')'
371        return tok;
372    }
373    Token processLookbehind() throws ParseException {
374        this.next();
375        Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
376        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
377        this.next();                            // ')'
378        return tok;
379    }
380    Token processNegativelookbehind() throws ParseException {
381        this.next();
382        Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
383        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
384        this.next();                    // ')'
385        return tok;
386    }
387    Token processBacksolidus_A() throws ParseException {
388        this.next();
389        return Token.token_stringbeginning;
390    }
391    Token processBacksolidus_Z() throws ParseException {
392        this.next();
393        return Token.token_stringend2;
394    }
395    Token processBacksolidus_z() throws ParseException {
396        this.next();
397        return Token.token_stringend;
398    }
399    Token processBacksolidus_b() throws ParseException {
400        this.next();
401        return Token.token_wordedge;
402    }
403    Token processBacksolidus_B() throws ParseException {
404        this.next();
405        return Token.token_not_wordedge;
406    }
407    Token processBacksolidus_lt() throws ParseException {
408        this.next();
409        return Token.token_wordbeginning;
410    }
411    Token processBacksolidus_gt() throws ParseException {
412        this.next();
413        return Token.token_wordend;
414    }
415    Token processStar(Token tok) throws ParseException {
416        this.next();
417        if (this.read() == T_QUESTION) {
418            this.next();
419            return Token.createNGClosure(tok);
420        } else
421            return Token.createClosure(tok);
422    }
423    Token processPlus(Token tok) throws ParseException {
424        // X+ -> XX*
425        this.next();
426        if (this.read() == T_QUESTION) {
427            this.next();
428            return Token.createConcat(tok, Token.createNGClosure(tok));
429        } else
430            return Token.createConcat(tok, Token.createClosure(tok));
431    }
432    Token processQuestion(Token tok) throws ParseException {
433        // X? -> X|
434        this.next();
435        Token par = Token.createUnion();
436        if (this.read() == T_QUESTION) {
437            this.next();
438            par.addChild(Token.createEmpty());
439            par.addChild(tok);
440        } else {
441            par.addChild(tok);
442            par.addChild(Token.createEmpty());
443        }
444        return par;
445    }
446    boolean checkQuestion(int off) {
447        return off < this.regexlen && this.regex.charAt(off) == '?';
448    }
449    Token processParen() throws ParseException {
450        this.next();
451        int p = this.parenOpened++;
452        Token tok = Token.createParen(this.parseRegex(), p);
453        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
454        this.parennumber++;
455        this.next();                            // Skips ')'
456        return tok;
457    }
458    Token processParen2() throws ParseException {
459        this.next();
460        Token tok = Token.createParen(this.parseRegex(), 0);
461        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
462        this.next();                            // Skips ')'
463        return tok;
464    }
465    Token processCondition() throws ParseException {
466                                                // this.offset points the next of '('
467        if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
468                                                // Parses a condition.
469        int refno = -1;
470        Token condition = null;
471        int ch = this.regex.charAt(this.offset);
472        if ('1' <= ch && ch <= '9') {
473            refno = ch-'0';
474            int finalRefno = refno;
475
476            if (this.parennumber <= refno)
477                throw ex("parser.parse.2", this.offset);
478
479            while (this.offset + 1 < this.regexlen) {
480                ch = this.regex.charAt(this.offset + 1);
481                if ('0' <= ch && ch <= '9') {
482                    refno = (refno * 10) + (ch - '0');
483                    if (refno < this.parennumber) {
484                        finalRefno= refno;
485                        ++this.offset;
486                    }
487                    else {
488                        break;
489                    }
490                }
491                else {
492                    break;
493                }
494            }
495
496            this.hasBackReferences = true;
497            if (this.references == null)  this.references = new ArrayList<>();
498            this.references.add(new ReferencePosition(finalRefno, this.offset));
499            this.offset ++;
500            if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
501            this.offset ++;
502        } else {
503            if (ch == '?')  this.offset --; // Points '('.
504            this.next();
505            condition = this.parseFactor();
506            switch (condition.type) {
507              case Token.LOOKAHEAD:
508              case Token.NEGATIVELOOKAHEAD:
509              case Token.LOOKBEHIND:
510              case Token.NEGATIVELOOKBEHIND:
511                break;
512              case Token.ANCHOR:
513                if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
514                break;
515              default:
516                throw ex("parser.factor.5", this.offset);
517            }
518        }
519                                                // Parses yes/no-patterns.
520        this.next();
521        Token yesPattern = this.parseRegex();
522        Token noPattern = null;
523        if (yesPattern.type == Token.UNION) {
524            if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
525            noPattern = yesPattern.getChild(1);
526            yesPattern = yesPattern.getChild(0);
527        }
528        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
529        this.next();
530        return Token.createCondition(refno, condition, yesPattern, noPattern);
531    }
532    Token processModifiers() throws ParseException {
533                                                // this.offset points the next of '?'.
534                                                // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
535        int add = 0, mask = 0, ch = -1;
536        while (this.offset < this.regexlen) {
537            ch = this.regex.charAt(this.offset);
538            int v = REUtil.getOptionValue(ch);
539            if (v == 0)  break;                 // '-' or ':'?
540            add |= v;
541            this.offset ++;
542        }
543        if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
544        if (ch == '-') {
545            this.offset ++;
546            while (this.offset < this.regexlen) {
547                ch = this.regex.charAt(this.offset);
548                int v = REUtil.getOptionValue(ch);
549                if (v == 0)  break;             // ':'?
550                mask |= v;
551                this.offset ++;
552            }
553            if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
554        }
555        Token tok;
556        if (ch == ':') {
557            this.offset ++;
558            this.next();
559            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
560            if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
561            this.next();
562        } else if (ch == ')') {                 // such as (?-i)
563            this.offset ++;
564            this.next();
565            tok = Token.createModifierGroup(this.parseRegex(), add, mask);
566        } else
567            throw ex("parser.factor.3", this.offset);
568
569        return tok;
570    }
571    Token processIndependent() throws ParseException {
572        this.next();
573        Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
574        if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
575        this.next();                            // Skips ')'
576        return tok;
577    }
578    Token processBacksolidus_c() throws ParseException {
579        int ch2;                                // Must be in 0x0040-0x005f
580        if (this.offset >= this.regexlen
581            || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
582            throw ex("parser.atom.1", this.offset-1);
583        this.next();
584        return Token.createChar(ch2-0x40);
585    }
586    Token processBacksolidus_C() throws ParseException {
587        throw ex("parser.process.1", this.offset);
588    }
589    Token processBacksolidus_i() throws ParseException {
590        Token tok = Token.createChar('i');
591        this.next();
592        return tok;
593    }
594    Token processBacksolidus_I() throws ParseException {
595        throw ex("parser.process.1", this.offset);
596    }
597    Token processBacksolidus_g() throws ParseException {
598        this.next();
599        return Token.getGraphemePattern();
600    }
601    Token processBacksolidus_X() throws ParseException {
602        this.next();
603        return Token.getCombiningCharacterSequence();
604    }
605    Token processBackreference() throws ParseException {
606        int refnum = this.chardata-'0';
607        int finalRefnum = refnum;
608
609        if (this.parennumber <= refnum)
610            throw ex("parser.parse.2", this.offset-2);
611
612        while  (this.offset < this.regexlen) {
613            final int ch = this.regex.charAt(this.offset);
614            if ('0' <= ch && ch <= '9') {
615                refnum = (refnum * 10) + (ch - '0');
616                if (refnum < this.parennumber) {
617                    ++this.offset;
618                    finalRefnum = refnum;
619                    this.chardata = ch;
620                }
621                else {
622                    break;
623                }
624            }
625            else {
626                break;
627            }
628        }
629
630        Token tok = Token.createBackReference(finalRefnum);
631        this.hasBackReferences = true;
632        if (this.references == null)  this.references = new ArrayList<>();
633        this.references.add(new ReferencePosition(finalRefnum, this.offset-2));
634        this.next();
635        return tok;
636    }
637
638    // ----------------------------------------------------------------
639
640    /**
641     * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
642     *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
643     *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
644     *            | '(?#' [^)]* ')'
645     * minmax ::= '{' min (',' max?)? '}'
646     * min ::= [0-9]+
647     * max ::= [0-9]+
648     */
649    Token parseFactor() throws ParseException {
650        int ch = this.read();
651        Token tok;
652        switch (ch) {
653          case T_CARET:         return this.processCaret();
654          case T_DOLLAR:        return this.processDollar();
655          case T_LOOKAHEAD:     return this.processLookahead();
656          case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
657          case T_LOOKBEHIND:    return this.processLookbehind();
658          case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
659
660          case T_COMMENT:
661            this.next();
662            return Token.createEmpty();
663
664          case T_BACKSOLIDUS:
665            switch (this.chardata) {
666              case 'A': return this.processBacksolidus_A();
667              case 'Z': return this.processBacksolidus_Z();
668              case 'z': return this.processBacksolidus_z();
669              case 'b': return this.processBacksolidus_b();
670              case 'B': return this.processBacksolidus_B();
671              case '<': return this.processBacksolidus_lt();
672              case '>': return this.processBacksolidus_gt();
673            }
674                                                // through down
675        }
676        tok = this.parseAtom();
677        ch = this.read();
678        switch (ch) {
679          case T_STAR:  return this.processStar(tok);
680          case T_PLUS:  return this.processPlus(tok);
681          case T_QUESTION: return this.processQuestion(tok);
682          case T_CHAR:
683            if (this.chardata == '{' && this.offset < this.regexlen) {
684
685                int off = this.offset;          // this.offset -> next of '{'
686                int min = 0, max = -1;
687
688                if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
689
690                    min = ch -'0';
691                    while (off < this.regexlen
692                           && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
693                        min = min*10 +ch-'0';
694                        if (min < 0)
695                            throw ex("parser.quantifier.5", this.offset);
696                    }
697                }
698                else {
699                    throw ex("parser.quantifier.1", this.offset);
700                }
701
702                max = min;
703                if (ch == ',') {
704
705                   if (off >= this.regexlen) {
706                       throw ex("parser.quantifier.3", this.offset);
707                   }
708                   else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
709
710                        max = ch -'0';       // {min,max}
711                        while (off < this.regexlen
712                               && (ch = this.regex.charAt(off++)) >= '0'
713                               && ch <= '9') {
714                            max = max*10 +ch-'0';
715                            if (max < 0)
716                                throw ex("parser.quantifier.5", this.offset);
717                        }
718
719                        if (min > max)
720                            throw ex("parser.quantifier.4", this.offset);
721                   }
722                   else { // assume {min,}
723                        max = -1;
724                    }
725                }
726
727               if (ch != '}')
728                   throw ex("parser.quantifier.2", this.offset);
729
730               if (this.checkQuestion(off)) {  // off -> next of '}'
731                    tok = Token.createNGClosure(tok);
732                    this.offset = off+1;
733                } else {
734                    tok = Token.createClosure(tok);
735                    this.offset = off;
736                }
737
738                tok.setMin(min);
739                tok.setMax(max);
740                //System.err.println("CLOSURE: "+min+", "+max);
741                this.next();
742            }
743        }
744        return tok;
745    }
746
747    /**
748     * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
749     *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
750     *          | '(?>' regex ')'
751     * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
752     */
753    Token parseAtom() throws ParseException {
754        int ch = this.read();
755        Token tok = null;
756        switch (ch) {
757          case T_LPAREN:        return this.processParen();
758          case T_LPAREN2:       return this.processParen2(); // '(?:'
759          case T_CONDITION:     return this.processCondition(); // '(?('
760          case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
761          case T_INDEPENDENT:   return this.processIndependent();
762          case T_DOT:
763            this.next();                    // Skips '.'
764            tok = Token.token_dot;
765            break;
766
767            /**
768             * char-class ::= '[' ( '^'? range ','?)+ ']'
769             * range ::= '\d' | '\w' | '\s' | category-block | range-char
770             *           | range-char '-' range-char
771             * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
772             * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
773             */
774          case T_LBRACKET:      return this.parseCharacterClass(true);
775          case T_SET_OPERATIONS: return this.parseSetOperations();
776
777          case T_BACKSOLIDUS:
778            switch (this.chardata) {
779              case 'd':  case 'D':
780              case 'w':  case 'W':
781              case 's':  case 'S':
782                tok = this.getTokenForShorthand(this.chardata);
783                this.next();
784                return tok;
785
786              case 'e':  case 'f':  case 'n':  case 'r':
787              case 't':  case 'u':  case 'v':  case 'x':
788                {
789                    int ch2 = this.decodeEscaped();
790                    if (ch2 < 0x10000) {
791                        tok = Token.createChar(ch2);
792                    } else {
793                        tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
794                    }
795                }
796                break;
797
798              case 'c': return this.processBacksolidus_c();
799              case 'C': return this.processBacksolidus_C();
800              case 'i': return this.processBacksolidus_i();
801              case 'I': return this.processBacksolidus_I();
802              case 'g': return this.processBacksolidus_g();
803              case 'X': return this.processBacksolidus_X();
804              case '1':  case '2':  case '3':  case '4':
805              case '5':  case '6':  case '7':  case '8':  case '9':
806                return this.processBackreference();
807
808              case 'P':
809              case 'p':
810                int pstart = this.offset;
811                tok = processBacksolidus_pP(this.chardata);
812                if (tok == null)  throw this.ex("parser.atom.5", pstart);
813                break;
814
815              default:
816                tok = Token.createChar(this.chardata);
817            }
818            this.next();
819            break;
820
821          case T_CHAR:
822            if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
823                throw this.ex("parser.atom.4", this.offset-1);
824            tok = Token.createChar(this.chardata);
825            int high = this.chardata;
826            this.next();
827            if (REUtil.isHighSurrogate(high)
828                && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
829                char[] sur = new char[2];
830                sur[0] = (char)high;
831                sur[1] = (char)this.chardata;
832                tok = Token.createParen(Token.createString(new String(sur)), 0);
833                this.next();
834            }
835            break;
836
837          default:
838            throw this.ex("parser.atom.4", this.offset-1);
839        }
840        return tok;
841    }
842
843    protected RangeToken processBacksolidus_pP(int c) throws ParseException {
844
845        this.next();
846        if (this.read() != T_CHAR || this.chardata != '{')
847            throw this.ex("parser.atom.2", this.offset-1);
848
849        // handle category escape
850        boolean positive = c == 'p';
851        int namestart = this.offset;
852        int nameend = this.regex.indexOf('}', namestart);
853
854        if (nameend < 0)
855            throw this.ex("parser.atom.3", this.offset);
856
857        String pname = this.regex.substring(namestart, nameend);
858        this.offset = nameend+1;
859
860        return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
861    }
862
863    int processCIinCharacterClass(RangeToken tok, int c) {
864        return this.decodeEscaped();
865    }
866
867    /**
868     * char-class ::= '[' ( '^'? range ','?)+ ']'
869     * range ::= '\d' | '\w' | '\s' | category-block | range-char
870     *           | range-char '-' range-char
871     * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
872     * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
873     */
874    protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
875        this.setContext(S_INBRACKETS);
876        this.next();                            // '['
877        boolean nrange = false;
878        RangeToken base = null;
879        RangeToken tok;
880        if (this.read() == T_CHAR && this.chardata == '^') {
881            nrange = true;
882            this.next();                        // '^'
883            if (useNrange) {
884                tok = Token.createNRange();
885            } else {
886                base = Token.createRange();
887                base.addRange(0, Token.UTF16_MAX);
888                tok = Token.createRange();
889            }
890        } else {
891            tok = Token.createRange();
892        }
893        int type;
894        boolean firstloop = true;
895        while ((type = this.read()) != T_EOF) {
896            if (type == T_CHAR && this.chardata == ']' && !firstloop)
897                break;
898            int c = this.chardata;
899            boolean end = false;
900            if (type == T_BACKSOLIDUS) {
901                switch (c) {
902                  case 'd':  case 'D':
903                  case 'w':  case 'W':
904                  case 's':  case 'S':
905                    tok.mergeRanges(this.getTokenForShorthand(c));
906                    end = true;
907                    break;
908
909                  case 'i':  case 'I':
910                  case 'c':  case 'C':
911                    c = this.processCIinCharacterClass(tok, c);
912                    if (c < 0)  end = true;
913                    break;
914
915                  case 'p':
916                  case 'P':
917                    int pstart = this.offset;
918                    RangeToken tok2 = this.processBacksolidus_pP(c);
919                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
920                    tok.mergeRanges(tok2);
921                    end = true;
922                    break;
923
924                  default:
925                    c = this.decodeEscaped();
926                } // \ + c
927            } // backsolidus
928                                                // POSIX Character class such as [:alnum:]
929            else if (type == T_POSIX_CHARCLASS_START) {
930                int nameend = this.regex.indexOf(':', this.offset);
931                if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
932                boolean positive = true;
933                if (this.regex.charAt(this.offset) == '^') {
934                    this.offset ++;
935                    positive = false;
936                }
937                String name = this.regex.substring(this.offset, nameend);
938                RangeToken range = Token.getRange(name, positive,
939                                                  this.isSet(RegularExpression.XMLSCHEMA_MODE));
940                if (range == null)  throw this.ex("parser.cc.3", this.offset);
941                tok.mergeRanges(range);
942                end = true;
943                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
944                    throw this.ex("parser.cc.1", nameend);
945                this.offset = nameend+2;
946            }
947            else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
948                if (nrange) {
949                    nrange = false;
950                    if (useNrange) {
951                        tok = (RangeToken) Token.complementRanges(tok);
952                    }
953                    else {
954                        base.subtractRanges(tok);
955                        tok = base;
956                    }
957                }
958                RangeToken range2 = this.parseCharacterClass(false);
959                tok.subtractRanges(range2);
960                if (this.read() != T_CHAR || this.chardata != ']') {
961                    throw this.ex("parser.cc.5", this.offset);
962                }
963                break;                          // Exit this loop
964            }
965            this.next();
966            if (!end) {                         // if not shorthands...
967                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
968                    if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
969                        tok.addRange(c, c);
970                    }
971                    else {
972                        addCaseInsensitiveChar(tok, c);
973                    }
974                }
975                else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
976                    throw this.ex("parser.cc.8", this.offset-1);
977                }
978                else {
979                    this.next(); // Skips '-'
980                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
981                    if (type == T_CHAR && this.chardata == ']') {
982                        if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
983                            tok.addRange(c, c);
984                        }
985                        else {
986                            addCaseInsensitiveChar(tok, c);
987                        }
988                        tok.addRange('-', '-');
989                    } else {
990                        int rangeend = this.chardata;
991                        if (type == T_BACKSOLIDUS) {
992                            rangeend = this.decodeEscaped();
993                        }
994                        this.next();
995                        if (c > rangeend) {
996                            throw this.ex("parser.ope.3", this.offset-1);
997                        }
998                        if (!this.isSet(RegularExpression.IGNORE_CASE) ||
999                                (c > 0xffff && rangeend > 0xffff)) {
1000                            tok.addRange(c, rangeend);
1001                        }
1002                        else {
1003                            addCaseInsensitiveCharRange(tok, c, rangeend);
1004                        }
1005                    }
1006                }
1007            }
1008            if (this.isSet(RegularExpression.SPECIAL_COMMA)
1009                && this.read() == T_CHAR && this.chardata == ',') {
1010                this.next();
1011            }
1012            firstloop = false;
1013        }
1014        if (this.read() == T_EOF) {
1015            throw this.ex("parser.cc.2", this.offset);
1016        }
1017
1018        if (!useNrange && nrange) {
1019            base.subtractRanges(tok);
1020            tok = base;
1021        }
1022        tok.sortRanges();
1023        tok.compactRanges();
1024        this.setContext(S_NORMAL);
1025        this.next();                    // Skips ']'
1026
1027        return tok;
1028    }
1029
1030    /**
1031     * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
1032     */
1033    protected RangeToken parseSetOperations() throws ParseException {
1034        RangeToken tok = this.parseCharacterClass(false);
1035        int type;
1036        while ((type = this.read()) != T_RPAREN) {
1037            int ch = this.chardata;
1038            if (type == T_CHAR && (ch == '-' || ch == '&')
1039                || type == T_PLUS) {
1040                this.next();
1041                if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
1042                RangeToken t2 = this.parseCharacterClass(false);
1043                if (type == T_PLUS)
1044                    tok.mergeRanges(t2);
1045                else if (ch == '-')
1046                    tok.subtractRanges(t2);
1047                else if (ch == '&')
1048                    tok.intersectRanges(t2);
1049                else
1050                    throw new RuntimeException("ASSERT");
1051            } else {
1052                throw ex("parser.ope.2", this.offset-1);
1053            }
1054        }
1055        this.next();
1056        return tok;
1057    }
1058
1059    Token getTokenForShorthand(int ch) {
1060        Token tok;
1061        switch (ch) {
1062          case 'd':
1063            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1064                ? Token.getRange("Nd", true) : Token.token_0to9;
1065            break;
1066          case 'D':
1067            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1068                ? Token.getRange("Nd", false) : Token.token_not_0to9;
1069            break;
1070          case 'w':
1071            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1072                ? Token.getRange("IsWord", true) : Token.token_wordchars;
1073            break;
1074          case 'W':
1075            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1076                ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
1077            break;
1078          case 's':
1079            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1080                ? Token.getRange("IsSpace", true) : Token.token_spaces;
1081            break;
1082          case 'S':
1083            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1084                ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1085            break;
1086
1087          default:
1088            throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
1089        }
1090        return tok;
1091    }
1092
1093    /**
1094     */
1095    int decodeEscaped() throws ParseException {
1096        if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
1097        int c = this.chardata;
1098        switch (c) {
1099          case 'e':  c = 0x1b;  break; // ESCAPE U+001B
1100          case 'f':  c = '\f';  break; // FORM FEED U+000C
1101          case 'n':  c = '\n';  break; // LINE FEED U+000A
1102          case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
1103          case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
1104          //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
1105          case 'x':
1106            this.next();
1107            if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1108            if (this.chardata == '{') {
1109                int v1 = 0;
1110                int uv = 0;
1111                do {
1112                    this.next();
1113                    if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1114                    if ((v1 = hexChar(this.chardata)) < 0)
1115                        break;
1116                    if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1117                    uv = uv*16+v1;
1118                } while (true);
1119                if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
1120                if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
1121                c = uv;
1122            } else {
1123                int v1 = 0;
1124                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1125                    throw ex("parser.descape.1", this.offset-1);
1126                int uv = v1;
1127                this.next();
1128                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1129                    throw ex("parser.descape.1", this.offset-1);
1130                uv = uv*16+v1;
1131                c = uv;
1132            }
1133            break;
1134
1135          case 'u':
1136            int v1 = 0;
1137            this.next();
1138            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1139                throw ex("parser.descape.1", this.offset-1);
1140            int uv = v1;
1141            this.next();
1142            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1143                throw ex("parser.descape.1", this.offset-1);
1144            uv = uv*16+v1;
1145            this.next();
1146            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1147                throw ex("parser.descape.1", this.offset-1);
1148            uv = uv*16+v1;
1149            this.next();
1150            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1151                throw ex("parser.descape.1", this.offset-1);
1152            uv = uv*16+v1;
1153            c = uv;
1154            break;
1155
1156          case 'v':
1157            this.next();
1158            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1159                throw ex("parser.descape.1", this.offset-1);
1160            uv = v1;
1161            this.next();
1162            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1163                throw ex("parser.descape.1", this.offset-1);
1164            uv = uv*16+v1;
1165            this.next();
1166            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1167                throw ex("parser.descape.1", this.offset-1);
1168            uv = uv*16+v1;
1169            this.next();
1170            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1171                throw ex("parser.descape.1", this.offset-1);
1172            uv = uv*16+v1;
1173            this.next();
1174            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1175                throw ex("parser.descape.1", this.offset-1);
1176            uv = uv*16+v1;
1177            this.next();
1178            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1179                throw ex("parser.descape.1", this.offset-1);
1180            uv = uv*16+v1;
1181            if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
1182            c = uv;
1183            break;
1184          case 'A':
1185          case 'Z':
1186          case 'z':
1187            throw ex("parser.descape.5", this.offset-2);
1188          default:
1189        }
1190        return c;
1191    }
1192
1193    static private final int hexChar(int ch) {
1194        if (ch < '0')  return -1;
1195        if (ch > 'f')  return -1;
1196        if (ch <= '9')  return ch-'0';
1197        if (ch < 'A')  return -1;
1198        if (ch <= 'F')  return ch-'A'+10;
1199        if (ch < 'a')  return -1;
1200        return ch-'a'+10;
1201    }
1202
1203    static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
1204        final int[] caseMap = CaseInsensitiveMap.get(c);
1205        tok.addRange(c, c);
1206
1207        if (caseMap != null) {
1208            for (int i=0; i<caseMap.length; i+=2) {
1209                tok.addRange(caseMap[i], caseMap[i]);
1210            }
1211        }
1212
1213    }
1214
1215    static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
1216        int[] caseMap;
1217        int r1, r2;
1218        if (start <= end) {
1219            r1 = start;
1220            r2 = end;
1221        } else {
1222            r1 = end;
1223            r2 = start;
1224        }
1225
1226        tok.addRange(r1, r2);
1227        for (int ch = r1;  ch <= r2;  ch++) {
1228            caseMap = CaseInsensitiveMap.get(ch);
1229            if (caseMap != null) {
1230                for (int i=0; i<caseMap.length; i+=2) {
1231                    tok.addRange(caseMap[i], caseMap[i]);
1232                }
1233            }
1234        }
1235    }
1236}
1237