1/*
2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3 */
4/*
5 * Licensed to the Apache Software Foundation (ASF) under one or more
6 * contributor license agreements.  See the NOTICE file distributed with
7 * this work for additional information regarding copyright ownership.
8 * The ASF licenses this file to You under the Apache License, Version 2.0
9 * (the "License"); you may not use this file except in compliance with
10 * the License.  You may obtain a copy of the License at
11 *
12 *      http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22
23import java.text.CharacterIterator;
24import java.util.Locale;
25import java.util.Stack;
26
27import com.sun.org.apache.xerces.internal.util.IntStack;
28
29/**
30 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
31 * This engine does not conform to the POSIX regular expression.
32 *
33 * <hr width="50%">
34 * <h3>How to use</h3>
35 *
36 * <dl>
37 *   <dt>A. Standard way
38 *   <dd>
39 * <pre>
40 * RegularExpression re = new RegularExpression(<var>regex</var>);
41 * if (re.matches(text)) { ... }
42 * </pre>
43 *
44 *   <dt>B. Capturing groups
45 *   <dd>
46 * <pre>
47 * RegularExpression re = new RegularExpression(<var>regex</var>);
48 * Match match = new Match();
49 * if (re.matches(text, match)) {
50 *     ... // You can refer captured texts with methods of the <code>Match</code> class.
51 * }
52 * </pre>
53 *
54 * </dl>
55 *
56 * <h4>Case-insensitive matching</h4>
57 * <pre>
58 * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
59 * if (re.matches(text) >= 0) { ...}
60 * </pre>
61 *
62 * <h4>Options</h4>
63 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
64 *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
65 *    This <var>options</var> parameter consists of the following characters.
66 * </p>
67 * <dl>
68 *   <dt><a name="I_OPTION"><code>"i"</code></a>
69 *   <dd>This option indicates case-insensitive matching.
70 *   <dt><a name="M_OPTION"><code>"m"</code></a>
71 *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
72 *   <dt><a name="S_OPTION"><code>"s"</code></a>
73 *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
74 *   <dt><a name="U_OPTION"><code>"u"</code></a>
75 *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
76 *   <dt><a name="W_OPTION"><code>"w"</code></a>
77 *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
78 *      'Unicode Regular Expression Guidelines' Revision 4.
79 *      When "w" and "u" are specified at the same time,
80 *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
81 *   <dt><a name="COMMA_OPTION"><code>","</code></a>
82 *   <dd>The parser treats a comma in a character class as a range separator.
83 *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
84 *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
85 *
86 *   <dt><a name="X_OPTION"><code>"X"</code></a>
87 *   <dd class="REGEX">
88 *       By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
89 *       The <code>match()</code> method does not do subsring matching
90 *       but entire string matching.
91 *
92 * </dl>
93 *
94 * <hr width="50%">
95 * <h3>Syntax</h3>
96 * <table border="1" bgcolor="#ddeeff">
97 *   <tr>
98 *    <td>
99 *     <h4>Differences from the Perl 5 regular expression</h4>
100 *     <ul>
101 *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
102 *      <li>Supports subtraction, union, and intersection operations for character classes.
103 *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
104 *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
105 *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
106 *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
107 *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
108 *     </ul>
109 *    </td>
110 *   </tr>
111 * </table>
112 *
113 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
114 * <ul>
115 *   <li>Character
116 *     <dl>
117 *       <dt class="REGEX"><kbd>.</kbd> (A period)
118 *       <dd>Matches any one character except the following characters.
119 *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
120 *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
121 *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
122 *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
123 *           it matches any character including the above four characters.
124 *
125 *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
126 *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
127 *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
128 *
129 *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
130 *       <dd>Matches a control character.
131 *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
132 *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
133 *           It matches a control character of which the character code is less than
134 *           the character code of the <var>C</var> by 0x0040.
135 *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
136 *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
137 *
138 *       <dt class="REGEX">a non-meta character
139 *       <dd>Matches the character.
140 *
141 *       <dt class="REGEX"><KBD>\</KBD> + a meta character
142 *       <dd>Matches the meta character.
143 *
144 *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
145 *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
146 *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
147 *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
148 *
149 *       <!--
150 *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
151 *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
152 *       -->
153 *
154 *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
155 *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
156 *
157 *       <dt class="REGEX"><kbd>\g</kbd>
158 *       <dd>Matches a grapheme.
159 *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
160 *
161 *       <dt class="REGEX"><kbd>\X</kbd>
162 *       <dd class="REGEX">Matches a combining character sequence.
163 *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
164 *     </dl>
165 *   </li>
166 *
167 *   <li>Character class
168 *     <dl>
169+ *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
170+ *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
171 *       <dd>Positive character class.  It matches a character in ranges.
172 *       <dd><var>R<sub>n</sub></var>:
173 *       <ul>
174 *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
175 *             <p>This range matches the character.
176 *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
177 *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
178+ *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
179+ *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
180 *             <p>...
181 *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
182 *             <p>These expressions specifies the same ranges as the following expressions.
183 *       </ul>
184 *       <p class="REGEX">Enumerated ranges are merged (union operation).
185 *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
186 *
187 *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
188 *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
189 *       <dd>Negative character class.  It matches a character not in ranges.
190 *
191 *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
192 *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
193 *       <dd>Subtraction or union or intersection for character classes.
194 *       <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
195 *       <dd>The result of this operations is a <u>positive character class</u>
196 *           even if an expression includes any negative character classes.
197 *           You have to take care on this in case-insensitive matching.
198 *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
199 *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
200 *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
201 *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
202 *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
203 *
204 *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
205 *       <dd>Character class subtraction for the XML Schema.
206 *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
207 *
208 *       <dt class="REGEX"><kbd>\d</kbd>
209 *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
210 *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
211 *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
212 *
213 *       <dt class="REGEX"><kbd>\D</kbd>
214 *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
215 *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
216 *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
217 *
218 *       <dt class="REGEX"><kbd>\s</kbd>
219 *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
220 *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
221 *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
222 *
223 *       <dt class="REGEX"><kbd>\S</kbd>
224 *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
225 *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
226 *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
227 *
228 *       <dt class="REGEX"><kbd>\w</kbd>
229 *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
230 *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
231 *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
232 *
233 *       <dt class="REGEX"><kbd>\W</kbd>
234 *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
235 *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
236 *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
237 *
238 *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
239 *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
240 *       The following names are available:
241 *       <dl>
242 *         <dt>Unicode General Categories:
243 *         <dd><kbd>
244 *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
245 *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
246 *         </kbd>
247 *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
248 *         <dt>Unicode Blocks:
249 *         <dd><kbd>
250 *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
251 *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
252 *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
253 *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
254 *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
255 *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
256 *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
257 *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
258 *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
259 *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
260 *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
261 *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
262 *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
263 *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
264 *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
265 *       Small Form Variants, Arabic Presentation Forms-B, Specials,
266 *       Halfwidth and Fullwidth Forms
267 *         </kbd>
268 *         <dt>Others:
269 *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
270 *         <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
271 *         <dd><kbd>UNASSGINED</kbd>
272 *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
273 *       </dl>
274 *
275 *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
276 *       <dd>Matches one character not in the specified General Category or the specified Block.
277 *     </dl>
278 *   </li>
279 *
280 *   <li>Selection and Quantifier
281 *     <dl>
282 *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
283 *       <dd>...
284 *
285 *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
286 *       <dd>Matches 0 or more <var>X</var>.
287 *
288 *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
289 *       <dd>Matches 1 or more <var>X</var>.
290 *
291 *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
292 *       <dd>Matches 0 or 1 <var>X</var>.
293 *
294 *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
295 *       <dd>Matches <var>number</var> times.
296 *
297 *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
298 *       <dd>...
299 *
300 *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
301 *       <dd>...
302 *
303 *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
304 *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
305 *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
306 *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
307 *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
308 *       <dd>Non-greedy matching.
309 *     </dl>
310 *   </li>
311 *
312 *   <li>Grouping, Capturing, and Back-reference
313 *     <dl>
314 *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
315 *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
316 *       If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
317 *       you have to write "<KBD>(?:foo)+</KBD>".
318 *
319 *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
320 *       <dd>Grouping with capturing.
321 * It make a group and applications can know
322 * where in target text a group matched with methods of a <code>Match</code> instance
323 * after <code><a href="#matches(java.lang.String, com.sun.org.apache.xerces.internal.utils.regex.Match)">matches(String,Match)</a></code>.
324 * The 0th group means whole of this regular expression.
325 * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
326 *
327 *   <p>For instance, a regular expression is
328 *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
329 *   and target text is
330 *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
331 *   <ul>
332 *     <li><code>Match.getCapturedText(0)</code>:
333 *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
334 *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
335 *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
336 *   </ul>
337 *
338 *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
339 *       <dd>
340 *
341 *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
342 *       <dd>Independent expression group. ................
343 *
344 *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
345 *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
346 *       <dd>............................
347 *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
348 *           Note that it can not contain 'u'.
349 *
350 *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
351 *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
352 *       <dd>......
353 *       <dd>These expressions must be at the beginning of a group.
354 *     </dl>
355 *   </li>
356 *
357 *   <li>Anchor
358 *     <dl>
359 *       <dt class="REGEX"><kbd>\A</kbd>
360 *       <dd>Matches the beginnig of the text.
361 *
362 *       <dt class="REGEX"><kbd>\Z</kbd>
363 *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
364 *           or CARRIAGE RETURN + LINE FEED at the end of the text.
365 *
366 *       <dt class="REGEX"><kbd>\z</kbd>
367 *       <dd>Matches the end of the text.
368 *
369 *       <dt class="REGEX"><kbd>^</kbd>
370 *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
371 *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
372 *           it matches the beginning of the text, or after one of EOL characters (
373 *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
374 *           PARAGRAPH SEPARATOR (U+2029).)
375 *
376 *       <dt class="REGEX"><kbd>$</kbd>
377 *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
378 *           or CARRIAGE RETURN + LINE FEED at the end of the text.
379 *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
380 *           it matches the end of the text, or before an EOL character.
381 *
382 *       <dt class="REGEX"><kbd>\b</kbd>
383 *       <dd>Matches word boundary.
384 *           (See <a href="#W_OPTION">a "w" option</a>)
385 *
386 *       <dt class="REGEX"><kbd>\B</kbd>
387 *       <dd>Matches non word boundary.
388 *           (See <a href="#W_OPTION">a "w" option</a>)
389 *
390 *       <dt class="REGEX"><kbd>\&lt;</kbd>
391 *       <dd>Matches the beginning of a word.
392 *           (See <a href="#W_OPTION">a "w" option</a>)
393 *
394 *       <dt class="REGEX"><kbd>\&gt;</kbd>
395 *       <dd>Matches the end of a word.
396 *           (See <a href="#W_OPTION">a "w" option</a>)
397 *     </dl>
398 *   </li>
399 *   <li>Lookahead and lookbehind
400 *     <dl>
401 *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
402 *       <dd>Lookahead.
403 *
404 *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
405 *       <dd>Negative lookahead.
406 *
407 *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
408 *       <dd>Lookbehind.
409 *       <dd>(Note for text capturing......)
410 *
411 *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
412 *       <dd>Negative lookbehind.
413 *     </dl>
414 *   </li>
415 *
416 *   <li>Misc.
417 *     <dl>
418 *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
419 *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
420 *       <dd>......
421 *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
422 *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
423 *           You can not write comments in character classes and before quantifiers.
424 *     </dl>
425 *   </li>
426 * </ul>
427 *
428 *
429 * <hr width="50%">
430 * <h3>BNF for the regular expression</h3>
431 * <pre>
432 * regex ::= ('(?' options ')')? term ('|' term)*
433 * term ::= factor+
434 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
435 *            | '(?#' [^)]* ')'
436 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
437 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
438 *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
439 *          | '(?>' regex ')' | '(?' options ':' regex ')'
440 *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
441 * options ::= [imsw]* ('-' [imsw]+)?
442 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
443 * looks ::= '(?=' regex ')'  | '(?!' regex ')'
444 *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
445 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
446 * category-block ::= '\' [pP] category-symbol-1
447 *                    | ('\p{' | '\P{') (category-symbol | block-name
448 *                                       | other-properties) '}'
449 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
450 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
451 *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
452 *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
453 *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
454 *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
455 * block-name ::= (See above)
456 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
457 * character-1 ::= (any character except meta-characters)
458 *
459 * char-class ::= '[' ranges ']'
460 *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
461 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
462 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
463 *           | range-char | range-char '-' range-char
464 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
465 * code-point ::= '\x' hex-char hex-char
466 *                | '\x{' hex-char+ '}'
467 * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
468 * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
469 * hex-char ::= [0-9a-fA-F]
470 * character-2 ::= (any character except \[]-,)
471 * </pre>
472 *
473 * <hr width="50%">
474 * <h3>TODO</h3>
475 * <ul>
476 *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
477 *     <ul>
478 *       <li>2.4 Canonical Equivalents
479 *       <li>Level 3
480 *     </ul>
481 *   <li>Parsing performance
482 * </ul>
483 *
484 * <hr width="50%">
485 *
486 * @xerces.internal
487 *
488 * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
489 */
490public class RegularExpression implements java.io.Serializable {
491
492    private static final long serialVersionUID = 6242499334195006401L;
493
494    static final boolean DEBUG = false;
495
496    /**
497     * Compiles a token tree into an operation flow.
498     */
499    private synchronized void compile(Token tok) {
500        if (this.operations != null)
501            return;
502        this.numberOfClosures = 0;
503        this.operations = this.compile(tok, null, false);
504    }
505
506    /**
507     * Converts a token to an operation.
508     */
509    private Op compile(Token tok, Op next, boolean reverse) {
510        Op ret;
511        switch (tok.type) {
512        case Token.DOT:
513            ret = Op.createDot();
514            ret.next = next;
515            break;
516
517        case Token.CHAR:
518            ret = Op.createChar(tok.getChar());
519            ret.next = next;
520            break;
521
522        case Token.ANCHOR:
523            ret = Op.createAnchor(tok.getChar());
524            ret.next = next;
525            break;
526
527        case Token.RANGE:
528        case Token.NRANGE:
529            ret = Op.createRange(tok);
530            ret.next = next;
531            break;
532
533        case Token.CONCAT:
534            ret = next;
535            if (!reverse) {
536                for (int i = tok.size()-1;  i >= 0;  i --) {
537                    ret = compile(tok.getChild(i), ret, false);
538                }
539            } else {
540                for (int i = 0;  i < tok.size();  i ++) {
541                    ret = compile(tok.getChild(i), ret, true);
542                }
543            }
544            break;
545
546        case Token.UNION:
547            Op.UnionOp uni = Op.createUnion(tok.size());
548            for (int i = 0;  i < tok.size();  i ++) {
549                uni.addElement(compile(tok.getChild(i), next, reverse));
550            }
551            ret = uni;                          // ret.next is null.
552            break;
553
554        case Token.CLOSURE:
555        case Token.NONGREEDYCLOSURE:
556            Token child = tok.getChild(0);
557            int min = tok.getMin();
558            int max = tok.getMax();
559            if (min >= 0 && min == max) { // {n}
560                ret = next;
561                for (int i = 0; i < min;  i ++) {
562                    ret = compile(child, ret, reverse);
563                }
564                break;
565            }
566            if (min > 0 && max > 0)
567                max -= min;
568            if (max > 0) {
569                // X{2,6} -> XX(X(X(XX?)?)?)?
570                ret = next;
571                for (int i = 0;  i < max;  i ++) {
572                    Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
573                    q.next = next;
574                    q.setChild(compile(child, ret, reverse));
575                    ret = q;
576                }
577            } else {
578                Op.ChildOp op;
579                if (tok.type == Token.NONGREEDYCLOSURE) {
580                    op = Op.createNonGreedyClosure();
581                } else {                        // Token.CLOSURE
582                    op = Op.createClosure(this.numberOfClosures++);
583                }
584                op.next = next;
585                op.setChild(compile(child, op, reverse));
586                ret = op;
587            }
588            if (min > 0) {
589                for (int i = 0;  i < min;  i ++) {
590                    ret = compile(child, ret, reverse);
591                }
592            }
593            break;
594
595        case Token.EMPTY:
596            ret = next;
597            break;
598
599        case Token.STRING:
600            ret = Op.createString(tok.getString());
601            ret.next = next;
602            break;
603
604        case Token.BACKREFERENCE:
605            ret = Op.createBackReference(tok.getReferenceNumber());
606            ret.next = next;
607            break;
608
609        case Token.PAREN:
610            if (tok.getParenNumber() == 0) {
611                ret = compile(tok.getChild(0), next, reverse);
612            } else if (reverse) {
613                next = Op.createCapture(tok.getParenNumber(), next);
614                next = compile(tok.getChild(0), next, reverse);
615                ret = Op.createCapture(-tok.getParenNumber(), next);
616            } else {
617                next = Op.createCapture(-tok.getParenNumber(), next);
618                next = compile(tok.getChild(0), next, reverse);
619                ret = Op.createCapture(tok.getParenNumber(), next);
620            }
621            break;
622
623        case Token.LOOKAHEAD:
624            ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
625            break;
626        case Token.NEGATIVELOOKAHEAD:
627            ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
628            break;
629        case Token.LOOKBEHIND:
630            ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
631            break;
632        case Token.NEGATIVELOOKBEHIND:
633            ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
634            break;
635
636        case Token.INDEPENDENT:
637            ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
638            break;
639
640        case Token.MODIFIERGROUP:
641            ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
642                                    ((Token.ModifierToken)tok).getOptions(),
643                                    ((Token.ModifierToken)tok).getOptionsMask());
644            break;
645
646        case Token.CONDITION:
647            Token.ConditionToken ctok = (Token.ConditionToken)tok;
648            int ref = ctok.refNumber;
649            Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
650            Op yes = compile(ctok.yes, next, reverse);
651            Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
652            ret = Op.createCondition(next, ref, condition, yes, no);
653            break;
654
655        default:
656            throw new RuntimeException("Unknown token type: "+tok.type);
657        } // switch (tok.type)
658        return ret;
659    }
660
661
662//Public
663
664    /**
665     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
666     *
667     * @return true if the target is matched to this regular expression.
668     */
669    public boolean matches(char[]  target) {
670        return this.matches(target, 0,  target .length , (Match)null);
671    }
672
673    /**
674     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
675     * in specified range or not.
676     *
677     * @param start Start offset of the range.
678     * @param end  End offset +1 of the range.
679     * @return true if the target is matched to this regular expression.
680     */
681    public boolean matches(char[]  target, int start, int end) {
682        return this.matches(target, start, end, (Match)null);
683    }
684
685    /**
686     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
687     *
688     * @param match A Match instance for storing matching result.
689     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
690     */
691    public boolean matches(char[]  target, Match match) {
692        return this.matches(target, 0,  target .length , match);
693    }
694
695
696    /**
697     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
698     * in specified range or not.
699     *
700     * @param start Start offset of the range.
701     * @param end  End offset +1 of the range.
702     * @param match A Match instance for storing matching result.
703     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
704     */
705    public boolean matches(char[] target, int start, int end, Match match) {
706
707        synchronized (this) {
708            if (this.operations == null)
709                this.prepare();
710            if (this.context == null)
711                this.context = new Context();
712        }
713        Context con = null;
714        synchronized (this.context) {
715            con = this.context.inuse ? new Context() : this.context;
716            con.reset(target, start, end, this.numberOfClosures);
717        }
718        if (match != null) {
719            match.setNumberOfGroups(this.nofparen);
720            match.setSource(target);
721        } else if (this.hasBackReferences) {
722            match = new Match();
723            match.setNumberOfGroups(this.nofparen);
724            // Need not to call setSource() because
725            // a caller can not access this match instance.
726        }
727        con.match = match;
728
729        if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
730            int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
731            //System.err.println("DEBUG: matchEnd="+matchEnd);
732            if (matchEnd == con.limit) {
733                if (con.match != null) {
734                    con.match.setBeginning(0, con.start);
735                    con.match.setEnd(0, matchEnd);
736                }
737                con.setInUse(false);
738                return true;
739            }
740            return false;
741        }
742
743        /*
744         * The pattern has only fixed string.
745         * The engine uses Boyer-Moore.
746         */
747        if (this.fixedStringOnly) {
748            //System.err.println("DEBUG: fixed-only: "+this.fixedString);
749            int o = this.fixedStringTable.matches(target, con.start, con.limit);
750            if (o >= 0) {
751                if (con.match != null) {
752                    con.match.setBeginning(0, o);
753                    con.match.setEnd(0, o+this.fixedString.length());
754                }
755                con.setInUse(false);
756                return true;
757            }
758            con.setInUse(false);
759            return false;
760        }
761
762        /*
763         * The pattern contains a fixed string.
764         * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
765         * If not, it return with false.
766         */
767        if (this.fixedString != null) {
768            int o = this.fixedStringTable.matches(target, con.start, con.limit);
769            if (o < 0) {
770                //System.err.println("Non-match in fixed-string search.");
771                con.setInUse(false);
772                return false;
773            }
774        }
775
776        int limit = con.limit-this.minlength;
777        int matchStart;
778        int matchEnd = -1;
779
780        /*
781         * Checks whether the expression starts with ".*".
782         */
783        if (this.operations != null
784            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
785            if (isSet(this.options, SINGLE_LINE)) {
786                matchStart = con.start;
787                matchEnd = this. match(con, this.operations, con.start, 1, this.options);
788            } else {
789                boolean previousIsEOL = true;
790                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
791                    int ch =  target [  matchStart ] ;
792                    if (isEOLChar(ch)) {
793                        previousIsEOL = true;
794                    } else {
795                        if (previousIsEOL) {
796                            if (0 <= (matchEnd = this. match(con, this.operations,
797                                                             matchStart, 1, this.options)))
798                                break;
799                        }
800                        previousIsEOL = false;
801                    }
802                }
803            }
804        }
805
806        /*
807         * Optimization against the first character.
808         */
809        else if (this.firstChar != null) {
810            //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
811            RangeToken range = this.firstChar;
812            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
813                int ch =  target [matchStart] ;
814                if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
815                    ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
816                }
817                if (!range.match(ch))  {
818                    continue;
819                }
820                if (0 <= (matchEnd = this. match(con, this.operations,
821                                                 matchStart, 1, this.options))) {
822                        break;
823                }
824            }
825        }
826
827        /*
828         * Straightforward matching.
829         */
830        else {
831            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
832                if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
833                    break;
834            }
835        }
836
837        if (matchEnd >= 0) {
838            if (con.match != null) {
839                con.match.setBeginning(0, matchStart);
840                con.match.setEnd(0, matchEnd);
841            }
842            con.setInUse(false);
843            return true;
844        } else {
845            con.setInUse(false);
846            return false;
847        }
848    }
849
850    /**
851     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
852     *
853     * @return true if the target is matched to this regular expression.
854     */
855    public boolean matches(String  target) {
856        return this.matches(target, 0,  target .length() , (Match)null);
857    }
858
859    /**
860     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
861     * in specified range or not.
862     *
863     * @param start Start offset of the range.
864     * @param end  End offset +1 of the range.
865     * @return true if the target is matched to this regular expression.
866     */
867    public boolean matches(String  target, int start, int end) {
868        return this.matches(target, start, end, (Match)null);
869    }
870
871    /**
872     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
873     *
874     * @param match A Match instance for storing matching result.
875     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
876     */
877    public boolean matches(String  target, Match match) {
878        return this.matches(target, 0,  target .length() , match);
879    }
880
881    /**
882     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
883     * in specified range or not.
884     *
885     * @param start Start offset of the range.
886     * @param end  End offset +1 of the range.
887     * @param match A Match instance for storing matching result.
888     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
889     */
890    public boolean matches(String  target, int start, int end, Match match) {
891
892        synchronized (this) {
893            if (this.operations == null)
894                this.prepare();
895            if (this.context == null)
896                this.context = new Context();
897        }
898        Context con = null;
899        synchronized (this.context) {
900            con = this.context.inuse ? new Context() : this.context;
901            con.reset(target, start, end, this.numberOfClosures);
902        }
903        if (match != null) {
904            match.setNumberOfGroups(this.nofparen);
905            match.setSource(target);
906        } else if (this.hasBackReferences) {
907            match = new Match();
908            match.setNumberOfGroups(this.nofparen);
909            // Need not to call setSource() because
910            // a caller can not access this match instance.
911        }
912        con.match = match;
913
914        if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
915            if (DEBUG) {
916                System.err.println("target string="+target);
917            }
918            int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
919            if (DEBUG) {
920                System.err.println("matchEnd="+matchEnd);
921                System.err.println("con.limit="+con.limit);
922            }
923            if (matchEnd == con.limit) {
924                if (con.match != null) {
925                    con.match.setBeginning(0, con.start);
926                    con.match.setEnd(0, matchEnd);
927                }
928                con.setInUse(false);
929                return true;
930            }
931            return false;
932        }
933
934        /*
935         * The pattern has only fixed string.
936         * The engine uses Boyer-Moore.
937         */
938        if (this.fixedStringOnly) {
939            //System.err.println("DEBUG: fixed-only: "+this.fixedString);
940            int o = this.fixedStringTable.matches(target, con.start, con.limit);
941            if (o >= 0) {
942                if (con.match != null) {
943                    con.match.setBeginning(0, o);
944                    con.match.setEnd(0, o+this.fixedString.length());
945                }
946                con.setInUse(false);
947                return true;
948            }
949            con.setInUse(false);
950            return false;
951        }
952
953        /*
954         * The pattern contains a fixed string.
955         * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
956         * If not, it return with false.
957         */
958        if (this.fixedString != null) {
959            int o = this.fixedStringTable.matches(target, con.start, con.limit);
960            if (o < 0) {
961                //System.err.println("Non-match in fixed-string search.");
962                con.setInUse(false);
963                return false;
964            }
965        }
966
967        int limit = con.limit-this.minlength;
968        int matchStart;
969        int matchEnd = -1;
970
971        /*
972         * Checks whether the expression starts with ".*".
973         */
974        if (this.operations != null
975            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
976            if (isSet(this.options, SINGLE_LINE)) {
977                matchStart = con.start;
978                matchEnd = this.match(con, this.operations, con.start, 1, this.options);
979            } else {
980                boolean previousIsEOL = true;
981                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
982                    int ch =  target .charAt(  matchStart ) ;
983                    if (isEOLChar(ch)) {
984                        previousIsEOL = true;
985                    } else {
986                        if (previousIsEOL) {
987                            if (0 <= (matchEnd = this.match(con, this.operations,
988                                                            matchStart, 1, this.options)))
989                                break;
990                        }
991                        previousIsEOL = false;
992                    }
993                }
994            }
995        }
996
997        /*
998         * Optimization against the first character.
999         */
1000        else if (this.firstChar != null) {
1001            //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1002            RangeToken range = this.firstChar;
1003            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1004                int ch =  target .charAt(  matchStart ) ;
1005                if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1006                    ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
1007                }
1008                if (!range.match(ch)) {
1009                    continue;
1010                }
1011                if (0 <= (matchEnd = this.match(con, this.operations,
1012                                                matchStart, 1, this.options))) {
1013                        break;
1014                }
1015            }
1016        }
1017
1018        /*
1019         * Straightforward matching.
1020         */
1021        else {
1022            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1023                if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options)))
1024                    break;
1025            }
1026        }
1027
1028        if (matchEnd >= 0) {
1029            if (con.match != null) {
1030                con.match.setBeginning(0, matchStart);
1031                con.match.setEnd(0, matchEnd);
1032            }
1033            con.setInUse(false);
1034            return true;
1035        } else {
1036            con.setInUse(false);
1037            return false;
1038        }
1039    }
1040
1041    /**
1042     * @return -1 when not match; offset of the end of matched string when match.
1043     */
1044    @SuppressWarnings("fallthrough")
1045    private int match(Context con, Op op, int offset, int dx, int opts) {
1046        final ExpressionTarget target = con.target;
1047        final Stack<Op> opStack = new Stack<>();
1048        final IntStack dataStack = new IntStack();
1049        final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
1050        int retValue = -1;
1051        boolean returned = false;
1052
1053        for (;;) {
1054            if (op == null || offset > con.limit || offset < con.start) {
1055                if (op == null) {
1056                    retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1057                }
1058                else {
1059                   retValue = -1;
1060                }
1061                returned = true;
1062            }
1063            else  {
1064                retValue = -1;
1065                // dx value is either 1 or -1
1066                switch (op.type) {
1067                case Op.CHAR:
1068                    {
1069                        final int o1 = (dx > 0) ? offset : offset -1;
1070                        if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
1071                            returned = true;
1072                            break;
1073                        }
1074                        offset += dx;
1075                        op = op.next;
1076                    }
1077                    break;
1078
1079                case Op.DOT:
1080                    {
1081                        int o1 = (dx > 0) ? offset : offset - 1;
1082                        if (o1 >= con.limit || o1 < 0) {
1083                            returned = true;
1084                            break;
1085                        }
1086                        if (isSet(opts, SINGLE_LINE)) {
1087                            if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
1088                                o1 += dx;
1089                            }
1090                        }
1091                        else {
1092                            int ch = target.charAt(o1);
1093                            if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
1094                                o1 += dx;
1095                                ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1096                            }
1097                            if (isEOLChar(ch)) {
1098                                returned = true;
1099                                break;
1100                            }
1101                        }
1102                        offset = (dx > 0) ? o1 + 1 : o1;
1103                        op = op.next;
1104                    }
1105                    break;
1106
1107                case Op.RANGE:
1108                case Op.NRANGE:
1109                    {
1110                        int o1 = (dx > 0) ? offset : offset -1;
1111                        if (o1 >= con.limit || o1 < 0) {
1112                            returned = true;
1113                            break;
1114                        }
1115                        int ch = target.charAt(offset);
1116                        if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
1117                            o1 += dx;
1118                            ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1119                        }
1120                        final RangeToken tok = op.getToken();
1121                        if (!tok.match(ch)) {
1122                            returned = true;
1123                            break;
1124                        }
1125                        offset = (dx > 0) ? o1+1 : o1;
1126                        op = op.next;
1127                    }
1128                    break;
1129
1130                case Op.ANCHOR:
1131                    {
1132                        if (!matchAnchor(target, op, con, offset, opts)) {
1133                            returned = true;
1134                            break;
1135                        }
1136                        op = op.next;
1137                    }
1138                    break;
1139
1140                case Op.BACKREFERENCE:
1141                    {
1142                        int refno = op.getData();
1143                        if (refno <= 0 || refno >= this.nofparen) {
1144                            throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1145                        }
1146                        if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
1147                            returned = true;
1148                            break;
1149                        }
1150                        int o2 = con.match.getBeginning(refno);
1151                        int literallen = con.match.getEnd(refno)-o2;
1152                        if (dx > 0) {
1153                            if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
1154                                returned = true;
1155                                break;
1156                            }
1157                            offset += literallen;
1158                        }
1159                        else {
1160                            if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
1161                                returned = true;
1162                                break;
1163                            }
1164                            offset -= literallen;
1165                        }
1166                        op = op.next;
1167                    }
1168                    break;
1169
1170                case Op.STRING:
1171                    {
1172                        String literal = op.getString();
1173                        int literallen = literal.length();
1174                        if (dx > 0) {
1175                            if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
1176                                returned = true;
1177                                break;
1178                            }
1179                            offset += literallen;
1180                        }
1181                        else {
1182                            if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
1183                                returned = true;
1184                                break;
1185                            }
1186                            offset -= literallen;
1187                        }
1188                        op = op.next;
1189                    }
1190                    break;
1191
1192                case Op.CLOSURE:
1193                    {
1194                        // Saves current position to avoid zero-width repeats.
1195                        final int id = op.getData();
1196                        if (con.closureContexts[id].contains(offset)) {
1197                            returned = true;
1198                            break;
1199                        }
1200
1201                        con.closureContexts[id].addOffset(offset);
1202                    }
1203                    // fall through
1204
1205                case Op.QUESTION:
1206                    {
1207                        opStack.push(op);
1208                        dataStack.push(offset);
1209                        op = op.getChild();
1210                    }
1211                    break;
1212
1213                case Op.NONGREEDYCLOSURE:
1214                case Op.NONGREEDYQUESTION:
1215                    {
1216                        opStack.push(op);
1217                        dataStack.push(offset);
1218                        op = op.next;
1219                    }
1220                    break;
1221
1222                case Op.UNION:
1223                    if (op.size() == 0) {
1224                        returned = true;
1225                    }
1226                    else {
1227                        opStack.push(op);
1228                        dataStack.push(0);
1229                        dataStack.push(offset);
1230                        op = op.elementAt(0);
1231                    }
1232                    break;
1233
1234                case Op.CAPTURE:
1235                    {
1236                        final int refno = op.getData();
1237                        if (con.match != null) {
1238                            if (refno > 0) {
1239                                dataStack.push(con.match.getBeginning(refno));
1240                                con.match.setBeginning(refno, offset);
1241                            }
1242                            else {
1243                                final int index = -refno;
1244                                dataStack.push(con.match.getEnd(index));
1245                                con.match.setEnd(index, offset);
1246                            }
1247                            opStack.push(op);
1248                            dataStack.push(offset);
1249                        }
1250                        op = op.next;
1251                    }
1252                    break;
1253
1254                case Op.LOOKAHEAD:
1255                case Op.NEGATIVELOOKAHEAD:
1256                case Op.LOOKBEHIND:
1257                case Op.NEGATIVELOOKBEHIND:
1258                    {
1259                        opStack.push(op);
1260                        dataStack.push(dx);
1261                        dataStack.push(offset);
1262                        dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
1263                        op = op.getChild();
1264                    }
1265                    break;
1266
1267                case Op.INDEPENDENT:
1268                    {
1269                        opStack.push(op);
1270                        dataStack.push(offset);
1271                        op = op.getChild();
1272                    }
1273                    break;
1274
1275                case Op.MODIFIER:
1276                    {
1277                        int localopts = opts;
1278                        localopts |= op.getData();
1279                        localopts &= ~op.getData2();
1280                        opStack.push(op);
1281                        dataStack.push(opts);
1282                        dataStack.push(offset);
1283                        opts = localopts;
1284                        op = op.getChild();
1285                    }
1286                    break;
1287
1288                case Op.CONDITION:
1289                    {
1290                        Op.ConditionOp cop = (Op.ConditionOp)op;
1291                        if (cop.refNumber > 0) {
1292                            if (cop.refNumber >= this.nofparen) {
1293                                throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1294                            }
1295                            if (con.match.getBeginning(cop.refNumber) >= 0
1296                                    && con.match.getEnd(cop.refNumber) >= 0) {
1297                                op = cop.yes;
1298                            }
1299                            else if (cop.no != null) {
1300                                op = cop.no;
1301                            }
1302                            else {
1303                                op = cop.next;
1304                            }
1305                        }
1306                        else {
1307                            opStack.push(op);
1308                            dataStack.push(offset);
1309                            op = cop.condition;
1310                        }
1311                    }
1312                    break;
1313
1314                default:
1315                    throw new RuntimeException("Unknown operation type: " + op.type);
1316                }
1317            }
1318
1319            // handle recursive operations
1320            while (returned) {
1321                // exhausted all the operations
1322                if (opStack.isEmpty()) {
1323                    return retValue;
1324                }
1325
1326                op = opStack.pop();
1327                offset = dataStack.pop();
1328
1329                switch (op.type) {
1330                case Op.CLOSURE:
1331                case Op.QUESTION:
1332                    if (retValue < 0) {
1333                        op = op.next;
1334                        returned = false;
1335                    }
1336                    break;
1337
1338                case Op.NONGREEDYCLOSURE:
1339                case Op.NONGREEDYQUESTION:
1340                    if (retValue < 0) {
1341                        op = op.getChild();
1342                        returned = false;
1343                    }
1344                    break;
1345
1346                case Op.UNION:
1347                    {
1348                        int unionIndex = dataStack.pop();
1349                        if (DEBUG) {
1350                            System.err.println("UNION: "+unionIndex+", ret="+retValue);
1351                        }
1352
1353                        if (retValue < 0) {
1354                            if (++unionIndex < op.size()) {
1355                                opStack.push(op);
1356                                dataStack.push(unionIndex);
1357                                dataStack.push(offset);
1358                                op = op.elementAt(unionIndex);
1359                                returned = false;
1360                            }
1361                            else {
1362                                retValue = -1;
1363                            }
1364                        }
1365                    }
1366                    break;
1367
1368                case Op.CAPTURE:
1369                    final int refno = op.getData();
1370                    final int saved = dataStack.pop();
1371                    if (retValue < 0) {
1372                        if (refno > 0) {
1373                            con.match.setBeginning(refno, saved);
1374                        }
1375                        else {
1376                            con.match.setEnd(-refno, saved);
1377                        }
1378                    }
1379                    break;
1380
1381                case Op.LOOKAHEAD:
1382                case Op.LOOKBEHIND:
1383                    {
1384                        dx = dataStack.pop();
1385                        if (0 <= retValue) {
1386                            op = op.next;
1387                            returned = false;
1388                        }
1389                        retValue = -1;
1390                    }
1391                    break;
1392
1393                case Op.NEGATIVELOOKAHEAD:
1394                case Op.NEGATIVELOOKBEHIND:
1395                    {
1396                        dx = dataStack.pop();
1397                        if (0 > retValue)  {
1398                            op = op.next;
1399                            returned = false;
1400                        }
1401                        retValue = -1;
1402                    }
1403                    break;
1404
1405                case Op.MODIFIER:
1406                    opts = dataStack.pop();
1407                    // fall through
1408
1409                case Op.INDEPENDENT:
1410                    if (retValue >= 0)  {
1411                        offset = retValue;
1412                        op = op.next;
1413                        returned = false;
1414                    }
1415                    break;
1416
1417                case Op.CONDITION:
1418                    {
1419                        final Op.ConditionOp cop = (Op.ConditionOp)op;
1420                        if (0 <= retValue) {
1421                            op = cop.yes;
1422                        }
1423                        else if (cop.no != null) {
1424                            op = cop.no;
1425                        }
1426                        else {
1427                            op = cop.next;
1428                        }
1429                    }
1430                    returned = false;
1431                    break;
1432
1433                default:
1434                    break;
1435                }
1436            }
1437        }
1438    }
1439
1440    private boolean matchChar(int ch, int other, boolean ignoreCase) {
1441        return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
1442    }
1443
1444    boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
1445        boolean go = false;
1446        switch (op.getData()) {
1447        case '^':
1448            if (isSet(opts, MULTIPLE_LINES)) {
1449                if (!(offset == con.start
1450                      || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1))))
1451                    return false;
1452            } else {
1453                if (offset != con.start)
1454                    return false;
1455            }
1456            break;
1457
1458        case '@':                         // Internal use only.
1459            // The @ always matches line beginnings.
1460            if (!(offset == con.start
1461                  || offset > con.start && isEOLChar(target.charAt(offset-1))))
1462                return false;
1463            break;
1464
1465        case '$':
1466            if (isSet(opts, MULTIPLE_LINES)) {
1467                if (!(offset == con.limit
1468                      || offset < con.limit && isEOLChar(target.charAt(offset))))
1469                    return false;
1470            } else {
1471                if (!(offset == con.limit
1472                      || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1473                      || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1474                      &&  target.charAt(offset+1) == LINE_FEED))
1475                    return false;
1476            }
1477            break;
1478
1479        case 'A':
1480            if (offset != con.start)  return false;
1481            break;
1482
1483        case 'Z':
1484            if (!(offset == con.limit
1485                  || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1486                  || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1487                  &&  target.charAt(offset+1) == LINE_FEED))
1488                return false;
1489            break;
1490
1491        case 'z':
1492            if (offset != con.limit)  return false;
1493            break;
1494
1495        case 'b':
1496            if (con.length == 0)
1497                return false;
1498            {
1499                int after = getWordType(target, con.start, con.limit, offset, opts);
1500                if (after == WT_IGNORE)  return false;
1501                int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1502                if (after == before)  return false;
1503            }
1504            break;
1505
1506        case 'B':
1507            if (con.length == 0)
1508                go = true;
1509            else {
1510                int after = getWordType(target, con.start, con.limit, offset, opts);
1511                go = after == WT_IGNORE
1512                     || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1513            }
1514            if (!go)  return false;
1515            break;
1516
1517        case '<':
1518            if (con.length == 0 || offset == con.limit)  return false;
1519            if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1520                || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1521                return false;
1522            break;
1523
1524        case '>':
1525            if (con.length == 0 || offset == con.start)  return false;
1526            if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1527                || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1528                return false;
1529            break;
1530        } // switch anchor type
1531
1532        return true;
1533    }
1534
1535    private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
1536                                                 int offset, int opts) {
1537        int ret = getWordType(target, begin, end, --offset, opts);
1538        while (ret == WT_IGNORE)
1539            ret = getWordType(target, begin, end, --offset, opts);
1540        return ret;
1541    }
1542
1543    private static final int getWordType(ExpressionTarget target, int begin, int end,
1544                                         int offset, int opts) {
1545        if (offset < begin || offset >= end)  return WT_OTHER;
1546        return getWordType0(target.charAt(offset) , opts);
1547    }
1548
1549
1550    /**
1551     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1552     *
1553     * @return true if the target is matched to this regular expression.
1554     */
1555    public boolean matches(CharacterIterator target) {
1556        return this.matches(target, (Match)null);
1557    }
1558
1559
1560    /**
1561     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1562     *
1563     * @param match A Match instance for storing matching result.
1564     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1565     */
1566    public boolean matches(CharacterIterator  target, Match match) {
1567        int start = target.getBeginIndex();
1568        int end = target.getEndIndex();
1569
1570
1571
1572        synchronized (this) {
1573            if (this.operations == null)
1574                this.prepare();
1575            if (this.context == null)
1576                this.context = new Context();
1577        }
1578        Context con = null;
1579        synchronized (this.context) {
1580            con = this.context.inuse ? new Context() : this.context;
1581            con.reset(target, start, end, this.numberOfClosures);
1582        }
1583        if (match != null) {
1584            match.setNumberOfGroups(this.nofparen);
1585            match.setSource(target);
1586        } else if (this.hasBackReferences) {
1587            match = new Match();
1588            match.setNumberOfGroups(this.nofparen);
1589            // Need not to call setSource() because
1590            // a caller can not access this match instance.
1591        }
1592        con.match = match;
1593
1594        if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1595            int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1596            //System.err.println("DEBUG: matchEnd="+matchEnd);
1597            if (matchEnd == con.limit) {
1598                if (con.match != null) {
1599                    con.match.setBeginning(0, con.start);
1600                    con.match.setEnd(0, matchEnd);
1601                }
1602                con.setInUse(false);
1603                return true;
1604            }
1605            return false;
1606        }
1607
1608        /*
1609         * The pattern has only fixed string.
1610         * The engine uses Boyer-Moore.
1611         */
1612        if (this.fixedStringOnly) {
1613            //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1614            int o = this.fixedStringTable.matches(target, con.start, con.limit);
1615            if (o >= 0) {
1616                if (con.match != null) {
1617                    con.match.setBeginning(0, o);
1618                    con.match.setEnd(0, o+this.fixedString.length());
1619                }
1620                con.setInUse(false);
1621                return true;
1622            }
1623            con.setInUse(false);
1624            return false;
1625        }
1626
1627        /*
1628         * The pattern contains a fixed string.
1629         * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1630         * If not, it return with false.
1631         */
1632        if (this.fixedString != null) {
1633            int o = this.fixedStringTable.matches(target, con.start, con.limit);
1634            if (o < 0) {
1635                //System.err.println("Non-match in fixed-string search.");
1636                con.setInUse(false);
1637                return false;
1638            }
1639        }
1640
1641        int limit = con.limit-this.minlength;
1642        int matchStart;
1643        int matchEnd = -1;
1644
1645        /*
1646         * Checks whether the expression starts with ".*".
1647         */
1648        if (this.operations != null
1649            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1650            if (isSet(this.options, SINGLE_LINE)) {
1651                matchStart = con.start;
1652                matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1653            } else {
1654                boolean previousIsEOL = true;
1655                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1656                    int ch =  target .setIndex(  matchStart ) ;
1657                    if (isEOLChar(ch)) {
1658                        previousIsEOL = true;
1659                    } else {
1660                        if (previousIsEOL) {
1661                            if (0 <= (matchEnd = this.match(con, this.operations,
1662                                                            matchStart, 1, this.options)))
1663                                break;
1664                        }
1665                        previousIsEOL = false;
1666                    }
1667                }
1668            }
1669        }
1670
1671        /*
1672         * Optimization against the first character.
1673         */
1674        else if (this.firstChar != null) {
1675            //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1676            RangeToken range = this.firstChar;
1677            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1678                int ch =  target .setIndex(  matchStart ) ;
1679                if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1680                    ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
1681                }
1682                if (!range.match(ch)) {
1683                    continue;
1684                }
1685                if (0 <= (matchEnd = this.match(con, this.operations,
1686                                                matchStart, 1, this.options))) {
1687                    break;
1688                }
1689            }
1690        }
1691
1692        /*
1693         * Straightforward matching.
1694         */
1695        else {
1696            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1697                if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
1698                    break;
1699            }
1700        }
1701
1702        if (matchEnd >= 0) {
1703            if (con.match != null) {
1704                con.match.setBeginning(0, matchStart);
1705                con.match.setEnd(0, matchEnd);
1706            }
1707            con.setInUse(false);
1708            return true;
1709        } else {
1710            con.setInUse(false);
1711            return false;
1712        }
1713    }
1714
1715    // ================================================================
1716
1717    /**
1718     * A regular expression.
1719     * @serial
1720     */
1721    String regex;
1722    /**
1723     * @serial
1724     */
1725    int options;
1726
1727    /**
1728     * The number of parenthesis in the regular expression.
1729     * @serial
1730     */
1731    int nofparen;
1732    /**
1733     * Internal representation of the regular expression.
1734     * @serial
1735     */
1736    Token tokentree;
1737
1738    boolean hasBackReferences = false;
1739
1740    transient int minlength;
1741    transient Op operations = null;
1742    transient int numberOfClosures;
1743    transient Context context = null;
1744    transient RangeToken firstChar = null;
1745
1746    transient String fixedString = null;
1747    transient int fixedStringOptions;
1748    transient BMPattern fixedStringTable = null;
1749    transient boolean fixedStringOnly = false;
1750
1751    static abstract class ExpressionTarget {
1752        abstract char charAt(int index);
1753        abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
1754        abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
1755    }
1756
1757    static final class StringTarget extends ExpressionTarget {
1758
1759        private String target;
1760
1761        StringTarget(String target) {
1762            this.target = target;
1763        }
1764
1765        final void resetTarget(String target) {
1766            this.target = target;
1767        }
1768
1769        final char charAt(int index) {
1770            return target.charAt(index);
1771        }
1772
1773        final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1774                              String part, int partlen) {
1775            if (limit-offset < partlen) {
1776                return false;
1777            }
1778            return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
1779        }
1780
1781        final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1782                                    int offset2, int partlen) {
1783            if (limit-offset < partlen) {
1784                return false;
1785            }
1786            return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
1787                                : target.regionMatches(offset, target, offset2, partlen);
1788        }
1789    }
1790
1791    static final class CharArrayTarget extends ExpressionTarget {
1792
1793        char[] target;
1794
1795        CharArrayTarget(char[] target) {
1796            this.target = target;
1797        }
1798
1799        final void resetTarget(char[] target) {
1800            this.target = target;
1801        }
1802
1803        char charAt(int index) {
1804            return target[index];
1805        }
1806
1807        final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1808                String part, int partlen) {
1809            if (offset < 0 || limit-offset < partlen)  {
1810                return false;
1811            }
1812            return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1813                                : regionMatches(offset, limit, part, partlen);
1814        }
1815
1816        private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1817            int i = 0;
1818            while (partlen-- > 0) {
1819                if (target[offset++] != part.charAt(i++)) {
1820                    return false;
1821                }
1822            }
1823            return true;
1824        }
1825
1826        private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1827            int i = 0;
1828            while (partlen-- > 0) {
1829                final char ch1 = target[offset++] ;
1830                final char ch2 = part.charAt(i++);
1831                if (ch1 == ch2) {
1832                    continue;
1833                }
1834                final char uch1 = Character.toUpperCase(ch1);
1835                final char uch2 = Character.toUpperCase(ch2);
1836                if (uch1 == uch2) {
1837                    continue;
1838                }
1839                if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1840                    return false;
1841                }
1842            }
1843            return true;
1844        }
1845
1846        final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1847            if (offset < 0 || limit-offset < partlen) {
1848                return false;
1849            }
1850            return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1851                                : regionMatches(offset, limit, offset2, partlen);
1852        }
1853
1854        private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1855            int i = offset2;
1856            while (partlen-- > 0) {
1857                if ( target [  offset++ ]  !=  target [  i++ ] )
1858                    return false;
1859            }
1860            return true;
1861        }
1862
1863        private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1864            int i = offset2;
1865            while (partlen-- > 0) {
1866                final char ch1 =  target[offset++] ;
1867                final char ch2 =  target[i++] ;
1868                if (ch1 == ch2) {
1869                    continue;
1870                }
1871                final char uch1 = Character.toUpperCase(ch1);
1872                final char uch2 = Character.toUpperCase(ch2);
1873                if (uch1 == uch2) {
1874                    continue;
1875                }
1876                if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1877                    return false;
1878                }
1879            }
1880            return true;
1881        }
1882    }
1883
1884    static final class CharacterIteratorTarget extends ExpressionTarget {
1885        CharacterIterator target;
1886
1887        CharacterIteratorTarget(CharacterIterator target) {
1888            this.target = target;
1889        }
1890
1891        final void resetTarget(CharacterIterator target) {
1892            this.target = target;
1893        }
1894
1895        final char charAt(int index) {
1896            return target.setIndex(index);
1897        }
1898
1899        final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1900                String part, int partlen) {
1901            if (offset < 0 || limit-offset < partlen)  {
1902                return false;
1903            }
1904            return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1905                                : regionMatches(offset, limit, part, partlen);
1906        }
1907
1908        private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1909            int i = 0;
1910            while (partlen-- > 0) {
1911                if (target.setIndex(offset++) != part.charAt(i++)) {
1912                    return false;
1913                }
1914            }
1915            return true;
1916        }
1917
1918        private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1919            int i = 0;
1920            while (partlen-- > 0) {
1921                final char ch1 = target.setIndex(offset++) ;
1922                final char ch2 = part.charAt(i++);
1923                if (ch1 == ch2) {
1924                    continue;
1925                }
1926                final char uch1 = Character.toUpperCase(ch1);
1927                final char uch2 = Character.toUpperCase(ch2);
1928                if (uch1 == uch2) {
1929                    continue;
1930                }
1931                if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1932                    return false;
1933                }
1934            }
1935            return true;
1936        }
1937
1938        final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1939            if (offset < 0 || limit-offset < partlen) {
1940                return false;
1941            }
1942            return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1943                                : regionMatches(offset, limit, offset2, partlen);
1944        }
1945
1946        private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1947            int i = offset2;
1948            while (partlen-- > 0) {
1949                if (target.setIndex(offset++) != target.setIndex(i++)) {
1950                    return false;
1951                }
1952            }
1953            return true;
1954        }
1955
1956        private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1957            int i = offset2;
1958            while (partlen-- > 0) {
1959                final char ch1 = target.setIndex(offset++) ;
1960                final char ch2 = target.setIndex(i++) ;
1961                if (ch1 == ch2) {
1962                    continue;
1963                }
1964                final char uch1 = Character.toUpperCase(ch1);
1965                final char uch2 = Character.toUpperCase(ch2);
1966                if (uch1 == uch2) {
1967                    continue;
1968                }
1969                if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1970                    return false;
1971                }
1972            }
1973            return true;
1974        }
1975    }
1976
1977    static final class ClosureContext {
1978
1979        int[] offsets = new int[4];
1980        int currentIndex = 0;
1981
1982        boolean contains(int offset) {
1983            for (int i=0; i<currentIndex;++i) {
1984                if (offsets[i] == offset) {
1985                    return true;
1986                }
1987            }
1988            return false;
1989        }
1990
1991        void reset() {
1992            currentIndex = 0;
1993        }
1994
1995        void addOffset(int offset) {
1996            // We do not check for duplicates, caller is responsible for that
1997            if (currentIndex == offsets.length) {
1998                offsets = expandOffsets();
1999            }
2000            offsets[currentIndex++] = offset;
2001        }
2002
2003        private int[] expandOffsets() {
2004            final int len = offsets.length;
2005            final int newLen = len << 1;
2006            int[] newOffsets = new int[newLen];
2007
2008            System.arraycopy(offsets, 0, newOffsets, 0, currentIndex);
2009            return newOffsets;
2010        }
2011    }
2012
2013    static final class Context {
2014        int start;
2015        int limit;
2016        int length;
2017        Match match;
2018        boolean inuse = false;
2019        ClosureContext[] closureContexts;
2020
2021        private StringTarget stringTarget;
2022        private CharArrayTarget charArrayTarget;
2023        private CharacterIteratorTarget characterIteratorTarget;
2024
2025        ExpressionTarget target;
2026
2027        Context() {
2028        }
2029
2030        private void resetCommon(int nofclosures) {
2031            this.length = this.limit-this.start;
2032            setInUse(true);
2033            this.match = null;
2034            if (this.closureContexts == null || this.closureContexts.length != nofclosures) {
2035                this.closureContexts = new ClosureContext[nofclosures];
2036            }
2037            for (int i = 0;  i < nofclosures;  i ++)  {
2038                if (this.closureContexts[i] == null) {
2039                    this.closureContexts[i] = new ClosureContext();
2040                }
2041                else {
2042                    this.closureContexts[i].reset();
2043                }
2044            }
2045        }
2046
2047        void reset(CharacterIterator target, int start, int limit, int nofclosures) {
2048            if (characterIteratorTarget == null) {
2049                characterIteratorTarget = new CharacterIteratorTarget(target);
2050            }
2051            else {
2052                characterIteratorTarget.resetTarget(target);
2053            }
2054            this.target = characterIteratorTarget;
2055            this.start = start;
2056            this.limit = limit;
2057            this.resetCommon(nofclosures);
2058        }
2059
2060        void reset(String target, int start, int limit, int nofclosures) {
2061            if (stringTarget == null) {
2062                stringTarget = new StringTarget(target);
2063            }
2064            else {
2065                stringTarget.resetTarget(target);
2066            }
2067            this.target = stringTarget;
2068            this.start = start;
2069            this.limit = limit;
2070            this.resetCommon(nofclosures);
2071        }
2072
2073        void reset(char[] target, int start, int limit, int nofclosures) {
2074            if (charArrayTarget == null) {
2075                charArrayTarget = new CharArrayTarget(target);
2076            }
2077            else {
2078                charArrayTarget.resetTarget(target);
2079            }
2080            this.target = charArrayTarget;
2081            this.start = start;
2082            this.limit = limit;
2083            this.resetCommon(nofclosures);
2084        }
2085        synchronized void setInUse(boolean inUse) {
2086            this.inuse = inUse;
2087        }
2088    }
2089
2090    /**
2091     * Prepares for matching.  This method is called just before starting matching.
2092     */
2093    void prepare() {
2094        if (Op.COUNT)  Op.nofinstances = 0;
2095        this.compile(this.tokentree);
2096        /*
2097        if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2098            Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2099            anchor.next = this.operations;
2100            this.operations = anchor;
2101        }
2102        */
2103        if (Op.COUNT)  System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2104
2105        this.minlength = this.tokentree.getMinLength();
2106
2107        this.firstChar = null;
2108        if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2109            && !isSet(this.options, XMLSCHEMA_MODE)) {
2110            RangeToken firstChar = Token.createRange();
2111            int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2112            if (fresult == Token.FC_TERMINAL) {
2113                firstChar.compactRanges();
2114                this.firstChar = firstChar;
2115                if (DEBUG)
2116                    System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2117            }
2118        }
2119
2120        if (this.operations != null
2121            && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2122            && this.operations.next == null) {
2123            if (DEBUG)
2124                System.err.print(" *** Only fixed string! *** ");
2125            this.fixedStringOnly = true;
2126            if (this.operations.type == Op.STRING)
2127                this.fixedString = this.operations.getString();
2128            else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2129                this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2130            } else {
2131                char[] ac = new char[1];
2132                ac[0] = (char)this.operations.getData();
2133                this.fixedString = new String(ac);
2134            }
2135            this.fixedStringOptions = this.options;
2136            this.fixedStringTable = new BMPattern(this.fixedString, 256,
2137                                                  isSet(this.fixedStringOptions, IGNORE_CASE));
2138        } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2139                   && !isSet(this.options, XMLSCHEMA_MODE)) {
2140            Token.FixedStringContainer container = new Token.FixedStringContainer();
2141            this.tokentree.findFixedString(container, this.options);
2142            this.fixedString = container.token == null ? null : container.token.getString();
2143            this.fixedStringOptions = container.options;
2144            if (this.fixedString != null && this.fixedString.length() < 2)
2145                this.fixedString = null;
2146            // This pattern has a fixed string of which length is more than one.
2147            if (this.fixedString != null) {
2148                this.fixedStringTable = new BMPattern(this.fixedString, 256,
2149                                                      isSet(this.fixedStringOptions, IGNORE_CASE));
2150                if (DEBUG) {
2151                    System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2152                                       +"/" //+this.fixedString
2153                                       +"/"+REUtil.createOptionString(this.fixedStringOptions));
2154                    System.err.print("String: ");
2155                    REUtil.dumpString(this.fixedString);
2156                }
2157            }
2158        }
2159    }
2160
2161    /**
2162     * An option.
2163     * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
2164     * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span>
2165     * does not capture.
2166     *
2167     * @see #RegularExpression(java.lang.String,int)
2168     * @see #setPattern(java.lang.String,int)
2169    static final int MARK_PARENS = 1<<0;
2170     */
2171
2172    /**
2173     * "i"
2174     */
2175    static final int IGNORE_CASE = 1<<1;
2176
2177    /**
2178     * "s"
2179     */
2180    static final int SINGLE_LINE = 1<<2;
2181
2182    /**
2183     * "m"
2184     */
2185    static final int MULTIPLE_LINES = 1<<3;
2186
2187    /**
2188     * "x"
2189     */
2190    static final int EXTENDED_COMMENT = 1<<4;
2191
2192    /**
2193     * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
2194     *
2195     * @see #RegularExpression(java.lang.String,int)
2196     * @see #setPattern(java.lang.String,int)
2197     * @see #UNICODE_WORD_BOUNDARY
2198     */
2199    static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
2200
2201    /**
2202     * An option.
2203     * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
2204     * <p>By default, the engine considers a position between a word character
2205     * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
2206     * is a word boundary.
2207     * <p>By this option, the engine checks word boundaries with the method of
2208     * 'Unicode Regular Expression Guidelines' Revision 4.
2209     *
2210     * @see #RegularExpression(java.lang.String,int)
2211     * @see #setPattern(java.lang.String,int)
2212     */
2213    static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
2214
2215    /**
2216     * "H"
2217     */
2218    static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
2219    /**
2220     * "F"
2221     */
2222    static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
2223    /**
2224     * "X". XML Schema mode.
2225     */
2226    static final int XMLSCHEMA_MODE = 1<<9;
2227    /**
2228     * ",".
2229     */
2230    static final int SPECIAL_COMMA = 1<<10;
2231
2232
2233    private static final boolean isSet(int options, int flag) {
2234        return (options & flag) == flag;
2235    }
2236
2237    /**
2238     * Creates a new RegularExpression instance.
2239     *
2240     * @param regex A regular expression
2241     * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2242     */
2243    public RegularExpression(String regex) throws ParseException {
2244        this(regex, null);
2245    }
2246
2247    /**
2248     * Creates a new RegularExpression instance with options.
2249     *
2250     * @param regex A regular expression
2251     * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2252     * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2253     */
2254    public RegularExpression(String regex, String options) throws ParseException {
2255        this.setPattern(regex, options);
2256    }
2257
2258    /**
2259     * Creates a new RegularExpression instance with options.
2260     *
2261     * @param regex A regular expression
2262     * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2263     * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2264     */
2265    public RegularExpression(String regex, String options, Locale locale) throws ParseException {
2266        this.setPattern(regex, options, locale);
2267    }
2268
2269    RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
2270        this.regex = regex;
2271        this.tokentree = tok;
2272        this.nofparen = parens;
2273        this.options = options;
2274        this.hasBackReferences = hasBackReferences;
2275    }
2276
2277    /**
2278     *
2279     */
2280    public void setPattern(String newPattern) throws ParseException {
2281        this.setPattern(newPattern, Locale.getDefault());
2282    }
2283
2284    public void setPattern(String newPattern, Locale locale) throws ParseException {
2285        this.setPattern(newPattern, this.options, locale);
2286    }
2287
2288    private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
2289        this.regex = newPattern;
2290        this.options = options;
2291        RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
2292                         ? new ParserForXMLSchema(locale) : new RegexParser(locale);
2293        this.tokentree = rp.parse(this.regex, this.options);
2294        this.nofparen = rp.parennumber;
2295        this.hasBackReferences = rp.hasBackReferences;
2296
2297        this.operations = null;
2298        this.context = null;
2299    }
2300    /**
2301     *
2302     */
2303    public void setPattern(String newPattern, String options) throws ParseException {
2304        this.setPattern(newPattern, options, Locale.getDefault());
2305    }
2306
2307    public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
2308        this.setPattern(newPattern, REUtil.parseOptions(options), locale);
2309    }
2310
2311    /**
2312     *
2313     */
2314    public String getPattern() {
2315        return this.regex;
2316    }
2317
2318    /**
2319     * Represents this instence in String.
2320     */
2321    public String toString() {
2322        return this.tokentree.toString(this.options);
2323    }
2324
2325    /**
2326     * Returns a option string.
2327     * The order of letters in it may be different from a string specified
2328     * in a constructor or <code>setPattern()</code>.
2329     *
2330     * @see #RegularExpression(java.lang.String,java.lang.String)
2331     * @see #setPattern(java.lang.String,java.lang.String)
2332     */
2333    public String getOptions() {
2334        return REUtil.createOptionString(this.options);
2335    }
2336
2337    /**
2338     *  Return true if patterns are the same and the options are equivalent.
2339     */
2340    public boolean equals(Object obj) {
2341        if (obj == null)  return false;
2342        if (!(obj instanceof RegularExpression))
2343            return false;
2344        RegularExpression r = (RegularExpression)obj;
2345        return this.regex.equals(r.regex) && this.options == r.options;
2346    }
2347
2348    boolean equals(String pattern, int options) {
2349        return this.regex.equals(pattern) && this.options == options;
2350    }
2351
2352    /**
2353     *
2354     */
2355    public int hashCode() {
2356        return (this.regex+"/"+this.getOptions()).hashCode();
2357    }
2358
2359    /**
2360     * Return the number of regular expression groups.
2361     * This method returns 1 when the regular expression has no capturing-parenthesis.
2362     *
2363     */
2364    public int getNumberOfGroups() {
2365        return this.nofparen;
2366    }
2367
2368    // ================================================================
2369
2370    private static final int WT_IGNORE = 0;
2371    private static final int WT_LETTER = 1;
2372    private static final int WT_OTHER = 2;
2373    private static final int getWordType0(char ch, int opts) {
2374        if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
2375            if (isSet(opts, USE_UNICODE_CATEGORY)) {
2376                return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
2377            }
2378            return isWordChar(ch) ? WT_LETTER : WT_OTHER;
2379        }
2380
2381        switch (Character.getType(ch)) {
2382        case Character.UPPERCASE_LETTER:      // L
2383        case Character.LOWERCASE_LETTER:      // L
2384        case Character.TITLECASE_LETTER:      // L
2385        case Character.MODIFIER_LETTER:       // L
2386        case Character.OTHER_LETTER:          // L
2387        case Character.LETTER_NUMBER:         // N
2388        case Character.DECIMAL_DIGIT_NUMBER:  // N
2389        case Character.OTHER_NUMBER:          // N
2390        case Character.COMBINING_SPACING_MARK: // Mc
2391            return WT_LETTER;
2392
2393        case Character.FORMAT:                // Cf
2394        case Character.NON_SPACING_MARK:      // Mn
2395        case Character.ENCLOSING_MARK:        // Mc
2396            return WT_IGNORE;
2397
2398        case Character.CONTROL:               // Cc
2399            switch (ch) {
2400            case '\t':
2401            case '\n':
2402            case '\u000B':
2403            case '\f':
2404            case '\r':
2405                return WT_OTHER;
2406            default:
2407                return WT_IGNORE;
2408            }
2409
2410        default:
2411            return WT_OTHER;
2412        }
2413    }
2414
2415    // ================================================================
2416
2417    static final int LINE_FEED = 0x000A;
2418    static final int CARRIAGE_RETURN = 0x000D;
2419    static final int LINE_SEPARATOR = 0x2028;
2420    static final int PARAGRAPH_SEPARATOR = 0x2029;
2421
2422    private static final boolean isEOLChar(int ch) {
2423        return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
2424        || ch == PARAGRAPH_SEPARATOR;
2425    }
2426
2427    private static final boolean isWordChar(int ch) { // Legacy word characters
2428        if (ch == '_')  return true;
2429        if (ch < '0')  return false;
2430        if (ch > 'z')  return false;
2431        if (ch <= '9')  return true;
2432        if (ch < 'A')  return false;
2433        if (ch <= 'Z')  return true;
2434        if (ch < 'a')  return false;
2435        return true;
2436    }
2437
2438    private static final boolean matchIgnoreCase(int chardata, int ch) {
2439        if (chardata == ch)  return true;
2440        if (chardata > 0xffff || ch > 0xffff)  return false;
2441        char uch1 = Character.toUpperCase((char)chardata);
2442        char uch2 = Character.toUpperCase((char)ch);
2443        if (uch1 == uch2)  return true;
2444        return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
2445    }
2446}
2447