Pattern.java revision 14405:1049321b86cb
1/*
2 * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package java.util.regex;
27
28import java.text.Normalizer;
29import java.util.Locale;
30import java.util.Iterator;
31import java.util.Map;
32import java.util.ArrayList;
33import java.util.HashMap;
34import java.util.Arrays;
35import java.util.NoSuchElementException;
36import java.util.Spliterator;
37import java.util.Spliterators;
38import java.util.function.Predicate;
39import java.util.stream.Stream;
40import java.util.stream.StreamSupport;
41
42
43/**
44 * A compiled representation of a regular expression.
45 *
46 * <p> A regular expression, specified as a string, must first be compiled into
47 * an instance of this class.  The resulting pattern can then be used to create
48 * a {@link Matcher} object that can match arbitrary {@linkplain
49 * java.lang.CharSequence character sequences} against the regular
50 * expression.  All of the state involved in performing a match resides in the
51 * matcher, so many matchers can share the same pattern.
52 *
53 * <p> A typical invocation sequence is thus
54 *
55 * <blockquote><pre>
56 * Pattern p = Pattern.{@link #compile compile}("a*b");
57 * Matcher m = p.{@link #matcher matcher}("aaaaab");
58 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
59 *
60 * <p> A {@link #matches matches} method is defined by this class as a
61 * convenience for when a regular expression is used just once.  This method
62 * compiles an expression and matches an input sequence against it in a single
63 * invocation.  The statement
64 *
65 * <blockquote><pre>
66 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>
67 *
68 * is equivalent to the three statements above, though for repeated matches it
69 * is less efficient since it does not allow the compiled pattern to be reused.
70 *
71 * <p> Instances of this class are immutable and are safe for use by multiple
72 * concurrent threads.  Instances of the {@link Matcher} class are not safe for
73 * such use.
74 *
75 *
76 * <h3><a name="sum">Summary of regular-expression constructs</a></h3>
77 *
78 * <table border="0" cellpadding="1" cellspacing="0"
79 *  summary="Regular expression constructs, and what they match">
80 *
81 * <tr align="left">
82 * <th align="left" id="construct">Construct</th>
83 * <th align="left" id="matches">Matches</th>
84 * </tr>
85 *
86 * <tr><th>&nbsp;</th></tr>
87 * <tr align="left"><th colspan="2" id="characters">Characters</th></tr>
88 *
89 * <tr><td valign="top" headers="construct characters"><i>x</i></td>
90 *     <td headers="matches">The character <i>x</i></td></tr>
91 * <tr><td valign="top" headers="construct characters">{@code \\}</td>
92 *     <td headers="matches">The backslash character</td></tr>
93 * <tr><td valign="top" headers="construct characters">{@code \0}<i>n</i></td>
94 *     <td headers="matches">The character with octal value {@code 0}<i>n</i>
95 *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
96 * <tr><td valign="top" headers="construct characters">{@code \0}<i>nn</i></td>
97 *     <td headers="matches">The character with octal value {@code 0}<i>nn</i>
98 *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
99 * <tr><td valign="top" headers="construct characters">{@code \0}<i>mnn</i></td>
100 *     <td headers="matches">The character with octal value {@code 0}<i>mnn</i>
101 *         (0&nbsp;{@code <=}&nbsp;<i>m</i>&nbsp;{@code <=}&nbsp;3,
102 *         0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>
103 * <tr><td valign="top" headers="construct characters">{@code \x}<i>hh</i></td>
104 *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>hh</i></td></tr>
105 * <tr><td valign="top" headers="construct characters"><code>&#92;u</code><i>hhhh</i></td>
106 *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>hhhh</i></td></tr>
107 * <tr><td valign="top" headers="construct characters"><code>&#92;x</code><i>{h...h}</i></td>
108 *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>h...h</i>
109 *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
110 *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;
111 *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
112 * <tr><td valign="top" headers="construct characters"><code>&#92;N{</code><i>name</i><code>}</code></td>
113 *     <td headers="matches">The character with Unicode character name <i>'name'</i></td></tr>
114 * <tr><td valign="top" headers="matches">{@code \t}</td>
115 *     <td headers="matches">The tab character (<code>'&#92;u0009'</code>)</td></tr>
116 * <tr><td valign="top" headers="construct characters">{@code \n}</td>
117 *     <td headers="matches">The newline (line feed) character (<code>'&#92;u000A'</code>)</td></tr>
118 * <tr><td valign="top" headers="construct characters">{@code \r}</td>
119 *     <td headers="matches">The carriage-return character (<code>'&#92;u000D'</code>)</td></tr>
120 * <tr><td valign="top" headers="construct characters">{@code \f}</td>
121 *     <td headers="matches">The form-feed character (<code>'&#92;u000C'</code>)</td></tr>
122 * <tr><td valign="top" headers="construct characters">{@code \a}</td>
123 *     <td headers="matches">The alert (bell) character (<code>'&#92;u0007'</code>)</td></tr>
124 * <tr><td valign="top" headers="construct characters">{@code \e}</td>
125 *     <td headers="matches">The escape character (<code>'&#92;u001B'</code>)</td></tr>
126 * <tr><td valign="top" headers="construct characters">{@code \c}<i>x</i></td>
127 *     <td headers="matches">The control character corresponding to <i>x</i></td></tr>
128 *
129 * <tr><th>&nbsp;</th></tr>
130 * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
131 *
132 * <tr><td valign="top" headers="construct classes">{@code [abc]}</td>
133 *     <td headers="matches">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>
134 * <tr><td valign="top" headers="construct classes">{@code [^abc]}</td>
135 *     <td headers="matches">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr>
136 * <tr><td valign="top" headers="construct classes">{@code [a-zA-Z]}</td>
137 *     <td headers="matches">{@code a} through {@code z}
138 *         or {@code A} through {@code Z}, inclusive (range)</td></tr>
139 * <tr><td valign="top" headers="construct classes">{@code [a-d[m-p]]}</td>
140 *     <td headers="matches">{@code a} through {@code d},
141 *      or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr>
142 * <tr><td valign="top" headers="construct classes">{@code [a-z&&[def]]}</td>
143 *     <td headers="matches">{@code d}, {@code e}, or {@code f} (intersection)</tr>
144 * <tr><td valign="top" headers="construct classes">{@code [a-z&&[^bc]]}</td>
145 *     <td headers="matches">{@code a} through {@code z},
146 *         except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr>
147 * <tr><td valign="top" headers="construct classes">{@code [a-z&&[^m-p]]}</td>
148 *     <td headers="matches">{@code a} through {@code z},
149 *          and not {@code m} through {@code p}: {@code [a-lq-z]}(subtraction)</td></tr>
150 * <tr><th>&nbsp;</th></tr>
151 *
152 * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr>
153 *
154 * <tr><td valign="top" headers="construct predef">{@code .}</td>
155 *     <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
156 * <tr><td valign="top" headers="construct predef">{@code \d}</td>
157 *     <td headers="matches">A digit: {@code [0-9]}</td></tr>
158 * <tr><td valign="top" headers="construct predef">{@code \D}</td>
159 *     <td headers="matches">A non-digit: {@code [^0-9]}</td></tr>
160 * <tr><td valign="top" headers="construct predef">{@code \h}</td>
161 *     <td headers="matches">A horizontal whitespace character:
162 *     <code>[ \t\xA0&#92;u1680&#92;u180e&#92;u2000-&#92;u200a&#92;u202f&#92;u205f&#92;u3000]</code></td></tr>
163 * <tr><td valign="top" headers="construct predef">{@code \H}</td>
164 *     <td headers="matches">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
165 * <tr><td valign="top" headers="construct predef">{@code \s}</td>
166 *     <td headers="matches">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
167 * <tr><td valign="top" headers="construct predef">{@code \S}</td>
168 *     <td headers="matches">A non-whitespace character: {@code [^\s]}</td></tr>
169 * <tr><td valign="top" headers="construct predef">{@code \v}</td>
170 *     <td headers="matches">A vertical whitespace character: <code>[\n\x0B\f\r\x85&#92;u2028&#92;u2029]</code>
171 *     </td></tr>
172 * <tr><td valign="top" headers="construct predef">{@code \V}</td>
173 *     <td headers="matches">A non-vertical whitespace character: {@code [^\v]}</td></tr>
174 * <tr><td valign="top" headers="construct predef">{@code \w}</td>
175 *     <td headers="matches">A word character: {@code [a-zA-Z_0-9]}</td></tr>
176 * <tr><td valign="top" headers="construct predef">{@code \W}</td>
177 *     <td headers="matches">A non-word character: {@code [^\w]}</td></tr>
178 * <tr><th>&nbsp;</th></tr>
179 * <tr align="left"><th colspan="2" id="posix"><b>POSIX character classes (US-ASCII only)</b></th></tr>
180 *
181 * <tr><td valign="top" headers="construct posix">{@code \p{Lower}}</td>
182 *     <td headers="matches">A lower-case alphabetic character: {@code [a-z]}</td></tr>
183 * <tr><td valign="top" headers="construct posix">{@code \p{Upper}}</td>
184 *     <td headers="matches">An upper-case alphabetic character:{@code [A-Z]}</td></tr>
185 * <tr><td valign="top" headers="construct posix">{@code \p{ASCII}}</td>
186 *     <td headers="matches">All ASCII:{@code [\x00-\x7F]}</td></tr>
187 * <tr><td valign="top" headers="construct posix">{@code \p{Alpha}}</td>
188 *     <td headers="matches">An alphabetic character:{@code [\p{Lower}\p{Upper}]}</td></tr>
189 * <tr><td valign="top" headers="construct posix">{@code \p{Digit}}</td>
190 *     <td headers="matches">A decimal digit: {@code [0-9]}</td></tr>
191 * <tr><td valign="top" headers="construct posix">{@code \p{Alnum}}</td>
192 *     <td headers="matches">An alphanumeric character:{@code [\p{Alpha}\p{Digit}]}</td></tr>
193 * <tr><td valign="top" headers="construct posix">{@code \p{Punct}}</td>
194 *     <td headers="matches">Punctuation: One of {@code !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~}</td></tr>
195 *     <!-- {@code [\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]}
196 *          {@code [\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]} -->
197 * <tr><td valign="top" headers="construct posix">{@code \p{Graph}}</td>
198 *     <td headers="matches">A visible character: {@code [\p{Alnum}\p{Punct}]}</td></tr>
199 * <tr><td valign="top" headers="construct posix">{@code \p{Print}}</td>
200 *     <td headers="matches">A printable character: {@code [\p{Graph}\x20]}</td></tr>
201 * <tr><td valign="top" headers="construct posix">{@code \p{Blank}}</td>
202 *     <td headers="matches">A space or a tab: {@code [ \t]}</td></tr>
203 * <tr><td valign="top" headers="construct posix">{@code \p{Cntrl}}</td>
204 *     <td headers="matches">A control character: {@code [\x00-\x1F\x7F]}</td></tr>
205 * <tr><td valign="top" headers="construct posix">{@code \p{XDigit}}</td>
206 *     <td headers="matches">A hexadecimal digit: {@code [0-9a-fA-F]}</td></tr>
207 * <tr><td valign="top" headers="construct posix">{@code \p{Space}}</td>
208 *     <td headers="matches">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
209 *
210 * <tr><th>&nbsp;</th></tr>
211 * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
212 *
213 * <tr><td valign="top">{@code \p{javaLowerCase}}</td>
214 *     <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
215 * <tr><td valign="top">{@code \p{javaUpperCase}}</td>
216 *     <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
217 * <tr><td valign="top">{@code \p{javaWhitespace}}</td>
218 *     <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
219 * <tr><td valign="top">{@code \p{javaMirrored}}</td>
220 *     <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
221 *
222 * <tr><th>&nbsp;</th></tr>
223 * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
224 * <tr><td valign="top" headers="construct unicode">{@code \p{IsLatin}}</td>
225 *     <td headers="matches">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr>
226 * <tr><td valign="top" headers="construct unicode">{@code \p{InGreek}}</td>
227 *     <td headers="matches">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr>
228 * <tr><td valign="top" headers="construct unicode">{@code \p{Lu}}</td>
229 *     <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
230 * <tr><td valign="top" headers="construct unicode">{@code \p{IsAlphabetic}}</td>
231 *     <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
232 * <tr><td valign="top" headers="construct unicode">{@code \p{Sc}}</td>
233 *     <td headers="matches">A currency symbol</td></tr>
234 * <tr><td valign="top" headers="construct unicode">{@code \P{InGreek}}</td>
235 *     <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
236 * <tr><td valign="top" headers="construct unicode">{@code [\p{L}&&[^\p{Lu}]]}</td>
237 *     <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
238 *
239 * <tr><th>&nbsp;</th></tr>
240 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
241 *
242 * <tr><td valign="top" headers="construct bounds">{@code ^}</td>
243 *     <td headers="matches">The beginning of a line</td></tr>
244 * <tr><td valign="top" headers="construct bounds">{@code $}</td>
245 *     <td headers="matches">The end of a line</td></tr>
246 * <tr><td valign="top" headers="construct bounds">{@code \b}</td>
247 *     <td headers="matches">A word boundary</td></tr>
248 * <tr><td valign="top" headers="construct bounds">{@code \b{g}}</td>
249 *     <td headers="matches">A Unicode extended grapheme cluster boundary</td></tr>
250 * <tr><td valign="top" headers="construct bounds">{@code \B}</td>
251 *     <td headers="matches">A non-word boundary</td></tr>
252 * <tr><td valign="top" headers="construct bounds">{@code \A}</td>
253 *     <td headers="matches">The beginning of the input</td></tr>
254 * <tr><td valign="top" headers="construct bounds">{@code \G}</td>
255 *     <td headers="matches">The end of the previous match</td></tr>
256 * <tr><td valign="top" headers="construct bounds">{@code \Z}</td>
257 *     <td headers="matches">The end of the input but for the final
258 *         <a href="#lt">terminator</a>, if&nbsp;any</td></tr>
259 * <tr><td valign="top" headers="construct bounds">{@code \z}</td>
260 *     <td headers="matches">The end of the input</td></tr>
261 *
262 * <tr><th>&nbsp;</th></tr>
263 * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
264 * <tr><td valign="top" headers="construct lineending">{@code \R}</td>
265 *     <td headers="matches">Any Unicode linebreak sequence, is equivalent to
266 *     <code>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]
267 *     </code></td></tr>
268 *
269 * <tr><th>&nbsp;</th></tr>
270 * <tr align="left"><th colspan="2" id="grapheme">Unicode Extended Grapheme matcher</th></tr>
271 * <tr><td valign="top" headers="construct grapheme">{@code \X}</td>
272 *     <td headers="matches">Any Unicode extended grapheme cluster</td></tr>
273 *
274 * <tr><th>&nbsp;</th></tr>
275 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
276 *
277 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
278 *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
279 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code *}</td>
280 *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
281 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code +}</td>
282 *     <td headers="matches"><i>X</i>, one or more times</td></tr>
283 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i><code>}</code></td>
284 *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
285 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}}</td>
286 *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
287 * <tr><td valign="top" headers="construct greedy"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></td>
288 *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
289 *
290 * <tr><th>&nbsp;</th></tr>
291 * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
292 *
293 * <tr><td valign="top" headers="construct reluc"><i>X</i>{@code ??}</td>
294 *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
295 * <tr><td valign="top" headers="construct reluc"><i>X</i>{@code *?}</td>
296 *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
297 * <tr><td valign="top" headers="construct reluc"><i>X</i>{@code +?}</td>
298 *     <td headers="matches"><i>X</i>, one or more times</td></tr>
299 * <tr><td valign="top" headers="construct reluc"><i>X</i><code>{</code><i>n</i><code>}?</code></td>
300 *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
301 * <tr><td valign="top" headers="construct reluc"><i>X</i><code>{</code><i>n</i><code>,}?</code></td>
302 *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
303 * <tr><td valign="top" headers="construct reluc"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}?</code></td>
304 *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
305 *
306 * <tr><th>&nbsp;</th></tr>
307 * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr>
308 *
309 * <tr><td valign="top" headers="construct poss"><i>X</i>{@code ?+}</td>
310 *     <td headers="matches"><i>X</i>, once or not at all</td></tr>
311 * <tr><td valign="top" headers="construct poss"><i>X</i>{@code *+}</td>
312 *     <td headers="matches"><i>X</i>, zero or more times</td></tr>
313 * <tr><td valign="top" headers="construct poss"><i>X</i>{@code ++}</td>
314 *     <td headers="matches"><i>X</i>, one or more times</td></tr>
315 * <tr><td valign="top" headers="construct poss"><i>X</i><code>{</code><i>n</i><code>}+</code></td>
316 *     <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
317 * <tr><td valign="top" headers="construct poss"><i>X</i><code>{</code><i>n</i><code>,}+</code></td>
318 *     <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
319 * <tr><td valign="top" headers="construct poss"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}+</code></td>
320 *     <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
321 *
322 * <tr><th>&nbsp;</th></tr>
323 * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
324 *
325 * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
326 *     <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
327 * <tr><td valign="top" headers="construct logical"><i>X</i>{@code |}<i>Y</i></td>
328 *     <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
329 * <tr><td valign="top" headers="construct logical">{@code (}<i>X</i>{@code )}</td>
330 *     <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
331 *
332 * <tr><th>&nbsp;</th></tr>
333 * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
334 *
335 * <tr><td valign="bottom" headers="construct backref">{@code \}<i>n</i></td>
336 *     <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
337 *     <a href="#cg">capturing group</a> matched</td></tr>
338 *
339 * <tr><td valign="bottom" headers="construct backref">{@code \}<i>k</i>&lt;<i>name</i>&gt;</td>
340 *     <td valign="bottom" headers="matches">Whatever the
341 *     <a href="#groupname">named-capturing group</a> "name" matched</td></tr>
342 *
343 * <tr><th>&nbsp;</th></tr>
344 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
345 *
346 * <tr><td valign="top" headers="construct quot">{@code \}</td>
347 *     <td headers="matches">Nothing, but quotes the following character</td></tr>
348 * <tr><td valign="top" headers="construct quot">{@code \Q}</td>
349 *     <td headers="matches">Nothing, but quotes all characters until {@code \E}</td></tr>
350 * <tr><td valign="top" headers="construct quot">{@code \E}</td>
351 *     <td headers="matches">Nothing, but ends quoting started by {@code \Q}</td></tr>
352 *     <!-- Metachars: !$()*+.<>?[\]^{|} -->
353 *
354 * <tr><th>&nbsp;</th></tr>
355 * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr>
356 *
357 * <tr><td valign="top" headers="construct special"><code>(?&lt;<a href="#groupname">name</a>&gt;</code><i>X</i>{@code )}</td>
358 *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
359 * <tr><td valign="top" headers="construct special">{@code (?:}<i>X</i>{@code )}</td>
360 *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
361 * <tr><td valign="top" headers="construct special"><code>(?idmsuxU-idmsuxU)&nbsp;</code></td>
362 *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
363 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
364 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
365 * on - off</td></tr>
366 * <tr><td valign="top" headers="construct special"><code>(?idmsux-idmsux:</code><i>X</i>{@code )}&nbsp;&nbsp;</td>
367 *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
368 *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
369 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
370 * <a href="#COMMENTS">x</a> on - off</td></tr>
371 * <tr><td valign="top" headers="construct special">{@code (?=}<i>X</i>{@code )}</td>
372 *     <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
373 * <tr><td valign="top" headers="construct special">{@code (?!}<i>X</i>{@code )}</td>
374 *     <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
375 * <tr><td valign="top" headers="construct special">{@code (?<=}<i>X</i>{@code )}</td>
376 *     <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
377 * <tr><td valign="top" headers="construct special">{@code (?<!}<i>X</i>{@code )}</td>
378 *     <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
379 * <tr><td valign="top" headers="construct special">{@code (?>}<i>X</i>{@code )}</td>
380 *     <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
381 *
382 * </table>
383 *
384 * <hr>
385 *
386 *
387 * <h3><a name="bs">Backslashes, escapes, and quoting</a></h3>
388 *
389 * <p> The backslash character ({@code '\'}) serves to introduce escaped
390 * constructs, as defined in the table above, as well as to quote characters
391 * that otherwise would be interpreted as unescaped constructs.  Thus the
392 * expression {@code \\} matches a single backslash and <code>\{</code> matches a
393 * left brace.
394 *
395 * <p> It is an error to use a backslash prior to any alphabetic character that
396 * does not denote an escaped construct; these are reserved for future
397 * extensions to the regular-expression language.  A backslash may be used
398 * prior to a non-alphabetic character regardless of whether that character is
399 * part of an unescaped construct.
400 *
401 * <p> Backslashes within string literals in Java source code are interpreted
402 * as required by
403 * <cite>The Java&trade; Language Specification</cite>
404 * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6)
405 * It is therefore necessary to double backslashes in string
406 * literals that represent regular expressions to protect them from
407 * interpretation by the Java bytecode compiler.  The string literal
408 * <code>"&#92;b"</code>, for example, matches a single backspace character when
409 * interpreted as a regular expression, while {@code "\\b"} matches a
410 * word boundary.  The string literal {@code "\(hello\)"} is illegal
411 * and leads to a compile-time error; in order to match the string
412 * {@code (hello)} the string literal {@code "\\(hello\\)"}
413 * must be used.
414 *
415 * <h3><a name="cc">Character Classes</a></h3>
416 *
417 *    <p> Character classes may appear within other character classes, and
418 *    may be composed by the union operator (implicit) and the intersection
419 *    operator ({@code &&}).
420 *    The union operator denotes a class that contains every character that is
421 *    in at least one of its operand classes.  The intersection operator
422 *    denotes a class that contains every character that is in both of its
423 *    operand classes.
424 *
425 *    <p> The precedence of character-class operators is as follows, from
426 *    highest to lowest:
427 *
428 *    <blockquote><table border="0" cellpadding="1" cellspacing="0"
429 *                 summary="Precedence of character class operators.">
430 *      <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
431 *        <td>Literal escape&nbsp;&nbsp;&nbsp;&nbsp;</td>
432 *        <td>{@code \x}</td></tr>
433 *     <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
434 *        <td>Grouping</td>
435 *        <td>{@code [...]}</td></tr>
436 *     <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
437 *        <td>Range</td>
438 *        <td>{@code a-z}</td></tr>
439 *      <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
440 *        <td>Union</td>
441 *        <td>{@code [a-e][i-u]}</td></tr>
442 *      <tr><th>5&nbsp;&nbsp;&nbsp;&nbsp;</th>
443 *        <td>Intersection</td>
444 *        <td>{@code [a-z&&[aeiou]]}</td></tr>
445 *    </table></blockquote>
446 *
447 *    <p> Note that a different set of metacharacters are in effect inside
448 *    a character class than outside a character class. For instance, the
449 *    regular expression {@code .} loses its special meaning inside a
450 *    character class, while the expression {@code -} becomes a range
451 *    forming metacharacter.
452 *
453 * <h3><a name="lt">Line terminators</a></h3>
454 *
455 * <p> A <i>line terminator</i> is a one- or two-character sequence that marks
456 * the end of a line of the input character sequence.  The following are
457 * recognized as line terminators:
458 *
459 * <ul>
460 *
461 *   <li> A newline (line feed) character&nbsp;({@code '\n'}),
462 *
463 *   <li> A carriage-return character followed immediately by a newline
464 *   character&nbsp;({@code "\r\n"}),
465 *
466 *   <li> A standalone carriage-return character&nbsp;({@code '\r'}),
467 *
468 *   <li> A next-line character&nbsp;(<code>'&#92;u0085'</code>),
469 *
470 *   <li> A line-separator character&nbsp;(<code>'&#92;u2028'</code>), or
471 *
472 *   <li> A paragraph-separator character&nbsp;(<code>'&#92;u2029'</code>).
473 *
474 * </ul>
475 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
476 * recognized are newline characters.
477 *
478 * <p> The regular expression {@code .} matches any character except a line
479 * terminator unless the {@link #DOTALL} flag is specified.
480 *
481 * <p> By default, the regular expressions {@code ^} and {@code $} ignore
482 * line terminators and only match at the beginning and the end, respectively,
483 * of the entire input sequence. If {@link #MULTILINE} mode is activated then
484 * {@code ^} matches at the beginning of input and after any line terminator
485 * except at the end of input. When in {@link #MULTILINE} mode {@code $}
486 * matches just before a line terminator or the end of the input sequence.
487 *
488 * <h3><a name="cg">Groups and capturing</a></h3>
489 *
490 * <h4><a name="gnumber">Group number</a></h4>
491 * <p> Capturing groups are numbered by counting their opening parentheses from
492 * left to right.  In the expression {@code ((A)(B(C)))}, for example, there
493 * are four such groups: </p>
494 *
495 * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
496 * <tr><th>1&nbsp;&nbsp;&nbsp;&nbsp;</th>
497 *     <td>{@code ((A)(B(C)))}</td></tr>
498 * <tr><th>2&nbsp;&nbsp;&nbsp;&nbsp;</th>
499 *     <td>{@code (A)}</td></tr>
500 * <tr><th>3&nbsp;&nbsp;&nbsp;&nbsp;</th>
501 *     <td>{@code (B(C))}</td></tr>
502 * <tr><th>4&nbsp;&nbsp;&nbsp;&nbsp;</th>
503 *     <td>{@code (C)}</td></tr>
504 * </table></blockquote>
505 *
506 * <p> Group zero always stands for the entire expression.
507 *
508 * <p> Capturing groups are so named because, during a match, each subsequence
509 * of the input sequence that matches such a group is saved.  The captured
510 * subsequence may be used later in the expression, via a back reference, and
511 * may also be retrieved from the matcher once the match operation is complete.
512 *
513 * <h4><a name="groupname">Group name</a></h4>
514 * <p>A capturing group can also be assigned a "name", a {@code named-capturing group},
515 * and then be back-referenced later by the "name". Group names are composed of
516 * the following characters. The first character must be a {@code letter}.
517 *
518 * <ul>
519 *   <li> The uppercase letters {@code 'A'} through {@code 'Z'}
520 *        (<code>'&#92;u0041'</code>&nbsp;through&nbsp;<code>'&#92;u005a'</code>),
521 *   <li> The lowercase letters {@code 'a'} through {@code 'z'}
522 *        (<code>'&#92;u0061'</code>&nbsp;through&nbsp;<code>'&#92;u007a'</code>),
523 *   <li> The digits {@code '0'} through {@code '9'}
524 *        (<code>'&#92;u0030'</code>&nbsp;through&nbsp;<code>'&#92;u0039'</code>),
525 * </ul>
526 *
527 * <p> A {@code named-capturing group} is still numbered as described in
528 * <a href="#gnumber">Group number</a>.
529 *
530 * <p> The captured input associated with a group is always the subsequence
531 * that the group most recently matched.  If a group is evaluated a second time
532 * because of quantification then its previously-captured value, if any, will
533 * be retained if the second evaluation fails.  Matching the string
534 * {@code "aba"} against the expression {@code (a(b)?)+}, for example, leaves
535 * group two set to {@code "b"}.  All captured input is discarded at the
536 * beginning of each match.
537 *
538 * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups
539 * that do not capture text and do not count towards the group total, or
540 * <i>named-capturing</i> group.
541 *
542 * <h3> Unicode support </h3>
543 *
544 * <p> This class is in conformance with Level 1 of <a
545 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
546 * Standard #18: Unicode Regular Expression</i></a>, plus RL2.1
547 * Canonical Equivalents.
548 * <p>
549 * <b>Unicode escape sequences</b> such as <code>&#92;u2014</code> in Java source code
550 * are processed as described in section 3.3 of
551 * <cite>The Java&trade; Language Specification</cite>.
552 * Such escape sequences are also implemented directly by the regular-expression
553 * parser so that Unicode escapes can be used in expressions that are read from
554 * files or from the keyboard.  Thus the strings <code>"&#92;u2014"</code> and
555 * {@code "\\u2014"}, while not equal, compile into the same pattern, which
556 * matches the character with hexadecimal value {@code 0x2014}.
557 * <p>
558 * A Unicode character can also be represented by using its <b>Hex notation</b>
559 * (hexadecimal code point value) directly as described in construct
560 * <code>&#92;x{...}</code>, for example a supplementary character U+2011F can be
561 * specified as <code>&#92;x{2011F}</code>, instead of two consecutive Unicode escape
562 * sequences of the surrogate pair <code>&#92;uD840</code><code>&#92;uDD1F</code>.
563 * <p>
564 * <b>Unicode character names</b> are supported by the named character construct
565 * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code>
566 * specifies character <code>&#92;u263A</code>. The character names supported
567 * by this class are the valid Unicode character names matched by
568 * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}.
569 * <p>
570 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
571 * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme
572 * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}.
573 * <p>
574 * Unicode scripts, blocks, categories and binary properties are written with
575 * the {@code \p} and {@code \P} constructs as in Perl.
576 * <code>\p{</code><i>prop</i><code>}</code> matches if
577 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
578 * does not match if the input has that property.
579 * <p>
580 * Scripts, blocks, categories and binary properties can be used both inside
581 * and outside of a character class.
582 *
583 * <p>
584 * <b><a name="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in
585 * {@code IsHiragana}, or by using  the {@code script} keyword (or its short
586 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
587 * <p>
588 * The script names supported by {@code Pattern} are the valid script names
589 * accepted and defined by
590 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
591 *
592 * <p>
593 * <b><a name="ubc">Blocks</a></b> are specified with the prefix {@code In}, as in
594 * {@code InMongolian}, or by using the keyword {@code block} (or its short
595 * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
596 * <p>
597 * The block names supported by {@code Pattern} are the valid block names
598 * accepted and defined by
599 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
600 * <p>
601 *
602 * <b><a name="ucc">Categories</a></b> may be specified with the optional prefix {@code Is}:
603 * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
604 * letters. Same as scripts and blocks, categories can also be specified
605 * by using the keyword {@code general_category} (or its short form
606 * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
607 * <p>
608 * The supported categories are those of
609 * <a href="http://www.unicode.org/unicode/standard/standard.html">
610 * <i>The Unicode Standard</i></a> in the version specified by the
611 * {@link java.lang.Character Character} class. The category names are those
612 * defined in the Standard, both normative and informative.
613 * <p>
614 *
615 * <b><a name="ubpc">Binary properties</a></b> are specified with the prefix {@code Is}, as in
616 * {@code IsAlphabetic}. The supported binary properties by {@code Pattern}
617 * are
618 * <ul>
619 *   <li> Alphabetic
620 *   <li> Ideographic
621 *   <li> Letter
622 *   <li> Lowercase
623 *   <li> Uppercase
624 *   <li> Titlecase
625 *   <li> Punctuation
626 *   <Li> Control
627 *   <li> White_Space
628 *   <li> Digit
629 *   <li> Hex_Digit
630 *   <li> Join_Control
631 *   <li> Noncharacter_Code_Point
632 *   <li> Assigned
633 * </ul>
634 * <p>
635 * The following <b>Predefined Character classes</b> and <b>POSIX character classes</b>
636 * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
637 * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression
638 * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
639 *
640 * <table border="0" cellpadding="1" cellspacing="0"
641 *  summary="predefined and posix character classes in Unicode mode">
642 * <tr align="left">
643 * <th align="left" id="predef_classes">Classes</th>
644 * <th align="left" id="predef_matches">Matches</th>
645 *</tr>
646 * <tr><td>{@code \p{Lower}}</td>
647 *     <td>A lowercase character:{@code \p{IsLowercase}}</td></tr>
648 * <tr><td>{@code \p{Upper}}</td>
649 *     <td>An uppercase character:{@code \p{IsUppercase}}</td></tr>
650 * <tr><td>{@code \p{ASCII}}</td>
651 *     <td>All ASCII:{@code [\x00-\x7F]}</td></tr>
652 * <tr><td>{@code \p{Alpha}}</td>
653 *     <td>An alphabetic character:{@code \p{IsAlphabetic}}</td></tr>
654 * <tr><td>{@code \p{Digit}}</td>
655 *     <td>A decimal digit character:{@code p{IsDigit}}</td></tr>
656 * <tr><td>{@code \p{Alnum}}</td>
657 *     <td>An alphanumeric character:{@code [\p{IsAlphabetic}\p{IsDigit}]}</td></tr>
658 * <tr><td>{@code \p{Punct}}</td>
659 *     <td>A punctuation character:{@code p{IsPunctuation}}</td></tr>
660 * <tr><td>{@code \p{Graph}}</td>
661 *     <td>A visible character: {@code [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]}</td></tr>
662 * <tr><td>{@code \p{Print}}</td>
663 *     <td>A printable character: {@code [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]}</td></tr>
664 * <tr><td>{@code \p{Blank}}</td>
665 *     <td>A space or a tab: {@code [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]}</td></tr>
666 * <tr><td>{@code \p{Cntrl}}</td>
667 *     <td>A control character: {@code \p{gc=Cc}}</td></tr>
668 * <tr><td>{@code \p{XDigit}}</td>
669 *     <td>A hexadecimal digit: {@code [\p{gc=Nd}\p{IsHex_Digit}]}</td></tr>
670 * <tr><td>{@code \p{Space}}</td>
671 *     <td>A whitespace character:{@code \p{IsWhite_Space}}</td></tr>
672 * <tr><td>{@code \d}</td>
673 *     <td>A digit: {@code \p{IsDigit}}</td></tr>
674 * <tr><td>{@code \D}</td>
675 *     <td>A non-digit: {@code [^\d]}</td></tr>
676 * <tr><td>{@code \s}</td>
677 *     <td>A whitespace character: {@code \p{IsWhite_Space}}</td></tr>
678 * <tr><td>{@code \S}</td>
679 *     <td>A non-whitespace character: {@code [^\s]}</td></tr>
680 * <tr><td>{@code \w}</td>
681 *     <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr>
682 * <tr><td>{@code \W}</td>
683 *     <td>A non-word character: {@code [^\w]}</td></tr>
684 * </table>
685 * <p>
686 * <a name="jcc">
687 * Categories that behave like the java.lang.Character
688 * boolean is<i>methodname</i> methods (except for the deprecated ones) are
689 * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where
690 * the specified property has the name <code>java<i>methodname</i></code></a>.
691 *
692 * <h3> Comparison to Perl 5 </h3>
693 *
694 * <p>The {@code Pattern} engine performs traditional NFA-based matching
695 * with ordered alternation as occurs in Perl 5.
696 *
697 * <p> Perl constructs not supported by this class: </p>
698 *
699 * <ul>
700 *    <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
701 *    the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and
702 *    <code>\g{</code><i>name</i><code>}</code> for
703 *    <a href="#groupname">named-capturing group</a>.
704 *    </p></li>
705 *
706 *    <li><p> The conditional constructs
707 *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
708 *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
709 *    </p></li>
710 *
711 *    <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code>
712 *    and <code>(??{</code><i>code</i><code>})</code>,</p></li>
713 *
714 *    <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li>
715 *
716 *    <li><p> The preprocessing operations {@code \l} <code>&#92;u</code>,
717 *    {@code \L}, and {@code \U}.  </p></li>
718 *
719 * </ul>
720 *
721 * <p> Constructs supported by this class but not by Perl: </p>
722 *
723 * <ul>
724 *
725 *    <li><p> Character-class union and intersection as described
726 *    <a href="#cc">above</a>.</p></li>
727 *
728 * </ul>
729 *
730 * <p> Notable differences from Perl: </p>
731 *
732 * <ul>
733 *
734 *    <li><p> In Perl, {@code \1} through {@code \9} are always interpreted
735 *    as back references; a backslash-escaped number greater than {@code 9} is
736 *    treated as a back reference if at least that many subexpressions exist,
737 *    otherwise it is interpreted, if possible, as an octal escape.  In this
738 *    class octal escapes must always begin with a zero. In this class,
739 *    {@code \1} through {@code \9} are always interpreted as back
740 *    references, and a larger number is accepted as a back reference if at
741 *    least that many subexpressions exist at that point in the regular
742 *    expression, otherwise the parser will drop digits until the number is
743 *    smaller or equal to the existing number of groups or it is one digit.
744 *    </p></li>
745 *
746 *    <li><p> Perl uses the {@code g} flag to request a match that resumes
747 *    where the last match left off.  This functionality is provided implicitly
748 *    by the {@link Matcher} class: Repeated invocations of the {@link
749 *    Matcher#find find} method will resume where the last match left off,
750 *    unless the matcher is reset.  </p></li>
751 *
752 *    <li><p> In Perl, embedded flags at the top level of an expression affect
753 *    the whole expression.  In this class, embedded flags always take effect
754 *    at the point at which they appear, whether they are at the top level or
755 *    within a group; in the latter case, flags are restored at the end of the
756 *    group just as in Perl.  </p></li>
757 *
758 * </ul>
759 *
760 *
761 * <p> For a more precise description of the behavior of regular expression
762 * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/">
763 * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl,
764 * O'Reilly and Associates, 2006.</a>
765 * </p>
766 *
767 * @see java.lang.String#split(String, int)
768 * @see java.lang.String#split(String)
769 *
770 * @author      Mike McCloskey
771 * @author      Mark Reinhold
772 * @author      JSR-51 Expert Group
773 * @since       1.4
774 * @spec        JSR-51
775 */
776
777public final class Pattern
778    implements java.io.Serializable
779{
780
781    /**
782     * Regular expression modifier values.  Instead of being passed as
783     * arguments, they can also be passed as inline modifiers.
784     * For example, the following statements have the same effect.
785     * <pre>
786     * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
787     * RegExp r2 = RegExp.compile("(?im)abc", 0);
788     * </pre>
789     *
790     * The flags are duplicated so that the familiar Perl match flag
791     * names are available.
792     */
793
794    /**
795     * Enables Unix lines mode.
796     *
797     * <p> In this mode, only the {@code '\n'} line terminator is recognized
798     * in the behavior of {@code .}, {@code ^}, and {@code $}.
799     *
800     * <p> Unix lines mode can also be enabled via the embedded flag
801     * expression&nbsp;{@code (?d)}.
802     */
803    public static final int UNIX_LINES = 0x01;
804
805    /**
806     * Enables case-insensitive matching.
807     *
808     * <p> By default, case-insensitive matching assumes that only characters
809     * in the US-ASCII charset are being matched.  Unicode-aware
810     * case-insensitive matching can be enabled by specifying the {@link
811     * #UNICODE_CASE} flag in conjunction with this flag.
812     *
813     * <p> Case-insensitive matching can also be enabled via the embedded flag
814     * expression&nbsp;{@code (?i)}.
815     *
816     * <p> Specifying this flag may impose a slight performance penalty.  </p>
817     */
818    public static final int CASE_INSENSITIVE = 0x02;
819
820    /**
821     * Permits whitespace and comments in pattern.
822     *
823     * <p> In this mode, whitespace is ignored, and embedded comments starting
824     * with {@code #} are ignored until the end of a line.
825     *
826     * <p> Comments mode can also be enabled via the embedded flag
827     * expression&nbsp;{@code (?x)}.
828     */
829    public static final int COMMENTS = 0x04;
830
831    /**
832     * Enables multiline mode.
833     *
834     * <p> In multiline mode the expressions {@code ^} and {@code $} match
835     * just after or just before, respectively, a line terminator or the end of
836     * the input sequence.  By default these expressions only match at the
837     * beginning and the end of the entire input sequence.
838     *
839     * <p> Multiline mode can also be enabled via the embedded flag
840     * expression&nbsp;{@code (?m)}.  </p>
841     */
842    public static final int MULTILINE = 0x08;
843
844    /**
845     * Enables literal parsing of the pattern.
846     *
847     * <p> When this flag is specified then the input string that specifies
848     * the pattern is treated as a sequence of literal characters.
849     * Metacharacters or escape sequences in the input sequence will be
850     * given no special meaning.
851     *
852     * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
853     * matching when used in conjunction with this flag. The other flags
854     * become superfluous.
855     *
856     * <p> There is no embedded flag character for enabling literal parsing.
857     * @since 1.5
858     */
859    public static final int LITERAL = 0x10;
860
861    /**
862     * Enables dotall mode.
863     *
864     * <p> In dotall mode, the expression {@code .} matches any character,
865     * including a line terminator.  By default this expression does not match
866     * line terminators.
867     *
868     * <p> Dotall mode can also be enabled via the embedded flag
869     * expression&nbsp;{@code (?s)}.  (The {@code s} is a mnemonic for
870     * "single-line" mode, which is what this is called in Perl.)  </p>
871     */
872    public static final int DOTALL = 0x20;
873
874    /**
875     * Enables Unicode-aware case folding.
876     *
877     * <p> When this flag is specified then case-insensitive matching, when
878     * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
879     * consistent with the Unicode Standard.  By default, case-insensitive
880     * matching assumes that only characters in the US-ASCII charset are being
881     * matched.
882     *
883     * <p> Unicode-aware case folding can also be enabled via the embedded flag
884     * expression&nbsp;{@code (?u)}.
885     *
886     * <p> Specifying this flag may impose a performance penalty.  </p>
887     */
888    public static final int UNICODE_CASE = 0x40;
889
890    /**
891     * Enables canonical equivalence.
892     *
893     * <p> When this flag is specified then two characters will be considered
894     * to match if, and only if, their full canonical decompositions match.
895     * The expression <code>"a&#92;u030A"</code>, for example, will match the
896     * string <code>"&#92;u00E5"</code> when this flag is specified.  By default,
897     * matching does not take canonical equivalence into account.
898     *
899     * <p> There is no embedded flag character for enabling canonical
900     * equivalence.
901     *
902     * <p> Specifying this flag may impose a performance penalty.  </p>
903     */
904    public static final int CANON_EQ = 0x80;
905
906    /**
907     * Enables the Unicode version of <i>Predefined character classes</i> and
908     * <i>POSIX character classes</i>.
909     *
910     * <p> When this flag is specified then the (US-ASCII only)
911     * <i>Predefined character classes</i> and <i>POSIX character classes</i>
912     * are in conformance with
913     * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
914     * Standard #18: Unicode Regular Expression</i></a>
915     * <i>Annex C: Compatibility Properties</i>.
916     * <p>
917     * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
918     * flag expression&nbsp;{@code (?U)}.
919     * <p>
920     * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
921     * folding.
922     * <p>
923     * Specifying this flag may impose a performance penalty.  </p>
924     * @since 1.7
925     */
926    public static final int UNICODE_CHARACTER_CLASS = 0x100;
927
928    /**
929     * Contains all possible flags for compile(regex, flags).
930     */
931    private static final int ALL_FLAGS = CASE_INSENSITIVE | MULTILINE |
932            DOTALL | UNICODE_CASE | CANON_EQ | UNIX_LINES | LITERAL |
933            UNICODE_CHARACTER_CLASS | COMMENTS;
934
935    /* Pattern has only two serialized components: The pattern string
936     * and the flags, which are all that is needed to recompile the pattern
937     * when it is deserialized.
938     */
939
940    /** use serialVersionUID from Merlin b59 for interoperability */
941    private static final long serialVersionUID = 5073258162644648461L;
942
943    /**
944     * The original regular-expression pattern string.
945     *
946     * @serial
947     */
948    private String pattern;
949
950    /**
951     * The original pattern flags.
952     *
953     * @serial
954     */
955    private int flags;
956
957    /**
958     * Boolean indicating this Pattern is compiled; this is necessary in order
959     * to lazily compile deserialized Patterns.
960     */
961    private transient volatile boolean compiled;
962
963    /**
964     * The normalized pattern string.
965     */
966    private transient String normalizedPattern;
967
968    /**
969     * The starting point of state machine for the find operation.  This allows
970     * a match to start anywhere in the input.
971     */
972    transient Node root;
973
974    /**
975     * The root of object tree for a match operation.  The pattern is matched
976     * at the beginning.  This may include a find that uses BnM or a First
977     * node.
978     */
979    transient Node matchRoot;
980
981    /**
982     * Temporary storage used by parsing pattern slice.
983     */
984    transient int[] buffer;
985
986    /**
987     * Map the "name" of the "named capturing group" to its group id
988     * node.
989     */
990    transient volatile Map<String, Integer> namedGroups;
991
992    /**
993     * Temporary storage used while parsing group references.
994     */
995    transient GroupHead[] groupNodes;
996
997    /**
998     * Temporary null terminated code point array used by pattern compiling.
999     */
1000    private transient int[] temp;
1001
1002    /**
1003     * The number of capturing groups in this Pattern. Used by matchers to
1004     * allocate storage needed to perform a match.
1005     */
1006    transient int capturingGroupCount;
1007
1008    /**
1009     * The local variable count used by parsing tree. Used by matchers to
1010     * allocate storage needed to perform a match.
1011     */
1012    transient int localCount;
1013
1014    /**
1015     * Index into the pattern string that keeps track of how much has been
1016     * parsed.
1017     */
1018    private transient int cursor;
1019
1020    /**
1021     * Holds the length of the pattern string.
1022     */
1023    private transient int patternLength;
1024
1025    /**
1026     * If the Start node might possibly match supplementary characters.
1027     * It is set to true during compiling if
1028     * (1) There is supplementary char in pattern, or
1029     * (2) There is complement node of Category or Block
1030     */
1031    private transient boolean hasSupplementary;
1032
1033    /**
1034     * Compiles the given regular expression into a pattern.
1035     *
1036     * @param  regex
1037     *         The expression to be compiled
1038     * @return the given regular expression compiled into a pattern
1039     * @throws  PatternSyntaxException
1040     *          If the expression's syntax is invalid
1041     */
1042    public static Pattern compile(String regex) {
1043        return new Pattern(regex, 0);
1044    }
1045
1046    /**
1047     * Compiles the given regular expression into a pattern with the given
1048     * flags.
1049     *
1050     * @param  regex
1051     *         The expression to be compiled
1052     *
1053     * @param  flags
1054     *         Match flags, a bit mask that may include
1055     *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
1056     *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
1057     *         {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
1058     *         and {@link #COMMENTS}
1059     *
1060     * @return the given regular expression compiled into a pattern with the given flags
1061     * @throws  IllegalArgumentException
1062     *          If bit values other than those corresponding to the defined
1063     *          match flags are set in {@code flags}
1064     *
1065     * @throws  PatternSyntaxException
1066     *          If the expression's syntax is invalid
1067     */
1068    public static Pattern compile(String regex, int flags) {
1069        return new Pattern(regex, flags);
1070    }
1071
1072    /**
1073     * Returns the regular expression from which this pattern was compiled.
1074     *
1075     * @return  The source of this pattern
1076     */
1077    public String pattern() {
1078        return pattern;
1079    }
1080
1081    /**
1082     * <p>Returns the string representation of this pattern. This
1083     * is the regular expression from which this pattern was
1084     * compiled.</p>
1085     *
1086     * @return  The string representation of this pattern
1087     * @since 1.5
1088     */
1089    public String toString() {
1090        return pattern;
1091    }
1092
1093    /**
1094     * Creates a matcher that will match the given input against this pattern.
1095     *
1096     * @param  input
1097     *         The character sequence to be matched
1098     *
1099     * @return  A new matcher for this pattern
1100     */
1101    public Matcher matcher(CharSequence input) {
1102        if (!compiled) {
1103            synchronized(this) {
1104                if (!compiled)
1105                    compile();
1106            }
1107        }
1108        Matcher m = new Matcher(this, input);
1109        return m;
1110    }
1111
1112    /**
1113     * Returns this pattern's match flags.
1114     *
1115     * @return  The match flags specified when this pattern was compiled
1116     */
1117    public int flags() {
1118        return flags;
1119    }
1120
1121    /**
1122     * Compiles the given regular expression and attempts to match the given
1123     * input against it.
1124     *
1125     * <p> An invocation of this convenience method of the form
1126     *
1127     * <blockquote><pre>
1128     * Pattern.matches(regex, input);</pre></blockquote>
1129     *
1130     * behaves in exactly the same way as the expression
1131     *
1132     * <blockquote><pre>
1133     * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
1134     *
1135     * <p> If a pattern is to be used multiple times, compiling it once and reusing
1136     * it will be more efficient than invoking this method each time.  </p>
1137     *
1138     * @param  regex
1139     *         The expression to be compiled
1140     *
1141     * @param  input
1142     *         The character sequence to be matched
1143     * @return whether or not the regular expression matches on the input
1144     * @throws  PatternSyntaxException
1145     *          If the expression's syntax is invalid
1146     */
1147    public static boolean matches(String regex, CharSequence input) {
1148        Pattern p = Pattern.compile(regex);
1149        Matcher m = p.matcher(input);
1150        return m.matches();
1151    }
1152
1153    /**
1154     * Splits the given input sequence around matches of this pattern.
1155     *
1156     * <p> The array returned by this method contains each substring of the
1157     * input sequence that is terminated by another subsequence that matches
1158     * this pattern or is terminated by the end of the input sequence.  The
1159     * substrings in the array are in the order in which they occur in the
1160     * input. If this pattern does not match any subsequence of the input then
1161     * the resulting array has just one element, namely the input sequence in
1162     * string form.
1163     *
1164     * <p> When there is a positive-width match at the beginning of the input
1165     * sequence then an empty leading substring is included at the beginning
1166     * of the resulting array. A zero-width match at the beginning however
1167     * never produces such empty leading substring.
1168     *
1169     * <p> The {@code limit} parameter controls the number of times the
1170     * pattern is applied and therefore affects the length of the resulting
1171     * array.  If the limit <i>n</i> is greater than zero then the pattern
1172     * will be applied at most <i>n</i>&nbsp;-&nbsp;1 times, the array's
1173     * length will be no greater than <i>n</i>, and the array's last entry
1174     * will contain all input beyond the last matched delimiter.  If <i>n</i>
1175     * is non-positive then the pattern will be applied as many times as
1176     * possible and the array can have any length.  If <i>n</i> is zero then
1177     * the pattern will be applied as many times as possible, the array can
1178     * have any length, and trailing empty strings will be discarded.
1179     *
1180     * <p> The input {@code "boo:and:foo"}, for example, yields the following
1181     * results with these parameters:
1182     *
1183     * <blockquote><table cellpadding=1 cellspacing=0
1184     *              summary="Split examples showing regex, limit, and result">
1185     * <tr><th align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
1186     *     <th align="left"><i>Limit&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
1187     *     <th align="left"><i>Result&nbsp;&nbsp;&nbsp;&nbsp;</i></th></tr>
1188     * <tr><td align=center>:</td>
1189     *     <td align=center>2</td>
1190     *     <td>{@code { "boo", "and:foo" }}</td></tr>
1191     * <tr><td align=center>:</td>
1192     *     <td align=center>5</td>
1193     *     <td>{@code { "boo", "and", "foo" }}</td></tr>
1194     * <tr><td align=center>:</td>
1195     *     <td align=center>-2</td>
1196     *     <td>{@code { "boo", "and", "foo" }}</td></tr>
1197     * <tr><td align=center>o</td>
1198     *     <td align=center>5</td>
1199     *     <td>{@code { "b", "", ":and:f", "", "" }}</td></tr>
1200     * <tr><td align=center>o</td>
1201     *     <td align=center>-2</td>
1202     *     <td>{@code { "b", "", ":and:f", "", "" }}</td></tr>
1203     * <tr><td align=center>o</td>
1204     *     <td align=center>0</td>
1205     *     <td>{@code { "b", "", ":and:f" }}</td></tr>
1206     * </table></blockquote>
1207     *
1208     * @param  input
1209     *         The character sequence to be split
1210     *
1211     * @param  limit
1212     *         The result threshold, as described above
1213     *
1214     * @return  The array of strings computed by splitting the input
1215     *          around matches of this pattern
1216     */
1217    public String[] split(CharSequence input, int limit) {
1218        int index = 0;
1219        boolean matchLimited = limit > 0;
1220        ArrayList<String> matchList = new ArrayList<>();
1221        Matcher m = matcher(input);
1222
1223        // Add segments before each match found
1224        while(m.find()) {
1225            if (!matchLimited || matchList.size() < limit - 1) {
1226                if (index == 0 && index == m.start() && m.start() == m.end()) {
1227                    // no empty leading substring included for zero-width match
1228                    // at the beginning of the input char sequence.
1229                    continue;
1230                }
1231                String match = input.subSequence(index, m.start()).toString();
1232                matchList.add(match);
1233                index = m.end();
1234            } else if (matchList.size() == limit - 1) { // last one
1235                String match = input.subSequence(index,
1236                                                 input.length()).toString();
1237                matchList.add(match);
1238                index = m.end();
1239            }
1240        }
1241
1242        // If no match was found, return this
1243        if (index == 0)
1244            return new String[] {input.toString()};
1245
1246        // Add remaining segment
1247        if (!matchLimited || matchList.size() < limit)
1248            matchList.add(input.subSequence(index, input.length()).toString());
1249
1250        // Construct result
1251        int resultSize = matchList.size();
1252        if (limit == 0)
1253            while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
1254                resultSize--;
1255        String[] result = new String[resultSize];
1256        return matchList.subList(0, resultSize).toArray(result);
1257    }
1258
1259    /**
1260     * Splits the given input sequence around matches of this pattern.
1261     *
1262     * <p> This method works as if by invoking the two-argument {@link
1263     * #split(java.lang.CharSequence, int) split} method with the given input
1264     * sequence and a limit argument of zero.  Trailing empty strings are
1265     * therefore not included in the resulting array. </p>
1266     *
1267     * <p> The input {@code "boo:and:foo"}, for example, yields the following
1268     * results with these expressions:
1269     *
1270     * <blockquote><table cellpadding=1 cellspacing=0
1271     *              summary="Split examples showing regex and result">
1272     * <tr><th align="left"><i>Regex&nbsp;&nbsp;&nbsp;&nbsp;</i></th>
1273     *     <th align="left"><i>Result</i></th></tr>
1274     * <tr><td align=center>:</td>
1275     *     <td>{@code { "boo", "and", "foo" }}</td></tr>
1276     * <tr><td align=center>o</td>
1277     *     <td>{@code { "b", "", ":and:f" }}</td></tr>
1278     * </table></blockquote>
1279     *
1280     *
1281     * @param  input
1282     *         The character sequence to be split
1283     *
1284     * @return  The array of strings computed by splitting the input
1285     *          around matches of this pattern
1286     */
1287    public String[] split(CharSequence input) {
1288        return split(input, 0);
1289    }
1290
1291    /**
1292     * Returns a literal pattern {@code String} for the specified
1293     * {@code String}.
1294     *
1295     * <p>This method produces a {@code String} that can be used to
1296     * create a {@code Pattern} that would match the string
1297     * {@code s} as if it were a literal pattern.</p> Metacharacters
1298     * or escape sequences in the input sequence will be given no special
1299     * meaning.
1300     *
1301     * @param  s The string to be literalized
1302     * @return  A literal string replacement
1303     * @since 1.5
1304     */
1305    public static String quote(String s) {
1306        int slashEIndex = s.indexOf("\\E");
1307        if (slashEIndex == -1)
1308            return "\\Q" + s + "\\E";
1309
1310        int lenHint = s.length();
1311        lenHint = (lenHint < Integer.MAX_VALUE - 8 - lenHint) ?
1312                (lenHint << 1) : (Integer.MAX_VALUE - 8);
1313
1314        StringBuilder sb = new StringBuilder(lenHint);
1315        sb.append("\\Q");
1316        int current = 0;
1317        do {
1318            sb.append(s, current, slashEIndex)
1319                    .append("\\E\\\\E\\Q");
1320            current = slashEIndex + 2;
1321        } while ((slashEIndex = s.indexOf("\\E", current)) != -1);
1322
1323        return sb.append(s, current, s.length())
1324                .append("\\E")
1325                .toString();
1326    }
1327
1328    /**
1329     * Recompile the Pattern instance from a stream.  The original pattern
1330     * string is read in and the object tree is recompiled from it.
1331     */
1332    private void readObject(java.io.ObjectInputStream s)
1333        throws java.io.IOException, ClassNotFoundException {
1334
1335        // Read in all fields
1336        s.defaultReadObject();
1337
1338        // Initialize counts
1339        capturingGroupCount = 1;
1340        localCount = 0;
1341
1342        // if length > 0, the Pattern is lazily compiled
1343        if (pattern.length() == 0) {
1344            root = new Start(lastAccept);
1345            matchRoot = lastAccept;
1346            compiled = true;
1347        }
1348    }
1349
1350    /**
1351     * This private constructor is used to create all Patterns. The pattern
1352     * string and match flags are all that is needed to completely describe
1353     * a Pattern. An empty pattern string results in an object tree with
1354     * only a Start node and a LastNode node.
1355     */
1356    private Pattern(String p, int f) {
1357        if ((f & ~ALL_FLAGS) != 0) {
1358            throw new IllegalArgumentException("Unknown flag 0x"
1359                                               + Integer.toHexString(f));
1360        }
1361        pattern = p;
1362        flags = f;
1363
1364        // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
1365        if ((flags & UNICODE_CHARACTER_CLASS) != 0)
1366            flags |= UNICODE_CASE;
1367
1368        // Reset group index count
1369        capturingGroupCount = 1;
1370        localCount = 0;
1371
1372        if (pattern.length() > 0) {
1373            compile();
1374        } else {
1375            root = new Start(lastAccept);
1376            matchRoot = lastAccept;
1377        }
1378    }
1379
1380    /**
1381     * The pattern is converted to normalized form ({@linkplain
1382     * java.text.Normalizer.Form.NFD NFD}, canonical decomposition)
1383     * and then a pure group is constructed to match canonical
1384     * equivalences of the characters.
1385     */
1386    private void normalize() {
1387        int lastCodePoint = -1;
1388
1389        // Convert pattern into normalized form
1390        normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
1391        patternLength = normalizedPattern.length();
1392
1393        // Modify pattern to match canonical equivalences
1394        StringBuilder newPattern = new StringBuilder(patternLength);
1395        for(int i=0; i<patternLength; ) {
1396            int c = normalizedPattern.codePointAt(i);
1397            StringBuilder sequenceBuffer;
1398            if ((Character.getType(c) == Character.NON_SPACING_MARK)
1399                && (lastCodePoint != -1)) {
1400                sequenceBuffer = new StringBuilder();
1401                sequenceBuffer.appendCodePoint(lastCodePoint);
1402                sequenceBuffer.appendCodePoint(c);
1403                while(Character.getType(c) == Character.NON_SPACING_MARK) {
1404                    i += Character.charCount(c);
1405                    if (i >= patternLength)
1406                        break;
1407                    c = normalizedPattern.codePointAt(i);
1408                    sequenceBuffer.appendCodePoint(c);
1409                }
1410                String ea = produceEquivalentAlternation(
1411                                               sequenceBuffer.toString());
1412                newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
1413                newPattern.append("(?:").append(ea).append(")");
1414            } else if (c == '[' && lastCodePoint != '\\') {
1415                i = normalizeCharClass(newPattern, i);
1416            } else {
1417                newPattern.appendCodePoint(c);
1418            }
1419            lastCodePoint = c;
1420            i += Character.charCount(c);
1421        }
1422        normalizedPattern = newPattern.toString();
1423    }
1424
1425    /**
1426     * Complete the character class being parsed and add a set
1427     * of alternations to it that will match the canonical equivalences
1428     * of the characters within the class.
1429     */
1430    private int normalizeCharClass(StringBuilder newPattern, int i) {
1431        StringBuilder charClass = new StringBuilder();
1432        StringBuilder eq = null;
1433        int lastCodePoint = -1;
1434        String result;
1435
1436        i++;
1437        charClass.append("[");
1438        while(true) {
1439            int c = normalizedPattern.codePointAt(i);
1440            StringBuilder sequenceBuffer;
1441
1442            if (c == ']' && lastCodePoint != '\\') {
1443                charClass.append((char)c);
1444                break;
1445            } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
1446                sequenceBuffer = new StringBuilder();
1447                sequenceBuffer.appendCodePoint(lastCodePoint);
1448                while(Character.getType(c) == Character.NON_SPACING_MARK) {
1449                    sequenceBuffer.appendCodePoint(c);
1450                    i += Character.charCount(c);
1451                    if (i >= normalizedPattern.length())
1452                        break;
1453                    c = normalizedPattern.codePointAt(i);
1454                }
1455                String ea = produceEquivalentAlternation(
1456                                                  sequenceBuffer.toString());
1457
1458                charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
1459                if (eq == null)
1460                    eq = new StringBuilder();
1461                eq.append('|');
1462                eq.append(ea);
1463            } else {
1464                charClass.appendCodePoint(c);
1465                i++;
1466            }
1467            if (i == normalizedPattern.length())
1468                throw error("Unclosed character class");
1469            lastCodePoint = c;
1470        }
1471
1472        if (eq != null) {
1473            result = "(?:"+charClass.toString()+eq.toString()+")";
1474        } else {
1475            result = charClass.toString();
1476        }
1477
1478        newPattern.append(result);
1479        return i;
1480    }
1481
1482    /**
1483     * Given a specific sequence composed of a regular character and
1484     * combining marks that follow it, produce the alternation that will
1485     * match all canonical equivalences of that sequence.
1486     */
1487    private String produceEquivalentAlternation(String source) {
1488        int len = countChars(source, 0, 1);
1489        if (source.length() == len)
1490            // source has one character.
1491            return source;
1492
1493        String base = source.substring(0,len);
1494        String combiningMarks = source.substring(len);
1495
1496        String[] perms = producePermutations(combiningMarks);
1497        StringBuilder result = new StringBuilder(source);
1498
1499        // Add combined permutations
1500        for(int x=0; x<perms.length; x++) {
1501            String next = base + perms[x];
1502            if (x>0)
1503                result.append("|"+next);
1504            next = composeOneStep(next);
1505            if (next != null)
1506                result.append("|"+produceEquivalentAlternation(next));
1507        }
1508        return result.toString();
1509    }
1510
1511    /**
1512     * Returns an array of strings that have all the possible
1513     * permutations of the characters in the input string.
1514     * This is used to get a list of all possible orderings
1515     * of a set of combining marks. Note that some of the permutations
1516     * are invalid because of combining class collisions, and these
1517     * possibilities must be removed because they are not canonically
1518     * equivalent.
1519     */
1520    private String[] producePermutations(String input) {
1521        if (input.length() == countChars(input, 0, 1))
1522            return new String[] {input};
1523
1524        if (input.length() == countChars(input, 0, 2)) {
1525            int c0 = Character.codePointAt(input, 0);
1526            int c1 = Character.codePointAt(input, Character.charCount(c0));
1527            if (getClass(c1) == getClass(c0)) {
1528                return new String[] {input};
1529            }
1530            String[] result = new String[2];
1531            result[0] = input;
1532            StringBuilder sb = new StringBuilder(2);
1533            sb.appendCodePoint(c1);
1534            sb.appendCodePoint(c0);
1535            result[1] = sb.toString();
1536            return result;
1537        }
1538
1539        int length = 1;
1540        int nCodePoints = countCodePoints(input);
1541        for(int x=1; x<nCodePoints; x++)
1542            length = length * (x+1);
1543
1544        String[] temp = new String[length];
1545
1546        int combClass[] = new int[nCodePoints];
1547        for(int x=0, i=0; x<nCodePoints; x++) {
1548            int c = Character.codePointAt(input, i);
1549            combClass[x] = getClass(c);
1550            i +=  Character.charCount(c);
1551        }
1552
1553        // For each char, take it out and add the permutations
1554        // of the remaining chars
1555        int index = 0;
1556        int len;
1557        // offset maintains the index in code units.
1558loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
1559            len = countChars(input, offset, 1);
1560            for(int y=x-1; y>=0; y--) {
1561                if (combClass[y] == combClass[x]) {
1562                    continue loop;
1563                }
1564            }
1565            StringBuilder sb = new StringBuilder(input);
1566            String otherChars = sb.delete(offset, offset+len).toString();
1567            String[] subResult = producePermutations(otherChars);
1568
1569            String prefix = input.substring(offset, offset+len);
1570            for (String sre : subResult)
1571                temp[index++] = prefix + sre;
1572        }
1573        String[] result = new String[index];
1574        System.arraycopy(temp, 0, result, 0, index);
1575        return result;
1576    }
1577
1578    private int getClass(int c) {
1579        return sun.text.Normalizer.getCombiningClass(c);
1580    }
1581
1582    /**
1583     * Attempts to compose input by combining the first character
1584     * with the first combining mark following it. Returns a String
1585     * that is the composition of the leading character with its first
1586     * combining mark followed by the remaining combining marks. Returns
1587     * null if the first two characters cannot be further composed.
1588     */
1589    private String composeOneStep(String input) {
1590        int len = countChars(input, 0, 2);
1591        String firstTwoCharacters = input.substring(0, len);
1592        String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
1593
1594        if (result.equals(firstTwoCharacters))
1595            return null;
1596        else {
1597            String remainder = input.substring(len);
1598            return result + remainder;
1599        }
1600    }
1601
1602    /**
1603     * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
1604     * See the description of `quotemeta' in perlfunc(1).
1605     */
1606    private void RemoveQEQuoting() {
1607        final int pLen = patternLength;
1608        int i = 0;
1609        while (i < pLen-1) {
1610            if (temp[i] != '\\')
1611                i += 1;
1612            else if (temp[i + 1] != 'Q')
1613                i += 2;
1614            else
1615                break;
1616        }
1617        if (i >= pLen - 1)    // No \Q sequence found
1618            return;
1619        int j = i;
1620        i += 2;
1621        int[] newtemp = new int[j + 3*(pLen-i) + 2];
1622        System.arraycopy(temp, 0, newtemp, 0, j);
1623
1624        boolean inQuote = true;
1625        boolean beginQuote = true;
1626        while (i < pLen) {
1627            int c = temp[i++];
1628            if (!ASCII.isAscii(c) || ASCII.isAlpha(c)) {
1629                newtemp[j++] = c;
1630            } else if (ASCII.isDigit(c)) {
1631                if (beginQuote) {
1632                    /*
1633                     * A unicode escape \[0xu] could be before this quote,
1634                     * and we don't want this numeric char to processed as
1635                     * part of the escape.
1636                     */
1637                    newtemp[j++] = '\\';
1638                    newtemp[j++] = 'x';
1639                    newtemp[j++] = '3';
1640                }
1641                newtemp[j++] = c;
1642            } else if (c != '\\') {
1643                if (inQuote) newtemp[j++] = '\\';
1644                newtemp[j++] = c;
1645            } else if (inQuote) {
1646                if (temp[i] == 'E') {
1647                    i++;
1648                    inQuote = false;
1649                } else {
1650                    newtemp[j++] = '\\';
1651                    newtemp[j++] = '\\';
1652                }
1653            } else {
1654                if (temp[i] == 'Q') {
1655                    i++;
1656                    inQuote = true;
1657                    beginQuote = true;
1658                    continue;
1659                } else {
1660                    newtemp[j++] = c;
1661                    if (i != pLen)
1662                        newtemp[j++] = temp[i++];
1663                }
1664            }
1665
1666            beginQuote = false;
1667        }
1668
1669        patternLength = j;
1670        temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
1671    }
1672
1673    /**
1674     * Copies regular expression to an int array and invokes the parsing
1675     * of the expression which will create the object tree.
1676     */
1677    private void compile() {
1678        // Handle canonical equivalences
1679        if (has(CANON_EQ) && !has(LITERAL)) {
1680            normalize();
1681        } else {
1682            normalizedPattern = pattern;
1683        }
1684        patternLength = normalizedPattern.length();
1685
1686        // Copy pattern to int array for convenience
1687        // Use double zero to terminate pattern
1688        temp = new int[patternLength + 2];
1689
1690        hasSupplementary = false;
1691        int c, count = 0;
1692        // Convert all chars into code points
1693        for (int x = 0; x < patternLength; x += Character.charCount(c)) {
1694            c = normalizedPattern.codePointAt(x);
1695            if (isSupplementary(c)) {
1696                hasSupplementary = true;
1697            }
1698            temp[count++] = c;
1699        }
1700
1701        patternLength = count;   // patternLength now in code points
1702
1703        if (! has(LITERAL))
1704            RemoveQEQuoting();
1705
1706        // Allocate all temporary objects here.
1707        buffer = new int[32];
1708        groupNodes = new GroupHead[10];
1709        namedGroups = null;
1710
1711        if (has(LITERAL)) {
1712            // Literal pattern handling
1713            matchRoot = newSlice(temp, patternLength, hasSupplementary);
1714            matchRoot.next = lastAccept;
1715        } else {
1716            // Start recursive descent parsing
1717            matchRoot = expr(lastAccept);
1718            // Check extra pattern characters
1719            if (patternLength != cursor) {
1720                if (peek() == ')') {
1721                    throw error("Unmatched closing ')'");
1722                } else {
1723                    throw error("Unexpected internal error");
1724                }
1725            }
1726        }
1727
1728        // Peephole optimization
1729        if (matchRoot instanceof Slice) {
1730            root = BnM.optimize(matchRoot);
1731            if (root == matchRoot) {
1732                root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1733            }
1734        } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
1735            root = matchRoot;
1736        } else {
1737            root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
1738        }
1739
1740        // Release temporary storage
1741        temp = null;
1742        buffer = null;
1743        groupNodes = null;
1744        patternLength = 0;
1745        compiled = true;
1746    }
1747
1748    Map<String, Integer> namedGroups() {
1749        Map<String, Integer> groups = namedGroups;
1750        if (groups == null) {
1751            namedGroups = groups = new HashMap<>(2);
1752        }
1753        return groups;
1754    }
1755
1756    /**
1757     * Used to print out a subtree of the Pattern to help with debugging.
1758     */
1759    private static void printObjectTree(Node node) {
1760        while(node != null) {
1761            if (node instanceof Prolog) {
1762                System.out.println(node);
1763                printObjectTree(((Prolog)node).loop);
1764                System.out.println("**** end contents prolog loop");
1765            } else if (node instanceof Loop) {
1766                System.out.println(node);
1767                printObjectTree(((Loop)node).body);
1768                System.out.println("**** end contents Loop body");
1769            } else if (node instanceof Curly) {
1770                System.out.println(node);
1771                printObjectTree(((Curly)node).atom);
1772                System.out.println("**** end contents Curly body");
1773            } else if (node instanceof GroupCurly) {
1774                System.out.println(node);
1775                printObjectTree(((GroupCurly)node).atom);
1776                System.out.println("**** end contents GroupCurly body");
1777            } else if (node instanceof GroupTail) {
1778                System.out.println(node);
1779                System.out.println("Tail next is "+node.next);
1780                return;
1781            } else {
1782                System.out.println(node);
1783            }
1784            node = node.next;
1785            if (node != null)
1786                System.out.println("->next:");
1787            if (node == Pattern.accept) {
1788                System.out.println("Accept Node");
1789                node = null;
1790            }
1791       }
1792    }
1793
1794    /**
1795     * Used to accumulate information about a subtree of the object graph
1796     * so that optimizations can be applied to the subtree.
1797     */
1798    static final class TreeInfo {
1799        int minLength;
1800        int maxLength;
1801        boolean maxValid;
1802        boolean deterministic;
1803
1804        TreeInfo() {
1805            reset();
1806        }
1807        void reset() {
1808            minLength = 0;
1809            maxLength = 0;
1810            maxValid = true;
1811            deterministic = true;
1812        }
1813    }
1814
1815    /*
1816     * The following private methods are mainly used to improve the
1817     * readability of the code. In order to let the Java compiler easily
1818     * inline them, we should not put many assertions or error checks in them.
1819     */
1820
1821    /**
1822     * Indicates whether a particular flag is set or not.
1823     */
1824    private boolean has(int f) {
1825        return (flags & f) != 0;
1826    }
1827
1828    /**
1829     * Match next character, signal error if failed.
1830     */
1831    private void accept(int ch, String s) {
1832        int testChar = temp[cursor++];
1833        if (has(COMMENTS))
1834            testChar = parsePastWhitespace(testChar);
1835        if (ch != testChar) {
1836            throw error(s);
1837        }
1838    }
1839
1840    /**
1841     * Mark the end of pattern with a specific character.
1842     */
1843    private void mark(int c) {
1844        temp[patternLength] = c;
1845    }
1846
1847    /**
1848     * Peek the next character, and do not advance the cursor.
1849     */
1850    private int peek() {
1851        int ch = temp[cursor];
1852        if (has(COMMENTS))
1853            ch = peekPastWhitespace(ch);
1854        return ch;
1855    }
1856
1857    /**
1858     * Read the next character, and advance the cursor by one.
1859     */
1860    private int read() {
1861        int ch = temp[cursor++];
1862        if (has(COMMENTS))
1863            ch = parsePastWhitespace(ch);
1864        return ch;
1865    }
1866
1867    /**
1868     * Read the next character, and advance the cursor by one,
1869     * ignoring the COMMENTS setting
1870     */
1871    private int readEscaped() {
1872        int ch = temp[cursor++];
1873        return ch;
1874    }
1875
1876    /**
1877     * Advance the cursor by one, and peek the next character.
1878     */
1879    private int next() {
1880        int ch = temp[++cursor];
1881        if (has(COMMENTS))
1882            ch = peekPastWhitespace(ch);
1883        return ch;
1884    }
1885
1886    /**
1887     * Advance the cursor by one, and peek the next character,
1888     * ignoring the COMMENTS setting
1889     */
1890    private int nextEscaped() {
1891        int ch = temp[++cursor];
1892        return ch;
1893    }
1894
1895    /**
1896     * If in xmode peek past whitespace and comments.
1897     */
1898    private int peekPastWhitespace(int ch) {
1899        while (ASCII.isSpace(ch) || ch == '#') {
1900            while (ASCII.isSpace(ch))
1901                ch = temp[++cursor];
1902            if (ch == '#') {
1903                ch = peekPastLine();
1904            }
1905        }
1906        return ch;
1907    }
1908
1909    /**
1910     * If in xmode parse past whitespace and comments.
1911     */
1912    private int parsePastWhitespace(int ch) {
1913        while (ASCII.isSpace(ch) || ch == '#') {
1914            while (ASCII.isSpace(ch))
1915                ch = temp[cursor++];
1916            if (ch == '#')
1917                ch = parsePastLine();
1918        }
1919        return ch;
1920    }
1921
1922    /**
1923     * xmode parse past comment to end of line.
1924     */
1925    private int parsePastLine() {
1926        int ch = temp[cursor++];
1927        while (ch != 0 && !isLineSeparator(ch))
1928            ch = temp[cursor++];
1929        return ch;
1930    }
1931
1932    /**
1933     * xmode peek past comment to end of line.
1934     */
1935    private int peekPastLine() {
1936        int ch = temp[++cursor];
1937        while (ch != 0 && !isLineSeparator(ch))
1938            ch = temp[++cursor];
1939        return ch;
1940    }
1941
1942    /**
1943     * Determines if character is a line separator in the current mode
1944     */
1945    private boolean isLineSeparator(int ch) {
1946        if (has(UNIX_LINES)) {
1947            return ch == '\n';
1948        } else {
1949            return (ch == '\n' ||
1950                    ch == '\r' ||
1951                    (ch|1) == '\u2029' ||
1952                    ch == '\u0085');
1953        }
1954    }
1955
1956    /**
1957     * Read the character after the next one, and advance the cursor by two.
1958     */
1959    private int skip() {
1960        int i = cursor;
1961        int ch = temp[i+1];
1962        cursor = i + 2;
1963        return ch;
1964    }
1965
1966    /**
1967     * Unread one next character, and retreat cursor by one.
1968     */
1969    private void unread() {
1970        cursor--;
1971    }
1972
1973    /**
1974     * Internal method used for handling all syntax errors. The pattern is
1975     * displayed with a pointer to aid in locating the syntax error.
1976     */
1977    private PatternSyntaxException error(String s) {
1978        return new PatternSyntaxException(s, normalizedPattern,  cursor - 1);
1979    }
1980
1981    /**
1982     * Determines if there is any supplementary character or unpaired
1983     * surrogate in the specified range.
1984     */
1985    private boolean findSupplementary(int start, int end) {
1986        for (int i = start; i < end; i++) {
1987            if (isSupplementary(temp[i]))
1988                return true;
1989        }
1990        return false;
1991    }
1992
1993    /**
1994     * Determines if the specified code point is a supplementary
1995     * character or unpaired surrogate.
1996     */
1997    private static final boolean isSupplementary(int ch) {
1998        return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
1999               Character.isSurrogate((char)ch);
2000    }
2001
2002    /**
2003     *  The following methods handle the main parsing. They are sorted
2004     *  according to their precedence order, the lowest one first.
2005     */
2006
2007    /**
2008     * The expression is parsed with branch nodes added for alternations.
2009     * This may be called recursively to parse sub expressions that may
2010     * contain alternations.
2011     */
2012    private Node expr(Node end) {
2013        Node prev = null;
2014        Node firstTail = null;
2015        Branch branch = null;
2016        Node branchConn = null;
2017
2018        for (;;) {
2019            Node node = sequence(end);
2020            Node nodeTail = root;      //double return
2021            if (prev == null) {
2022                prev = node;
2023                firstTail = nodeTail;
2024            } else {
2025                // Branch
2026                if (branchConn == null) {
2027                    branchConn = new BranchConn();
2028                    branchConn.next = end;
2029                }
2030                if (node == end) {
2031                    // if the node returned from sequence() is "end"
2032                    // we have an empty expr, set a null atom into
2033                    // the branch to indicate to go "next" directly.
2034                    node = null;
2035                } else {
2036                    // the "tail.next" of each atom goes to branchConn
2037                    nodeTail.next = branchConn;
2038                }
2039                if (prev == branch) {
2040                    branch.add(node);
2041                } else {
2042                    if (prev == end) {
2043                        prev = null;
2044                    } else {
2045                        // replace the "end" with "branchConn" at its tail.next
2046                        // when put the "prev" into the branch as the first atom.
2047                        firstTail.next = branchConn;
2048                    }
2049                    prev = branch = new Branch(prev, node, branchConn);
2050                }
2051            }
2052            if (peek() != '|') {
2053                return prev;
2054            }
2055            next();
2056        }
2057    }
2058
2059    @SuppressWarnings("fallthrough")
2060    /**
2061     * Parsing of sequences between alternations.
2062     */
2063    private Node sequence(Node end) {
2064        Node head = null;
2065        Node tail = null;
2066        Node node = null;
2067    LOOP:
2068        for (;;) {
2069            int ch = peek();
2070            switch (ch) {
2071            case '(':
2072                // Because group handles its own closure,
2073                // we need to treat it differently
2074                node = group0();
2075                // Check for comment or flag group
2076                if (node == null)
2077                    continue;
2078                if (head == null)
2079                    head = node;
2080                else
2081                    tail.next = node;
2082                // Double return: Tail was returned in root
2083                tail = root;
2084                continue;
2085            case '[':
2086                node = clazz(true);
2087                break;
2088            case '\\':
2089                ch = nextEscaped();
2090                if (ch == 'p' || ch == 'P') {
2091                    boolean oneLetter = true;
2092                    boolean comp = (ch == 'P');
2093                    ch = next(); // Consume { if present
2094                    if (ch != '{') {
2095                        unread();
2096                    } else {
2097                        oneLetter = false;
2098                    }
2099                    node = family(oneLetter, comp);
2100                } else {
2101                    unread();
2102                    node = atom();
2103                }
2104                break;
2105            case '^':
2106                next();
2107                if (has(MULTILINE)) {
2108                    if (has(UNIX_LINES))
2109                        node = new UnixCaret();
2110                    else
2111                        node = new Caret();
2112                } else {
2113                    node = new Begin();
2114                }
2115                break;
2116            case '$':
2117                next();
2118                if (has(UNIX_LINES))
2119                    node = new UnixDollar(has(MULTILINE));
2120                else
2121                    node = new Dollar(has(MULTILINE));
2122                break;
2123            case '.':
2124                next();
2125                if (has(DOTALL)) {
2126                    node = new All();
2127                } else {
2128                    if (has(UNIX_LINES))
2129                        node = new UnixDot();
2130                    else {
2131                        node = new Dot();
2132                    }
2133                }
2134                break;
2135            case '|':
2136            case ')':
2137                break LOOP;
2138            case ']': // Now interpreting dangling ] and } as literals
2139            case '}':
2140                node = atom();
2141                break;
2142            case '?':
2143            case '*':
2144            case '+':
2145                next();
2146                throw error("Dangling meta character '" + ((char)ch) + "'");
2147            case 0:
2148                if (cursor >= patternLength) {
2149                    break LOOP;
2150                }
2151                // Fall through
2152            default:
2153                node = atom();
2154                break;
2155            }
2156
2157            node = closure(node);
2158
2159            if (head == null) {
2160                head = tail = node;
2161            } else {
2162                tail.next = node;
2163                tail = node;
2164            }
2165        }
2166        if (head == null) {
2167            return end;
2168        }
2169        tail.next = end;
2170        root = tail;      //double return
2171        return head;
2172    }
2173
2174    @SuppressWarnings("fallthrough")
2175    /**
2176     * Parse and add a new Single or Slice.
2177     */
2178    private Node atom() {
2179        int first = 0;
2180        int prev = -1;
2181        boolean hasSupplementary = false;
2182        int ch = peek();
2183        for (;;) {
2184            switch (ch) {
2185            case '*':
2186            case '+':
2187            case '?':
2188            case '{':
2189                if (first > 1) {
2190                    cursor = prev;    // Unwind one character
2191                    first--;
2192                }
2193                break;
2194            case '$':
2195            case '.':
2196            case '^':
2197            case '(':
2198            case '[':
2199            case '|':
2200            case ')':
2201                break;
2202            case '\\':
2203                ch = nextEscaped();
2204                if (ch == 'p' || ch == 'P') { // Property
2205                    if (first > 0) { // Slice is waiting; handle it first
2206                        unread();
2207                        break;
2208                    } else { // No slice; just return the family node
2209                        boolean comp = (ch == 'P');
2210                        boolean oneLetter = true;
2211                        ch = next(); // Consume { if present
2212                        if (ch != '{')
2213                            unread();
2214                        else
2215                            oneLetter = false;
2216                        return family(oneLetter, comp);
2217                    }
2218                }
2219                unread();
2220                prev = cursor;
2221                ch = escape(false, first == 0, false);
2222                if (ch >= 0) {
2223                    append(ch, first);
2224                    first++;
2225                    if (isSupplementary(ch)) {
2226                        hasSupplementary = true;
2227                    }
2228                    ch = peek();
2229                    continue;
2230                } else if (first == 0) {
2231                    return root;
2232                }
2233                // Unwind meta escape sequence
2234                cursor = prev;
2235                break;
2236            case 0:
2237                if (cursor >= patternLength) {
2238                    break;
2239                }
2240                // Fall through
2241            default:
2242                prev = cursor;
2243                append(ch, first);
2244                first++;
2245                if (isSupplementary(ch)) {
2246                    hasSupplementary = true;
2247                }
2248                ch = next();
2249                continue;
2250            }
2251            break;
2252        }
2253        if (first == 1) {
2254            return newSingle(buffer[0]);
2255        } else {
2256            return newSlice(buffer, first, hasSupplementary);
2257        }
2258    }
2259
2260    private void append(int ch, int len) {
2261        if (len >= buffer.length) {
2262            int[] tmp = new int[len+len];
2263            System.arraycopy(buffer, 0, tmp, 0, len);
2264            buffer = tmp;
2265        }
2266        buffer[len] = ch;
2267    }
2268
2269    /**
2270     * Parses a backref greedily, taking as many numbers as it
2271     * can. The first digit is always treated as a backref, but
2272     * multi digit numbers are only treated as a backref if at
2273     * least that many backrefs exist at this point in the regex.
2274     */
2275    private Node ref(int refNum) {
2276        boolean done = false;
2277        while(!done) {
2278            int ch = peek();
2279            switch(ch) {
2280            case '0':
2281            case '1':
2282            case '2':
2283            case '3':
2284            case '4':
2285            case '5':
2286            case '6':
2287            case '7':
2288            case '8':
2289            case '9':
2290                int newRefNum = (refNum * 10) + (ch - '0');
2291                // Add another number if it doesn't make a group
2292                // that doesn't exist
2293                if (capturingGroupCount - 1 < newRefNum) {
2294                    done = true;
2295                    break;
2296                }
2297                refNum = newRefNum;
2298                read();
2299                break;
2300            default:
2301                done = true;
2302                break;
2303            }
2304        }
2305        if (has(CASE_INSENSITIVE))
2306            return new CIBackRef(refNum, has(UNICODE_CASE));
2307        else
2308            return new BackRef(refNum);
2309    }
2310
2311    /**
2312     * Parses an escape sequence to determine the actual value that needs
2313     * to be matched.
2314     * If -1 is returned and create was true a new object was added to the tree
2315     * to handle the escape sequence.
2316     * If the returned value is greater than zero, it is the value that
2317     * matches the escape sequence.
2318     */
2319    private int escape(boolean inclass, boolean create, boolean isrange) {
2320        int ch = skip();
2321        switch (ch) {
2322        case '0':
2323            return o();
2324        case '1':
2325        case '2':
2326        case '3':
2327        case '4':
2328        case '5':
2329        case '6':
2330        case '7':
2331        case '8':
2332        case '9':
2333            if (inclass) break;
2334            if (create) {
2335                root = ref((ch - '0'));
2336            }
2337            return -1;
2338        case 'A':
2339            if (inclass) break;
2340            if (create) root = new Begin();
2341            return -1;
2342        case 'B':
2343            if (inclass) break;
2344            if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
2345            return -1;
2346        case 'C':
2347            break;
2348        case 'D':
2349            if (create) root = has(UNICODE_CHARACTER_CLASS)
2350                               ? new Utype(UnicodeProp.DIGIT).complement()
2351                               : new Ctype(ASCII.DIGIT).complement();
2352            return -1;
2353        case 'E':
2354        case 'F':
2355            break;
2356        case 'G':
2357            if (inclass) break;
2358            if (create) root = new LastMatch();
2359            return -1;
2360        case 'H':
2361            if (create) root = new HorizWS().complement();
2362            return -1;
2363        case 'I':
2364        case 'J':
2365        case 'K':
2366        case 'L':
2367        case 'M':
2368            break;
2369        case 'N':
2370            return N();
2371        case 'O':
2372        case 'P':
2373        case 'Q':
2374            break;
2375        case 'R':
2376            if (inclass) break;
2377            if (create) root = new LineEnding();
2378            return -1;
2379        case 'S':
2380            if (create) root = has(UNICODE_CHARACTER_CLASS)
2381                               ? new Utype(UnicodeProp.WHITE_SPACE).complement()
2382                               : new Ctype(ASCII.SPACE).complement();
2383            return -1;
2384        case 'T':
2385        case 'U':
2386            break;
2387        case 'V':
2388            if (create) root = new VertWS().complement();
2389            return -1;
2390        case 'W':
2391            if (create) root = has(UNICODE_CHARACTER_CLASS)
2392                               ? new Utype(UnicodeProp.WORD).complement()
2393                               : new Ctype(ASCII.WORD).complement();
2394            return -1;
2395        case 'X':
2396            if (inclass) break;
2397            if (create) {
2398                root = new XGrapheme();
2399            }
2400            return -1;
2401        case 'Y':
2402            break;
2403        case 'Z':
2404            if (inclass) break;
2405            if (create) {
2406                if (has(UNIX_LINES))
2407                    root = new UnixDollar(false);
2408                else
2409                    root = new Dollar(false);
2410            }
2411            return -1;
2412        case 'a':
2413            return '\007';
2414        case 'b':
2415            if (inclass) break;
2416            if (create) {
2417                if (peek() == '{') {
2418                    if (skip() == 'g') {
2419                        if (read() == '}') {
2420                            root = new GraphemeBound();
2421                            return -1;
2422                        }
2423                        break;  // error missing trailing }
2424                    }
2425                    unread(); unread();
2426                }
2427                root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
2428            }
2429            return -1;
2430        case 'c':
2431            return c();
2432        case 'd':
2433            if (create) root = has(UNICODE_CHARACTER_CLASS)
2434                               ? new Utype(UnicodeProp.DIGIT)
2435                               : new Ctype(ASCII.DIGIT);
2436            return -1;
2437        case 'e':
2438            return '\033';
2439        case 'f':
2440            return '\f';
2441        case 'g':
2442            break;
2443        case 'h':
2444            if (create) root = new HorizWS();
2445            return -1;
2446        case 'i':
2447        case 'j':
2448            break;
2449        case 'k':
2450            if (inclass)
2451                break;
2452            if (read() != '<')
2453                throw error("\\k is not followed by '<' for named capturing group");
2454            String name = groupname(read());
2455            if (!namedGroups().containsKey(name))
2456                throw error("(named capturing group <"+ name+"> does not exit");
2457            if (create) {
2458                if (has(CASE_INSENSITIVE))
2459                    root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
2460                else
2461                    root = new BackRef(namedGroups().get(name));
2462            }
2463            return -1;
2464        case 'l':
2465        case 'm':
2466            break;
2467        case 'n':
2468            return '\n';
2469        case 'o':
2470        case 'p':
2471        case 'q':
2472            break;
2473        case 'r':
2474            return '\r';
2475        case 's':
2476            if (create) root = has(UNICODE_CHARACTER_CLASS)
2477                               ? new Utype(UnicodeProp.WHITE_SPACE)
2478                               : new Ctype(ASCII.SPACE);
2479            return -1;
2480        case 't':
2481            return '\t';
2482        case 'u':
2483            return u();
2484        case 'v':
2485            // '\v' was implemented as VT/0x0B in releases < 1.8 (though
2486            // undocumented). In JDK8 '\v' is specified as a predefined
2487            // character class for all vertical whitespace characters.
2488            // So [-1, root=VertWS node] pair is returned (instead of a
2489            // single 0x0B). This breaks the range if '\v' is used as
2490            // the start or end value, such as [\v-...] or [...-\v], in
2491            // which a single definite value (0x0B) is expected. For
2492            // compatibility concern '\013'/0x0B is returned if isrange.
2493            if (isrange)
2494                return '\013';
2495            if (create) root = new VertWS();
2496            return -1;
2497        case 'w':
2498            if (create) root = has(UNICODE_CHARACTER_CLASS)
2499                               ? new Utype(UnicodeProp.WORD)
2500                               : new Ctype(ASCII.WORD);
2501            return -1;
2502        case 'x':
2503            return x();
2504        case 'y':
2505            break;
2506        case 'z':
2507            if (inclass) break;
2508            if (create) root = new End();
2509            return -1;
2510        default:
2511            return ch;
2512        }
2513        throw error("Illegal/unsupported escape sequence");
2514    }
2515
2516    /**
2517     * Parse a character class, and return the node that matches it.
2518     *
2519     * Consumes a ] on the way out if consume is true. Usually consume
2520     * is true except for the case of [abc&&def] where def is a separate
2521     * right hand node with "understood" brackets.
2522     */
2523    private CharProperty clazz(boolean consume) {
2524        CharProperty prev = null;
2525        CharProperty node = null;
2526        BitClass bits = new BitClass();
2527        boolean include = true;
2528        boolean firstInClass = true;
2529        int ch = next();
2530        for (;;) {
2531            switch (ch) {
2532                case '^':
2533                    // Negates if first char in a class, otherwise literal
2534                    if (firstInClass) {
2535                        if (temp[cursor-1] != '[')
2536                            break;
2537                        ch = next();
2538                        include = !include;
2539                        continue;
2540                    } else {
2541                        // ^ not first in class, treat as literal
2542                        break;
2543                    }
2544                case '[':
2545                    firstInClass = false;
2546                    node = clazz(true);
2547                    if (prev == null)
2548                        prev = node;
2549                    else
2550                        prev = union(prev, node);
2551                    ch = peek();
2552                    continue;
2553                case '&':
2554                    firstInClass = false;
2555                    ch = next();
2556                    if (ch == '&') {
2557                        ch = next();
2558                        CharProperty rightNode = null;
2559                        while (ch != ']' && ch != '&') {
2560                            if (ch == '[') {
2561                                if (rightNode == null)
2562                                    rightNode = clazz(true);
2563                                else
2564                                    rightNode = union(rightNode, clazz(true));
2565                            } else { // abc&&def
2566                                unread();
2567                                rightNode = clazz(false);
2568                            }
2569                            ch = peek();
2570                        }
2571                        if (rightNode != null)
2572                            node = rightNode;
2573                        if (prev == null) {
2574                            if (rightNode == null)
2575                                throw error("Bad class syntax");
2576                            else
2577                                prev = rightNode;
2578                        } else {
2579                            prev = intersection(prev, node);
2580                        }
2581                    } else {
2582                        // treat as a literal &
2583                        unread();
2584                        break;
2585                    }
2586                    continue;
2587                case 0:
2588                    firstInClass = false;
2589                    if (cursor >= patternLength)
2590                        throw error("Unclosed character class");
2591                    break;
2592                case ']':
2593                    firstInClass = false;
2594                    if (prev != null) {
2595                        if (consume)
2596                            next();
2597                        return prev;
2598                    }
2599                    break;
2600                default:
2601                    firstInClass = false;
2602                    break;
2603            }
2604            node = range(bits);
2605            if (include) {
2606                if (prev == null) {
2607                    prev = node;
2608                } else {
2609                    if (prev != node)
2610                        prev = union(prev, node);
2611                }
2612            } else {
2613                if (prev == null) {
2614                    prev = node.complement();
2615                } else {
2616                    if (prev != node)
2617                        prev = setDifference(prev, node);
2618                }
2619            }
2620            ch = peek();
2621        }
2622    }
2623
2624    private CharProperty bitsOrSingle(BitClass bits, int ch) {
2625        /* Bits can only handle codepoints in [u+0000-u+00ff] range.
2626           Use "single" node instead of bits when dealing with unicode
2627           case folding for codepoints listed below.
2628           (1)Uppercase out of range: u+00ff, u+00b5
2629              toUpperCase(u+00ff) -> u+0178
2630              toUpperCase(u+00b5) -> u+039c
2631           (2)LatinSmallLetterLongS u+17f
2632              toUpperCase(u+017f) -> u+0053
2633           (3)LatinSmallLetterDotlessI u+131
2634              toUpperCase(u+0131) -> u+0049
2635           (4)LatinCapitalLetterIWithDotAbove u+0130
2636              toLowerCase(u+0130) -> u+0069
2637           (5)KelvinSign u+212a
2638              toLowerCase(u+212a) ==> u+006B
2639           (6)AngstromSign u+212b
2640              toLowerCase(u+212b) ==> u+00e5
2641        */
2642        int d;
2643        if (ch < 256 &&
2644            !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
2645              (ch == 0xff || ch == 0xb5 ||
2646               ch == 0x49 || ch == 0x69 ||  //I and i
2647               ch == 0x53 || ch == 0x73 ||  //S and s
2648               ch == 0x4b || ch == 0x6b ||  //K and k
2649               ch == 0xc5 || ch == 0xe5)))  //A+ring
2650            return bits.add(ch, flags());
2651        return newSingle(ch);
2652    }
2653
2654    /**
2655     * Parse a single character or a character range in a character class
2656     * and return its representative node.
2657     */
2658    private CharProperty range(BitClass bits) {
2659        int ch = peek();
2660        if (ch == '\\') {
2661            ch = nextEscaped();
2662            if (ch == 'p' || ch == 'P') { // A property
2663                boolean comp = (ch == 'P');
2664                boolean oneLetter = true;
2665                // Consume { if present
2666                ch = next();
2667                if (ch != '{')
2668                    unread();
2669                else
2670                    oneLetter = false;
2671                return family(oneLetter, comp);
2672            } else { // ordinary escape
2673                boolean isrange = temp[cursor+1] == '-';
2674                unread();
2675                ch = escape(true, true, isrange);
2676                if (ch == -1)
2677                    return (CharProperty) root;
2678            }
2679        } else {
2680            next();
2681        }
2682        if (ch >= 0) {
2683            if (peek() == '-') {
2684                int endRange = temp[cursor+1];
2685                if (endRange == '[') {
2686                    return bitsOrSingle(bits, ch);
2687                }
2688                if (endRange != ']') {
2689                    next();
2690                    int m = peek();
2691                    if (m == '\\') {
2692                        m = escape(true, false, true);
2693                    } else {
2694                        next();
2695                    }
2696                    if (m < ch) {
2697                        throw error("Illegal character range");
2698                    }
2699                    if (has(CASE_INSENSITIVE))
2700                        return caseInsensitiveRangeFor(ch, m);
2701                    else
2702                        return rangeFor(ch, m);
2703                }
2704            }
2705            return bitsOrSingle(bits, ch);
2706        }
2707        throw error("Unexpected character '"+((char)ch)+"'");
2708    }
2709
2710    /**
2711     * Parses a Unicode character family and returns its representative node.
2712     */
2713    private CharProperty family(boolean singleLetter,
2714                                boolean maybeComplement)
2715    {
2716        next();
2717        String name;
2718        CharProperty node = null;
2719
2720        if (singleLetter) {
2721            int c = temp[cursor];
2722            if (!Character.isSupplementaryCodePoint(c)) {
2723                name = String.valueOf((char)c);
2724            } else {
2725                name = new String(temp, cursor, 1);
2726            }
2727            read();
2728        } else {
2729            int i = cursor;
2730            mark('}');
2731            while(read() != '}') {
2732            }
2733            mark('\000');
2734            int j = cursor;
2735            if (j > patternLength)
2736                throw error("Unclosed character family");
2737            if (i + 1 >= j)
2738                throw error("Empty character family");
2739            name = new String(temp, i, j-i-1);
2740        }
2741
2742        int i = name.indexOf('=');
2743        if (i != -1) {
2744            // property construct \p{name=value}
2745            String value = name.substring(i + 1);
2746            name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
2747            switch (name) {
2748                case "sc":
2749                case "script":
2750                    node = unicodeScriptPropertyFor(value);
2751                    break;
2752                case "blk":
2753                case "block":
2754                    node = unicodeBlockPropertyFor(value);
2755                    break;
2756                case "gc":
2757                case "general_category":
2758                    node = charPropertyNodeFor(value);
2759                    break;
2760                default:
2761                    throw error("Unknown Unicode property {name=<" + name + ">, "
2762                                + "value=<" + value + ">}");
2763            }
2764        } else {
2765            if (name.startsWith("In")) {
2766                // \p{inBlockName}
2767                node = unicodeBlockPropertyFor(name.substring(2));
2768            } else if (name.startsWith("Is")) {
2769                // \p{isGeneralCategory} and \p{isScriptName}
2770                name = name.substring(2);
2771                UnicodeProp uprop = UnicodeProp.forName(name);
2772                if (uprop != null)
2773                    node = new Utype(uprop);
2774                if (node == null)
2775                    node = CharPropertyNames.charPropertyFor(name);
2776                if (node == null)
2777                    node = unicodeScriptPropertyFor(name);
2778            } else {
2779                if (has(UNICODE_CHARACTER_CLASS)) {
2780                    UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
2781                    if (uprop != null)
2782                        node = new Utype(uprop);
2783                }
2784                if (node == null)
2785                    node = charPropertyNodeFor(name);
2786            }
2787        }
2788        if (maybeComplement) {
2789            if (node instanceof Category || node instanceof Block)
2790                hasSupplementary = true;
2791            node = node.complement();
2792        }
2793        return node;
2794    }
2795
2796
2797    /**
2798     * Returns a CharProperty matching all characters belong to
2799     * a UnicodeScript.
2800     */
2801    private CharProperty unicodeScriptPropertyFor(String name) {
2802        final Character.UnicodeScript script;
2803        try {
2804            script = Character.UnicodeScript.forName(name);
2805        } catch (IllegalArgumentException iae) {
2806            throw error("Unknown character script name {" + name + "}");
2807        }
2808        return new Script(script);
2809    }
2810
2811    /**
2812     * Returns a CharProperty matching all characters in a UnicodeBlock.
2813     */
2814    private CharProperty unicodeBlockPropertyFor(String name) {
2815        final Character.UnicodeBlock block;
2816        try {
2817            block = Character.UnicodeBlock.forName(name);
2818        } catch (IllegalArgumentException iae) {
2819            throw error("Unknown character block name {" + name + "}");
2820        }
2821        return new Block(block);
2822    }
2823
2824    /**
2825     * Returns a CharProperty matching all characters in a named property.
2826     */
2827    private CharProperty charPropertyNodeFor(String name) {
2828        CharProperty p = CharPropertyNames.charPropertyFor(name);
2829        if (p == null)
2830            throw error("Unknown character property name {" + name + "}");
2831        return p;
2832    }
2833
2834    /**
2835     * Parses and returns the name of a "named capturing group", the trailing
2836     * ">" is consumed after parsing.
2837     */
2838    private String groupname(int ch) {
2839        StringBuilder sb = new StringBuilder();
2840        sb.append(Character.toChars(ch));
2841        while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
2842               ASCII.isDigit(ch)) {
2843            sb.append(Character.toChars(ch));
2844        }
2845        if (sb.length() == 0)
2846            throw error("named capturing group has 0 length name");
2847        if (ch != '>')
2848            throw error("named capturing group is missing trailing '>'");
2849        return sb.toString();
2850    }
2851
2852    /**
2853     * Parses a group and returns the head node of a set of nodes that process
2854     * the group. Sometimes a double return system is used where the tail is
2855     * returned in root.
2856     */
2857    private Node group0() {
2858        boolean capturingGroup = false;
2859        Node head = null;
2860        Node tail = null;
2861        int save = flags;
2862        root = null;
2863        int ch = next();
2864        if (ch == '?') {
2865            ch = skip();
2866            switch (ch) {
2867            case ':':   //  (?:xxx) pure group
2868                head = createGroup(true);
2869                tail = root;
2870                head.next = expr(tail);
2871                break;
2872            case '=':   // (?=xxx) and (?!xxx) lookahead
2873            case '!':
2874                head = createGroup(true);
2875                tail = root;
2876                head.next = expr(tail);
2877                if (ch == '=') {
2878                    head = tail = new Pos(head);
2879                } else {
2880                    head = tail = new Neg(head);
2881                }
2882                break;
2883            case '>':   // (?>xxx)  independent group
2884                head = createGroup(true);
2885                tail = root;
2886                head.next = expr(tail);
2887                head = tail = new Ques(head, INDEPENDENT);
2888                break;
2889            case '<':   // (?<xxx)  look behind
2890                ch = read();
2891                if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
2892                    // named captured group
2893                    String name = groupname(ch);
2894                    if (namedGroups().containsKey(name))
2895                        throw error("Named capturing group <" + name
2896                                    + "> is already defined");
2897                    capturingGroup = true;
2898                    head = createGroup(false);
2899                    tail = root;
2900                    namedGroups().put(name, capturingGroupCount-1);
2901                    head.next = expr(tail);
2902                    break;
2903                }
2904                int start = cursor;
2905                head = createGroup(true);
2906                tail = root;
2907                head.next = expr(tail);
2908                tail.next = lookbehindEnd;
2909                TreeInfo info = new TreeInfo();
2910                head.study(info);
2911                if (info.maxValid == false) {
2912                    throw error("Look-behind group does not have "
2913                                + "an obvious maximum length");
2914                }
2915                boolean hasSupplementary = findSupplementary(start, patternLength);
2916                if (ch == '=') {
2917                    head = tail = (hasSupplementary ?
2918                                   new BehindS(head, info.maxLength,
2919                                               info.minLength) :
2920                                   new Behind(head, info.maxLength,
2921                                              info.minLength));
2922                } else if (ch == '!') {
2923                    head = tail = (hasSupplementary ?
2924                                   new NotBehindS(head, info.maxLength,
2925                                                  info.minLength) :
2926                                   new NotBehind(head, info.maxLength,
2927                                                 info.minLength));
2928                } else {
2929                    throw error("Unknown look-behind group");
2930                }
2931                break;
2932            case '$':
2933            case '@':
2934                throw error("Unknown group type");
2935            default:    // (?xxx:) inlined match flags
2936                unread();
2937                addFlag();
2938                ch = read();
2939                if (ch == ')') {
2940                    return null;    // Inline modifier only
2941                }
2942                if (ch != ':') {
2943                    throw error("Unknown inline modifier");
2944                }
2945                head = createGroup(true);
2946                tail = root;
2947                head.next = expr(tail);
2948                break;
2949            }
2950        } else { // (xxx) a regular group
2951            capturingGroup = true;
2952            head = createGroup(false);
2953            tail = root;
2954            head.next = expr(tail);
2955        }
2956
2957        accept(')', "Unclosed group");
2958        flags = save;
2959
2960        // Check for quantifiers
2961        Node node = closure(head);
2962        if (node == head) { // No closure
2963            root = tail;
2964            return node;    // Dual return
2965        }
2966        if (head == tail) { // Zero length assertion
2967            root = node;
2968            return node;    // Dual return
2969        }
2970
2971        if (node instanceof Ques) {
2972            Ques ques = (Ques) node;
2973            if (ques.type == POSSESSIVE) {
2974                root = node;
2975                return node;
2976            }
2977            tail.next = new BranchConn();
2978            tail = tail.next;
2979            if (ques.type == GREEDY) {
2980                head = new Branch(head, null, tail);
2981            } else { // Reluctant quantifier
2982                head = new Branch(null, head, tail);
2983            }
2984            root = tail;
2985            return head;
2986        } else if (node instanceof Curly) {
2987            Curly curly = (Curly) node;
2988            if (curly.type == POSSESSIVE) {
2989                root = node;
2990                return node;
2991            }
2992            // Discover if the group is deterministic
2993            TreeInfo info = new TreeInfo();
2994            if (head.study(info)) { // Deterministic
2995                GroupTail temp = (GroupTail) tail;
2996                head = root = new GroupCurly(head.next, curly.cmin,
2997                                   curly.cmax, curly.type,
2998                                   ((GroupTail)tail).localIndex,
2999                                   ((GroupTail)tail).groupIndex,
3000                                             capturingGroup);
3001                return head;
3002            } else { // Non-deterministic
3003                int temp = ((GroupHead) head).localIndex;
3004                Loop loop;
3005                if (curly.type == GREEDY)
3006                    loop = new Loop(this.localCount, temp);
3007                else  // Reluctant Curly
3008                    loop = new LazyLoop(this.localCount, temp);
3009                Prolog prolog = new Prolog(loop);
3010                this.localCount += 1;
3011                loop.cmin = curly.cmin;
3012                loop.cmax = curly.cmax;
3013                loop.body = head;
3014                tail.next = loop;
3015                root = loop;
3016                return prolog; // Dual return
3017            }
3018        }
3019        throw error("Internal logic error");
3020    }
3021
3022    /**
3023     * Create group head and tail nodes using double return. If the group is
3024     * created with anonymous true then it is a pure group and should not
3025     * affect group counting.
3026     */
3027    private Node createGroup(boolean anonymous) {
3028        int localIndex = localCount++;
3029        int groupIndex = 0;
3030        if (!anonymous)
3031            groupIndex = capturingGroupCount++;
3032        GroupHead head = new GroupHead(localIndex);
3033        root = new GroupTail(localIndex, groupIndex);
3034        if (!anonymous && groupIndex < 10)
3035            groupNodes[groupIndex] = head;
3036        return head;
3037    }
3038
3039    @SuppressWarnings("fallthrough")
3040    /**
3041     * Parses inlined match flags and set them appropriately.
3042     */
3043    private void addFlag() {
3044        int ch = peek();
3045        for (;;) {
3046            switch (ch) {
3047            case 'i':
3048                flags |= CASE_INSENSITIVE;
3049                break;
3050            case 'm':
3051                flags |= MULTILINE;
3052                break;
3053            case 's':
3054                flags |= DOTALL;
3055                break;
3056            case 'd':
3057                flags |= UNIX_LINES;
3058                break;
3059            case 'u':
3060                flags |= UNICODE_CASE;
3061                break;
3062            case 'c':
3063                flags |= CANON_EQ;
3064                break;
3065            case 'x':
3066                flags |= COMMENTS;
3067                break;
3068            case 'U':
3069                flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3070                break;
3071            case '-': // subFlag then fall through
3072                ch = next();
3073                subFlag();
3074            default:
3075                return;
3076            }
3077            ch = next();
3078        }
3079    }
3080
3081    @SuppressWarnings("fallthrough")
3082    /**
3083     * Parses the second part of inlined match flags and turns off
3084     * flags appropriately.
3085     */
3086    private void subFlag() {
3087        int ch = peek();
3088        for (;;) {
3089            switch (ch) {
3090            case 'i':
3091                flags &= ~CASE_INSENSITIVE;
3092                break;
3093            case 'm':
3094                flags &= ~MULTILINE;
3095                break;
3096            case 's':
3097                flags &= ~DOTALL;
3098                break;
3099            case 'd':
3100                flags &= ~UNIX_LINES;
3101                break;
3102            case 'u':
3103                flags &= ~UNICODE_CASE;
3104                break;
3105            case 'c':
3106                flags &= ~CANON_EQ;
3107                break;
3108            case 'x':
3109                flags &= ~COMMENTS;
3110                break;
3111            case 'U':
3112                flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
3113            default:
3114                return;
3115            }
3116            ch = next();
3117        }
3118    }
3119
3120    static final int MAX_REPS   = 0x7FFFFFFF;
3121
3122    static final int GREEDY     = 0;
3123
3124    static final int LAZY       = 1;
3125
3126    static final int POSSESSIVE = 2;
3127
3128    static final int INDEPENDENT = 3;
3129
3130    /**
3131     * Processes repetition. If the next character peeked is a quantifier
3132     * then new nodes must be appended to handle the repetition.
3133     * Prev could be a single or a group, so it could be a chain of nodes.
3134     */
3135    private Node closure(Node prev) {
3136        Node atom;
3137        int ch = peek();
3138        switch (ch) {
3139        case '?':
3140            ch = next();
3141            if (ch == '?') {
3142                next();
3143                return new Ques(prev, LAZY);
3144            } else if (ch == '+') {
3145                next();
3146                return new Ques(prev, POSSESSIVE);
3147            }
3148            return new Ques(prev, GREEDY);
3149        case '*':
3150            ch = next();
3151            if (ch == '?') {
3152                next();
3153                return new Curly(prev, 0, MAX_REPS, LAZY);
3154            } else if (ch == '+') {
3155                next();
3156                return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
3157            }
3158            return new Curly(prev, 0, MAX_REPS, GREEDY);
3159        case '+':
3160            ch = next();
3161            if (ch == '?') {
3162                next();
3163                return new Curly(prev, 1, MAX_REPS, LAZY);
3164            } else if (ch == '+') {
3165                next();
3166                return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
3167            }
3168            return new Curly(prev, 1, MAX_REPS, GREEDY);
3169        case '{':
3170            ch = temp[cursor+1];
3171            if (ASCII.isDigit(ch)) {
3172                skip();
3173                int cmin = 0;
3174                do {
3175                    cmin = cmin * 10 + (ch - '0');
3176                } while (ASCII.isDigit(ch = read()));
3177                int cmax = cmin;
3178                if (ch == ',') {
3179                    ch = read();
3180                    cmax = MAX_REPS;
3181                    if (ch != '}') {
3182                        cmax = 0;
3183                        while (ASCII.isDigit(ch)) {
3184                            cmax = cmax * 10 + (ch - '0');
3185                            ch = read();
3186                        }
3187                    }
3188                }
3189                if (ch != '}')
3190                    throw error("Unclosed counted closure");
3191                if (((cmin) | (cmax) | (cmax - cmin)) < 0)
3192                    throw error("Illegal repetition range");
3193                Curly curly;
3194                ch = peek();
3195                if (ch == '?') {
3196                    next();
3197                    curly = new Curly(prev, cmin, cmax, LAZY);
3198                } else if (ch == '+') {
3199                    next();
3200                    curly = new Curly(prev, cmin, cmax, POSSESSIVE);
3201                } else {
3202                    curly = new Curly(prev, cmin, cmax, GREEDY);
3203                }
3204                return curly;
3205            } else {
3206                throw error("Illegal repetition");
3207            }
3208        default:
3209            return prev;
3210        }
3211    }
3212
3213    /**
3214     *  Utility method for parsing control escape sequences.
3215     */
3216    private int c() {
3217        if (cursor < patternLength) {
3218            return read() ^ 64;
3219        }
3220        throw error("Illegal control escape sequence");
3221    }
3222
3223    /**
3224     *  Utility method for parsing octal escape sequences.
3225     */
3226    private int o() {
3227        int n = read();
3228        if (((n-'0')|('7'-n)) >= 0) {
3229            int m = read();
3230            if (((m-'0')|('7'-m)) >= 0) {
3231                int o = read();
3232                if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
3233                    return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
3234                }
3235                unread();
3236                return (n - '0') * 8 + (m - '0');
3237            }
3238            unread();
3239            return (n - '0');
3240        }
3241        throw error("Illegal octal escape sequence");
3242    }
3243
3244    /**
3245     *  Utility method for parsing hexadecimal escape sequences.
3246     */
3247    private int x() {
3248        int n = read();
3249        if (ASCII.isHexDigit(n)) {
3250            int m = read();
3251            if (ASCII.isHexDigit(m)) {
3252                return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
3253            }
3254        } else if (n == '{' && ASCII.isHexDigit(peek())) {
3255            int ch = 0;
3256            while (ASCII.isHexDigit(n = read())) {
3257                ch = (ch << 4) + ASCII.toDigit(n);
3258                if (ch > Character.MAX_CODE_POINT)
3259                    throw error("Hexadecimal codepoint is too big");
3260            }
3261            if (n != '}')
3262                throw error("Unclosed hexadecimal escape sequence");
3263            return ch;
3264        }
3265        throw error("Illegal hexadecimal escape sequence");
3266    }
3267
3268    /**
3269     *  Utility method for parsing unicode escape sequences.
3270     */
3271    private int cursor() {
3272        return cursor;
3273    }
3274
3275    private void setcursor(int pos) {
3276        cursor = pos;
3277    }
3278
3279    private int uxxxx() {
3280        int n = 0;
3281        for (int i = 0; i < 4; i++) {
3282            int ch = read();
3283            if (!ASCII.isHexDigit(ch)) {
3284                throw error("Illegal Unicode escape sequence");
3285            }
3286            n = n * 16 + ASCII.toDigit(ch);
3287        }
3288        return n;
3289    }
3290
3291    private int u() {
3292        int n = uxxxx();
3293        if (Character.isHighSurrogate((char)n)) {
3294            int cur = cursor();
3295            if (read() == '\\' && read() == 'u') {
3296                int n2 = uxxxx();
3297                if (Character.isLowSurrogate((char)n2))
3298                    return Character.toCodePoint((char)n, (char)n2);
3299            }
3300            setcursor(cur);
3301        }
3302        return n;
3303    }
3304
3305    private int N() {
3306        if (read() == '{') {
3307            int i = cursor;
3308            while (cursor < patternLength && read() != '}') {}
3309            if (cursor > patternLength)
3310                throw error("Unclosed character name escape sequence");
3311            String name = new String(temp, i, cursor - i - 1);
3312            try {
3313                return Character.codePointOf(name);
3314            } catch (IllegalArgumentException x) {
3315                throw error("Unknown character name [" + name + "]");
3316            }
3317        }
3318        throw error("Illegal character name escape sequence");
3319    }
3320
3321    //
3322    // Utility methods for code point support
3323    //
3324    private static final int countChars(CharSequence seq, int index,
3325                                        int lengthInCodePoints) {
3326        // optimization
3327        if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
3328            assert (index >= 0 && index < seq.length());
3329            return 1;
3330        }
3331        int length = seq.length();
3332        int x = index;
3333        if (lengthInCodePoints >= 0) {
3334            assert (index >= 0 && index < length);
3335            for (int i = 0; x < length && i < lengthInCodePoints; i++) {
3336                if (Character.isHighSurrogate(seq.charAt(x++))) {
3337                    if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
3338                        x++;
3339                    }
3340                }
3341            }
3342            return x - index;
3343        }
3344
3345        assert (index >= 0 && index <= length);
3346        if (index == 0) {
3347            return 0;
3348        }
3349        int len = -lengthInCodePoints;
3350        for (int i = 0; x > 0 && i < len; i++) {
3351            if (Character.isLowSurrogate(seq.charAt(--x))) {
3352                if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
3353                    x--;
3354                }
3355            }
3356        }
3357        return index - x;
3358    }
3359
3360    private static final int countCodePoints(CharSequence seq) {
3361        int length = seq.length();
3362        int n = 0;
3363        for (int i = 0; i < length; ) {
3364            n++;
3365            if (Character.isHighSurrogate(seq.charAt(i++))) {
3366                if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3367                    i++;
3368                }
3369            }
3370        }
3371        return n;
3372    }
3373
3374    /**
3375     *  Creates a bit vector for matching Latin-1 values. A normal BitClass
3376     *  never matches values above Latin-1, and a complemented BitClass always
3377     *  matches values above Latin-1.
3378     */
3379    private static final class BitClass extends BmpCharProperty {
3380        final boolean[] bits;
3381        BitClass() { bits = new boolean[256]; }
3382        private BitClass(boolean[] bits) { this.bits = bits; }
3383        BitClass add(int c, int flags) {
3384            assert c >= 0 && c <= 255;
3385            if ((flags & CASE_INSENSITIVE) != 0) {
3386                if (ASCII.isAscii(c)) {
3387                    bits[ASCII.toUpper(c)] = true;
3388                    bits[ASCII.toLower(c)] = true;
3389                } else if ((flags & UNICODE_CASE) != 0) {
3390                    bits[Character.toLowerCase(c)] = true;
3391                    bits[Character.toUpperCase(c)] = true;
3392                }
3393            }
3394            bits[c] = true;
3395            return this;
3396        }
3397        boolean isSatisfiedBy(int ch) {
3398            return ch < 256 && bits[ch];
3399        }
3400    }
3401
3402    /**
3403     *  Returns a suitably optimized, single character matcher.
3404     */
3405    private CharProperty newSingle(final int ch) {
3406        if (has(CASE_INSENSITIVE)) {
3407            int lower, upper;
3408            if (has(UNICODE_CASE)) {
3409                upper = Character.toUpperCase(ch);
3410                lower = Character.toLowerCase(upper);
3411                if (upper != lower)
3412                    return new SingleU(lower);
3413            } else if (ASCII.isAscii(ch)) {
3414                lower = ASCII.toLower(ch);
3415                upper = ASCII.toUpper(ch);
3416                if (lower != upper)
3417                    return new SingleI(lower, upper);
3418            }
3419        }
3420        if (isSupplementary(ch))
3421            return new SingleS(ch);    // Match a given Unicode character
3422        return new Single(ch);         // Match a given BMP character
3423    }
3424
3425    /**
3426     *  Utility method for creating a string slice matcher.
3427     */
3428    private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3429        int[] tmp = new int[count];
3430        if (has(CASE_INSENSITIVE)) {
3431            if (has(UNICODE_CASE)) {
3432                for (int i = 0; i < count; i++) {
3433                    tmp[i] = Character.toLowerCase(
3434                                 Character.toUpperCase(buf[i]));
3435                }
3436                return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3437            }
3438            for (int i = 0; i < count; i++) {
3439                tmp[i] = ASCII.toLower(buf[i]);
3440            }
3441            return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3442        }
3443        for (int i = 0; i < count; i++) {
3444            tmp[i] = buf[i];
3445        }
3446        return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
3447    }
3448
3449    /**
3450     * The following classes are the building components of the object
3451     * tree that represents a compiled regular expression. The object tree
3452     * is made of individual elements that handle constructs in the Pattern.
3453     * Each type of object knows how to match its equivalent construct with
3454     * the match() method.
3455     */
3456
3457    /**
3458     * Base class for all node classes. Subclasses should override the match()
3459     * method as appropriate. This class is an accepting node, so its match()
3460     * always returns true.
3461     */
3462    static class Node extends Object {
3463        Node next;
3464        Node() {
3465            next = Pattern.accept;
3466        }
3467        /**
3468         * This method implements the classic accept node.
3469         */
3470        boolean match(Matcher matcher, int i, CharSequence seq) {
3471            matcher.last = i;
3472            matcher.groups[0] = matcher.first;
3473            matcher.groups[1] = matcher.last;
3474            return true;
3475        }
3476        /**
3477         * This method is good for all zero length assertions.
3478         */
3479        boolean study(TreeInfo info) {
3480            if (next != null) {
3481                return next.study(info);
3482            } else {
3483                return info.deterministic;
3484            }
3485        }
3486    }
3487
3488    static class LastNode extends Node {
3489        /**
3490         * This method implements the classic accept node with
3491         * the addition of a check to see if the match occurred
3492         * using all of the input.
3493         */
3494        boolean match(Matcher matcher, int i, CharSequence seq) {
3495            if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
3496                return false;
3497            matcher.last = i;
3498            matcher.groups[0] = matcher.first;
3499            matcher.groups[1] = matcher.last;
3500            return true;
3501        }
3502    }
3503
3504    /**
3505     * Used for REs that can start anywhere within the input string.
3506     * This basically tries to match repeatedly at each spot in the
3507     * input string, moving forward after each try. An anchored search
3508     * or a BnM will bypass this node completely.
3509     */
3510    static class Start extends Node {
3511        int minLength;
3512        Start(Node node) {
3513            this.next = node;
3514            TreeInfo info = new TreeInfo();
3515            next.study(info);
3516            minLength = info.minLength;
3517        }
3518        boolean match(Matcher matcher, int i, CharSequence seq) {
3519            if (i > matcher.to - minLength) {
3520                matcher.hitEnd = true;
3521                return false;
3522            }
3523            int guard = matcher.to - minLength;
3524            for (; i <= guard; i++) {
3525                if (next.match(matcher, i, seq)) {
3526                    matcher.first = i;
3527                    matcher.groups[0] = matcher.first;
3528                    matcher.groups[1] = matcher.last;
3529                    return true;
3530                }
3531            }
3532            matcher.hitEnd = true;
3533            return false;
3534        }
3535        boolean study(TreeInfo info) {
3536            next.study(info);
3537            info.maxValid = false;
3538            info.deterministic = false;
3539            return false;
3540        }
3541    }
3542
3543    /*
3544     * StartS supports supplementary characters, including unpaired surrogates.
3545     */
3546    static final class StartS extends Start {
3547        StartS(Node node) {
3548            super(node);
3549        }
3550        boolean match(Matcher matcher, int i, CharSequence seq) {
3551            if (i > matcher.to - minLength) {
3552                matcher.hitEnd = true;
3553                return false;
3554            }
3555            int guard = matcher.to - minLength;
3556            while (i <= guard) {
3557                //if ((ret = next.match(matcher, i, seq)) || i == guard)
3558                if (next.match(matcher, i, seq)) {
3559                    matcher.first = i;
3560                    matcher.groups[0] = matcher.first;
3561                    matcher.groups[1] = matcher.last;
3562                    return true;
3563                }
3564                if (i == guard)
3565                    break;
3566                // Optimization to move to the next character. This is
3567                // faster than countChars(seq, i, 1).
3568                if (Character.isHighSurrogate(seq.charAt(i++))) {
3569                    if (i < seq.length() &&
3570                        Character.isLowSurrogate(seq.charAt(i))) {
3571                        i++;
3572                    }
3573                }
3574            }
3575            matcher.hitEnd = true;
3576            return false;
3577        }
3578    }
3579
3580    /**
3581     * Node to anchor at the beginning of input. This object implements the
3582     * match for a \A sequence, and the caret anchor will use this if not in
3583     * multiline mode.
3584     */
3585    static final class Begin extends Node {
3586        boolean match(Matcher matcher, int i, CharSequence seq) {
3587            int fromIndex = (matcher.anchoringBounds) ?
3588                matcher.from : 0;
3589            if (i == fromIndex && next.match(matcher, i, seq)) {
3590                matcher.first = i;
3591                matcher.groups[0] = i;
3592                matcher.groups[1] = matcher.last;
3593                return true;
3594            } else {
3595                return false;
3596            }
3597        }
3598    }
3599
3600    /**
3601     * Node to anchor at the end of input. This is the absolute end, so this
3602     * should not match at the last newline before the end as $ will.
3603     */
3604    static final class End extends Node {
3605        boolean match(Matcher matcher, int i, CharSequence seq) {
3606            int endIndex = (matcher.anchoringBounds) ?
3607                matcher.to : matcher.getTextLength();
3608            if (i == endIndex) {
3609                matcher.hitEnd = true;
3610                return next.match(matcher, i, seq);
3611            }
3612            return false;
3613        }
3614    }
3615
3616    /**
3617     * Node to anchor at the beginning of a line. This is essentially the
3618     * object to match for the multiline ^.
3619     */
3620    static final class Caret extends Node {
3621        boolean match(Matcher matcher, int i, CharSequence seq) {
3622            int startIndex = matcher.from;
3623            int endIndex = matcher.to;
3624            if (!matcher.anchoringBounds) {
3625                startIndex = 0;
3626                endIndex = matcher.getTextLength();
3627            }
3628            // Perl does not match ^ at end of input even after newline
3629            if (i == endIndex) {
3630                matcher.hitEnd = true;
3631                return false;
3632            }
3633            if (i > startIndex) {
3634                char ch = seq.charAt(i-1);
3635                if (ch != '\n' && ch != '\r'
3636                    && (ch|1) != '\u2029'
3637                    && ch != '\u0085' ) {
3638                    return false;
3639                }
3640                // Should treat /r/n as one newline
3641                if (ch == '\r' && seq.charAt(i) == '\n')
3642                    return false;
3643            }
3644            return next.match(matcher, i, seq);
3645        }
3646    }
3647
3648    /**
3649     * Node to anchor at the beginning of a line when in unixdot mode.
3650     */
3651    static final class UnixCaret extends Node {
3652        boolean match(Matcher matcher, int i, CharSequence seq) {
3653            int startIndex = matcher.from;
3654            int endIndex = matcher.to;
3655            if (!matcher.anchoringBounds) {
3656                startIndex = 0;
3657                endIndex = matcher.getTextLength();
3658            }
3659            // Perl does not match ^ at end of input even after newline
3660            if (i == endIndex) {
3661                matcher.hitEnd = true;
3662                return false;
3663            }
3664            if (i > startIndex) {
3665                char ch = seq.charAt(i-1);
3666                if (ch != '\n') {
3667                    return false;
3668                }
3669            }
3670            return next.match(matcher, i, seq);
3671        }
3672    }
3673
3674    /**
3675     * Node to match the location where the last match ended.
3676     * This is used for the \G construct.
3677     */
3678    static final class LastMatch extends Node {
3679        boolean match(Matcher matcher, int i, CharSequence seq) {
3680            if (i != matcher.oldLast)
3681                return false;
3682            return next.match(matcher, i, seq);
3683        }
3684    }
3685
3686    /**
3687     * Node to anchor at the end of a line or the end of input based on the
3688     * multiline mode.
3689     *
3690     * When not in multiline mode, the $ can only match at the very end
3691     * of the input, unless the input ends in a line terminator in which
3692     * it matches right before the last line terminator.
3693     *
3694     * Note that \r\n is considered an atomic line terminator.
3695     *
3696     * Like ^ the $ operator matches at a position, it does not match the
3697     * line terminators themselves.
3698     */
3699    static final class Dollar extends Node {
3700        boolean multiline;
3701        Dollar(boolean mul) {
3702            multiline = mul;
3703        }
3704        boolean match(Matcher matcher, int i, CharSequence seq) {
3705            int endIndex = (matcher.anchoringBounds) ?
3706                matcher.to : matcher.getTextLength();
3707            if (!multiline) {
3708                if (i < endIndex - 2)
3709                    return false;
3710                if (i == endIndex - 2) {
3711                    char ch = seq.charAt(i);
3712                    if (ch != '\r')
3713                        return false;
3714                    ch = seq.charAt(i + 1);
3715                    if (ch != '\n')
3716                        return false;
3717                }
3718            }
3719            // Matches before any line terminator; also matches at the
3720            // end of input
3721            // Before line terminator:
3722            // If multiline, we match here no matter what
3723            // If not multiline, fall through so that the end
3724            // is marked as hit; this must be a /r/n or a /n
3725            // at the very end so the end was hit; more input
3726            // could make this not match here
3727            if (i < endIndex) {
3728                char ch = seq.charAt(i);
3729                 if (ch == '\n') {
3730                     // No match between \r\n
3731                     if (i > 0 && seq.charAt(i-1) == '\r')
3732                         return false;
3733                     if (multiline)
3734                         return next.match(matcher, i, seq);
3735                 } else if (ch == '\r' || ch == '\u0085' ||
3736                            (ch|1) == '\u2029') {
3737                     if (multiline)
3738                         return next.match(matcher, i, seq);
3739                 } else { // No line terminator, no match
3740                     return false;
3741                 }
3742            }
3743            // Matched at current end so hit end
3744            matcher.hitEnd = true;
3745            // If a $ matches because of end of input, then more input
3746            // could cause it to fail!
3747            matcher.requireEnd = true;
3748            return next.match(matcher, i, seq);
3749        }
3750        boolean study(TreeInfo info) {
3751            next.study(info);
3752            return info.deterministic;
3753        }
3754    }
3755
3756    /**
3757     * Node to anchor at the end of a line or the end of input based on the
3758     * multiline mode when in unix lines mode.
3759     */
3760    static final class UnixDollar extends Node {
3761        boolean multiline;
3762        UnixDollar(boolean mul) {
3763            multiline = mul;
3764        }
3765        boolean match(Matcher matcher, int i, CharSequence seq) {
3766            int endIndex = (matcher.anchoringBounds) ?
3767                matcher.to : matcher.getTextLength();
3768            if (i < endIndex) {
3769                char ch = seq.charAt(i);
3770                if (ch == '\n') {
3771                    // If not multiline, then only possible to
3772                    // match at very end or one before end
3773                    if (multiline == false && i != endIndex - 1)
3774                        return false;
3775                    // If multiline return next.match without setting
3776                    // matcher.hitEnd
3777                    if (multiline)
3778                        return next.match(matcher, i, seq);
3779                } else {
3780                    return false;
3781                }
3782            }
3783            // Matching because at the end or 1 before the end;
3784            // more input could change this so set hitEnd
3785            matcher.hitEnd = true;
3786            // If a $ matches because of end of input, then more input
3787            // could cause it to fail!
3788            matcher.requireEnd = true;
3789            return next.match(matcher, i, seq);
3790        }
3791        boolean study(TreeInfo info) {
3792            next.study(info);
3793            return info.deterministic;
3794        }
3795    }
3796
3797    /**
3798     * Node class that matches a Unicode line ending '\R'
3799     */
3800    static final class LineEnding extends Node {
3801        boolean match(Matcher matcher, int i, CharSequence seq) {
3802            // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029])
3803            if (i < matcher.to) {
3804                int ch = seq.charAt(i);
3805                if (ch == 0x0A || ch == 0x0B || ch == 0x0C ||
3806                    ch == 0x85 || ch == 0x2028 || ch == 0x2029)
3807                    return next.match(matcher, i + 1, seq);
3808                if (ch == 0x0D) {
3809                    i++;
3810                    if (i < matcher.to && seq.charAt(i) == 0x0A)
3811                        i++;
3812                    return next.match(matcher, i, seq);
3813                }
3814            } else {
3815                matcher.hitEnd = true;
3816            }
3817            return false;
3818        }
3819        boolean study(TreeInfo info) {
3820            info.minLength++;
3821            info.maxLength += 2;
3822            return next.study(info);
3823        }
3824    }
3825
3826    /**
3827     * Abstract node class to match one character satisfying some
3828     * boolean property.
3829     */
3830    private abstract static class CharProperty extends Node {
3831        abstract boolean isSatisfiedBy(int ch);
3832        CharProperty complement() {
3833            return new CharProperty() {
3834                    boolean isSatisfiedBy(int ch) {
3835                        return ! CharProperty.this.isSatisfiedBy(ch);}};
3836        }
3837        boolean match(Matcher matcher, int i, CharSequence seq) {
3838            if (i < matcher.to) {
3839                int ch = Character.codePointAt(seq, i);
3840                return isSatisfiedBy(ch)
3841                    && next.match(matcher, i+Character.charCount(ch), seq);
3842            } else {
3843                matcher.hitEnd = true;
3844                return false;
3845            }
3846        }
3847        boolean study(TreeInfo info) {
3848            info.minLength++;
3849            info.maxLength++;
3850            return next.study(info);
3851        }
3852    }
3853
3854    /**
3855     * Optimized version of CharProperty that works only for
3856     * properties never satisfied by Supplementary characters.
3857     */
3858    private abstract static class BmpCharProperty extends CharProperty {
3859        boolean match(Matcher matcher, int i, CharSequence seq) {
3860            if (i < matcher.to) {
3861                return isSatisfiedBy(seq.charAt(i))
3862                    && next.match(matcher, i+1, seq);
3863            } else {
3864                matcher.hitEnd = true;
3865                return false;
3866            }
3867        }
3868    }
3869
3870    /**
3871     * Node class that matches a Supplementary Unicode character
3872     */
3873    static final class SingleS extends CharProperty {
3874        final int c;
3875        SingleS(int c) { this.c = c; }
3876        boolean isSatisfiedBy(int ch) {
3877            return ch == c;
3878        }
3879    }
3880
3881    /**
3882     * Optimization -- matches a given BMP character
3883     */
3884    static final class Single extends BmpCharProperty {
3885        final int c;
3886        Single(int c) { this.c = c; }
3887        boolean isSatisfiedBy(int ch) {
3888            return ch == c;
3889        }
3890    }
3891
3892    /**
3893     * Case insensitive matches a given BMP character
3894     */
3895    static final class SingleI extends BmpCharProperty {
3896        final int lower;
3897        final int upper;
3898        SingleI(int lower, int upper) {
3899            this.lower = lower;
3900            this.upper = upper;
3901        }
3902        boolean isSatisfiedBy(int ch) {
3903            return ch == lower || ch == upper;
3904        }
3905    }
3906
3907    /**
3908     * Unicode case insensitive matches a given Unicode character
3909     */
3910    static final class SingleU extends CharProperty {
3911        final int lower;
3912        SingleU(int lower) {
3913            this.lower = lower;
3914        }
3915        boolean isSatisfiedBy(int ch) {
3916            return lower == ch ||
3917                lower == Character.toLowerCase(Character.toUpperCase(ch));
3918        }
3919    }
3920
3921    /**
3922     * Node class that matches a Unicode block.
3923     */
3924    static final class Block extends CharProperty {
3925        final Character.UnicodeBlock block;
3926        Block(Character.UnicodeBlock block) {
3927            this.block = block;
3928        }
3929        boolean isSatisfiedBy(int ch) {
3930            return block == Character.UnicodeBlock.of(ch);
3931        }
3932    }
3933
3934    /**
3935     * Node class that matches a Unicode script
3936     */
3937    static final class Script extends CharProperty {
3938        final Character.UnicodeScript script;
3939        Script(Character.UnicodeScript script) {
3940            this.script = script;
3941        }
3942        boolean isSatisfiedBy(int ch) {
3943            return script == Character.UnicodeScript.of(ch);
3944        }
3945    }
3946
3947    /**
3948     * Node class that matches a Unicode category.
3949     */
3950    static final class Category extends CharProperty {
3951        final int typeMask;
3952        Category(int typeMask) { this.typeMask = typeMask; }
3953        boolean isSatisfiedBy(int ch) {
3954            return (typeMask & (1 << Character.getType(ch))) != 0;
3955        }
3956    }
3957
3958    /**
3959     * Node class that matches a Unicode "type"
3960     */
3961    static final class Utype extends CharProperty {
3962        final UnicodeProp uprop;
3963        Utype(UnicodeProp uprop) { this.uprop = uprop; }
3964        boolean isSatisfiedBy(int ch) {
3965            return uprop.is(ch);
3966        }
3967    }
3968
3969    /**
3970     * Node class that matches a POSIX type.
3971     */
3972    static final class Ctype extends BmpCharProperty {
3973        final int ctype;
3974        Ctype(int ctype) { this.ctype = ctype; }
3975        boolean isSatisfiedBy(int ch) {
3976            return ch < 128 && ASCII.isType(ch, ctype);
3977        }
3978    }
3979
3980    /**
3981     * Node class that matches a Perl vertical whitespace
3982     */
3983    static final class VertWS extends BmpCharProperty {
3984        boolean isSatisfiedBy(int cp) {
3985            return (cp >= 0x0A && cp <= 0x0D) ||
3986                   cp == 0x85 || cp == 0x2028 || cp == 0x2029;
3987        }
3988    }
3989
3990    /**
3991     * Node class that matches a Perl horizontal whitespace
3992     */
3993    static final class HorizWS extends BmpCharProperty {
3994        boolean isSatisfiedBy(int cp) {
3995            return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
3996                   cp == 0x1680 || cp == 0x180e ||
3997                   cp >= 0x2000 && cp <= 0x200a ||
3998                   cp == 0x202f || cp == 0x205f || cp == 0x3000;
3999        }
4000    }
4001
4002    /**
4003     * Node class that matches an unicode extended grapheme cluster
4004     */
4005    static class XGrapheme extends Node {
4006        boolean match(Matcher matcher, int i, CharSequence seq) {
4007            if (i < matcher.to) {
4008                int ch0 = Character.codePointAt(seq, i);
4009                    i += Character.charCount(ch0);
4010                while (i < matcher.to) {
4011                    int ch1 = Character.codePointAt(seq, i);
4012                    if (Grapheme.isBoundary(ch0, ch1))
4013                        break;
4014                    ch0 = ch1;
4015                    i += Character.charCount(ch1);
4016                }
4017                return next.match(matcher, i, seq);
4018            }
4019            matcher.hitEnd = true;
4020            return false;
4021        }
4022
4023        boolean study(TreeInfo info) {
4024            info.minLength++;
4025            info.deterministic = false;
4026            return next.study(info);
4027        }
4028    }
4029
4030    /**
4031     * Node class that handles grapheme boundaries
4032     */
4033    static class GraphemeBound extends Node {
4034        boolean match(Matcher matcher, int i, CharSequence seq) {
4035            int startIndex = matcher.from;
4036            int endIndex = matcher.to;
4037            if (matcher.transparentBounds) {
4038                startIndex = 0;
4039                endIndex = matcher.getTextLength();
4040            }
4041            if (i == startIndex) {
4042                return next.match(matcher, i, seq);
4043            }
4044            if (i < endIndex) {
4045                if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
4046                    !Grapheme.isBoundary(Character.codePointBefore(seq, i),
4047                                         Character.codePointAt(seq, i))) {
4048                    return false;
4049                }
4050            } else {
4051                matcher.hitEnd = true;
4052                matcher.requireEnd = true;
4053            }
4054            return next.match(matcher, i, seq);
4055        }
4056    }
4057
4058    /**
4059     * Base class for all Slice nodes
4060     */
4061    static class SliceNode extends Node {
4062        int[] buffer;
4063        SliceNode(int[] buf) {
4064            buffer = buf;
4065        }
4066        boolean study(TreeInfo info) {
4067            info.minLength += buffer.length;
4068            info.maxLength += buffer.length;
4069            return next.study(info);
4070        }
4071    }
4072
4073    /**
4074     * Node class for a case sensitive/BMP-only sequence of literal
4075     * characters.
4076     */
4077    static class Slice extends SliceNode {
4078        Slice(int[] buf) {
4079            super(buf);
4080        }
4081        boolean match(Matcher matcher, int i, CharSequence seq) {
4082            int[] buf = buffer;
4083            int len = buf.length;
4084            for (int j=0; j<len; j++) {
4085                if ((i+j) >= matcher.to) {
4086                    matcher.hitEnd = true;
4087                    return false;
4088                }
4089                if (buf[j] != seq.charAt(i+j))
4090                    return false;
4091            }
4092            return next.match(matcher, i+len, seq);
4093        }
4094    }
4095
4096    /**
4097     * Node class for a case_insensitive/BMP-only sequence of literal
4098     * characters.
4099     */
4100    static class SliceI extends SliceNode {
4101        SliceI(int[] buf) {
4102            super(buf);
4103        }
4104        boolean match(Matcher matcher, int i, CharSequence seq) {
4105            int[] buf = buffer;
4106            int len = buf.length;
4107            for (int j=0; j<len; j++) {
4108                if ((i+j) >= matcher.to) {
4109                    matcher.hitEnd = true;
4110                    return false;
4111                }
4112                int c = seq.charAt(i+j);
4113                if (buf[j] != c &&
4114                    buf[j] != ASCII.toLower(c))
4115                    return false;
4116            }
4117            return next.match(matcher, i+len, seq);
4118        }
4119    }
4120
4121    /**
4122     * Node class for a unicode_case_insensitive/BMP-only sequence of
4123     * literal characters. Uses unicode case folding.
4124     */
4125    static final class SliceU extends SliceNode {
4126        SliceU(int[] buf) {
4127            super(buf);
4128        }
4129        boolean match(Matcher matcher, int i, CharSequence seq) {
4130            int[] buf = buffer;
4131            int len = buf.length;
4132            for (int j=0; j<len; j++) {
4133                if ((i+j) >= matcher.to) {
4134                    matcher.hitEnd = true;
4135                    return false;
4136                }
4137                int c = seq.charAt(i+j);
4138                if (buf[j] != c &&
4139                    buf[j] != Character.toLowerCase(Character.toUpperCase(c)))
4140                    return false;
4141            }
4142            return next.match(matcher, i+len, seq);
4143        }
4144    }
4145
4146    /**
4147     * Node class for a case sensitive sequence of literal characters
4148     * including supplementary characters.
4149     */
4150    static final class SliceS extends Slice {
4151        SliceS(int[] buf) {
4152            super(buf);
4153        }
4154        boolean match(Matcher matcher, int i, CharSequence seq) {
4155            int[] buf = buffer;
4156            int x = i;
4157            for (int j = 0; j < buf.length; j++) {
4158                if (x >= matcher.to) {
4159                    matcher.hitEnd = true;
4160                    return false;
4161                }
4162                int c = Character.codePointAt(seq, x);
4163                if (buf[j] != c)
4164                    return false;
4165                x += Character.charCount(c);
4166                if (x > matcher.to) {
4167                    matcher.hitEnd = true;
4168                    return false;
4169                }
4170            }
4171            return next.match(matcher, x, seq);
4172        }
4173    }
4174
4175    /**
4176     * Node class for a case insensitive sequence of literal characters
4177     * including supplementary characters.
4178     */
4179    static class SliceIS extends SliceNode {
4180        SliceIS(int[] buf) {
4181            super(buf);
4182        }
4183        int toLower(int c) {
4184            return ASCII.toLower(c);
4185        }
4186        boolean match(Matcher matcher, int i, CharSequence seq) {
4187            int[] buf = buffer;
4188            int x = i;
4189            for (int j = 0; j < buf.length; j++) {
4190                if (x >= matcher.to) {
4191                    matcher.hitEnd = true;
4192                    return false;
4193                }
4194                int c = Character.codePointAt(seq, x);
4195                if (buf[j] != c && buf[j] != toLower(c))
4196                    return false;
4197                x += Character.charCount(c);
4198                if (x > matcher.to) {
4199                    matcher.hitEnd = true;
4200                    return false;
4201                }
4202            }
4203            return next.match(matcher, x, seq);
4204        }
4205    }
4206
4207    /**
4208     * Node class for a case insensitive sequence of literal characters.
4209     * Uses unicode case folding.
4210     */
4211    static final class SliceUS extends SliceIS {
4212        SliceUS(int[] buf) {
4213            super(buf);
4214        }
4215        int toLower(int c) {
4216            return Character.toLowerCase(Character.toUpperCase(c));
4217        }
4218    }
4219
4220    private static boolean inRange(int lower, int ch, int upper) {
4221        return lower <= ch && ch <= upper;
4222    }
4223
4224    /**
4225     * Returns node for matching characters within an explicit value range.
4226     */
4227    private static CharProperty rangeFor(final int lower,
4228                                         final int upper) {
4229        return new CharProperty() {
4230                boolean isSatisfiedBy(int ch) {
4231                    return inRange(lower, ch, upper);}};
4232    }
4233
4234    /**
4235     * Returns node for matching characters within an explicit value
4236     * range in a case insensitive manner.
4237     */
4238    private CharProperty caseInsensitiveRangeFor(final int lower,
4239                                                 final int upper) {
4240        if (has(UNICODE_CASE))
4241            return new CharProperty() {
4242                boolean isSatisfiedBy(int ch) {
4243                    if (inRange(lower, ch, upper))
4244                        return true;
4245                    int up = Character.toUpperCase(ch);
4246                    return inRange(lower, up, upper) ||
4247                           inRange(lower, Character.toLowerCase(up), upper);}};
4248        return new CharProperty() {
4249            boolean isSatisfiedBy(int ch) {
4250                return inRange(lower, ch, upper) ||
4251                    ASCII.isAscii(ch) &&
4252                        (inRange(lower, ASCII.toUpper(ch), upper) ||
4253                         inRange(lower, ASCII.toLower(ch), upper));
4254            }};
4255    }
4256
4257    /**
4258     * Implements the Unicode category ALL and the dot metacharacter when
4259     * in dotall mode.
4260     */
4261    static final class All extends CharProperty {
4262        boolean isSatisfiedBy(int ch) {
4263            return true;
4264        }
4265    }
4266
4267    /**
4268     * Node class for the dot metacharacter when dotall is not enabled.
4269     */
4270    static final class Dot extends CharProperty {
4271        boolean isSatisfiedBy(int ch) {
4272            return (ch != '\n' && ch != '\r'
4273                    && (ch|1) != '\u2029'
4274                    && ch != '\u0085');
4275        }
4276    }
4277
4278    /**
4279     * Node class for the dot metacharacter when dotall is not enabled
4280     * but UNIX_LINES is enabled.
4281     */
4282    static final class UnixDot extends CharProperty {
4283        boolean isSatisfiedBy(int ch) {
4284            return ch != '\n';
4285        }
4286    }
4287
4288    /**
4289     * The 0 or 1 quantifier. This one class implements all three types.
4290     */
4291    static final class Ques extends Node {
4292        Node atom;
4293        int type;
4294        Ques(Node node, int type) {
4295            this.atom = node;
4296            this.type = type;
4297        }
4298        boolean match(Matcher matcher, int i, CharSequence seq) {
4299            switch (type) {
4300            case GREEDY:
4301                return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
4302                    || next.match(matcher, i, seq);
4303            case LAZY:
4304                return next.match(matcher, i, seq)
4305                    || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
4306            case POSSESSIVE:
4307                if (atom.match(matcher, i, seq)) i = matcher.last;
4308                return next.match(matcher, i, seq);
4309            default:
4310                return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4311            }
4312        }
4313        boolean study(TreeInfo info) {
4314            if (type != INDEPENDENT) {
4315                int minL = info.minLength;
4316                atom.study(info);
4317                info.minLength = minL;
4318                info.deterministic = false;
4319                return next.study(info);
4320            } else {
4321                atom.study(info);
4322                return next.study(info);
4323            }
4324        }
4325    }
4326
4327    /**
4328     * Handles the curly-brace style repetition with a specified minimum and
4329     * maximum occurrences. The * quantifier is handled as a special case.
4330     * This class handles the three types.
4331     */
4332    static final class Curly extends Node {
4333        Node atom;
4334        int type;
4335        int cmin;
4336        int cmax;
4337
4338        Curly(Node node, int cmin, int cmax, int type) {
4339            this.atom = node;
4340            this.type = type;
4341            this.cmin = cmin;
4342            this.cmax = cmax;
4343        }
4344        boolean match(Matcher matcher, int i, CharSequence seq) {
4345            int j;
4346            for (j = 0; j < cmin; j++) {
4347                if (atom.match(matcher, i, seq)) {
4348                    i = matcher.last;
4349                    continue;
4350                }
4351                return false;
4352            }
4353            if (type == GREEDY)
4354                return match0(matcher, i, j, seq);
4355            else if (type == LAZY)
4356                return match1(matcher, i, j, seq);
4357            else
4358                return match2(matcher, i, j, seq);
4359        }
4360        // Greedy match.
4361        // i is the index to start matching at
4362        // j is the number of atoms that have matched
4363        boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4364            if (j >= cmax) {
4365                // We have matched the maximum... continue with the rest of
4366                // the regular expression
4367                return next.match(matcher, i, seq);
4368            }
4369            int backLimit = j;
4370            while (atom.match(matcher, i, seq)) {
4371                // k is the length of this match
4372                int k = matcher.last - i;
4373                if (k == 0) // Zero length match
4374                    break;
4375                // Move up index and number matched
4376                i = matcher.last;
4377                j++;
4378                // We are greedy so match as many as we can
4379                while (j < cmax) {
4380                    if (!atom.match(matcher, i, seq))
4381                        break;
4382                    if (i + k != matcher.last) {
4383                        if (match0(matcher, matcher.last, j+1, seq))
4384                            return true;
4385                        break;
4386                    }
4387                    i += k;
4388                    j++;
4389                }
4390                // Handle backing off if match fails
4391                while (j >= backLimit) {
4392                   if (next.match(matcher, i, seq))
4393                        return true;
4394                    i -= k;
4395                    j--;
4396                }
4397                return false;
4398            }
4399            return next.match(matcher, i, seq);
4400        }
4401        // Reluctant match. At this point, the minimum has been satisfied.
4402        // i is the index to start matching at
4403        // j is the number of atoms that have matched
4404        boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
4405            for (;;) {
4406                // Try finishing match without consuming any more
4407                if (next.match(matcher, i, seq))
4408                    return true;
4409                // At the maximum, no match found
4410                if (j >= cmax)
4411                    return false;
4412                // Okay, must try one more atom
4413                if (!atom.match(matcher, i, seq))
4414                    return false;
4415                // If we haven't moved forward then must break out
4416                if (i == matcher.last)
4417                    return false;
4418                // Move up index and number matched
4419                i = matcher.last;
4420                j++;
4421            }
4422        }
4423        boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
4424            for (; j < cmax; j++) {
4425                if (!atom.match(matcher, i, seq))
4426                    break;
4427                if (i == matcher.last)
4428                    break;
4429                i = matcher.last;
4430            }
4431            return next.match(matcher, i, seq);
4432        }
4433        boolean study(TreeInfo info) {
4434            // Save original info
4435            int minL = info.minLength;
4436            int maxL = info.maxLength;
4437            boolean maxV = info.maxValid;
4438            boolean detm = info.deterministic;
4439            info.reset();
4440
4441            atom.study(info);
4442
4443            int temp = info.minLength * cmin + minL;
4444            if (temp < minL) {
4445                temp = 0xFFFFFFF; // arbitrary large number
4446            }
4447            info.minLength = temp;
4448
4449            if (maxV & info.maxValid) {
4450                temp = info.maxLength * cmax + maxL;
4451                info.maxLength = temp;
4452                if (temp < maxL) {
4453                    info.maxValid = false;
4454                }
4455            } else {
4456                info.maxValid = false;
4457            }
4458
4459            if (info.deterministic && cmin == cmax)
4460                info.deterministic = detm;
4461            else
4462                info.deterministic = false;
4463            return next.study(info);
4464        }
4465    }
4466
4467    /**
4468     * Handles the curly-brace style repetition with a specified minimum and
4469     * maximum occurrences in deterministic cases. This is an iterative
4470     * optimization over the Prolog and Loop system which would handle this
4471     * in a recursive way. The * quantifier is handled as a special case.
4472     * If capture is true then this class saves group settings and ensures
4473     * that groups are unset when backing off of a group match.
4474     */
4475    static final class GroupCurly extends Node {
4476        Node atom;
4477        int type;
4478        int cmin;
4479        int cmax;
4480        int localIndex;
4481        int groupIndex;
4482        boolean capture;
4483
4484        GroupCurly(Node node, int cmin, int cmax, int type, int local,
4485                   int group, boolean capture) {
4486            this.atom = node;
4487            this.type = type;
4488            this.cmin = cmin;
4489            this.cmax = cmax;
4490            this.localIndex = local;
4491            this.groupIndex = group;
4492            this.capture = capture;
4493        }
4494        boolean match(Matcher matcher, int i, CharSequence seq) {
4495            int[] groups = matcher.groups;
4496            int[] locals = matcher.locals;
4497            int save0 = locals[localIndex];
4498            int save1 = 0;
4499            int save2 = 0;
4500
4501            if (capture) {
4502                save1 = groups[groupIndex];
4503                save2 = groups[groupIndex+1];
4504            }
4505
4506            // Notify GroupTail there is no need to setup group info
4507            // because it will be set here
4508            locals[localIndex] = -1;
4509
4510            boolean ret = true;
4511            for (int j = 0; j < cmin; j++) {
4512                if (atom.match(matcher, i, seq)) {
4513                    if (capture) {
4514                        groups[groupIndex] = i;
4515                        groups[groupIndex+1] = matcher.last;
4516                    }
4517                    i = matcher.last;
4518                } else {
4519                    ret = false;
4520                    break;
4521                }
4522            }
4523            if (ret) {
4524                if (type == GREEDY) {
4525                    ret = match0(matcher, i, cmin, seq);
4526                } else if (type == LAZY) {
4527                    ret = match1(matcher, i, cmin, seq);
4528                } else {
4529                    ret = match2(matcher, i, cmin, seq);
4530                }
4531            }
4532            if (!ret) {
4533                locals[localIndex] = save0;
4534                if (capture) {
4535                    groups[groupIndex] = save1;
4536                    groups[groupIndex+1] = save2;
4537                }
4538            }
4539            return ret;
4540        }
4541        // Aggressive group match
4542        boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
4543            // don't back off passing the starting "j"
4544            int min = j;
4545            int[] groups = matcher.groups;
4546            int save0 = 0;
4547            int save1 = 0;
4548            if (capture) {
4549                save0 = groups[groupIndex];
4550                save1 = groups[groupIndex+1];
4551            }
4552            for (;;) {
4553                if (j >= cmax)
4554                    break;
4555                if (!atom.match(matcher, i, seq))
4556                    break;
4557                int k = matcher.last - i;
4558                if (k <= 0) {
4559                    if (capture) {
4560                        groups[groupIndex] = i;
4561                        groups[groupIndex+1] = i + k;
4562                    }
4563                    i = i + k;
4564                    break;
4565                }
4566                for (;;) {
4567                    if (capture) {
4568                        groups[groupIndex] = i;
4569                        groups[groupIndex+1] = i + k;
4570                    }
4571                    i = i + k;
4572                    if (++j >= cmax)
4573                        break;
4574                    if (!atom.match(matcher, i, seq))
4575                        break;
4576                    if (i + k != matcher.last) {
4577                        if (match0(matcher, i, j, seq))
4578                            return true;
4579                        break;
4580                    }
4581                }
4582                while (j > min) {
4583                    if (next.match(matcher, i, seq)) {
4584                        if (capture) {
4585                            groups[groupIndex+1] = i;
4586                            groups[groupIndex] = i - k;
4587                        }
4588                        return true;
4589                    }
4590                    // backing off
4591                    i = i - k;
4592                    if (capture) {
4593                        groups[groupIndex+1] = i;
4594                        groups[groupIndex] = i - k;
4595                    }
4596                    j--;
4597
4598                }
4599                break;
4600            }
4601            if (capture) {
4602                groups[groupIndex] = save0;
4603                groups[groupIndex+1] = save1;
4604            }
4605            return next.match(matcher, i, seq);
4606        }
4607        // Reluctant matching
4608        boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
4609            for (;;) {
4610                if (next.match(matcher, i, seq))
4611                    return true;
4612                if (j >= cmax)
4613                    return false;
4614                if (!atom.match(matcher, i, seq))
4615                    return false;
4616                if (i == matcher.last)
4617                    return false;
4618                if (capture) {
4619                    matcher.groups[groupIndex] = i;
4620                    matcher.groups[groupIndex+1] = matcher.last;
4621                }
4622                i = matcher.last;
4623                j++;
4624            }
4625        }
4626        // Possessive matching
4627        boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
4628            for (; j < cmax; j++) {
4629                if (!atom.match(matcher, i, seq)) {
4630                    break;
4631                }
4632                if (capture) {
4633                    matcher.groups[groupIndex] = i;
4634                    matcher.groups[groupIndex+1] = matcher.last;
4635                }
4636                if (i == matcher.last) {
4637                    break;
4638                }
4639                i = matcher.last;
4640            }
4641            return next.match(matcher, i, seq);
4642        }
4643        boolean study(TreeInfo info) {
4644            // Save original info
4645            int minL = info.minLength;
4646            int maxL = info.maxLength;
4647            boolean maxV = info.maxValid;
4648            boolean detm = info.deterministic;
4649            info.reset();
4650
4651            atom.study(info);
4652
4653            int temp = info.minLength * cmin + minL;
4654            if (temp < minL) {
4655                temp = 0xFFFFFFF; // Arbitrary large number
4656            }
4657            info.minLength = temp;
4658
4659            if (maxV & info.maxValid) {
4660                temp = info.maxLength * cmax + maxL;
4661                info.maxLength = temp;
4662                if (temp < maxL) {
4663                    info.maxValid = false;
4664                }
4665            } else {
4666                info.maxValid = false;
4667            }
4668
4669            if (info.deterministic && cmin == cmax) {
4670                info.deterministic = detm;
4671            } else {
4672                info.deterministic = false;
4673            }
4674            return next.study(info);
4675        }
4676    }
4677
4678    /**
4679     * A Guard node at the end of each atom node in a Branch. It
4680     * serves the purpose of chaining the "match" operation to
4681     * "next" but not the "study", so we can collect the TreeInfo
4682     * of each atom node without including the TreeInfo of the
4683     * "next".
4684     */
4685    static final class BranchConn extends Node {
4686        BranchConn() {};
4687        boolean match(Matcher matcher, int i, CharSequence seq) {
4688            return next.match(matcher, i, seq);
4689        }
4690        boolean study(TreeInfo info) {
4691            return info.deterministic;
4692        }
4693    }
4694
4695    /**
4696     * Handles the branching of alternations. Note this is also used for
4697     * the ? quantifier to branch between the case where it matches once
4698     * and where it does not occur.
4699     */
4700    static final class Branch extends Node {
4701        Node[] atoms = new Node[2];
4702        int size = 2;
4703        Node conn;
4704        Branch(Node first, Node second, Node branchConn) {
4705            conn = branchConn;
4706            atoms[0] = first;
4707            atoms[1] = second;
4708        }
4709
4710        void add(Node node) {
4711            if (size >= atoms.length) {
4712                Node[] tmp = new Node[atoms.length*2];
4713                System.arraycopy(atoms, 0, tmp, 0, atoms.length);
4714                atoms = tmp;
4715            }
4716            atoms[size++] = node;
4717        }
4718
4719        boolean match(Matcher matcher, int i, CharSequence seq) {
4720            for (int n = 0; n < size; n++) {
4721                if (atoms[n] == null) {
4722                    if (conn.next.match(matcher, i, seq))
4723                        return true;
4724                } else if (atoms[n].match(matcher, i, seq)) {
4725                    return true;
4726                }
4727            }
4728            return false;
4729        }
4730
4731        boolean study(TreeInfo info) {
4732            int minL = info.minLength;
4733            int maxL = info.maxLength;
4734            boolean maxV = info.maxValid;
4735
4736            int minL2 = Integer.MAX_VALUE; //arbitrary large enough num
4737            int maxL2 = -1;
4738            for (int n = 0; n < size; n++) {
4739                info.reset();
4740                if (atoms[n] != null)
4741                    atoms[n].study(info);
4742                minL2 = Math.min(minL2, info.minLength);
4743                maxL2 = Math.max(maxL2, info.maxLength);
4744                maxV = (maxV & info.maxValid);
4745            }
4746
4747            minL += minL2;
4748            maxL += maxL2;
4749
4750            info.reset();
4751            conn.next.study(info);
4752
4753            info.minLength += minL;
4754            info.maxLength += maxL;
4755            info.maxValid &= maxV;
4756            info.deterministic = false;
4757            return false;
4758        }
4759    }
4760
4761    /**
4762     * The GroupHead saves the location where the group begins in the locals
4763     * and restores them when the match is done.
4764     *
4765     * The matchRef is used when a reference to this group is accessed later
4766     * in the expression. The locals will have a negative value in them to
4767     * indicate that we do not want to unset the group if the reference
4768     * doesn't match.
4769     */
4770    static final class GroupHead extends Node {
4771        int localIndex;
4772        GroupHead(int localCount) {
4773            localIndex = localCount;
4774        }
4775        boolean match(Matcher matcher, int i, CharSequence seq) {
4776            int save = matcher.locals[localIndex];
4777            matcher.locals[localIndex] = i;
4778            boolean ret = next.match(matcher, i, seq);
4779            matcher.locals[localIndex] = save;
4780            return ret;
4781        }
4782        boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4783            int save = matcher.locals[localIndex];
4784            matcher.locals[localIndex] = ~i; // HACK
4785            boolean ret = next.match(matcher, i, seq);
4786            matcher.locals[localIndex] = save;
4787            return ret;
4788        }
4789    }
4790
4791    /**
4792     * Recursive reference to a group in the regular expression. It calls
4793     * matchRef because if the reference fails to match we would not unset
4794     * the group.
4795     */
4796    static final class GroupRef extends Node {
4797        GroupHead head;
4798        GroupRef(GroupHead head) {
4799            this.head = head;
4800        }
4801        boolean match(Matcher matcher, int i, CharSequence seq) {
4802            return head.matchRef(matcher, i, seq)
4803                && next.match(matcher, matcher.last, seq);
4804        }
4805        boolean study(TreeInfo info) {
4806            info.maxValid = false;
4807            info.deterministic = false;
4808            return next.study(info);
4809        }
4810    }
4811
4812    /**
4813     * The GroupTail handles the setting of group beginning and ending
4814     * locations when groups are successfully matched. It must also be able to
4815     * unset groups that have to be backed off of.
4816     *
4817     * The GroupTail node is also used when a previous group is referenced,
4818     * and in that case no group information needs to be set.
4819     */
4820    static final class GroupTail extends Node {
4821        int localIndex;
4822        int groupIndex;
4823        GroupTail(int localCount, int groupCount) {
4824            localIndex = localCount;
4825            groupIndex = groupCount + groupCount;
4826        }
4827        boolean match(Matcher matcher, int i, CharSequence seq) {
4828            int tmp = matcher.locals[localIndex];
4829            if (tmp >= 0) { // This is the normal group case.
4830                // Save the group so we can unset it if it
4831                // backs off of a match.
4832                int groupStart = matcher.groups[groupIndex];
4833                int groupEnd = matcher.groups[groupIndex+1];
4834
4835                matcher.groups[groupIndex] = tmp;
4836                matcher.groups[groupIndex+1] = i;
4837                if (next.match(matcher, i, seq)) {
4838                    return true;
4839                }
4840                matcher.groups[groupIndex] = groupStart;
4841                matcher.groups[groupIndex+1] = groupEnd;
4842                return false;
4843            } else {
4844                // This is a group reference case. We don't need to save any
4845                // group info because it isn't really a group.
4846                matcher.last = i;
4847                return true;
4848            }
4849        }
4850    }
4851
4852    /**
4853     * This sets up a loop to handle a recursive quantifier structure.
4854     */
4855    static final class Prolog extends Node {
4856        Loop loop;
4857        Prolog(Loop loop) {
4858            this.loop = loop;
4859        }
4860        boolean match(Matcher matcher, int i, CharSequence seq) {
4861            return loop.matchInit(matcher, i, seq);
4862        }
4863        boolean study(TreeInfo info) {
4864            return loop.study(info);
4865        }
4866    }
4867
4868    /**
4869     * Handles the repetition count for a greedy Curly. The matchInit
4870     * is called from the Prolog to save the index of where the group
4871     * beginning is stored. A zero length group check occurs in the
4872     * normal match but is skipped in the matchInit.
4873     */
4874    static class Loop extends Node {
4875        Node body;
4876        int countIndex; // local count index in matcher locals
4877        int beginIndex; // group beginning index
4878        int cmin, cmax;
4879        Loop(int countIndex, int beginIndex) {
4880            this.countIndex = countIndex;
4881            this.beginIndex = beginIndex;
4882        }
4883        boolean match(Matcher matcher, int i, CharSequence seq) {
4884            // Avoid infinite loop in zero-length case.
4885            if (i > matcher.locals[beginIndex]) {
4886                int count = matcher.locals[countIndex];
4887
4888                // This block is for before we reach the minimum
4889                // iterations required for the loop to match
4890                if (count < cmin) {
4891                    matcher.locals[countIndex] = count + 1;
4892                    boolean b = body.match(matcher, i, seq);
4893                    // If match failed we must backtrack, so
4894                    // the loop count should NOT be incremented
4895                    if (!b)
4896                        matcher.locals[countIndex] = count;
4897                    // Return success or failure since we are under
4898                    // minimum
4899                    return b;
4900                }
4901                // This block is for after we have the minimum
4902                // iterations required for the loop to match
4903                if (count < cmax) {
4904                    matcher.locals[countIndex] = count + 1;
4905                    boolean b = body.match(matcher, i, seq);
4906                    // If match failed we must backtrack, so
4907                    // the loop count should NOT be incremented
4908                    if (!b)
4909                        matcher.locals[countIndex] = count;
4910                    else
4911                        return true;
4912                }
4913            }
4914            return next.match(matcher, i, seq);
4915        }
4916        boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4917            int save = matcher.locals[countIndex];
4918            boolean ret = false;
4919            if (0 < cmin) {
4920                matcher.locals[countIndex] = 1;
4921                ret = body.match(matcher, i, seq);
4922            } else if (0 < cmax) {
4923                matcher.locals[countIndex] = 1;
4924                ret = body.match(matcher, i, seq);
4925                if (ret == false)
4926                    ret = next.match(matcher, i, seq);
4927            } else {
4928                ret = next.match(matcher, i, seq);
4929            }
4930            matcher.locals[countIndex] = save;
4931            return ret;
4932        }
4933        boolean study(TreeInfo info) {
4934            info.maxValid = false;
4935            info.deterministic = false;
4936            return false;
4937        }
4938    }
4939
4940    /**
4941     * Handles the repetition count for a reluctant Curly. The matchInit
4942     * is called from the Prolog to save the index of where the group
4943     * beginning is stored. A zero length group check occurs in the
4944     * normal match but is skipped in the matchInit.
4945     */
4946    static final class LazyLoop extends Loop {
4947        LazyLoop(int countIndex, int beginIndex) {
4948            super(countIndex, beginIndex);
4949        }
4950        boolean match(Matcher matcher, int i, CharSequence seq) {
4951            // Check for zero length group
4952            if (i > matcher.locals[beginIndex]) {
4953                int count = matcher.locals[countIndex];
4954                if (count < cmin) {
4955                    matcher.locals[countIndex] = count + 1;
4956                    boolean result = body.match(matcher, i, seq);
4957                    // If match failed we must backtrack, so
4958                    // the loop count should NOT be incremented
4959                    if (!result)
4960                        matcher.locals[countIndex] = count;
4961                    return result;
4962                }
4963                if (next.match(matcher, i, seq))
4964                    return true;
4965                if (count < cmax) {
4966                    matcher.locals[countIndex] = count + 1;
4967                    boolean result = body.match(matcher, i, seq);
4968                    // If match failed we must backtrack, so
4969                    // the loop count should NOT be incremented
4970                    if (!result)
4971                        matcher.locals[countIndex] = count;
4972                    return result;
4973                }
4974                return false;
4975            }
4976            return next.match(matcher, i, seq);
4977        }
4978        boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4979            int save = matcher.locals[countIndex];
4980            boolean ret = false;
4981            if (0 < cmin) {
4982                matcher.locals[countIndex] = 1;
4983                ret = body.match(matcher, i, seq);
4984            } else if (next.match(matcher, i, seq)) {
4985                ret = true;
4986            } else if (0 < cmax) {
4987                matcher.locals[countIndex] = 1;
4988                ret = body.match(matcher, i, seq);
4989            }
4990            matcher.locals[countIndex] = save;
4991            return ret;
4992        }
4993        boolean study(TreeInfo info) {
4994            info.maxValid = false;
4995            info.deterministic = false;
4996            return false;
4997        }
4998    }
4999
5000    /**
5001     * Refers to a group in the regular expression. Attempts to match
5002     * whatever the group referred to last matched.
5003     */
5004    static class BackRef extends Node {
5005        int groupIndex;
5006        BackRef(int groupCount) {
5007            super();
5008            groupIndex = groupCount + groupCount;
5009        }
5010        boolean match(Matcher matcher, int i, CharSequence seq) {
5011            int j = matcher.groups[groupIndex];
5012            int k = matcher.groups[groupIndex+1];
5013
5014            int groupSize = k - j;
5015            // If the referenced group didn't match, neither can this
5016            if (j < 0)
5017                return false;
5018
5019            // If there isn't enough input left no match
5020            if (i + groupSize > matcher.to) {
5021                matcher.hitEnd = true;
5022                return false;
5023            }
5024            // Check each new char to make sure it matches what the group
5025            // referenced matched last time around
5026            for (int index=0; index<groupSize; index++)
5027                if (seq.charAt(i+index) != seq.charAt(j+index))
5028                    return false;
5029
5030            return next.match(matcher, i+groupSize, seq);
5031        }
5032        boolean study(TreeInfo info) {
5033            info.maxValid = false;
5034            return next.study(info);
5035        }
5036    }
5037
5038    static class CIBackRef extends Node {
5039        int groupIndex;
5040        boolean doUnicodeCase;
5041        CIBackRef(int groupCount, boolean doUnicodeCase) {
5042            super();
5043            groupIndex = groupCount + groupCount;
5044            this.doUnicodeCase = doUnicodeCase;
5045        }
5046        boolean match(Matcher matcher, int i, CharSequence seq) {
5047            int j = matcher.groups[groupIndex];
5048            int k = matcher.groups[groupIndex+1];
5049
5050            int groupSize = k - j;
5051
5052            // If the referenced group didn't match, neither can this
5053            if (j < 0)
5054                return false;
5055
5056            // If there isn't enough input left no match
5057            if (i + groupSize > matcher.to) {
5058                matcher.hitEnd = true;
5059                return false;
5060            }
5061
5062            // Check each new char to make sure it matches what the group
5063            // referenced matched last time around
5064            int x = i;
5065            for (int index=0; index<groupSize; index++) {
5066                int c1 = Character.codePointAt(seq, x);
5067                int c2 = Character.codePointAt(seq, j);
5068                if (c1 != c2) {
5069                    if (doUnicodeCase) {
5070                        int cc1 = Character.toUpperCase(c1);
5071                        int cc2 = Character.toUpperCase(c2);
5072                        if (cc1 != cc2 &&
5073                            Character.toLowerCase(cc1) !=
5074                            Character.toLowerCase(cc2))
5075                            return false;
5076                    } else {
5077                        if (ASCII.toLower(c1) != ASCII.toLower(c2))
5078                            return false;
5079                    }
5080                }
5081                x += Character.charCount(c1);
5082                j += Character.charCount(c2);
5083            }
5084
5085            return next.match(matcher, i+groupSize, seq);
5086        }
5087        boolean study(TreeInfo info) {
5088            info.maxValid = false;
5089            return next.study(info);
5090        }
5091    }
5092
5093    /**
5094     * Searches until the next instance of its atom. This is useful for
5095     * finding the atom efficiently without passing an instance of it
5096     * (greedy problem) and without a lot of wasted search time (reluctant
5097     * problem).
5098     */
5099    static final class First extends Node {
5100        Node atom;
5101        First(Node node) {
5102            this.atom = BnM.optimize(node);
5103        }
5104        boolean match(Matcher matcher, int i, CharSequence seq) {
5105            if (atom instanceof BnM) {
5106                return atom.match(matcher, i, seq)
5107                    && next.match(matcher, matcher.last, seq);
5108            }
5109            for (;;) {
5110                if (i > matcher.to) {
5111                    matcher.hitEnd = true;
5112                    return false;
5113                }
5114                if (atom.match(matcher, i, seq)) {
5115                    return next.match(matcher, matcher.last, seq);
5116                }
5117                i += countChars(seq, i, 1);
5118                matcher.first++;
5119            }
5120        }
5121        boolean study(TreeInfo info) {
5122            atom.study(info);
5123            info.maxValid = false;
5124            info.deterministic = false;
5125            return next.study(info);
5126        }
5127    }
5128
5129    static final class Conditional extends Node {
5130        Node cond, yes, not;
5131        Conditional(Node cond, Node yes, Node not) {
5132            this.cond = cond;
5133            this.yes = yes;
5134            this.not = not;
5135        }
5136        boolean match(Matcher matcher, int i, CharSequence seq) {
5137            if (cond.match(matcher, i, seq)) {
5138                return yes.match(matcher, i, seq);
5139            } else {
5140                return not.match(matcher, i, seq);
5141            }
5142        }
5143        boolean study(TreeInfo info) {
5144            int minL = info.minLength;
5145            int maxL = info.maxLength;
5146            boolean maxV = info.maxValid;
5147            info.reset();
5148            yes.study(info);
5149
5150            int minL2 = info.minLength;
5151            int maxL2 = info.maxLength;
5152            boolean maxV2 = info.maxValid;
5153            info.reset();
5154            not.study(info);
5155
5156            info.minLength = minL + Math.min(minL2, info.minLength);
5157            info.maxLength = maxL + Math.max(maxL2, info.maxLength);
5158            info.maxValid = (maxV & maxV2 & info.maxValid);
5159            info.deterministic = false;
5160            return next.study(info);
5161        }
5162    }
5163
5164    /**
5165     * Zero width positive lookahead.
5166     */
5167    static final class Pos extends Node {
5168        Node cond;
5169        Pos(Node cond) {
5170            this.cond = cond;
5171        }
5172        boolean match(Matcher matcher, int i, CharSequence seq) {
5173            int savedTo = matcher.to;
5174            boolean conditionMatched = false;
5175
5176            // Relax transparent region boundaries for lookahead
5177            if (matcher.transparentBounds)
5178                matcher.to = matcher.getTextLength();
5179            try {
5180                conditionMatched = cond.match(matcher, i, seq);
5181            } finally {
5182                // Reinstate region boundaries
5183                matcher.to = savedTo;
5184            }
5185            return conditionMatched && next.match(matcher, i, seq);
5186        }
5187    }
5188
5189    /**
5190     * Zero width negative lookahead.
5191     */
5192    static final class Neg extends Node {
5193        Node cond;
5194        Neg(Node cond) {
5195            this.cond = cond;
5196        }
5197        boolean match(Matcher matcher, int i, CharSequence seq) {
5198            int savedTo = matcher.to;
5199            boolean conditionMatched = false;
5200
5201            // Relax transparent region boundaries for lookahead
5202            if (matcher.transparentBounds)
5203                matcher.to = matcher.getTextLength();
5204            try {
5205                if (i < matcher.to) {
5206                    conditionMatched = !cond.match(matcher, i, seq);
5207                } else {
5208                    // If a negative lookahead succeeds then more input
5209                    // could cause it to fail!
5210                    matcher.requireEnd = true;
5211                    conditionMatched = !cond.match(matcher, i, seq);
5212                }
5213            } finally {
5214                // Reinstate region boundaries
5215                matcher.to = savedTo;
5216            }
5217            return conditionMatched && next.match(matcher, i, seq);
5218        }
5219    }
5220
5221    /**
5222     * For use with lookbehinds; matches the position where the lookbehind
5223     * was encountered.
5224     */
5225    static Node lookbehindEnd = new Node() {
5226        boolean match(Matcher matcher, int i, CharSequence seq) {
5227            return i == matcher.lookbehindTo;
5228        }
5229    };
5230
5231    /**
5232     * Zero width positive lookbehind.
5233     */
5234    static class Behind extends Node {
5235        Node cond;
5236        int rmax, rmin;
5237        Behind(Node cond, int rmax, int rmin) {
5238            this.cond = cond;
5239            this.rmax = rmax;
5240            this.rmin = rmin;
5241        }
5242
5243        boolean match(Matcher matcher, int i, CharSequence seq) {
5244            int savedFrom = matcher.from;
5245            boolean conditionMatched = false;
5246            int startIndex = (!matcher.transparentBounds) ?
5247                             matcher.from : 0;
5248            int from = Math.max(i - rmax, startIndex);
5249            // Set end boundary
5250            int savedLBT = matcher.lookbehindTo;
5251            matcher.lookbehindTo = i;
5252            // Relax transparent region boundaries for lookbehind
5253            if (matcher.transparentBounds)
5254                matcher.from = 0;
5255            for (int j = i - rmin; !conditionMatched && j >= from; j--) {
5256                conditionMatched = cond.match(matcher, j, seq);
5257            }
5258            matcher.from = savedFrom;
5259            matcher.lookbehindTo = savedLBT;
5260            return conditionMatched && next.match(matcher, i, seq);
5261        }
5262    }
5263
5264    /**
5265     * Zero width positive lookbehind, including supplementary
5266     * characters or unpaired surrogates.
5267     */
5268    static final class BehindS extends Behind {
5269        BehindS(Node cond, int rmax, int rmin) {
5270            super(cond, rmax, rmin);
5271        }
5272        boolean match(Matcher matcher, int i, CharSequence seq) {
5273            int rmaxChars = countChars(seq, i, -rmax);
5274            int rminChars = countChars(seq, i, -rmin);
5275            int savedFrom = matcher.from;
5276            int startIndex = (!matcher.transparentBounds) ?
5277                             matcher.from : 0;
5278            boolean conditionMatched = false;
5279            int from = Math.max(i - rmaxChars, startIndex);
5280            // Set end boundary
5281            int savedLBT = matcher.lookbehindTo;
5282            matcher.lookbehindTo = i;
5283            // Relax transparent region boundaries for lookbehind
5284            if (matcher.transparentBounds)
5285                matcher.from = 0;
5286
5287            for (int j = i - rminChars;
5288                 !conditionMatched && j >= from;
5289                 j -= j>from ? countChars(seq, j, -1) : 1) {
5290                conditionMatched = cond.match(matcher, j, seq);
5291            }
5292            matcher.from = savedFrom;
5293            matcher.lookbehindTo = savedLBT;
5294            return conditionMatched && next.match(matcher, i, seq);
5295        }
5296    }
5297
5298    /**
5299     * Zero width negative lookbehind.
5300     */
5301    static class NotBehind extends Node {
5302        Node cond;
5303        int rmax, rmin;
5304        NotBehind(Node cond, int rmax, int rmin) {
5305            this.cond = cond;
5306            this.rmax = rmax;
5307            this.rmin = rmin;
5308        }
5309
5310        boolean match(Matcher matcher, int i, CharSequence seq) {
5311            int savedLBT = matcher.lookbehindTo;
5312            int savedFrom = matcher.from;
5313            boolean conditionMatched = false;
5314            int startIndex = (!matcher.transparentBounds) ?
5315                             matcher.from : 0;
5316            int from = Math.max(i - rmax, startIndex);
5317            matcher.lookbehindTo = i;
5318            // Relax transparent region boundaries for lookbehind
5319            if (matcher.transparentBounds)
5320                matcher.from = 0;
5321            for (int j = i - rmin; !conditionMatched && j >= from; j--) {
5322                conditionMatched = cond.match(matcher, j, seq);
5323            }
5324            // Reinstate region boundaries
5325            matcher.from = savedFrom;
5326            matcher.lookbehindTo = savedLBT;
5327            return !conditionMatched && next.match(matcher, i, seq);
5328        }
5329    }
5330
5331    /**
5332     * Zero width negative lookbehind, including supplementary
5333     * characters or unpaired surrogates.
5334     */
5335    static final class NotBehindS extends NotBehind {
5336        NotBehindS(Node cond, int rmax, int rmin) {
5337            super(cond, rmax, rmin);
5338        }
5339        boolean match(Matcher matcher, int i, CharSequence seq) {
5340            int rmaxChars = countChars(seq, i, -rmax);
5341            int rminChars = countChars(seq, i, -rmin);
5342            int savedFrom = matcher.from;
5343            int savedLBT = matcher.lookbehindTo;
5344            boolean conditionMatched = false;
5345            int startIndex = (!matcher.transparentBounds) ?
5346                             matcher.from : 0;
5347            int from = Math.max(i - rmaxChars, startIndex);
5348            matcher.lookbehindTo = i;
5349            // Relax transparent region boundaries for lookbehind
5350            if (matcher.transparentBounds)
5351                matcher.from = 0;
5352            for (int j = i - rminChars;
5353                 !conditionMatched && j >= from;
5354                 j -= j>from ? countChars(seq, j, -1) : 1) {
5355                conditionMatched = cond.match(matcher, j, seq);
5356            }
5357            //Reinstate region boundaries
5358            matcher.from = savedFrom;
5359            matcher.lookbehindTo = savedLBT;
5360            return !conditionMatched && next.match(matcher, i, seq);
5361        }
5362    }
5363
5364    /**
5365     * Returns the set union of two CharProperty nodes.
5366     */
5367    private static CharProperty union(final CharProperty lhs,
5368                                      final CharProperty rhs) {
5369        return new CharProperty() {
5370                boolean isSatisfiedBy(int ch) {
5371                    return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
5372    }
5373
5374    /**
5375     * Returns the set intersection of two CharProperty nodes.
5376     */
5377    private static CharProperty intersection(final CharProperty lhs,
5378                                             final CharProperty rhs) {
5379        return new CharProperty() {
5380                boolean isSatisfiedBy(int ch) {
5381                    return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
5382    }
5383
5384    /**
5385     * Returns the set difference of two CharProperty nodes.
5386     */
5387    private static CharProperty setDifference(final CharProperty lhs,
5388                                              final CharProperty rhs) {
5389        return new CharProperty() {
5390                boolean isSatisfiedBy(int ch) {
5391                    return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
5392    }
5393
5394    /**
5395     * Handles word boundaries. Includes a field to allow this one class to
5396     * deal with the different types of word boundaries we can match. The word
5397     * characters include underscores, letters, and digits. Non spacing marks
5398     * can are also part of a word if they have a base character, otherwise
5399     * they are ignored for purposes of finding word boundaries.
5400     */
5401    static final class Bound extends Node {
5402        static int LEFT = 0x1;
5403        static int RIGHT= 0x2;
5404        static int BOTH = 0x3;
5405        static int NONE = 0x4;
5406        int type;
5407        boolean useUWORD;
5408        Bound(int n, boolean useUWORD) {
5409            type = n;
5410            this.useUWORD = useUWORD;
5411        }
5412
5413        boolean isWord(int ch) {
5414            return useUWORD ? UnicodeProp.WORD.is(ch)
5415                            : (ch == '_' || Character.isLetterOrDigit(ch));
5416        }
5417
5418        int check(Matcher matcher, int i, CharSequence seq) {
5419            int ch;
5420            boolean left = false;
5421            int startIndex = matcher.from;
5422            int endIndex = matcher.to;
5423            if (matcher.transparentBounds) {
5424                startIndex = 0;
5425                endIndex = matcher.getTextLength();
5426            }
5427            if (i > startIndex) {
5428                ch = Character.codePointBefore(seq, i);
5429                left = (isWord(ch) ||
5430                    ((Character.getType(ch) == Character.NON_SPACING_MARK)
5431                     && hasBaseCharacter(matcher, i-1, seq)));
5432            }
5433            boolean right = false;
5434            if (i < endIndex) {
5435                ch = Character.codePointAt(seq, i);
5436                right = (isWord(ch) ||
5437                    ((Character.getType(ch) == Character.NON_SPACING_MARK)
5438                     && hasBaseCharacter(matcher, i, seq)));
5439            } else {
5440                // Tried to access char past the end
5441                matcher.hitEnd = true;
5442                // The addition of another char could wreck a boundary
5443                matcher.requireEnd = true;
5444            }
5445            return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
5446        }
5447        boolean match(Matcher matcher, int i, CharSequence seq) {
5448            return (check(matcher, i, seq) & type) > 0
5449                && next.match(matcher, i, seq);
5450        }
5451    }
5452
5453    /**
5454     * Non spacing marks only count as word characters in bounds calculations
5455     * if they have a base character.
5456     */
5457    private static boolean hasBaseCharacter(Matcher matcher, int i,
5458                                            CharSequence seq)
5459    {
5460        int start = (!matcher.transparentBounds) ?
5461            matcher.from : 0;
5462        for (int x=i; x >= start; x--) {
5463            int ch = Character.codePointAt(seq, x);
5464            if (Character.isLetterOrDigit(ch))
5465                return true;
5466            if (Character.getType(ch) == Character.NON_SPACING_MARK)
5467                continue;
5468            return false;
5469        }
5470        return false;
5471    }
5472
5473    /**
5474     * Attempts to match a slice in the input using the Boyer-Moore string
5475     * matching algorithm. The algorithm is based on the idea that the
5476     * pattern can be shifted farther ahead in the search text if it is
5477     * matched right to left.
5478     * <p>
5479     * The pattern is compared to the input one character at a time, from
5480     * the rightmost character in the pattern to the left. If the characters
5481     * all match the pattern has been found. If a character does not match,
5482     * the pattern is shifted right a distance that is the maximum of two
5483     * functions, the bad character shift and the good suffix shift. This
5484     * shift moves the attempted match position through the input more
5485     * quickly than a naive one position at a time check.
5486     * <p>
5487     * The bad character shift is based on the character from the text that
5488     * did not match. If the character does not appear in the pattern, the
5489     * pattern can be shifted completely beyond the bad character. If the
5490     * character does occur in the pattern, the pattern can be shifted to
5491     * line the pattern up with the next occurrence of that character.
5492     * <p>
5493     * The good suffix shift is based on the idea that some subset on the right
5494     * side of the pattern has matched. When a bad character is found, the
5495     * pattern can be shifted right by the pattern length if the subset does
5496     * not occur again in pattern, or by the amount of distance to the
5497     * next occurrence of the subset in the pattern.
5498     *
5499     * Boyer-Moore search methods adapted from code by Amy Yu.
5500     */
5501    static class BnM extends Node {
5502        int[] buffer;
5503        int[] lastOcc;
5504        int[] optoSft;
5505
5506        /**
5507         * Pre calculates arrays needed to generate the bad character
5508         * shift and the good suffix shift. Only the last seven bits
5509         * are used to see if chars match; This keeps the tables small
5510         * and covers the heavily used ASCII range, but occasionally
5511         * results in an aliased match for the bad character shift.
5512         */
5513        static Node optimize(Node node) {
5514            if (!(node instanceof Slice)) {
5515                return node;
5516            }
5517
5518            int[] src = ((Slice) node).buffer;
5519            int patternLength = src.length;
5520            // The BM algorithm requires a bit of overhead;
5521            // If the pattern is short don't use it, since
5522            // a shift larger than the pattern length cannot
5523            // be used anyway.
5524            if (patternLength < 4) {
5525                return node;
5526            }
5527            int i, j, k;
5528            int[] lastOcc = new int[128];
5529            int[] optoSft = new int[patternLength];
5530            // Precalculate part of the bad character shift
5531            // It is a table for where in the pattern each
5532            // lower 7-bit value occurs
5533            for (i = 0; i < patternLength; i++) {
5534                lastOcc[src[i]&0x7F] = i + 1;
5535            }
5536            // Precalculate the good suffix shift
5537            // i is the shift amount being considered
5538NEXT:       for (i = patternLength; i > 0; i--) {
5539                // j is the beginning index of suffix being considered
5540                for (j = patternLength - 1; j >= i; j--) {
5541                    // Testing for good suffix
5542                    if (src[j] == src[j-i]) {
5543                        // src[j..len] is a good suffix
5544                        optoSft[j-1] = i;
5545                    } else {
5546                        // No match. The array has already been
5547                        // filled up with correct values before.
5548                        continue NEXT;
5549                    }
5550                }
5551                // This fills up the remaining of optoSft
5552                // any suffix can not have larger shift amount
5553                // then its sub-suffix. Why???
5554                while (j > 0) {
5555                    optoSft[--j] = i;
5556                }
5557            }
5558            // Set the guard value because of unicode compression
5559            optoSft[patternLength-1] = 1;
5560            if (node instanceof SliceS)
5561                return new BnMS(src, lastOcc, optoSft, node.next);
5562            return new BnM(src, lastOcc, optoSft, node.next);
5563        }
5564        BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
5565            this.buffer = src;
5566            this.lastOcc = lastOcc;
5567            this.optoSft = optoSft;
5568            this.next = next;
5569        }
5570        boolean match(Matcher matcher, int i, CharSequence seq) {
5571            int[] src = buffer;
5572            int patternLength = src.length;
5573            int last = matcher.to - patternLength;
5574
5575            // Loop over all possible match positions in text
5576NEXT:       while (i <= last) {
5577                // Loop over pattern from right to left
5578                for (int j = patternLength - 1; j >= 0; j--) {
5579                    int ch = seq.charAt(i+j);
5580                    if (ch != src[j]) {
5581                        // Shift search to the right by the maximum of the
5582                        // bad character shift and the good suffix shift
5583                        i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
5584                        continue NEXT;
5585                    }
5586                }
5587                // Entire pattern matched starting at i
5588                matcher.first = i;
5589                boolean ret = next.match(matcher, i + patternLength, seq);
5590                if (ret) {
5591                    matcher.first = i;
5592                    matcher.groups[0] = matcher.first;
5593                    matcher.groups[1] = matcher.last;
5594                    return true;
5595                }
5596                i++;
5597            }
5598            // BnM is only used as the leading node in the unanchored case,
5599            // and it replaced its Start() which always searches to the end
5600            // if it doesn't find what it's looking for, so hitEnd is true.
5601            matcher.hitEnd = true;
5602            return false;
5603        }
5604        boolean study(TreeInfo info) {
5605            info.minLength += buffer.length;
5606            info.maxValid = false;
5607            return next.study(info);
5608        }
5609    }
5610
5611    /**
5612     * Supplementary support version of BnM(). Unpaired surrogates are
5613     * also handled by this class.
5614     */
5615    static final class BnMS extends BnM {
5616        int lengthInChars;
5617
5618        BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
5619            super(src, lastOcc, optoSft, next);
5620            for (int cp : buffer) {
5621                lengthInChars += Character.charCount(cp);
5622            }
5623        }
5624        boolean match(Matcher matcher, int i, CharSequence seq) {
5625            int[] src = buffer;
5626            int patternLength = src.length;
5627            int last = matcher.to - lengthInChars;
5628
5629            // Loop over all possible match positions in text
5630NEXT:       while (i <= last) {
5631                // Loop over pattern from right to left
5632                int ch;
5633                for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
5634                     j > 0; j -= Character.charCount(ch), x--) {
5635                    ch = Character.codePointBefore(seq, i+j);
5636                    if (ch != src[x]) {
5637                        // Shift search to the right by the maximum of the
5638                        // bad character shift and the good suffix shift
5639                        int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
5640                        i += countChars(seq, i, n);
5641                        continue NEXT;
5642                    }
5643                }
5644                // Entire pattern matched starting at i
5645                matcher.first = i;
5646                boolean ret = next.match(matcher, i + lengthInChars, seq);
5647                if (ret) {
5648                    matcher.first = i;
5649                    matcher.groups[0] = matcher.first;
5650                    matcher.groups[1] = matcher.last;
5651                    return true;
5652                }
5653                i += countChars(seq, i, 1);
5654            }
5655            matcher.hitEnd = true;
5656            return false;
5657        }
5658    }
5659
5660///////////////////////////////////////////////////////////////////////////////
5661///////////////////////////////////////////////////////////////////////////////
5662
5663    /**
5664     *  This must be the very first initializer.
5665     */
5666    static Node accept = new Node();
5667
5668    static Node lastAccept = new LastNode();
5669
5670    private static class CharPropertyNames {
5671
5672        static CharProperty charPropertyFor(String name) {
5673            CharPropertyFactory m = map.get(name);
5674            return m == null ? null : m.make();
5675        }
5676
5677        private abstract static class CharPropertyFactory {
5678            abstract CharProperty make();
5679        }
5680
5681        private static void defCategory(String name,
5682                                        final int typeMask) {
5683            map.put(name, new CharPropertyFactory() {
5684                    CharProperty make() { return new Category(typeMask);}});
5685        }
5686
5687        private static void defRange(String name,
5688                                     final int lower, final int upper) {
5689            map.put(name, new CharPropertyFactory() {
5690                    CharProperty make() { return rangeFor(lower, upper);}});
5691        }
5692
5693        private static void defCtype(String name,
5694                                     final int ctype) {
5695            map.put(name, new CharPropertyFactory() {
5696                    CharProperty make() { return new Ctype(ctype);}});
5697        }
5698
5699        private abstract static class CloneableProperty
5700            extends CharProperty implements Cloneable
5701        {
5702            public CloneableProperty clone() {
5703                try {
5704                    return (CloneableProperty) super.clone();
5705                } catch (CloneNotSupportedException e) {
5706                    throw new AssertionError(e);
5707                }
5708            }
5709        }
5710
5711        private static void defClone(String name,
5712                                     final CloneableProperty p) {
5713            map.put(name, new CharPropertyFactory() {
5714                    CharProperty make() { return p.clone();}});
5715        }
5716
5717        private static final HashMap<String, CharPropertyFactory> map
5718            = new HashMap<>();
5719
5720        static {
5721            // Unicode character property aliases, defined in
5722            // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
5723            defCategory("Cn", 1<<Character.UNASSIGNED);
5724            defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
5725            defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
5726            defCategory("Lt", 1<<Character.TITLECASE_LETTER);
5727            defCategory("Lm", 1<<Character.MODIFIER_LETTER);
5728            defCategory("Lo", 1<<Character.OTHER_LETTER);
5729            defCategory("Mn", 1<<Character.NON_SPACING_MARK);
5730            defCategory("Me", 1<<Character.ENCLOSING_MARK);
5731            defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
5732            defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
5733            defCategory("Nl", 1<<Character.LETTER_NUMBER);
5734            defCategory("No", 1<<Character.OTHER_NUMBER);
5735            defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
5736            defCategory("Zl", 1<<Character.LINE_SEPARATOR);
5737            defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
5738            defCategory("Cc", 1<<Character.CONTROL);
5739            defCategory("Cf", 1<<Character.FORMAT);
5740            defCategory("Co", 1<<Character.PRIVATE_USE);
5741            defCategory("Cs", 1<<Character.SURROGATE);
5742            defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
5743            defCategory("Ps", 1<<Character.START_PUNCTUATION);
5744            defCategory("Pe", 1<<Character.END_PUNCTUATION);
5745            defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
5746            defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
5747            defCategory("Sm", 1<<Character.MATH_SYMBOL);
5748            defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
5749            defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
5750            defCategory("So", 1<<Character.OTHER_SYMBOL);
5751            defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
5752            defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
5753            defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
5754                              (1<<Character.LOWERCASE_LETTER) |
5755                              (1<<Character.TITLECASE_LETTER) |
5756                              (1<<Character.MODIFIER_LETTER)  |
5757                              (1<<Character.OTHER_LETTER)));
5758            defCategory("M", ((1<<Character.NON_SPACING_MARK) |
5759                              (1<<Character.ENCLOSING_MARK)   |
5760                              (1<<Character.COMBINING_SPACING_MARK)));
5761            defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
5762                              (1<<Character.LETTER_NUMBER)        |
5763                              (1<<Character.OTHER_NUMBER)));
5764            defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
5765                              (1<<Character.LINE_SEPARATOR)  |
5766                              (1<<Character.PARAGRAPH_SEPARATOR)));
5767            defCategory("C", ((1<<Character.CONTROL)     |
5768                              (1<<Character.FORMAT)      |
5769                              (1<<Character.PRIVATE_USE) |
5770                              (1<<Character.SURROGATE))); // Other
5771            defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
5772                              (1<<Character.START_PUNCTUATION)     |
5773                              (1<<Character.END_PUNCTUATION)       |
5774                              (1<<Character.CONNECTOR_PUNCTUATION) |
5775                              (1<<Character.OTHER_PUNCTUATION)     |
5776                              (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
5777                              (1<<Character.FINAL_QUOTE_PUNCTUATION)));
5778            defCategory("S", ((1<<Character.MATH_SYMBOL)     |
5779                              (1<<Character.CURRENCY_SYMBOL) |
5780                              (1<<Character.MODIFIER_SYMBOL) |
5781                              (1<<Character.OTHER_SYMBOL)));
5782            defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
5783                               (1<<Character.LOWERCASE_LETTER) |
5784                               (1<<Character.TITLECASE_LETTER)));
5785            defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
5786                               (1<<Character.LOWERCASE_LETTER) |
5787                               (1<<Character.TITLECASE_LETTER) |
5788                               (1<<Character.MODIFIER_LETTER)  |
5789                               (1<<Character.OTHER_LETTER)     |
5790                               (1<<Character.DECIMAL_DIGIT_NUMBER)));
5791            defRange("L1", 0x00, 0xFF); // Latin-1
5792            map.put("all", new CharPropertyFactory() {
5793                    CharProperty make() { return new All(); }});
5794
5795            // Posix regular expression character classes, defined in
5796            // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
5797            defRange("ASCII", 0x00, 0x7F);   // ASCII
5798            defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters
5799            defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters
5800            defCtype("Blank", ASCII.BLANK);  // Space and tab characters
5801            defCtype("Cntrl", ASCII.CNTRL);  // Control characters
5802            defRange("Digit", '0', '9');     // Numeric characters
5803            defCtype("Graph", ASCII.GRAPH);  // printable and visible
5804            defRange("Lower", 'a', 'z');     // Lower-case alphabetic
5805            defRange("Print", 0x20, 0x7E);   // Printable characters
5806            defCtype("Punct", ASCII.PUNCT);  // Punctuation characters
5807            defCtype("Space", ASCII.SPACE);  // Space characters
5808            defRange("Upper", 'A', 'Z');     // Upper-case alphabetic
5809            defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
5810
5811            // Java character properties, defined by methods in Character.java
5812            defClone("javaLowerCase", new CloneableProperty() {
5813                boolean isSatisfiedBy(int ch) {
5814                    return Character.isLowerCase(ch);}});
5815            defClone("javaUpperCase", new CloneableProperty() {
5816                boolean isSatisfiedBy(int ch) {
5817                    return Character.isUpperCase(ch);}});
5818            defClone("javaAlphabetic", new CloneableProperty() {
5819                boolean isSatisfiedBy(int ch) {
5820                    return Character.isAlphabetic(ch);}});
5821            defClone("javaIdeographic", new CloneableProperty() {
5822                boolean isSatisfiedBy(int ch) {
5823                    return Character.isIdeographic(ch);}});
5824            defClone("javaTitleCase", new CloneableProperty() {
5825                boolean isSatisfiedBy(int ch) {
5826                    return Character.isTitleCase(ch);}});
5827            defClone("javaDigit", new CloneableProperty() {
5828                boolean isSatisfiedBy(int ch) {
5829                    return Character.isDigit(ch);}});
5830            defClone("javaDefined", new CloneableProperty() {
5831                boolean isSatisfiedBy(int ch) {
5832                    return Character.isDefined(ch);}});
5833            defClone("javaLetter", new CloneableProperty() {
5834                boolean isSatisfiedBy(int ch) {
5835                    return Character.isLetter(ch);}});
5836            defClone("javaLetterOrDigit", new CloneableProperty() {
5837                boolean isSatisfiedBy(int ch) {
5838                    return Character.isLetterOrDigit(ch);}});
5839            defClone("javaJavaIdentifierStart", new CloneableProperty() {
5840                boolean isSatisfiedBy(int ch) {
5841                    return Character.isJavaIdentifierStart(ch);}});
5842            defClone("javaJavaIdentifierPart", new CloneableProperty() {
5843                boolean isSatisfiedBy(int ch) {
5844                    return Character.isJavaIdentifierPart(ch);}});
5845            defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
5846                boolean isSatisfiedBy(int ch) {
5847                    return Character.isUnicodeIdentifierStart(ch);}});
5848            defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
5849                boolean isSatisfiedBy(int ch) {
5850                    return Character.isUnicodeIdentifierPart(ch);}});
5851            defClone("javaIdentifierIgnorable", new CloneableProperty() {
5852                boolean isSatisfiedBy(int ch) {
5853                    return Character.isIdentifierIgnorable(ch);}});
5854            defClone("javaSpaceChar", new CloneableProperty() {
5855                boolean isSatisfiedBy(int ch) {
5856                    return Character.isSpaceChar(ch);}});
5857            defClone("javaWhitespace", new CloneableProperty() {
5858                boolean isSatisfiedBy(int ch) {
5859                    return Character.isWhitespace(ch);}});
5860            defClone("javaISOControl", new CloneableProperty() {
5861                boolean isSatisfiedBy(int ch) {
5862                    return Character.isISOControl(ch);}});
5863            defClone("javaMirrored", new CloneableProperty() {
5864                boolean isSatisfiedBy(int ch) {
5865                    return Character.isMirrored(ch);}});
5866        }
5867    }
5868
5869    /**
5870     * Creates a predicate which can be used to match a string.
5871     *
5872     * @return  The predicate which can be used for matching on a string
5873     * @since   1.8
5874     */
5875    public Predicate<String> asPredicate() {
5876        return s -> matcher(s).find();
5877    }
5878
5879    /**
5880     * Creates a stream from the given input sequence around matches of this
5881     * pattern.
5882     *
5883     * <p> The stream returned by this method contains each substring of the
5884     * input sequence that is terminated by another subsequence that matches
5885     * this pattern or is terminated by the end of the input sequence.  The
5886     * substrings in the stream are in the order in which they occur in the
5887     * input. Trailing empty strings will be discarded and not encountered in
5888     * the stream.
5889     *
5890     * <p> If this pattern does not match any subsequence of the input then
5891     * the resulting stream has just one element, namely the input sequence in
5892     * string form.
5893     *
5894     * <p> When there is a positive-width match at the beginning of the input
5895     * sequence then an empty leading substring is included at the beginning
5896     * of the stream. A zero-width match at the beginning however never produces
5897     * such empty leading substring.
5898     *
5899     * <p> If the input sequence is mutable, it must remain constant during the
5900     * execution of the terminal stream operation.  Otherwise, the result of the
5901     * terminal stream operation is undefined.
5902     *
5903     * @param   input
5904     *          The character sequence to be split
5905     *
5906     * @return  The stream of strings computed by splitting the input
5907     *          around matches of this pattern
5908     * @see     #split(CharSequence)
5909     * @since   1.8
5910     */
5911    public Stream<String> splitAsStream(final CharSequence input) {
5912        class MatcherIterator implements Iterator<String> {
5913            private Matcher matcher;
5914            // The start position of the next sub-sequence of input
5915            // when current == input.length there are no more elements
5916            private int current;
5917            // null if the next element, if any, needs to obtained
5918            private String nextElement;
5919            // > 0 if there are N next empty elements
5920            private int emptyElementCount;
5921
5922            public String next() {
5923                if (!hasNext())
5924                    throw new NoSuchElementException();
5925
5926                if (emptyElementCount == 0) {
5927                    String n = nextElement;
5928                    nextElement = null;
5929                    return n;
5930                } else {
5931                    emptyElementCount--;
5932                    return "";
5933                }
5934            }
5935
5936            public boolean hasNext() {
5937                if (matcher == null) {
5938                    matcher = matcher(input);
5939                    // If the input is an empty string then the result can only be a
5940                    // stream of the input.  Induce that by setting the empty
5941                    // element count to 1
5942                    emptyElementCount = input.length() == 0 ? 1 : 0;
5943                }
5944                if (nextElement != null || emptyElementCount > 0)
5945                    return true;
5946
5947                if (current == input.length())
5948                    return false;
5949
5950                // Consume the next matching element
5951                // Count sequence of matching empty elements
5952                while (matcher.find()) {
5953                    nextElement = input.subSequence(current, matcher.start()).toString();
5954                    current = matcher.end();
5955                    if (!nextElement.isEmpty()) {
5956                        return true;
5957                    } else if (current > 0) { // no empty leading substring for zero-width
5958                                              // match at the beginning of the input
5959                        emptyElementCount++;
5960                    }
5961                }
5962
5963                // Consume last matching element
5964                nextElement = input.subSequence(current, input.length()).toString();
5965                current = input.length();
5966                if (!nextElement.isEmpty()) {
5967                    return true;
5968                } else {
5969                    // Ignore a terminal sequence of matching empty elements
5970                    emptyElementCount = 0;
5971                    nextElement = null;
5972                    return false;
5973                }
5974            }
5975        }
5976        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
5977                new MatcherIterator(), Spliterator.ORDERED | Spliterator.NONNULL), false);
5978    }
5979}
5980