Pattern.java revision 17639:46e9f2b0a472
1/* 2 * Copyright (c) 1999, 2017, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26package java.util.regex; 27 28import java.text.Normalizer; 29import java.text.Normalizer.Form; 30import java.util.Locale; 31import java.util.Iterator; 32import java.util.Map; 33import java.util.ArrayList; 34import java.util.HashMap; 35import java.util.LinkedHashSet; 36import java.util.List; 37import java.util.Set; 38import java.util.Arrays; 39import java.util.NoSuchElementException; 40import java.util.Spliterator; 41import java.util.Spliterators; 42import java.util.function.Predicate; 43import java.util.stream.Stream; 44import java.util.stream.StreamSupport; 45 46 47/** 48 * A compiled representation of a regular expression. 49 * 50 * <p> A regular expression, specified as a string, must first be compiled into 51 * an instance of this class. The resulting pattern can then be used to create 52 * a {@link Matcher} object that can match arbitrary {@linkplain 53 * java.lang.CharSequence character sequences} against the regular 54 * expression. All of the state involved in performing a match resides in the 55 * matcher, so many matchers can share the same pattern. 56 * 57 * <p> A typical invocation sequence is thus 58 * 59 * <blockquote><pre> 60 * Pattern p = Pattern.{@link #compile compile}("a*b"); 61 * Matcher m = p.{@link #matcher matcher}("aaaaab"); 62 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote> 63 * 64 * <p> A {@link #matches matches} method is defined by this class as a 65 * convenience for when a regular expression is used just once. This method 66 * compiles an expression and matches an input sequence against it in a single 67 * invocation. The statement 68 * 69 * <blockquote><pre> 70 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote> 71 * 72 * is equivalent to the three statements above, though for repeated matches it 73 * is less efficient since it does not allow the compiled pattern to be reused. 74 * 75 * <p> Instances of this class are immutable and are safe for use by multiple 76 * concurrent threads. Instances of the {@link Matcher} class are not safe for 77 * such use. 78 * 79 * 80 * <h3><a id="sum">Summary of regular-expression constructs</a></h3> 81 * 82 * <table class="borderless"> 83 * <caption style="display:none">Regular expression constructs, and what they match</caption> 84 * <thead style="text-align:left"> 85 * <tr> 86 * <th id="construct">Construct</th> 87 * <th id="matches">Matches</th> 88 * </tr> 89 * </thead> 90 * <tbody style="text-align:left"> 91 * 92 * <tr><th colspan="2" style="padding-top:20px" id="characters">Characters</th></tr> 93 * 94 * <tr><th style="vertical-align:top; font-weight: normal" id="x"><i>x</i></th> 95 * <td headers="matches characters x">The character <i>x</i></td></tr> 96 * <tr><th style="vertical-align:top; font-weight: normal" id="backslash">{@code \\}</th> 97 * <td headers="matches characters backslash">The backslash character</td></tr> 98 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_n">{@code \0}<i>n</i></th> 99 * <td headers="matches characters octal_n">The character with octal value {@code 0}<i>n</i> 100 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 101 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nn">{@code \0}<i>nn</i></th> 102 * <td headers="matches characters octal_nn">The character with octal value {@code 0}<i>nn</i> 103 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 104 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nnn">{@code \0}<i>mnn</i></th> 105 * <td headers="matches characters octal_nnn">The character with octal value {@code 0}<i>mnn</i> 106 * (0 {@code <=} <i>m</i> {@code <=} 3, 107 * 0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 108 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hh">{@code \x}<i>hh</i></th> 109 * <td headers="matches characters hex_hh">The character with hexadecimal value {@code 0x}<i>hh</i></td></tr> 110 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hhhh"><code>\u</code><i>hhhh</i></th> 111 * <td headers="matches characters hex_hhhh">The character with hexadecimal value {@code 0x}<i>hhhh</i></td></tr> 112 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>\x</code><i>{h...h}</i></th> 113 * <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i> 114 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} 115 * <= {@code 0x}<i>h...h</i> <= 116 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr> 117 * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>\N{</code><i>name</i><code>}</code></th> 118 * <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr> 119 * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th> 120 * <td headers="matches characters tab">The tab character (<code>'\u0009'</code>)</td></tr> 121 * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th> 122 * <td headers="matches characters newline">The newline (line feed) character (<code>'\u000A'</code>)</td></tr> 123 * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th> 124 * <td headers="matches characters return">The carriage-return character (<code>'\u000D'</code>)</td></tr> 125 * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th> 126 * <td headers="matches characters form_feed">The form-feed character (<code>'\u000C'</code>)</td></tr> 127 * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th> 128 * <td headers="matches characters bell">The alert (bell) character (<code>'\u0007'</code>)</td></tr> 129 * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th> 130 * <td headers="matches characters escape">The escape character (<code>'\u001B'</code>)</td></tr> 131 * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th> 132 * <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i></td></tr> 133 * 134 * <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr> 135 * 136 * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th> 137 * <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr> 138 * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th> 139 * <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr> 140 * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th> 141 * <td headers="matches classes range">{@code a} through {@code z} 142 * or {@code A} through {@code Z}, inclusive (range)</td></tr> 143 * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th> 144 * <td headers="matches classes union">{@code a} through {@code d}, 145 * or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr> 146 * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th> 147 * <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</tr> 148 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th> 149 * <td headers="matches classes subtraction1">{@code a} through {@code z}, 150 * except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr> 151 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th> 152 * <td headers="matches classes subtraction2">{@code a} through {@code z}, 153 * and not {@code m} through {@code p}: {@code [a-lq-z]}(subtraction)</td></tr> 154 * 155 * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr> 156 * 157 * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th> 158 * <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr> 159 * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th> 160 * <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr> 161 * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th> 162 * <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr> 163 * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th> 164 * <td headers="matches predef horiz_white">A horizontal whitespace character: 165 * <code>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</code></td></tr> 166 * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th> 167 * <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr> 168 * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th> 169 * <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr> 170 * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th> 171 * <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr> 172 * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th> 173 * <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85\u2028\u2029]</code> 174 * </td></tr> 175 * <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th> 176 * <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr> 177 * <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th> 178 * <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]}</td></tr> 179 * <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th> 180 * <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr> 181 * 182 * <tr><th colspan="2" style="padding-top:20px" id="posix"><b>POSIX character classes (US-ASCII only)</b></th></tr> 183 * 184 * <tr><th style="vertical-align:top; font-weight:normal" id="Lower">{@code \p{Lower}}</th> 185 * <td headers="matches posix Lower">A lower-case alphabetic character: {@code [a-z]}</td></tr> 186 * <tr><th style="vertical-align:top; font-weight:normal" id="Upper">{@code \p{Upper}}</th> 187 * <td headers="matches posix Upper">An upper-case alphabetic character:{@code [A-Z]}</td></tr> 188 * <tr><th style="vertical-align:top; font-weight:normal" id="ASCII">{@code \p{ASCII}}</th> 189 * <td headers="matches posix ASCII">All ASCII:{@code [\x00-\x7F]}</td></tr> 190 * <tr><th style="vertical-align:top; font-weight:normal" id="Alpha">{@code \p{Alpha}}</th> 191 * <td headers="matches posix Alpha">An alphabetic character:{@code [\p{Lower}\p{Upper}]}</td></tr> 192 * <tr><th style="vertical-align:top; font-weight:normal" id="Digit">{@code \p{Digit}}</th> 193 * <td headers="matches posix Digit">A decimal digit: {@code [0-9]}</td></tr> 194 * <tr><th style="vertical-align:top; font-weight:normal" id="Alnum">{@code \p{Alnum}}</th> 195 * <td headers="matches posix Alnum">An alphanumeric character:{@code [\p{Alpha}\p{Digit}]}</td></tr> 196 * <tr><th style="vertical-align:top; font-weight:normal" id="Punct">{@code \p{Punct}}</th> 197 * <td headers="matches posix Punct">Punctuation: One of {@code !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~}</td></tr> 198 * <!-- {@code [\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]} 199 * {@code [\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]} --> 200 * <tr><th style="vertical-align:top; font-weight:normal" id="Graph">{@code \p{Graph}}</th> 201 * <td headers="matches posix Graph">A visible character: {@code [\p{Alnum}\p{Punct}]}</td></tr> 202 * <tr><th style="vertical-align:top; font-weight:normal" id="Print">{@code \p{Print}}</th> 203 * <td headers="matches posix Print">A printable character: {@code [\p{Graph}\x20]}</td></tr> 204 * <tr><th style="vertical-align:top; font-weight:normal" id="Blank">{@code \p{Blank}}</th> 205 * <td headers="matches posix Blank">A space or a tab: {@code [ \t]}</td></tr> 206 * <tr><th style="vertical-align:top; font-weight:normal" id="Cntrl">{@code \p{Cntrl}}</th> 207 * <td headers="matches posix Cntrl">A control character: {@code [\x00-\x1F\x7F]}</td></tr> 208 * <tr><th style="vertical-align:top; font-weight:normal" id="XDigit">{@code \p{XDigit}}</th> 209 * <td headers="matches posix XDigit">A hexadecimal digit: {@code [0-9a-fA-F]}</td></tr> 210 * <tr><th style="vertical-align:top; font-weight:normal" id="Space">{@code \p{Space}}</th> 211 * <td headers="matches posix Space">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr> 212 * 213 * <tr><th colspan="2" style="padding-top:20px" id="java">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr> 214 * 215 * <tr><th style="vertical-align:top; font-weight:normal" id="javaLowerCase">{@code \p{javaLowerCase}}</th> 216 * <td headers="matches java javaLowerCase">Equivalent to java.lang.Character.isLowerCase()</td></tr> 217 * <tr><th style="vertical-align:top; font-weight:normal" id="javaUpperCase">{@code \p{javaUpperCase}}</th> 218 * <td headers="matches java javaUpperCase">Equivalent to java.lang.Character.isUpperCase()</td></tr> 219 * <tr><th style="vertical-align:top; font-weight:normal" id="javaWhitespace">{@code \p{javaWhitespace}}</th> 220 * <td headers="matches java javaWhitespace">Equivalent to java.lang.Character.isWhitespace()</td></tr> 221 * <tr><th style="vertical-align:top; font-weight:normal" id="javaMirrored">{@code \p{javaMirrored}}</th> 222 * <td headers="matches java javaMirrored">Equivalent to java.lang.Character.isMirrored()</td></tr> 223 * 224 * <tr><th colspan="2" style="padding-top:20px" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr> 225 * 226 * <tr><th style="vertical-align:top; font-weight:normal" id="IsLatin">{@code \p{IsLatin}}</th> 227 * <td headers="matches unicode IsLatin">A Latin script character (<a href="#usc">script</a>)</td></tr> 228 * <tr><th style="vertical-align:top; font-weight:normal" id="InGreek">{@code \p{InGreek}}</th> 229 * <td headers="matches unicode InGreek">A character in the Greek block (<a href="#ubc">block</a>)</td></tr> 230 * <tr><th style="vertical-align:top; font-weight:normal" id="Lu">{@code \p{Lu}}</th> 231 * <td headers="matches unicode Lu">An uppercase letter (<a href="#ucc">category</a>)</td></tr> 232 * <tr><th style="vertical-align:top; font-weight:normal" id="IsAlphabetic">{@code \p{IsAlphabetic}}</th> 233 * <td headers="matches unicode IsAlphabetic">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr> 234 * <tr><th style="vertical-align:top; font-weight:normal" id="Sc">{@code \p{Sc}}</th> 235 * <td headers="matches unicode Sc">A currency symbol</td></tr> 236 * <tr><th style="vertical-align:top; font-weight:normal" id="not_InGreek">{@code \P{InGreek}}</th> 237 * <td headers="matches unicode not_InGreek">Any character except one in the Greek block (negation)</td></tr> 238 * <tr><th style="vertical-align:top; font-weight:normal" id="not_uppercase">{@code [\p{L}&&[^\p{Lu}]]}</th> 239 * <td headers="matches unicode not_uppercase">Any letter except an uppercase letter (subtraction)</td></tr> 240 * 241 * <tr><th colspan="2" style="padding-top:20px" id="bounds">Boundary matchers</th></tr> 242 * 243 * <tr><th style="vertical-align:top; font-weight:normal" id="begin_line">{@code ^}</th> 244 * <td headers="matches bounds begin_line">The beginning of a line</td></tr> 245 * <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th> 246 * <td headers="matches bounds end_line">The end of a line</td></tr> 247 * <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th> 248 * <td headers="matches bounds word_boundary">A word boundary</td></tr> 249 * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_cluster_boundary">{@code \b{g}}</th> 250 * <td headers="matches bounds grapheme_cluster_boundary">A Unicode extended grapheme cluster boundary</td></tr> 251 * <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th> 252 * <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr> 253 * <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th> 254 * <td headers="matches bounds begin_input">The beginning of the input</td></tr> 255 * <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th> 256 * <td headers="matches bounds end_prev_match">The end of the previous match</td></tr> 257 * <tr><th style="vertical-align:top; font-weight:normal" id="end_input_except_term">{@code \Z}</th> 258 * <td headers="matches bounds end_input_except_term">The end of the input but for the final 259 * <a href="#lt">terminator</a>, if any</td></tr> 260 * <tr><th style="vertical-align:top; font-weight:normal" id="end_input">{@code \z}</th> 261 * <td headers="matches bounds end_input">The end of the input</td></tr> 262 * 263 * <tr><th colspan="2" style="padding-top:20px" id="linebreak">Linebreak matcher</th></tr> 264 * 265 * <tr><th style="vertical-align:top; font-weight:normal" id="any_unicode_linebreak">{@code \R}</th> 266 * <td headers="matches linebreak any_unicode_linebreak">Any Unicode linebreak sequence, is equivalent to 267 * <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029] 268 * </code></td></tr> 269 * 270 * <tr><th colspan="2" style="padding-top:20px" id="grapheme">Unicode Extended Grapheme matcher</th></tr> 271 * 272 * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_any">{@code \X}</th> 273 * <td headers="matches grapheme grapheme_any">Any Unicode extended grapheme cluster</td></tr> 274 * 275 * <tr><th colspan="2" style="padding-top:20px" id="greedy">Greedy quantifiers</th></tr> 276 * 277 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_once_or_not"><i>X</i>{@code ?}</th> 278 * <td headers="matches greedy greedy_once_or_not"><i>X</i>, once or not at all</td></tr> 279 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_zero_or_more"><i>X</i>{@code *}</th> 280 * <td headers="matches greedy greedy_zero_or_more"><i>X</i>, zero or more times</td></tr> 281 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_one_or_more"><i>X</i>{@code +}</th> 282 * <td headers="matches greedy greedy_one_or_more"><i>X</i>, one or more times</td></tr> 283 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_exactly"><i>X</i><code>{</code><i>n</i><code>}</code></th> 284 * <td headers="matches greedy greedy_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 285 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least"><i>X</i><code>{</code><i>n</i>{@code ,}}</th> 286 * <td headers="matches greedy greedy_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 287 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></th> 288 * <td headers="matches greedy greedy_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 289 * 290 * <tr><th colspan="2" style="padding-top:20px" id="reluc">Reluctant quantifiers</th></tr> 291 * 292 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_once_or_not"><i>X</i>{@code ??}</th> 293 * <td headers="matches reluc reluc_once_or_not"><i>X</i>, once or not at all</td></tr> 294 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_zero_or_more"><i>X</i>{@code *?}</th> 295 * <td headers="matches reluc reluc_zero_or_more"><i>X</i>, zero or more times</td></tr> 296 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_one_or_more"><i>X</i>{@code +?}</th> 297 * <td headers="matches reluc reluc_one_or_more"><i>X</i>, one or more times</td></tr> 298 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_exactly"><i>X</i><code>{</code><i>n</i><code>}?</code></th> 299 * <td headers="matches reluc reluc_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 300 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least"><i>X</i><code>{</code><i>n</i><code>,}?</code></th> 301 * <td headers="matches reluc reluc_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 302 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}?</code></th> 303 * <td headers="matches reluc reluc_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 304 * 305 * <tr><th colspan="2" style="padding-top:20px" id="poss">Possessive quantifiers</th></tr> 306 * 307 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_once_or_not"><i>X</i>{@code ?+}</th> 308 * <td headers="matches poss poss_once_or_not"><i>X</i>, once or not at all</td></tr> 309 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_zero_or_more"><i>X</i>{@code *+}</th> 310 * <td headers="matches poss poss_zero_or_more"><i>X</i>, zero or more times</td></tr> 311 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_one_or_more"><i>X</i>{@code ++}</th> 312 * <td headers="matches poss poss_one_or_more"><i>X</i>, one or more times</td></tr> 313 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_exactly"><i>X</i><code>{</code><i>n</i><code>}+</code></th> 314 * <td headers="matches poss poss_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 315 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least"><i>X</i><code>{</code><i>n</i><code>,}+</code></th> 316 * <td headers="matches poss poss_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 317 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}+</code></th> 318 * <td headers="matches poss poss_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 319 * 320 * <tr><th colspan="2" style="padding-top:20px" id="logical">Logical operators</th></tr> 321 * 322 * <tr><th style="vertical-align:top; font-weight:normal" id="concat"><i>XY</i></th> 323 * <td headers="matches logical concat"><i>X</i> followed by <i>Y</i></td></tr> 324 * <tr><th style="vertical-align:top; font-weight:normal" id="alternate"><i>X</i>{@code |}<i>Y</i></th> 325 * <td headers="matches logical alternate">Either <i>X</i> or <i>Y</i></td></tr> 326 * <tr><th style="vertical-align:top; font-weight:normal" id="group">{@code (}<i>X</i>{@code )}</th> 327 * <td headers="matches logical group">X, as a <a href="#cg">capturing group</a></td></tr> 328 * 329 * <tr><th colspan="2" style="padding-top:20px" id="backref">Back references</th></tr> 330 * 331 * <tr><th style="vertical-align:top; font-weight:normal" id="back_nth">{@code \}<i>n</i></th> 332 * <td headers="matches backref back_nth">Whatever the <i>n</i><sup>th</sup> 333 * <a href="#cg">capturing group</a> matched</td></tr> 334 * <tr><th style="vertical-align:top; font-weight:normal" id="back_named">{@code \}<i>k</i><<i>name</i>></th> 335 * <td headers="matches backref back_named">Whatever the 336 * <a href="#groupname">named-capturing group</a> "name" matched</td></tr> 337 * 338 * <tr><th colspan="2" style="padding-top:20px" id="quote">Quotation</th></tr> 339 * 340 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_follow">{@code \}</th> 341 * <td headers="matches quote quote_follow">Nothing, but quotes the following character</td></tr> 342 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_begin">{@code \Q}</th> 343 * <td headers="matches quote quote_begin">Nothing, but quotes all characters until {@code \E}</td></tr> 344 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_end">{@code \E}</th> 345 * <td headers="matches quote quote_end">Nothing, but ends quoting started by {@code \Q}</td></tr> 346 * <!-- Metachars: !$()*+.<>?[\]^{|} --> 347 * 348 * <tr><th colspan="2" style="padding-top:20px" id="special">Special constructs (named-capturing and non-capturing)</th></tr> 349 * 350 * <tr><th style="vertical-align:top; font-weight:normal" id="named_group"><code>(?<<a href="#groupname">name</a>></code><i>X</i>{@code )}</th> 351 * <td headers="matches special named_group"><i>X</i>, as a named-capturing group</td></tr> 352 * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group">{@code (?:}<i>X</i>{@code )}</th> 353 * <td headers="matches special non_capture_group"><i>X</i>, as a non-capturing group</td></tr> 354 * <tr><th style="vertical-align:top; font-weight:normal" id="flags"><code>(?idmsuxU-idmsuxU) </code></th> 355 * <td headers="matches special flags">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a> 356 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> 357 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a> 358 * on - off</td></tr> 359 * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group_flags"><code>(?idmsux-idmsux:</code><i>X</i>{@code )} </th> 360 * <td headers="matches special non_capture_group_flags"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the 361 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a> 362 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a > 363 * <a href="#COMMENTS">x</a> on - off</td></tr> 364 * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookahead">{@code (?=}<i>X</i>{@code )}</th> 365 * <td headers="matches special pos_lookahead"><i>X</i>, via zero-width positive lookahead</td></tr> 366 * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookahead">{@code (?!}<i>X</i>{@code )}</th> 367 * <td headers="matches special neg_lookahead"><i>X</i>, via zero-width negative lookahead</td></tr> 368 * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookbehind">{@code (?<=}<i>X</i>{@code )}</th> 369 * <td headers="matches special pos_lookbehind"><i>X</i>, via zero-width positive lookbehind</td></tr> 370 * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookbehind">{@code (?<!}<i>X</i>{@code )}</th> 371 * <td headers="matches special neg_lookbehind"><i>X</i>, via zero-width negative lookbehind</td></tr> 372 * <tr><th style="vertical-align:top; font-weight:normal" id="indep_non_capture_group">{@code (?>}<i>X</i>{@code )}</th> 373 * <td headers="matches special indep_non_capture_group"><i>X</i>, as an independent, non-capturing group</td></tr> 374 * 375 * </tbody> 376 * </table> 377 * 378 * <hr> 379 * 380 * 381 * <h3><a id="bs">Backslashes, escapes, and quoting</a></h3> 382 * 383 * <p> The backslash character ({@code '\'}) serves to introduce escaped 384 * constructs, as defined in the table above, as well as to quote characters 385 * that otherwise would be interpreted as unescaped constructs. Thus the 386 * expression {@code \\} matches a single backslash and <code>\{</code> matches a 387 * left brace. 388 * 389 * <p> It is an error to use a backslash prior to any alphabetic character that 390 * does not denote an escaped construct; these are reserved for future 391 * extensions to the regular-expression language. A backslash may be used 392 * prior to a non-alphabetic character regardless of whether that character is 393 * part of an unescaped construct. 394 * 395 * <p> Backslashes within string literals in Java source code are interpreted 396 * as required by 397 * <cite>The Java™ Language Specification</cite> 398 * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6) 399 * It is therefore necessary to double backslashes in string 400 * literals that represent regular expressions to protect them from 401 * interpretation by the Java bytecode compiler. The string literal 402 * <code>"\b"</code>, for example, matches a single backspace character when 403 * interpreted as a regular expression, while {@code "\\b"} matches a 404 * word boundary. The string literal {@code "\(hello\)"} is illegal 405 * and leads to a compile-time error; in order to match the string 406 * {@code (hello)} the string literal {@code "\\(hello\\)"} 407 * must be used. 408 * 409 * <h3><a id="cc">Character Classes</a></h3> 410 * 411 * <p> Character classes may appear within other character classes, and 412 * may be composed by the union operator (implicit) and the intersection 413 * operator ({@code &&}). 414 * The union operator denotes a class that contains every character that is 415 * in at least one of its operand classes. The intersection operator 416 * denotes a class that contains every character that is in both of its 417 * operand classes. 418 * 419 * <p> The precedence of character-class operators is as follows, from 420 * highest to lowest: 421 * 422 * <table class="striped" style="margin-left: 2em;"> 423 * <caption style="display:none">Precedence of character class operators.</caption> 424 * <thead> 425 * <tr><th scope="col">Precedence<th scope="col">Name<th scope="col">Example 426 * </thead> 427 * <tbody> 428 * <tr><th scope="row">1</th> 429 * <td>Literal escape </td> 430 * <td>{@code \x}</td></tr> 431 * <tr><th scope="row">2</th> 432 * <td>Grouping</td> 433 * <td>{@code [...]}</td></tr> 434 * <tr><th scope="row">3</th> 435 * <td>Range</td> 436 * <td>{@code a-z}</td></tr> 437 * <tr><th scope="row">4</th> 438 * <td>Union</td> 439 * <td>{@code [a-e][i-u]}</td></tr> 440 * <tr><th scope="row">5</th> 441 * <td>Intersection</td> 442 * <td>{@code [a-z&&[aeiou]]}</td></tr> 443 * </tbody> 444 * </table> 445 * 446 * <p> Note that a different set of metacharacters are in effect inside 447 * a character class than outside a character class. For instance, the 448 * regular expression {@code .} loses its special meaning inside a 449 * character class, while the expression {@code -} becomes a range 450 * forming metacharacter. 451 * 452 * <h3><a id="lt">Line terminators</a></h3> 453 * 454 * <p> A <i>line terminator</i> is a one- or two-character sequence that marks 455 * the end of a line of the input character sequence. The following are 456 * recognized as line terminators: 457 * 458 * <ul> 459 * 460 * <li> A newline (line feed) character ({@code '\n'}), 461 * 462 * <li> A carriage-return character followed immediately by a newline 463 * character ({@code "\r\n"}), 464 * 465 * <li> A standalone carriage-return character ({@code '\r'}), 466 * 467 * <li> A next-line character (<code>'\u0085'</code>), 468 * 469 * <li> A line-separator character (<code>'\u2028'</code>), or 470 * 471 * <li> A paragraph-separator character (<code>'\u2029'</code>). 472 * 473 * </ul> 474 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators 475 * recognized are newline characters. 476 * 477 * <p> The regular expression {@code .} matches any character except a line 478 * terminator unless the {@link #DOTALL} flag is specified. 479 * 480 * <p> By default, the regular expressions {@code ^} and {@code $} ignore 481 * line terminators and only match at the beginning and the end, respectively, 482 * of the entire input sequence. If {@link #MULTILINE} mode is activated then 483 * {@code ^} matches at the beginning of input and after any line terminator 484 * except at the end of input. When in {@link #MULTILINE} mode {@code $} 485 * matches just before a line terminator or the end of the input sequence. 486 * 487 * <h3><a id="cg">Groups and capturing</a></h3> 488 * 489 * <h4><a id="gnumber">Group number</a></h4> 490 * <p> Capturing groups are numbered by counting their opening parentheses from 491 * left to right. In the expression {@code ((A)(B(C)))}, for example, there 492 * are four such groups: </p> 493 * 494 * <ol style="margin-left:2em;"> 495 * <li> {@code ((A)(B(C)))} 496 * <li> {@code (A)} 497 * <li> {@code (B(C))} 498 * <li> {@code (C)} 499 * </ol> 500 * 501 * <p> Group zero always stands for the entire expression. 502 * 503 * <p> Capturing groups are so named because, during a match, each subsequence 504 * of the input sequence that matches such a group is saved. The captured 505 * subsequence may be used later in the expression, via a back reference, and 506 * may also be retrieved from the matcher once the match operation is complete. 507 * 508 * <h4><a id="groupname">Group name</a></h4> 509 * <p>A capturing group can also be assigned a "name", a {@code named-capturing group}, 510 * and then be back-referenced later by the "name". Group names are composed of 511 * the following characters. The first character must be a {@code letter}. 512 * 513 * <ul> 514 * <li> The uppercase letters {@code 'A'} through {@code 'Z'} 515 * (<code>'\u0041'</code> through <code>'\u005a'</code>), 516 * <li> The lowercase letters {@code 'a'} through {@code 'z'} 517 * (<code>'\u0061'</code> through <code>'\u007a'</code>), 518 * <li> The digits {@code '0'} through {@code '9'} 519 * (<code>'\u0030'</code> through <code>'\u0039'</code>), 520 * </ul> 521 * 522 * <p> A {@code named-capturing group} is still numbered as described in 523 * <a href="#gnumber">Group number</a>. 524 * 525 * <p> The captured input associated with a group is always the subsequence 526 * that the group most recently matched. If a group is evaluated a second time 527 * because of quantification then its previously-captured value, if any, will 528 * be retained if the second evaluation fails. Matching the string 529 * {@code "aba"} against the expression {@code (a(b)?)+}, for example, leaves 530 * group two set to {@code "b"}. All captured input is discarded at the 531 * beginning of each match. 532 * 533 * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups 534 * that do not capture text and do not count towards the group total, or 535 * <i>named-capturing</i> group. 536 * 537 * <h3> Unicode support </h3> 538 * 539 * <p> This class is in conformance with Level 1 of <a 540 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 541 * Standard #18: Unicode Regular Expression</i></a>, plus RL2.1 542 * Canonical Equivalents. 543 * <p> 544 * <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code 545 * are processed as described in section 3.3 of 546 * <cite>The Java™ Language Specification</cite>. 547 * Such escape sequences are also implemented directly by the regular-expression 548 * parser so that Unicode escapes can be used in expressions that are read from 549 * files or from the keyboard. Thus the strings <code>"\u2014"</code> and 550 * {@code "\\u2014"}, while not equal, compile into the same pattern, which 551 * matches the character with hexadecimal value {@code 0x2014}. 552 * <p> 553 * A Unicode character can also be represented by using its <b>Hex notation</b> 554 * (hexadecimal code point value) directly as described in construct 555 * <code>\x{...}</code>, for example a supplementary character U+2011F can be 556 * specified as <code>\x{2011F}</code>, instead of two consecutive Unicode escape 557 * sequences of the surrogate pair <code>\uD840</code><code>\uDD1F</code>. 558 * <p> 559 * <b>Unicode character names</b> are supported by the named character construct 560 * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code> 561 * specifies character <code>\u263A</code>. The character names supported 562 * by this class are the valid Unicode character names matched by 563 * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}. 564 * <p> 565 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> 566 * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme 567 * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}. 568 * <p> 569 * Unicode scripts, blocks, categories and binary properties are written with 570 * the {@code \p} and {@code \P} constructs as in Perl. 571 * <code>\p{</code><i>prop</i><code>}</code> matches if 572 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code> 573 * does not match if the input has that property. 574 * <p> 575 * Scripts, blocks, categories and binary properties can be used both inside 576 * and outside of a character class. 577 * 578 * <p> 579 * <b><a id="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in 580 * {@code IsHiragana}, or by using the {@code script} keyword (or its short 581 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}. 582 * <p> 583 * The script names supported by {@code Pattern} are the valid script names 584 * accepted and defined by 585 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. 586 * 587 * <p> 588 * <b><a id="ubc">Blocks</a></b> are specified with the prefix {@code In}, as in 589 * {@code InMongolian}, or by using the keyword {@code block} (or its short 590 * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. 591 * <p> 592 * The block names supported by {@code Pattern} are the valid block names 593 * accepted and defined by 594 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. 595 * <p> 596 * 597 * <b><a id="ucc">Categories</a></b> may be specified with the optional prefix {@code Is}: 598 * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode 599 * letters. Same as scripts and blocks, categories can also be specified 600 * by using the keyword {@code general_category} (or its short form 601 * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. 602 * <p> 603 * The supported categories are those of 604 * <a href="http://www.unicode.org/unicode/standard/standard.html"> 605 * <i>The Unicode Standard</i></a> in the version specified by the 606 * {@link java.lang.Character Character} class. The category names are those 607 * defined in the Standard, both normative and informative. 608 * <p> 609 * 610 * <b><a id="ubpc">Binary properties</a></b> are specified with the prefix {@code Is}, as in 611 * {@code IsAlphabetic}. The supported binary properties by {@code Pattern} 612 * are 613 * <ul> 614 * <li> Alphabetic 615 * <li> Ideographic 616 * <li> Letter 617 * <li> Lowercase 618 * <li> Uppercase 619 * <li> Titlecase 620 * <li> Punctuation 621 * <Li> Control 622 * <li> White_Space 623 * <li> Digit 624 * <li> Hex_Digit 625 * <li> Join_Control 626 * <li> Noncharacter_Code_Point 627 * <li> Assigned 628 * </ul> 629 * <p> 630 * The following <b>Predefined Character classes</b> and <b>POSIX character classes</b> 631 * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i> 632 * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression 633 * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified. 634 * 635 * <table class="striped"> 636 * <caption style="display:none">predefined and posix character classes in Unicode mode</caption> 637 * <thead> 638 * <tr> 639 * <th scope="col" id="predef_classes">Classes</th> 640 * <th scope="col" id="predef_matches">Matches</th> 641 * </tr> 642 * </thead> 643 * <tbody> 644 * <tr><th scope="row">{@code \p{Lower}}</th> 645 * <td>A lowercase character:{@code \p{IsLowercase}}</td></tr> 646 * <tr><th scope="row">{@code \p{Upper}}</th> 647 * <td>An uppercase character:{@code \p{IsUppercase}}</td></tr> 648 * <tr><th scope="row">{@code \p{ASCII}}</th> 649 * <td>All ASCII:{@code [\x00-\x7F]}</td></tr> 650 * <tr><th scope="row">{@code \p{Alpha}}</th> 651 * <td>An alphabetic character:{@code \p{IsAlphabetic}}</td></tr> 652 * <tr><th scope="row">{@code \p{Digit}}</th> 653 * <td>A decimal digit character:{@code \p{IsDigit}}</td></tr> 654 * <tr><th scope="row">{@code \p{Alnum}}</th> 655 * <td>An alphanumeric character:{@code [\p{IsAlphabetic}\p{IsDigit}]}</td></tr> 656 * <tr><th scope="row">{@code \p{Punct}}</th> 657 * <td>A punctuation character:{@code \p{IsPunctuation}}</td></tr> 658 * <tr><th scope="row">{@code \p{Graph}}</th> 659 * <td>A visible character: {@code [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]}</td></tr> 660 * <tr><th scope="row">{@code \p{Print}}</th> 661 * <td>A printable character: {@code [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]}</td></tr> 662 * <tr><th scope="row">{@code \p{Blank}}</th> 663 * <td>A space or a tab: {@code [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]}</td></tr> 664 * <tr><th scope="row">{@code \p{Cntrl}}</th> 665 * <td>A control character: {@code \p{gc=Cc}}</td></tr> 666 * <tr><th scope="row">{@code \p{XDigit}}</th> 667 * <td>A hexadecimal digit: {@code [\p{gc=Nd}\p{IsHex_Digit}]}</td></tr> 668 * <tr><th scope="row">{@code \p{Space}}</th> 669 * <td>A whitespace character:{@code \p{IsWhite_Space}}</td></tr> 670 * <tr><th scope="row">{@code \d}</th> 671 * <td>A digit: {@code \p{IsDigit}}</td></tr> 672 * <tr><th scope="row">{@code \D}</th> 673 * <td>A non-digit: {@code [^\d]}</td></tr> 674 * <tr><th scope="row">{@code \s}</th> 675 * <td>A whitespace character: {@code \p{IsWhite_Space}}</td></tr> 676 * <tr><th scope="row">{@code \S}</th> 677 * <td>A non-whitespace character: {@code [^\s]}</td></tr> 678 * <tr><th scope="row">{@code \w}</th> 679 * <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr> 680 * <tr><th scope="row">{@code \W}</th> 681 * <td>A non-word character: {@code [^\w]}</td></tr> 682 * </tbody> 683 * </table> 684 * <p> 685 * <a id="jcc"> 686 * Categories that behave like the java.lang.Character 687 * boolean is<i>methodname</i> methods (except for the deprecated ones) are 688 * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where 689 * the specified property has the name <code>java<i>methodname</i></code></a>. 690 * 691 * <h3> Comparison to Perl 5 </h3> 692 * 693 * <p>The {@code Pattern} engine performs traditional NFA-based matching 694 * with ordered alternation as occurs in Perl 5. 695 * 696 * <p> Perl constructs not supported by this class: </p> 697 * 698 * <ul> 699 * <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for 700 * the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and 701 * <code>\g{</code><i>name</i><code>}</code> for 702 * <a href="#groupname">named-capturing group</a>. 703 * </p></li> 704 * 705 * <li><p> The conditional constructs 706 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and 707 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )}, 708 * </p></li> 709 * 710 * <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code> 711 * and <code>(??{</code><i>code</i><code>})</code>,</p></li> 712 * 713 * <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li> 714 * 715 * <li><p> The preprocessing operations {@code \l} <code>\u</code>, 716 * {@code \L}, and {@code \U}. </p></li> 717 * 718 * </ul> 719 * 720 * <p> Constructs supported by this class but not by Perl: </p> 721 * 722 * <ul> 723 * 724 * <li><p> Character-class union and intersection as described 725 * <a href="#cc">above</a>.</p></li> 726 * 727 * </ul> 728 * 729 * <p> Notable differences from Perl: </p> 730 * 731 * <ul> 732 * 733 * <li><p> In Perl, {@code \1} through {@code \9} are always interpreted 734 * as back references; a backslash-escaped number greater than {@code 9} is 735 * treated as a back reference if at least that many subexpressions exist, 736 * otherwise it is interpreted, if possible, as an octal escape. In this 737 * class octal escapes must always begin with a zero. In this class, 738 * {@code \1} through {@code \9} are always interpreted as back 739 * references, and a larger number is accepted as a back reference if at 740 * least that many subexpressions exist at that point in the regular 741 * expression, otherwise the parser will drop digits until the number is 742 * smaller or equal to the existing number of groups or it is one digit. 743 * </p></li> 744 * 745 * <li><p> Perl uses the {@code g} flag to request a match that resumes 746 * where the last match left off. This functionality is provided implicitly 747 * by the {@link Matcher} class: Repeated invocations of the {@link 748 * Matcher#find find} method will resume where the last match left off, 749 * unless the matcher is reset. </p></li> 750 * 751 * <li><p> In Perl, embedded flags at the top level of an expression affect 752 * the whole expression. In this class, embedded flags always take effect 753 * at the point at which they appear, whether they are at the top level or 754 * within a group; in the latter case, flags are restored at the end of the 755 * group just as in Perl. </p></li> 756 * 757 * </ul> 758 * 759 * 760 * <p> For a more precise description of the behavior of regular expression 761 * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/"> 762 * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl, 763 * O'Reilly and Associates, 2006.</a> 764 * </p> 765 * 766 * @see java.lang.String#split(String, int) 767 * @see java.lang.String#split(String) 768 * 769 * @author Mike McCloskey 770 * @author Mark Reinhold 771 * @author JSR-51 Expert Group 772 * @since 1.4 773 * @spec JSR-51 774 */ 775 776public final class Pattern 777 implements java.io.Serializable 778{ 779 780 /** 781 * Regular expression modifier values. Instead of being passed as 782 * arguments, they can also be passed as inline modifiers. 783 * For example, the following statements have the same effect. 784 * <pre> 785 * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M); 786 * RegExp r2 = RegExp.compile("(?im)abc", 0); 787 * </pre> 788 * 789 * The flags are duplicated so that the familiar Perl match flag 790 * names are available. 791 */ 792 793 /** 794 * Enables Unix lines mode. 795 * 796 * <p> In this mode, only the {@code '\n'} line terminator is recognized 797 * in the behavior of {@code .}, {@code ^}, and {@code $}. 798 * 799 * <p> Unix lines mode can also be enabled via the embedded flag 800 * expression {@code (?d)}. 801 */ 802 public static final int UNIX_LINES = 0x01; 803 804 /** 805 * Enables case-insensitive matching. 806 * 807 * <p> By default, case-insensitive matching assumes that only characters 808 * in the US-ASCII charset are being matched. Unicode-aware 809 * case-insensitive matching can be enabled by specifying the {@link 810 * #UNICODE_CASE} flag in conjunction with this flag. 811 * 812 * <p> Case-insensitive matching can also be enabled via the embedded flag 813 * expression {@code (?i)}. 814 * 815 * <p> Specifying this flag may impose a slight performance penalty. </p> 816 */ 817 public static final int CASE_INSENSITIVE = 0x02; 818 819 /** 820 * Permits whitespace and comments in pattern. 821 * 822 * <p> In this mode, whitespace is ignored, and embedded comments starting 823 * with {@code #} are ignored until the end of a line. 824 * 825 * <p> Comments mode can also be enabled via the embedded flag 826 * expression {@code (?x)}. 827 */ 828 public static final int COMMENTS = 0x04; 829 830 /** 831 * Enables multiline mode. 832 * 833 * <p> In multiline mode the expressions {@code ^} and {@code $} match 834 * just after or just before, respectively, a line terminator or the end of 835 * the input sequence. By default these expressions only match at the 836 * beginning and the end of the entire input sequence. 837 * 838 * <p> Multiline mode can also be enabled via the embedded flag 839 * expression {@code (?m)}. </p> 840 */ 841 public static final int MULTILINE = 0x08; 842 843 /** 844 * Enables literal parsing of the pattern. 845 * 846 * <p> When this flag is specified then the input string that specifies 847 * the pattern is treated as a sequence of literal characters. 848 * Metacharacters or escape sequences in the input sequence will be 849 * given no special meaning. 850 * 851 * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on 852 * matching when used in conjunction with this flag. The other flags 853 * become superfluous. 854 * 855 * <p> There is no embedded flag character for enabling literal parsing. 856 * @since 1.5 857 */ 858 public static final int LITERAL = 0x10; 859 860 /** 861 * Enables dotall mode. 862 * 863 * <p> In dotall mode, the expression {@code .} matches any character, 864 * including a line terminator. By default this expression does not match 865 * line terminators. 866 * 867 * <p> Dotall mode can also be enabled via the embedded flag 868 * expression {@code (?s)}. (The {@code s} is a mnemonic for 869 * "single-line" mode, which is what this is called in Perl.) </p> 870 */ 871 public static final int DOTALL = 0x20; 872 873 /** 874 * Enables Unicode-aware case folding. 875 * 876 * <p> When this flag is specified then case-insensitive matching, when 877 * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner 878 * consistent with the Unicode Standard. By default, case-insensitive 879 * matching assumes that only characters in the US-ASCII charset are being 880 * matched. 881 * 882 * <p> Unicode-aware case folding can also be enabled via the embedded flag 883 * expression {@code (?u)}. 884 * 885 * <p> Specifying this flag may impose a performance penalty. </p> 886 */ 887 public static final int UNICODE_CASE = 0x40; 888 889 /** 890 * Enables canonical equivalence. 891 * 892 * <p> When this flag is specified then two characters will be considered 893 * to match if, and only if, their full canonical decompositions match. 894 * The expression <code>"a\u030A"</code>, for example, will match the 895 * string <code>"\u00E5"</code> when this flag is specified. By default, 896 * matching does not take canonical equivalence into account. 897 * 898 * <p> There is no embedded flag character for enabling canonical 899 * equivalence. 900 * 901 * <p> Specifying this flag may impose a performance penalty. </p> 902 */ 903 public static final int CANON_EQ = 0x80; 904 905 /** 906 * Enables the Unicode version of <i>Predefined character classes</i> and 907 * <i>POSIX character classes</i>. 908 * 909 * <p> When this flag is specified then the (US-ASCII only) 910 * <i>Predefined character classes</i> and <i>POSIX character classes</i> 911 * are in conformance with 912 * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 913 * Standard #18: Unicode Regular Expression</i></a> 914 * <i>Annex C: Compatibility Properties</i>. 915 * <p> 916 * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded 917 * flag expression {@code (?U)}. 918 * <p> 919 * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case 920 * folding. 921 * <p> 922 * Specifying this flag may impose a performance penalty. </p> 923 * @since 1.7 924 */ 925 public static final int UNICODE_CHARACTER_CLASS = 0x100; 926 927 /** 928 * Contains all possible flags for compile(regex, flags). 929 */ 930 private static final int ALL_FLAGS = CASE_INSENSITIVE | MULTILINE | 931 DOTALL | UNICODE_CASE | CANON_EQ | UNIX_LINES | LITERAL | 932 UNICODE_CHARACTER_CLASS | COMMENTS; 933 934 /* Pattern has only two serialized components: The pattern string 935 * and the flags, which are all that is needed to recompile the pattern 936 * when it is deserialized. 937 */ 938 939 /** use serialVersionUID from Merlin b59 for interoperability */ 940 private static final long serialVersionUID = 5073258162644648461L; 941 942 /** 943 * The original regular-expression pattern string. 944 * 945 * @serial 946 */ 947 private String pattern; 948 949 /** 950 * The original pattern flags. 951 * 952 * @serial 953 */ 954 private int flags; 955 956 /** 957 * Boolean indicating this Pattern is compiled; this is necessary in order 958 * to lazily compile deserialized Patterns. 959 */ 960 private transient volatile boolean compiled; 961 962 /** 963 * The normalized pattern string. 964 */ 965 private transient String normalizedPattern; 966 967 /** 968 * The starting point of state machine for the find operation. This allows 969 * a match to start anywhere in the input. 970 */ 971 transient Node root; 972 973 /** 974 * The root of object tree for a match operation. The pattern is matched 975 * at the beginning. This may include a find that uses BnM or a First 976 * node. 977 */ 978 transient Node matchRoot; 979 980 /** 981 * Temporary storage used by parsing pattern slice. 982 */ 983 transient int[] buffer; 984 985 /** 986 * A temporary storage used for predicate for double return. 987 */ 988 transient CharPredicate predicate; 989 990 /** 991 * Map the "name" of the "named capturing group" to its group id 992 * node. 993 */ 994 transient volatile Map<String, Integer> namedGroups; 995 996 /** 997 * Temporary storage used while parsing group references. 998 */ 999 transient GroupHead[] groupNodes; 1000 1001 /** 1002 * Temporary storage used to store the top level closure nodes. 1003 */ 1004 transient List<Node> topClosureNodes; 1005 1006 /** 1007 * The number of top greedy closure nodes in this Pattern. Used by 1008 * matchers to allocate storage needed for a IntHashSet to keep the 1009 * beginning pos {@code i} of all failed match. 1010 */ 1011 transient int localTCNCount; 1012 1013 /* 1014 * Turn off the stop-exponential-backtracking optimization if there 1015 * is a group ref in the pattern. 1016 */ 1017 transient boolean hasGroupRef; 1018 1019 /** 1020 * Temporary null terminated code point array used by pattern compiling. 1021 */ 1022 private transient int[] temp; 1023 1024 /** 1025 * The number of capturing groups in this Pattern. Used by matchers to 1026 * allocate storage needed to perform a match. 1027 */ 1028 transient int capturingGroupCount; 1029 1030 /** 1031 * The local variable count used by parsing tree. Used by matchers to 1032 * allocate storage needed to perform a match. 1033 */ 1034 transient int localCount; 1035 1036 /** 1037 * Index into the pattern string that keeps track of how much has been 1038 * parsed. 1039 */ 1040 private transient int cursor; 1041 1042 /** 1043 * Holds the length of the pattern string. 1044 */ 1045 private transient int patternLength; 1046 1047 /** 1048 * If the Start node might possibly match supplementary characters. 1049 * It is set to true during compiling if 1050 * (1) There is supplementary char in pattern, or 1051 * (2) There is complement node of a "family" CharProperty 1052 */ 1053 private transient boolean hasSupplementary; 1054 1055 /** 1056 * Compiles the given regular expression into a pattern. 1057 * 1058 * @param regex 1059 * The expression to be compiled 1060 * @return the given regular expression compiled into a pattern 1061 * @throws PatternSyntaxException 1062 * If the expression's syntax is invalid 1063 */ 1064 public static Pattern compile(String regex) { 1065 return new Pattern(regex, 0); 1066 } 1067 1068 /** 1069 * Compiles the given regular expression into a pattern with the given 1070 * flags. 1071 * 1072 * @param regex 1073 * The expression to be compiled 1074 * 1075 * @param flags 1076 * Match flags, a bit mask that may include 1077 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, 1078 * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, 1079 * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} 1080 * and {@link #COMMENTS} 1081 * 1082 * @return the given regular expression compiled into a pattern with the given flags 1083 * @throws IllegalArgumentException 1084 * If bit values other than those corresponding to the defined 1085 * match flags are set in {@code flags} 1086 * 1087 * @throws PatternSyntaxException 1088 * If the expression's syntax is invalid 1089 */ 1090 public static Pattern compile(String regex, int flags) { 1091 return new Pattern(regex, flags); 1092 } 1093 1094 /** 1095 * Returns the regular expression from which this pattern was compiled. 1096 * 1097 * @return The source of this pattern 1098 */ 1099 public String pattern() { 1100 return pattern; 1101 } 1102 1103 /** 1104 * <p>Returns the string representation of this pattern. This 1105 * is the regular expression from which this pattern was 1106 * compiled.</p> 1107 * 1108 * @return The string representation of this pattern 1109 * @since 1.5 1110 */ 1111 public String toString() { 1112 return pattern; 1113 } 1114 1115 /** 1116 * Creates a matcher that will match the given input against this pattern. 1117 * 1118 * @param input 1119 * The character sequence to be matched 1120 * 1121 * @return A new matcher for this pattern 1122 */ 1123 public Matcher matcher(CharSequence input) { 1124 if (!compiled) { 1125 synchronized(this) { 1126 if (!compiled) 1127 compile(); 1128 } 1129 } 1130 Matcher m = new Matcher(this, input); 1131 return m; 1132 } 1133 1134 /** 1135 * Returns this pattern's match flags. 1136 * 1137 * @return The match flags specified when this pattern was compiled 1138 */ 1139 public int flags() { 1140 return flags; 1141 } 1142 1143 /** 1144 * Compiles the given regular expression and attempts to match the given 1145 * input against it. 1146 * 1147 * <p> An invocation of this convenience method of the form 1148 * 1149 * <blockquote><pre> 1150 * Pattern.matches(regex, input);</pre></blockquote> 1151 * 1152 * behaves in exactly the same way as the expression 1153 * 1154 * <blockquote><pre> 1155 * Pattern.compile(regex).matcher(input).matches()</pre></blockquote> 1156 * 1157 * <p> If a pattern is to be used multiple times, compiling it once and reusing 1158 * it will be more efficient than invoking this method each time. </p> 1159 * 1160 * @param regex 1161 * The expression to be compiled 1162 * 1163 * @param input 1164 * The character sequence to be matched 1165 * @return whether or not the regular expression matches on the input 1166 * @throws PatternSyntaxException 1167 * If the expression's syntax is invalid 1168 */ 1169 public static boolean matches(String regex, CharSequence input) { 1170 Pattern p = Pattern.compile(regex); 1171 Matcher m = p.matcher(input); 1172 return m.matches(); 1173 } 1174 1175 /** 1176 * Splits the given input sequence around matches of this pattern. 1177 * 1178 * <p> The array returned by this method contains each substring of the 1179 * input sequence that is terminated by another subsequence that matches 1180 * this pattern or is terminated by the end of the input sequence. The 1181 * substrings in the array are in the order in which they occur in the 1182 * input. If this pattern does not match any subsequence of the input then 1183 * the resulting array has just one element, namely the input sequence in 1184 * string form. 1185 * 1186 * <p> When there is a positive-width match at the beginning of the input 1187 * sequence then an empty leading substring is included at the beginning 1188 * of the resulting array. A zero-width match at the beginning however 1189 * never produces such empty leading substring. 1190 * 1191 * <p> The {@code limit} parameter controls the number of times the 1192 * pattern is applied and therefore affects the length of the resulting 1193 * array. If the limit <i>n</i> is greater than zero then the pattern 1194 * will be applied at most <i>n</i> - 1 times, the array's 1195 * length will be no greater than <i>n</i>, and the array's last entry 1196 * will contain all input beyond the last matched delimiter. If <i>n</i> 1197 * is non-positive then the pattern will be applied as many times as 1198 * possible and the array can have any length. If <i>n</i> is zero then 1199 * the pattern will be applied as many times as possible, the array can 1200 * have any length, and trailing empty strings will be discarded. 1201 * 1202 * <p> The input {@code "boo:and:foo"}, for example, yields the following 1203 * results with these parameters: 1204 * 1205 * <table class="plain" style="margin-left:2em;"> 1206 * <caption style="display:none">Split example showing regex, limit, and result</caption> 1207 * <thead> 1208 * <tr> 1209 * <th scope="col">Regex</th> 1210 * <th scope="col">Limit</th> 1211 * <th scope="col">Result</th> 1212 * </tr> 1213 * </thead> 1214 * <tbody> 1215 * <tr><th scope="row" rowspan="3" style="font-weight:normal">:</th> 1216 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th> 1217 * <td>{@code { "boo", "and:foo" }}</td></tr> 1218 * <tr><!-- : --> 1219 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 1220 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1221 * <tr><!-- : --> 1222 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 1223 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1224 * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th> 1225 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 1226 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 1227 * <tr><!-- o --> 1228 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 1229 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 1230 * <tr><!-- o --> 1231 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th> 1232 * <td>{@code { "b", "", ":and:f" }}</td></tr> 1233 * </tbody> 1234 * </table> 1235 * 1236 * @param input 1237 * The character sequence to be split 1238 * 1239 * @param limit 1240 * The result threshold, as described above 1241 * 1242 * @return The array of strings computed by splitting the input 1243 * around matches of this pattern 1244 */ 1245 public String[] split(CharSequence input, int limit) { 1246 int index = 0; 1247 boolean matchLimited = limit > 0; 1248 ArrayList<String> matchList = new ArrayList<>(); 1249 Matcher m = matcher(input); 1250 1251 // Add segments before each match found 1252 while(m.find()) { 1253 if (!matchLimited || matchList.size() < limit - 1) { 1254 if (index == 0 && index == m.start() && m.start() == m.end()) { 1255 // no empty leading substring included for zero-width match 1256 // at the beginning of the input char sequence. 1257 continue; 1258 } 1259 String match = input.subSequence(index, m.start()).toString(); 1260 matchList.add(match); 1261 index = m.end(); 1262 } else if (matchList.size() == limit - 1) { // last one 1263 String match = input.subSequence(index, 1264 input.length()).toString(); 1265 matchList.add(match); 1266 index = m.end(); 1267 } 1268 } 1269 1270 // If no match was found, return this 1271 if (index == 0) 1272 return new String[] {input.toString()}; 1273 1274 // Add remaining segment 1275 if (!matchLimited || matchList.size() < limit) 1276 matchList.add(input.subSequence(index, input.length()).toString()); 1277 1278 // Construct result 1279 int resultSize = matchList.size(); 1280 if (limit == 0) 1281 while (resultSize > 0 && matchList.get(resultSize-1).equals("")) 1282 resultSize--; 1283 String[] result = new String[resultSize]; 1284 return matchList.subList(0, resultSize).toArray(result); 1285 } 1286 1287 /** 1288 * Splits the given input sequence around matches of this pattern. 1289 * 1290 * <p> This method works as if by invoking the two-argument {@link 1291 * #split(java.lang.CharSequence, int) split} method with the given input 1292 * sequence and a limit argument of zero. Trailing empty strings are 1293 * therefore not included in the resulting array. </p> 1294 * 1295 * <p> The input {@code "boo:and:foo"}, for example, yields the following 1296 * results with these expressions: 1297 * 1298 * <table class="plain" style="margin-left:2em"> 1299 * <caption style="display:none">Split examples showing regex and result</caption> 1300 * <thead> 1301 * <tr> 1302 * <th scope="col">Regex</th> 1303 * <th scope="col">Result</th> 1304 * </tr> 1305 * </thead> 1306 * <tbody> 1307 * <tr><th scope="row" style="text-weight:normal">:</th> 1308 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1309 * <tr><th scope="row" style="text-weight:normal">o</th> 1310 * <td>{@code { "b", "", ":and:f" }}</td></tr> 1311 * </tbody> 1312 * </table> 1313 * 1314 * 1315 * @param input 1316 * The character sequence to be split 1317 * 1318 * @return The array of strings computed by splitting the input 1319 * around matches of this pattern 1320 */ 1321 public String[] split(CharSequence input) { 1322 return split(input, 0); 1323 } 1324 1325 /** 1326 * Returns a literal pattern {@code String} for the specified 1327 * {@code String}. 1328 * 1329 * <p>This method produces a {@code String} that can be used to 1330 * create a {@code Pattern} that would match the string 1331 * {@code s} as if it were a literal pattern.</p> Metacharacters 1332 * or escape sequences in the input sequence will be given no special 1333 * meaning. 1334 * 1335 * @param s The string to be literalized 1336 * @return A literal string replacement 1337 * @since 1.5 1338 */ 1339 public static String quote(String s) { 1340 int slashEIndex = s.indexOf("\\E"); 1341 if (slashEIndex == -1) 1342 return "\\Q" + s + "\\E"; 1343 1344 int lenHint = s.length(); 1345 lenHint = (lenHint < Integer.MAX_VALUE - 8 - lenHint) ? 1346 (lenHint << 1) : (Integer.MAX_VALUE - 8); 1347 1348 StringBuilder sb = new StringBuilder(lenHint); 1349 sb.append("\\Q"); 1350 int current = 0; 1351 do { 1352 sb.append(s, current, slashEIndex) 1353 .append("\\E\\\\E\\Q"); 1354 current = slashEIndex + 2; 1355 } while ((slashEIndex = s.indexOf("\\E", current)) != -1); 1356 1357 return sb.append(s, current, s.length()) 1358 .append("\\E") 1359 .toString(); 1360 } 1361 1362 /** 1363 * Recompile the Pattern instance from a stream. The original pattern 1364 * string is read in and the object tree is recompiled from it. 1365 */ 1366 private void readObject(java.io.ObjectInputStream s) 1367 throws java.io.IOException, ClassNotFoundException { 1368 1369 // Read in all fields 1370 s.defaultReadObject(); 1371 1372 // Initialize counts 1373 capturingGroupCount = 1; 1374 localCount = 0; 1375 localTCNCount = 0; 1376 1377 // if length > 0, the Pattern is lazily compiled 1378 if (pattern.length() == 0) { 1379 root = new Start(lastAccept); 1380 matchRoot = lastAccept; 1381 compiled = true; 1382 } 1383 } 1384 1385 /** 1386 * This private constructor is used to create all Patterns. The pattern 1387 * string and match flags are all that is needed to completely describe 1388 * a Pattern. An empty pattern string results in an object tree with 1389 * only a Start node and a LastNode node. 1390 */ 1391 private Pattern(String p, int f) { 1392 if ((f & ~ALL_FLAGS) != 0) { 1393 throw new IllegalArgumentException("Unknown flag 0x" 1394 + Integer.toHexString(f)); 1395 } 1396 pattern = p; 1397 flags = f; 1398 1399 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present 1400 if ((flags & UNICODE_CHARACTER_CLASS) != 0) 1401 flags |= UNICODE_CASE; 1402 1403 // Reset group index count 1404 capturingGroupCount = 1; 1405 localCount = 0; 1406 localTCNCount = 0; 1407 1408 if (pattern.length() > 0) { 1409 compile(); 1410 } else { 1411 root = new Start(lastAccept); 1412 matchRoot = lastAccept; 1413 } 1414 } 1415 1416 /** 1417 * The pattern is converted to normalized form ({@link 1418 * java.text.Normalizer.Form.NFC NFC}, canonical decomposition, 1419 * followed by canonical composition for the character class 1420 * part, and {@link java.text.Normalizer.Form.NFD NFD}, 1421 * canonical decomposition) for the rest), and then a pure 1422 * group is constructed to match canonical equivalences of the 1423 * characters. 1424 */ 1425 private static String normalize(String pattern) { 1426 int plen = pattern.length(); 1427 StringBuilder pbuf = new StringBuilder(plen); 1428 char last = 0; 1429 int lastStart = 0; 1430 char cc = 0; 1431 for (int i = 0; i < plen;) { 1432 char c = pattern.charAt(i); 1433 if (cc == 0 && // top level 1434 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') { 1435 i += 2; last = 0; 1436 continue; 1437 } 1438 if (c == '[' && last != '\\') { 1439 if (cc == 0) { 1440 if (lastStart < i) 1441 normalizeSlice(pattern, lastStart, i, pbuf); 1442 lastStart = i; 1443 } 1444 cc++; 1445 } else if (c == ']' && last != '\\') { 1446 cc--; 1447 if (cc == 0) { 1448 normalizeClazz(pattern, lastStart, i + 1, pbuf); 1449 lastStart = i + 1; 1450 } 1451 } 1452 last = c; 1453 i++; 1454 } 1455 assert (cc == 0); 1456 if (lastStart < plen) 1457 normalizeSlice(pattern, lastStart, plen, pbuf); 1458 return pbuf.toString(); 1459 } 1460 1461 private static void normalizeSlice(String src, int off, int limit, 1462 StringBuilder dst) 1463 { 1464 int len = src.length(); 1465 int off0 = off; 1466 while (off < limit && ASCII.isAscii(src.charAt(off))) { 1467 off++; 1468 } 1469 if (off == limit) { 1470 dst.append(src, off0, limit); 1471 return; 1472 } 1473 off--; 1474 if (off < off0) 1475 off = off0; 1476 else 1477 dst.append(src, off0, off); 1478 while (off < limit) { 1479 int ch0 = src.codePointAt(off); 1480 if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) { 1481 dst.append((char)ch0); 1482 off++; 1483 continue; 1484 } 1485 int j = off + Character.charCount(ch0); 1486 int ch1; 1487 while (j < limit) { 1488 ch1 = src.codePointAt(j); 1489 if (Grapheme.isBoundary(ch0, ch1)) 1490 break; 1491 ch0 = ch1; 1492 j += Character.charCount(ch1); 1493 } 1494 String seq = src.substring(off, j); 1495 String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD); 1496 off = j; 1497 if (nfd.length() > 1) { 1498 ch0 = nfd.codePointAt(0); 1499 ch1 = nfd.codePointAt(Character.charCount(ch0)); 1500 if (Character.getType(ch1) == Character.NON_SPACING_MARK) { 1501 Set<String> altns = new LinkedHashSet<>(); 1502 altns.add(seq); 1503 produceEquivalentAlternation(nfd, altns); 1504 dst.append("(?:"); 1505 altns.forEach( s -> dst.append(s).append('|')); 1506 dst.delete(dst.length() - 1, dst.length()); 1507 dst.append(")"); 1508 continue; 1509 } 1510 } 1511 String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC); 1512 if (!seq.equals(nfc) && !nfd.equals(nfc)) 1513 dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")"); 1514 else if (!seq.equals(nfd)) 1515 dst.append("(?:" + seq + "|" + nfd + ")"); 1516 else 1517 dst.append(seq); 1518 } 1519 } 1520 1521 private static void normalizeClazz(String src, int off, int limit, 1522 StringBuilder dst) 1523 { 1524 dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC)); 1525 } 1526 1527 /** 1528 * Given a specific sequence composed of a regular character and 1529 * combining marks that follow it, produce the alternation that will 1530 * match all canonical equivalences of that sequence. 1531 */ 1532 private static void produceEquivalentAlternation(String src, 1533 Set<String> dst) 1534 { 1535 int len = countChars(src, 0, 1); 1536 if (src.length() == len) { 1537 dst.add(src); // source has one character. 1538 return; 1539 } 1540 String base = src.substring(0,len); 1541 String combiningMarks = src.substring(len); 1542 String[] perms = producePermutations(combiningMarks); 1543 // Add combined permutations 1544 for(int x = 0; x < perms.length; x++) { 1545 String next = base + perms[x]; 1546 dst.add(next); 1547 next = composeOneStep(next); 1548 if (next != null) { 1549 produceEquivalentAlternation(next, dst); 1550 } 1551 } 1552 } 1553 1554 /** 1555 * Returns an array of strings that have all the possible 1556 * permutations of the characters in the input string. 1557 * This is used to get a list of all possible orderings 1558 * of a set of combining marks. Note that some of the permutations 1559 * are invalid because of combining class collisions, and these 1560 * possibilities must be removed because they are not canonically 1561 * equivalent. 1562 */ 1563 private static String[] producePermutations(String input) { 1564 if (input.length() == countChars(input, 0, 1)) 1565 return new String[] {input}; 1566 1567 if (input.length() == countChars(input, 0, 2)) { 1568 int c0 = Character.codePointAt(input, 0); 1569 int c1 = Character.codePointAt(input, Character.charCount(c0)); 1570 if (getClass(c1) == getClass(c0)) { 1571 return new String[] {input}; 1572 } 1573 String[] result = new String[2]; 1574 result[0] = input; 1575 StringBuilder sb = new StringBuilder(2); 1576 sb.appendCodePoint(c1); 1577 sb.appendCodePoint(c0); 1578 result[1] = sb.toString(); 1579 return result; 1580 } 1581 1582 int length = 1; 1583 int nCodePoints = countCodePoints(input); 1584 for(int x=1; x<nCodePoints; x++) 1585 length = length * (x+1); 1586 1587 String[] temp = new String[length]; 1588 1589 int combClass[] = new int[nCodePoints]; 1590 for(int x=0, i=0; x<nCodePoints; x++) { 1591 int c = Character.codePointAt(input, i); 1592 combClass[x] = getClass(c); 1593 i += Character.charCount(c); 1594 } 1595 1596 // For each char, take it out and add the permutations 1597 // of the remaining chars 1598 int index = 0; 1599 int len; 1600 // offset maintains the index in code units. 1601loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { 1602 len = countChars(input, offset, 1); 1603 for(int y=x-1; y>=0; y--) { 1604 if (combClass[y] == combClass[x]) { 1605 continue loop; 1606 } 1607 } 1608 StringBuilder sb = new StringBuilder(input); 1609 String otherChars = sb.delete(offset, offset+len).toString(); 1610 String[] subResult = producePermutations(otherChars); 1611 1612 String prefix = input.substring(offset, offset+len); 1613 for (String sre : subResult) 1614 temp[index++] = prefix + sre; 1615 } 1616 String[] result = new String[index]; 1617 System.arraycopy(temp, 0, result, 0, index); 1618 return result; 1619 } 1620 1621 private static int getClass(int c) { 1622 return sun.text.Normalizer.getCombiningClass(c); 1623 } 1624 1625 /** 1626 * Attempts to compose input by combining the first character 1627 * with the first combining mark following it. Returns a String 1628 * that is the composition of the leading character with its first 1629 * combining mark followed by the remaining combining marks. Returns 1630 * null if the first two characters cannot be further composed. 1631 */ 1632 private static String composeOneStep(String input) { 1633 int len = countChars(input, 0, 2); 1634 String firstTwoCharacters = input.substring(0, len); 1635 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); 1636 if (result.equals(firstTwoCharacters)) 1637 return null; 1638 else { 1639 String remainder = input.substring(len); 1640 return result + remainder; 1641 } 1642 } 1643 1644 /** 1645 * Preprocess any \Q...\E sequences in `temp', meta-quoting them. 1646 * See the description of `quotemeta' in perlfunc(1). 1647 */ 1648 private void RemoveQEQuoting() { 1649 final int pLen = patternLength; 1650 int i = 0; 1651 while (i < pLen-1) { 1652 if (temp[i] != '\\') 1653 i += 1; 1654 else if (temp[i + 1] != 'Q') 1655 i += 2; 1656 else 1657 break; 1658 } 1659 if (i >= pLen - 1) // No \Q sequence found 1660 return; 1661 int j = i; 1662 i += 2; 1663 int[] newtemp = new int[j + 3*(pLen-i) + 2]; 1664 System.arraycopy(temp, 0, newtemp, 0, j); 1665 1666 boolean inQuote = true; 1667 boolean beginQuote = true; 1668 while (i < pLen) { 1669 int c = temp[i++]; 1670 if (!ASCII.isAscii(c) || ASCII.isAlpha(c)) { 1671 newtemp[j++] = c; 1672 } else if (ASCII.isDigit(c)) { 1673 if (beginQuote) { 1674 /* 1675 * A unicode escape \[0xu] could be before this quote, 1676 * and we don't want this numeric char to processed as 1677 * part of the escape. 1678 */ 1679 newtemp[j++] = '\\'; 1680 newtemp[j++] = 'x'; 1681 newtemp[j++] = '3'; 1682 } 1683 newtemp[j++] = c; 1684 } else if (c != '\\') { 1685 if (inQuote) newtemp[j++] = '\\'; 1686 newtemp[j++] = c; 1687 } else if (inQuote) { 1688 if (temp[i] == 'E') { 1689 i++; 1690 inQuote = false; 1691 } else { 1692 newtemp[j++] = '\\'; 1693 newtemp[j++] = '\\'; 1694 } 1695 } else { 1696 if (temp[i] == 'Q') { 1697 i++; 1698 inQuote = true; 1699 beginQuote = true; 1700 continue; 1701 } else { 1702 newtemp[j++] = c; 1703 if (i != pLen) 1704 newtemp[j++] = temp[i++]; 1705 } 1706 } 1707 1708 beginQuote = false; 1709 } 1710 1711 patternLength = j; 1712 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination 1713 } 1714 1715 /** 1716 * Copies regular expression to an int array and invokes the parsing 1717 * of the expression which will create the object tree. 1718 */ 1719 private void compile() { 1720 // Handle canonical equivalences 1721 if (has(CANON_EQ) && !has(LITERAL)) { 1722 normalizedPattern = normalize(pattern); 1723 } else { 1724 normalizedPattern = pattern; 1725 } 1726 patternLength = normalizedPattern.length(); 1727 1728 // Copy pattern to int array for convenience 1729 // Use double zero to terminate pattern 1730 temp = new int[patternLength + 2]; 1731 1732 hasSupplementary = false; 1733 int c, count = 0; 1734 // Convert all chars into code points 1735 for (int x = 0; x < patternLength; x += Character.charCount(c)) { 1736 c = normalizedPattern.codePointAt(x); 1737 if (isSupplementary(c)) { 1738 hasSupplementary = true; 1739 } 1740 temp[count++] = c; 1741 } 1742 1743 patternLength = count; // patternLength now in code points 1744 1745 if (! has(LITERAL)) 1746 RemoveQEQuoting(); 1747 1748 // Allocate all temporary objects here. 1749 buffer = new int[32]; 1750 groupNodes = new GroupHead[10]; 1751 namedGroups = null; 1752 topClosureNodes = new ArrayList<>(10); 1753 1754 if (has(LITERAL)) { 1755 // Literal pattern handling 1756 matchRoot = newSlice(temp, patternLength, hasSupplementary); 1757 matchRoot.next = lastAccept; 1758 } else { 1759 // Start recursive descent parsing 1760 matchRoot = expr(lastAccept); 1761 // Check extra pattern characters 1762 if (patternLength != cursor) { 1763 if (peek() == ')') { 1764 throw error("Unmatched closing ')'"); 1765 } else { 1766 throw error("Unexpected internal error"); 1767 } 1768 } 1769 } 1770 1771 // Peephole optimization 1772 if (matchRoot instanceof Slice) { 1773 root = BnM.optimize(matchRoot); 1774 if (root == matchRoot) { 1775 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1776 } 1777 } else if (matchRoot instanceof Begin || matchRoot instanceof First) { 1778 root = matchRoot; 1779 } else { 1780 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1781 } 1782 1783 // Optimize the greedy Loop to prevent exponential backtracking, IF there 1784 // is no group ref in this pattern. With a non-negative localTCNCount value, 1785 // the greedy type Loop, Curly will skip the backtracking for any starting 1786 // position "i" that failed in the past. 1787 if (!hasGroupRef) { 1788 for (Node node : topClosureNodes) { 1789 if (node instanceof Loop) { 1790 // non-deterministic-greedy-group 1791 ((Loop)node).posIndex = localTCNCount++; 1792 } 1793 } 1794 } 1795 1796 // Release temporary storage 1797 temp = null; 1798 buffer = null; 1799 groupNodes = null; 1800 patternLength = 0; 1801 compiled = true; 1802 topClosureNodes = null; 1803 } 1804 1805 Map<String, Integer> namedGroups() { 1806 Map<String, Integer> groups = namedGroups; 1807 if (groups == null) { 1808 namedGroups = groups = new HashMap<>(2); 1809 } 1810 return groups; 1811 } 1812 1813 /** 1814 * Used to accumulate information about a subtree of the object graph 1815 * so that optimizations can be applied to the subtree. 1816 */ 1817 static final class TreeInfo { 1818 int minLength; 1819 int maxLength; 1820 boolean maxValid; 1821 boolean deterministic; 1822 1823 TreeInfo() { 1824 reset(); 1825 } 1826 void reset() { 1827 minLength = 0; 1828 maxLength = 0; 1829 maxValid = true; 1830 deterministic = true; 1831 } 1832 } 1833 1834 /* 1835 * The following private methods are mainly used to improve the 1836 * readability of the code. In order to let the Java compiler easily 1837 * inline them, we should not put many assertions or error checks in them. 1838 */ 1839 1840 /** 1841 * Indicates whether a particular flag is set or not. 1842 */ 1843 private boolean has(int f) { 1844 return (flags & f) != 0; 1845 } 1846 1847 /** 1848 * Match next character, signal error if failed. 1849 */ 1850 private void accept(int ch, String s) { 1851 int testChar = temp[cursor++]; 1852 if (has(COMMENTS)) 1853 testChar = parsePastWhitespace(testChar); 1854 if (ch != testChar) { 1855 throw error(s); 1856 } 1857 } 1858 1859 /** 1860 * Mark the end of pattern with a specific character. 1861 */ 1862 private void mark(int c) { 1863 temp[patternLength] = c; 1864 } 1865 1866 /** 1867 * Peek the next character, and do not advance the cursor. 1868 */ 1869 private int peek() { 1870 int ch = temp[cursor]; 1871 if (has(COMMENTS)) 1872 ch = peekPastWhitespace(ch); 1873 return ch; 1874 } 1875 1876 /** 1877 * Read the next character, and advance the cursor by one. 1878 */ 1879 private int read() { 1880 int ch = temp[cursor++]; 1881 if (has(COMMENTS)) 1882 ch = parsePastWhitespace(ch); 1883 return ch; 1884 } 1885 1886 /** 1887 * Read the next character, and advance the cursor by one, 1888 * ignoring the COMMENTS setting 1889 */ 1890 private int readEscaped() { 1891 int ch = temp[cursor++]; 1892 return ch; 1893 } 1894 1895 /** 1896 * Advance the cursor by one, and peek the next character. 1897 */ 1898 private int next() { 1899 int ch = temp[++cursor]; 1900 if (has(COMMENTS)) 1901 ch = peekPastWhitespace(ch); 1902 return ch; 1903 } 1904 1905 /** 1906 * Advance the cursor by one, and peek the next character, 1907 * ignoring the COMMENTS setting 1908 */ 1909 private int nextEscaped() { 1910 int ch = temp[++cursor]; 1911 return ch; 1912 } 1913 1914 /** 1915 * If in xmode peek past whitespace and comments. 1916 */ 1917 private int peekPastWhitespace(int ch) { 1918 while (ASCII.isSpace(ch) || ch == '#') { 1919 while (ASCII.isSpace(ch)) 1920 ch = temp[++cursor]; 1921 if (ch == '#') { 1922 ch = peekPastLine(); 1923 } 1924 } 1925 return ch; 1926 } 1927 1928 /** 1929 * If in xmode parse past whitespace and comments. 1930 */ 1931 private int parsePastWhitespace(int ch) { 1932 while (ASCII.isSpace(ch) || ch == '#') { 1933 while (ASCII.isSpace(ch)) 1934 ch = temp[cursor++]; 1935 if (ch == '#') 1936 ch = parsePastLine(); 1937 } 1938 return ch; 1939 } 1940 1941 /** 1942 * xmode parse past comment to end of line. 1943 */ 1944 private int parsePastLine() { 1945 int ch = temp[cursor++]; 1946 while (ch != 0 && !isLineSeparator(ch)) 1947 ch = temp[cursor++]; 1948 return ch; 1949 } 1950 1951 /** 1952 * xmode peek past comment to end of line. 1953 */ 1954 private int peekPastLine() { 1955 int ch = temp[++cursor]; 1956 while (ch != 0 && !isLineSeparator(ch)) 1957 ch = temp[++cursor]; 1958 return ch; 1959 } 1960 1961 /** 1962 * Determines if character is a line separator in the current mode 1963 */ 1964 private boolean isLineSeparator(int ch) { 1965 if (has(UNIX_LINES)) { 1966 return ch == '\n'; 1967 } else { 1968 return (ch == '\n' || 1969 ch == '\r' || 1970 (ch|1) == '\u2029' || 1971 ch == '\u0085'); 1972 } 1973 } 1974 1975 /** 1976 * Read the character after the next one, and advance the cursor by two. 1977 */ 1978 private int skip() { 1979 int i = cursor; 1980 int ch = temp[i+1]; 1981 cursor = i + 2; 1982 return ch; 1983 } 1984 1985 /** 1986 * Unread one next character, and retreat cursor by one. 1987 */ 1988 private void unread() { 1989 cursor--; 1990 } 1991 1992 /** 1993 * Internal method used for handling all syntax errors. The pattern is 1994 * displayed with a pointer to aid in locating the syntax error. 1995 */ 1996 private PatternSyntaxException error(String s) { 1997 return new PatternSyntaxException(s, normalizedPattern, cursor - 1); 1998 } 1999 2000 /** 2001 * Determines if there is any supplementary character or unpaired 2002 * surrogate in the specified range. 2003 */ 2004 private boolean findSupplementary(int start, int end) { 2005 for (int i = start; i < end; i++) { 2006 if (isSupplementary(temp[i])) 2007 return true; 2008 } 2009 return false; 2010 } 2011 2012 /** 2013 * Determines if the specified code point is a supplementary 2014 * character or unpaired surrogate. 2015 */ 2016 private static final boolean isSupplementary(int ch) { 2017 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || 2018 Character.isSurrogate((char)ch); 2019 } 2020 2021 /** 2022 * The following methods handle the main parsing. They are sorted 2023 * according to their precedence order, the lowest one first. 2024 */ 2025 2026 /** 2027 * The expression is parsed with branch nodes added for alternations. 2028 * This may be called recursively to parse sub expressions that may 2029 * contain alternations. 2030 */ 2031 private Node expr(Node end) { 2032 Node prev = null; 2033 Node firstTail = null; 2034 Branch branch = null; 2035 Node branchConn = null; 2036 2037 for (;;) { 2038 Node node = sequence(end); 2039 Node nodeTail = root; //double return 2040 if (prev == null) { 2041 prev = node; 2042 firstTail = nodeTail; 2043 } else { 2044 // Branch 2045 if (branchConn == null) { 2046 branchConn = new BranchConn(); 2047 branchConn.next = end; 2048 } 2049 if (node == end) { 2050 // if the node returned from sequence() is "end" 2051 // we have an empty expr, set a null atom into 2052 // the branch to indicate to go "next" directly. 2053 node = null; 2054 } else { 2055 // the "tail.next" of each atom goes to branchConn 2056 nodeTail.next = branchConn; 2057 } 2058 if (prev == branch) { 2059 branch.add(node); 2060 } else { 2061 if (prev == end) { 2062 prev = null; 2063 } else { 2064 // replace the "end" with "branchConn" at its tail.next 2065 // when put the "prev" into the branch as the first atom. 2066 firstTail.next = branchConn; 2067 } 2068 prev = branch = new Branch(prev, node, branchConn); 2069 } 2070 } 2071 if (peek() != '|') { 2072 return prev; 2073 } 2074 next(); 2075 } 2076 } 2077 2078 @SuppressWarnings("fallthrough") 2079 /** 2080 * Parsing of sequences between alternations. 2081 */ 2082 private Node sequence(Node end) { 2083 Node head = null; 2084 Node tail = null; 2085 Node node = null; 2086 LOOP: 2087 for (;;) { 2088 int ch = peek(); 2089 switch (ch) { 2090 case '(': 2091 // Because group handles its own closure, 2092 // we need to treat it differently 2093 node = group0(); 2094 // Check for comment or flag group 2095 if (node == null) 2096 continue; 2097 if (head == null) 2098 head = node; 2099 else 2100 tail.next = node; 2101 // Double return: Tail was returned in root 2102 tail = root; 2103 continue; 2104 case '[': 2105 if (has(CANON_EQ) && !has(LITERAL)) 2106 node = new NFCCharProperty(clazz(true)); 2107 else 2108 node = newCharProperty(clazz(true)); 2109 break; 2110 case '\\': 2111 ch = nextEscaped(); 2112 if (ch == 'p' || ch == 'P') { 2113 boolean oneLetter = true; 2114 boolean comp = (ch == 'P'); 2115 ch = next(); // Consume { if present 2116 if (ch != '{') { 2117 unread(); 2118 } else { 2119 oneLetter = false; 2120 } 2121 // node = newCharProperty(family(oneLetter, comp)); 2122 if (has(CANON_EQ) && !has(LITERAL)) 2123 node = new NFCCharProperty(family(oneLetter, comp)); 2124 else 2125 node = newCharProperty(family(oneLetter, comp)); 2126 } else { 2127 unread(); 2128 node = atom(); 2129 } 2130 break; 2131 case '^': 2132 next(); 2133 if (has(MULTILINE)) { 2134 if (has(UNIX_LINES)) 2135 node = new UnixCaret(); 2136 else 2137 node = new Caret(); 2138 } else { 2139 node = new Begin(); 2140 } 2141 break; 2142 case '$': 2143 next(); 2144 if (has(UNIX_LINES)) 2145 node = new UnixDollar(has(MULTILINE)); 2146 else 2147 node = new Dollar(has(MULTILINE)); 2148 break; 2149 case '.': 2150 next(); 2151 if (has(DOTALL)) { 2152 node = new CharProperty(ALL()); 2153 } else { 2154 if (has(UNIX_LINES)) { 2155 node = new CharProperty(UNIXDOT()); 2156 } else { 2157 node = new CharProperty(DOT()); 2158 } 2159 } 2160 break; 2161 case '|': 2162 case ')': 2163 break LOOP; 2164 case ']': // Now interpreting dangling ] and } as literals 2165 case '}': 2166 node = atom(); 2167 break; 2168 case '?': 2169 case '*': 2170 case '+': 2171 next(); 2172 throw error("Dangling meta character '" + ((char)ch) + "'"); 2173 case 0: 2174 if (cursor >= patternLength) { 2175 break LOOP; 2176 } 2177 // Fall through 2178 default: 2179 node = atom(); 2180 break; 2181 } 2182 2183 node = closure(node); 2184 /* save the top dot-greedy nodes (.*, .+) as well 2185 if (node instanceof GreedyCharProperty && 2186 ((GreedyCharProperty)node).cp instanceof Dot) { 2187 topClosureNodes.add(node); 2188 } 2189 */ 2190 if (head == null) { 2191 head = tail = node; 2192 } else { 2193 tail.next = node; 2194 tail = node; 2195 } 2196 } 2197 if (head == null) { 2198 return end; 2199 } 2200 tail.next = end; 2201 root = tail; //double return 2202 return head; 2203 } 2204 2205 @SuppressWarnings("fallthrough") 2206 /** 2207 * Parse and add a new Single or Slice. 2208 */ 2209 private Node atom() { 2210 int first = 0; 2211 int prev = -1; 2212 boolean hasSupplementary = false; 2213 int ch = peek(); 2214 for (;;) { 2215 switch (ch) { 2216 case '*': 2217 case '+': 2218 case '?': 2219 case '{': 2220 if (first > 1) { 2221 cursor = prev; // Unwind one character 2222 first--; 2223 } 2224 break; 2225 case '$': 2226 case '.': 2227 case '^': 2228 case '(': 2229 case '[': 2230 case '|': 2231 case ')': 2232 break; 2233 case '\\': 2234 ch = nextEscaped(); 2235 if (ch == 'p' || ch == 'P') { // Property 2236 if (first > 0) { // Slice is waiting; handle it first 2237 unread(); 2238 break; 2239 } else { // No slice; just return the family node 2240 boolean comp = (ch == 'P'); 2241 boolean oneLetter = true; 2242 ch = next(); // Consume { if present 2243 if (ch != '{') 2244 unread(); 2245 else 2246 oneLetter = false; 2247 if (has(CANON_EQ) && !has(LITERAL)) 2248 return new NFCCharProperty(family(oneLetter, comp)); 2249 else 2250 return newCharProperty(family(oneLetter, comp)); 2251 } 2252 } 2253 unread(); 2254 prev = cursor; 2255 ch = escape(false, first == 0, false); 2256 if (ch >= 0) { 2257 append(ch, first); 2258 first++; 2259 if (isSupplementary(ch)) { 2260 hasSupplementary = true; 2261 } 2262 ch = peek(); 2263 continue; 2264 } else if (first == 0) { 2265 return root; 2266 } 2267 // Unwind meta escape sequence 2268 cursor = prev; 2269 break; 2270 case 0: 2271 if (cursor >= patternLength) { 2272 break; 2273 } 2274 // Fall through 2275 default: 2276 prev = cursor; 2277 append(ch, first); 2278 first++; 2279 if (isSupplementary(ch)) { 2280 hasSupplementary = true; 2281 } 2282 ch = next(); 2283 continue; 2284 } 2285 break; 2286 } 2287 if (first == 1) { 2288 return newCharProperty(single(buffer[0])); 2289 } else { 2290 return newSlice(buffer, first, hasSupplementary); 2291 } 2292 } 2293 2294 private void append(int ch, int len) { 2295 if (len >= buffer.length) { 2296 int[] tmp = new int[len+len]; 2297 System.arraycopy(buffer, 0, tmp, 0, len); 2298 buffer = tmp; 2299 } 2300 buffer[len] = ch; 2301 } 2302 2303 /** 2304 * Parses a backref greedily, taking as many numbers as it 2305 * can. The first digit is always treated as a backref, but 2306 * multi digit numbers are only treated as a backref if at 2307 * least that many backrefs exist at this point in the regex. 2308 */ 2309 private Node ref(int refNum) { 2310 boolean done = false; 2311 while(!done) { 2312 int ch = peek(); 2313 switch(ch) { 2314 case '0': 2315 case '1': 2316 case '2': 2317 case '3': 2318 case '4': 2319 case '5': 2320 case '6': 2321 case '7': 2322 case '8': 2323 case '9': 2324 int newRefNum = (refNum * 10) + (ch - '0'); 2325 // Add another number if it doesn't make a group 2326 // that doesn't exist 2327 if (capturingGroupCount - 1 < newRefNum) { 2328 done = true; 2329 break; 2330 } 2331 refNum = newRefNum; 2332 read(); 2333 break; 2334 default: 2335 done = true; 2336 break; 2337 } 2338 } 2339 hasGroupRef = true; 2340 if (has(CASE_INSENSITIVE)) 2341 return new CIBackRef(refNum, has(UNICODE_CASE)); 2342 else 2343 return new BackRef(refNum); 2344 } 2345 2346 /** 2347 * Parses an escape sequence to determine the actual value that needs 2348 * to be matched. 2349 * If -1 is returned and create was true a new object was added to the tree 2350 * to handle the escape sequence. 2351 * If the returned value is greater than zero, it is the value that 2352 * matches the escape sequence. 2353 */ 2354 private int escape(boolean inclass, boolean create, boolean isrange) { 2355 int ch = skip(); 2356 switch (ch) { 2357 case '0': 2358 return o(); 2359 case '1': 2360 case '2': 2361 case '3': 2362 case '4': 2363 case '5': 2364 case '6': 2365 case '7': 2366 case '8': 2367 case '9': 2368 if (inclass) break; 2369 if (create) { 2370 root = ref((ch - '0')); 2371 } 2372 return -1; 2373 case 'A': 2374 if (inclass) break; 2375 if (create) root = new Begin(); 2376 return -1; 2377 case 'B': 2378 if (inclass) break; 2379 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); 2380 return -1; 2381 case 'C': 2382 break; 2383 case 'D': 2384 if (create) { 2385 predicate = has(UNICODE_CHARACTER_CLASS) ? 2386 CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT(); 2387 predicate = predicate.negate(); 2388 if (!inclass) 2389 root = newCharProperty(predicate); 2390 } 2391 return -1; 2392 case 'E': 2393 case 'F': 2394 break; 2395 case 'G': 2396 if (inclass) break; 2397 if (create) root = new LastMatch(); 2398 return -1; 2399 case 'H': 2400 if (create) { 2401 predicate = HorizWS().negate(); 2402 if (!inclass) 2403 root = newCharProperty(predicate); 2404 } 2405 return -1; 2406 case 'I': 2407 case 'J': 2408 case 'K': 2409 case 'L': 2410 case 'M': 2411 break; 2412 case 'N': 2413 return N(); 2414 case 'O': 2415 case 'P': 2416 case 'Q': 2417 break; 2418 case 'R': 2419 if (inclass) break; 2420 if (create) root = new LineEnding(); 2421 return -1; 2422 case 'S': 2423 if (create) { 2424 predicate = has(UNICODE_CHARACTER_CLASS) ? 2425 CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE(); 2426 predicate = predicate.negate(); 2427 if (!inclass) 2428 root = newCharProperty(predicate); 2429 } 2430 return -1; 2431 case 'T': 2432 case 'U': 2433 break; 2434 case 'V': 2435 if (create) { 2436 predicate = VertWS().negate(); 2437 if (!inclass) 2438 root = newCharProperty(predicate); 2439 } 2440 return -1; 2441 case 'W': 2442 if (create) { 2443 predicate = has(UNICODE_CHARACTER_CLASS) ? 2444 CharPredicates.WORD() : CharPredicates.ASCII_WORD(); 2445 predicate = predicate.negate(); 2446 if (!inclass) 2447 root = newCharProperty(predicate); 2448 } 2449 return -1; 2450 case 'X': 2451 if (inclass) break; 2452 if (create) { 2453 root = new XGrapheme(); 2454 } 2455 return -1; 2456 case 'Y': 2457 break; 2458 case 'Z': 2459 if (inclass) break; 2460 if (create) { 2461 if (has(UNIX_LINES)) 2462 root = new UnixDollar(false); 2463 else 2464 root = new Dollar(false); 2465 } 2466 return -1; 2467 case 'a': 2468 return '\007'; 2469 case 'b': 2470 if (inclass) break; 2471 if (create) { 2472 if (peek() == '{') { 2473 if (skip() == 'g') { 2474 if (read() == '}') { 2475 root = new GraphemeBound(); 2476 return -1; 2477 } 2478 break; // error missing trailing } 2479 } 2480 unread(); unread(); 2481 } 2482 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); 2483 } 2484 return -1; 2485 case 'c': 2486 return c(); 2487 case 'd': 2488 if (create) { 2489 predicate = has(UNICODE_CHARACTER_CLASS) ? 2490 CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT(); 2491 if (!inclass) 2492 root = newCharProperty(predicate); 2493 } 2494 return -1; 2495 case 'e': 2496 return '\033'; 2497 case 'f': 2498 return '\f'; 2499 case 'g': 2500 break; 2501 case 'h': 2502 if (create) { 2503 predicate = HorizWS(); 2504 if (!inclass) 2505 root = newCharProperty(predicate); 2506 } 2507 return -1; 2508 case 'i': 2509 case 'j': 2510 break; 2511 case 'k': 2512 if (inclass) 2513 break; 2514 if (read() != '<') 2515 throw error("\\k is not followed by '<' for named capturing group"); 2516 String name = groupname(read()); 2517 if (!namedGroups().containsKey(name)) 2518 throw error("(named capturing group <"+ name+"> does not exit"); 2519 if (create) { 2520 hasGroupRef = true; 2521 if (has(CASE_INSENSITIVE)) 2522 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); 2523 else 2524 root = new BackRef(namedGroups().get(name)); 2525 } 2526 return -1; 2527 case 'l': 2528 case 'm': 2529 break; 2530 case 'n': 2531 return '\n'; 2532 case 'o': 2533 case 'p': 2534 case 'q': 2535 break; 2536 case 'r': 2537 return '\r'; 2538 case 's': 2539 if (create) { 2540 predicate = has(UNICODE_CHARACTER_CLASS) ? 2541 CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE(); 2542 if (!inclass) 2543 root = newCharProperty(predicate); 2544 } 2545 return -1; 2546 case 't': 2547 return '\t'; 2548 case 'u': 2549 return u(); 2550 case 'v': 2551 // '\v' was implemented as VT/0x0B in releases < 1.8 (though 2552 // undocumented). In JDK8 '\v' is specified as a predefined 2553 // character class for all vertical whitespace characters. 2554 // So [-1, root=VertWS node] pair is returned (instead of a 2555 // single 0x0B). This breaks the range if '\v' is used as 2556 // the start or end value, such as [\v-...] or [...-\v], in 2557 // which a single definite value (0x0B) is expected. For 2558 // compatibility concern '\013'/0x0B is returned if isrange. 2559 if (isrange) 2560 return '\013'; 2561 if (create) { 2562 predicate = VertWS(); 2563 if (!inclass) 2564 root = newCharProperty(predicate); 2565 } 2566 return -1; 2567 case 'w': 2568 if (create) { 2569 predicate = has(UNICODE_CHARACTER_CLASS) ? 2570 CharPredicates.WORD() : CharPredicates.ASCII_WORD(); 2571 if (!inclass) 2572 root = newCharProperty(predicate); 2573 } 2574 return -1; 2575 case 'x': 2576 return x(); 2577 case 'y': 2578 break; 2579 case 'z': 2580 if (inclass) break; 2581 if (create) root = new End(); 2582 return -1; 2583 default: 2584 return ch; 2585 } 2586 throw error("Illegal/unsupported escape sequence"); 2587 } 2588 2589 /** 2590 * Parse a character class, and return the node that matches it. 2591 * 2592 * Consumes a ] on the way out if consume is true. Usually consume 2593 * is true except for the case of [abc&&def] where def is a separate 2594 * right hand node with "understood" brackets. 2595 */ 2596 private CharPredicate clazz(boolean consume) { 2597 CharPredicate prev = null; 2598 CharPredicate curr = null; 2599 BitClass bits = new BitClass(); 2600 BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch]; 2601 2602 boolean isNeg = false; 2603 boolean hasBits = false; 2604 int ch = next(); 2605 2606 // Negates if first char in a class, otherwise literal 2607 if (ch == '^' && temp[cursor-1] == '[') { 2608 ch = next(); 2609 isNeg = true; 2610 } 2611 for (;;) { 2612 switch (ch) { 2613 case '[': 2614 curr = clazz(true); 2615 if (prev == null) 2616 prev = curr; 2617 else 2618 prev = prev.union(curr); 2619 ch = peek(); 2620 continue; 2621 case '&': 2622 ch = next(); 2623 if (ch == '&') { 2624 ch = next(); 2625 CharPredicate right = null; 2626 while (ch != ']' && ch != '&') { 2627 if (ch == '[') { 2628 if (right == null) 2629 right = clazz(true); 2630 else 2631 right = right.union(clazz(true)); 2632 } else { // abc&&def 2633 unread(); 2634 right = clazz(false); 2635 } 2636 ch = peek(); 2637 } 2638 if (hasBits) { 2639 // bits used, union has high precedence 2640 if (prev == null) { 2641 prev = curr = bitsP; 2642 } else { 2643 prev = prev.union(bitsP); 2644 } 2645 hasBits = false; 2646 } 2647 if (right != null) 2648 curr = right; 2649 if (prev == null) { 2650 if (right == null) 2651 throw error("Bad class syntax"); 2652 else 2653 prev = right; 2654 } else { 2655 prev = prev.and(curr); 2656 } 2657 } else { 2658 // treat as a literal & 2659 unread(); 2660 break; 2661 } 2662 continue; 2663 case 0: 2664 if (cursor >= patternLength) 2665 throw error("Unclosed character class"); 2666 break; 2667 case ']': 2668 if (prev != null || hasBits) { 2669 if (consume) 2670 next(); 2671 if (prev == null) 2672 prev = bitsP; 2673 else if (hasBits) 2674 prev = prev.union(bitsP); 2675 if (isNeg) 2676 return prev.negate(); 2677 return prev; 2678 } 2679 break; 2680 default: 2681 break; 2682 } 2683 curr = range(bits); 2684 if (curr == null) { // the bits used 2685 hasBits = true; 2686 } else { 2687 if (prev == null) 2688 prev = curr; 2689 else if (prev != curr) 2690 prev = prev.union(curr); 2691 } 2692 ch = peek(); 2693 } 2694 } 2695 2696 private CharPredicate bitsOrSingle(BitClass bits, int ch) { 2697 /* Bits can only handle codepoints in [u+0000-u+00ff] range. 2698 Use "single" node instead of bits when dealing with unicode 2699 case folding for codepoints listed below. 2700 (1)Uppercase out of range: u+00ff, u+00b5 2701 toUpperCase(u+00ff) -> u+0178 2702 toUpperCase(u+00b5) -> u+039c 2703 (2)LatinSmallLetterLongS u+17f 2704 toUpperCase(u+017f) -> u+0053 2705 (3)LatinSmallLetterDotlessI u+131 2706 toUpperCase(u+0131) -> u+0049 2707 (4)LatinCapitalLetterIWithDotAbove u+0130 2708 toLowerCase(u+0130) -> u+0069 2709 (5)KelvinSign u+212a 2710 toLowerCase(u+212a) ==> u+006B 2711 (6)AngstromSign u+212b 2712 toLowerCase(u+212b) ==> u+00e5 2713 */ 2714 if (ch < 256 && 2715 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && 2716 (ch == 0xff || ch == 0xb5 || 2717 ch == 0x49 || ch == 0x69 || //I and i 2718 ch == 0x53 || ch == 0x73 || //S and s 2719 ch == 0x4b || ch == 0x6b || //K and k 2720 ch == 0xc5 || ch == 0xe5))) { //A+ring 2721 bits.add(ch, flags()); 2722 return null; 2723 } 2724 return single(ch); 2725 } 2726 2727 /** 2728 * Returns a suitably optimized, single character predicate 2729 */ 2730 private CharPredicate single(final int ch) { 2731 if (has(CASE_INSENSITIVE)) { 2732 int lower, upper; 2733 if (has(UNICODE_CASE)) { 2734 upper = Character.toUpperCase(ch); 2735 lower = Character.toLowerCase(upper); 2736 // Unicode case insensitive matches 2737 if (upper != lower) 2738 return SingleU(lower); 2739 } else if (ASCII.isAscii(ch)) { 2740 lower = ASCII.toLower(ch); 2741 upper = ASCII.toUpper(ch); 2742 // Case insensitive matches a given BMP character 2743 if (lower != upper) 2744 return SingleI(lower, upper); 2745 } 2746 } 2747 if (isSupplementary(ch)) 2748 return SingleS(ch); 2749 return Single(ch); // Match a given BMP character 2750 } 2751 2752 /** 2753 * Parse a single character or a character range in a character class 2754 * and return its representative node. 2755 */ 2756 private CharPredicate range(BitClass bits) { 2757 int ch = peek(); 2758 if (ch == '\\') { 2759 ch = nextEscaped(); 2760 if (ch == 'p' || ch == 'P') { // A property 2761 boolean comp = (ch == 'P'); 2762 boolean oneLetter = true; 2763 // Consume { if present 2764 ch = next(); 2765 if (ch != '{') 2766 unread(); 2767 else 2768 oneLetter = false; 2769 return family(oneLetter, comp); 2770 } else { // ordinary escape 2771 boolean isrange = temp[cursor+1] == '-'; 2772 unread(); 2773 ch = escape(true, true, isrange); 2774 if (ch == -1) 2775 return predicate; 2776 } 2777 } else { 2778 next(); 2779 } 2780 if (ch >= 0) { 2781 if (peek() == '-') { 2782 int endRange = temp[cursor+1]; 2783 if (endRange == '[') { 2784 return bitsOrSingle(bits, ch); 2785 } 2786 if (endRange != ']') { 2787 next(); 2788 int m = peek(); 2789 if (m == '\\') { 2790 m = escape(true, false, true); 2791 } else { 2792 next(); 2793 } 2794 if (m < ch) { 2795 throw error("Illegal character range"); 2796 } 2797 if (has(CASE_INSENSITIVE)) { 2798 if (has(UNICODE_CASE)) 2799 return CIRangeU(ch, m); 2800 return CIRange(ch, m); 2801 } else { 2802 return Range(ch, m); 2803 } 2804 } 2805 } 2806 return bitsOrSingle(bits, ch); 2807 } 2808 throw error("Unexpected character '"+((char)ch)+"'"); 2809 } 2810 2811 /** 2812 * Parses a Unicode character family and returns its representative node. 2813 */ 2814 private CharPredicate family(boolean singleLetter, boolean isComplement) { 2815 next(); 2816 String name; 2817 CharPredicate p = null; 2818 2819 if (singleLetter) { 2820 int c = temp[cursor]; 2821 if (!Character.isSupplementaryCodePoint(c)) { 2822 name = String.valueOf((char)c); 2823 } else { 2824 name = new String(temp, cursor, 1); 2825 } 2826 read(); 2827 } else { 2828 int i = cursor; 2829 mark('}'); 2830 while(read() != '}') { 2831 } 2832 mark('\000'); 2833 int j = cursor; 2834 if (j > patternLength) 2835 throw error("Unclosed character family"); 2836 if (i + 1 >= j) 2837 throw error("Empty character family"); 2838 name = new String(temp, i, j-i-1); 2839 } 2840 2841 int i = name.indexOf('='); 2842 if (i != -1) { 2843 // property construct \p{name=value} 2844 String value = name.substring(i + 1); 2845 name = name.substring(0, i).toLowerCase(Locale.ENGLISH); 2846 switch (name) { 2847 case "sc": 2848 case "script": 2849 p = CharPredicates.forUnicodeScript(value); 2850 break; 2851 case "blk": 2852 case "block": 2853 p = CharPredicates.forUnicodeBlock(value); 2854 break; 2855 case "gc": 2856 case "general_category": 2857 p = CharPredicates.forProperty(value); 2858 break; 2859 default: 2860 break; 2861 } 2862 if (p == null) 2863 throw error("Unknown Unicode property {name=<" + name + ">, " 2864 + "value=<" + value + ">}"); 2865 2866 } else { 2867 if (name.startsWith("In")) { 2868 // \p{InBlockName} 2869 p = CharPredicates.forUnicodeBlock(name.substring(2)); 2870 } else if (name.startsWith("Is")) { 2871 // \p{IsGeneralCategory} and \p{IsScriptName} 2872 name = name.substring(2); 2873 p = CharPredicates.forUnicodeProperty(name); 2874 if (p == null) 2875 p = CharPredicates.forProperty(name); 2876 if (p == null) 2877 p = CharPredicates.forUnicodeScript(name); 2878 } else { 2879 if (has(UNICODE_CHARACTER_CLASS)) { 2880 p = CharPredicates.forPOSIXName(name); 2881 } 2882 if (p == null) 2883 p = CharPredicates.forProperty(name); 2884 } 2885 if (p == null) 2886 throw error("Unknown character property name {In/Is" + name + "}"); 2887 } 2888 if (isComplement) { 2889 // it might be too expensive to detect if a complement of 2890 // CharProperty can match "certain" supplementary. So just 2891 // go with StartS. 2892 hasSupplementary = true; 2893 p = p.negate(); 2894 } 2895 return p; 2896 } 2897 2898 private CharProperty newCharProperty(CharPredicate p) { 2899 if (p == null) 2900 return null; 2901 if (p instanceof BmpCharPredicate) 2902 return new BmpCharProperty((BmpCharPredicate)p); 2903 else 2904 return new CharProperty(p); 2905 } 2906 2907 /** 2908 * Parses and returns the name of a "named capturing group", the trailing 2909 * ">" is consumed after parsing. 2910 */ 2911 private String groupname(int ch) { 2912 StringBuilder sb = new StringBuilder(); 2913 sb.append(Character.toChars(ch)); 2914 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) || 2915 ASCII.isDigit(ch)) { 2916 sb.append(Character.toChars(ch)); 2917 } 2918 if (sb.length() == 0) 2919 throw error("named capturing group has 0 length name"); 2920 if (ch != '>') 2921 throw error("named capturing group is missing trailing '>'"); 2922 return sb.toString(); 2923 } 2924 2925 /** 2926 * Parses a group and returns the head node of a set of nodes that process 2927 * the group. Sometimes a double return system is used where the tail is 2928 * returned in root. 2929 */ 2930 private Node group0() { 2931 boolean capturingGroup = false; 2932 Node head = null; 2933 Node tail = null; 2934 int save = flags; 2935 int saveTCNCount = topClosureNodes.size(); 2936 root = null; 2937 int ch = next(); 2938 if (ch == '?') { 2939 ch = skip(); 2940 switch (ch) { 2941 case ':': // (?:xxx) pure group 2942 head = createGroup(true); 2943 tail = root; 2944 head.next = expr(tail); 2945 break; 2946 case '=': // (?=xxx) and (?!xxx) lookahead 2947 case '!': 2948 head = createGroup(true); 2949 tail = root; 2950 head.next = expr(tail); 2951 if (ch == '=') { 2952 head = tail = new Pos(head); 2953 } else { 2954 head = tail = new Neg(head); 2955 } 2956 break; 2957 case '>': // (?>xxx) independent group 2958 head = createGroup(true); 2959 tail = root; 2960 head.next = expr(tail); 2961 head = tail = new Ques(head, Qtype.INDEPENDENT); 2962 break; 2963 case '<': // (?<xxx) look behind 2964 ch = read(); 2965 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { 2966 // named captured group 2967 String name = groupname(ch); 2968 if (namedGroups().containsKey(name)) 2969 throw error("Named capturing group <" + name 2970 + "> is already defined"); 2971 capturingGroup = true; 2972 head = createGroup(false); 2973 tail = root; 2974 namedGroups().put(name, capturingGroupCount-1); 2975 head.next = expr(tail); 2976 break; 2977 } 2978 int start = cursor; 2979 head = createGroup(true); 2980 tail = root; 2981 head.next = expr(tail); 2982 tail.next = lookbehindEnd; 2983 TreeInfo info = new TreeInfo(); 2984 head.study(info); 2985 if (info.maxValid == false) { 2986 throw error("Look-behind group does not have " 2987 + "an obvious maximum length"); 2988 } 2989 boolean hasSupplementary = findSupplementary(start, patternLength); 2990 if (ch == '=') { 2991 head = tail = (hasSupplementary ? 2992 new BehindS(head, info.maxLength, 2993 info.minLength) : 2994 new Behind(head, info.maxLength, 2995 info.minLength)); 2996 } else if (ch == '!') { 2997 head = tail = (hasSupplementary ? 2998 new NotBehindS(head, info.maxLength, 2999 info.minLength) : 3000 new NotBehind(head, info.maxLength, 3001 info.minLength)); 3002 } else { 3003 throw error("Unknown look-behind group"); 3004 } 3005 // clear all top-closure-nodes inside lookbehind 3006 if (saveTCNCount < topClosureNodes.size()) 3007 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); 3008 break; 3009 case '$': 3010 case '@': 3011 throw error("Unknown group type"); 3012 default: // (?xxx:) inlined match flags 3013 unread(); 3014 addFlag(); 3015 ch = read(); 3016 if (ch == ')') { 3017 return null; // Inline modifier only 3018 } 3019 if (ch != ':') { 3020 throw error("Unknown inline modifier"); 3021 } 3022 head = createGroup(true); 3023 tail = root; 3024 head.next = expr(tail); 3025 break; 3026 } 3027 } else { // (xxx) a regular group 3028 capturingGroup = true; 3029 head = createGroup(false); 3030 tail = root; 3031 head.next = expr(tail); 3032 } 3033 3034 accept(')', "Unclosed group"); 3035 flags = save; 3036 3037 // Check for quantifiers 3038 Node node = closure(head); 3039 if (node == head) { // No closure 3040 root = tail; 3041 return node; // Dual return 3042 } 3043 if (head == tail) { // Zero length assertion 3044 root = node; 3045 return node; // Dual return 3046 } 3047 3048 // have group closure, clear all inner closure nodes from the 3049 // top list (no backtracking stopper optimization for inner 3050 if (saveTCNCount < topClosureNodes.size()) 3051 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); 3052 3053 if (node instanceof Ques) { 3054 Ques ques = (Ques) node; 3055 if (ques.type == Qtype.POSSESSIVE) { 3056 root = node; 3057 return node; 3058 } 3059 tail.next = new BranchConn(); 3060 tail = tail.next; 3061 if (ques.type == Qtype.GREEDY) { 3062 head = new Branch(head, null, tail); 3063 } else { // Reluctant quantifier 3064 head = new Branch(null, head, tail); 3065 } 3066 root = tail; 3067 return head; 3068 } else if (node instanceof Curly) { 3069 Curly curly = (Curly) node; 3070 if (curly.type == Qtype.POSSESSIVE) { 3071 root = node; 3072 return node; 3073 } 3074 // Discover if the group is deterministic 3075 TreeInfo info = new TreeInfo(); 3076 if (head.study(info)) { // Deterministic 3077 GroupTail temp = (GroupTail) tail; 3078 head = root = new GroupCurly(head.next, curly.cmin, 3079 curly.cmax, curly.type, 3080 ((GroupTail)tail).localIndex, 3081 ((GroupTail)tail).groupIndex, 3082 capturingGroup); 3083 return head; 3084 } else { // Non-deterministic 3085 int temp = ((GroupHead) head).localIndex; 3086 Loop loop; 3087 if (curly.type == Qtype.GREEDY) { 3088 loop = new Loop(this.localCount, temp); 3089 // add the max_reps greedy to the top-closure-node list 3090 if (curly.cmax == MAX_REPS) 3091 topClosureNodes.add(loop); 3092 } else { // Reluctant Curly 3093 loop = new LazyLoop(this.localCount, temp); 3094 } 3095 Prolog prolog = new Prolog(loop); 3096 this.localCount += 1; 3097 loop.cmin = curly.cmin; 3098 loop.cmax = curly.cmax; 3099 loop.body = head; 3100 tail.next = loop; 3101 root = loop; 3102 return prolog; // Dual return 3103 } 3104 } 3105 throw error("Internal logic error"); 3106 } 3107 3108 /** 3109 * Create group head and tail nodes using double return. If the group is 3110 * created with anonymous true then it is a pure group and should not 3111 * affect group counting. 3112 */ 3113 private Node createGroup(boolean anonymous) { 3114 int localIndex = localCount++; 3115 int groupIndex = 0; 3116 if (!anonymous) 3117 groupIndex = capturingGroupCount++; 3118 GroupHead head = new GroupHead(localIndex); 3119 root = new GroupTail(localIndex, groupIndex); 3120 3121 // for debug/print only, head.match does NOT need the "tail" info 3122 head.tail = (GroupTail)root; 3123 3124 if (!anonymous && groupIndex < 10) 3125 groupNodes[groupIndex] = head; 3126 return head; 3127 } 3128 3129 @SuppressWarnings("fallthrough") 3130 /** 3131 * Parses inlined match flags and set them appropriately. 3132 */ 3133 private void addFlag() { 3134 int ch = peek(); 3135 for (;;) { 3136 switch (ch) { 3137 case 'i': 3138 flags |= CASE_INSENSITIVE; 3139 break; 3140 case 'm': 3141 flags |= MULTILINE; 3142 break; 3143 case 's': 3144 flags |= DOTALL; 3145 break; 3146 case 'd': 3147 flags |= UNIX_LINES; 3148 break; 3149 case 'u': 3150 flags |= UNICODE_CASE; 3151 break; 3152 case 'c': 3153 flags |= CANON_EQ; 3154 break; 3155 case 'x': 3156 flags |= COMMENTS; 3157 break; 3158 case 'U': 3159 flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); 3160 break; 3161 case '-': // subFlag then fall through 3162 ch = next(); 3163 subFlag(); 3164 default: 3165 return; 3166 } 3167 ch = next(); 3168 } 3169 } 3170 3171 @SuppressWarnings("fallthrough") 3172 /** 3173 * Parses the second part of inlined match flags and turns off 3174 * flags appropriately. 3175 */ 3176 private void subFlag() { 3177 int ch = peek(); 3178 for (;;) { 3179 switch (ch) { 3180 case 'i': 3181 flags &= ~CASE_INSENSITIVE; 3182 break; 3183 case 'm': 3184 flags &= ~MULTILINE; 3185 break; 3186 case 's': 3187 flags &= ~DOTALL; 3188 break; 3189 case 'd': 3190 flags &= ~UNIX_LINES; 3191 break; 3192 case 'u': 3193 flags &= ~UNICODE_CASE; 3194 break; 3195 case 'c': 3196 flags &= ~CANON_EQ; 3197 break; 3198 case 'x': 3199 flags &= ~COMMENTS; 3200 break; 3201 case 'U': 3202 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); 3203 break; 3204 default: 3205 return; 3206 } 3207 ch = next(); 3208 } 3209 } 3210 3211 static final int MAX_REPS = 0x7FFFFFFF; 3212 3213 static enum Qtype { 3214 GREEDY, LAZY, POSSESSIVE, INDEPENDENT 3215 } 3216 3217 private Node curly(Node prev, int cmin) { 3218 int ch = next(); 3219 if (ch == '?') { 3220 next(); 3221 return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY); 3222 } else if (ch == '+') { 3223 next(); 3224 return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE); 3225 } 3226 if (prev instanceof BmpCharProperty) { 3227 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin); 3228 } else if (prev instanceof CharProperty) { 3229 return new CharPropertyGreedy((CharProperty)prev, cmin); 3230 } 3231 return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY); 3232 } 3233 3234 /** 3235 * Processes repetition. If the next character peeked is a quantifier 3236 * then new nodes must be appended to handle the repetition. 3237 * Prev could be a single or a group, so it could be a chain of nodes. 3238 */ 3239 private Node closure(Node prev) { 3240 Node atom; 3241 int ch = peek(); 3242 switch (ch) { 3243 case '?': 3244 ch = next(); 3245 if (ch == '?') { 3246 next(); 3247 return new Ques(prev, Qtype.LAZY); 3248 } else if (ch == '+') { 3249 next(); 3250 return new Ques(prev, Qtype.POSSESSIVE); 3251 } 3252 return new Ques(prev, Qtype.GREEDY); 3253 case '*': 3254 return curly(prev, 0); 3255 case '+': 3256 return curly(prev, 1); 3257 case '{': 3258 ch = temp[cursor+1]; 3259 if (ASCII.isDigit(ch)) { 3260 skip(); 3261 int cmin = 0; 3262 do { 3263 cmin = cmin * 10 + (ch - '0'); 3264 } while (ASCII.isDigit(ch = read())); 3265 int cmax = cmin; 3266 if (ch == ',') { 3267 ch = read(); 3268 cmax = MAX_REPS; 3269 if (ch != '}') { 3270 cmax = 0; 3271 while (ASCII.isDigit(ch)) { 3272 cmax = cmax * 10 + (ch - '0'); 3273 ch = read(); 3274 } 3275 } 3276 } 3277 if (ch != '}') 3278 throw error("Unclosed counted closure"); 3279 if (((cmin) | (cmax) | (cmax - cmin)) < 0) 3280 throw error("Illegal repetition range"); 3281 Curly curly; 3282 ch = peek(); 3283 if (ch == '?') { 3284 next(); 3285 curly = new Curly(prev, cmin, cmax, Qtype.LAZY); 3286 } else if (ch == '+') { 3287 next(); 3288 curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE); 3289 } else { 3290 curly = new Curly(prev, cmin, cmax, Qtype.GREEDY); 3291 } 3292 return curly; 3293 } else { 3294 throw error("Illegal repetition"); 3295 } 3296 default: 3297 return prev; 3298 } 3299 } 3300 3301 /** 3302 * Utility method for parsing control escape sequences. 3303 */ 3304 private int c() { 3305 if (cursor < patternLength) { 3306 return read() ^ 64; 3307 } 3308 throw error("Illegal control escape sequence"); 3309 } 3310 3311 /** 3312 * Utility method for parsing octal escape sequences. 3313 */ 3314 private int o() { 3315 int n = read(); 3316 if (((n-'0')|('7'-n)) >= 0) { 3317 int m = read(); 3318 if (((m-'0')|('7'-m)) >= 0) { 3319 int o = read(); 3320 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) { 3321 return (n - '0') * 64 + (m - '0') * 8 + (o - '0'); 3322 } 3323 unread(); 3324 return (n - '0') * 8 + (m - '0'); 3325 } 3326 unread(); 3327 return (n - '0'); 3328 } 3329 throw error("Illegal octal escape sequence"); 3330 } 3331 3332 /** 3333 * Utility method for parsing hexadecimal escape sequences. 3334 */ 3335 private int x() { 3336 int n = read(); 3337 if (ASCII.isHexDigit(n)) { 3338 int m = read(); 3339 if (ASCII.isHexDigit(m)) { 3340 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); 3341 } 3342 } else if (n == '{' && ASCII.isHexDigit(peek())) { 3343 int ch = 0; 3344 while (ASCII.isHexDigit(n = read())) { 3345 ch = (ch << 4) + ASCII.toDigit(n); 3346 if (ch > Character.MAX_CODE_POINT) 3347 throw error("Hexadecimal codepoint is too big"); 3348 } 3349 if (n != '}') 3350 throw error("Unclosed hexadecimal escape sequence"); 3351 return ch; 3352 } 3353 throw error("Illegal hexadecimal escape sequence"); 3354 } 3355 3356 /** 3357 * Utility method for parsing unicode escape sequences. 3358 */ 3359 private int cursor() { 3360 return cursor; 3361 } 3362 3363 private void setcursor(int pos) { 3364 cursor = pos; 3365 } 3366 3367 private int uxxxx() { 3368 int n = 0; 3369 for (int i = 0; i < 4; i++) { 3370 int ch = read(); 3371 if (!ASCII.isHexDigit(ch)) { 3372 throw error("Illegal Unicode escape sequence"); 3373 } 3374 n = n * 16 + ASCII.toDigit(ch); 3375 } 3376 return n; 3377 } 3378 3379 private int u() { 3380 int n = uxxxx(); 3381 if (Character.isHighSurrogate((char)n)) { 3382 int cur = cursor(); 3383 if (read() == '\\' && read() == 'u') { 3384 int n2 = uxxxx(); 3385 if (Character.isLowSurrogate((char)n2)) 3386 return Character.toCodePoint((char)n, (char)n2); 3387 } 3388 setcursor(cur); 3389 } 3390 return n; 3391 } 3392 3393 private int N() { 3394 if (read() == '{') { 3395 int i = cursor; 3396 while (cursor < patternLength && read() != '}') {} 3397 if (cursor > patternLength) 3398 throw error("Unclosed character name escape sequence"); 3399 String name = new String(temp, i, cursor - i - 1); 3400 try { 3401 return Character.codePointOf(name); 3402 } catch (IllegalArgumentException x) { 3403 throw error("Unknown character name [" + name + "]"); 3404 } 3405 } 3406 throw error("Illegal character name escape sequence"); 3407 } 3408 3409 // 3410 // Utility methods for code point support 3411 // 3412 private static final int countChars(CharSequence seq, int index, 3413 int lengthInCodePoints) { 3414 // optimization 3415 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) { 3416 assert (index >= 0 && index < seq.length()); 3417 return 1; 3418 } 3419 int length = seq.length(); 3420 int x = index; 3421 if (lengthInCodePoints >= 0) { 3422 assert (index >= 0 && index < length); 3423 for (int i = 0; x < length && i < lengthInCodePoints; i++) { 3424 if (Character.isHighSurrogate(seq.charAt(x++))) { 3425 if (x < length && Character.isLowSurrogate(seq.charAt(x))) { 3426 x++; 3427 } 3428 } 3429 } 3430 return x - index; 3431 } 3432 3433 assert (index >= 0 && index <= length); 3434 if (index == 0) { 3435 return 0; 3436 } 3437 int len = -lengthInCodePoints; 3438 for (int i = 0; x > 0 && i < len; i++) { 3439 if (Character.isLowSurrogate(seq.charAt(--x))) { 3440 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) { 3441 x--; 3442 } 3443 } 3444 } 3445 return index - x; 3446 } 3447 3448 private static final int countCodePoints(CharSequence seq) { 3449 int length = seq.length(); 3450 int n = 0; 3451 for (int i = 0; i < length; ) { 3452 n++; 3453 if (Character.isHighSurrogate(seq.charAt(i++))) { 3454 if (i < length && Character.isLowSurrogate(seq.charAt(i))) { 3455 i++; 3456 } 3457 } 3458 } 3459 return n; 3460 } 3461 3462 /** 3463 * Creates a bit vector for matching Latin-1 values. A normal BitClass 3464 * never matches values above Latin-1, and a complemented BitClass always 3465 * matches values above Latin-1. 3466 */ 3467 static final class BitClass extends BmpCharProperty { 3468 final boolean[] bits; 3469 BitClass() { 3470 this(new boolean[256]); 3471 } 3472 private BitClass(boolean[] bits) { 3473 super( ch -> ch < 256 && bits[ch]); 3474 this.bits = bits; 3475 } 3476 BitClass add(int c, int flags) { 3477 assert c >= 0 && c <= 255; 3478 if ((flags & CASE_INSENSITIVE) != 0) { 3479 if (ASCII.isAscii(c)) { 3480 bits[ASCII.toUpper(c)] = true; 3481 bits[ASCII.toLower(c)] = true; 3482 } else if ((flags & UNICODE_CASE) != 0) { 3483 bits[Character.toLowerCase(c)] = true; 3484 bits[Character.toUpperCase(c)] = true; 3485 } 3486 } 3487 bits[c] = true; 3488 return this; 3489 } 3490 } 3491 3492 /** 3493 * Utility method for creating a string slice matcher. 3494 */ 3495 private Node newSlice(int[] buf, int count, boolean hasSupplementary) { 3496 int[] tmp = new int[count]; 3497 if (has(CASE_INSENSITIVE)) { 3498 if (has(UNICODE_CASE)) { 3499 for (int i = 0; i < count; i++) { 3500 tmp[i] = Character.toLowerCase( 3501 Character.toUpperCase(buf[i])); 3502 } 3503 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp); 3504 } 3505 for (int i = 0; i < count; i++) { 3506 tmp[i] = ASCII.toLower(buf[i]); 3507 } 3508 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp); 3509 } 3510 for (int i = 0; i < count; i++) { 3511 tmp[i] = buf[i]; 3512 } 3513 return hasSupplementary ? new SliceS(tmp) : new Slice(tmp); 3514 } 3515 3516 /** 3517 * The following classes are the building components of the object 3518 * tree that represents a compiled regular expression. The object tree 3519 * is made of individual elements that handle constructs in the Pattern. 3520 * Each type of object knows how to match its equivalent construct with 3521 * the match() method. 3522 */ 3523 3524 /** 3525 * Base class for all node classes. Subclasses should override the match() 3526 * method as appropriate. This class is an accepting node, so its match() 3527 * always returns true. 3528 */ 3529 static class Node extends Object { 3530 Node next; 3531 Node() { 3532 next = Pattern.accept; 3533 } 3534 /** 3535 * This method implements the classic accept node. 3536 */ 3537 boolean match(Matcher matcher, int i, CharSequence seq) { 3538 matcher.last = i; 3539 matcher.groups[0] = matcher.first; 3540 matcher.groups[1] = matcher.last; 3541 return true; 3542 } 3543 /** 3544 * This method is good for all zero length assertions. 3545 */ 3546 boolean study(TreeInfo info) { 3547 if (next != null) { 3548 return next.study(info); 3549 } else { 3550 return info.deterministic; 3551 } 3552 } 3553 } 3554 3555 static class LastNode extends Node { 3556 /** 3557 * This method implements the classic accept node with 3558 * the addition of a check to see if the match occurred 3559 * using all of the input. 3560 */ 3561 boolean match(Matcher matcher, int i, CharSequence seq) { 3562 if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to) 3563 return false; 3564 matcher.last = i; 3565 matcher.groups[0] = matcher.first; 3566 matcher.groups[1] = matcher.last; 3567 return true; 3568 } 3569 } 3570 3571 /** 3572 * Used for REs that can start anywhere within the input string. 3573 * This basically tries to match repeatedly at each spot in the 3574 * input string, moving forward after each try. An anchored search 3575 * or a BnM will bypass this node completely. 3576 */ 3577 static class Start extends Node { 3578 int minLength; 3579 Start(Node node) { 3580 this.next = node; 3581 TreeInfo info = new TreeInfo(); 3582 next.study(info); 3583 minLength = info.minLength; 3584 } 3585 boolean match(Matcher matcher, int i, CharSequence seq) { 3586 if (i > matcher.to - minLength) { 3587 matcher.hitEnd = true; 3588 return false; 3589 } 3590 int guard = matcher.to - minLength; 3591 for (; i <= guard; i++) { 3592 if (next.match(matcher, i, seq)) { 3593 matcher.first = i; 3594 matcher.groups[0] = matcher.first; 3595 matcher.groups[1] = matcher.last; 3596 return true; 3597 } 3598 } 3599 matcher.hitEnd = true; 3600 return false; 3601 } 3602 boolean study(TreeInfo info) { 3603 next.study(info); 3604 info.maxValid = false; 3605 info.deterministic = false; 3606 return false; 3607 } 3608 } 3609 3610 /* 3611 * StartS supports supplementary characters, including unpaired surrogates. 3612 */ 3613 static final class StartS extends Start { 3614 StartS(Node node) { 3615 super(node); 3616 } 3617 boolean match(Matcher matcher, int i, CharSequence seq) { 3618 if (i > matcher.to - minLength) { 3619 matcher.hitEnd = true; 3620 return false; 3621 } 3622 int guard = matcher.to - minLength; 3623 while (i <= guard) { 3624 //if ((ret = next.match(matcher, i, seq)) || i == guard) 3625 if (next.match(matcher, i, seq)) { 3626 matcher.first = i; 3627 matcher.groups[0] = matcher.first; 3628 matcher.groups[1] = matcher.last; 3629 return true; 3630 } 3631 if (i == guard) 3632 break; 3633 // Optimization to move to the next character. This is 3634 // faster than countChars(seq, i, 1). 3635 if (Character.isHighSurrogate(seq.charAt(i++))) { 3636 if (i < seq.length() && 3637 Character.isLowSurrogate(seq.charAt(i))) { 3638 i++; 3639 } 3640 } 3641 } 3642 matcher.hitEnd = true; 3643 return false; 3644 } 3645 } 3646 3647 /** 3648 * Node to anchor at the beginning of input. This object implements the 3649 * match for a \A sequence, and the caret anchor will use this if not in 3650 * multiline mode. 3651 */ 3652 static final class Begin extends Node { 3653 boolean match(Matcher matcher, int i, CharSequence seq) { 3654 int fromIndex = (matcher.anchoringBounds) ? 3655 matcher.from : 0; 3656 if (i == fromIndex && next.match(matcher, i, seq)) { 3657 matcher.first = i; 3658 matcher.groups[0] = i; 3659 matcher.groups[1] = matcher.last; 3660 return true; 3661 } else { 3662 return false; 3663 } 3664 } 3665 } 3666 3667 /** 3668 * Node to anchor at the end of input. This is the absolute end, so this 3669 * should not match at the last newline before the end as $ will. 3670 */ 3671 static final class End extends Node { 3672 boolean match(Matcher matcher, int i, CharSequence seq) { 3673 int endIndex = (matcher.anchoringBounds) ? 3674 matcher.to : matcher.getTextLength(); 3675 if (i == endIndex) { 3676 matcher.hitEnd = true; 3677 return next.match(matcher, i, seq); 3678 } 3679 return false; 3680 } 3681 } 3682 3683 /** 3684 * Node to anchor at the beginning of a line. This is essentially the 3685 * object to match for the multiline ^. 3686 */ 3687 static final class Caret extends Node { 3688 boolean match(Matcher matcher, int i, CharSequence seq) { 3689 int startIndex = matcher.from; 3690 int endIndex = matcher.to; 3691 if (!matcher.anchoringBounds) { 3692 startIndex = 0; 3693 endIndex = matcher.getTextLength(); 3694 } 3695 // Perl does not match ^ at end of input even after newline 3696 if (i == endIndex) { 3697 matcher.hitEnd = true; 3698 return false; 3699 } 3700 if (i > startIndex) { 3701 char ch = seq.charAt(i-1); 3702 if (ch != '\n' && ch != '\r' 3703 && (ch|1) != '\u2029' 3704 && ch != '\u0085' ) { 3705 return false; 3706 } 3707 // Should treat /r/n as one newline 3708 if (ch == '\r' && seq.charAt(i) == '\n') 3709 return false; 3710 } 3711 return next.match(matcher, i, seq); 3712 } 3713 } 3714 3715 /** 3716 * Node to anchor at the beginning of a line when in unixdot mode. 3717 */ 3718 static final class UnixCaret extends Node { 3719 boolean match(Matcher matcher, int i, CharSequence seq) { 3720 int startIndex = matcher.from; 3721 int endIndex = matcher.to; 3722 if (!matcher.anchoringBounds) { 3723 startIndex = 0; 3724 endIndex = matcher.getTextLength(); 3725 } 3726 // Perl does not match ^ at end of input even after newline 3727 if (i == endIndex) { 3728 matcher.hitEnd = true; 3729 return false; 3730 } 3731 if (i > startIndex) { 3732 char ch = seq.charAt(i-1); 3733 if (ch != '\n') { 3734 return false; 3735 } 3736 } 3737 return next.match(matcher, i, seq); 3738 } 3739 } 3740 3741 /** 3742 * Node to match the location where the last match ended. 3743 * This is used for the \G construct. 3744 */ 3745 static final class LastMatch extends Node { 3746 boolean match(Matcher matcher, int i, CharSequence seq) { 3747 if (i != matcher.oldLast) 3748 return false; 3749 return next.match(matcher, i, seq); 3750 } 3751 } 3752 3753 /** 3754 * Node to anchor at the end of a line or the end of input based on the 3755 * multiline mode. 3756 * 3757 * When not in multiline mode, the $ can only match at the very end 3758 * of the input, unless the input ends in a line terminator in which 3759 * it matches right before the last line terminator. 3760 * 3761 * Note that \r\n is considered an atomic line terminator. 3762 * 3763 * Like ^ the $ operator matches at a position, it does not match the 3764 * line terminators themselves. 3765 */ 3766 static final class Dollar extends Node { 3767 boolean multiline; 3768 Dollar(boolean mul) { 3769 multiline = mul; 3770 } 3771 boolean match(Matcher matcher, int i, CharSequence seq) { 3772 int endIndex = (matcher.anchoringBounds) ? 3773 matcher.to : matcher.getTextLength(); 3774 if (!multiline) { 3775 if (i < endIndex - 2) 3776 return false; 3777 if (i == endIndex - 2) { 3778 char ch = seq.charAt(i); 3779 if (ch != '\r') 3780 return false; 3781 ch = seq.charAt(i + 1); 3782 if (ch != '\n') 3783 return false; 3784 } 3785 } 3786 // Matches before any line terminator; also matches at the 3787 // end of input 3788 // Before line terminator: 3789 // If multiline, we match here no matter what 3790 // If not multiline, fall through so that the end 3791 // is marked as hit; this must be a /r/n or a /n 3792 // at the very end so the end was hit; more input 3793 // could make this not match here 3794 if (i < endIndex) { 3795 char ch = seq.charAt(i); 3796 if (ch == '\n') { 3797 // No match between \r\n 3798 if (i > 0 && seq.charAt(i-1) == '\r') 3799 return false; 3800 if (multiline) 3801 return next.match(matcher, i, seq); 3802 } else if (ch == '\r' || ch == '\u0085' || 3803 (ch|1) == '\u2029') { 3804 if (multiline) 3805 return next.match(matcher, i, seq); 3806 } else { // No line terminator, no match 3807 return false; 3808 } 3809 } 3810 // Matched at current end so hit end 3811 matcher.hitEnd = true; 3812 // If a $ matches because of end of input, then more input 3813 // could cause it to fail! 3814 matcher.requireEnd = true; 3815 return next.match(matcher, i, seq); 3816 } 3817 boolean study(TreeInfo info) { 3818 next.study(info); 3819 return info.deterministic; 3820 } 3821 } 3822 3823 /** 3824 * Node to anchor at the end of a line or the end of input based on the 3825 * multiline mode when in unix lines mode. 3826 */ 3827 static final class UnixDollar extends Node { 3828 boolean multiline; 3829 UnixDollar(boolean mul) { 3830 multiline = mul; 3831 } 3832 boolean match(Matcher matcher, int i, CharSequence seq) { 3833 int endIndex = (matcher.anchoringBounds) ? 3834 matcher.to : matcher.getTextLength(); 3835 if (i < endIndex) { 3836 char ch = seq.charAt(i); 3837 if (ch == '\n') { 3838 // If not multiline, then only possible to 3839 // match at very end or one before end 3840 if (multiline == false && i != endIndex - 1) 3841 return false; 3842 // If multiline return next.match without setting 3843 // matcher.hitEnd 3844 if (multiline) 3845 return next.match(matcher, i, seq); 3846 } else { 3847 return false; 3848 } 3849 } 3850 // Matching because at the end or 1 before the end; 3851 // more input could change this so set hitEnd 3852 matcher.hitEnd = true; 3853 // If a $ matches because of end of input, then more input 3854 // could cause it to fail! 3855 matcher.requireEnd = true; 3856 return next.match(matcher, i, seq); 3857 } 3858 boolean study(TreeInfo info) { 3859 next.study(info); 3860 return info.deterministic; 3861 } 3862 } 3863 3864 /** 3865 * Node class that matches a Unicode line ending '\R' 3866 */ 3867 static final class LineEnding extends Node { 3868 boolean match(Matcher matcher, int i, CharSequence seq) { 3869 // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029]) 3870 if (i < matcher.to) { 3871 int ch = seq.charAt(i); 3872 if (ch == 0x0A || ch == 0x0B || ch == 0x0C || 3873 ch == 0x85 || ch == 0x2028 || ch == 0x2029) 3874 return next.match(matcher, i + 1, seq); 3875 if (ch == 0x0D) { 3876 i++; 3877 if (i < matcher.to) { 3878 if (seq.charAt(i) == 0x0A && 3879 next.match(matcher, i + 1, seq)) { 3880 return true; 3881 } 3882 } else { 3883 matcher.hitEnd = true; 3884 } 3885 return next.match(matcher, i, seq); 3886 } 3887 } else { 3888 matcher.hitEnd = true; 3889 } 3890 return false; 3891 } 3892 boolean study(TreeInfo info) { 3893 info.minLength++; 3894 info.maxLength += 2; 3895 return next.study(info); 3896 } 3897 } 3898 3899 /** 3900 * Abstract node class to match one character satisfying some 3901 * boolean property. 3902 */ 3903 static class CharProperty extends Node { 3904 CharPredicate predicate; 3905 3906 CharProperty (CharPredicate predicate) { 3907 this.predicate = predicate; 3908 } 3909 boolean match(Matcher matcher, int i, CharSequence seq) { 3910 if (i < matcher.to) { 3911 int ch = Character.codePointAt(seq, i); 3912 return predicate.is(ch) && 3913 next.match(matcher, i + Character.charCount(ch), seq); 3914 } else { 3915 matcher.hitEnd = true; 3916 return false; 3917 } 3918 } 3919 boolean study(TreeInfo info) { 3920 info.minLength++; 3921 info.maxLength++; 3922 return next.study(info); 3923 } 3924 } 3925 3926 /** 3927 * Optimized version of CharProperty that works only for 3928 * properties never satisfied by Supplementary characters. 3929 */ 3930 private static class BmpCharProperty extends CharProperty { 3931 BmpCharProperty (BmpCharPredicate predicate) { 3932 super(predicate); 3933 } 3934 boolean match(Matcher matcher, int i, CharSequence seq) { 3935 if (i < matcher.to) { 3936 return predicate.is(seq.charAt(i)) && 3937 next.match(matcher, i + 1, seq); 3938 } else { 3939 matcher.hitEnd = true; 3940 return false; 3941 } 3942 } 3943 } 3944 3945 private static class NFCCharProperty extends Node { 3946 CharPredicate predicate; 3947 NFCCharProperty (CharPredicate predicate) { 3948 this.predicate = predicate; 3949 } 3950 3951 boolean match(Matcher matcher, int i, CharSequence seq) { 3952 if (i < matcher.to) { 3953 int ch0 = Character.codePointAt(seq, i); 3954 int n = Character.charCount(ch0); 3955 int j = i + n; 3956 while (j < matcher.to) { 3957 int ch1 = Character.codePointAt(seq, j); 3958 if (Grapheme.isBoundary(ch0, ch1)) 3959 break; 3960 ch0 = ch1; 3961 j += Character.charCount(ch1); 3962 } 3963 if (i + n == j) { // single, assume nfc cp 3964 if (predicate.is(ch0)) 3965 return next.match(matcher, j, seq); 3966 } else { 3967 while (i + n < j) { 3968 String nfc = Normalizer.normalize( 3969 seq.toString().substring(i, j), Normalizer.Form.NFC); 3970 if (nfc.codePointCount(0, nfc.length()) == 1) { 3971 if (predicate.is(nfc.codePointAt(0)) && 3972 next.match(matcher, j, seq)) { 3973 return true; 3974 } 3975 } 3976 3977 ch0 = Character.codePointBefore(seq, j); 3978 j -= Character.charCount(ch0); 3979 } 3980 } 3981 if (j < matcher.to) 3982 return false; 3983 } 3984 matcher.hitEnd = true; 3985 return false; 3986 } 3987 3988 boolean study(TreeInfo info) { 3989 info.minLength++; 3990 info.deterministic = false; 3991 return next.study(info); 3992 } 3993 } 3994 3995 /** 3996 * Node class that matches an unicode extended grapheme cluster 3997 */ 3998 static class XGrapheme extends Node { 3999 boolean match(Matcher matcher, int i, CharSequence seq) { 4000 if (i < matcher.to) { 4001 int ch0 = Character.codePointAt(seq, i); 4002 i += Character.charCount(ch0); 4003 while (i < matcher.to) { 4004 int ch1 = Character.codePointAt(seq, i); 4005 if (Grapheme.isBoundary(ch0, ch1)) 4006 break; 4007 ch0 = ch1; 4008 i += Character.charCount(ch1); 4009 } 4010 return next.match(matcher, i, seq); 4011 } 4012 matcher.hitEnd = true; 4013 return false; 4014 } 4015 4016 boolean study(TreeInfo info) { 4017 info.minLength++; 4018 info.deterministic = false; 4019 return next.study(info); 4020 } 4021 } 4022 4023 /** 4024 * Node class that handles grapheme boundaries 4025 */ 4026 static class GraphemeBound extends Node { 4027 boolean match(Matcher matcher, int i, CharSequence seq) { 4028 int startIndex = matcher.from; 4029 int endIndex = matcher.to; 4030 if (matcher.transparentBounds) { 4031 startIndex = 0; 4032 endIndex = matcher.getTextLength(); 4033 } 4034 if (i == startIndex) { 4035 return next.match(matcher, i, seq); 4036 } 4037 if (i < endIndex) { 4038 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) || 4039 !Grapheme.isBoundary(Character.codePointBefore(seq, i), 4040 Character.codePointAt(seq, i))) { 4041 return false; 4042 } 4043 } else { 4044 matcher.hitEnd = true; 4045 matcher.requireEnd = true; 4046 } 4047 return next.match(matcher, i, seq); 4048 } 4049 } 4050 4051 /** 4052 * Base class for all Slice nodes 4053 */ 4054 static class SliceNode extends Node { 4055 int[] buffer; 4056 SliceNode(int[] buf) { 4057 buffer = buf; 4058 } 4059 boolean study(TreeInfo info) { 4060 info.minLength += buffer.length; 4061 info.maxLength += buffer.length; 4062 return next.study(info); 4063 } 4064 } 4065 4066 /** 4067 * Node class for a case sensitive/BMP-only sequence of literal 4068 * characters. 4069 */ 4070 static class Slice extends SliceNode { 4071 Slice(int[] buf) { 4072 super(buf); 4073 } 4074 boolean match(Matcher matcher, int i, CharSequence seq) { 4075 int[] buf = buffer; 4076 int len = buf.length; 4077 for (int j=0; j<len; j++) { 4078 if ((i+j) >= matcher.to) { 4079 matcher.hitEnd = true; 4080 return false; 4081 } 4082 if (buf[j] != seq.charAt(i+j)) 4083 return false; 4084 } 4085 return next.match(matcher, i+len, seq); 4086 } 4087 } 4088 4089 /** 4090 * Node class for a case_insensitive/BMP-only sequence of literal 4091 * characters. 4092 */ 4093 static class SliceI extends SliceNode { 4094 SliceI(int[] buf) { 4095 super(buf); 4096 } 4097 boolean match(Matcher matcher, int i, CharSequence seq) { 4098 int[] buf = buffer; 4099 int len = buf.length; 4100 for (int j=0; j<len; j++) { 4101 if ((i+j) >= matcher.to) { 4102 matcher.hitEnd = true; 4103 return false; 4104 } 4105 int c = seq.charAt(i+j); 4106 if (buf[j] != c && 4107 buf[j] != ASCII.toLower(c)) 4108 return false; 4109 } 4110 return next.match(matcher, i+len, seq); 4111 } 4112 } 4113 4114 /** 4115 * Node class for a unicode_case_insensitive/BMP-only sequence of 4116 * literal characters. Uses unicode case folding. 4117 */ 4118 static final class SliceU extends SliceNode { 4119 SliceU(int[] buf) { 4120 super(buf); 4121 } 4122 boolean match(Matcher matcher, int i, CharSequence seq) { 4123 int[] buf = buffer; 4124 int len = buf.length; 4125 for (int j=0; j<len; j++) { 4126 if ((i+j) >= matcher.to) { 4127 matcher.hitEnd = true; 4128 return false; 4129 } 4130 int c = seq.charAt(i+j); 4131 if (buf[j] != c && 4132 buf[j] != Character.toLowerCase(Character.toUpperCase(c))) 4133 return false; 4134 } 4135 return next.match(matcher, i+len, seq); 4136 } 4137 } 4138 4139 /** 4140 * Node class for a case sensitive sequence of literal characters 4141 * including supplementary characters. 4142 */ 4143 static final class SliceS extends Slice { 4144 SliceS(int[] buf) { 4145 super(buf); 4146 } 4147 boolean match(Matcher matcher, int i, CharSequence seq) { 4148 int[] buf = buffer; 4149 int x = i; 4150 for (int j = 0; j < buf.length; j++) { 4151 if (x >= matcher.to) { 4152 matcher.hitEnd = true; 4153 return false; 4154 } 4155 int c = Character.codePointAt(seq, x); 4156 if (buf[j] != c) 4157 return false; 4158 x += Character.charCount(c); 4159 if (x > matcher.to) { 4160 matcher.hitEnd = true; 4161 return false; 4162 } 4163 } 4164 return next.match(matcher, x, seq); 4165 } 4166 } 4167 4168 /** 4169 * Node class for a case insensitive sequence of literal characters 4170 * including supplementary characters. 4171 */ 4172 static class SliceIS extends SliceNode { 4173 SliceIS(int[] buf) { 4174 super(buf); 4175 } 4176 int toLower(int c) { 4177 return ASCII.toLower(c); 4178 } 4179 boolean match(Matcher matcher, int i, CharSequence seq) { 4180 int[] buf = buffer; 4181 int x = i; 4182 for (int j = 0; j < buf.length; j++) { 4183 if (x >= matcher.to) { 4184 matcher.hitEnd = true; 4185 return false; 4186 } 4187 int c = Character.codePointAt(seq, x); 4188 if (buf[j] != c && buf[j] != toLower(c)) 4189 return false; 4190 x += Character.charCount(c); 4191 if (x > matcher.to) { 4192 matcher.hitEnd = true; 4193 return false; 4194 } 4195 } 4196 return next.match(matcher, x, seq); 4197 } 4198 } 4199 4200 /** 4201 * Node class for a case insensitive sequence of literal characters. 4202 * Uses unicode case folding. 4203 */ 4204 static final class SliceUS extends SliceIS { 4205 SliceUS(int[] buf) { 4206 super(buf); 4207 } 4208 int toLower(int c) { 4209 return Character.toLowerCase(Character.toUpperCase(c)); 4210 } 4211 } 4212 4213 /** 4214 * The 0 or 1 quantifier. This one class implements all three types. 4215 */ 4216 static final class Ques extends Node { 4217 Node atom; 4218 Qtype type; 4219 Ques(Node node, Qtype type) { 4220 this.atom = node; 4221 this.type = type; 4222 } 4223 boolean match(Matcher matcher, int i, CharSequence seq) { 4224 switch (type) { 4225 case GREEDY: 4226 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)) 4227 || next.match(matcher, i, seq); 4228 case LAZY: 4229 return next.match(matcher, i, seq) 4230 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)); 4231 case POSSESSIVE: 4232 if (atom.match(matcher, i, seq)) i = matcher.last; 4233 return next.match(matcher, i, seq); 4234 default: 4235 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); 4236 } 4237 } 4238 boolean study(TreeInfo info) { 4239 if (type != Qtype.INDEPENDENT) { 4240 int minL = info.minLength; 4241 atom.study(info); 4242 info.minLength = minL; 4243 info.deterministic = false; 4244 return next.study(info); 4245 } else { 4246 atom.study(info); 4247 return next.study(info); 4248 } 4249 } 4250 } 4251 4252 /** 4253 * Handles the greedy style repetition with the minimum either be 4254 * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier. 4255 */ 4256 static class CharPropertyGreedy extends Node { 4257 final CharPredicate predicate; 4258 final int cmin; 4259 4260 CharPropertyGreedy(CharProperty cp, int cmin) { 4261 this.predicate = cp.predicate; 4262 this.cmin = cmin; 4263 } 4264 boolean match(Matcher matcher, int i, CharSequence seq) { 4265 int n = 0; 4266 int to = matcher.to; 4267 // greedy, all the way down 4268 while (i < to) { 4269 int ch = Character.codePointAt(seq, i); 4270 if (!predicate.is(ch)) 4271 break; 4272 i += Character.charCount(ch); 4273 n++; 4274 } 4275 if (i >= to) { 4276 matcher.hitEnd = true; 4277 } 4278 while (n >= cmin) { 4279 if (next.match(matcher, i, seq)) 4280 return true; 4281 if (n == cmin) 4282 return false; 4283 // backing off if match fails 4284 int ch = Character.codePointBefore(seq, i); 4285 i -= Character.charCount(ch); 4286 n--; 4287 } 4288 return false; 4289 } 4290 4291 boolean study(TreeInfo info) { 4292 info.minLength += cmin; 4293 if (info.maxValid) { 4294 info.maxLength += MAX_REPS; 4295 } 4296 info.deterministic = false; 4297 return next.study(info); 4298 } 4299 } 4300 4301 static final class BmpCharPropertyGreedy extends CharPropertyGreedy { 4302 4303 BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) { 4304 super(bcp, cmin); 4305 } 4306 4307 boolean match(Matcher matcher, int i, CharSequence seq) { 4308 int n = 0; 4309 int to = matcher.to; 4310 while (i < to && predicate.is(seq.charAt(i))) { 4311 i++; n++; 4312 } 4313 if (i >= to) { 4314 matcher.hitEnd = true; 4315 } 4316 while (n >= cmin) { 4317 if (next.match(matcher, i, seq)) 4318 return true; 4319 i--; n--; // backing off if match fails 4320 } 4321 return false; 4322 } 4323 } 4324 4325 /** 4326 * Handles the curly-brace style repetition with a specified minimum and 4327 * maximum occurrences. The * quantifier is handled as a special case. 4328 * This class handles the three types. 4329 */ 4330 static final class Curly extends Node { 4331 Node atom; 4332 Qtype type; 4333 int cmin; 4334 int cmax; 4335 4336 Curly(Node node, int cmin, int cmax, Qtype type) { 4337 this.atom = node; 4338 this.type = type; 4339 this.cmin = cmin; 4340 this.cmax = cmax; 4341 } 4342 boolean match(Matcher matcher, int i, CharSequence seq) { 4343 int j; 4344 for (j = 0; j < cmin; j++) { 4345 if (atom.match(matcher, i, seq)) { 4346 i = matcher.last; 4347 continue; 4348 } 4349 return false; 4350 } 4351 if (type == Qtype.GREEDY) 4352 return match0(matcher, i, j, seq); 4353 else if (type == Qtype.LAZY) 4354 return match1(matcher, i, j, seq); 4355 else 4356 return match2(matcher, i, j, seq); 4357 } 4358 // Greedy match. 4359 // i is the index to start matching at 4360 // j is the number of atoms that have matched 4361 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4362 if (j >= cmax) { 4363 // We have matched the maximum... continue with the rest of 4364 // the regular expression 4365 return next.match(matcher, i, seq); 4366 } 4367 int backLimit = j; 4368 while (atom.match(matcher, i, seq)) { 4369 // k is the length of this match 4370 int k = matcher.last - i; 4371 if (k == 0) // Zero length match 4372 break; 4373 // Move up index and number matched 4374 i = matcher.last; 4375 j++; 4376 // We are greedy so match as many as we can 4377 while (j < cmax) { 4378 if (!atom.match(matcher, i, seq)) 4379 break; 4380 if (i + k != matcher.last) { 4381 if (match0(matcher, matcher.last, j+1, seq)) 4382 return true; 4383 break; 4384 } 4385 i += k; 4386 j++; 4387 } 4388 // Handle backing off if match fails 4389 while (j >= backLimit) { 4390 if (next.match(matcher, i, seq)) 4391 return true; 4392 i -= k; 4393 j--; 4394 } 4395 return false; 4396 } 4397 return next.match(matcher, i, seq); 4398 } 4399 // Reluctant match. At this point, the minimum has been satisfied. 4400 // i is the index to start matching at 4401 // j is the number of atoms that have matched 4402 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4403 for (;;) { 4404 // Try finishing match without consuming any more 4405 if (next.match(matcher, i, seq)) 4406 return true; 4407 // At the maximum, no match found 4408 if (j >= cmax) 4409 return false; 4410 // Okay, must try one more atom 4411 if (!atom.match(matcher, i, seq)) 4412 return false; 4413 // If we haven't moved forward then must break out 4414 if (i == matcher.last) 4415 return false; 4416 // Move up index and number matched 4417 i = matcher.last; 4418 j++; 4419 } 4420 } 4421 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4422 for (; j < cmax; j++) { 4423 if (!atom.match(matcher, i, seq)) 4424 break; 4425 if (i == matcher.last) 4426 break; 4427 i = matcher.last; 4428 } 4429 return next.match(matcher, i, seq); 4430 } 4431 boolean study(TreeInfo info) { 4432 // Save original info 4433 int minL = info.minLength; 4434 int maxL = info.maxLength; 4435 boolean maxV = info.maxValid; 4436 boolean detm = info.deterministic; 4437 info.reset(); 4438 4439 atom.study(info); 4440 4441 int temp = info.minLength * cmin + minL; 4442 if (temp < minL) { 4443 temp = 0xFFFFFFF; // arbitrary large number 4444 } 4445 info.minLength = temp; 4446 4447 if (maxV & info.maxValid) { 4448 temp = info.maxLength * cmax + maxL; 4449 info.maxLength = temp; 4450 if (temp < maxL) { 4451 info.maxValid = false; 4452 } 4453 } else { 4454 info.maxValid = false; 4455 } 4456 4457 if (info.deterministic && cmin == cmax) 4458 info.deterministic = detm; 4459 else 4460 info.deterministic = false; 4461 return next.study(info); 4462 } 4463 } 4464 4465 /** 4466 * Handles the curly-brace style repetition with a specified minimum and 4467 * maximum occurrences in deterministic cases. This is an iterative 4468 * optimization over the Prolog and Loop system which would handle this 4469 * in a recursive way. The * quantifier is handled as a special case. 4470 * If capture is true then this class saves group settings and ensures 4471 * that groups are unset when backing off of a group match. 4472 */ 4473 static final class GroupCurly extends Node { 4474 Node atom; 4475 Qtype type; 4476 int cmin; 4477 int cmax; 4478 int localIndex; 4479 int groupIndex; 4480 boolean capture; 4481 4482 GroupCurly(Node node, int cmin, int cmax, Qtype type, int local, 4483 int group, boolean capture) { 4484 this.atom = node; 4485 this.type = type; 4486 this.cmin = cmin; 4487 this.cmax = cmax; 4488 this.localIndex = local; 4489 this.groupIndex = group; 4490 this.capture = capture; 4491 } 4492 boolean match(Matcher matcher, int i, CharSequence seq) { 4493 int[] groups = matcher.groups; 4494 int[] locals = matcher.locals; 4495 int save0 = locals[localIndex]; 4496 int save1 = 0; 4497 int save2 = 0; 4498 4499 if (capture) { 4500 save1 = groups[groupIndex]; 4501 save2 = groups[groupIndex+1]; 4502 } 4503 4504 // Notify GroupTail there is no need to setup group info 4505 // because it will be set here 4506 locals[localIndex] = -1; 4507 4508 boolean ret = true; 4509 for (int j = 0; j < cmin; j++) { 4510 if (atom.match(matcher, i, seq)) { 4511 if (capture) { 4512 groups[groupIndex] = i; 4513 groups[groupIndex+1] = matcher.last; 4514 } 4515 i = matcher.last; 4516 } else { 4517 ret = false; 4518 break; 4519 } 4520 } 4521 if (ret) { 4522 if (type == Qtype.GREEDY) { 4523 ret = match0(matcher, i, cmin, seq); 4524 } else if (type == Qtype.LAZY) { 4525 ret = match1(matcher, i, cmin, seq); 4526 } else { 4527 ret = match2(matcher, i, cmin, seq); 4528 } 4529 } 4530 if (!ret) { 4531 locals[localIndex] = save0; 4532 if (capture) { 4533 groups[groupIndex] = save1; 4534 groups[groupIndex+1] = save2; 4535 } 4536 } 4537 return ret; 4538 } 4539 // Aggressive group match 4540 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4541 // don't back off passing the starting "j" 4542 int min = j; 4543 int[] groups = matcher.groups; 4544 int save0 = 0; 4545 int save1 = 0; 4546 if (capture) { 4547 save0 = groups[groupIndex]; 4548 save1 = groups[groupIndex+1]; 4549 } 4550 for (;;) { 4551 if (j >= cmax) 4552 break; 4553 if (!atom.match(matcher, i, seq)) 4554 break; 4555 int k = matcher.last - i; 4556 if (k <= 0) { 4557 if (capture) { 4558 groups[groupIndex] = i; 4559 groups[groupIndex+1] = i + k; 4560 } 4561 i = i + k; 4562 break; 4563 } 4564 for (;;) { 4565 if (capture) { 4566 groups[groupIndex] = i; 4567 groups[groupIndex+1] = i + k; 4568 } 4569 i = i + k; 4570 if (++j >= cmax) 4571 break; 4572 if (!atom.match(matcher, i, seq)) 4573 break; 4574 if (i + k != matcher.last) { 4575 if (match0(matcher, i, j, seq)) 4576 return true; 4577 break; 4578 } 4579 } 4580 while (j > min) { 4581 if (next.match(matcher, i, seq)) { 4582 if (capture) { 4583 groups[groupIndex+1] = i; 4584 groups[groupIndex] = i - k; 4585 } 4586 return true; 4587 } 4588 // backing off 4589 i = i - k; 4590 if (capture) { 4591 groups[groupIndex+1] = i; 4592 groups[groupIndex] = i - k; 4593 } 4594 j--; 4595 4596 } 4597 break; 4598 } 4599 if (capture) { 4600 groups[groupIndex] = save0; 4601 groups[groupIndex+1] = save1; 4602 } 4603 return next.match(matcher, i, seq); 4604 } 4605 // Reluctant matching 4606 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4607 for (;;) { 4608 if (next.match(matcher, i, seq)) 4609 return true; 4610 if (j >= cmax) 4611 return false; 4612 if (!atom.match(matcher, i, seq)) 4613 return false; 4614 if (i == matcher.last) 4615 return false; 4616 if (capture) { 4617 matcher.groups[groupIndex] = i; 4618 matcher.groups[groupIndex+1] = matcher.last; 4619 } 4620 i = matcher.last; 4621 j++; 4622 } 4623 } 4624 // Possessive matching 4625 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4626 for (; j < cmax; j++) { 4627 if (!atom.match(matcher, i, seq)) { 4628 break; 4629 } 4630 if (capture) { 4631 matcher.groups[groupIndex] = i; 4632 matcher.groups[groupIndex+1] = matcher.last; 4633 } 4634 if (i == matcher.last) { 4635 break; 4636 } 4637 i = matcher.last; 4638 } 4639 return next.match(matcher, i, seq); 4640 } 4641 boolean study(TreeInfo info) { 4642 // Save original info 4643 int minL = info.minLength; 4644 int maxL = info.maxLength; 4645 boolean maxV = info.maxValid; 4646 boolean detm = info.deterministic; 4647 info.reset(); 4648 4649 atom.study(info); 4650 4651 int temp = info.minLength * cmin + minL; 4652 if (temp < minL) { 4653 temp = 0xFFFFFFF; // Arbitrary large number 4654 } 4655 info.minLength = temp; 4656 4657 if (maxV & info.maxValid) { 4658 temp = info.maxLength * cmax + maxL; 4659 info.maxLength = temp; 4660 if (temp < maxL) { 4661 info.maxValid = false; 4662 } 4663 } else { 4664 info.maxValid = false; 4665 } 4666 4667 if (info.deterministic && cmin == cmax) { 4668 info.deterministic = detm; 4669 } else { 4670 info.deterministic = false; 4671 } 4672 return next.study(info); 4673 } 4674 } 4675 4676 /** 4677 * A Guard node at the end of each atom node in a Branch. It 4678 * serves the purpose of chaining the "match" operation to 4679 * "next" but not the "study", so we can collect the TreeInfo 4680 * of each atom node without including the TreeInfo of the 4681 * "next". 4682 */ 4683 static final class BranchConn extends Node { 4684 BranchConn() {}; 4685 boolean match(Matcher matcher, int i, CharSequence seq) { 4686 return next.match(matcher, i, seq); 4687 } 4688 boolean study(TreeInfo info) { 4689 return info.deterministic; 4690 } 4691 } 4692 4693 /** 4694 * Handles the branching of alternations. Note this is also used for 4695 * the ? quantifier to branch between the case where it matches once 4696 * and where it does not occur. 4697 */ 4698 static final class Branch extends Node { 4699 Node[] atoms = new Node[2]; 4700 int size = 2; 4701 Node conn; 4702 Branch(Node first, Node second, Node branchConn) { 4703 conn = branchConn; 4704 atoms[0] = first; 4705 atoms[1] = second; 4706 } 4707 4708 void add(Node node) { 4709 if (size >= atoms.length) { 4710 Node[] tmp = new Node[atoms.length*2]; 4711 System.arraycopy(atoms, 0, tmp, 0, atoms.length); 4712 atoms = tmp; 4713 } 4714 atoms[size++] = node; 4715 } 4716 4717 boolean match(Matcher matcher, int i, CharSequence seq) { 4718 for (int n = 0; n < size; n++) { 4719 if (atoms[n] == null) { 4720 if (conn.next.match(matcher, i, seq)) 4721 return true; 4722 } else if (atoms[n].match(matcher, i, seq)) { 4723 return true; 4724 } 4725 } 4726 return false; 4727 } 4728 4729 boolean study(TreeInfo info) { 4730 int minL = info.minLength; 4731 int maxL = info.maxLength; 4732 boolean maxV = info.maxValid; 4733 4734 int minL2 = Integer.MAX_VALUE; //arbitrary large enough num 4735 int maxL2 = -1; 4736 for (int n = 0; n < size; n++) { 4737 info.reset(); 4738 if (atoms[n] != null) 4739 atoms[n].study(info); 4740 minL2 = Math.min(minL2, info.minLength); 4741 maxL2 = Math.max(maxL2, info.maxLength); 4742 maxV = (maxV & info.maxValid); 4743 } 4744 4745 minL += minL2; 4746 maxL += maxL2; 4747 4748 info.reset(); 4749 conn.next.study(info); 4750 4751 info.minLength += minL; 4752 info.maxLength += maxL; 4753 info.maxValid &= maxV; 4754 info.deterministic = false; 4755 return false; 4756 } 4757 } 4758 4759 /** 4760 * The GroupHead saves the location where the group begins in the locals 4761 * and restores them when the match is done. 4762 * 4763 * The matchRef is used when a reference to this group is accessed later 4764 * in the expression. The locals will have a negative value in them to 4765 * indicate that we do not want to unset the group if the reference 4766 * doesn't match. 4767 */ 4768 static final class GroupHead extends Node { 4769 int localIndex; 4770 GroupTail tail; // for debug/print only, match does not need to know 4771 GroupHead(int localCount) { 4772 localIndex = localCount; 4773 } 4774 boolean match(Matcher matcher, int i, CharSequence seq) { 4775 int save = matcher.locals[localIndex]; 4776 matcher.locals[localIndex] = i; 4777 boolean ret = next.match(matcher, i, seq); 4778 matcher.locals[localIndex] = save; 4779 return ret; 4780 } 4781 boolean matchRef(Matcher matcher, int i, CharSequence seq) { 4782 int save = matcher.locals[localIndex]; 4783 matcher.locals[localIndex] = ~i; // HACK 4784 boolean ret = next.match(matcher, i, seq); 4785 matcher.locals[localIndex] = save; 4786 return ret; 4787 } 4788 } 4789 4790 /** 4791 * Recursive reference to a group in the regular expression. It calls 4792 * matchRef because if the reference fails to match we would not unset 4793 * the group. 4794 */ 4795 static final class GroupRef extends Node { 4796 GroupHead head; 4797 GroupRef(GroupHead head) { 4798 this.head = head; 4799 } 4800 boolean match(Matcher matcher, int i, CharSequence seq) { 4801 return head.matchRef(matcher, i, seq) 4802 && next.match(matcher, matcher.last, seq); 4803 } 4804 boolean study(TreeInfo info) { 4805 info.maxValid = false; 4806 info.deterministic = false; 4807 return next.study(info); 4808 } 4809 } 4810 4811 /** 4812 * The GroupTail handles the setting of group beginning and ending 4813 * locations when groups are successfully matched. It must also be able to 4814 * unset groups that have to be backed off of. 4815 * 4816 * The GroupTail node is also used when a previous group is referenced, 4817 * and in that case no group information needs to be set. 4818 */ 4819 static final class GroupTail extends Node { 4820 int localIndex; 4821 int groupIndex; 4822 GroupTail(int localCount, int groupCount) { 4823 localIndex = localCount; 4824 groupIndex = groupCount + groupCount; 4825 } 4826 boolean match(Matcher matcher, int i, CharSequence seq) { 4827 int tmp = matcher.locals[localIndex]; 4828 if (tmp >= 0) { // This is the normal group case. 4829 // Save the group so we can unset it if it 4830 // backs off of a match. 4831 int groupStart = matcher.groups[groupIndex]; 4832 int groupEnd = matcher.groups[groupIndex+1]; 4833 4834 matcher.groups[groupIndex] = tmp; 4835 matcher.groups[groupIndex+1] = i; 4836 if (next.match(matcher, i, seq)) { 4837 return true; 4838 } 4839 matcher.groups[groupIndex] = groupStart; 4840 matcher.groups[groupIndex+1] = groupEnd; 4841 return false; 4842 } else { 4843 // This is a group reference case. We don't need to save any 4844 // group info because it isn't really a group. 4845 matcher.last = i; 4846 return true; 4847 } 4848 } 4849 } 4850 4851 /** 4852 * This sets up a loop to handle a recursive quantifier structure. 4853 */ 4854 static final class Prolog extends Node { 4855 Loop loop; 4856 Prolog(Loop loop) { 4857 this.loop = loop; 4858 } 4859 boolean match(Matcher matcher, int i, CharSequence seq) { 4860 return loop.matchInit(matcher, i, seq); 4861 } 4862 boolean study(TreeInfo info) { 4863 return loop.study(info); 4864 } 4865 } 4866 4867 /** 4868 * Handles the repetition count for a greedy Curly. The matchInit 4869 * is called from the Prolog to save the index of where the group 4870 * beginning is stored. A zero length group check occurs in the 4871 * normal match but is skipped in the matchInit. 4872 */ 4873 static class Loop extends Node { 4874 Node body; 4875 int countIndex; // local count index in matcher locals 4876 int beginIndex; // group beginning index 4877 int cmin, cmax; 4878 int posIndex; 4879 Loop(int countIndex, int beginIndex) { 4880 this.countIndex = countIndex; 4881 this.beginIndex = beginIndex; 4882 this.posIndex = -1; 4883 } 4884 boolean match(Matcher matcher, int i, CharSequence seq) { 4885 // Avoid infinite loop in zero-length case. 4886 if (i > matcher.locals[beginIndex]) { 4887 int count = matcher.locals[countIndex]; 4888 4889 // This block is for before we reach the minimum 4890 // iterations required for the loop to match 4891 if (count < cmin) { 4892 matcher.locals[countIndex] = count + 1; 4893 boolean b = body.match(matcher, i, seq); 4894 // If match failed we must backtrack, so 4895 // the loop count should NOT be incremented 4896 if (!b) 4897 matcher.locals[countIndex] = count; 4898 // Return success or failure since we are under 4899 // minimum 4900 return b; 4901 } 4902 // This block is for after we have the minimum 4903 // iterations required for the loop to match 4904 if (count < cmax) { 4905 // Let's check if we have already tried and failed 4906 // at this starting position "i" in the past. 4907 // If yes, then just return false wihtout trying 4908 // again, to stop the exponential backtracking. 4909 if (posIndex != -1 && 4910 matcher.localsPos[posIndex].contains(i)) { 4911 return next.match(matcher, i, seq); 4912 } 4913 matcher.locals[countIndex] = count + 1; 4914 boolean b = body.match(matcher, i, seq); 4915 // If match failed we must backtrack, so 4916 // the loop count should NOT be incremented 4917 if (b) 4918 return true; 4919 matcher.locals[countIndex] = count; 4920 // save the failed position 4921 if (posIndex != -1) { 4922 matcher.localsPos[posIndex].add(i); 4923 } 4924 } 4925 } 4926 return next.match(matcher, i, seq); 4927 } 4928 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4929 int save = matcher.locals[countIndex]; 4930 boolean ret = false; 4931 if (posIndex != -1 && matcher.localsPos[posIndex] == null) { 4932 matcher.localsPos[posIndex] = new IntHashSet(); 4933 } 4934 if (0 < cmin) { 4935 matcher.locals[countIndex] = 1; 4936 ret = body.match(matcher, i, seq); 4937 } else if (0 < cmax) { 4938 matcher.locals[countIndex] = 1; 4939 ret = body.match(matcher, i, seq); 4940 if (ret == false) 4941 ret = next.match(matcher, i, seq); 4942 } else { 4943 ret = next.match(matcher, i, seq); 4944 } 4945 matcher.locals[countIndex] = save; 4946 return ret; 4947 } 4948 boolean study(TreeInfo info) { 4949 info.maxValid = false; 4950 info.deterministic = false; 4951 return false; 4952 } 4953 } 4954 4955 /** 4956 * Handles the repetition count for a reluctant Curly. The matchInit 4957 * is called from the Prolog to save the index of where the group 4958 * beginning is stored. A zero length group check occurs in the 4959 * normal match but is skipped in the matchInit. 4960 */ 4961 static final class LazyLoop extends Loop { 4962 LazyLoop(int countIndex, int beginIndex) { 4963 super(countIndex, beginIndex); 4964 } 4965 boolean match(Matcher matcher, int i, CharSequence seq) { 4966 // Check for zero length group 4967 if (i > matcher.locals[beginIndex]) { 4968 int count = matcher.locals[countIndex]; 4969 if (count < cmin) { 4970 matcher.locals[countIndex] = count + 1; 4971 boolean result = body.match(matcher, i, seq); 4972 // If match failed we must backtrack, so 4973 // the loop count should NOT be incremented 4974 if (!result) 4975 matcher.locals[countIndex] = count; 4976 return result; 4977 } 4978 if (next.match(matcher, i, seq)) 4979 return true; 4980 if (count < cmax) { 4981 matcher.locals[countIndex] = count + 1; 4982 boolean result = body.match(matcher, i, seq); 4983 // If match failed we must backtrack, so 4984 // the loop count should NOT be incremented 4985 if (!result) 4986 matcher.locals[countIndex] = count; 4987 return result; 4988 } 4989 return false; 4990 } 4991 return next.match(matcher, i, seq); 4992 } 4993 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4994 int save = matcher.locals[countIndex]; 4995 boolean ret = false; 4996 if (0 < cmin) { 4997 matcher.locals[countIndex] = 1; 4998 ret = body.match(matcher, i, seq); 4999 } else if (next.match(matcher, i, seq)) { 5000 ret = true; 5001 } else if (0 < cmax) { 5002 matcher.locals[countIndex] = 1; 5003 ret = body.match(matcher, i, seq); 5004 } 5005 matcher.locals[countIndex] = save; 5006 return ret; 5007 } 5008 boolean study(TreeInfo info) { 5009 info.maxValid = false; 5010 info.deterministic = false; 5011 return false; 5012 } 5013 } 5014 5015 /** 5016 * Refers to a group in the regular expression. Attempts to match 5017 * whatever the group referred to last matched. 5018 */ 5019 static class BackRef extends Node { 5020 int groupIndex; 5021 BackRef(int groupCount) { 5022 super(); 5023 groupIndex = groupCount + groupCount; 5024 } 5025 boolean match(Matcher matcher, int i, CharSequence seq) { 5026 int j = matcher.groups[groupIndex]; 5027 int k = matcher.groups[groupIndex+1]; 5028 5029 int groupSize = k - j; 5030 // If the referenced group didn't match, neither can this 5031 if (j < 0) 5032 return false; 5033 5034 // If there isn't enough input left no match 5035 if (i + groupSize > matcher.to) { 5036 matcher.hitEnd = true; 5037 return false; 5038 } 5039 // Check each new char to make sure it matches what the group 5040 // referenced matched last time around 5041 for (int index=0; index<groupSize; index++) 5042 if (seq.charAt(i+index) != seq.charAt(j+index)) 5043 return false; 5044 5045 return next.match(matcher, i+groupSize, seq); 5046 } 5047 boolean study(TreeInfo info) { 5048 info.maxValid = false; 5049 return next.study(info); 5050 } 5051 } 5052 5053 static class CIBackRef extends Node { 5054 int groupIndex; 5055 boolean doUnicodeCase; 5056 CIBackRef(int groupCount, boolean doUnicodeCase) { 5057 super(); 5058 groupIndex = groupCount + groupCount; 5059 this.doUnicodeCase = doUnicodeCase; 5060 } 5061 boolean match(Matcher matcher, int i, CharSequence seq) { 5062 int j = matcher.groups[groupIndex]; 5063 int k = matcher.groups[groupIndex+1]; 5064 5065 int groupSize = k - j; 5066 5067 // If the referenced group didn't match, neither can this 5068 if (j < 0) 5069 return false; 5070 5071 // If there isn't enough input left no match 5072 if (i + groupSize > matcher.to) { 5073 matcher.hitEnd = true; 5074 return false; 5075 } 5076 5077 // Check each new char to make sure it matches what the group 5078 // referenced matched last time around 5079 int x = i; 5080 for (int index=0; index<groupSize; index++) { 5081 int c1 = Character.codePointAt(seq, x); 5082 int c2 = Character.codePointAt(seq, j); 5083 if (c1 != c2) { 5084 if (doUnicodeCase) { 5085 int cc1 = Character.toUpperCase(c1); 5086 int cc2 = Character.toUpperCase(c2); 5087 if (cc1 != cc2 && 5088 Character.toLowerCase(cc1) != 5089 Character.toLowerCase(cc2)) 5090 return false; 5091 } else { 5092 if (ASCII.toLower(c1) != ASCII.toLower(c2)) 5093 return false; 5094 } 5095 } 5096 x += Character.charCount(c1); 5097 j += Character.charCount(c2); 5098 } 5099 5100 return next.match(matcher, i+groupSize, seq); 5101 } 5102 boolean study(TreeInfo info) { 5103 info.maxValid = false; 5104 return next.study(info); 5105 } 5106 } 5107 5108 /** 5109 * Searches until the next instance of its atom. This is useful for 5110 * finding the atom efficiently without passing an instance of it 5111 * (greedy problem) and without a lot of wasted search time (reluctant 5112 * problem). 5113 */ 5114 static final class First extends Node { 5115 Node atom; 5116 First(Node node) { 5117 this.atom = BnM.optimize(node); 5118 } 5119 boolean match(Matcher matcher, int i, CharSequence seq) { 5120 if (atom instanceof BnM) { 5121 return atom.match(matcher, i, seq) 5122 && next.match(matcher, matcher.last, seq); 5123 } 5124 for (;;) { 5125 if (i > matcher.to) { 5126 matcher.hitEnd = true; 5127 return false; 5128 } 5129 if (atom.match(matcher, i, seq)) { 5130 return next.match(matcher, matcher.last, seq); 5131 } 5132 i += countChars(seq, i, 1); 5133 matcher.first++; 5134 } 5135 } 5136 boolean study(TreeInfo info) { 5137 atom.study(info); 5138 info.maxValid = false; 5139 info.deterministic = false; 5140 return next.study(info); 5141 } 5142 } 5143 5144 static final class Conditional extends Node { 5145 Node cond, yes, not; 5146 Conditional(Node cond, Node yes, Node not) { 5147 this.cond = cond; 5148 this.yes = yes; 5149 this.not = not; 5150 } 5151 boolean match(Matcher matcher, int i, CharSequence seq) { 5152 if (cond.match(matcher, i, seq)) { 5153 return yes.match(matcher, i, seq); 5154 } else { 5155 return not.match(matcher, i, seq); 5156 } 5157 } 5158 boolean study(TreeInfo info) { 5159 int minL = info.minLength; 5160 int maxL = info.maxLength; 5161 boolean maxV = info.maxValid; 5162 info.reset(); 5163 yes.study(info); 5164 5165 int minL2 = info.minLength; 5166 int maxL2 = info.maxLength; 5167 boolean maxV2 = info.maxValid; 5168 info.reset(); 5169 not.study(info); 5170 5171 info.minLength = minL + Math.min(minL2, info.minLength); 5172 info.maxLength = maxL + Math.max(maxL2, info.maxLength); 5173 info.maxValid = (maxV & maxV2 & info.maxValid); 5174 info.deterministic = false; 5175 return next.study(info); 5176 } 5177 } 5178 5179 /** 5180 * Zero width positive lookahead. 5181 */ 5182 static final class Pos extends Node { 5183 Node cond; 5184 Pos(Node cond) { 5185 this.cond = cond; 5186 } 5187 boolean match(Matcher matcher, int i, CharSequence seq) { 5188 int savedTo = matcher.to; 5189 boolean conditionMatched = false; 5190 5191 // Relax transparent region boundaries for lookahead 5192 if (matcher.transparentBounds) 5193 matcher.to = matcher.getTextLength(); 5194 try { 5195 conditionMatched = cond.match(matcher, i, seq); 5196 } finally { 5197 // Reinstate region boundaries 5198 matcher.to = savedTo; 5199 } 5200 return conditionMatched && next.match(matcher, i, seq); 5201 } 5202 } 5203 5204 /** 5205 * Zero width negative lookahead. 5206 */ 5207 static final class Neg extends Node { 5208 Node cond; 5209 Neg(Node cond) { 5210 this.cond = cond; 5211 } 5212 boolean match(Matcher matcher, int i, CharSequence seq) { 5213 int savedTo = matcher.to; 5214 boolean conditionMatched = false; 5215 5216 // Relax transparent region boundaries for lookahead 5217 if (matcher.transparentBounds) 5218 matcher.to = matcher.getTextLength(); 5219 try { 5220 if (i < matcher.to) { 5221 conditionMatched = !cond.match(matcher, i, seq); 5222 } else { 5223 // If a negative lookahead succeeds then more input 5224 // could cause it to fail! 5225 matcher.requireEnd = true; 5226 conditionMatched = !cond.match(matcher, i, seq); 5227 } 5228 } finally { 5229 // Reinstate region boundaries 5230 matcher.to = savedTo; 5231 } 5232 return conditionMatched && next.match(matcher, i, seq); 5233 } 5234 } 5235 5236 /** 5237 * For use with lookbehinds; matches the position where the lookbehind 5238 * was encountered. 5239 */ 5240 static Node lookbehindEnd = new Node() { 5241 boolean match(Matcher matcher, int i, CharSequence seq) { 5242 return i == matcher.lookbehindTo; 5243 } 5244 }; 5245 5246 /** 5247 * Zero width positive lookbehind. 5248 */ 5249 static class Behind extends Node { 5250 Node cond; 5251 int rmax, rmin; 5252 Behind(Node cond, int rmax, int rmin) { 5253 this.cond = cond; 5254 this.rmax = rmax; 5255 this.rmin = rmin; 5256 } 5257 5258 boolean match(Matcher matcher, int i, CharSequence seq) { 5259 int savedFrom = matcher.from; 5260 boolean conditionMatched = false; 5261 int startIndex = (!matcher.transparentBounds) ? 5262 matcher.from : 0; 5263 int from = Math.max(i - rmax, startIndex); 5264 // Set end boundary 5265 int savedLBT = matcher.lookbehindTo; 5266 matcher.lookbehindTo = i; 5267 // Relax transparent region boundaries for lookbehind 5268 if (matcher.transparentBounds) 5269 matcher.from = 0; 5270 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5271 conditionMatched = cond.match(matcher, j, seq); 5272 } 5273 matcher.from = savedFrom; 5274 matcher.lookbehindTo = savedLBT; 5275 return conditionMatched && next.match(matcher, i, seq); 5276 } 5277 } 5278 5279 /** 5280 * Zero width positive lookbehind, including supplementary 5281 * characters or unpaired surrogates. 5282 */ 5283 static final class BehindS extends Behind { 5284 BehindS(Node cond, int rmax, int rmin) { 5285 super(cond, rmax, rmin); 5286 } 5287 boolean match(Matcher matcher, int i, CharSequence seq) { 5288 int rmaxChars = countChars(seq, i, -rmax); 5289 int rminChars = countChars(seq, i, -rmin); 5290 int savedFrom = matcher.from; 5291 int startIndex = (!matcher.transparentBounds) ? 5292 matcher.from : 0; 5293 boolean conditionMatched = false; 5294 int from = Math.max(i - rmaxChars, startIndex); 5295 // Set end boundary 5296 int savedLBT = matcher.lookbehindTo; 5297 matcher.lookbehindTo = i; 5298 // Relax transparent region boundaries for lookbehind 5299 if (matcher.transparentBounds) 5300 matcher.from = 0; 5301 5302 for (int j = i - rminChars; 5303 !conditionMatched && j >= from; 5304 j -= j>from ? countChars(seq, j, -1) : 1) { 5305 conditionMatched = cond.match(matcher, j, seq); 5306 } 5307 matcher.from = savedFrom; 5308 matcher.lookbehindTo = savedLBT; 5309 return conditionMatched && next.match(matcher, i, seq); 5310 } 5311 } 5312 5313 /** 5314 * Zero width negative lookbehind. 5315 */ 5316 static class NotBehind extends Node { 5317 Node cond; 5318 int rmax, rmin; 5319 NotBehind(Node cond, int rmax, int rmin) { 5320 this.cond = cond; 5321 this.rmax = rmax; 5322 this.rmin = rmin; 5323 } 5324 5325 boolean match(Matcher matcher, int i, CharSequence seq) { 5326 int savedLBT = matcher.lookbehindTo; 5327 int savedFrom = matcher.from; 5328 boolean conditionMatched = false; 5329 int startIndex = (!matcher.transparentBounds) ? 5330 matcher.from : 0; 5331 int from = Math.max(i - rmax, startIndex); 5332 matcher.lookbehindTo = i; 5333 // Relax transparent region boundaries for lookbehind 5334 if (matcher.transparentBounds) 5335 matcher.from = 0; 5336 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5337 conditionMatched = cond.match(matcher, j, seq); 5338 } 5339 // Reinstate region boundaries 5340 matcher.from = savedFrom; 5341 matcher.lookbehindTo = savedLBT; 5342 return !conditionMatched && next.match(matcher, i, seq); 5343 } 5344 } 5345 5346 /** 5347 * Zero width negative lookbehind, including supplementary 5348 * characters or unpaired surrogates. 5349 */ 5350 static final class NotBehindS extends NotBehind { 5351 NotBehindS(Node cond, int rmax, int rmin) { 5352 super(cond, rmax, rmin); 5353 } 5354 boolean match(Matcher matcher, int i, CharSequence seq) { 5355 int rmaxChars = countChars(seq, i, -rmax); 5356 int rminChars = countChars(seq, i, -rmin); 5357 int savedFrom = matcher.from; 5358 int savedLBT = matcher.lookbehindTo; 5359 boolean conditionMatched = false; 5360 int startIndex = (!matcher.transparentBounds) ? 5361 matcher.from : 0; 5362 int from = Math.max(i - rmaxChars, startIndex); 5363 matcher.lookbehindTo = i; 5364 // Relax transparent region boundaries for lookbehind 5365 if (matcher.transparentBounds) 5366 matcher.from = 0; 5367 for (int j = i - rminChars; 5368 !conditionMatched && j >= from; 5369 j -= j>from ? countChars(seq, j, -1) : 1) { 5370 conditionMatched = cond.match(matcher, j, seq); 5371 } 5372 //Reinstate region boundaries 5373 matcher.from = savedFrom; 5374 matcher.lookbehindTo = savedLBT; 5375 return !conditionMatched && next.match(matcher, i, seq); 5376 } 5377 } 5378 5379 /** 5380 * Handles word boundaries. Includes a field to allow this one class to 5381 * deal with the different types of word boundaries we can match. The word 5382 * characters include underscores, letters, and digits. Non spacing marks 5383 * can are also part of a word if they have a base character, otherwise 5384 * they are ignored for purposes of finding word boundaries. 5385 */ 5386 static final class Bound extends Node { 5387 static int LEFT = 0x1; 5388 static int RIGHT= 0x2; 5389 static int BOTH = 0x3; 5390 static int NONE = 0x4; 5391 int type; 5392 boolean useUWORD; 5393 Bound(int n, boolean useUWORD) { 5394 type = n; 5395 this.useUWORD = useUWORD; 5396 } 5397 5398 boolean isWord(int ch) { 5399 return useUWORD ? CharPredicates.WORD().is(ch) 5400 : (ch == '_' || Character.isLetterOrDigit(ch)); 5401 } 5402 5403 int check(Matcher matcher, int i, CharSequence seq) { 5404 int ch; 5405 boolean left = false; 5406 int startIndex = matcher.from; 5407 int endIndex = matcher.to; 5408 if (matcher.transparentBounds) { 5409 startIndex = 0; 5410 endIndex = matcher.getTextLength(); 5411 } 5412 if (i > startIndex) { 5413 ch = Character.codePointBefore(seq, i); 5414 left = (isWord(ch) || 5415 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5416 && hasBaseCharacter(matcher, i-1, seq))); 5417 } 5418 boolean right = false; 5419 if (i < endIndex) { 5420 ch = Character.codePointAt(seq, i); 5421 right = (isWord(ch) || 5422 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5423 && hasBaseCharacter(matcher, i, seq))); 5424 } else { 5425 // Tried to access char past the end 5426 matcher.hitEnd = true; 5427 // The addition of another char could wreck a boundary 5428 matcher.requireEnd = true; 5429 } 5430 return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE); 5431 } 5432 boolean match(Matcher matcher, int i, CharSequence seq) { 5433 return (check(matcher, i, seq) & type) > 0 5434 && next.match(matcher, i, seq); 5435 } 5436 } 5437 5438 /** 5439 * Non spacing marks only count as word characters in bounds calculations 5440 * if they have a base character. 5441 */ 5442 private static boolean hasBaseCharacter(Matcher matcher, int i, 5443 CharSequence seq) 5444 { 5445 int start = (!matcher.transparentBounds) ? 5446 matcher.from : 0; 5447 for (int x=i; x >= start; x--) { 5448 int ch = Character.codePointAt(seq, x); 5449 if (Character.isLetterOrDigit(ch)) 5450 return true; 5451 if (Character.getType(ch) == Character.NON_SPACING_MARK) 5452 continue; 5453 return false; 5454 } 5455 return false; 5456 } 5457 5458 /** 5459 * Attempts to match a slice in the input using the Boyer-Moore string 5460 * matching algorithm. The algorithm is based on the idea that the 5461 * pattern can be shifted farther ahead in the search text if it is 5462 * matched right to left. 5463 * <p> 5464 * The pattern is compared to the input one character at a time, from 5465 * the rightmost character in the pattern to the left. If the characters 5466 * all match the pattern has been found. If a character does not match, 5467 * the pattern is shifted right a distance that is the maximum of two 5468 * functions, the bad character shift and the good suffix shift. This 5469 * shift moves the attempted match position through the input more 5470 * quickly than a naive one position at a time check. 5471 * <p> 5472 * The bad character shift is based on the character from the text that 5473 * did not match. If the character does not appear in the pattern, the 5474 * pattern can be shifted completely beyond the bad character. If the 5475 * character does occur in the pattern, the pattern can be shifted to 5476 * line the pattern up with the next occurrence of that character. 5477 * <p> 5478 * The good suffix shift is based on the idea that some subset on the right 5479 * side of the pattern has matched. When a bad character is found, the 5480 * pattern can be shifted right by the pattern length if the subset does 5481 * not occur again in pattern, or by the amount of distance to the 5482 * next occurrence of the subset in the pattern. 5483 * 5484 * Boyer-Moore search methods adapted from code by Amy Yu. 5485 */ 5486 static class BnM extends Node { 5487 int[] buffer; 5488 int[] lastOcc; 5489 int[] optoSft; 5490 5491 /** 5492 * Pre calculates arrays needed to generate the bad character 5493 * shift and the good suffix shift. Only the last seven bits 5494 * are used to see if chars match; This keeps the tables small 5495 * and covers the heavily used ASCII range, but occasionally 5496 * results in an aliased match for the bad character shift. 5497 */ 5498 static Node optimize(Node node) { 5499 if (!(node instanceof Slice)) { 5500 return node; 5501 } 5502 5503 int[] src = ((Slice) node).buffer; 5504 int patternLength = src.length; 5505 // The BM algorithm requires a bit of overhead; 5506 // If the pattern is short don't use it, since 5507 // a shift larger than the pattern length cannot 5508 // be used anyway. 5509 if (patternLength < 4) { 5510 return node; 5511 } 5512 int i, j, k; 5513 int[] lastOcc = new int[128]; 5514 int[] optoSft = new int[patternLength]; 5515 // Precalculate part of the bad character shift 5516 // It is a table for where in the pattern each 5517 // lower 7-bit value occurs 5518 for (i = 0; i < patternLength; i++) { 5519 lastOcc[src[i]&0x7F] = i + 1; 5520 } 5521 // Precalculate the good suffix shift 5522 // i is the shift amount being considered 5523NEXT: for (i = patternLength; i > 0; i--) { 5524 // j is the beginning index of suffix being considered 5525 for (j = patternLength - 1; j >= i; j--) { 5526 // Testing for good suffix 5527 if (src[j] == src[j-i]) { 5528 // src[j..len] is a good suffix 5529 optoSft[j-1] = i; 5530 } else { 5531 // No match. The array has already been 5532 // filled up with correct values before. 5533 continue NEXT; 5534 } 5535 } 5536 // This fills up the remaining of optoSft 5537 // any suffix can not have larger shift amount 5538 // then its sub-suffix. Why??? 5539 while (j > 0) { 5540 optoSft[--j] = i; 5541 } 5542 } 5543 // Set the guard value because of unicode compression 5544 optoSft[patternLength-1] = 1; 5545 if (node instanceof SliceS) 5546 return new BnMS(src, lastOcc, optoSft, node.next); 5547 return new BnM(src, lastOcc, optoSft, node.next); 5548 } 5549 BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5550 this.buffer = src; 5551 this.lastOcc = lastOcc; 5552 this.optoSft = optoSft; 5553 this.next = next; 5554 } 5555 boolean match(Matcher matcher, int i, CharSequence seq) { 5556 int[] src = buffer; 5557 int patternLength = src.length; 5558 int last = matcher.to - patternLength; 5559 5560 // Loop over all possible match positions in text 5561NEXT: while (i <= last) { 5562 // Loop over pattern from right to left 5563 for (int j = patternLength - 1; j >= 0; j--) { 5564 int ch = seq.charAt(i+j); 5565 if (ch != src[j]) { 5566 // Shift search to the right by the maximum of the 5567 // bad character shift and the good suffix shift 5568 i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]); 5569 continue NEXT; 5570 } 5571 } 5572 // Entire pattern matched starting at i 5573 matcher.first = i; 5574 boolean ret = next.match(matcher, i + patternLength, seq); 5575 if (ret) { 5576 matcher.first = i; 5577 matcher.groups[0] = matcher.first; 5578 matcher.groups[1] = matcher.last; 5579 return true; 5580 } 5581 i++; 5582 } 5583 // BnM is only used as the leading node in the unanchored case, 5584 // and it replaced its Start() which always searches to the end 5585 // if it doesn't find what it's looking for, so hitEnd is true. 5586 matcher.hitEnd = true; 5587 return false; 5588 } 5589 boolean study(TreeInfo info) { 5590 info.minLength += buffer.length; 5591 info.maxValid = false; 5592 return next.study(info); 5593 } 5594 } 5595 5596 /** 5597 * Supplementary support version of BnM(). Unpaired surrogates are 5598 * also handled by this class. 5599 */ 5600 static final class BnMS extends BnM { 5601 int lengthInChars; 5602 5603 BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5604 super(src, lastOcc, optoSft, next); 5605 for (int cp : buffer) { 5606 lengthInChars += Character.charCount(cp); 5607 } 5608 } 5609 boolean match(Matcher matcher, int i, CharSequence seq) { 5610 int[] src = buffer; 5611 int patternLength = src.length; 5612 int last = matcher.to - lengthInChars; 5613 5614 // Loop over all possible match positions in text 5615NEXT: while (i <= last) { 5616 // Loop over pattern from right to left 5617 int ch; 5618 for (int j = countChars(seq, i, patternLength), x = patternLength - 1; 5619 j > 0; j -= Character.charCount(ch), x--) { 5620 ch = Character.codePointBefore(seq, i+j); 5621 if (ch != src[x]) { 5622 // Shift search to the right by the maximum of the 5623 // bad character shift and the good suffix shift 5624 int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]); 5625 i += countChars(seq, i, n); 5626 continue NEXT; 5627 } 5628 } 5629 // Entire pattern matched starting at i 5630 matcher.first = i; 5631 boolean ret = next.match(matcher, i + lengthInChars, seq); 5632 if (ret) { 5633 matcher.first = i; 5634 matcher.groups[0] = matcher.first; 5635 matcher.groups[1] = matcher.last; 5636 return true; 5637 } 5638 i += countChars(seq, i, 1); 5639 } 5640 matcher.hitEnd = true; 5641 return false; 5642 } 5643 } 5644 5645 @FunctionalInterface 5646 static interface CharPredicate { 5647 boolean is(int ch); 5648 5649 default CharPredicate and(CharPredicate p) { 5650 return ch -> is(ch) && p.is(ch); 5651 } 5652 default CharPredicate union(CharPredicate p) { 5653 return ch -> is(ch) || p.is(ch); 5654 } 5655 default CharPredicate union(CharPredicate p1, 5656 CharPredicate p2 ) { 5657 return ch -> is(ch) || p1.is(ch) || p2.is(ch); 5658 } 5659 default CharPredicate negate() { 5660 return ch -> !is(ch); 5661 } 5662 } 5663 5664 static interface BmpCharPredicate extends CharPredicate { 5665 5666 default CharPredicate and(CharPredicate p) { 5667 if(p instanceof BmpCharPredicate) 5668 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch)); 5669 return ch -> is(ch) && p.is(ch); 5670 } 5671 default CharPredicate union(CharPredicate p) { 5672 if (p instanceof BmpCharPredicate) 5673 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch)); 5674 return ch -> is(ch) || p.is(ch); 5675 } 5676 static CharPredicate union(CharPredicate... predicates) { 5677 CharPredicate cp = ch -> { 5678 for (CharPredicate p : predicates) { 5679 if (!p.is(ch)) 5680 return false; 5681 } 5682 return true; 5683 }; 5684 for (CharPredicate p : predicates) { 5685 if (! (p instanceof BmpCharPredicate)) 5686 return cp; 5687 } 5688 return (BmpCharPredicate)cp; 5689 } 5690 } 5691 5692 /** 5693 * matches a Perl vertical whitespace 5694 */ 5695 static BmpCharPredicate VertWS() { 5696 return cp -> (cp >= 0x0A && cp <= 0x0D) || 5697 cp == 0x85 || cp == 0x2028 || cp == 0x2029; 5698 } 5699 5700 /** 5701 * matches a Perl horizontal whitespace 5702 */ 5703 static BmpCharPredicate HorizWS() { 5704 return cp -> 5705 cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 || 5706 cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f || 5707 cp == 0x205f || cp == 0x3000; 5708 } 5709 5710 /** 5711 * for the Unicode category ALL and the dot metacharacter when 5712 * in dotall mode. 5713 */ 5714 static CharPredicate ALL() { 5715 return ch -> true; 5716 } 5717 5718 /** 5719 * for the dot metacharacter when dotall is not enabled. 5720 */ 5721 static CharPredicate DOT() { 5722 return ch -> 5723 (ch != '\n' && ch != '\r' 5724 && (ch|1) != '\u2029' 5725 && ch != '\u0085'); 5726 } 5727 5728 /** 5729 * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled. 5730 */ 5731 static CharPredicate UNIXDOT() { 5732 return ch -> ch != '\n'; 5733 } 5734 5735 /** 5736 * Indicate that matches a Supplementary Unicode character 5737 */ 5738 static CharPredicate SingleS(int c) { 5739 return ch -> ch == c; 5740 } 5741 5742 /** 5743 * A bmp/optimized predicate of single 5744 */ 5745 static BmpCharPredicate Single(int c) { 5746 return ch -> ch == c; 5747 } 5748 5749 /** 5750 * Case insensitive matches a given BMP character 5751 */ 5752 static BmpCharPredicate SingleI(int lower, int upper) { 5753 return ch -> ch == lower || ch == upper; 5754 } 5755 5756 /** 5757 * Unicode case insensitive matches a given Unicode character 5758 */ 5759 static CharPredicate SingleU(int lower) { 5760 return ch -> lower == ch || 5761 lower == Character.toLowerCase(Character.toUpperCase(ch)); 5762 } 5763 5764 private static boolean inRange(int lower, int ch, int upper) { 5765 return lower <= ch && ch <= upper; 5766 } 5767 5768 /** 5769 * Charactrs within a explicit value range 5770 */ 5771 static CharPredicate Range(int lower, int upper) { 5772 if (upper < Character.MIN_HIGH_SURROGATE || 5773 lower > Character.MAX_HIGH_SURROGATE && 5774 upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) 5775 return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); 5776 return ch -> inRange(lower, ch, upper); 5777 } 5778 5779 /** 5780 * Charactrs within a explicit value range in a case insensitive manner. 5781 */ 5782 static CharPredicate CIRange(int lower, int upper) { 5783 return ch -> inRange(lower, ch, upper) || 5784 ASCII.isAscii(ch) && 5785 (inRange(lower, ASCII.toUpper(ch), upper) || 5786 inRange(lower, ASCII.toLower(ch), upper)); 5787 } 5788 5789 static CharPredicate CIRangeU(int lower, int upper) { 5790 return ch -> { 5791 if (inRange(lower, ch, upper)) 5792 return true; 5793 int up = Character.toUpperCase(ch); 5794 return inRange(lower, up, upper) || 5795 inRange(lower, Character.toLowerCase(up), upper); 5796 }; 5797 } 5798 5799 /** 5800 * This must be the very first initializer. 5801 */ 5802 static final Node accept = new Node(); 5803 5804 static final Node lastAccept = new LastNode(); 5805 5806 /** 5807 * Creates a predicate which can be used to match a string. 5808 * 5809 * @return The predicate which can be used for matching on a string 5810 * @since 1.8 5811 */ 5812 public Predicate<String> asPredicate() { 5813 return s -> matcher(s).find(); 5814 } 5815 5816 /** 5817 * Creates a stream from the given input sequence around matches of this 5818 * pattern. 5819 * 5820 * <p> The stream returned by this method contains each substring of the 5821 * input sequence that is terminated by another subsequence that matches 5822 * this pattern or is terminated by the end of the input sequence. The 5823 * substrings in the stream are in the order in which they occur in the 5824 * input. Trailing empty strings will be discarded and not encountered in 5825 * the stream. 5826 * 5827 * <p> If this pattern does not match any subsequence of the input then 5828 * the resulting stream has just one element, namely the input sequence in 5829 * string form. 5830 * 5831 * <p> When there is a positive-width match at the beginning of the input 5832 * sequence then an empty leading substring is included at the beginning 5833 * of the stream. A zero-width match at the beginning however never produces 5834 * such empty leading substring. 5835 * 5836 * <p> If the input sequence is mutable, it must remain constant during the 5837 * execution of the terminal stream operation. Otherwise, the result of the 5838 * terminal stream operation is undefined. 5839 * 5840 * @param input 5841 * The character sequence to be split 5842 * 5843 * @return The stream of strings computed by splitting the input 5844 * around matches of this pattern 5845 * @see #split(CharSequence) 5846 * @since 1.8 5847 */ 5848 public Stream<String> splitAsStream(final CharSequence input) { 5849 class MatcherIterator implements Iterator<String> { 5850 private Matcher matcher; 5851 // The start position of the next sub-sequence of input 5852 // when current == input.length there are no more elements 5853 private int current; 5854 // null if the next element, if any, needs to obtained 5855 private String nextElement; 5856 // > 0 if there are N next empty elements 5857 private int emptyElementCount; 5858 5859 public String next() { 5860 if (!hasNext()) 5861 throw new NoSuchElementException(); 5862 5863 if (emptyElementCount == 0) { 5864 String n = nextElement; 5865 nextElement = null; 5866 return n; 5867 } else { 5868 emptyElementCount--; 5869 return ""; 5870 } 5871 } 5872 5873 public boolean hasNext() { 5874 if (matcher == null) { 5875 matcher = matcher(input); 5876 // If the input is an empty string then the result can only be a 5877 // stream of the input. Induce that by setting the empty 5878 // element count to 1 5879 emptyElementCount = input.length() == 0 ? 1 : 0; 5880 } 5881 if (nextElement != null || emptyElementCount > 0) 5882 return true; 5883 5884 if (current == input.length()) 5885 return false; 5886 5887 // Consume the next matching element 5888 // Count sequence of matching empty elements 5889 while (matcher.find()) { 5890 nextElement = input.subSequence(current, matcher.start()).toString(); 5891 current = matcher.end(); 5892 if (!nextElement.isEmpty()) { 5893 return true; 5894 } else if (current > 0) { // no empty leading substring for zero-width 5895 // match at the beginning of the input 5896 emptyElementCount++; 5897 } 5898 } 5899 5900 // Consume last matching element 5901 nextElement = input.subSequence(current, input.length()).toString(); 5902 current = input.length(); 5903 if (!nextElement.isEmpty()) { 5904 return true; 5905 } else { 5906 // Ignore a terminal sequence of matching empty elements 5907 emptyElementCount = 0; 5908 nextElement = null; 5909 return false; 5910 } 5911 } 5912 } 5913 return StreamSupport.stream(Spliterators.spliteratorUnknownSize( 5914 new MatcherIterator(), Spliterator.ORDERED | Spliterator.NONNULL), false); 5915 } 5916} 5917