1/* 2********************************************************************** 3* Copyright (C) 2002-2013, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* file name: regex.h 7* encoding: US-ASCII 8* indentation:4 9* 10* created on: 2002oct22 11* created by: Andy Heninger 12* 13* ICU Regular Expressions, API for C++ 14*/ 15 16#ifndef REGEX_H 17#define REGEX_H 18 19//#define REGEX_DEBUG 20 21/** 22 * \file 23 * \brief C++ API: Regular Expressions 24 * 25 * <h2>Regular Expression API</h2> 26 * 27 * <p>The ICU API for processing regular expressions consists of two classes, 28 * <code>RegexPattern</code> and <code>RegexMatcher</code>. 29 * <code>RegexPattern</code> objects represent a pre-processed, or compiled 30 * regular expression. They are created from a regular expression pattern string, 31 * and can be used to create <code>RegexMatcher</code> objects for the pattern.</p> 32 * 33 * <p>Class <code>RegexMatcher</code> bundles together a regular expression 34 * pattern and a target string to which the search pattern will be applied. 35 * <code>RegexMatcher</code> includes API for doing plain find or search 36 * operations, for search and replace operations, and for obtaining detailed 37 * information about bounds of a match. </p> 38 * 39 * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular 40 * expression pattern strings application code can be simplified and the explicit 41 * need for <code>RegexPattern</code> objects can usually be eliminated. 
42 * </p> 43 */ 44 45#include "unicode/utypes.h" 46 47#if !UCONFIG_NO_REGULAR_EXPRESSIONS 48 49#include "unicode/uobject.h" 50#include "unicode/unistr.h" 51#include "unicode/utext.h" 52#include "unicode/parseerr.h" 53 54#include "unicode/uregex.h" 55 56// Forward Declarations 57 58U_NAMESPACE_BEGIN 59 60struct Regex8BitSet; 61class RegexCImpl; 62class RegexMatcher; 63class RegexPattern; 64struct REStackFrame; 65class RuleBasedBreakIterator; 66class UnicodeSet; 67class UVector; 68class UVector32; 69class UVector64; 70 71 72/** 73 * Class <code>RegexPattern</code> represents a compiled regular expression. It includes 74 * factory methods for creating a RegexPattern object from the source (string) form 75 * of a regular expression, methods for creating RegexMatchers that allow the pattern 76 * to be applied to input text, and a few convenience methods for simple common 77 * uses of regular expressions. 78 * 79 * <p>Class RegexPattern is not intended to be subclassed.</p> 80 * 81 * @stable ICU 2.4 82 */ 83class U_I18N_API RegexPattern: public UObject { 84public: 85 86 /** 87 * default constructor. Create a RegexPattern object that refers to no actual 88 * pattern. Not normally needed; RegexPattern objects are usually 89 * created using the factory method <code>compile()</code>. 90 * 91 * @stable ICU 2.4 92 */ 93 RegexPattern(); 94 95 /** 96 * Copy Constructor. Create a new RegexPattern object that is equivalent 97 * to the source object. 98 * @param source the pattern object to be copied. 99 * @stable ICU 2.4 100 */ 101 RegexPattern(const RegexPattern &source); 102 103 /** 104 * Destructor. Note that a RegexPattern object must persist so long as any 105 * RegexMatcher objects that were created from the RegexPattern are active. 106 * @stable ICU 2.4 107 */ 108 virtual ~RegexPattern(); 109 110 /** 111 * Comparison operator. Two RegexPattern objects are considered equal if they 112 * were constructed from identical source patterns using the same match flag 113 * settings. 
114 * @param that a RegexPattern object to compare with "this". 115 * @return TRUE if the objects are equivalent. 116 * @stable ICU 2.4 117 */ 118 UBool operator==(const RegexPattern& that) const; 119 120 /** 121 * Comparison operator. Two RegexPattern objects are considered equal if they 122 * were constructed from identical source patterns using the same match flag 123 * settings. 124 * @param that a RegexPattern object to compare with "this". 125 * @return TRUE if the objects are different. 126 * @stable ICU 2.4 127 */ 128 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);} 129 130 /** 131 * Assignment operator. After assignment, this RegexPattern will behave identically 132 * to the source object. 133 * @stable ICU 2.4 134 */ 135 RegexPattern &operator =(const RegexPattern &source); 136 137 /** 138 * Create an exact copy of this RegexPattern object. Since RegexPattern is not 139 * intended to be subclasses, <code>clone()</code> and the copy construction are 140 * equivalent operations. 141 * @return the copy of this RegexPattern 142 * @stable ICU 2.4 143 */ 144 virtual RegexPattern *clone() const; 145 146 147 /** 148 * Compiles the regular expression in string form into a RegexPattern 149 * object. These compile methods, rather than the constructors, are the usual 150 * way that RegexPattern objects are created. 151 * 152 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 153 * objects created from the pattern are active. 
RegexMatchers keep a pointer 154 * back to their pattern, so premature deletion of the pattern is a 155 * catastrophic error.</p> 156 * 157 * <p>All pattern match mode flags are set to their default values.</p> 158 * 159 * <p>Note that it is often more convenient to construct a RegexMatcher directly 160 * from a pattern string rather than separately compiling the pattern and 161 * then creating a RegexMatcher object from the pattern.</p> 162 * 163 * @param regex The regular expression to be compiled. 164 * @param pe Receives the position (line and column nubers) of any error 165 * within the regular expression.) 166 * @param status A reference to a UErrorCode to receive any errors. 167 * @return A regexPattern object for the compiled pattern. 168 * 169 * @stable ICU 2.4 170 */ 171 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 172 UParseError &pe, 173 UErrorCode &status); 174 175 /** 176 * Compiles the regular expression in string form into a RegexPattern 177 * object. These compile methods, rather than the constructors, are the usual 178 * way that RegexPattern objects are created. 179 * 180 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 181 * objects created from the pattern are active. RegexMatchers keep a pointer 182 * back to their pattern, so premature deletion of the pattern is a 183 * catastrophic error.</p> 184 * 185 * <p>All pattern match mode flags are set to their default values.</p> 186 * 187 * <p>Note that it is often more convenient to construct a RegexMatcher directly 188 * from a pattern string rather than separately compiling the pattern and 189 * then creating a RegexMatcher object from the pattern.</p> 190 * 191 * @param regex The regular expression to be compiled. Note, the text referred 192 * to by this UText must not be deleted during the lifetime of the 193 * RegexPattern object or any RegexMatcher object created from it. 
194 * @param pe Receives the position (line and column nubers) of any error 195 * within the regular expression.) 196 * @param status A reference to a UErrorCode to receive any errors. 197 * @return A regexPattern object for the compiled pattern. 198 * 199 * @stable ICU 4.6 200 */ 201 static RegexPattern * U_EXPORT2 compile( UText *regex, 202 UParseError &pe, 203 UErrorCode &status); 204 205 /** 206 * Compiles the regular expression in string form into a RegexPattern 207 * object using the specified match mode flags. These compile methods, 208 * rather than the constructors, are the usual way that RegexPattern objects 209 * are created. 210 * 211 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 212 * objects created from the pattern are active. RegexMatchers keep a pointer 213 * back to their pattern, so premature deletion of the pattern is a 214 * catastrophic error.</p> 215 * 216 * <p>Note that it is often more convenient to construct a RegexMatcher directly 217 * from a pattern string instead of than separately compiling the pattern and 218 * then creating a RegexMatcher object from the pattern.</p> 219 * 220 * @param regex The regular expression to be compiled. 221 * @param flags The match mode flags to be used. 222 * @param pe Receives the position (line and column numbers) of any error 223 * within the regular expression.) 224 * @param status A reference to a UErrorCode to receive any errors. 225 * @return A regexPattern object for the compiled pattern. 226 * 227 * @stable ICU 2.4 228 */ 229 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 230 uint32_t flags, 231 UParseError &pe, 232 UErrorCode &status); 233 234 /** 235 * Compiles the regular expression in string form into a RegexPattern 236 * object using the specified match mode flags. These compile methods, 237 * rather than the constructors, are the usual way that RegexPattern objects 238 * are created. 
239 * 240 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 241 * objects created from the pattern are active. RegexMatchers keep a pointer 242 * back to their pattern, so premature deletion of the pattern is a 243 * catastrophic error.</p> 244 * 245 * <p>Note that it is often more convenient to construct a RegexMatcher directly 246 * from a pattern string instead of than separately compiling the pattern and 247 * then creating a RegexMatcher object from the pattern.</p> 248 * 249 * @param regex The regular expression to be compiled. Note, the text referred 250 * to by this UText must not be deleted during the lifetime of the 251 * RegexPattern object or any RegexMatcher object created from it. 252 * @param flags The match mode flags to be used. 253 * @param pe Receives the position (line and column numbers) of any error 254 * within the regular expression.) 255 * @param status A reference to a UErrorCode to receive any errors. 256 * @return A regexPattern object for the compiled pattern. 257 * 258 * @stable ICU 4.6 259 */ 260 static RegexPattern * U_EXPORT2 compile( UText *regex, 261 uint32_t flags, 262 UParseError &pe, 263 UErrorCode &status); 264 265 /** 266 * Compiles the regular expression in string form into a RegexPattern 267 * object using the specified match mode flags. These compile methods, 268 * rather than the constructors, are the usual way that RegexPattern objects 269 * are created. 270 * 271 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 272 * objects created from the pattern are active. 
RegexMatchers keep a pointer 273 * back to their pattern, so premature deletion of the pattern is a 274 * catastrophic error.</p> 275 * 276 * <p>Note that it is often more convenient to construct a RegexMatcher directly 277 * from a pattern string instead of than separately compiling the pattern and 278 * then creating a RegexMatcher object from the pattern.</p> 279 * 280 * @param regex The regular expression to be compiled. 281 * @param flags The match mode flags to be used. 282 * @param status A reference to a UErrorCode to receive any errors. 283 * @return A regexPattern object for the compiled pattern. 284 * 285 * @stable ICU 2.6 286 */ 287 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 288 uint32_t flags, 289 UErrorCode &status); 290 291 /** 292 * Compiles the regular expression in string form into a RegexPattern 293 * object using the specified match mode flags. These compile methods, 294 * rather than the constructors, are the usual way that RegexPattern objects 295 * are created. 296 * 297 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 298 * objects created from the pattern are active. RegexMatchers keep a pointer 299 * back to their pattern, so premature deletion of the pattern is a 300 * catastrophic error.</p> 301 * 302 * <p>Note that it is often more convenient to construct a RegexMatcher directly 303 * from a pattern string instead of than separately compiling the pattern and 304 * then creating a RegexMatcher object from the pattern.</p> 305 * 306 * @param regex The regular expression to be compiled. Note, the text referred 307 * to by this UText must not be deleted during the lifetime of the 308 * RegexPattern object or any RegexMatcher object created from it. 309 * @param flags The match mode flags to be used. 310 * @param status A reference to a UErrorCode to receive any errors. 311 * @return A regexPattern object for the compiled pattern. 
312 * 313 * @stable ICU 4.6 314 */ 315 static RegexPattern * U_EXPORT2 compile( UText *regex, 316 uint32_t flags, 317 UErrorCode &status); 318 319 /** 320 * Get the match mode flags that were used when compiling this pattern. 321 * @return the match mode flags 322 * @stable ICU 2.4 323 */ 324 virtual uint32_t flags() const; 325 326 /** 327 * Creates a RegexMatcher that will match the given input against this pattern. The 328 * RegexMatcher can then be used to perform match, find or replace operations 329 * on the input. Note that a RegexPattern object must not be deleted while 330 * RegexMatchers created from it still exist and might possibly be used again. 331 * <p> 332 * The matcher will retain a reference to the supplied input string, and all regexp 333 * pattern matching operations happen directly on this original string. It is 334 * critical that the string not be altered or deleted before use by the regular 335 * expression operations is complete. 336 * 337 * @param input The input string to which the regular expression will be applied. 338 * @param status A reference to a UErrorCode to receive any errors. 339 * @return A RegexMatcher object for this pattern and input. 340 * 341 * @stable ICU 2.4 342 */ 343 virtual RegexMatcher *matcher(const UnicodeString &input, 344 UErrorCode &status) const; 345 346private: 347 /** 348 * Cause a compilation error if an application accidentally attempts to 349 * create a matcher with a (UChar *) string as input rather than 350 * a UnicodeString. Avoids a dangling reference to a temporary string. 
351 * <p> 352 * To efficiently work with UChar *strings, wrap the data in a UnicodeString 353 * using one of the aliasing constructors, such as 354 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> 355 * or in a UText, using 356 * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code> 357 * 358 */ 359 RegexMatcher *matcher(const UChar *input, 360 UErrorCode &status) const; 361public: 362 363 364 /** 365 * Creates a RegexMatcher that will match against this pattern. The 366 * RegexMatcher can be used to perform match, find or replace operations. 367 * Note that a RegexPattern object must not be deleted while 368 * RegexMatchers created from it still exist and might possibly be used again. 369 * 370 * @param status A reference to a UErrorCode to receive any errors. 371 * @return A RegexMatcher object for this pattern and input. 372 * 373 * @stable ICU 2.6 374 */ 375 virtual RegexMatcher *matcher(UErrorCode &status) const; 376 377 378 /** 379 * Test whether a string matches a regular expression. This convenience function 380 * both compiles the regular expression and applies it in a single operation. 381 * Note that if the same pattern needs to be applied repeatedly, this method will be 382 * less efficient than creating and reusing a RegexMatcher object. 383 * 384 * @param regex The regular expression 385 * @param input The string data to be matched 386 * @param pe Receives the position of any syntax errors within the regular expression 387 * @param status A reference to a UErrorCode to receive any errors. 388 * @return True if the regular expression exactly matches the full input string. 389 * 390 * @stable ICU 2.4 391 */ 392 static UBool U_EXPORT2 matches(const UnicodeString ®ex, 393 const UnicodeString &input, 394 UParseError &pe, 395 UErrorCode &status); 396 397 /** 398 * Test whether a string matches a regular expression. 
This convenience function 399 * both compiles the regular expression and applies it in a single operation. 400 * Note that if the same pattern needs to be applied repeatedly, this method will be 401 * less efficient than creating and reusing a RegexMatcher object. 402 * 403 * @param regex The regular expression 404 * @param input The string data to be matched 405 * @param pe Receives the position of any syntax errors within the regular expression 406 * @param status A reference to a UErrorCode to receive any errors. 407 * @return True if the regular expression exactly matches the full input string. 408 * 409 * @stable ICU 4.6 410 */ 411 static UBool U_EXPORT2 matches(UText *regex, 412 UText *input, 413 UParseError &pe, 414 UErrorCode &status); 415 416 /** 417 * Returns the regular expression from which this pattern was compiled. This method will work 418 * even if the pattern was compiled from a UText. 419 * 420 * Note: If the pattern was originally compiled from a UText, and that UText was modified, 421 * the returned string may no longer reflect the RegexPattern object. 422 * @stable ICU 2.4 423 */ 424 virtual UnicodeString pattern() const; 425 426 427 /** 428 * Returns the regular expression from which this pattern was compiled. This method will work 429 * even if the pattern was compiled from a UnicodeString. 430 * 431 * Note: This is the original input, not a clone. If the pattern was originally compiled from a 432 * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern 433 * object. 434 * 435 * @stable ICU 4.6 436 */ 437 virtual UText *patternText(UErrorCode &status) const; 438 439 440 /** 441 * Split a string into fields. Somewhat like split() from Perl or Java. 442 * Pattern matches identify delimiters that separate the input 443 * into fields. The input data between the delimiters becomes the 444 * fields themselves. 
445 * 446 * If the delimiter pattern includes capture groups, the captured text will 447 * also appear in the destination array of output strings, interspersed 448 * with the fields. This is similar to Perl, but differs from Java, 449 * which ignores the presence of capture groups in the pattern. 450 * 451 * Trailing empty fields will always be returned, assuming sufficient 452 * destination capacity. This differs from the default behavior for Java 453 * and Perl where trailing empty fields are not returned. 454 * 455 * The number of strings produced by the split operation is returned. 456 * This count includes the strings from capture groups in the delimiter pattern. 457 * This behavior differs from Java, which ignores capture groups. 458 * 459 * For the best performance on split() operations, 460 * <code>RegexMatcher::split</code> is preferable to this function 461 * 462 * @param input The string to be split into fields. The field delimiters 463 * match the pattern (in the "this" object) 464 * @param dest An array of UnicodeStrings to receive the results of the split. 465 * This is an array of actual UnicodeString objects, not an 466 * array of pointers to strings. Local (stack based) arrays can 467 * work well here. 468 * @param destCapacity The number of elements in the destination array. 469 * If the number of fields found is less than destCapacity, the 470 * extra strings in the destination array are not altered. 471 * If the number of destination strings is less than the number 472 * of fields, the trailing part of the input string, including any 473 * field delimiters, is placed in the last destination string. 474 * @param status A reference to a UErrorCode to receive any errors. 475 * @return The number of fields into which the input string was split. 
476 * @stable ICU 2.4 477 */ 478 virtual int32_t split(const UnicodeString &input, 479 UnicodeString dest[], 480 int32_t destCapacity, 481 UErrorCode &status) const; 482 483 484 /** 485 * Split a string into fields. Somewhat like split() from Perl or Java. 486 * Pattern matches identify delimiters that separate the input 487 * into fields. The input data between the delimiters becomes the 488 * fields themselves. 489 * 490 * If the delimiter pattern includes capture groups, the captured text will 491 * also appear in the destination array of output strings, interspersed 492 * with the fields. This is similar to Perl, but differs from Java, 493 * which ignores the presence of capture groups in the pattern. 494 * 495 * Trailing empty fields will always be returned, assuming sufficient 496 * destination capacity. This differs from the default behavior for Java 497 * and Perl where trailing empty fields are not returned. 498 * 499 * The number of strings produced by the split operation is returned. 500 * This count includes the strings from capture groups in the delimiter pattern. 501 * This behavior differs from Java, which ignores capture groups. 502 * 503 * For the best performance on split() operations, 504 * <code>RegexMatcher::split</code> is preferable to this function 505 * 506 * @param input The string to be split into fields. The field delimiters 507 * match the pattern (in the "this" object) 508 * @param dest An array of mutable UText structs to receive the results of the split. 509 * If a field is NULL, a new UText is allocated to contain the results for 510 * that field. This new UText is not guaranteed to be mutable. 511 * @param destCapacity The number of elements in the destination array. 512 * If the number of fields found is less than destCapacity, the 513 * extra strings in the destination array are not altered. 
514 * If the number of destination strings is less than the number 515 * of fields, the trailing part of the input string, including any 516 * field delimiters, is placed in the last destination string. 517 * @param status A reference to a UErrorCode to receive any errors. 518 * @return The number of destination strings used. 519 * 520 * @stable ICU 4.6 521 */ 522 virtual int32_t split(UText *input, 523 UText *dest[], 524 int32_t destCapacity, 525 UErrorCode &status) const; 526 527 528 /** 529 * ICU "poor man's RTTI", returns a UClassID for the actual class. 530 * 531 * @stable ICU 2.4 532 */ 533 virtual UClassID getDynamicClassID() const; 534 535 /** 536 * ICU "poor man's RTTI", returns a UClassID for this class. 537 * 538 * @stable ICU 2.4 539 */ 540 static UClassID U_EXPORT2 getStaticClassID(); 541 542private: 543 // 544 // Implementation Data 545 // 546 UText *fPattern; // The original pattern string. 547 UnicodeString *fPatternString; // The original pattern UncodeString if relevant 548 uint32_t fFlags; // The flags used when compiling the pattern. 549 // 550 UVector64 *fCompiledPat; // The compiled pattern p-code. 551 UnicodeString fLiteralText; // Any literal string data from the pattern, 552 // after un-escaping, for use during the match. 553 554 UVector *fSets; // Any UnicodeSets referenced from the pattern. 555 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 556 557 558 UErrorCode fDeferredStatus; // status if some prior error has left this 559 // RegexPattern in an unusable state. 560 561 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 562 // >= this value. For some patterns, this calculated 563 // value may be less than the true shortest 564 // possible match. 565 566 int32_t fFrameSize; // Size of a state stack frame in the 567 // execution engine. 568 569 int32_t fDataSize; // The size of the data needed by the pattern that 570 // does not go on the state stack, but has just 571 // a single copy per matcher. 
572 573 UVector32 *fGroupMap; // Map from capture group number to position of 574 // the group's variables in the matcher stack frame. 575 576 int32_t fMaxCaptureDigits; 577 578 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 579 // regex character classes, e.g. Word. 580 581 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 582 // sets for predefined regex classes. 583 584 int32_t fStartType; // Info on how a match must start. 585 int32_t fInitialStringIdx; // 586 int32_t fInitialStringLen; 587 UnicodeSet *fInitialChars; 588 UChar32 fInitialChar; 589 Regex8BitSet *fInitialChars8; 590 UBool fNeedsAltInput; 591 592 friend class RegexCompile; 593 friend class RegexMatcher; 594 friend class RegexCImpl; 595 596 // 597 // Implementation Methods 598 // 599 void init(); // Common initialization, for use by constructors. 600 void zap(); // Common cleanup 601 602 void dumpOp(int32_t index) const; 603 604 public: 605#ifndef U_HIDE_INTERNAL_API 606 /** 607 * Dump a compiled pattern. Internal debug function. 608 * @internal 609 */ 610 void dumpPattern() const; 611#endif 612}; 613 614 615 616/** 617 * class RegexMatcher bundles together a regular expression pattern and 618 * input text to which the expression can be applied. It includes methods 619 * for testing for matches, and for find and replace operations. 620 * 621 * <p>Class RegexMatcher is not intended to be subclassed.</p> 622 * 623 * @stable ICU 2.4 624 */ 625class U_I18N_API RegexMatcher: public UObject { 626public: 627 628 /** 629 * Construct a RegexMatcher for a regular expression. 630 * This is a convenience method that avoids the need to explicitly create 631 * a RegexPattern object. Note that if several RegexMatchers need to be 632 * created for the same expression, it will be more efficient to 633 * separately create and cache a RegexPattern object, and use 634 * its matcher() method to create the RegexMatcher objects. 
635 * 636 * @param regexp The Regular Expression to be compiled. 637 * @param flags Regular expression options, such as case insensitive matching. 638 * @see UREGEX_CASE_INSENSITIVE 639 * @param status Any errors are reported by setting this UErrorCode variable. 640 * @stable ICU 2.6 641 */ 642 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); 643 644 /** 645 * Construct a RegexMatcher for a regular expression. 646 * This is a convenience method that avoids the need to explicitly create 647 * a RegexPattern object. Note that if several RegexMatchers need to be 648 * created for the same expression, it will be more efficient to 649 * separately create and cache a RegexPattern object, and use 650 * its matcher() method to create the RegexMatcher objects. 651 * 652 * @param regexp The regular expression to be compiled. 653 * @param flags Regular expression options, such as case insensitive matching. 654 * @see UREGEX_CASE_INSENSITIVE 655 * @param status Any errors are reported by setting this UErrorCode variable. 656 * 657 * @stable ICU 4.6 658 */ 659 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); 660 661 /** 662 * Construct a RegexMatcher for a regular expression. 663 * This is a convenience method that avoids the need to explicitly create 664 * a RegexPattern object. Note that if several RegexMatchers need to be 665 * created for the same expression, it will be more efficient to 666 * separately create and cache a RegexPattern object, and use 667 * its matcher() method to create the RegexMatcher objects. 668 * <p> 669 * The matcher will retain a reference to the supplied input string, and all regexp 670 * pattern matching operations happen directly on the original string. It is 671 * critical that the string not be altered or deleted before use by the regular 672 * expression operations is complete. 673 * 674 * @param regexp The Regular Expression to be compiled. 675 * @param input The string to match. 
The matcher retains a reference to the 676 * caller's string; mo copy is made. 677 * @param flags Regular expression options, such as case insensitive matching. 678 * @see UREGEX_CASE_INSENSITIVE 679 * @param status Any errors are reported by setting this UErrorCode variable. 680 * @stable ICU 2.6 681 */ 682 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 683 uint32_t flags, UErrorCode &status); 684 685 /** 686 * Construct a RegexMatcher for a regular expression. 687 * This is a convenience method that avoids the need to explicitly create 688 * a RegexPattern object. Note that if several RegexMatchers need to be 689 * created for the same expression, it will be more efficient to 690 * separately create and cache a RegexPattern object, and use 691 * its matcher() method to create the RegexMatcher objects. 692 * <p> 693 * The matcher will make a shallow clone of the supplied input text, and all regexp 694 * pattern matching operations happen on this clone. While read-only operations on 695 * the supplied text are permitted, it is critical that the underlying string not be 696 * altered or deleted before use by the regular expression operations is complete. 697 * 698 * @param regexp The Regular Expression to be compiled. 699 * @param input The string to match. The matcher retains a shallow clone of the text. 700 * @param flags Regular expression options, such as case insensitive matching. 701 * @see UREGEX_CASE_INSENSITIVE 702 * @param status Any errors are reported by setting this UErrorCode variable. 703 * 704 * @stable ICU 4.6 705 */ 706 RegexMatcher(UText *regexp, UText *input, 707 uint32_t flags, UErrorCode &status); 708 709private: 710 /** 711 * Cause a compilation error if an application accidentally attempts to 712 * create a matcher with a (UChar *) string as input rather than 713 * a UnicodeString. Avoids a dangling reference to a temporary string. 
714 * <p> 715 * To efficiently work with UChar *strings, wrap the data in a UnicodeString 716 * using one of the aliasing constructors, such as 717 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> 718 * or in a UText, using 719 * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code> 720 * 721 */ 722 RegexMatcher(const UnicodeString ®exp, const UChar *input, 723 uint32_t flags, UErrorCode &status); 724public: 725 726 727 /** 728 * Destructor. 729 * 730 * @stable ICU 2.4 731 */ 732 virtual ~RegexMatcher(); 733 734 735 /** 736 * Attempts to match the entire input region against the pattern. 737 * @param status A reference to a UErrorCode to receive any errors. 738 * @return TRUE if there is a match 739 * @stable ICU 2.4 740 */ 741 virtual UBool matches(UErrorCode &status); 742 743 744 /** 745 * Resets the matcher, then attempts to match the input beginning 746 * at the specified startIndex, and extending to the end of the input. 747 * The input region is reset to include the entire input string. 748 * A successful match must extend to the end of the input. 749 * @param startIndex The input string (native) index at which to begin matching. 750 * @param status A reference to a UErrorCode to receive any errors. 751 * @return TRUE if there is a match 752 * @stable ICU 2.8 753 */ 754 virtual UBool matches(int64_t startIndex, UErrorCode &status); 755 756 757 /** 758 * Attempts to match the input string, starting from the beginning of the region, 759 * against the pattern. Like the matches() method, this function 760 * always starts at the beginning of the input region; 761 * unlike that function, it does not require that the entire region be matched. 
762 * 763 * <p>If the match succeeds then more information can be obtained via the <code>start()</code>, 764 * <code>end()</code>, and <code>group()</code> functions.</p> 765 * 766 * @param status A reference to a UErrorCode to receive any errors. 767 * @return TRUE if there is a match at the start of the input string. 768 * @stable ICU 2.4 769 */ 770 virtual UBool lookingAt(UErrorCode &status); 771 772 773 /** 774 * Attempts to match the input string, starting from the specified index, against the pattern. 775 * The match may be of any length, and is not required to extend to the end 776 * of the input string. Contrast with match(). 777 * 778 * <p>If the match succeeds then more information can be obtained via the <code>start()</code>, 779 * <code>end()</code>, and <code>group()</code> functions.</p> 780 * 781 * @param startIndex The input string (native) index at which to begin matching. 782 * @param status A reference to a UErrorCode to receive any errors. 783 * @return TRUE if there is a match. 784 * @stable ICU 2.8 785 */ 786 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status); 787 788 789 /** 790 * Find the next pattern match in the input string. 791 * The find begins searching the input at the location following the end of 792 * the previous match, or at the start of the string if there is no previous match. 793 * If a match is found, <code>start(), end()</code> and <code>group()</code> 794 * will provide more information regarding the match. 795 * <p>Note that if the input string is changed by the application, 796 * use find(startPos, status) instead of find(), because the saved starting 797 * position may not be valid with the altered input string.</p> 798 * @return TRUE if a match is found. 799 * @stable ICU 2.4 800 */ 801 virtual UBool find(); 802 803 804 /** 805 * Resets this RegexMatcher and then attempts to find the next substring of the 806 * input string that matches the pattern, starting at the specified index. 
807 * 808 * @param start The (native) index in the input string to begin the search. 809 * @param status A reference to a UErrorCode to receive any errors. 810 * @return TRUE if a match is found. 811 * @stable ICU 2.4 812 */ 813 virtual UBool find(int64_t start, UErrorCode &status); 814 815 816 /** 817 * Returns a string containing the text matched by the previous match. 818 * If the pattern can match an empty string, an empty string may be returned. 819 * @param status A reference to a UErrorCode to receive any errors. 820 * Possible errors are U_REGEX_INVALID_STATE if no match 821 * has been attempted or the last match failed. 822 * @return a string containing the matched input text. 823 * @stable ICU 2.4 824 */ 825 virtual UnicodeString group(UErrorCode &status) const; 826 827 828 /** 829 * Returns a string containing the text captured by the given group 830 * during the previous match operation. Group(0) is the entire match. 831 * 832 * @param groupNum the capture group number 833 * @param status A reference to a UErrorCode to receive any errors. 834 * Possible errors are U_REGEX_INVALID_STATE if no match 835 * has been attempted or the last match failed and 836 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 837 * @return the captured text 838 * @stable ICU 2.4 839 */ 840 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 841 842 843 /** 844 * Returns the number of capturing groups in this matcher's pattern. 845 * @return the number of capture groups 846 * @stable ICU 2.4 847 */ 848 virtual int32_t groupCount() const; 849 850 851 /** 852 * Returns a shallow clone of the entire live input string with the UText current native index 853 * set to the beginning of the requested group. 
854 * 855 * @param dest The UText into which the input should be cloned, or NULL to create a new UText 856 * @param group_len A reference to receive the length of the desired capture group 857 * @param status A reference to a UErrorCode to receive any errors. 858 * Possible errors are U_REGEX_INVALID_STATE if no match 859 * has been attempted or the last match failed and 860 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 861 * @return dest if non-NULL, a shallow copy of the input text otherwise 862 * 863 * @stable ICU 4.6 864 */ 865 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 866 867 /** 868 * Returns a shallow clone of the entire live input string with the UText current native index 869 * set to the beginning of the requested group. 870 * 871 * @param groupNum The capture group number. 872 * @param dest The UText into which the input should be cloned, or NULL to create a new UText. 873 * @param group_len A reference to receive the length of the desired capture group 874 * @param status A reference to a UErrorCode to receive any errors. 875 * Possible errors are U_REGEX_INVALID_STATE if no match 876 * has been attempted or the last match failed and 877 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 878 * @return dest if non-NULL, a shallow copy of the input text otherwise 879 * 880 * @stable ICU 4.6 881 */ 882 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const; 883 884 /** 885 * Returns a string containing the text captured by the given group 886 * during the previous match operation. Group(0) is the entire match. 887 * 888 * @param groupNum the capture group number 889 * @param dest A mutable UText in which the matching text is placed. 890 * If NULL, a new UText will be created (which may not be mutable). 891 * @param status A reference to a UErrorCode to receive any errors. 
892 * Possible errors are U_REGEX_INVALID_STATE if no match 893 * has been attempted or the last match failed. 894 * @return A string containing the matched input text. If a pre-allocated UText 895 * was provided, it will always be used and returned. 896 * 897 * @internal ICU 4.4 technology preview 898 */ 899 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; 900 901 902 /** 903 * Returns the index in the input string of the start of the text matched 904 * during the previous match operation. 905 * @param status a reference to a UErrorCode to receive any errors. 906 * @return The (native) position in the input string of the start of the last match. 907 * @stable ICU 2.4 908 */ 909 virtual int32_t start(UErrorCode &status) const; 910 911 /** 912 * Returns the index in the input string of the start of the text matched 913 * during the previous match operation. 914 * @param status a reference to a UErrorCode to receive any errors. 915 * @return The (native) position in the input string of the start of the last match. 916 * @stable ICU 4.6 917 */ 918 virtual int64_t start64(UErrorCode &status) const; 919 920 921 /** 922 * Returns the index in the input string of the start of the text matched by the 923 * specified capture group during the previous match operation. Return -1 if 924 * the capture group exists in the pattern, but was not part of the last match. 925 * 926 * @param group the capture group number 927 * @param status A reference to a UErrorCode to receive any errors. Possible 928 * errors are U_REGEX_INVALID_STATE if no match has been 929 * attempted or the last match failed, and 930 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number 931 * @return the (native) start position of substring matched by the specified group. 
932 * @stable ICU 2.4 933 */ 934 virtual int32_t start(int32_t group, UErrorCode &status) const; 935 936 /** 937 * Returns the index in the input string of the start of the text matched by the 938 * specified capture group during the previous match operation. Return -1 if 939 * the capture group exists in the pattern, but was not part of the last match. 940 * 941 * @param group the capture group number. 942 * @param status A reference to a UErrorCode to receive any errors. Possible 943 * errors are U_REGEX_INVALID_STATE if no match has been 944 * attempted or the last match failed, and 945 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 946 * @return the (native) start position of substring matched by the specified group. 947 * @stable ICU 4.6 948 */ 949 virtual int64_t start64(int32_t group, UErrorCode &status) const; 950 951 952 /** 953 * Returns the index in the input string of the first character following the 954 * text matched during the previous match operation. 955 * 956 * @param status A reference to a UErrorCode to receive any errors. Possible 957 * errors are U_REGEX_INVALID_STATE if no match has been 958 * attempted or the last match failed. 959 * @return the index of the last character matched, plus one. 960 * The index value returned is a native index, corresponding to 961 * code units for the underlying encoding type, for example, 962 * a byte index for UTF-8. 963 * @stable ICU 2.4 964 */ 965 virtual int32_t end(UErrorCode &status) const; 966 967 /** 968 * Returns the index in the input string of the first character following the 969 * text matched during the previous match operation. 970 * 971 * @param status A reference to a UErrorCode to receive any errors. Possible 972 * errors are U_REGEX_INVALID_STATE if no match has been 973 * attempted or the last match failed. 974 * @return the index of the last character matched, plus one. 
975 * The index value returned is a native index, corresponding to 976 * code units for the underlying encoding type, for example, 977 * a byte index for UTF-8. 978 * @stable ICU 4.6 979 */ 980 virtual int64_t end64(UErrorCode &status) const; 981 982 983 /** 984 * Returns the index in the input string of the character following the 985 * text matched by the specified capture group during the previous match operation. 986 * 987 * @param group the capture group number 988 * @param status A reference to a UErrorCode to receive any errors. Possible 989 * errors are U_REGEX_INVALID_STATE if no match has been 990 * attempted or the last match failed and 991 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number 992 * @return the index of the first character following the text 993 * captured by the specified group during the previous match operation. 994 * Return -1 if the capture group exists in the pattern but was not part of the match. 995 * The index value returned is a native index, corresponding to 996 * code units for the underlying encoding type, for example, 997 * a byte index for UTF8. 998 * @stable ICU 2.4 999 */ 1000 virtual int32_t end(int32_t group, UErrorCode &status) const; 1001 1002 /** 1003 * Returns the index in the input string of the character following the 1004 * text matched by the specified capture group during the previous match operation. 1005 * 1006 * @param group the capture group number 1007 * @param status A reference to a UErrorCode to receive any errors. Possible 1008 * errors are U_REGEX_INVALID_STATE if no match has been 1009 * attempted or the last match failed and 1010 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number 1011 * @return the index of the first character following the text 1012 * captured by the specified group during the previous match operation. 1013 * Return -1 if the capture group exists in the pattern but was not part of the match. 
1014 * The index value returned is a native index, corresponding to 1015 * code units for the underlying encoding type, for example, 1016 * a byte index for UTF8. 1017 * @stable ICU 4.6 1018 */ 1019 virtual int64_t end64(int32_t group, UErrorCode &status) const; 1020 1021 1022 /** 1023 * Resets this matcher. The effect is to remove any memory of previous matches, 1024 * and to cause subsequent find() operations to begin at the beginning of 1025 * the input string. 1026 * 1027 * @return this RegexMatcher. 1028 * @stable ICU 2.4 1029 */ 1030 virtual RegexMatcher &reset(); 1031 1032 1033 /** 1034 * Resets this matcher, and sets the current input position. 1035 * The effect is to remove any memory of previous matches, 1036 * and to cause subsequent find() operations to begin at 1037 * the specified (native) position in the input string. 1038 * <p> 1039 * The matcher's region is reset to its default, which is the entire 1040 * input string. 1041 * <p> 1042 * An alternative to this function is to set a match region 1043 * beginning at the desired index. 1044 * 1045 * @param index The (native) position in the input string at which subsequent find() operations will begin. * @param status A reference to a UErrorCode to receive any errors. * @return this RegexMatcher. 1046 * @stable ICU 2.8 1047 */ 1048 virtual RegexMatcher &reset(int64_t index, UErrorCode &status); 1049 1050 1051 /** 1052 * Resets this matcher with a new input string. This allows instances of RegexMatcher 1053 * to be reused, which is more efficient than creating a new RegexMatcher for 1054 * each input string to be processed. 1055 * @param input The new string on which subsequent pattern matches will operate. 1056 * The matcher retains a reference to the caller's string, and operates 1057 * directly on that. Ownership of the string remains with the caller. 1058 * Because no copy of the string is made, it is essential that the 1059 * caller not delete the string until after regexp operations on it 1060 * are done.
1061 * Note that while a reset on the matcher with an input string that is then 1062 * modified across/during matcher operations may be supported currently for UnicodeString, 1063 * this was not originally intended behavior, and support for this is not guaranteed 1064 * in upcoming versions of ICU. 1065 * @return this RegexMatcher. 1066 * @stable ICU 2.4 1067 */ 1068 virtual RegexMatcher &reset(const UnicodeString &input); 1069 1070 1071 /** 1072 * Resets this matcher with a new input string. This allows instances of RegexMatcher 1073 * to be reused, which is more efficient than creating a new RegexMatcher for 1074 * each input string to be processed. 1075 * @param input The new string on which subsequent pattern matches will operate. 1076 * The matcher makes a shallow clone of the given text; ownership of the 1077 * original string remains with the caller. Because no deep copy of the 1078 * text is made, it is essential that the caller not modify the string 1079 * until after regexp operations on it are done. 1080 * @return this RegexMatcher. 1081 * 1082 * @stable ICU 4.6 1083 */ 1084 virtual RegexMatcher &reset(UText *input); 1085 1086 1087 /** 1088 * Set the subject text string upon which the regular expression is looking for matches 1089 * without changing any other aspect of the matching state. 1090 * The new and previous text strings must have the same content. 1091 * 1092 * This function is intended for use in environments where ICU is operating on 1093 * strings that may move around in memory. It provides a mechanism for notifying 1094 * ICU that the string has been relocated, and providing a new UText to access the 1095 * string in its new position. 1096 * 1097 * Note that the regular expression implementation never copies the underlying text 1098 * of a string being matched, but always operates directly on the original text 1099 * provided by the user. 
Refreshing simply drops the references to the old text 1100 * and replaces them with references to the new. 1101 * 1102 * Caution: this function is normally used only by very specialized, 1103 * system-level code. One example use case is with garbage collection that moves 1104 * the text in memory. 1105 * 1106 * @param input The new (moved) text string. 1107 * @param status Receives errors detected by this function. 1108 * 1109 * @stable ICU 4.8 1110 */ 1111 virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status); 1112 1113private: 1114 /** 1115 * Cause a compilation error if an application accidentally attempts to 1116 * reset a matcher with a (UChar *) string as input rather than 1117 * a UnicodeString. Avoids a dangling reference to a temporary string. 1118 * <p> 1119 * To efficiently work with UChar *strings, wrap the data in a UnicodeString 1120 * using one of the aliasing constructors, such as 1121 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> 1122 * or in a UText, using 1123 * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code> 1124 * 1125 */ 1126 RegexMatcher &reset(const UChar *input); 1127public: 1128 1129 /** 1130 * Returns the input string being matched. Ownership of the string belongs to 1131 * the matcher; it should not be altered or deleted. This method will work even if the input 1132 * was originally supplied as a UText. 1133 * @return the input string 1134 * @stable ICU 2.4 1135 */ 1136 virtual const UnicodeString &input() const; 1137 1138 /** 1139 * Returns the input string being matched. This is the live input text; it should not be 1140 * altered or deleted. This method will work even if the input was originally supplied as 1141 * a UnicodeString. 
1142 * @return the input text 1143 * 1144 * @stable ICU 4.6 1145 */ 1146 virtual UText *inputText() const; 1147 1148 /** 1149 * Returns the input string being matched, either by copying it into the provided 1150 * UText parameter or by returning a shallow clone of the live input. Note that copying 1151 * the entire input may cause significant performance and memory issues. 1152 * @param dest The UText into which the input should be copied, or NULL to create a new UText 1153 * @param status error code 1154 * @return dest if non-NULL, a shallow copy of the input text otherwise 1155 * 1156 * @stable ICU 4.6 1157 */ 1158 virtual UText *getInput(UText *dest, UErrorCode &status) const; 1159 1160 1161 /** Sets the limits of this matcher's region. 1162 * The region is the part of the input string that will be searched to find a match. 1163 * Invoking this method resets the matcher, and then sets the region to start 1164 * at the index specified by the start parameter and end at the index specified 1165 * by the end parameter. 1166 * 1167 * Depending on the transparency and anchoring being used (see useTransparentBounds 1168 * and useAnchoringBounds), certain constructs such as anchors may behave differently 1169 * at or around the boundaries of the region 1170 * 1171 * The function will fail if start is greater than limit, or if either index 1172 * is less than zero or greater than the length of the string being matched. 1173 * 1174 * @param start The (native) index to begin searches at. 1175 * @param limit The index to end searches at (exclusive). 1176 * @param status A reference to a UErrorCode to receive any errors. 1177 * @stable ICU 4.0 1178 */ 1179 virtual RegexMatcher ®ion(int64_t start, int64_t limit, UErrorCode &status); 1180 1181 /** 1182 * Identical to region(start, limit, status) but also allows a start position without 1183 * resetting the region state. 
1184 * @param regionStart The region start 1185 * @param regionLimit the limit of the region 1186 * @param startIndex The (native) index within the region bounds at which to begin searches. 1187 * @param status A reference to a UErrorCode to receive any errors. 1188 * If startIndex is not within the specified region bounds, 1189 * U_INDEX_OUTOFBOUNDS_ERROR is returned. 1190 * @stable ICU 4.6 1191 */ 1192 virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status); 1193 1194 /** 1195 * Reports the start index of this matcher's region. The searches this matcher 1196 * conducts are limited to finding matches within regionStart (inclusive) and 1197 * regionEnd (exclusive). 1198 * 1199 * @return The starting (native) index of this matcher's region. 1200 * @stable ICU 4.0 1201 */ 1202 virtual int32_t regionStart() const; 1203 1204 /** 1205 * Reports the start index of this matcher's region. The searches this matcher 1206 * conducts are limited to finding matches within regionStart (inclusive) and 1207 * regionEnd (exclusive). 1208 * 1209 * @return The starting (native) index of this matcher's region. 1210 * @stable ICU 4.6 1211 */ 1212 virtual int64_t regionStart64() const; 1213 1214 1215 /** 1216 * Reports the end (limit) index (exclusive) of this matcher's region. The searches 1217 * this matcher conducts are limited to finding matches within regionStart 1218 * (inclusive) and regionEnd (exclusive). 1219 * 1220 * @return The ending point (native) of this matcher's region. 1221 * @stable ICU 4.0 1222 */ 1223 virtual int32_t regionEnd() const; 1224 1225 /** 1226 * Reports the end (limit) index (exclusive) of this matcher's region. The searches 1227 * this matcher conducts are limited to finding matches within regionStart 1228 * (inclusive) and regionEnd (exclusive). 1229 * 1230 * @return The ending point (native) of this matcher's region.
1231 * @stable ICU 4.6 1232 */ 1233 virtual int64_t regionEnd64() const; 1234 1235 /** 1236 * Queries the transparency of region bounds for this matcher. 1237 * See useTransparentBounds for a description of transparent and opaque bounds. 1238 * By default, a matcher uses opaque region boundaries. 1239 * 1240 * @return TRUE if this matcher is using transparent bounds, FALSE if it is not. 1241 * @stable ICU 4.0 1242 */ 1243 virtual UBool hasTransparentBounds() const; 1244 1245 /** 1246 * Sets the transparency of region bounds for this matcher. 1247 * Invoking this function with an argument of true will set this matcher to use transparent bounds. 1248 * If the boolean argument is false, then opaque bounds will be used. 1249 * 1250 * Using transparent bounds, the boundaries of this matcher's region are transparent 1251 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 1252 * see text beyond the boundaries of the region while checking for a match. 1253 * 1254 * With opaque bounds, no text outside of the matcher's region is visible to lookahead, 1255 * lookbehind, and boundary matching constructs. 1256 * 1257 * By default, a matcher uses opaque bounds. 1258 * 1259 * @param b TRUE for transparent bounds; FALSE for opaque bounds 1260 * @return This Matcher 1261 * @stable ICU 4.0 1262 **/ 1263 virtual RegexMatcher &useTransparentBounds(UBool b); 1264 1265 1266 /** 1267 * Return true if this matcher is using anchoring bounds. 1268 * By default, matchers use anchoring region bounds. 1269 * 1270 * @return TRUE if this matcher is using anchoring bounds. 1271 * @stable ICU 4.0 1272 */ 1273 virtual UBool hasAnchoringBounds() const; 1274 1275 1276 /** 1277 * Set whether this matcher is using Anchoring Bounds for its region. 1278 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 1279 * and end of the region. Without Anchoring Bounds, anchors will only match at 1280 * the positions they would in the complete text.
1281 * 1282 * Anchoring Bounds are the default for regions. 1283 * 1284 * @param b TRUE if to enable anchoring bounds; FALSE to disable them. 1285 * @return This Matcher 1286 * @stable ICU 4.0 1287 */ 1288 virtual RegexMatcher &useAnchoringBounds(UBool b); 1289 1290 1291 /** 1292 * Return TRUE if the most recent matching operation attempted to access 1293 * additional input beyond the available input text. 1294 * In this case, additional input text could change the results of the match. 1295 * 1296 * hitEnd() is defined for both successful and unsuccessful matches. 1297 * In either case hitEnd() will return TRUE if the end of the text was 1298 * reached at any point during the matching process. 1299 * 1300 * @return TRUE if the most recent match hit the end of input 1301 * @stable ICU 4.0 1302 */ 1303 virtual UBool hitEnd() const; 1304 1305 /** 1306 * Return TRUE if the most recent match succeeded and additional input could cause 1307 * it to fail. If this method returns false and a match was found, then more input 1308 * might change the match but the match won't be lost. If a match was not found, 1309 * then requireEnd has no meaning. 1310 * 1311 * @return TRUE if more input could cause the most recent match to no longer match. 1312 * @stable ICU 4.0 1313 */ 1314 virtual UBool requireEnd() const; 1315 1316 1317 /** 1318 * Returns the pattern that is interpreted by this matcher. 1319 * @return the RegexPattern for this RegexMatcher 1320 * @stable ICU 2.4 1321 */ 1322 virtual const RegexPattern &pattern() const; 1323 1324 1325 /** 1326 * Replaces every substring of the input that matches the pattern 1327 * with the given replacement string. This is a convenience function that 1328 * provides a complete find-and-replace-all operation. 1329 * 1330 * This method first resets this matcher. It then scans the input string 1331 * looking for matches of the pattern.
Input that is not part of any 1332 * match is left unchanged; each match is replaced in the result by the 1333 * replacement string. The replacement string may contain references to 1334 * capture groups. 1335 * 1336 * @param replacement a string containing the replacement text. 1337 * @param status a reference to a UErrorCode to receive any errors. 1338 * @return a string containing the results of the find and replace. 1339 * @stable ICU 2.4 1340 */ 1341 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 1342 1343 1344 /** 1345 * Replaces every substring of the input that matches the pattern 1346 * with the given replacement string. This is a convenience function that 1347 * provides a complete find-and-replace-all operation. 1348 * 1349 * This method first resets this matcher. It then scans the input string 1350 * looking for matches of the pattern. Input that is not part of any 1351 * match is left unchanged; each match is replaced in the result by the 1352 * replacement string. The replacement string may contain references to 1353 * capture groups. 1354 * 1355 * @param replacement a string containing the replacement text. 1356 * @param dest a mutable UText in which the results are placed. 1357 * If NULL, a new UText will be created (which may not be mutable). 1358 * @param status a reference to a UErrorCode to receive any errors. 1359 * @return a string containing the results of the find and replace. 1360 * If a pre-allocated UText was provided, it will always be used and returned. 1361 * 1362 * @stable ICU 4.6 1363 */ 1364 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); 1365 1366 1367 /** 1368 * Replaces the first substring of the input that matches 1369 * the pattern with the replacement string. This is a convenience 1370 * function that provides a complete find-and-replace operation. 1371 * 1372 * <p>This function first resets this RegexMatcher. 
It then scans the input string 1373 * looking for a match of the pattern. Input that is not part 1374 * of the match is appended directly to the result string; the match is replaced 1375 * in the result by the replacement string. The replacement string may contain 1376 * references to captured groups.</p> 1377 * 1378 * <p>The state of the matcher (the position at which a subsequent find() 1379 * would begin) after completing a replaceFirst() is not specified. The 1380 * RegexMatcher should be reset before doing additional find() operations.</p> 1381 * 1382 * @param replacement a string containing the replacement text. 1383 * @param status a reference to a UErrorCode to receive any errors. 1384 * @return a string containing the results of the find and replace. 1385 * @stable ICU 2.4 1386 */ 1387 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 1388 1389 1390 /** 1391 * Replaces the first substring of the input that matches 1392 * the pattern with the replacement string. This is a convenience 1393 * function that provides a complete find-and-replace operation. 1394 * 1395 * <p>This function first resets this RegexMatcher. It then scans the input string 1396 * looking for a match of the pattern. Input that is not part 1397 * of the match is appended directly to the result string; the match is replaced 1398 * in the result by the replacement string. The replacement string may contain 1399 * references to captured groups.</p> 1400 * 1401 * <p>The state of the matcher (the position at which a subsequent find() 1402 * would begin) after completing a replaceFirst() is not specified. The 1403 * RegexMatcher should be reset before doing additional find() operations.</p> 1404 * 1405 * @param replacement a string containing the replacement text. 1406 * @param dest a mutable UText in which the results are placed. 1407 * If NULL, a new UText will be created (which may not be mutable). 
1408 * @param status a reference to a UErrorCode to receive any errors. 1409 * @return a string containing the results of the find and replace. 1410 * If a pre-allocated UText was provided, it will always be used and returned. 1411 * 1412 * @stable ICU 4.6 1413 */ 1414 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); 1415 1416 1417 /** 1418 * Implements a replace operation intended to be used as part of an 1419 * incremental find-and-replace. 1420 * 1421 * <p>The input string, starting from the end of the previous replacement and ending at 1422 * the start of the current match, is appended to the destination string. Then the 1423 * replacement string is appended to the output string, 1424 * including handling any substitutions of captured text.</p> 1425 * 1426 * <p>For simple, prepackaged, non-incremental find-and-replace 1427 * operations, see replaceFirst() or replaceAll().</p> 1428 * 1429 * @param dest A UnicodeString to which the results of the find-and-replace are appended. 1430 * @param replacement A UnicodeString that provides the text to be substituted for 1431 * the input text that matched the regexp pattern. The replacement 1432 * text may contain references to captured text from the 1433 * input. 1434 * @param status A reference to a UErrorCode to receive any errors. Possible 1435 * errors are U_REGEX_INVALID_STATE if no match has been 1436 * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR 1437 * if the replacement text specifies a capture group that 1438 * does not exist in the pattern. 1439 * 1440 * @return this RegexMatcher 1441 * @stable ICU 2.4 1442 * 1443 */ 1444 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 1445 const UnicodeString &replacement, UErrorCode &status); 1446 1447 1448 /** 1449 * Implements a replace operation intended to be used as part of an 1450 * incremental find-and-replace. 
1451 * 1452 * <p>The input string, starting from the end of the previous replacement and ending at 1453 * the start of the current match, is appended to the destination string. Then the 1454 * replacement string is appended to the output string, 1455 * including handling any substitutions of captured text.</p> 1456 * 1457 * <p>For simple, prepackaged, non-incremental find-and-replace 1458 * operations, see replaceFirst() or replaceAll().</p> 1459 * 1460 * @param dest A mutable UText to which the results of the find-and-replace are appended. 1461 * Must not be NULL. 1462 * @param replacement A UText that provides the text to be substituted for 1463 * the input text that matched the regexp pattern. The replacement 1464 * text may contain references to captured text from the input. 1465 * @param status A reference to a UErrorCode to receive any errors. Possible 1466 * errors are U_REGEX_INVALID_STATE if no match has been 1467 * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR 1468 * if the replacement text specifies a capture group that 1469 * does not exist in the pattern. 1470 * 1471 * @return this RegexMatcher 1472 * 1473 * @stable ICU 4.6 1474 */ 1475 virtual RegexMatcher &appendReplacement(UText *dest, 1476 UText *replacement, UErrorCode &status); 1477 1478 1479 /** 1480 * As the final step in a find-and-replace operation, append the remainder 1481 * of the input string, starting at the position following the last appendReplacement(), 1482 * to the destination string. <code>appendTail()</code> is intended to be invoked after one 1483 * or more invocations of the <code>RegexMatcher::appendReplacement()</code>. 1484 * 1485 * @param dest A UnicodeString to which the results of the find-and-replace are appended. 1486 * @return the destination string. 
1487 * @stable ICU 2.4 1488 */ 1489 virtual UnicodeString &appendTail(UnicodeString &dest); 1490 1491 1492 /** 1493 * As the final step in a find-and-replace operation, append the remainder 1494 * of the input string, starting at the position following the last appendReplacement(), 1495 * to the destination string. <code>appendTail()</code> is intended to be invoked after one 1496 * or more invocations of the <code>RegexMatcher::appendReplacement()</code>. 1497 * 1498 * @param dest A mutable UText to which the results of the find-and-replace are appended. 1499 * Must not be NULL. 1500 * @param status A reference to a UErrorCode to receive any errors. 1501 * @return the destination string. 1502 * 1503 * @stable ICU 4.6 1504 */ 1505 virtual UText *appendTail(UText *dest, UErrorCode &status); 1506 1507 1508 /** 1509 * Split a string into fields. Somewhat like split() from Perl. 1510 * The pattern matches identify delimiters that separate the input 1511 * into fields. The input data between the matches becomes the 1512 * fields themselves. 1513 * 1514 * @param input The string to be split into fields. The field delimiters 1515 * match the pattern (in the "this" object). This matcher 1516 * will be reset to this input string. 1517 * @param dest An array of UnicodeStrings to receive the results of the split. 1518 * This is an array of actual UnicodeString objects, not an 1519 * array of pointers to strings. Local (stack based) arrays can 1520 * work well here. 1521 * @param destCapacity The number of elements in the destination array. 1522 * If the number of fields found is less than destCapacity, the 1523 * extra strings in the destination array are not altered. 1524 * If the number of destination strings is less than the number 1525 * of fields, the trailing part of the input string, including any 1526 * field delimiters, is placed in the last destination string. 1527 * @param status A reference to a UErrorCode to receive any errors. 1528 * @return The number of fields into which the input string was split.
1529 * @stable ICU 2.6 1530 */ 1531 virtual int32_t split(const UnicodeString &input, 1532 UnicodeString dest[], 1533 int32_t destCapacity, 1534 UErrorCode &status); 1535 1536 1537 /** 1538 * Split a string into fields. Somewhat like split() from Perl. 1539 * The pattern matches identify delimiters that separate the input 1540 * into fields. The input data between the matches becomes the 1541 * fields themselves. 1542 * 1543 * @param input The string to be split into fields. The field delimiters 1544 * match the pattern (in the "this" object). This matcher 1545 * will be reset to this input string. 1546 * @param dest An array of pointers to mutable UText structs to receive the results of the split. 1547 * If an element is NULL, a new UText is allocated to contain the results for 1548 * that field. This new UText is not guaranteed to be mutable. 1549 * @param destCapacity The number of elements in the destination array. 1550 * If the number of fields found is less than destCapacity, the 1551 * extra elements in the destination array are not altered. 1552 * If the number of destination elements is less than the number 1553 * of fields, the trailing part of the input string, including any 1554 * field delimiters, is placed in the last destination element. 1555 * @param status A reference to a UErrorCode to receive any errors. 1556 * @return The number of fields into which the input string was split. 1557 * 1558 * @stable ICU 4.6 1559 */ 1560 virtual int32_t split(UText *input, 1561 UText *dest[], 1562 int32_t destCapacity, 1563 UErrorCode &status); 1564 1565 /** 1566 * Set a processing time limit for match operations with this Matcher. 1567 * 1568 * Some patterns, when matching certain strings, can run in exponential time. 1569 * For practical purposes, the match operation may appear to be in an 1570 * infinite loop. 1571 * When a limit is set, a match operation will fail with an error if the 1572 * limit is exceeded. 
1573 * <p> 1574 * The units of the limit are steps of the match engine. 1575 * Correspondence with actual processor time will depend on the speed 1576 * of the processor and the details of the specific pattern, but will 1577 * typically be on the order of milliseconds. 1578 * <p> 1579 * By default, the matching time is not limited. 1580 * <p> 1581 * 1582 * @param limit The limit value, or 0 for no limit. 1583 * @param status A reference to a UErrorCode to receive any errors. 1584 * @stable ICU 4.0 1585 */ 1586 virtual void setTimeLimit(int32_t limit, UErrorCode &status); 1587 1588 /** 1589 * Get the time limit, if any, for match operations made with this Matcher. 1590 * 1591 * @return the maximum allowed time for a match, in units of processing steps. 1592 * @stable ICU 4.0 1593 */ 1594 virtual int32_t getTimeLimit() const; 1595 1596 /** 1597 * Set the amount of heap storage available for use by the match backtracking stack. 1598 * The matcher is also reset, discarding any results from previous matches. 1599 * <p> 1600 * ICU uses a backtracking regular expression engine, with the backtrack stack 1601 * maintained on the heap. This function sets the limit on the amount of memory 1602 * that can be used for this purpose. A backtracking stack overflow will 1603 * result in an error from the match operation that caused it. 1604 * <p> 1605 * A limit is desirable because a malicious or poorly designed pattern can use 1606 * excessive memory, potentially crashing the process. A limit is enabled 1607 * by default. 1608 * <p> 1609 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1610 * A value of zero means no limit. 1611 * The limit must be greater than or equal to zero. 1612 * 1613 * @param status A reference to a UErrorCode to receive any errors. 1614 * 1615 * @stable ICU 4.0 1616 */ 1617 virtual void setStackLimit(int32_t limit, UErrorCode &status); 1618 1619 /** 1620 * Get the size of the heap storage available for use by the backtracking stack. 
1621 * 1622 * @return the maximum backtracking stack size, in bytes, or zero if the 1623 * stack size is unlimited. 1624 * @stable ICU 4.0 1625 */ 1626 virtual int32_t getStackLimit() const; 1627 1628 1629 /** 1630 * Set a callback function for use with this Matcher. 1631 * During matching operations the function will be called periodically, 1632 * giving the application the opportunity to terminate a long-running 1633 * match. 1634 * 1635 * @param callback A pointer to the user-supplied callback function. 1636 * @param context User context pointer. The value supplied at the 1637 * time the callback function is set will be saved 1638 * and passed to the callback each time that it is called. 1639 * @param status A reference to a UErrorCode to receive any errors. 1640 * @stable ICU 4.0 1641 */ 1642 virtual void setMatchCallback(URegexMatchCallback *callback, 1643 const void *context, 1644 UErrorCode &status); 1645 1646 1647 /** 1648 * Get the callback function for this Matcher. 1649 * 1650 * @param callback Out parameter, receives a pointer to the user-supplied 1651 * callback function. 1652 * @param context Out parameter, receives the user context pointer that 1653 * was set when setMatchCallback() was called. 1654 * @param status A reference to a UErrorCode to receive any errors. 1655 * @stable ICU 4.0 1656 */ 1657 virtual void getMatchCallback(URegexMatchCallback *&callback, 1658 const void *&context, 1659 UErrorCode &status); 1660 1661 1662 /** 1663 * Set a progress callback function for use with find operations on this Matcher. 1664 * During find operations, the callback will be invoked after each return from a 1665 * match attempt, giving the application the opportunity to terminate a long-running 1666 * find operation. 1667 * 1668 * @param callback A pointer to the user-supplied callback function. 1669 * @param context User context pointer. 
The value supplied at the 1670 * time the callback function is set will be saved 1671 * and passed to the callback each time that it is called. 1672 * @param status A reference to a UErrorCode to receive any errors. 1673 * @stable ICU 4.6 1674 */ 1675 virtual void setFindProgressCallback(URegexFindProgressCallback *callback, 1676 const void *context, 1677 UErrorCode &status); 1678 1679 1680 /** 1681 * Get the find progress callback function for this Matcher. 1682 * 1683 * @param callback Out parameter, receives a pointer to the user-supplied 1684 * callback function. 1685 * @param context Out parameter, receives the user context pointer that 1686 * was set when setFindProgressCallback() was called. 1687 * @param status A reference to a UErrorCode to receive any errors. 1688 * @stable ICU 4.6 1689 */ 1690 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback, 1691 const void *&context, 1692 UErrorCode &status); 1693 1694#ifndef U_HIDE_INTERNAL_API 1695 /** 1696 * setTrace Debug function, enable/disable tracing of the matching engine. 1697 * For internal ICU development use only. DO NOT USE!!!! 1698 * @internal 1699 */ 1700 void setTrace(UBool state); 1701#endif /* U_HIDE_INTERNAL_API */ 1702 1703 /** 1704 * ICU "poor man's RTTI", returns a UClassID for this class. 1705 * 1706 * @stable ICU 2.2 1707 */ 1708 static UClassID U_EXPORT2 getStaticClassID(); 1709 1710 /** 1711 * ICU "poor man's RTTI", returns a UClassID for the actual class. 1712 * 1713 * @stable ICU 2.2 1714 */ 1715 virtual UClassID getDynamicClassID() const; 1716 1717private: 1718 // Constructors and other object boilerplate are private. 1719 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 
1720 RegexMatcher(); // default constructor not implemented 1721 RegexMatcher(const RegexPattern *pat); 1722 RegexMatcher(const RegexMatcher &other); // private: copying is not supported 1723 RegexMatcher &operator =(const RegexMatcher &rhs); // private: assignment is not supported 1724 void init(UErrorCode &status); // Common initialization 1725 void init2(UText *t, UErrorCode &e); // Common initialization, part 2. 1726 1727 friend class RegexPattern; 1728 friend class RegexCImpl; 1729public: 1730#ifndef U_HIDE_INTERNAL_API 1731 /** @internal */ 1732 void resetPreserveRegion(); // Reset matcher state, but preserve any region. 1733#endif /* U_HIDE_INTERNAL_API */ 1734private: 1735 1736 // 1737 // MatchAt This is the internal interface to the match engine itself. 1738 // Match status comes back in matcher member variables. 1739 // 1740 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); 1741 inline void backTrack(int64_t &inputIdx, int32_t &patIdx); 1742 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test 1743 UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test 1744 REStackFrame *resetStack(); 1745 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status); 1746 void IncrementTime(UErrorCode &status); 1747 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status); 1748 1749 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; 1750 1751 UBool findUsingChunk(); 1752 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); 1753 UBool isChunkWordBoundary(int32_t pos); 1754 1755 const RegexPattern *fPattern; 1756 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 1757 // should delete it when through. 1758 1759 const UnicodeString *fInput; // The string being matched. Only used for input() 1760 UText *fInputText; // The text being matched. Is never NULL. 1761 UText *fAltInputText; // A shallow copy of the text being matched. 1762 // Only created if the pattern contains backreferences. 
1763 int64_t fInputLength; // Full length of the input text. 1764 int32_t fFrameSize; // The size of a frame in the backtrack stack. 1765 1766 int64_t fRegionStart; // Start of the input region, default = 0. 1767 int64_t fRegionLimit; // End of input region, default to input.length. 1768 1769 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). 1770 int64_t fAnchorLimit; // See useAnchoringBounds 1771 1772 int64_t fLookStart; // Region bounds for look-ahead/behind and 1773 int64_t fLookLimit; // other boundary tests. See 1774 // useTransparentBounds 1775 1776 int64_t fActiveStart; // Currently active bounds for matching. 1777 int64_t fActiveLimit; // Usually is the same as region, but 1778 // is changed to fLookStart/Limit when 1779 // entering look-around regions. 1780 1781 UBool fTransparentBounds; // True if using transparent bounds. 1782 UBool fAnchoringBounds; // True if using anchoring bounds. 1783 1784 UBool fMatch; // True if the last attempted match was successful. 1785 int64_t fMatchStart; // Position of the start of the most recent match 1786 int64_t fMatchEnd; // First position after the end of the most recent match 1787 // Zero if no previous match, even when a region 1788 // is active. 1789 int64_t fLastMatchEnd; // First position after the end of the previous match, 1790 // or -1 if there was no previous match. 1791 int64_t fAppendPosition; // First position after the end of the previous 1792 // appendReplacement(). As described by the 1793 // JavaDoc for Java Matcher, where it is called 1794 // "append position" 1795 UBool fHitEnd; // True if the last match touched the end of input. 1796 UBool fRequireEnd; // True if the last match required end-of-input 1797 // (matched $ or \Z) 1798 1799 UVector64 *fStack; 1800 REStackFrame *fFrame; // After finding a match, the last active stack frame, 1801 // which will contain the capture group results. 1802 // NOT valid while match engine is running. 
1803 1804 int64_t *fData; // Data area for use by the compiled pattern. 1805 int64_t fSmallData[8]; // Use this for data if it's enough. 1806 1807 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the 1808 // match engine run. Zero for unlimited. 1809 1810 int32_t fTime; // Match time, accumulates while matching. 1811 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. 1812 // Kept separately from fTime to keep as much 1813 // code as possible out of the inline 1814 // StateSave function. 1815 1816 int32_t fStackLimit; // Maximum memory size to use for the backtrack 1817 // stack, in bytes. Zero for unlimited. 1818 1819 URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct. 1820 // NULL if there is no callback. 1821 const void *fCallbackContext; // User Context ptr for callback function. 1822 1823 URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct. 1824 // NULL if there is no callback. 1825 const void *fFindProgressCallbackContext; // User Context ptr for find progress callback function. 1826 1827 1828 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. 1829 1830 UBool fTraceDebug; // Set true for debug tracing of match engine. 1831 1832 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 1833 // reported, or that permanently disables this matcher. 1834 1835 RuleBasedBreakIterator *fWordBreakItr; 1836}; 1837 1838U_NAMESPACE_END 1839#endif // UCONFIG_NO_REGULAR_EXPRESSIONS 1840#endif 1841