1/************************************************* 2* Perl-Compatible Regular Expressions * 3*************************************************/ 4 5/* PCRE is a library of functions to support regular expressions whose syntax 6and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Copyright (c) 1997-2012 University of Cambridge 10 11----------------------------------------------------------------------------- 12Redistribution and use in source and binary forms, with or without 13modification, are permitted provided that the following conditions are met: 14 15 * Redistributions of source code must retain the above copyright notice, 16 this list of conditions and the following disclaimer. 17 18 * Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in the 20 documentation and/or other materials provided with the distribution. 21 22 * Neither the name of the University of Cambridge nor the names of its 23 contributors may be used to endorse or promote products derived from 24 this software without specific prior written permission. 25 26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36POSSIBILITY OF SUCH DAMAGE. 
37----------------------------------------------------------------------------- 38*/ 39 40/* This module contains pcre_exec(), the externally visible function that does 41pattern matching using an NFA algorithm, trying to mimic Perl as closely as 42possible. There are also some static supporting functions. */ 43 44#ifdef HAVE_CONFIG_H 45#include "config.h" 46#endif 47 48#define NLBLOCK md /* Block containing newline information */ 49#define PSSTART start_subject /* Field containing processed string start */ 50#define PSEND end_subject /* Field containing processed string end */ 51 52#include "pcre_internal.h" 53 54/* Undefine some potentially clashing cpp symbols */ 55 56#undef min 57#undef max 58 59/* Values for setting in md->match_function_type to indicate two special types 60of call to match(). We do it this way to save on using another stack variable, 61as stack usage is to be discouraged. */ 62 63#define MATCH_CONDASSERT 1 /* Called to check a condition assertion */ 64#define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */ 65 66/* Non-error returns from the match() function. Error returns are externally 67defined PCRE_ERROR_xxx codes, which are all negative. */ 68 69#define MATCH_MATCH 1 70#define MATCH_NOMATCH 0 71 72/* Special internal returns from the match() function. Make them sufficiently 73negative to avoid the external error codes. */ 74 75#define MATCH_ACCEPT (-999) 76#define MATCH_COMMIT (-998) 77#define MATCH_KETRPOS (-997) 78#define MATCH_ONCE (-996) 79#define MATCH_PRUNE (-995) 80#define MATCH_SKIP (-994) 81#define MATCH_SKIP_ARG (-993) 82#define MATCH_THEN (-992) 83 84/* Maximum number of ints of offset to save on the stack for recursive calls. 85If the offset vector is bigger, malloc is used. This should be a multiple of 3, 86because the offset vector is always a multiple of 3 long. 
*/ 87 88#define REC_STACK_SAVE_MAX 30 89 90/* Min and max values for the common repeats; for the maxima, 0 => infinity */ 91 92static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; 93static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; 94 95 96 97#ifdef PCRE_DEBUG 98/************************************************* 99* Debugging function to print chars * 100*************************************************/ 101 102/* Print a sequence of chars in printable format, stopping at the end of the 103subject if the requested. 104 105Arguments: 106 p points to characters 107 length number to print 108 is_subject TRUE if printing from within md->start_subject 109 md pointer to matching data block, if is_subject is TRUE 110 111Returns: nothing 112*/ 113 114static void 115pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) 116{ 117unsigned int c; 118if (is_subject && length > md->end_subject - p) length = md->end_subject - p; 119while (length-- > 0) 120 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); 121} 122#endif 123 124 125 126/************************************************* 127* Match a back-reference * 128*************************************************/ 129 130/* Normally, if a back reference hasn't been set, the length that is passed is 131negative, so the match always fails. However, in JavaScript compatibility mode, 132the length passed is zero. Note that in caseless UTF-8 mode, the number of 133subject bytes matched may be different to the number of reference bytes. 
134 135Arguments: 136 offset index into the offset vector 137 eptr pointer into the subject 138 length length of reference to be matched (number of bytes) 139 md points to match data block 140 caseless TRUE if caseless 141 142Returns: >= 0 the number of subject bytes matched 143 -1 no match 144 -2 partial match; always given if at end subject 145*/ 146 147static int 148match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, 149 BOOL caseless) 150{ 151PCRE_PUCHAR eptr_start = eptr; 152register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; 153 154#ifdef PCRE_DEBUG 155if (eptr >= md->end_subject) 156 printf("matching subject <null>"); 157else 158 { 159 printf("matching subject "); 160 pchars(eptr, length, TRUE, md); 161 } 162printf(" against backref "); 163pchars(p, length, FALSE, md); 164printf("\n"); 165#endif 166 167/* Always fail if reference not set (and not JavaScript compatible - in that 168case the length is passed as zero). */ 169 170if (length < 0) return -1; 171 172/* Separate the caseless case for speed. In UTF-8 mode we can only do this 173properly if Unicode properties are supported. Otherwise, we can check only 174ASCII characters. */ 175 176if (caseless) 177 { 178#ifdef SUPPORT_UTF 179#ifdef SUPPORT_UCP 180 if (md->utf) 181 { 182 /* Match characters up to the end of the reference. NOTE: the number of 183 bytes matched may differ, because there are some characters whose upper and 184 lower case versions code as different numbers of bytes. For example, U+023A 185 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8); 186 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of 187 the latter. It is important, therefore, to check the length along the 188 reference, not along the subject (earlier code did this wrong). 
*/ 189 190 PCRE_PUCHAR endptr = p + length; 191 while (p < endptr) 192 { 193 int c, d; 194 if (eptr >= md->end_subject) return -2; /* Partial match */ 195 GETCHARINC(c, eptr); 196 GETCHARINC(d, p); 197 if (c != d && c != UCD_OTHERCASE(d)) return -1; 198 } 199 } 200 else 201#endif 202#endif 203 204 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there 205 is no UCP support. */ 206 { 207 while (length-- > 0) 208 { 209 if (eptr >= md->end_subject) return -2; /* Partial match */ 210 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1; 211 p++; 212 eptr++; 213 } 214 } 215 } 216 217/* In the caseful case, we can just compare the bytes, whether or not we 218are in UTF-8 mode. */ 219 220else 221 { 222 while (length-- > 0) 223 { 224 if (eptr >= md->end_subject) return -2; /* Partial match */ 225 if (*p++ != *eptr++) return -1; 226 } 227 } 228 229return (int)(eptr - eptr_start); 230} 231 232 233 234/*************************************************************************** 235**************************************************************************** 236 RECURSION IN THE match() FUNCTION 237 238The match() function is highly recursive, though not every recursive call 239increases the recursive depth. Nevertheless, some regular expressions can cause 240it to recurse to a great depth. I was writing for Unix, so I just let it call 241itself recursively. This uses the stack for saving everything that has to be 242saved for a recursive call. On Unix, the stack can be large, and this works 243fine. 244 245It turns out that on some non-Unix-like systems there are problems with 246programs that use a lot of stack. (This despite the fact that every last chip 247has oodles of memory these days, and techniques for extending the stack have 248been known for decades.) So.... 
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to
achieve this so that the actual code doesn't look very different to what it
always used to.

The original heap-recursive code used longjmp(). However, it seems that this
can be very slow on some operating systems. Following a suggestion from Stan
Switzer, the use of longjmp() has been abolished, at the cost of having to
provide a unique number for each call to RMATCH. There is no way of generating
a sequence of numbers at compile time in C. I have given them names, to make
them stand out more clearly.

Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
tests. Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/

/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync.
*/

/* Each RMn value is passed as the final ("rw") argument of one RMATCH call.
The stack-based RMATCH below ignores it; the heap-based RMATCH stores it in the
calling frame's Xwhere field and pastes it into a label name (L_RMn). */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66 };

/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rw) calls match() with ra, rb, rc, rd and re as its
eptr, ecode, offset_top, md and eptrb arguments respectively. The mstart
argument is taken from the variable "mstart" in the invoking scope, and the
depth argument is that scope's "rdepth" plus one. The value returned by match()
is left in the invoking scope's "rrc" variable.

RRETURN(ra) returns ra from the enclosing function. The PCRE_DEBUG versions
additionally print a trace line, quoting __LINE__, around the call and before
the return; note that the debugging RRETURN mentions its argument twice. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.
*/

/* In this configuration REGISTER expands to nothing. */

#define REGISTER

/* RMATCH(ra,rb,rc,rd,re,rw) simulates a recursive call of match() without
using the C stack:

  . The frame that follows the current one in the Xnextframe chain is re-used
    if there is one. Otherwise a new frame is obtained from the stack_malloc
    function and linked onto the chain; if that allocation fails,
    RRETURN(PCRE_ERROR_NOMEMORY) is executed.

  . rw (one of the RMn values) is recorded in the current frame's Xwhere field
    before control moves to the new frame.

  . ra, rb, rc and re become the new frame's Xeptr, Xecode, Xoffset_top and
    Xeptrb. Xmstart is copied from the invoking scope's mstart, and Xrdepth is
    the current frame's Xrdepth plus one.

  . "frame" is switched to the new frame and control jumps to the HEAP_RECURSE
    label. The label L_<rw> is defined immediately after the jump, inside the
    braces of the expansion.

RRETURN(ra) steps "frame" back to Xprevframe. If there is such a frame, ra is
stored in rrc and control jumps to the HEAP_RETURN label; otherwise ra is
returned from the enclosing function. RRETURN does not free the frame it
leaves. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame to go back to in RRETURN */
  struct heapframe *Xnextframe;   /* Next frame in the chain, or NULL */

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to */

  int Xwhere;                     /* Set from RMATCH's rw argument */

} heapframe;

#endif


/***************************************************************************
***************************************************************************/



/*************************************************
*         Match from current position            *
*************************************************/

/* This function is called recursively in many circumstances. Whenever it
returns a negative (error) response, the outer incarnation must also return the
same response. */

/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching, we then return
immediately. The second one is used when we already know we are past the end of
the subject.

In the code, "past the start of the subject" is the test
eptr > md->start_used_ptr, and "hard" partial matching is md->partial > 1.
Both macros refer to the names md and eptr in the invoking scope and use
RRETURN, and each expands to a bare "if" statement with no "else". */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }


/* Performance note: It might be tempting to extract commonly used fields from
the md structure (e.g. utf, end_subject) into individual variables to improve
performance. Tests using gcc on a SPARC disproved this; in the first case, it
made performance worse.
461 462Arguments: 463 eptr pointer to current character in subject 464 ecode pointer to current position in compiled code 465 mstart pointer to the current match start position (can be modified 466 by encountering \K) 467 offset_top current top pointer 468 md pointer to "static" info for the match 469 eptrb pointer to chain of blocks containing eptr at start of 470 brackets - for testing for empty matches 471 rdepth the recursion depth 472 473Returns: MATCH_MATCH if matched ) these values are >= 0 474 MATCH_NOMATCH if failed to match ) 475 a negative MATCH_xxx value for PRUNE, SKIP, etc 476 a negative PCRE_ERROR_xxx value if aborted by an error condition 477 (e.g. stopped by repeated call or recursion limit) 478*/ 479 480static int 481match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode, 482 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb, 483 unsigned int rdepth) 484{ 485/* These variables do not need to be preserved over recursion in this function, 486so they can be ordinary variables in all cases. Mark some of them with 487"register" because they are used a lot in loops. */ 488 489register int rrc; /* Returns from recursive calls */ 490register int i; /* Used for loops not involving calls to RMATCH() */ 491register unsigned int c; /* Character values not kept over RMATCH() calls */ 492register BOOL utf; /* Local copy of UTF flag for speed */ 493 494BOOL minimize, possessive; /* Quantifier options */ 495BOOL caseless; 496int condcode; 497 498/* When recursion is not being used, all "local" variables that have to be 499preserved over calls to RMATCH() are part of a "frame". We set up the top-level 500frame on the stack here; subsequent instantiations are obtained from the heap 501whenever RMATCH() does a "recursion". See the macro definitions above. Putting 502the top-level on the stack rather than malloc-ing them all gives a performance 503boost in many cases where there is not much "recursion". 
*/ 504 505#ifdef NO_RECURSE 506heapframe *frame = (heapframe *)md->match_frames_base; 507 508/* Copy in the original argument variables */ 509 510frame->Xeptr = eptr; 511frame->Xecode = ecode; 512frame->Xmstart = mstart; 513frame->Xoffset_top = offset_top; 514frame->Xeptrb = eptrb; 515frame->Xrdepth = rdepth; 516 517/* This is where control jumps back to to effect "recursion" */ 518 519HEAP_RECURSE: 520 521/* Macros make the argument variables come from the current frame */ 522 523#define eptr frame->Xeptr 524#define ecode frame->Xecode 525#define mstart frame->Xmstart 526#define offset_top frame->Xoffset_top 527#define eptrb frame->Xeptrb 528#define rdepth frame->Xrdepth 529 530/* Ditto for the local variables */ 531 532#ifdef SUPPORT_UTF 533#define charptr frame->Xcharptr 534#endif 535#define callpat frame->Xcallpat 536#define codelink frame->Xcodelink 537#define data frame->Xdata 538#define next frame->Xnext 539#define pp frame->Xpp 540#define prev frame->Xprev 541#define saved_eptr frame->Xsaved_eptr 542 543#define new_recursive frame->Xnew_recursive 544 545#define cur_is_word frame->Xcur_is_word 546#define condition frame->Xcondition 547#define prev_is_word frame->Xprev_is_word 548 549#ifdef SUPPORT_UCP 550#define prop_type frame->Xprop_type 551#define prop_value frame->Xprop_value 552#define prop_fail_result frame->Xprop_fail_result 553#define oclength frame->Xoclength 554#define occhars frame->Xocchars 555#endif 556 557#define ctype frame->Xctype 558#define fc frame->Xfc 559#define fi frame->Xfi 560#define length frame->Xlength 561#define max frame->Xmax 562#define min frame->Xmin 563#define number frame->Xnumber 564#define offset frame->Xoffset 565#define op frame->Xop 566#define save_capture_last frame->Xsave_capture_last 567#define save_offset1 frame->Xsave_offset1 568#define save_offset2 frame->Xsave_offset2 569#define save_offset3 frame->Xsave_offset3 570#define stacksave frame->Xstacksave 571 572#define newptrb frame->Xnewptrb 573 574/* When recursion 
is being used, local variables are allocated on the stack and 575get preserved during recursion in the normal way. In this environment, fi and 576i, and fc and c, can be the same variables. */ 577 578#else /* NO_RECURSE not defined */ 579#define fi i 580#define fc c 581 582/* Many of the following variables are used only in small blocks of the code. 583My normal style of coding would have declared them within each of those blocks. 584However, in order to accommodate the version of this code that uses an external 585"stack" implemented on the heap, it is easier to declare them all here, so the 586declarations can be cut out in a block. The only declarations within blocks 587below are for variables that do not have to be preserved over a recursive call 588to RMATCH(). */ 589 590#ifdef SUPPORT_UTF 591const pcre_uchar *charptr; 592#endif 593const pcre_uchar *callpat; 594const pcre_uchar *data; 595const pcre_uchar *next; 596PCRE_PUCHAR pp; 597const pcre_uchar *prev; 598PCRE_PUCHAR saved_eptr; 599 600recursion_info new_recursive; 601 602BOOL cur_is_word; 603BOOL condition; 604BOOL prev_is_word; 605 606#ifdef SUPPORT_UCP 607int prop_type; 608int prop_value; 609int prop_fail_result; 610int oclength; 611pcre_uchar occhars[6]; 612#endif 613 614int codelink; 615int ctype; 616int length; 617int max; 618int min; 619int number; 620int offset; 621int op; 622int save_capture_last; 623int save_offset1, save_offset2, save_offset3; 624int stacksave[REC_STACK_SAVE_MAX]; 625 626eptrblock newptrb; 627 628/* There is a special fudge for calling match() in a way that causes it to 629measure the size of its basic stack frame when the stack is being used for 630recursion. The second argument (ecode) being NULL triggers this behaviour. It 631cannot normally ever be NULL. The return is the negated value of the frame 632size. 
*/ 633 634if (ecode == NULL) 635 { 636 if (rdepth == 0) 637 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1); 638 else 639 { 640 int len = (char *)&rdepth - (char *)eptr; 641 return (len > 0)? -len : len; 642 } 643 } 644#endif /* NO_RECURSE */ 645 646/* To save space on the stack and in the heap frame, I have doubled up on some 647of the local variables that are used only in localised parts of the code, but 648still need to be preserved over recursive calls of match(). These macros define 649the alternative names that are used. */ 650 651#define allow_zero cur_is_word 652#define cbegroup condition 653#define code_offset codelink 654#define condassert condition 655#define matched_once prev_is_word 656#define foc number 657#define save_mark data 658 659/* These statements are here to stop the compiler complaining about unitialized 660variables. */ 661 662#ifdef SUPPORT_UCP 663prop_value = 0; 664prop_fail_result = 0; 665#endif 666 667 668/* This label is used for tail recursion, which is used in a few cases even 669when NO_RECURSE is not defined, in order to reduce the amount of stack that is 670used. Thanks to Ian Taylor for noticing this possibility and sending the 671original patch. */ 672 673TAIL_RECURSE: 674 675/* OK, now we can get on with the real code of the function. Recursive calls 676are specified by the macro RMATCH and RRETURN is used to return. When 677NO_RECURSE is *not* defined, these just turn into a recursive call to match() 678and a "return", respectively (possibly with some debugging if PCRE_DEBUG is 679defined). However, RMATCH isn't like a function call because it's quite a 680complicated macro. It has to be used in one particular way. This shouldn't, 681however, impact performance when true recursion is being used. 
*/ 682 683#ifdef SUPPORT_UTF 684utf = md->utf; /* Local copy of the flag */ 685#else 686utf = FALSE; 687#endif 688 689/* First check that we haven't called match() too many times, or that we 690haven't exceeded the recursive call limit. */ 691 692if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 693if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); 694 695/* At the start of a group with an unlimited repeat that may match an empty 696string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is 697done this way to save having to use another function argument, which would take 698up space on the stack. See also MATCH_CONDASSERT below. 699 700When MATCH_CBEGROUP is set, add the current subject pointer to the chain of 701such remembered pointers, to be checked when we hit the closing ket, in order 702to break infinite loops that match no characters. When match() is called in 703other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must 704NOT be used with tail recursion, because the memory block that is used is on 705the stack, so a new one may be required for each match(). */ 706 707if (md->match_function_type == MATCH_CBEGROUP) 708 { 709 newptrb.epb_saved_eptr = eptr; 710 newptrb.epb_prev = eptrb; 711 eptrb = &newptrb; 712 md->match_function_type = 0; 713 } 714 715/* Now start processing the opcodes. */ 716 717for (;;) 718 { 719 minimize = possessive = FALSE; 720 op = *ecode; 721 722 switch(op) 723 { 724 case OP_MARK: 725 md->nomatch_mark = ecode + 2; 726 md->mark = NULL; /* In case previously set by assertion */ 727 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 728 eptrb, RM55); 729 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 730 md->mark == NULL) md->mark = ecode + 2; 731 732 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an 733 argument, and we must check whether that argument matches this MARK's 734 argument. 
It is passed back in md->start_match_ptr (an overloading of that 735 variable). If it does match, we reset that variable to the current subject 736 position and return MATCH_SKIP. Otherwise, pass back the return code 737 unaltered. */ 738 739 else if (rrc == MATCH_SKIP_ARG && 740 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0) 741 { 742 md->start_match_ptr = eptr; 743 RRETURN(MATCH_SKIP); 744 } 745 RRETURN(rrc); 746 747 case OP_FAIL: 748 RRETURN(MATCH_NOMATCH); 749 750 /* COMMIT overrides PRUNE, SKIP, and THEN */ 751 752 case OP_COMMIT: 753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 754 eptrb, RM52); 755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && 756 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG && 757 rrc != MATCH_THEN) 758 RRETURN(rrc); 759 RRETURN(MATCH_COMMIT); 760 761 /* PRUNE overrides THEN */ 762 763 case OP_PRUNE: 764 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 765 eptrb, RM51); 766 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 767 RRETURN(MATCH_PRUNE); 768 769 case OP_PRUNE_ARG: 770 md->nomatch_mark = ecode + 2; 771 md->mark = NULL; /* In case previously set by assertion */ 772 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 773 eptrb, RM56); 774 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 775 md->mark == NULL) md->mark = ecode + 2; 776 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 777 RRETURN(MATCH_PRUNE); 778 779 /* SKIP overrides PRUNE and THEN */ 780 781 case OP_SKIP: 782 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 783 eptrb, RM53); 784 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) 785 RRETURN(rrc); 786 md->start_match_ptr = eptr; /* Pass back current position */ 787 RRETURN(MATCH_SKIP); 788 789 /* Note that, for Perl compatibility, SKIP with an argument does NOT set 790 nomatch_mark. 
There is a flag that disables this opcode when re-matching a 791 pattern that ended with a SKIP for which there was not a matching MARK. */ 792 793 case OP_SKIP_ARG: 794 if (md->ignore_skip_arg) 795 { 796 ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; 797 break; 798 } 799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, 800 eptrb, RM57); 801 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) 802 RRETURN(rrc); 803 804 /* Pass back the current skip name by overloading md->start_match_ptr and 805 returning the special MATCH_SKIP_ARG return code. This will either be 806 caught by a matching MARK, or get to the top, where it causes a rematch 807 with the md->ignore_skip_arg flag set. */ 808 809 md->start_match_ptr = ecode + 2; 810 RRETURN(MATCH_SKIP_ARG); 811 812 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that 813 the branch in which it occurs can be determined. Overload the start of 814 match pointer to do this. */ 815 816 case OP_THEN: 817 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 818 eptrb, RM54); 819 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 820 md->start_match_ptr = ecode; 821 RRETURN(MATCH_THEN); 822 823 case OP_THEN_ARG: 824 md->nomatch_mark = ecode + 2; 825 md->mark = NULL; /* In case previously set by assertion */ 826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, 827 md, eptrb, RM58); 828 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && 829 md->mark == NULL) md->mark = ecode + 2; 830 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 831 md->start_match_ptr = ecode; 832 RRETURN(MATCH_THEN); 833 834 /* Handle an atomic group that does not contain any capturing parentheses. 835 This can be handled like an assertion. Prior to 8.13, all atomic groups 836 were handled this way. In 8.13, the code was changed as below for ONCE, so 837 that backups pass through the group and thereby reset captured values. 
838 However, this uses a lot more stack, so in 8.20, atomic groups that do not 839 contain any captures generate OP_ONCE_NC, which can be handled in the old, 840 less stack intensive way. 841 842 Check the alternative branches in turn - the matching won't pass the KET 843 for this kind of subpattern. If any one branch matches, we carry on as at 844 the end of a normal bracket, leaving the subject pointer, but resetting 845 the start-of-match value in case it was changed by \K. */ 846 847 case OP_ONCE_NC: 848 prev = ecode; 849 saved_eptr = eptr; 850 save_mark = md->mark; 851 do 852 { 853 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64); 854 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */ 855 { 856 mstart = md->start_match_ptr; 857 break; 858 } 859 if (rrc == MATCH_THEN) 860 { 861 next = ecode + GET(ecode,1); 862 if (md->start_match_ptr < next && 863 (*ecode == OP_ALT || *next == OP_ALT)) 864 rrc = MATCH_NOMATCH; 865 } 866 867 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 868 ecode += GET(ecode,1); 869 md->mark = save_mark; 870 } 871 while (*ecode == OP_ALT); 872 873 /* If hit the end of the group (which could be repeated), fail */ 874 875 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 876 877 /* Continue as from after the group, updating the offsets high water 878 mark, since extracts may have been taken. */ 879 880 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 881 882 offset_top = md->end_offset_top; 883 eptr = md->end_match_ptr; 884 885 /* For a non-repeating ket, just continue at this level. This also 886 happens for a repeating ket if no characters were matched in the group. 887 This is the forcible breaking of infinite loops as implemented in Perl 888 5.005. */ 889 890 if (*ecode == OP_KET || eptr == saved_eptr) 891 { 892 ecode += 1+LINK_SIZE; 893 break; 894 } 895 896 /* The repeating kets try the rest of the pattern or restart from the 897 preceding bracket, in the appropriate order. 
The second "call" of match() 898 uses tail recursion, to avoid using another stack frame. */ 899 900 if (*ecode == OP_KETRMIN) 901 { 902 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65); 903 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 904 ecode = prev; 905 goto TAIL_RECURSE; 906 } 907 else /* OP_KETRMAX */ 908 { 909 RMATCH(eptr, prev, offset_top, md, eptrb, RM66); 910 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 911 ecode += 1 + LINK_SIZE; 912 goto TAIL_RECURSE; 913 } 914 /* Control never gets here */ 915 916 /* Handle a capturing bracket, other than those that are possessive with an 917 unlimited repeat. If there is space in the offset vector, save the current 918 subject position in the working slot at the top of the vector. We mustn't 919 change the current values of the data slot, because they may be set from a 920 previous iteration of this group, and be referred to by a reference inside 921 the group. A failure to match might occur after the group has succeeded, 922 if something later on doesn't match. For this reason, we need to restore 923 the working value and also the values of the final offsets, in case they 924 were set by a previous iteration of the same bracket. 925 926 If there isn't enough space in the offset vector, treat this as if it were 927 a non-capturing bracket. Don't worry about setting the flag for the error 928 case here; that is handled in the code for KET. 
*/ 929 930 case OP_CBRA: 931 case OP_SCBRA: 932 number = GET2(ecode, 1+LINK_SIZE); 933 offset = number << 1; 934 935#ifdef PCRE_DEBUG 936 printf("start bracket %d\n", number); 937 printf("subject="); 938 pchars(eptr, 16, TRUE, md); 939 printf("\n"); 940#endif 941 942 if (offset < md->offset_max) 943 { 944 save_offset1 = md->offset_vector[offset]; 945 save_offset2 = md->offset_vector[offset+1]; 946 save_offset3 = md->offset_vector[md->offset_end - number]; 947 save_capture_last = md->capture_last; 948 save_mark = md->mark; 949 950 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 951 md->offset_vector[md->offset_end - number] = 952 (int)(eptr - md->start_subject); 953 954 for (;;) 955 { 956 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 957 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 958 eptrb, RM1); 959 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ 960 961 /* If we backed up to a THEN, check whether it is within the current 962 branch by comparing the address of the THEN that is passed back with 963 the end of the branch. If it is within the current branch, and the 964 branch is one of two or more alternatives (it either starts or ends 965 with OP_ALT), we have reached the limit of THEN's action, so convert 966 the return code to NOMATCH, which will cause normal backtracking to 967 happen from now on. Otherwise, THEN is passed back to an outer 968 alternative. This implements Perl's treatment of parenthesized groups, 969 where a group not containing | does not affect the current alternative, 970 that is, (X) is NOT the same as (X|(*F)). */ 971 972 if (rrc == MATCH_THEN) 973 { 974 next = ecode + GET(ecode,1); 975 if (md->start_match_ptr < next && 976 (*ecode == OP_ALT || *next == OP_ALT)) 977 rrc = MATCH_NOMATCH; 978 } 979 980 /* Anything other than NOMATCH is passed back. 
*/ 981 982 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 983 md->capture_last = save_capture_last; 984 ecode += GET(ecode, 1); 985 md->mark = save_mark; 986 if (*ecode != OP_ALT) break; 987 } 988 989 DPRINTF(("bracket %d failed\n", number)); 990 md->offset_vector[offset] = save_offset1; 991 md->offset_vector[offset+1] = save_offset2; 992 md->offset_vector[md->offset_end - number] = save_offset3; 993 994 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */ 995 996 RRETURN(rrc); 997 } 998 999 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1000 as a non-capturing bracket. */ 1001 1002 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1003 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1004 1005 DPRINTF(("insufficient capture room: treat as non-capturing\n")); 1006 1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1008 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1009 1010 /* Non-capturing or atomic group, except for possessive with unlimited 1011 repeat and ONCE group with no captures. Loop for all the alternatives. 1012 1013 When we get to the final alternative within the brackets, we used to return 1014 the result of a recursive call to match() whatever happened so it was 1015 possible to reduce stack usage by turning this into a tail recursion, 1016 except in the case of a possibly empty group. However, now that there is 1017 the possiblity of (*THEN) occurring in the final alternative, this 1018 optimization is no longer always possible. 1019 1020 We can optimize if we know there are no (*THEN)s in the pattern; at present 1021 this is the best that can be done. 1022 1023 MATCH_ONCE is returned when the end of an atomic group is successfully 1024 reached, but subsequent matching fails. It passes back up the tree (causing 1025 captured values to be reset) until the original atomic group level is 1026 reached. This is tested by comparing md->once_target with the start of the 1027 group. 
At this point, the return is converted into MATCH_NOMATCH so that 1028 previous backup points can be taken. */ 1029 1030 case OP_ONCE: 1031 case OP_BRA: 1032 case OP_SBRA: 1033 DPRINTF(("start non-capturing bracket\n")); 1034 1035 for (;;) 1036 { 1037 if (op >= OP_SBRA || op == OP_ONCE) 1038 md->match_function_type = MATCH_CBEGROUP; 1039 1040 /* If this is not a possibly empty group, and there are no (*THEN)s in 1041 the pattern, and this is the final alternative, optimize as described 1042 above. */ 1043 1044 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT) 1045 { 1046 ecode += PRIV(OP_lengths)[*ecode]; 1047 goto TAIL_RECURSE; 1048 } 1049 1050 /* In all other cases, we have to make another call to match(). */ 1051 1052 save_mark = md->mark; 1053 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, 1054 RM2); 1055 1056 /* See comment in the code for capturing groups above about handling 1057 THEN. */ 1058 1059 if (rrc == MATCH_THEN) 1060 { 1061 next = ecode + GET(ecode,1); 1062 if (md->start_match_ptr < next && 1063 (*ecode == OP_ALT || *next == OP_ALT)) 1064 rrc = MATCH_NOMATCH; 1065 } 1066 1067 if (rrc != MATCH_NOMATCH) 1068 { 1069 if (rrc == MATCH_ONCE) 1070 { 1071 const pcre_uchar *scode = ecode; 1072 if (*scode != OP_ONCE) /* If not at start, find it */ 1073 { 1074 while (*scode == OP_ALT) scode += GET(scode, 1); 1075 scode -= GET(scode, 1); 1076 } 1077 if (md->once_target == scode) rrc = MATCH_NOMATCH; 1078 } 1079 RRETURN(rrc); 1080 } 1081 ecode += GET(ecode, 1); 1082 md->mark = save_mark; 1083 if (*ecode != OP_ALT) break; 1084 } 1085 1086 RRETURN(MATCH_NOMATCH); 1087 1088 /* Handle possessive capturing brackets with an unlimited repeat. We come 1089 here from BRAZERO with allow_zero set TRUE. The offset_vector values are 1090 handled similarly to the normal case above. However, the matching is 1091 different. The end of these brackets will always be OP_KETRPOS, which 1092 returns MATCH_KETRPOS without going further in the pattern. 
By this means 1093 we can handle the group by iteration rather than recursion, thereby 1094 reducing the amount of stack needed. */ 1095 1096 case OP_CBRAPOS: 1097 case OP_SCBRAPOS: 1098 allow_zero = FALSE; 1099 1100 POSSESSIVE_CAPTURE: 1101 number = GET2(ecode, 1+LINK_SIZE); 1102 offset = number << 1; 1103 1104#ifdef PCRE_DEBUG 1105 printf("start possessive bracket %d\n", number); 1106 printf("subject="); 1107 pchars(eptr, 16, TRUE, md); 1108 printf("\n"); 1109#endif 1110 1111 if (offset < md->offset_max) 1112 { 1113 matched_once = FALSE; 1114 code_offset = (int)(ecode - md->start_code); 1115 1116 save_offset1 = md->offset_vector[offset]; 1117 save_offset2 = md->offset_vector[offset+1]; 1118 save_offset3 = md->offset_vector[md->offset_end - number]; 1119 save_capture_last = md->capture_last; 1120 1121 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 1122 1123 /* Each time round the loop, save the current subject position for use 1124 when the group matches. For MATCH_MATCH, the group has matched, so we 1125 restart it with a new subject starting position, remembering that we had 1126 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as 1127 usual. If we haven't matched any alternatives in any iteration, check to 1128 see if a previous iteration matched. If so, the group has matched; 1129 continue from afterwards. Otherwise it has failed; restore the previous 1130 capture values before returning NOMATCH. 
*/ 1131 1132 for (;;) 1133 { 1134 md->offset_vector[md->offset_end - number] = 1135 (int)(eptr - md->start_subject); 1136 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1137 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1138 eptrb, RM63); 1139 if (rrc == MATCH_KETRPOS) 1140 { 1141 offset_top = md->end_offset_top; 1142 eptr = md->end_match_ptr; 1143 ecode = md->start_code + code_offset; 1144 save_capture_last = md->capture_last; 1145 matched_once = TRUE; 1146 continue; 1147 } 1148 1149 /* See comment in the code for capturing groups above about handling 1150 THEN. */ 1151 1152 if (rrc == MATCH_THEN) 1153 { 1154 next = ecode + GET(ecode,1); 1155 if (md->start_match_ptr < next && 1156 (*ecode == OP_ALT || *next == OP_ALT)) 1157 rrc = MATCH_NOMATCH; 1158 } 1159 1160 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1161 md->capture_last = save_capture_last; 1162 ecode += GET(ecode, 1); 1163 if (*ecode != OP_ALT) break; 1164 } 1165 1166 if (!matched_once) 1167 { 1168 md->offset_vector[offset] = save_offset1; 1169 md->offset_vector[offset+1] = save_offset2; 1170 md->offset_vector[md->offset_end - number] = save_offset3; 1171 } 1172 1173 if (allow_zero || matched_once) 1174 { 1175 ecode += 1 + LINK_SIZE; 1176 break; 1177 } 1178 1179 RRETURN(MATCH_NOMATCH); 1180 } 1181 1182 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat 1183 as a non-capturing bracket. */ 1184 1185 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1187 1188 DPRINTF(("insufficient capture room: treat as non-capturing\n")); 1189 1190 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1191 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 1192 1193 /* Non-capturing possessive bracket with unlimited repeat. We come here 1194 from BRAZERO with allow_zero = TRUE. The code is similar to the above, 1195 without the capturing complication. It is written out separately for speed 1196 and cleanliness. 
*/ 1197 1198 case OP_BRAPOS: 1199 case OP_SBRAPOS: 1200 allow_zero = FALSE; 1201 1202 POSSESSIVE_NON_CAPTURE: 1203 matched_once = FALSE; 1204 code_offset = (int)(ecode - md->start_code); 1205 1206 for (;;) 1207 { 1208 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; 1209 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, 1210 eptrb, RM48); 1211 if (rrc == MATCH_KETRPOS) 1212 { 1213 offset_top = md->end_offset_top; 1214 eptr = md->end_match_ptr; 1215 ecode = md->start_code + code_offset; 1216 matched_once = TRUE; 1217 continue; 1218 } 1219 1220 /* See comment in the code for capturing groups above about handling 1221 THEN. */ 1222 1223 if (rrc == MATCH_THEN) 1224 { 1225 next = ecode + GET(ecode,1); 1226 if (md->start_match_ptr < next && 1227 (*ecode == OP_ALT || *next == OP_ALT)) 1228 rrc = MATCH_NOMATCH; 1229 } 1230 1231 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1232 ecode += GET(ecode, 1); 1233 if (*ecode != OP_ALT) break; 1234 } 1235 1236 if (matched_once || allow_zero) 1237 { 1238 ecode += 1 + LINK_SIZE; 1239 break; 1240 } 1241 RRETURN(MATCH_NOMATCH); 1242 1243 /* Control never reaches here. */ 1244 1245 /* Conditional group: compilation checked that there are no more than 1246 two branches. If the condition is false, skipping the first branch takes us 1247 past the end if there is only one branch, but that's OK because that is 1248 exactly what going to the ket would do. */ 1249 1250 case OP_COND: 1251 case OP_SCOND: 1252 codelink = GET(ecode, 1); 1253 1254 /* Because of the way auto-callout works during compile, a callout item is 1255 inserted between OP_COND and an assertion condition. 
*/ 1256 1257 if (ecode[LINK_SIZE+1] == OP_CALLOUT) 1258 { 1259 if (PUBL(callout) != NULL) 1260 { 1261 PUBL(callout_block) cb; 1262 cb.version = 2; /* Version 1 of the callout block */ 1263 cb.callout_number = ecode[LINK_SIZE+2]; 1264 cb.offset_vector = md->offset_vector; 1265#ifdef COMPILE_PCRE8 1266 cb.subject = (PCRE_SPTR)md->start_subject; 1267#else 1268 cb.subject = (PCRE_SPTR16)md->start_subject; 1269#endif 1270 cb.subject_length = (int)(md->end_subject - md->start_subject); 1271 cb.start_match = (int)(mstart - md->start_subject); 1272 cb.current_position = (int)(eptr - md->start_subject); 1273 cb.pattern_position = GET(ecode, LINK_SIZE + 3); 1274 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); 1275 cb.capture_top = offset_top/2; 1276 cb.capture_last = md->capture_last; 1277 cb.callout_data = md->callout_data; 1278 cb.mark = md->nomatch_mark; 1279 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1280 if (rrc < 0) RRETURN(rrc); 1281 } 1282 ecode += PRIV(OP_lengths)[OP_CALLOUT]; 1283 } 1284 1285 condcode = ecode[LINK_SIZE+1]; 1286 1287 /* Now see what the actual condition is */ 1288 1289 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */ 1290 { 1291 if (md->recursive == NULL) /* Not recursing => FALSE */ 1292 { 1293 condition = FALSE; 1294 ecode += GET(ecode, 1); 1295 } 1296 else 1297 { 1298 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ 1299 condition = (recno == RREF_ANY || recno == md->recursive->group_num); 1300 1301 /* If the test is for recursion into a specific subpattern, and it is 1302 false, but the test was set up by name, scan the table to see if the 1303 name refers to any other numbers, and test them. The condition is true 1304 if any one is set. 
*/ 1305 1306 if (!condition && condcode == OP_NRREF) 1307 { 1308 pcre_uchar *slotA = md->name_table; 1309 for (i = 0; i < md->name_count; i++) 1310 { 1311 if (GET2(slotA, 0) == recno) break; 1312 slotA += md->name_entry_size; 1313 } 1314 1315 /* Found a name for the number - there can be only one; duplicate 1316 names for different numbers are allowed, but not vice versa. First 1317 scan down for duplicates. */ 1318 1319 if (i < md->name_count) 1320 { 1321 pcre_uchar *slotB = slotA; 1322 while (slotB > md->name_table) 1323 { 1324 slotB -= md->name_entry_size; 1325 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) 1326 { 1327 condition = GET2(slotB, 0) == md->recursive->group_num; 1328 if (condition) break; 1329 } 1330 else break; 1331 } 1332 1333 /* Scan up for duplicates */ 1334 1335 if (!condition) 1336 { 1337 slotB = slotA; 1338 for (i++; i < md->name_count; i++) 1339 { 1340 slotB += md->name_entry_size; 1341 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) 1342 { 1343 condition = GET2(slotB, 0) == md->recursive->group_num; 1344 if (condition) break; 1345 } 1346 else break; 1347 } 1348 } 1349 } 1350 } 1351 1352 /* Chose branch according to the condition */ 1353 1354 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); 1355 } 1356 } 1357 1358 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */ 1359 { 1360 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ 1361 condition = offset < offset_top && md->offset_vector[offset] >= 0; 1362 1363 /* If the numbered capture is unset, but the reference was by name, 1364 scan the table to see if the name refers to any other numbers, and test 1365 them. The condition is true if any one is set. This is tediously similar 1366 to the code above, but not close enough to try to amalgamate. 
*/ 1367 1368 if (!condition && condcode == OP_NCREF) 1369 { 1370 int refno = offset >> 1; 1371 pcre_uchar *slotA = md->name_table; 1372 1373 for (i = 0; i < md->name_count; i++) 1374 { 1375 if (GET2(slotA, 0) == refno) break; 1376 slotA += md->name_entry_size; 1377 } 1378 1379 /* Found a name for the number - there can be only one; duplicate names 1380 for different numbers are allowed, but not vice versa. First scan down 1381 for duplicates. */ 1382 1383 if (i < md->name_count) 1384 { 1385 pcre_uchar *slotB = slotA; 1386 while (slotB > md->name_table) 1387 { 1388 slotB -= md->name_entry_size; 1389 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) 1390 { 1391 offset = GET2(slotB, 0) << 1; 1392 condition = offset < offset_top && 1393 md->offset_vector[offset] >= 0; 1394 if (condition) break; 1395 } 1396 else break; 1397 } 1398 1399 /* Scan up for duplicates */ 1400 1401 if (!condition) 1402 { 1403 slotB = slotA; 1404 for (i++; i < md->name_count; i++) 1405 { 1406 slotB += md->name_entry_size; 1407 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) 1408 { 1409 offset = GET2(slotB, 0) << 1; 1410 condition = offset < offset_top && 1411 md->offset_vector[offset] >= 0; 1412 if (condition) break; 1413 } 1414 else break; 1415 } 1416 } 1417 } 1418 } 1419 1420 /* Chose branch according to the condition */ 1421 1422 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1); 1423 } 1424 1425 else if (condcode == OP_DEF) /* DEFINE - always false */ 1426 { 1427 condition = FALSE; 1428 ecode += GET(ecode, 1); 1429 } 1430 1431 /* The condition is an assertion. Call match() to evaluate it - setting 1432 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of 1433 an assertion. 
*/ 1434 1435 else 1436 { 1437 md->match_function_type = MATCH_CONDASSERT; 1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3); 1439 if (rrc == MATCH_MATCH) 1440 { 1441 if (md->end_offset_top > offset_top) 1442 offset_top = md->end_offset_top; /* Captures may have happened */ 1443 condition = TRUE; 1444 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); 1445 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 1446 } 1447 1448 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an 1449 assertion; it is therefore treated as NOMATCH. */ 1450 1451 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) 1452 { 1453 RRETURN(rrc); /* Need braces because of following else */ 1454 } 1455 else 1456 { 1457 condition = FALSE; 1458 ecode += codelink; 1459 } 1460 } 1461 1462 /* We are now at the branch that is to be obeyed. As there is only one, can 1463 use tail recursion to avoid using another stack frame, except when there is 1464 unlimited repeat of a possibly empty group. In the latter case, a recursive 1465 call to match() is always required, unless the second alternative doesn't 1466 exist, in which case we can just plough on. Note that, for compatibility 1467 with Perl, the | in a conditional group is NOT treated as creating two 1468 alternatives. If a THEN is encountered in the branch, it propagates out to 1469 the enclosing alternative (unless nested in a deeper set of alternatives, 1470 of course). */ 1471 1472 if (condition || *ecode == OP_ALT) 1473 { 1474 if (op != OP_SCOND) 1475 { 1476 ecode += 1 + LINK_SIZE; 1477 goto TAIL_RECURSE; 1478 } 1479 1480 md->match_function_type = MATCH_CBEGROUP; 1481 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49); 1482 RRETURN(rrc); 1483 } 1484 1485 /* Condition false & no alternative; continue after the group. 
*/ 1486 1487 else 1488 { 1489 ecode += 1 + LINK_SIZE; 1490 } 1491 break; 1492 1493 1494 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, 1495 to close any currently open capturing brackets. */ 1496 1497 case OP_CLOSE: 1498 number = GET2(ecode, 1); 1499 offset = number << 1; 1500 1501#ifdef PCRE_DEBUG 1502 printf("end bracket %d at *ACCEPT", number); 1503 printf("\n"); 1504#endif 1505 1506 md->capture_last = number; 1507 if (offset >= md->offset_max) md->offset_overflow = TRUE; else 1508 { 1509 md->offset_vector[offset] = 1510 md->offset_vector[md->offset_end - number]; 1511 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1512 if (offset_top <= offset) offset_top = offset + 2; 1513 } 1514 ecode += 1 + IMM2_SIZE; 1515 break; 1516 1517 1518 /* End of the pattern, either real or forced. */ 1519 1520 case OP_END: 1521 case OP_ACCEPT: 1522 case OP_ASSERT_ACCEPT: 1523 1524 /* If we have matched an empty string, fail if not in an assertion and not 1525 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART 1526 is set and we have matched at the start of the subject. In both cases, 1527 backtracking will then try other alternatives, if any. */ 1528 1529 if (eptr == mstart && op != OP_ASSERT_ACCEPT && 1530 md->recursive == NULL && 1531 (md->notempty || 1532 (md->notempty_atstart && 1533 mstart == md->start_subject + md->start_offset))) 1534 RRETURN(MATCH_NOMATCH); 1535 1536 /* Otherwise, we have a match. */ 1537 1538 md->end_match_ptr = eptr; /* Record where we ended */ 1539 md->end_offset_top = offset_top; /* and how many extracts were taken */ 1540 md->start_match_ptr = mstart; /* and the start (\K can modify) */ 1541 1542 /* For some reason, the macros don't work properly if an expression is 1543 given as the argument to RRETURN when the heap is in use. */ 1544 1545 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT; 1546 RRETURN(rrc); 1547 1548 /* Assertion brackets. 
Check the alternative branches in turn - the 1549 matching won't pass the KET for an assertion. If any one branch matches, 1550 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 1551 start of each branch to move the current point backwards, so the code at 1552 this level is identical to the lookahead case. When the assertion is part 1553 of a condition, we want to return immediately afterwards. The caller of 1554 this incarnation of the match() function will have set MATCH_CONDASSERT in 1555 md->match_function type, and one of these opcodes will be the first opcode 1556 that is processed. We use a local variable that is preserved over calls to 1557 match() to remember this case. */ 1558 1559 case OP_ASSERT: 1560 case OP_ASSERTBACK: 1561 save_mark = md->mark; 1562 if (md->match_function_type == MATCH_CONDASSERT) 1563 { 1564 condassert = TRUE; 1565 md->match_function_type = 0; 1566 } 1567 else condassert = FALSE; 1568 1569 do 1570 { 1571 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4); 1572 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1573 { 1574 mstart = md->start_match_ptr; /* In case \K reset it */ 1575 break; 1576 } 1577 md->mark = save_mark; 1578 1579 /* A COMMIT failure must fail the entire assertion, without trying any 1580 subsequent branches. */ 1581 1582 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH); 1583 1584 /* PCRE does not allow THEN to escape beyond an assertion; it 1585 is treated as NOMATCH. */ 1586 1587 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 1588 ecode += GET(ecode, 1); 1589 } 1590 while (*ecode == OP_ALT); 1591 1592 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 1593 1594 /* If checking an assertion for a condition, return MATCH_MATCH. */ 1595 1596 if (condassert) RRETURN(MATCH_MATCH); 1597 1598 /* Continue from after the assertion, updating the offsets high water 1599 mark, since extracts may have been taken during the assertion. 
*/ 1600 1601 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1602 ecode += 1 + LINK_SIZE; 1603 offset_top = md->end_offset_top; 1604 continue; 1605 1606 /* Negative assertion: all branches must fail to match. Encountering SKIP, 1607 PRUNE, or COMMIT means we must assume failure without checking subsequent 1608 branches. */ 1609 1610 case OP_ASSERT_NOT: 1611 case OP_ASSERTBACK_NOT: 1612 save_mark = md->mark; 1613 if (md->match_function_type == MATCH_CONDASSERT) 1614 { 1615 condassert = TRUE; 1616 md->match_function_type = 0; 1617 } 1618 else condassert = FALSE; 1619 1620 do 1621 { 1622 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); 1623 md->mark = save_mark; 1624 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH); 1625 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) 1626 { 1627 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1628 break; 1629 } 1630 1631 /* PCRE does not allow THEN to escape beyond an assertion; it is treated 1632 as NOMATCH. */ 1633 1634 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 1635 ecode += GET(ecode,1); 1636 } 1637 while (*ecode == OP_ALT); 1638 1639 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */ 1640 1641 ecode += 1 + LINK_SIZE; 1642 continue; 1643 1644 /* Move the subject pointer back. This occurs only at the start of 1645 each branch of a lookbehind assertion. If we are too close to the start to 1646 move back, this match function fails. When working with UTF-8 we move 1647 back a number of characters, not bytes. 
*/ 1648 1649 case OP_REVERSE: 1650#ifdef SUPPORT_UTF 1651 if (utf) 1652 { 1653 i = GET(ecode, 1); 1654 while (i-- > 0) 1655 { 1656 eptr--; 1657 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1658 BACKCHAR(eptr); 1659 } 1660 } 1661 else 1662#endif 1663 1664 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 1665 1666 { 1667 eptr -= GET(ecode, 1); 1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1669 } 1670 1671 /* Save the earliest consulted character, then skip to next op code */ 1672 1673 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; 1674 ecode += 1 + LINK_SIZE; 1675 break; 1676 1677 /* The callout item calls an external function, if one is provided, passing 1678 details of the match so far. This is mainly for debugging, though the 1679 function is able to force a failure. */ 1680 1681 case OP_CALLOUT: 1682 if (PUBL(callout) != NULL) 1683 { 1684 PUBL(callout_block) cb; 1685 cb.version = 2; /* Version 1 of the callout block */ 1686 cb.callout_number = ecode[1]; 1687 cb.offset_vector = md->offset_vector; 1688#ifdef COMPILE_PCRE8 1689 cb.subject = (PCRE_SPTR)md->start_subject; 1690#else 1691 cb.subject = (PCRE_SPTR16)md->start_subject; 1692#endif 1693 cb.subject_length = (int)(md->end_subject - md->start_subject); 1694 cb.start_match = (int)(mstart - md->start_subject); 1695 cb.current_position = (int)(eptr - md->start_subject); 1696 cb.pattern_position = GET(ecode, 2); 1697 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1698 cb.capture_top = offset_top/2; 1699 cb.capture_last = md->capture_last; 1700 cb.callout_data = md->callout_data; 1701 cb.mark = md->nomatch_mark; 1702 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1703 if (rrc < 0) RRETURN(rrc); 1704 } 1705 ecode += 2 + 2*LINK_SIZE; 1706 break; 1707 1708 /* Recursion either matches the current regex, or some subexpression. The 1709 offset data is the offset to the starting bracket from the start of the 1710 whole pattern. 
(This is so that it works from duplicated subpatterns.) 1711 1712 The state of the capturing groups is preserved over recursion, and 1713 re-instated afterwards. We don't know how many are started and not yet 1714 finished (offset_top records the completed total) so we just have to save 1715 all the potential data. There may be up to 65535 such values, which is too 1716 large to put on the stack, but using malloc for small numbers seems 1717 expensive. As a compromise, the stack is used when there are no more than 1718 REC_STACK_SAVE_MAX values to store; otherwise malloc is used. 1719 1720 There are also other values that have to be saved. We use a chained 1721 sequence of blocks that actually live on the stack. Thanks to Robin Houston 1722 for the original version of this logic. It has, however, been hacked around 1723 a lot, so he is not to blame for the current way it works. */ 1724 1725 case OP_RECURSE: 1726 { 1727 recursion_info *ri; 1728 int recno; 1729 1730 callpat = md->start_code + GET(ecode, 1); 1731 recno = (callpat == md->start_code)? 0 : 1732 GET2(callpat, 1 + LINK_SIZE); 1733 1734 /* Check for repeating a recursion without advancing the subject pointer. 1735 This should catch convoluted mutual recursions. (Some simple cases are 1736 caught at compile time.) 
*/ 1737 1738 for (ri = md->recursive; ri != NULL; ri = ri->prevrec) 1739 if (recno == ri->group_num && eptr == ri->subject_position) 1740 RRETURN(PCRE_ERROR_RECURSELOOP); 1741 1742 /* Add to "recursing stack" */ 1743 1744 new_recursive.group_num = recno; 1745 new_recursive.subject_position = eptr; 1746 new_recursive.prevrec = md->recursive; 1747 md->recursive = &new_recursive; 1748 1749 /* Where to continue from afterwards */ 1750 1751 ecode += 1 + LINK_SIZE; 1752 1753 /* Now save the offset data */ 1754 1755 new_recursive.saved_max = md->offset_end; 1756 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 1757 new_recursive.offset_save = stacksave; 1758 else 1759 { 1760 new_recursive.offset_save = 1761 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int)); 1762 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 1763 } 1764 memcpy(new_recursive.offset_save, md->offset_vector, 1765 new_recursive.saved_max * sizeof(int)); 1766 1767 /* OK, now we can do the recursion. After processing each alternative, 1768 restore the offset data. If there were nested recursions, md->recursive 1769 might be changed, so reset it before looping. */ 1770 1771 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 1772 cbegroup = (*callpat >= OP_SBRA); 1773 do 1774 { 1775 if (cbegroup) md->match_function_type = MATCH_CBEGROUP; 1776 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, 1777 md, eptrb, RM6); 1778 memcpy(md->offset_vector, new_recursive.offset_save, 1779 new_recursive.saved_max * sizeof(int)); 1780 md->recursive = new_recursive.prevrec; 1781 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) 1782 { 1783 DPRINTF(("Recursion matched\n")); 1784 if (new_recursive.offset_save != stacksave) 1785 (PUBL(free))(new_recursive.offset_save); 1786 1787 /* Set where we got to in the subject, and reset the start in case 1788 it was changed by \K. This *is* propagated back out of a recursion, 1789 for Perl compatibility. 
*/ 1790 1791 eptr = md->end_match_ptr; 1792 mstart = md->start_match_ptr; 1793 goto RECURSION_MATCHED; /* Exit loop; end processing */ 1794 } 1795 1796 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it 1797 is treated as NOMATCH. */ 1798 1799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN && 1800 rrc != MATCH_COMMIT) 1801 { 1802 DPRINTF(("Recursion gave error %d\n", rrc)); 1803 if (new_recursive.offset_save != stacksave) 1804 (PUBL(free))(new_recursive.offset_save); 1805 RRETURN(rrc); 1806 } 1807 1808 md->recursive = &new_recursive; 1809 callpat += GET(callpat, 1); 1810 } 1811 while (*callpat == OP_ALT); 1812 1813 DPRINTF(("Recursion didn't match\n")); 1814 md->recursive = new_recursive.prevrec; 1815 if (new_recursive.offset_save != stacksave) 1816 (PUBL(free))(new_recursive.offset_save); 1817 RRETURN(MATCH_NOMATCH); 1818 } 1819 1820 RECURSION_MATCHED: 1821 break; 1822 1823 /* An alternation is the end of a branch; scan along to find the end of the 1824 bracketed group and go to there. */ 1825 1826 case OP_ALT: 1827 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1828 break; 1829 1830 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, 1831 indicating that it may occur zero times. It may repeat infinitely, or not 1832 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets 1833 with fixed upper repeat limits are compiled as a number of copies, with the 1834 optional ones preceded by BRAZERO or BRAMINZERO. 
*/ 1835 1836 case OP_BRAZERO: 1837 next = ecode + 1; 1838 RMATCH(eptr, next, offset_top, md, eptrb, RM10); 1839 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1840 do next += GET(next, 1); while (*next == OP_ALT); 1841 ecode = next + 1 + LINK_SIZE; 1842 break; 1843 1844 case OP_BRAMINZERO: 1845 next = ecode + 1; 1846 do next += GET(next, 1); while (*next == OP_ALT); 1847 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11); 1848 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1849 ecode++; 1850 break; 1851 1852 case OP_SKIPZERO: 1853 next = ecode+1; 1854 do next += GET(next,1); while (*next == OP_ALT); 1855 ecode = next + 1 + LINK_SIZE; 1856 break; 1857 1858 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything 1859 here; just jump to the group, with allow_zero set TRUE. */ 1860 1861 case OP_BRAPOSZERO: 1862 op = *(++ecode); 1863 allow_zero = TRUE; 1864 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE; 1865 goto POSSESSIVE_NON_CAPTURE; 1866 1867 /* End of a group, repeated or non-repeating. */ 1868 1869 case OP_KET: 1870 case OP_KETRMIN: 1871 case OP_KETRMAX: 1872 case OP_KETRPOS: 1873 prev = ecode - GET(ecode, 1); 1874 1875 /* If this was a group that remembered the subject start, in order to break 1876 infinite repeats of empty string matches, retrieve the subject start from 1877 the chain. Otherwise, set it NULL. */ 1878 1879 if (*prev >= OP_SBRA || *prev == OP_ONCE) 1880 { 1881 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ 1882 eptrb = eptrb->epb_prev; /* Backup to previous group */ 1883 } 1884 else saved_eptr = NULL; 1885 1886 /* If we are at the end of an assertion group or a non-capturing atomic 1887 group, stop matching and return MATCH_MATCH, but record the current high 1888 water mark for use by positive assertions. We also need to record the match 1889 start in case it was changed by \K. 
*/ 1890 1891 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) || 1892 *prev == OP_ONCE_NC) 1893 { 1894 md->end_match_ptr = eptr; /* For ONCE_NC */ 1895 md->end_offset_top = offset_top; 1896 md->start_match_ptr = mstart; 1897 RRETURN(MATCH_MATCH); /* Sets md->mark */ 1898 } 1899 1900 /* For capturing groups we have to check the group number back at the start 1901 and if necessary complete handling an extraction by setting the offsets and 1902 bumping the high water mark. Whole-pattern recursion is coded as a recurse 1903 into group 0, so it won't be picked up here. Instead, we catch it when the 1904 OP_END is reached. Other recursion is handled here. We just have to record 1905 the current subject position and start match pointer and give a MATCH 1906 return. */ 1907 1908 if (*prev == OP_CBRA || *prev == OP_SCBRA || 1909 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS) 1910 { 1911 number = GET2(prev, 1+LINK_SIZE); 1912 offset = number << 1; 1913 1914#ifdef PCRE_DEBUG 1915 printf("end bracket %d", number); 1916 printf("\n"); 1917#endif 1918 1919 /* Handle a recursively called group. */ 1920 1921 if (md->recursive != NULL && md->recursive->group_num == number) 1922 { 1923 md->end_match_ptr = eptr; 1924 md->start_match_ptr = mstart; 1925 RRETURN(MATCH_MATCH); 1926 } 1927 1928 /* Deal with capturing */ 1929 1930 md->capture_last = number; 1931 if (offset >= md->offset_max) md->offset_overflow = TRUE; else 1932 { 1933 /* If offset is greater than offset_top, it means that we are 1934 "skipping" a capturing group, and that group's offsets must be marked 1935 unset. In earlier versions of PCRE, all the offsets were unset at the 1936 start of matching, but this doesn't work because atomic groups and 1937 assertions can cause a value to be set that should later be unset. 1938 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as 1939 part of the atomic group, but this is not on the final matching path, 1940 so must be unset when 2 is set. 
(If there is no group 2, there is no 1941 problem, because offset_top will then be 2, indicating no capture.) */ 1942 1943 if (offset > offset_top) 1944 { 1945 register int *iptr = md->offset_vector + offset_top; 1946 register int *iend = md->offset_vector + offset; 1947 while (iptr < iend) *iptr++ = -1; 1948 } 1949 1950 /* Now make the extraction */ 1951 1952 md->offset_vector[offset] = 1953 md->offset_vector[md->offset_end - number]; 1954 md->offset_vector[offset+1] = (int)(eptr - md->start_subject); 1955 if (offset_top <= offset) offset_top = offset + 2; 1956 } 1957 } 1958 1959 /* For an ordinary non-repeating ket, just continue at this level. This 1960 also happens for a repeating ket if no characters were matched in the 1961 group. This is the forcible breaking of infinite loops as implemented in 1962 Perl 5.005. For a non-repeating atomic group that includes captures, 1963 establish a backup point by processing the rest of the pattern at a lower 1964 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the 1965 original OP_ONCE level, thereby bypassing intermediate backup points, but 1966 resetting any captures that happened along the way. */ 1967 1968 if (*ecode == OP_KET || eptr == saved_eptr) 1969 { 1970 if (*prev == OP_ONCE) 1971 { 1972 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12); 1973 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1974 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 1975 RRETURN(MATCH_ONCE); 1976 } 1977 ecode += 1 + LINK_SIZE; /* Carry on at this level */ 1978 break; 1979 } 1980 1981 /* OP_KETRPOS is a possessive repeating ket. Remember the current position, 1982 and return the MATCH_KETRPOS. This makes it possible to do the repeats one 1983 at a time from the outer level, thus saving stack. 
*/ 1984 1985 if (*ecode == OP_KETRPOS) 1986 { 1987 md->end_match_ptr = eptr; 1988 md->end_offset_top = offset_top; 1989 RRETURN(MATCH_KETRPOS); 1990 } 1991 1992 /* The normal repeating kets try the rest of the pattern or restart from 1993 the preceding bracket, in the appropriate order. In the second case, we can 1994 use tail recursion to avoid using another stack frame, unless we have an 1995 an atomic group or an unlimited repeat of a group that can match an empty 1996 string. */ 1997 1998 if (*ecode == OP_KETRMIN) 1999 { 2000 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7); 2001 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2002 if (*prev == OP_ONCE) 2003 { 2004 RMATCH(eptr, prev, offset_top, md, eptrb, RM8); 2005 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2006 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */ 2007 RRETURN(MATCH_ONCE); 2008 } 2009 if (*prev >= OP_SBRA) /* Could match an empty string */ 2010 { 2011 RMATCH(eptr, prev, offset_top, md, eptrb, RM50); 2012 RRETURN(rrc); 2013 } 2014 ecode = prev; 2015 goto TAIL_RECURSE; 2016 } 2017 else /* OP_KETRMAX */ 2018 { 2019 RMATCH(eptr, prev, offset_top, md, eptrb, RM13); 2020 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH; 2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2022 if (*prev == OP_ONCE) 2023 { 2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9); 2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2026 md->once_target = prev; 2027 RRETURN(MATCH_ONCE); 2028 } 2029 ecode += 1 + LINK_SIZE; 2030 goto TAIL_RECURSE; 2031 } 2032 /* Control never gets here */ 2033 2034 /* Not multiline mode: start of subject assertion, unless notbol. 
*/ 2035 2036 case OP_CIRC: 2037 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2038 2039 /* Start of subject assertion */ 2040 2041 case OP_SOD: 2042 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); 2043 ecode++; 2044 break; 2045 2046 /* Multiline mode: start of subject unless notbol, or after any newline. */ 2047 2048 case OP_CIRCM: 2049 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 2050 if (eptr != md->start_subject && 2051 (eptr == md->end_subject || !WAS_NEWLINE(eptr))) 2052 RRETURN(MATCH_NOMATCH); 2053 ecode++; 2054 break; 2055 2056 /* Start of match assertion */ 2057 2058 case OP_SOM: 2059 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); 2060 ecode++; 2061 break; 2062 2063 /* Reset the start of match point */ 2064 2065 case OP_SET_SOM: 2066 mstart = eptr; 2067 ecode++; 2068 break; 2069 2070 /* Multiline mode: assert before any newline, or before end of subject 2071 unless noteol is set. */ 2072 2073 case OP_DOLLM: 2074 if (eptr < md->end_subject) 2075 { 2076 if (!IS_NEWLINE(eptr)) 2077 { 2078 if (md->partial != 0 && 2079 eptr + 1 >= md->end_subject && 2080 NLBLOCK->nltype == NLTYPE_FIXED && 2081 NLBLOCK->nllen == 2 && 2082 *eptr == NLBLOCK->nl[0]) 2083 { 2084 md->hitend = TRUE; 2085 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2086 } 2087 RRETURN(MATCH_NOMATCH); 2088 } 2089 } 2090 else 2091 { 2092 if (md->noteol) RRETURN(MATCH_NOMATCH); 2093 SCHECK_PARTIAL(); 2094 } 2095 ecode++; 2096 break; 2097 2098 /* Not multiline mode: assert before a terminating newline or before end of 2099 subject unless noteol is set. */ 2100 2101 case OP_DOLL: 2102 if (md->noteol) RRETURN(MATCH_NOMATCH); 2103 if (!md->endonly) goto ASSERT_NL_OR_EOS; 2104 2105 /* ... 
else fall through for endonly */ 2106 2107 /* End of subject assertion (\z) */ 2108 2109 case OP_EOD: 2110 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); 2111 SCHECK_PARTIAL(); 2112 ecode++; 2113 break; 2114 2115 /* End of subject or ending \n assertion (\Z) */ 2116 2117 case OP_EODN: 2118 ASSERT_NL_OR_EOS: 2119 if (eptr < md->end_subject && 2120 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) 2121 { 2122 if (md->partial != 0 && 2123 eptr + 1 >= md->end_subject && 2124 NLBLOCK->nltype == NLTYPE_FIXED && 2125 NLBLOCK->nllen == 2 && 2126 *eptr == NLBLOCK->nl[0]) 2127 { 2128 md->hitend = TRUE; 2129 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2130 } 2131 RRETURN(MATCH_NOMATCH); 2132 } 2133 2134 /* Either at end of string or \n before end. */ 2135 2136 SCHECK_PARTIAL(); 2137 ecode++; 2138 break; 2139 2140 /* Word boundary assertions */ 2141 2142 case OP_NOT_WORD_BOUNDARY: 2143 case OP_WORD_BOUNDARY: 2144 { 2145 2146 /* Find out if the previous and current characters are "word" characters. 2147 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 2148 be "non-word" characters. Remember the earliest consulted character for 2149 partial matching. 
*/ 2150 2151#ifdef SUPPORT_UTF 2152 if (utf) 2153 { 2154 /* Get status of previous character */ 2155 2156 if (eptr == md->start_subject) prev_is_word = FALSE; else 2157 { 2158 PCRE_PUCHAR lastptr = eptr - 1; 2159 BACKCHAR(lastptr); 2160 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; 2161 GETCHAR(c, lastptr); 2162#ifdef SUPPORT_UCP 2163 if (md->use_ucp) 2164 { 2165 if (c == '_') prev_is_word = TRUE; else 2166 { 2167 int cat = UCD_CATEGORY(c); 2168 prev_is_word = (cat == ucp_L || cat == ucp_N); 2169 } 2170 } 2171 else 2172#endif 2173 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2174 } 2175 2176 /* Get status of next character */ 2177 2178 if (eptr >= md->end_subject) 2179 { 2180 SCHECK_PARTIAL(); 2181 cur_is_word = FALSE; 2182 } 2183 else 2184 { 2185 GETCHAR(c, eptr); 2186#ifdef SUPPORT_UCP 2187 if (md->use_ucp) 2188 { 2189 if (c == '_') cur_is_word = TRUE; else 2190 { 2191 int cat = UCD_CATEGORY(c); 2192 cur_is_word = (cat == ucp_L || cat == ucp_N); 2193 } 2194 } 2195 else 2196#endif 2197 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 2198 } 2199 } 2200 else 2201#endif 2202 2203 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for 2204 consistency with the behaviour of \w we do use it in this case. 
*/ 2205 2206 { 2207 /* Get status of previous character */ 2208 2209 if (eptr == md->start_subject) prev_is_word = FALSE; else 2210 { 2211 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; 2212#ifdef SUPPORT_UCP 2213 if (md->use_ucp) 2214 { 2215 c = eptr[-1]; 2216 if (c == '_') prev_is_word = TRUE; else 2217 { 2218 int cat = UCD_CATEGORY(c); 2219 prev_is_word = (cat == ucp_L || cat == ucp_N); 2220 } 2221 } 2222 else 2223#endif 2224 prev_is_word = MAX_255(eptr[-1]) 2225 && ((md->ctypes[eptr[-1]] & ctype_word) != 0); 2226 } 2227 2228 /* Get status of next character */ 2229 2230 if (eptr >= md->end_subject) 2231 { 2232 SCHECK_PARTIAL(); 2233 cur_is_word = FALSE; 2234 } 2235 else 2236#ifdef SUPPORT_UCP 2237 if (md->use_ucp) 2238 { 2239 c = *eptr; 2240 if (c == '_') cur_is_word = TRUE; else 2241 { 2242 int cat = UCD_CATEGORY(c); 2243 cur_is_word = (cat == ucp_L || cat == ucp_N); 2244 } 2245 } 2246 else 2247#endif 2248 cur_is_word = MAX_255(*eptr) 2249 && ((md->ctypes[*eptr] & ctype_word) != 0); 2250 } 2251 2252 /* Now see if the situation is what we want */ 2253 2254 if ((*ecode++ == OP_WORD_BOUNDARY)? 2255 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 2256 RRETURN(MATCH_NOMATCH); 2257 } 2258 break; 2259 2260 /* Match any single character type except newline; have to take care with 2261 CRLF newlines and partial matching. */ 2262 2263 case OP_ANY: 2264 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 2265 if (md->partial != 0 && 2266 eptr + 1 >= md->end_subject && 2267 NLBLOCK->nltype == NLTYPE_FIXED && 2268 NLBLOCK->nllen == 2 && 2269 *eptr == NLBLOCK->nl[0]) 2270 { 2271 md->hitend = TRUE; 2272 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2273 } 2274 2275 /* Fall through */ 2276 2277 /* Match any single character whatsoever. */ 2278 2279 case OP_ALLANY: 2280 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2281 { /* not be updated before SCHECK_PARTIAL. 
*/ 2282 SCHECK_PARTIAL(); 2283 RRETURN(MATCH_NOMATCH); 2284 } 2285 eptr++; 2286#ifdef SUPPORT_UTF 2287 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 2288#endif 2289 ecode++; 2290 break; 2291 2292 /* Match a single byte, even in UTF-8 mode. This opcode really does match 2293 any byte, even newline, independent of the setting of PCRE_DOTALL. */ 2294 2295 case OP_ANYBYTE: 2296 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */ 2297 { /* not be updated before SCHECK_PARTIAL. */ 2298 SCHECK_PARTIAL(); 2299 RRETURN(MATCH_NOMATCH); 2300 } 2301 eptr++; 2302 ecode++; 2303 break; 2304 2305 case OP_NOT_DIGIT: 2306 if (eptr >= md->end_subject) 2307 { 2308 SCHECK_PARTIAL(); 2309 RRETURN(MATCH_NOMATCH); 2310 } 2311 GETCHARINCTEST(c, eptr); 2312 if ( 2313#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2314 c < 256 && 2315#endif 2316 (md->ctypes[c] & ctype_digit) != 0 2317 ) 2318 RRETURN(MATCH_NOMATCH); 2319 ecode++; 2320 break; 2321 2322 case OP_DIGIT: 2323 if (eptr >= md->end_subject) 2324 { 2325 SCHECK_PARTIAL(); 2326 RRETURN(MATCH_NOMATCH); 2327 } 2328 GETCHARINCTEST(c, eptr); 2329 if ( 2330#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2331 c > 255 || 2332#endif 2333 (md->ctypes[c] & ctype_digit) == 0 2334 ) 2335 RRETURN(MATCH_NOMATCH); 2336 ecode++; 2337 break; 2338 2339 case OP_NOT_WHITESPACE: 2340 if (eptr >= md->end_subject) 2341 { 2342 SCHECK_PARTIAL(); 2343 RRETURN(MATCH_NOMATCH); 2344 } 2345 GETCHARINCTEST(c, eptr); 2346 if ( 2347#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2348 c < 256 && 2349#endif 2350 (md->ctypes[c] & ctype_space) != 0 2351 ) 2352 RRETURN(MATCH_NOMATCH); 2353 ecode++; 2354 break; 2355 2356 case OP_WHITESPACE: 2357 if (eptr >= md->end_subject) 2358 { 2359 SCHECK_PARTIAL(); 2360 RRETURN(MATCH_NOMATCH); 2361 } 2362 GETCHARINCTEST(c, eptr); 2363 if ( 2364#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2365 c > 255 || 2366#endif 2367 (md->ctypes[c] & ctype_space) == 0 2368 ) 2369 
RRETURN(MATCH_NOMATCH); 2370 ecode++; 2371 break; 2372 2373 case OP_NOT_WORDCHAR: 2374 if (eptr >= md->end_subject) 2375 { 2376 SCHECK_PARTIAL(); 2377 RRETURN(MATCH_NOMATCH); 2378 } 2379 GETCHARINCTEST(c, eptr); 2380 if ( 2381#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2382 c < 256 && 2383#endif 2384 (md->ctypes[c] & ctype_word) != 0 2385 ) 2386 RRETURN(MATCH_NOMATCH); 2387 ecode++; 2388 break; 2389 2390 case OP_WORDCHAR: 2391 if (eptr >= md->end_subject) 2392 { 2393 SCHECK_PARTIAL(); 2394 RRETURN(MATCH_NOMATCH); 2395 } 2396 GETCHARINCTEST(c, eptr); 2397 if ( 2398#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) 2399 c > 255 || 2400#endif 2401 (md->ctypes[c] & ctype_word) == 0 2402 ) 2403 RRETURN(MATCH_NOMATCH); 2404 ecode++; 2405 break; 2406 2407 case OP_ANYNL: 2408 if (eptr >= md->end_subject) 2409 { 2410 SCHECK_PARTIAL(); 2411 RRETURN(MATCH_NOMATCH); 2412 } 2413 GETCHARINCTEST(c, eptr); 2414 switch(c) 2415 { 2416 default: RRETURN(MATCH_NOMATCH); 2417 2418 case 0x000d: 2419 if (eptr >= md->end_subject) 2420 { 2421 SCHECK_PARTIAL(); 2422 } 2423 else if (*eptr == 0x0a) eptr++; 2424 break; 2425 2426 case 0x000a: 2427 break; 2428 2429 case 0x000b: 2430 case 0x000c: 2431 case 0x0085: 2432 case 0x2028: 2433 case 0x2029: 2434 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 2435 break; 2436 } 2437 ecode++; 2438 break; 2439 2440 case OP_NOT_HSPACE: 2441 if (eptr >= md->end_subject) 2442 { 2443 SCHECK_PARTIAL(); 2444 RRETURN(MATCH_NOMATCH); 2445 } 2446 GETCHARINCTEST(c, eptr); 2447 switch(c) 2448 { 2449 default: break; 2450 case 0x09: /* HT */ 2451 case 0x20: /* SPACE */ 2452 case 0xa0: /* NBSP */ 2453 case 0x1680: /* OGHAM SPACE MARK */ 2454 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 2455 case 0x2000: /* EN QUAD */ 2456 case 0x2001: /* EM QUAD */ 2457 case 0x2002: /* EN SPACE */ 2458 case 0x2003: /* EM SPACE */ 2459 case 0x2004: /* THREE-PER-EM SPACE */ 2460 case 0x2005: /* FOUR-PER-EM SPACE */ 2461 case 0x2006: /* SIX-PER-EM SPACE */ 2462 case 0x2007: /* FIGURE 
SPACE */ 2463 case 0x2008: /* PUNCTUATION SPACE */ 2464 case 0x2009: /* THIN SPACE */ 2465 case 0x200A: /* HAIR SPACE */ 2466 case 0x202f: /* NARROW NO-BREAK SPACE */ 2467 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 2468 case 0x3000: /* IDEOGRAPHIC SPACE */ 2469 RRETURN(MATCH_NOMATCH); 2470 } 2471 ecode++; 2472 break; 2473 2474 case OP_HSPACE: 2475 if (eptr >= md->end_subject) 2476 { 2477 SCHECK_PARTIAL(); 2478 RRETURN(MATCH_NOMATCH); 2479 } 2480 GETCHARINCTEST(c, eptr); 2481 switch(c) 2482 { 2483 default: RRETURN(MATCH_NOMATCH); 2484 case 0x09: /* HT */ 2485 case 0x20: /* SPACE */ 2486 case 0xa0: /* NBSP */ 2487 case 0x1680: /* OGHAM SPACE MARK */ 2488 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 2489 case 0x2000: /* EN QUAD */ 2490 case 0x2001: /* EM QUAD */ 2491 case 0x2002: /* EN SPACE */ 2492 case 0x2003: /* EM SPACE */ 2493 case 0x2004: /* THREE-PER-EM SPACE */ 2494 case 0x2005: /* FOUR-PER-EM SPACE */ 2495 case 0x2006: /* SIX-PER-EM SPACE */ 2496 case 0x2007: /* FIGURE SPACE */ 2497 case 0x2008: /* PUNCTUATION SPACE */ 2498 case 0x2009: /* THIN SPACE */ 2499 case 0x200A: /* HAIR SPACE */ 2500 case 0x202f: /* NARROW NO-BREAK SPACE */ 2501 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 2502 case 0x3000: /* IDEOGRAPHIC SPACE */ 2503 break; 2504 } 2505 ecode++; 2506 break; 2507 2508 case OP_NOT_VSPACE: 2509 if (eptr >= md->end_subject) 2510 { 2511 SCHECK_PARTIAL(); 2512 RRETURN(MATCH_NOMATCH); 2513 } 2514 GETCHARINCTEST(c, eptr); 2515 switch(c) 2516 { 2517 default: break; 2518 case 0x0a: /* LF */ 2519 case 0x0b: /* VT */ 2520 case 0x0c: /* FF */ 2521 case 0x0d: /* CR */ 2522 case 0x85: /* NEL */ 2523 case 0x2028: /* LINE SEPARATOR */ 2524 case 0x2029: /* PARAGRAPH SEPARATOR */ 2525 RRETURN(MATCH_NOMATCH); 2526 } 2527 ecode++; 2528 break; 2529 2530 case OP_VSPACE: 2531 if (eptr >= md->end_subject) 2532 { 2533 SCHECK_PARTIAL(); 2534 RRETURN(MATCH_NOMATCH); 2535 } 2536 GETCHARINCTEST(c, eptr); 2537 switch(c) 2538 { 2539 default: RRETURN(MATCH_NOMATCH); 2540 case 
0x0a: /* LF */ 2541 case 0x0b: /* VT */ 2542 case 0x0c: /* FF */ 2543 case 0x0d: /* CR */ 2544 case 0x85: /* NEL */ 2545 case 0x2028: /* LINE SEPARATOR */ 2546 case 0x2029: /* PARAGRAPH SEPARATOR */ 2547 break; 2548 } 2549 ecode++; 2550 break; 2551 2552#ifdef SUPPORT_UCP 2553 /* Check the next character by Unicode property. We will get here only 2554 if the support is in the binary; otherwise a compile-time error occurs. */ 2555 2556 case OP_PROP: 2557 case OP_NOTPROP: 2558 if (eptr >= md->end_subject) 2559 { 2560 SCHECK_PARTIAL(); 2561 RRETURN(MATCH_NOMATCH); 2562 } 2563 GETCHARINCTEST(c, eptr); 2564 { 2565 const ucd_record *prop = GET_UCD(c); 2566 2567 switch(ecode[1]) 2568 { 2569 case PT_ANY: 2570 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 2571 break; 2572 2573 case PT_LAMP: 2574 if ((prop->chartype == ucp_Lu || 2575 prop->chartype == ucp_Ll || 2576 prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) 2577 RRETURN(MATCH_NOMATCH); 2578 break; 2579 2580 case PT_GC: 2581 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) 2582 RRETURN(MATCH_NOMATCH); 2583 break; 2584 2585 case PT_PC: 2586 if ((ecode[2] != prop->chartype) == (op == OP_PROP)) 2587 RRETURN(MATCH_NOMATCH); 2588 break; 2589 2590 case PT_SC: 2591 if ((ecode[2] != prop->script) == (op == OP_PROP)) 2592 RRETURN(MATCH_NOMATCH); 2593 break; 2594 2595 /* These are specials */ 2596 2597 case PT_ALNUM: 2598 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2599 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) 2600 RRETURN(MATCH_NOMATCH); 2601 break; 2602 2603 case PT_SPACE: /* Perl space */ 2604 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 2605 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) 2606 == (op == OP_NOTPROP)) 2607 RRETURN(MATCH_NOMATCH); 2608 break; 2609 2610 case PT_PXSPACE: /* POSIX space */ 2611 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 2612 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 2613 c == CHAR_FF || c == CHAR_CR) 2614 == 
(op == OP_NOTPROP)) 2615 RRETURN(MATCH_NOMATCH); 2616 break; 2617 2618 case PT_WORD: 2619 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2620 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 2621 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) 2622 RRETURN(MATCH_NOMATCH); 2623 break; 2624 2625 /* This should never occur */ 2626 2627 default: 2628 RRETURN(PCRE_ERROR_INTERNAL); 2629 } 2630 2631 ecode += 3; 2632 } 2633 break; 2634 2635 /* Match an extended Unicode sequence. We will get here only if the support 2636 is in the binary; otherwise a compile-time error occurs. */ 2637 2638 case OP_EXTUNI: 2639 if (eptr >= md->end_subject) 2640 { 2641 SCHECK_PARTIAL(); 2642 RRETURN(MATCH_NOMATCH); 2643 } 2644 GETCHARINCTEST(c, eptr); 2645 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); 2646 while (eptr < md->end_subject) 2647 { 2648 int len = 1; 2649 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 2650 if (UCD_CATEGORY(c) != ucp_M) break; 2651 eptr += len; 2652 } 2653 CHECK_PARTIAL(); 2654 ecode++; 2655 break; 2656#endif 2657 2658 2659 /* Match a back reference, possibly repeatedly. Look past the end of the 2660 item to see if there is repeat information following. The code is similar 2661 to that for character classes, but repeated for efficiency. Then obey 2662 similar code to character type repeats - written out again for speed. 2663 However, if the referenced string is the empty string, always treat 2664 it as matched, any number of times (otherwise there could be infinite 2665 loops). */ 2666 2667 case OP_REF: 2668 case OP_REFI: 2669 caseless = op == OP_REFI; 2670 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 2671 ecode += 1 + IMM2_SIZE; 2672 2673 /* If the reference is unset, there are two possibilities: 2674 2675 (a) In the default, Perl-compatible state, set the length negative; 2676 this ensures that every attempt at a match fails. We can't just fail 2677 here, because of the possibility of quantifiers with zero minima. 
2678 2679 (b) If the JavaScript compatibility flag is set, set the length to zero 2680 so that the back reference matches an empty string. 2681 2682 Otherwise, set the length to the length of what was matched by the 2683 referenced subpattern. */ 2684 2685 if (offset >= offset_top || md->offset_vector[offset] < 0) 2686 length = (md->jscript_compat)? 0 : -1; 2687 else 2688 length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2689 2690 /* Set up for repetition, or handle the non-repeated case */ 2691 2692 switch (*ecode) 2693 { 2694 case OP_CRSTAR: 2695 case OP_CRMINSTAR: 2696 case OP_CRPLUS: 2697 case OP_CRMINPLUS: 2698 case OP_CRQUERY: 2699 case OP_CRMINQUERY: 2700 c = *ecode++ - OP_CRSTAR; 2701 minimize = (c & 1) != 0; 2702 min = rep_min[c]; /* Pick up values from tables; */ 2703 max = rep_max[c]; /* zero for max => infinity */ 2704 if (max == 0) max = INT_MAX; 2705 break; 2706 2707 case OP_CRRANGE: 2708 case OP_CRMINRANGE: 2709 minimize = (*ecode == OP_CRMINRANGE); 2710 min = GET2(ecode, 1); 2711 max = GET2(ecode, 1 + IMM2_SIZE); 2712 if (max == 0) max = INT_MAX; 2713 ecode += 1 + 2 * IMM2_SIZE; 2714 break; 2715 2716 default: /* No repeat follows */ 2717 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0) 2718 { 2719 if (length == -2) eptr = md->end_subject; /* Partial match */ 2720 CHECK_PARTIAL(); 2721 RRETURN(MATCH_NOMATCH); 2722 } 2723 eptr += length; 2724 continue; /* With the main loop */ 2725 } 2726 2727 /* Handle repeated back references. If the length of the reference is 2728 zero, just continue with the main loop. If the length is negative, it 2729 means the reference is unset in non-Java-compatible mode. If the minimum is 2730 zero, we can continue at the same level without recursion. For any other 2731 minimum, carrying on will result in NOMATCH. */ 2732 2733 if (length == 0) continue; 2734 if (length < 0 && min == 0) continue; 2735 2736 /* First, ensure the minimum number of matches are present. 
We get back 2737 the length of the reference string explicitly rather than passing the 2738 address of eptr, so that eptr can be a register variable. */ 2739 2740 for (i = 1; i <= min; i++) 2741 { 2742 int slength; 2743 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2744 { 2745 if (slength == -2) eptr = md->end_subject; /* Partial match */ 2746 CHECK_PARTIAL(); 2747 RRETURN(MATCH_NOMATCH); 2748 } 2749 eptr += slength; 2750 } 2751 2752 /* If min = max, continue at the same level without recursion. 2753 They are not both allowed to be zero. */ 2754 2755 if (min == max) continue; 2756 2757 /* If minimizing, keep trying and advancing the pointer */ 2758 2759 if (minimize) 2760 { 2761 for (fi = min;; fi++) 2762 { 2763 int slength; 2764 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14); 2765 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2766 if (fi >= max) RRETURN(MATCH_NOMATCH); 2767 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2768 { 2769 if (slength == -2) eptr = md->end_subject; /* Partial match */ 2770 CHECK_PARTIAL(); 2771 RRETURN(MATCH_NOMATCH); 2772 } 2773 eptr += slength; 2774 } 2775 /* Control never gets here */ 2776 } 2777 2778 /* If maximizing, find the longest string and work backwards */ 2779 2780 else 2781 { 2782 pp = eptr; 2783 for (i = min; i < max; i++) 2784 { 2785 int slength; 2786 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) 2787 { 2788 /* Can't use CHECK_PARTIAL because we don't want to update eptr in 2789 the soft partial matching case. 
*/ 2790 2791 if (slength == -2 && md->partial != 0 && 2792 md->end_subject > md->start_used_ptr) 2793 { 2794 md->hitend = TRUE; 2795 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 2796 } 2797 break; 2798 } 2799 eptr += slength; 2800 } 2801 2802 while (eptr >= pp) 2803 { 2804 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15); 2805 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2806 eptr -= length; 2807 } 2808 RRETURN(MATCH_NOMATCH); 2809 } 2810 /* Control never gets here */ 2811 2812 /* Match a bit-mapped character class, possibly repeatedly. This op code is 2813 used when all the characters in the class have values in the range 0-255, 2814 and either the matching is caseful, or the characters are in the range 2815 0-127 when UTF-8 processing is enabled. The only difference between 2816 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 2817 encountered. 2818 2819 First, look past the end of the item to see if there is repeat information 2820 following. Then obey similar code to character type repeats - written out 2821 again for speed. */ 2822 2823 case OP_NCLASS: 2824 case OP_CLASS: 2825 { 2826 /* The data variable is saved across frames, so the byte map needs to 2827 be stored there. 
*/ 2828#define BYTE_MAP ((pcre_uint8 *)data) 2829 data = ecode + 1; /* Save for matching */ 2830 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */ 2831 2832 switch (*ecode) 2833 { 2834 case OP_CRSTAR: 2835 case OP_CRMINSTAR: 2836 case OP_CRPLUS: 2837 case OP_CRMINPLUS: 2838 case OP_CRQUERY: 2839 case OP_CRMINQUERY: 2840 c = *ecode++ - OP_CRSTAR; 2841 minimize = (c & 1) != 0; 2842 min = rep_min[c]; /* Pick up values from tables; */ 2843 max = rep_max[c]; /* zero for max => infinity */ 2844 if (max == 0) max = INT_MAX; 2845 break; 2846 2847 case OP_CRRANGE: 2848 case OP_CRMINRANGE: 2849 minimize = (*ecode == OP_CRMINRANGE); 2850 min = GET2(ecode, 1); 2851 max = GET2(ecode, 1 + IMM2_SIZE); 2852 if (max == 0) max = INT_MAX; 2853 ecode += 1 + 2 * IMM2_SIZE; 2854 break; 2855 2856 default: /* No repeat follows */ 2857 min = max = 1; 2858 break; 2859 } 2860 2861 /* First, ensure the minimum number of matches are present. */ 2862 2863#ifdef SUPPORT_UTF 2864 if (utf) 2865 { 2866 for (i = 1; i <= min; i++) 2867 { 2868 if (eptr >= md->end_subject) 2869 { 2870 SCHECK_PARTIAL(); 2871 RRETURN(MATCH_NOMATCH); 2872 } 2873 GETCHARINC(c, eptr); 2874 if (c > 255) 2875 { 2876 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2877 } 2878 else 2879 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2880 } 2881 } 2882 else 2883#endif 2884 /* Not UTF mode */ 2885 { 2886 for (i = 1; i <= min; i++) 2887 { 2888 if (eptr >= md->end_subject) 2889 { 2890 SCHECK_PARTIAL(); 2891 RRETURN(MATCH_NOMATCH); 2892 } 2893 c = *eptr++; 2894#ifndef COMPILE_PCRE8 2895 if (c > 255) 2896 { 2897 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2898 } 2899 else 2900#endif 2901 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2902 } 2903 } 2904 2905 /* If max == min we can continue with the main loop without the 2906 need to recurse. 
*/ 2907 2908 if (min == max) continue; 2909 2910 /* If minimizing, keep testing the rest of the expression and advancing 2911 the pointer while it matches the class. */ 2912 2913 if (minimize) 2914 { 2915#ifdef SUPPORT_UTF 2916 if (utf) 2917 { 2918 for (fi = min;; fi++) 2919 { 2920 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16); 2921 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2922 if (fi >= max) RRETURN(MATCH_NOMATCH); 2923 if (eptr >= md->end_subject) 2924 { 2925 SCHECK_PARTIAL(); 2926 RRETURN(MATCH_NOMATCH); 2927 } 2928 GETCHARINC(c, eptr); 2929 if (c > 255) 2930 { 2931 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2932 } 2933 else 2934 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2935 } 2936 } 2937 else 2938#endif 2939 /* Not UTF mode */ 2940 { 2941 for (fi = min;; fi++) 2942 { 2943 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17); 2944 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2945 if (fi >= max) RRETURN(MATCH_NOMATCH); 2946 if (eptr >= md->end_subject) 2947 { 2948 SCHECK_PARTIAL(); 2949 RRETURN(MATCH_NOMATCH); 2950 } 2951 c = *eptr++; 2952#ifndef COMPILE_PCRE8 2953 if (c > 255) 2954 { 2955 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2956 } 2957 else 2958#endif 2959 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2960 } 2961 } 2962 /* Control never gets here */ 2963 } 2964 2965 /* If maximizing, find the longest possible run, then work backwards. 
*/ 2966 2967 else 2968 { 2969 pp = eptr; 2970 2971#ifdef SUPPORT_UTF 2972 if (utf) 2973 { 2974 for (i = min; i < max; i++) 2975 { 2976 int len = 1; 2977 if (eptr >= md->end_subject) 2978 { 2979 SCHECK_PARTIAL(); 2980 break; 2981 } 2982 GETCHARLEN(c, eptr, len); 2983 if (c > 255) 2984 { 2985 if (op == OP_CLASS) break; 2986 } 2987 else 2988 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 2989 eptr += len; 2990 } 2991 for (;;) 2992 { 2993 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); 2994 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2995 if (eptr-- == pp) break; /* Stop if tried at original pos */ 2996 BACKCHAR(eptr); 2997 } 2998 } 2999 else 3000#endif 3001 /* Not UTF mode */ 3002 { 3003 for (i = min; i < max; i++) 3004 { 3005 if (eptr >= md->end_subject) 3006 { 3007 SCHECK_PARTIAL(); 3008 break; 3009 } 3010 c = *eptr; 3011#ifndef COMPILE_PCRE8 3012 if (c > 255) 3013 { 3014 if (op == OP_CLASS) break; 3015 } 3016 else 3017#endif 3018 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; 3019 eptr++; 3020 } 3021 while (eptr >= pp) 3022 { 3023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); 3024 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3025 eptr--; 3026 } 3027 } 3028 3029 RRETURN(MATCH_NOMATCH); 3030 } 3031#undef BYTE_MAP 3032 } 3033 /* Control never gets here */ 3034 3035 3036 /* Match an extended character class. This opcode is encountered only 3037 when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8 3038 mode, because Unicode properties are supported in non-UTF-8 mode. 
*/ 3039 3040#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3041 case OP_XCLASS: 3042 { 3043 data = ecode + 1 + LINK_SIZE; /* Save for matching */ 3044 ecode += GET(ecode, 1); /* Advance past the item */ 3045 3046 switch (*ecode) 3047 { 3048 case OP_CRSTAR: 3049 case OP_CRMINSTAR: 3050 case OP_CRPLUS: 3051 case OP_CRMINPLUS: 3052 case OP_CRQUERY: 3053 case OP_CRMINQUERY: 3054 c = *ecode++ - OP_CRSTAR; 3055 minimize = (c & 1) != 0; 3056 min = rep_min[c]; /* Pick up values from tables; */ 3057 max = rep_max[c]; /* zero for max => infinity */ 3058 if (max == 0) max = INT_MAX; 3059 break; 3060 3061 case OP_CRRANGE: 3062 case OP_CRMINRANGE: 3063 minimize = (*ecode == OP_CRMINRANGE); 3064 min = GET2(ecode, 1); 3065 max = GET2(ecode, 1 + IMM2_SIZE); 3066 if (max == 0) max = INT_MAX; 3067 ecode += 1 + 2 * IMM2_SIZE; 3068 break; 3069 3070 default: /* No repeat follows */ 3071 min = max = 1; 3072 break; 3073 } 3074 3075 /* First, ensure the minimum number of matches are present. */ 3076 3077 for (i = 1; i <= min; i++) 3078 { 3079 if (eptr >= md->end_subject) 3080 { 3081 SCHECK_PARTIAL(); 3082 RRETURN(MATCH_NOMATCH); 3083 } 3084 GETCHARINCTEST(c, eptr); 3085 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3086 } 3087 3088 /* If max == min we can continue with the main loop without the 3089 need to recurse. */ 3090 3091 if (min == max) continue; 3092 3093 /* If minimizing, keep testing the rest of the expression and advancing 3094 the pointer while it matches the class. 
*/ 3095 3096 if (minimize) 3097 { 3098 for (fi = min;; fi++) 3099 { 3100 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20); 3101 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3102 if (fi >= max) RRETURN(MATCH_NOMATCH); 3103 if (eptr >= md->end_subject) 3104 { 3105 SCHECK_PARTIAL(); 3106 RRETURN(MATCH_NOMATCH); 3107 } 3108 GETCHARINCTEST(c, eptr); 3109 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); 3110 } 3111 /* Control never gets here */ 3112 } 3113 3114 /* If maximizing, find the longest possible run, then work backwards. */ 3115 3116 else 3117 { 3118 pp = eptr; 3119 for (i = min; i < max; i++) 3120 { 3121 int len = 1; 3122 if (eptr >= md->end_subject) 3123 { 3124 SCHECK_PARTIAL(); 3125 break; 3126 } 3127#ifdef SUPPORT_UTF 3128 GETCHARLENTEST(c, eptr, len); 3129#else 3130 c = *eptr; 3131#endif 3132 if (!PRIV(xclass)(c, data, utf)) break; 3133 eptr += len; 3134 } 3135 for(;;) 3136 { 3137 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); 3138 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3139 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3140#ifdef SUPPORT_UTF 3141 if (utf) BACKCHAR(eptr); 3142#endif 3143 } 3144 RRETURN(MATCH_NOMATCH); 3145 } 3146 3147 /* Control never gets here */ 3148 } 3149#endif /* End of XCLASS */ 3150 3151 /* Match a single character, casefully */ 3152 3153 case OP_CHAR: 3154#ifdef SUPPORT_UTF 3155 if (utf) 3156 { 3157 length = 1; 3158 ecode++; 3159 GETCHARLEN(fc, ecode, length); 3160 if (length > md->end_subject - eptr) 3161 { 3162 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 3163 RRETURN(MATCH_NOMATCH); 3164 } 3165 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); 3166 } 3167 else 3168#endif 3169 /* Not UTF mode */ 3170 { 3171 if (md->end_subject - eptr < 1) 3172 { 3173 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 3174 RRETURN(MATCH_NOMATCH); 3175 } 3176 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); 3177 ecode += 2; 3178 } 3179 break; 3180 3181 /* Match a single character, caselessly. 
If we are at the end of the 3182 subject, give up immediately. */ 3183 3184 case OP_CHARI: 3185 if (eptr >= md->end_subject) 3186 { 3187 SCHECK_PARTIAL(); 3188 RRETURN(MATCH_NOMATCH); 3189 } 3190 3191#ifdef SUPPORT_UTF 3192 if (utf) 3193 { 3194 length = 1; 3195 ecode++; 3196 GETCHARLEN(fc, ecode, length); 3197 3198 /* If the pattern character's value is < 128, we have only one byte, and 3199 we know that its other case must also be one byte long, so we can use the 3200 fast lookup table. We know that there is at least one byte left in the 3201 subject. */ 3202 3203 if (fc < 128) 3204 { 3205 if (md->lcc[fc] 3206 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); 3207 ecode++; 3208 eptr++; 3209 } 3210 3211 /* Otherwise we must pick up the subject character. Note that we cannot 3212 use the value of "length" to check for sufficient bytes left, because the 3213 other case of the character may have more or fewer bytes. */ 3214 3215 else 3216 { 3217 unsigned int dc; 3218 GETCHARINC(dc, eptr); 3219 ecode += length; 3220 3221 /* If we have Unicode property support, we can use it to test the other 3222 case of the character, if there is one. */ 3223 3224 if (fc != dc) 3225 { 3226#ifdef SUPPORT_UCP 3227 if (dc != UCD_OTHERCASE(fc)) 3228#endif 3229 RRETURN(MATCH_NOMATCH); 3230 } 3231 } 3232 } 3233 else 3234#endif /* SUPPORT_UTF */ 3235 3236 /* Not UTF mode */ 3237 { 3238 if (TABLE_GET(ecode[1], md->lcc, ecode[1]) 3239 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); 3240 eptr++; 3241 ecode += 2; 3242 } 3243 break; 3244 3245 /* Match a single character repeatedly. 
*/ 3246 3247 case OP_EXACT: 3248 case OP_EXACTI: 3249 min = max = GET2(ecode, 1); 3250 ecode += 1 + IMM2_SIZE; 3251 goto REPEATCHAR; 3252 3253 case OP_POSUPTO: 3254 case OP_POSUPTOI: 3255 possessive = TRUE; 3256 /* Fall through */ 3257 3258 case OP_UPTO: 3259 case OP_UPTOI: 3260 case OP_MINUPTO: 3261 case OP_MINUPTOI: 3262 min = 0; 3263 max = GET2(ecode, 1); 3264 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI; 3265 ecode += 1 + IMM2_SIZE; 3266 goto REPEATCHAR; 3267 3268 case OP_POSSTAR: 3269 case OP_POSSTARI: 3270 possessive = TRUE; 3271 min = 0; 3272 max = INT_MAX; 3273 ecode++; 3274 goto REPEATCHAR; 3275 3276 case OP_POSPLUS: 3277 case OP_POSPLUSI: 3278 possessive = TRUE; 3279 min = 1; 3280 max = INT_MAX; 3281 ecode++; 3282 goto REPEATCHAR; 3283 3284 case OP_POSQUERY: 3285 case OP_POSQUERYI: 3286 possessive = TRUE; 3287 min = 0; 3288 max = 1; 3289 ecode++; 3290 goto REPEATCHAR; 3291 3292 case OP_STAR: 3293 case OP_STARI: 3294 case OP_MINSTAR: 3295 case OP_MINSTARI: 3296 case OP_PLUS: 3297 case OP_PLUSI: 3298 case OP_MINPLUS: 3299 case OP_MINPLUSI: 3300 case OP_QUERY: 3301 case OP_QUERYI: 3302 case OP_MINQUERY: 3303 case OP_MINQUERYI: 3304 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI); 3305 minimize = (c & 1) != 0; 3306 min = rep_min[c]; /* Pick up values from tables; */ 3307 max = rep_max[c]; /* zero for max => infinity */ 3308 if (max == 0) max = INT_MAX; 3309 3310 /* Common code for all repeated single-character matches. */ 3311 3312 REPEATCHAR: 3313#ifdef SUPPORT_UTF 3314 if (utf) 3315 { 3316 length = 1; 3317 charptr = ecode; 3318 GETCHARLEN(fc, ecode, length); 3319 ecode += length; 3320 3321 /* Handle multibyte character matching specially here. There is 3322 support for caseless matching if UCP support is present. 
*/ 3323 3324 if (length > 1) 3325 { 3326#ifdef SUPPORT_UCP 3327 unsigned int othercase; 3328 if (op >= OP_STARI && /* Caseless */ 3329 (othercase = UCD_OTHERCASE(fc)) != fc) 3330 oclength = PRIV(ord2utf)(othercase, occhars); 3331 else oclength = 0; 3332#endif /* SUPPORT_UCP */ 3333 3334 for (i = 1; i <= min; i++) 3335 { 3336 if (eptr <= md->end_subject - length && 3337 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3338#ifdef SUPPORT_UCP 3339 else if (oclength > 0 && 3340 eptr <= md->end_subject - oclength && 3341 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3342#endif /* SUPPORT_UCP */ 3343 else 3344 { 3345 CHECK_PARTIAL(); 3346 RRETURN(MATCH_NOMATCH); 3347 } 3348 } 3349 3350 if (min == max) continue; 3351 3352 if (minimize) 3353 { 3354 for (fi = min;; fi++) 3355 { 3356 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22); 3357 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3358 if (fi >= max) RRETURN(MATCH_NOMATCH); 3359 if (eptr <= md->end_subject - length && 3360 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3361#ifdef SUPPORT_UCP 3362 else if (oclength > 0 && 3363 eptr <= md->end_subject - oclength && 3364 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3365#endif /* SUPPORT_UCP */ 3366 else 3367 { 3368 CHECK_PARTIAL(); 3369 RRETURN(MATCH_NOMATCH); 3370 } 3371 } 3372 /* Control never gets here */ 3373 } 3374 3375 else /* Maximize */ 3376 { 3377 pp = eptr; 3378 for (i = min; i < max; i++) 3379 { 3380 if (eptr <= md->end_subject - length && 3381 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; 3382#ifdef SUPPORT_UCP 3383 else if (oclength > 0 && 3384 eptr <= md->end_subject - oclength && 3385 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; 3386#endif /* SUPPORT_UCP */ 3387 else 3388 { 3389 CHECK_PARTIAL(); 3390 break; 3391 } 3392 } 3393 3394 if (possessive) continue; 3395 3396 for(;;) 3397 { 3398 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23); 3399 if (rrc != 
MATCH_NOMATCH) RRETURN(rrc); 3400 if (eptr == pp) { RRETURN(MATCH_NOMATCH); } 3401#ifdef SUPPORT_UCP 3402 eptr--; 3403 BACKCHAR(eptr); 3404#else /* without SUPPORT_UCP */ 3405 eptr -= length; 3406#endif /* SUPPORT_UCP */ 3407 } 3408 } 3409 /* Control never gets here */ 3410 } 3411 3412 /* If the length of a UTF-8 character is 1, we fall through here, and 3413 obey the code as for non-UTF-8 characters below, though in this case the 3414 value of fc will always be < 128. */ 3415 } 3416 else 3417#endif /* SUPPORT_UTF */ 3418 /* When not in UTF-8 mode, load a single-byte character. */ 3419 fc = *ecode++; 3420 3421 /* The value of fc at this point is always one character, though we may 3422 or may not be in UTF mode. The code is duplicated for the caseless and 3423 caseful cases, for speed, since matching characters is likely to be quite 3424 common. First, ensure the minimum number of matches are present. If min = 3425 max, continue at the same level without recursing. Otherwise, if 3426 minimizing, keep trying the rest of the expression and advancing one 3427 matching character if failing, up to the maximum. Alternatively, if 3428 maximizing, find the maximum number of characters and work backwards. */ 3429 3430 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3431 max, (char *)eptr)); 3432 3433 if (op >= OP_STARI) /* Caseless */ 3434 { 3435#ifdef COMPILE_PCRE8 3436 /* fc must be < 128 if UTF is enabled. 
*/ 3437 foc = md->fcc[fc]; 3438#else 3439#ifdef SUPPORT_UTF 3440#ifdef SUPPORT_UCP 3441 if (utf && fc > 127) 3442 foc = UCD_OTHERCASE(fc); 3443#else 3444 if (utf && fc > 127) 3445 foc = fc; 3446#endif /* SUPPORT_UCP */ 3447 else 3448#endif /* SUPPORT_UTF */ 3449 foc = TABLE_GET(fc, md->fcc, fc); 3450#endif /* COMPILE_PCRE8 */ 3451 3452 for (i = 1; i <= min; i++) 3453 { 3454 if (eptr >= md->end_subject) 3455 { 3456 SCHECK_PARTIAL(); 3457 RRETURN(MATCH_NOMATCH); 3458 } 3459 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); 3460 eptr++; 3461 } 3462 if (min == max) continue; 3463 if (minimize) 3464 { 3465 for (fi = min;; fi++) 3466 { 3467 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24); 3468 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3469 if (fi >= max) RRETURN(MATCH_NOMATCH); 3470 if (eptr >= md->end_subject) 3471 { 3472 SCHECK_PARTIAL(); 3473 RRETURN(MATCH_NOMATCH); 3474 } 3475 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); 3476 eptr++; 3477 } 3478 /* Control never gets here */ 3479 } 3480 else /* Maximize */ 3481 { 3482 pp = eptr; 3483 for (i = min; i < max; i++) 3484 { 3485 if (eptr >= md->end_subject) 3486 { 3487 SCHECK_PARTIAL(); 3488 break; 3489 } 3490 if (fc != *eptr && foc != *eptr) break; 3491 eptr++; 3492 } 3493 3494 if (possessive) continue; 3495 3496 while (eptr >= pp) 3497 { 3498 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25); 3499 eptr--; 3500 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3501 } 3502 RRETURN(MATCH_NOMATCH); 3503 } 3504 /* Control never gets here */ 3505 } 3506 3507 /* Caseful comparisons (includes all multi-byte characters) */ 3508 3509 else 3510 { 3511 for (i = 1; i <= min; i++) 3512 { 3513 if (eptr >= md->end_subject) 3514 { 3515 SCHECK_PARTIAL(); 3516 RRETURN(MATCH_NOMATCH); 3517 } 3518 if (fc != *eptr++) RRETURN(MATCH_NOMATCH); 3519 } 3520 3521 if (min == max) continue; 3522 3523 if (minimize) 3524 { 3525 for (fi = min;; fi++) 3526 { 3527 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26); 3528 if (rrc != MATCH_NOMATCH) 
RRETURN(rrc); 3529 if (fi >= max) RRETURN(MATCH_NOMATCH); 3530 if (eptr >= md->end_subject) 3531 { 3532 SCHECK_PARTIAL(); 3533 RRETURN(MATCH_NOMATCH); 3534 } 3535 if (fc != *eptr++) RRETURN(MATCH_NOMATCH); 3536 } 3537 /* Control never gets here */ 3538 } 3539 else /* Maximize */ 3540 { 3541 pp = eptr; 3542 for (i = min; i < max; i++) 3543 { 3544 if (eptr >= md->end_subject) 3545 { 3546 SCHECK_PARTIAL(); 3547 break; 3548 } 3549 if (fc != *eptr) break; 3550 eptr++; 3551 } 3552 if (possessive) continue; 3553 3554 while (eptr >= pp) 3555 { 3556 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27); 3557 eptr--; 3558 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3559 } 3560 RRETURN(MATCH_NOMATCH); 3561 } 3562 } 3563 /* Control never gets here */ 3564 3565 /* Match a negated single one-byte character. The character we are 3566 checking can be multibyte. */ 3567 3568 case OP_NOT: 3569 case OP_NOTI: 3570 if (eptr >= md->end_subject) 3571 { 3572 SCHECK_PARTIAL(); 3573 RRETURN(MATCH_NOMATCH); 3574 } 3575#ifdef SUPPORT_UTF 3576 if (utf) 3577 { 3578 register unsigned int ch, och; 3579 3580 ecode++; 3581 GETCHARINC(ch, ecode); 3582 GETCHARINC(c, eptr); 3583 3584 if (op == OP_NOT) 3585 { 3586 if (ch == c) RRETURN(MATCH_NOMATCH); 3587 } 3588 else 3589 { 3590#ifdef SUPPORT_UCP 3591 if (ch > 127) 3592 och = UCD_OTHERCASE(ch); 3593#else 3594 if (ch > 127) 3595 och = ch; 3596#endif /* SUPPORT_UCP */ 3597 else 3598 och = TABLE_GET(ch, md->fcc, ch); 3599 if (ch == c || och == c) RRETURN(MATCH_NOMATCH); 3600 } 3601 } 3602 else 3603#endif 3604 { 3605 register unsigned int ch = ecode[1]; 3606 c = *eptr++; 3607 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c)) 3608 RRETURN(MATCH_NOMATCH); 3609 ecode += 2; 3610 } 3611 break; 3612 3613 /* Match a negated single one-byte character repeatedly. 
This is almost a 3614 repeat of the code for a repeated single character, but I haven't found a 3615 nice way of commoning these up that doesn't require a test of the 3616 positive/negative option for each character match. Maybe that wouldn't add 3617 very much to the time taken, but character matching *is* what this is all 3618 about... */ 3619 3620 case OP_NOTEXACT: 3621 case OP_NOTEXACTI: 3622 min = max = GET2(ecode, 1); 3623 ecode += 1 + IMM2_SIZE; 3624 goto REPEATNOTCHAR; 3625 3626 case OP_NOTUPTO: 3627 case OP_NOTUPTOI: 3628 case OP_NOTMINUPTO: 3629 case OP_NOTMINUPTOI: 3630 min = 0; 3631 max = GET2(ecode, 1); 3632 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI; 3633 ecode += 1 + IMM2_SIZE; 3634 goto REPEATNOTCHAR; 3635 3636 case OP_NOTPOSSTAR: 3637 case OP_NOTPOSSTARI: 3638 possessive = TRUE; 3639 min = 0; 3640 max = INT_MAX; 3641 ecode++; 3642 goto REPEATNOTCHAR; 3643 3644 case OP_NOTPOSPLUS: 3645 case OP_NOTPOSPLUSI: 3646 possessive = TRUE; 3647 min = 1; 3648 max = INT_MAX; 3649 ecode++; 3650 goto REPEATNOTCHAR; 3651 3652 case OP_NOTPOSQUERY: 3653 case OP_NOTPOSQUERYI: 3654 possessive = TRUE; 3655 min = 0; 3656 max = 1; 3657 ecode++; 3658 goto REPEATNOTCHAR; 3659 3660 case OP_NOTPOSUPTO: 3661 case OP_NOTPOSUPTOI: 3662 possessive = TRUE; 3663 min = 0; 3664 max = GET2(ecode, 1); 3665 ecode += 1 + IMM2_SIZE; 3666 goto REPEATNOTCHAR; 3667 3668 case OP_NOTSTAR: 3669 case OP_NOTSTARI: 3670 case OP_NOTMINSTAR: 3671 case OP_NOTMINSTARI: 3672 case OP_NOTPLUS: 3673 case OP_NOTPLUSI: 3674 case OP_NOTMINPLUS: 3675 case OP_NOTMINPLUSI: 3676 case OP_NOTQUERY: 3677 case OP_NOTQUERYI: 3678 case OP_NOTMINQUERY: 3679 case OP_NOTMINQUERYI: 3680 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR); 3681 minimize = (c & 1) != 0; 3682 min = rep_min[c]; /* Pick up values from tables; */ 3683 max = rep_max[c]; /* zero for max => infinity */ 3684 if (max == 0) max = INT_MAX; 3685 3686 /* Common code for all repeated single-byte matches. 
*/ 3687 3688 REPEATNOTCHAR: 3689 GETCHARINCTEST(fc, ecode); 3690 3691 /* The code is duplicated for the caseless and caseful cases, for speed, 3692 since matching characters is likely to be quite common. First, ensure the 3693 minimum number of matches are present. If min = max, continue at the same 3694 level without recursing. Otherwise, if minimizing, keep trying the rest of 3695 the expression and advancing one matching character if failing, up to the 3696 maximum. Alternatively, if maximizing, find the maximum number of 3697 characters and work backwards. */ 3698 3699 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 3700 max, (char *)eptr)); 3701 3702 if (op >= OP_NOTSTARI) /* Caseless */ 3703 { 3704#ifdef SUPPORT_UTF 3705#ifdef SUPPORT_UCP 3706 if (utf && fc > 127) 3707 foc = UCD_OTHERCASE(fc); 3708#else 3709 if (utf && fc > 127) 3710 foc = fc; 3711#endif /* SUPPORT_UCP */ 3712 else 3713#endif /* SUPPORT_UTF */ 3714 foc = TABLE_GET(fc, md->fcc, fc); 3715 3716#ifdef SUPPORT_UTF 3717 if (utf) 3718 { 3719 register unsigned int d; 3720 for (i = 1; i <= min; i++) 3721 { 3722 if (eptr >= md->end_subject) 3723 { 3724 SCHECK_PARTIAL(); 3725 RRETURN(MATCH_NOMATCH); 3726 } 3727 GETCHARINC(d, eptr); 3728 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3729 } 3730 } 3731 else 3732#endif 3733 /* Not UTF mode */ 3734 { 3735 for (i = 1; i <= min; i++) 3736 { 3737 if (eptr >= md->end_subject) 3738 { 3739 SCHECK_PARTIAL(); 3740 RRETURN(MATCH_NOMATCH); 3741 } 3742 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3743 eptr++; 3744 } 3745 } 3746 3747 if (min == max) continue; 3748 3749 if (minimize) 3750 { 3751#ifdef SUPPORT_UTF 3752 if (utf) 3753 { 3754 register unsigned int d; 3755 for (fi = min;; fi++) 3756 { 3757 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28); 3758 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3759 if (fi >= max) RRETURN(MATCH_NOMATCH); 3760 if (eptr >= md->end_subject) 3761 { 3762 SCHECK_PARTIAL(); 3763 
RRETURN(MATCH_NOMATCH); 3764 } 3765 GETCHARINC(d, eptr); 3766 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); 3767 } 3768 } 3769 else 3770#endif 3771 /* Not UTF mode */ 3772 { 3773 for (fi = min;; fi++) 3774 { 3775 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29); 3776 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3777 if (fi >= max) RRETURN(MATCH_NOMATCH); 3778 if (eptr >= md->end_subject) 3779 { 3780 SCHECK_PARTIAL(); 3781 RRETURN(MATCH_NOMATCH); 3782 } 3783 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); 3784 eptr++; 3785 } 3786 } 3787 /* Control never gets here */ 3788 } 3789 3790 /* Maximize case */ 3791 3792 else 3793 { 3794 pp = eptr; 3795 3796#ifdef SUPPORT_UTF 3797 if (utf) 3798 { 3799 register unsigned int d; 3800 for (i = min; i < max; i++) 3801 { 3802 int len = 1; 3803 if (eptr >= md->end_subject) 3804 { 3805 SCHECK_PARTIAL(); 3806 break; 3807 } 3808 GETCHARLEN(d, eptr, len); 3809 if (fc == d || (unsigned int)foc == d) break; 3810 eptr += len; 3811 } 3812 if (possessive) continue; 3813 for(;;) 3814 { 3815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); 3816 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3817 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3818 BACKCHAR(eptr); 3819 } 3820 } 3821 else 3822#endif 3823 /* Not UTF mode */ 3824 { 3825 for (i = min; i < max; i++) 3826 { 3827 if (eptr >= md->end_subject) 3828 { 3829 SCHECK_PARTIAL(); 3830 break; 3831 } 3832 if (fc == *eptr || foc == *eptr) break; 3833 eptr++; 3834 } 3835 if (possessive) continue; 3836 while (eptr >= pp) 3837 { 3838 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31); 3839 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3840 eptr--; 3841 } 3842 } 3843 3844 RRETURN(MATCH_NOMATCH); 3845 } 3846 /* Control never gets here */ 3847 } 3848 3849 /* Caseful comparisons */ 3850 3851 else 3852 { 3853#ifdef SUPPORT_UTF 3854 if (utf) 3855 { 3856 register unsigned int d; 3857 for (i = 1; i <= min; i++) 3858 { 3859 if (eptr >= md->end_subject) 3860 { 3861 SCHECK_PARTIAL(); 3862 
RRETURN(MATCH_NOMATCH); 3863 } 3864 GETCHARINC(d, eptr); 3865 if (fc == d) RRETURN(MATCH_NOMATCH); 3866 } 3867 } 3868 else 3869#endif 3870 /* Not UTF mode */ 3871 { 3872 for (i = 1; i <= min; i++) 3873 { 3874 if (eptr >= md->end_subject) 3875 { 3876 SCHECK_PARTIAL(); 3877 RRETURN(MATCH_NOMATCH); 3878 } 3879 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3880 } 3881 } 3882 3883 if (min == max) continue; 3884 3885 if (minimize) 3886 { 3887#ifdef SUPPORT_UTF 3888 if (utf) 3889 { 3890 register unsigned int d; 3891 for (fi = min;; fi++) 3892 { 3893 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32); 3894 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3895 if (fi >= max) RRETURN(MATCH_NOMATCH); 3896 if (eptr >= md->end_subject) 3897 { 3898 SCHECK_PARTIAL(); 3899 RRETURN(MATCH_NOMATCH); 3900 } 3901 GETCHARINC(d, eptr); 3902 if (fc == d) RRETURN(MATCH_NOMATCH); 3903 } 3904 } 3905 else 3906#endif 3907 /* Not UTF mode */ 3908 { 3909 for (fi = min;; fi++) 3910 { 3911 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33); 3912 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3913 if (fi >= max) RRETURN(MATCH_NOMATCH); 3914 if (eptr >= md->end_subject) 3915 { 3916 SCHECK_PARTIAL(); 3917 RRETURN(MATCH_NOMATCH); 3918 } 3919 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3920 } 3921 } 3922 /* Control never gets here */ 3923 } 3924 3925 /* Maximize case */ 3926 3927 else 3928 { 3929 pp = eptr; 3930 3931#ifdef SUPPORT_UTF 3932 if (utf) 3933 { 3934 register unsigned int d; 3935 for (i = min; i < max; i++) 3936 { 3937 int len = 1; 3938 if (eptr >= md->end_subject) 3939 { 3940 SCHECK_PARTIAL(); 3941 break; 3942 } 3943 GETCHARLEN(d, eptr, len); 3944 if (fc == d) break; 3945 eptr += len; 3946 } 3947 if (possessive) continue; 3948 for(;;) 3949 { 3950 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34); 3951 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3952 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3953 BACKCHAR(eptr); 3954 } 3955 } 3956 else 3957#endif 3958 /* Not UTF mode */ 3959 { 3960 for (i = min; i < 
max; i++) 3961 { 3962 if (eptr >= md->end_subject) 3963 { 3964 SCHECK_PARTIAL(); 3965 break; 3966 } 3967 if (fc == *eptr) break; 3968 eptr++; 3969 } 3970 if (possessive) continue; 3971 while (eptr >= pp) 3972 { 3973 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35); 3974 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3975 eptr--; 3976 } 3977 } 3978 3979 RRETURN(MATCH_NOMATCH); 3980 } 3981 } 3982 /* Control never gets here */ 3983 3984 /* Match a single character type repeatedly; several different opcodes 3985 share code. This is very similar to the code for single characters, but we 3986 repeat it in the interests of efficiency. */ 3987 3988 case OP_TYPEEXACT: 3989 min = max = GET2(ecode, 1); 3990 minimize = TRUE; 3991 ecode += 1 + IMM2_SIZE; 3992 goto REPEATTYPE; 3993 3994 case OP_TYPEUPTO: 3995 case OP_TYPEMINUPTO: 3996 min = 0; 3997 max = GET2(ecode, 1); 3998 minimize = *ecode == OP_TYPEMINUPTO; 3999 ecode += 1 + IMM2_SIZE; 4000 goto REPEATTYPE; 4001 4002 case OP_TYPEPOSSTAR: 4003 possessive = TRUE; 4004 min = 0; 4005 max = INT_MAX; 4006 ecode++; 4007 goto REPEATTYPE; 4008 4009 case OP_TYPEPOSPLUS: 4010 possessive = TRUE; 4011 min = 1; 4012 max = INT_MAX; 4013 ecode++; 4014 goto REPEATTYPE; 4015 4016 case OP_TYPEPOSQUERY: 4017 possessive = TRUE; 4018 min = 0; 4019 max = 1; 4020 ecode++; 4021 goto REPEATTYPE; 4022 4023 case OP_TYPEPOSUPTO: 4024 possessive = TRUE; 4025 min = 0; 4026 max = GET2(ecode, 1); 4027 ecode += 1 + IMM2_SIZE; 4028 goto REPEATTYPE; 4029 4030 case OP_TYPESTAR: 4031 case OP_TYPEMINSTAR: 4032 case OP_TYPEPLUS: 4033 case OP_TYPEMINPLUS: 4034 case OP_TYPEQUERY: 4035 case OP_TYPEMINQUERY: 4036 c = *ecode++ - OP_TYPESTAR; 4037 minimize = (c & 1) != 0; 4038 min = rep_min[c]; /* Pick up values from tables; */ 4039 max = rep_max[c]; /* zero for max => infinity */ 4040 if (max == 0) max = INT_MAX; 4041 4042 /* Common code for all repeated single character type matches. Note that 4043 in UTF-8 mode, '.' 
matches a character of any length, but for the other 4044 character types, the valid characters are all one-byte long. */ 4045 4046 REPEATTYPE: 4047 ctype = *ecode++; /* Code for the character type */ 4048 4049#ifdef SUPPORT_UCP 4050 if (ctype == OP_PROP || ctype == OP_NOTPROP) 4051 { 4052 prop_fail_result = ctype == OP_NOTPROP; 4053 prop_type = *ecode++; 4054 prop_value = *ecode++; 4055 } 4056 else prop_type = -1; 4057#endif 4058 4059 /* First, ensure the minimum number of matches are present. Use inline 4060 code for maximizing the speed, and do the type test once at the start 4061 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that 4062 is tidier. Also separate the UCP code, which can be the same for both UTF-8 4063 and single-bytes. */ 4064 4065 if (min > 0) 4066 { 4067#ifdef SUPPORT_UCP 4068 if (prop_type >= 0) 4069 { 4070 switch(prop_type) 4071 { 4072 case PT_ANY: 4073 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4074 for (i = 1; i <= min; i++) 4075 { 4076 if (eptr >= md->end_subject) 4077 { 4078 SCHECK_PARTIAL(); 4079 RRETURN(MATCH_NOMATCH); 4080 } 4081 GETCHARINCTEST(c, eptr); 4082 } 4083 break; 4084 4085 case PT_LAMP: 4086 for (i = 1; i <= min; i++) 4087 { 4088 int chartype; 4089 if (eptr >= md->end_subject) 4090 { 4091 SCHECK_PARTIAL(); 4092 RRETURN(MATCH_NOMATCH); 4093 } 4094 GETCHARINCTEST(c, eptr); 4095 chartype = UCD_CHARTYPE(c); 4096 if ((chartype == ucp_Lu || 4097 chartype == ucp_Ll || 4098 chartype == ucp_Lt) == prop_fail_result) 4099 RRETURN(MATCH_NOMATCH); 4100 } 4101 break; 4102 4103 case PT_GC: 4104 for (i = 1; i <= min; i++) 4105 { 4106 if (eptr >= md->end_subject) 4107 { 4108 SCHECK_PARTIAL(); 4109 RRETURN(MATCH_NOMATCH); 4110 } 4111 GETCHARINCTEST(c, eptr); 4112 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4113 RRETURN(MATCH_NOMATCH); 4114 } 4115 break; 4116 4117 case PT_PC: 4118 for (i = 1; i <= min; i++) 4119 { 4120 if (eptr >= md->end_subject) 4121 { 4122 SCHECK_PARTIAL(); 4123 RRETURN(MATCH_NOMATCH); 
4124 } 4125 GETCHARINCTEST(c, eptr); 4126 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4127 RRETURN(MATCH_NOMATCH); 4128 } 4129 break; 4130 4131 case PT_SC: 4132 for (i = 1; i <= min; i++) 4133 { 4134 if (eptr >= md->end_subject) 4135 { 4136 SCHECK_PARTIAL(); 4137 RRETURN(MATCH_NOMATCH); 4138 } 4139 GETCHARINCTEST(c, eptr); 4140 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4141 RRETURN(MATCH_NOMATCH); 4142 } 4143 break; 4144 4145 case PT_ALNUM: 4146 for (i = 1; i <= min; i++) 4147 { 4148 int category; 4149 if (eptr >= md->end_subject) 4150 { 4151 SCHECK_PARTIAL(); 4152 RRETURN(MATCH_NOMATCH); 4153 } 4154 GETCHARINCTEST(c, eptr); 4155 category = UCD_CATEGORY(c); 4156 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4157 RRETURN(MATCH_NOMATCH); 4158 } 4159 break; 4160 4161 case PT_SPACE: /* Perl space */ 4162 for (i = 1; i <= min; i++) 4163 { 4164 if (eptr >= md->end_subject) 4165 { 4166 SCHECK_PARTIAL(); 4167 RRETURN(MATCH_NOMATCH); 4168 } 4169 GETCHARINCTEST(c, eptr); 4170 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4171 c == CHAR_FF || c == CHAR_CR) 4172 == prop_fail_result) 4173 RRETURN(MATCH_NOMATCH); 4174 } 4175 break; 4176 4177 case PT_PXSPACE: /* POSIX space */ 4178 for (i = 1; i <= min; i++) 4179 { 4180 if (eptr >= md->end_subject) 4181 { 4182 SCHECK_PARTIAL(); 4183 RRETURN(MATCH_NOMATCH); 4184 } 4185 GETCHARINCTEST(c, eptr); 4186 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4187 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) 4188 == prop_fail_result) 4189 RRETURN(MATCH_NOMATCH); 4190 } 4191 break; 4192 4193 case PT_WORD: 4194 for (i = 1; i <= min; i++) 4195 { 4196 int category; 4197 if (eptr >= md->end_subject) 4198 { 4199 SCHECK_PARTIAL(); 4200 RRETURN(MATCH_NOMATCH); 4201 } 4202 GETCHARINCTEST(c, eptr); 4203 category = UCD_CATEGORY(c); 4204 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE) 4205 == prop_fail_result) 4206 RRETURN(MATCH_NOMATCH); 4207 } 4208 
break; 4209 4210 /* This should not occur */ 4211 4212 default: 4213 RRETURN(PCRE_ERROR_INTERNAL); 4214 } 4215 } 4216 4217 /* Match extended Unicode sequences. We will get here only if the 4218 support is in the binary; otherwise a compile-time error occurs. */ 4219 4220 else if (ctype == OP_EXTUNI) 4221 { 4222 for (i = 1; i <= min; i++) 4223 { 4224 if (eptr >= md->end_subject) 4225 { 4226 SCHECK_PARTIAL(); 4227 RRETURN(MATCH_NOMATCH); 4228 } 4229 GETCHARINCTEST(c, eptr); 4230 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); 4231 while (eptr < md->end_subject) 4232 { 4233 int len = 1; 4234 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 4235 if (UCD_CATEGORY(c) != ucp_M) break; 4236 eptr += len; 4237 } 4238 CHECK_PARTIAL(); 4239 } 4240 } 4241 4242 else 4243#endif /* SUPPORT_UCP */ 4244 4245/* Handle all other cases when the coding is UTF-8 */ 4246 4247#ifdef SUPPORT_UTF 4248 if (utf) switch(ctype) 4249 { 4250 case OP_ANY: 4251 for (i = 1; i <= min; i++) 4252 { 4253 if (eptr >= md->end_subject) 4254 { 4255 SCHECK_PARTIAL(); 4256 RRETURN(MATCH_NOMATCH); 4257 } 4258 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4259 if (md->partial != 0 && 4260 eptr + 1 >= md->end_subject && 4261 NLBLOCK->nltype == NLTYPE_FIXED && 4262 NLBLOCK->nllen == 2 && 4263 *eptr == NLBLOCK->nl[0]) 4264 { 4265 md->hitend = TRUE; 4266 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4267 } 4268 eptr++; 4269 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4270 } 4271 break; 4272 4273 case OP_ALLANY: 4274 for (i = 1; i <= min; i++) 4275 { 4276 if (eptr >= md->end_subject) 4277 { 4278 SCHECK_PARTIAL(); 4279 RRETURN(MATCH_NOMATCH); 4280 } 4281 eptr++; 4282 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4283 } 4284 break; 4285 4286 case OP_ANYBYTE: 4287 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); 4288 eptr += min; 4289 break; 4290 4291 case OP_ANYNL: 4292 for (i = 1; i <= min; i++) 4293 { 4294 if (eptr >= md->end_subject) 4295 { 4296 SCHECK_PARTIAL(); 4297 
RRETURN(MATCH_NOMATCH); 4298 } 4299 GETCHARINC(c, eptr); 4300 switch(c) 4301 { 4302 default: RRETURN(MATCH_NOMATCH); 4303 4304 case 0x000d: 4305 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 4306 break; 4307 4308 case 0x000a: 4309 break; 4310 4311 case 0x000b: 4312 case 0x000c: 4313 case 0x0085: 4314 case 0x2028: 4315 case 0x2029: 4316 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4317 break; 4318 } 4319 } 4320 break; 4321 4322 case OP_NOT_HSPACE: 4323 for (i = 1; i <= min; i++) 4324 { 4325 if (eptr >= md->end_subject) 4326 { 4327 SCHECK_PARTIAL(); 4328 RRETURN(MATCH_NOMATCH); 4329 } 4330 GETCHARINC(c, eptr); 4331 switch(c) 4332 { 4333 default: break; 4334 case 0x09: /* HT */ 4335 case 0x20: /* SPACE */ 4336 case 0xa0: /* NBSP */ 4337 case 0x1680: /* OGHAM SPACE MARK */ 4338 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4339 case 0x2000: /* EN QUAD */ 4340 case 0x2001: /* EM QUAD */ 4341 case 0x2002: /* EN SPACE */ 4342 case 0x2003: /* EM SPACE */ 4343 case 0x2004: /* THREE-PER-EM SPACE */ 4344 case 0x2005: /* FOUR-PER-EM SPACE */ 4345 case 0x2006: /* SIX-PER-EM SPACE */ 4346 case 0x2007: /* FIGURE SPACE */ 4347 case 0x2008: /* PUNCTUATION SPACE */ 4348 case 0x2009: /* THIN SPACE */ 4349 case 0x200A: /* HAIR SPACE */ 4350 case 0x202f: /* NARROW NO-BREAK SPACE */ 4351 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4352 case 0x3000: /* IDEOGRAPHIC SPACE */ 4353 RRETURN(MATCH_NOMATCH); 4354 } 4355 } 4356 break; 4357 4358 case OP_HSPACE: 4359 for (i = 1; i <= min; i++) 4360 { 4361 if (eptr >= md->end_subject) 4362 { 4363 SCHECK_PARTIAL(); 4364 RRETURN(MATCH_NOMATCH); 4365 } 4366 GETCHARINC(c, eptr); 4367 switch(c) 4368 { 4369 default: RRETURN(MATCH_NOMATCH); 4370 case 0x09: /* HT */ 4371 case 0x20: /* SPACE */ 4372 case 0xa0: /* NBSP */ 4373 case 0x1680: /* OGHAM SPACE MARK */ 4374 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4375 case 0x2000: /* EN QUAD */ 4376 case 0x2001: /* EM QUAD */ 4377 case 0x2002: /* EN SPACE */ 4378 case 0x2003: /* EM SPACE */ 4379 case 
0x2004: /* THREE-PER-EM SPACE */ 4380 case 0x2005: /* FOUR-PER-EM SPACE */ 4381 case 0x2006: /* SIX-PER-EM SPACE */ 4382 case 0x2007: /* FIGURE SPACE */ 4383 case 0x2008: /* PUNCTUATION SPACE */ 4384 case 0x2009: /* THIN SPACE */ 4385 case 0x200A: /* HAIR SPACE */ 4386 case 0x202f: /* NARROW NO-BREAK SPACE */ 4387 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4388 case 0x3000: /* IDEOGRAPHIC SPACE */ 4389 break; 4390 } 4391 } 4392 break; 4393 4394 case OP_NOT_VSPACE: 4395 for (i = 1; i <= min; i++) 4396 { 4397 if (eptr >= md->end_subject) 4398 { 4399 SCHECK_PARTIAL(); 4400 RRETURN(MATCH_NOMATCH); 4401 } 4402 GETCHARINC(c, eptr); 4403 switch(c) 4404 { 4405 default: break; 4406 case 0x0a: /* LF */ 4407 case 0x0b: /* VT */ 4408 case 0x0c: /* FF */ 4409 case 0x0d: /* CR */ 4410 case 0x85: /* NEL */ 4411 case 0x2028: /* LINE SEPARATOR */ 4412 case 0x2029: /* PARAGRAPH SEPARATOR */ 4413 RRETURN(MATCH_NOMATCH); 4414 } 4415 } 4416 break; 4417 4418 case OP_VSPACE: 4419 for (i = 1; i <= min; i++) 4420 { 4421 if (eptr >= md->end_subject) 4422 { 4423 SCHECK_PARTIAL(); 4424 RRETURN(MATCH_NOMATCH); 4425 } 4426 GETCHARINC(c, eptr); 4427 switch(c) 4428 { 4429 default: RRETURN(MATCH_NOMATCH); 4430 case 0x0a: /* LF */ 4431 case 0x0b: /* VT */ 4432 case 0x0c: /* FF */ 4433 case 0x0d: /* CR */ 4434 case 0x85: /* NEL */ 4435 case 0x2028: /* LINE SEPARATOR */ 4436 case 0x2029: /* PARAGRAPH SEPARATOR */ 4437 break; 4438 } 4439 } 4440 break; 4441 4442 case OP_NOT_DIGIT: 4443 for (i = 1; i <= min; i++) 4444 { 4445 if (eptr >= md->end_subject) 4446 { 4447 SCHECK_PARTIAL(); 4448 RRETURN(MATCH_NOMATCH); 4449 } 4450 GETCHARINC(c, eptr); 4451 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) 4452 RRETURN(MATCH_NOMATCH); 4453 } 4454 break; 4455 4456 case OP_DIGIT: 4457 for (i = 1; i <= min; i++) 4458 { 4459 if (eptr >= md->end_subject) 4460 { 4461 SCHECK_PARTIAL(); 4462 RRETURN(MATCH_NOMATCH); 4463 } 4464 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0) 4465 RRETURN(MATCH_NOMATCH); 
4466 eptr++; 4467 /* No need to skip more bytes - we know it's a 1-byte character */ 4468 } 4469 break; 4470 4471 case OP_NOT_WHITESPACE: 4472 for (i = 1; i <= min; i++) 4473 { 4474 if (eptr >= md->end_subject) 4475 { 4476 SCHECK_PARTIAL(); 4477 RRETURN(MATCH_NOMATCH); 4478 } 4479 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) 4480 RRETURN(MATCH_NOMATCH); 4481 eptr++; 4482 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4483 } 4484 break; 4485 4486 case OP_WHITESPACE: 4487 for (i = 1; i <= min; i++) 4488 { 4489 if (eptr >= md->end_subject) 4490 { 4491 SCHECK_PARTIAL(); 4492 RRETURN(MATCH_NOMATCH); 4493 } 4494 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0) 4495 RRETURN(MATCH_NOMATCH); 4496 eptr++; 4497 /* No need to skip more bytes - we know it's a 1-byte character */ 4498 } 4499 break; 4500 4501 case OP_NOT_WORDCHAR: 4502 for (i = 1; i <= min; i++) 4503 { 4504 if (eptr >= md->end_subject) 4505 { 4506 SCHECK_PARTIAL(); 4507 RRETURN(MATCH_NOMATCH); 4508 } 4509 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) 4510 RRETURN(MATCH_NOMATCH); 4511 eptr++; 4512 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 4513 } 4514 break; 4515 4516 case OP_WORDCHAR: 4517 for (i = 1; i <= min; i++) 4518 { 4519 if (eptr >= md->end_subject) 4520 { 4521 SCHECK_PARTIAL(); 4522 RRETURN(MATCH_NOMATCH); 4523 } 4524 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0) 4525 RRETURN(MATCH_NOMATCH); 4526 eptr++; 4527 /* No need to skip more bytes - we know it's a 1-byte character */ 4528 } 4529 break; 4530 4531 default: 4532 RRETURN(PCRE_ERROR_INTERNAL); 4533 } /* End switch(ctype) */ 4534 4535 else 4536#endif /* SUPPORT_UTF */ 4537 4538 /* Code for the non-UTF-8 case for minimum matching of operators other 4539 than OP_PROP and OP_NOTPROP. 
*/ 4540 4541 switch(ctype) 4542 { 4543 case OP_ANY: 4544 for (i = 1; i <= min; i++) 4545 { 4546 if (eptr >= md->end_subject) 4547 { 4548 SCHECK_PARTIAL(); 4549 RRETURN(MATCH_NOMATCH); 4550 } 4551 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 4552 if (md->partial != 0 && 4553 eptr + 1 >= md->end_subject && 4554 NLBLOCK->nltype == NLTYPE_FIXED && 4555 NLBLOCK->nllen == 2 && 4556 *eptr == NLBLOCK->nl[0]) 4557 { 4558 md->hitend = TRUE; 4559 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 4560 } 4561 eptr++; 4562 } 4563 break; 4564 4565 case OP_ALLANY: 4566 if (eptr > md->end_subject - min) 4567 { 4568 SCHECK_PARTIAL(); 4569 RRETURN(MATCH_NOMATCH); 4570 } 4571 eptr += min; 4572 break; 4573 4574 case OP_ANYBYTE: 4575 if (eptr > md->end_subject - min) 4576 { 4577 SCHECK_PARTIAL(); 4578 RRETURN(MATCH_NOMATCH); 4579 } 4580 eptr += min; 4581 break; 4582 4583 case OP_ANYNL: 4584 for (i = 1; i <= min; i++) 4585 { 4586 if (eptr >= md->end_subject) 4587 { 4588 SCHECK_PARTIAL(); 4589 RRETURN(MATCH_NOMATCH); 4590 } 4591 switch(*eptr++) 4592 { 4593 default: RRETURN(MATCH_NOMATCH); 4594 4595 case 0x000d: 4596 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 4597 break; 4598 4599 case 0x000a: 4600 break; 4601 4602 case 0x000b: 4603 case 0x000c: 4604 case 0x0085: 4605#ifdef COMPILE_PCRE16 4606 case 0x2028: 4607 case 0x2029: 4608#endif 4609 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4610 break; 4611 } 4612 } 4613 break; 4614 4615 case OP_NOT_HSPACE: 4616 for (i = 1; i <= min; i++) 4617 { 4618 if (eptr >= md->end_subject) 4619 { 4620 SCHECK_PARTIAL(); 4621 RRETURN(MATCH_NOMATCH); 4622 } 4623 switch(*eptr++) 4624 { 4625 default: break; 4626 case 0x09: /* HT */ 4627 case 0x20: /* SPACE */ 4628 case 0xa0: /* NBSP */ 4629#ifdef COMPILE_PCRE16 4630 case 0x1680: /* OGHAM SPACE MARK */ 4631 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4632 case 0x2000: /* EN QUAD */ 4633 case 0x2001: /* EM QUAD */ 4634 case 0x2002: /* EN SPACE */ 4635 case 0x2003: /* EM SPACE */ 4636 case 0x2004: /* 
THREE-PER-EM SPACE */ 4637 case 0x2005: /* FOUR-PER-EM SPACE */ 4638 case 0x2006: /* SIX-PER-EM SPACE */ 4639 case 0x2007: /* FIGURE SPACE */ 4640 case 0x2008: /* PUNCTUATION SPACE */ 4641 case 0x2009: /* THIN SPACE */ 4642 case 0x200A: /* HAIR SPACE */ 4643 case 0x202f: /* NARROW NO-BREAK SPACE */ 4644 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4645 case 0x3000: /* IDEOGRAPHIC SPACE */ 4646#endif 4647 RRETURN(MATCH_NOMATCH); 4648 } 4649 } 4650 break; 4651 4652 case OP_HSPACE: 4653 for (i = 1; i <= min; i++) 4654 { 4655 if (eptr >= md->end_subject) 4656 { 4657 SCHECK_PARTIAL(); 4658 RRETURN(MATCH_NOMATCH); 4659 } 4660 switch(*eptr++) 4661 { 4662 default: RRETURN(MATCH_NOMATCH); 4663 case 0x09: /* HT */ 4664 case 0x20: /* SPACE */ 4665 case 0xa0: /* NBSP */ 4666#ifdef COMPILE_PCRE16 4667 case 0x1680: /* OGHAM SPACE MARK */ 4668 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4669 case 0x2000: /* EN QUAD */ 4670 case 0x2001: /* EM QUAD */ 4671 case 0x2002: /* EN SPACE */ 4672 case 0x2003: /* EM SPACE */ 4673 case 0x2004: /* THREE-PER-EM SPACE */ 4674 case 0x2005: /* FOUR-PER-EM SPACE */ 4675 case 0x2006: /* SIX-PER-EM SPACE */ 4676 case 0x2007: /* FIGURE SPACE */ 4677 case 0x2008: /* PUNCTUATION SPACE */ 4678 case 0x2009: /* THIN SPACE */ 4679 case 0x200A: /* HAIR SPACE */ 4680 case 0x202f: /* NARROW NO-BREAK SPACE */ 4681 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4682 case 0x3000: /* IDEOGRAPHIC SPACE */ 4683#endif 4684 break; 4685 } 4686 } 4687 break; 4688 4689 case OP_NOT_VSPACE: 4690 for (i = 1; i <= min; i++) 4691 { 4692 if (eptr >= md->end_subject) 4693 { 4694 SCHECK_PARTIAL(); 4695 RRETURN(MATCH_NOMATCH); 4696 } 4697 switch(*eptr++) 4698 { 4699 default: break; 4700 case 0x0a: /* LF */ 4701 case 0x0b: /* VT */ 4702 case 0x0c: /* FF */ 4703 case 0x0d: /* CR */ 4704 case 0x85: /* NEL */ 4705#ifdef COMPILE_PCRE16 4706 case 0x2028: /* LINE SEPARATOR */ 4707 case 0x2029: /* PARAGRAPH SEPARATOR */ 4708#endif 4709 RRETURN(MATCH_NOMATCH); 4710 } 4711 } 4712 break; 
4713 4714 case OP_VSPACE: 4715 for (i = 1; i <= min; i++) 4716 { 4717 if (eptr >= md->end_subject) 4718 { 4719 SCHECK_PARTIAL(); 4720 RRETURN(MATCH_NOMATCH); 4721 } 4722 switch(*eptr++) 4723 { 4724 default: RRETURN(MATCH_NOMATCH); 4725 case 0x0a: /* LF */ 4726 case 0x0b: /* VT */ 4727 case 0x0c: /* FF */ 4728 case 0x0d: /* CR */ 4729 case 0x85: /* NEL */ 4730#ifdef COMPILE_PCRE16 4731 case 0x2028: /* LINE SEPARATOR */ 4732 case 0x2029: /* PARAGRAPH SEPARATOR */ 4733#endif 4734 break; 4735 } 4736 } 4737 break; 4738 4739 case OP_NOT_DIGIT: 4740 for (i = 1; i <= min; i++) 4741 { 4742 if (eptr >= md->end_subject) 4743 { 4744 SCHECK_PARTIAL(); 4745 RRETURN(MATCH_NOMATCH); 4746 } 4747 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) 4748 RRETURN(MATCH_NOMATCH); 4749 eptr++; 4750 } 4751 break; 4752 4753 case OP_DIGIT: 4754 for (i = 1; i <= min; i++) 4755 { 4756 if (eptr >= md->end_subject) 4757 { 4758 SCHECK_PARTIAL(); 4759 RRETURN(MATCH_NOMATCH); 4760 } 4761 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) 4762 RRETURN(MATCH_NOMATCH); 4763 eptr++; 4764 } 4765 break; 4766 4767 case OP_NOT_WHITESPACE: 4768 for (i = 1; i <= min; i++) 4769 { 4770 if (eptr >= md->end_subject) 4771 { 4772 SCHECK_PARTIAL(); 4773 RRETURN(MATCH_NOMATCH); 4774 } 4775 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) 4776 RRETURN(MATCH_NOMATCH); 4777 eptr++; 4778 } 4779 break; 4780 4781 case OP_WHITESPACE: 4782 for (i = 1; i <= min; i++) 4783 { 4784 if (eptr >= md->end_subject) 4785 { 4786 SCHECK_PARTIAL(); 4787 RRETURN(MATCH_NOMATCH); 4788 } 4789 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) 4790 RRETURN(MATCH_NOMATCH); 4791 eptr++; 4792 } 4793 break; 4794 4795 case OP_NOT_WORDCHAR: 4796 for (i = 1; i <= min; i++) 4797 { 4798 if (eptr >= md->end_subject) 4799 { 4800 SCHECK_PARTIAL(); 4801 RRETURN(MATCH_NOMATCH); 4802 } 4803 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) 4804 RRETURN(MATCH_NOMATCH); 4805 eptr++; 4806 } 4807 
break; 4808 4809 case OP_WORDCHAR: 4810 for (i = 1; i <= min; i++) 4811 { 4812 if (eptr >= md->end_subject) 4813 { 4814 SCHECK_PARTIAL(); 4815 RRETURN(MATCH_NOMATCH); 4816 } 4817 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) 4818 RRETURN(MATCH_NOMATCH); 4819 eptr++; 4820 } 4821 break; 4822 4823 default: 4824 RRETURN(PCRE_ERROR_INTERNAL); 4825 } 4826 } 4827 4828 /* If min = max, continue at the same level without recursing */ 4829 4830 if (min == max) continue; 4831 4832 /* If minimizing, we have to test the rest of the pattern before each 4833 subsequent match. Again, separate the UTF-8 case for speed, and also 4834 separate the UCP cases. */ 4835 4836 if (minimize) 4837 { 4838#ifdef SUPPORT_UCP 4839 if (prop_type >= 0) 4840 { 4841 switch(prop_type) 4842 { 4843 case PT_ANY: 4844 for (fi = min;; fi++) 4845 { 4846 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36); 4847 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4848 if (fi >= max) RRETURN(MATCH_NOMATCH); 4849 if (eptr >= md->end_subject) 4850 { 4851 SCHECK_PARTIAL(); 4852 RRETURN(MATCH_NOMATCH); 4853 } 4854 GETCHARINCTEST(c, eptr); 4855 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 4856 } 4857 /* Control never gets here */ 4858 4859 case PT_LAMP: 4860 for (fi = min;; fi++) 4861 { 4862 int chartype; 4863 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37); 4864 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4865 if (fi >= max) RRETURN(MATCH_NOMATCH); 4866 if (eptr >= md->end_subject) 4867 { 4868 SCHECK_PARTIAL(); 4869 RRETURN(MATCH_NOMATCH); 4870 } 4871 GETCHARINCTEST(c, eptr); 4872 chartype = UCD_CHARTYPE(c); 4873 if ((chartype == ucp_Lu || 4874 chartype == ucp_Ll || 4875 chartype == ucp_Lt) == prop_fail_result) 4876 RRETURN(MATCH_NOMATCH); 4877 } 4878 /* Control never gets here */ 4879 4880 case PT_GC: 4881 for (fi = min;; fi++) 4882 { 4883 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38); 4884 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4885 if (fi >= max) RRETURN(MATCH_NOMATCH); 4886 if (eptr >= md->end_subject) 
4887 { 4888 SCHECK_PARTIAL(); 4889 RRETURN(MATCH_NOMATCH); 4890 } 4891 GETCHARINCTEST(c, eptr); 4892 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) 4893 RRETURN(MATCH_NOMATCH); 4894 } 4895 /* Control never gets here */ 4896 4897 case PT_PC: 4898 for (fi = min;; fi++) 4899 { 4900 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39); 4901 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4902 if (fi >= max) RRETURN(MATCH_NOMATCH); 4903 if (eptr >= md->end_subject) 4904 { 4905 SCHECK_PARTIAL(); 4906 RRETURN(MATCH_NOMATCH); 4907 } 4908 GETCHARINCTEST(c, eptr); 4909 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) 4910 RRETURN(MATCH_NOMATCH); 4911 } 4912 /* Control never gets here */ 4913 4914 case PT_SC: 4915 for (fi = min;; fi++) 4916 { 4917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40); 4918 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4919 if (fi >= max) RRETURN(MATCH_NOMATCH); 4920 if (eptr >= md->end_subject) 4921 { 4922 SCHECK_PARTIAL(); 4923 RRETURN(MATCH_NOMATCH); 4924 } 4925 GETCHARINCTEST(c, eptr); 4926 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) 4927 RRETURN(MATCH_NOMATCH); 4928 } 4929 /* Control never gets here */ 4930 4931 case PT_ALNUM: 4932 for (fi = min;; fi++) 4933 { 4934 int category; 4935 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59); 4936 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4937 if (fi >= max) RRETURN(MATCH_NOMATCH); 4938 if (eptr >= md->end_subject) 4939 { 4940 SCHECK_PARTIAL(); 4941 RRETURN(MATCH_NOMATCH); 4942 } 4943 GETCHARINCTEST(c, eptr); 4944 category = UCD_CATEGORY(c); 4945 if ((category == ucp_L || category == ucp_N) == prop_fail_result) 4946 RRETURN(MATCH_NOMATCH); 4947 } 4948 /* Control never gets here */ 4949 4950 case PT_SPACE: /* Perl space */ 4951 for (fi = min;; fi++) 4952 { 4953 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60); 4954 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4955 if (fi >= max) RRETURN(MATCH_NOMATCH); 4956 if (eptr >= md->end_subject) 4957 { 4958 SCHECK_PARTIAL(); 4959 RRETURN(MATCH_NOMATCH); 4960 } 
4961 GETCHARINCTEST(c, eptr); 4962 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4963 c == CHAR_FF || c == CHAR_CR) 4964 == prop_fail_result) 4965 RRETURN(MATCH_NOMATCH); 4966 } 4967 /* Control never gets here */ 4968 4969 case PT_PXSPACE: /* POSIX space */ 4970 for (fi = min;; fi++) 4971 { 4972 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61); 4973 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4974 if (fi >= max) RRETURN(MATCH_NOMATCH); 4975 if (eptr >= md->end_subject) 4976 { 4977 SCHECK_PARTIAL(); 4978 RRETURN(MATCH_NOMATCH); 4979 } 4980 GETCHARINCTEST(c, eptr); 4981 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || 4982 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) 4983 == prop_fail_result) 4984 RRETURN(MATCH_NOMATCH); 4985 } 4986 /* Control never gets here */ 4987 4988 case PT_WORD: 4989 for (fi = min;; fi++) 4990 { 4991 int category; 4992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); 4993 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4994 if (fi >= max) RRETURN(MATCH_NOMATCH); 4995 if (eptr >= md->end_subject) 4996 { 4997 SCHECK_PARTIAL(); 4998 RRETURN(MATCH_NOMATCH); 4999 } 5000 GETCHARINCTEST(c, eptr); 5001 category = UCD_CATEGORY(c); 5002 if ((category == ucp_L || 5003 category == ucp_N || 5004 c == CHAR_UNDERSCORE) 5005 == prop_fail_result) 5006 RRETURN(MATCH_NOMATCH); 5007 } 5008 /* Control never gets here */ 5009 5010 /* This should never occur */ 5011 5012 default: 5013 RRETURN(PCRE_ERROR_INTERNAL); 5014 } 5015 } 5016 5017 /* Match extended Unicode sequences. We will get here only if the 5018 support is in the binary; otherwise a compile-time error occurs. 
*/ 5019 5020 else if (ctype == OP_EXTUNI) 5021 { 5022 for (fi = min;; fi++) 5023 { 5024 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41); 5025 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5026 if (fi >= max) RRETURN(MATCH_NOMATCH); 5027 if (eptr >= md->end_subject) 5028 { 5029 SCHECK_PARTIAL(); 5030 RRETURN(MATCH_NOMATCH); 5031 } 5032 GETCHARINCTEST(c, eptr); 5033 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH); 5034 while (eptr < md->end_subject) 5035 { 5036 int len = 1; 5037 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5038 if (UCD_CATEGORY(c) != ucp_M) break; 5039 eptr += len; 5040 } 5041 CHECK_PARTIAL(); 5042 } 5043 } 5044 else 5045#endif /* SUPPORT_UCP */ 5046 5047#ifdef SUPPORT_UTF 5048 if (utf) 5049 { 5050 for (fi = min;; fi++) 5051 { 5052 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42); 5053 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5054 if (fi >= max) RRETURN(MATCH_NOMATCH); 5055 if (eptr >= md->end_subject) 5056 { 5057 SCHECK_PARTIAL(); 5058 RRETURN(MATCH_NOMATCH); 5059 } 5060 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5061 RRETURN(MATCH_NOMATCH); 5062 GETCHARINC(c, eptr); 5063 switch(ctype) 5064 { 5065 case OP_ANY: /* This is the non-NL case */ 5066 if (md->partial != 0 && /* Take care with CRLF partial */ 5067 eptr >= md->end_subject && 5068 NLBLOCK->nltype == NLTYPE_FIXED && 5069 NLBLOCK->nllen == 2 && 5070 c == NLBLOCK->nl[0]) 5071 { 5072 md->hitend = TRUE; 5073 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5074 } 5075 break; 5076 5077 case OP_ALLANY: 5078 case OP_ANYBYTE: 5079 break; 5080 5081 case OP_ANYNL: 5082 switch(c) 5083 { 5084 default: RRETURN(MATCH_NOMATCH); 5085 case 0x000d: 5086 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 5087 break; 5088 case 0x000a: 5089 break; 5090 5091 case 0x000b: 5092 case 0x000c: 5093 case 0x0085: 5094 case 0x2028: 5095 case 0x2029: 5096 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5097 break; 5098 } 5099 break; 5100 5101 case OP_NOT_HSPACE: 5102 switch(c) 5103 { 5104 default: break; 5105 case 
0x09: /* HT */ 5106 case 0x20: /* SPACE */ 5107 case 0xa0: /* NBSP */ 5108 case 0x1680: /* OGHAM SPACE MARK */ 5109 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 5110 case 0x2000: /* EN QUAD */ 5111 case 0x2001: /* EM QUAD */ 5112 case 0x2002: /* EN SPACE */ 5113 case 0x2003: /* EM SPACE */ 5114 case 0x2004: /* THREE-PER-EM SPACE */ 5115 case 0x2005: /* FOUR-PER-EM SPACE */ 5116 case 0x2006: /* SIX-PER-EM SPACE */ 5117 case 0x2007: /* FIGURE SPACE */ 5118 case 0x2008: /* PUNCTUATION SPACE */ 5119 case 0x2009: /* THIN SPACE */ 5120 case 0x200A: /* HAIR SPACE */ 5121 case 0x202f: /* NARROW NO-BREAK SPACE */ 5122 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 5123 case 0x3000: /* IDEOGRAPHIC SPACE */ 5124 RRETURN(MATCH_NOMATCH); 5125 } 5126 break; 5127 5128 case OP_HSPACE: 5129 switch(c) 5130 { 5131 default: RRETURN(MATCH_NOMATCH); 5132 case 0x09: /* HT */ 5133 case 0x20: /* SPACE */ 5134 case 0xa0: /* NBSP */ 5135 case 0x1680: /* OGHAM SPACE MARK */ 5136 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 5137 case 0x2000: /* EN QUAD */ 5138 case 0x2001: /* EM QUAD */ 5139 case 0x2002: /* EN SPACE */ 5140 case 0x2003: /* EM SPACE */ 5141 case 0x2004: /* THREE-PER-EM SPACE */ 5142 case 0x2005: /* FOUR-PER-EM SPACE */ 5143 case 0x2006: /* SIX-PER-EM SPACE */ 5144 case 0x2007: /* FIGURE SPACE */ 5145 case 0x2008: /* PUNCTUATION SPACE */ 5146 case 0x2009: /* THIN SPACE */ 5147 case 0x200A: /* HAIR SPACE */ 5148 case 0x202f: /* NARROW NO-BREAK SPACE */ 5149 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 5150 case 0x3000: /* IDEOGRAPHIC SPACE */ 5151 break; 5152 } 5153 break; 5154 5155 case OP_NOT_VSPACE: 5156 switch(c) 5157 { 5158 default: break; 5159 case 0x0a: /* LF */ 5160 case 0x0b: /* VT */ 5161 case 0x0c: /* FF */ 5162 case 0x0d: /* CR */ 5163 case 0x85: /* NEL */ 5164 case 0x2028: /* LINE SEPARATOR */ 5165 case 0x2029: /* PARAGRAPH SEPARATOR */ 5166 RRETURN(MATCH_NOMATCH); 5167 } 5168 break; 5169 5170 case OP_VSPACE: 5171 switch(c) 5172 { 5173 default: RRETURN(MATCH_NOMATCH); 
5174 case 0x0a: /* LF */ 5175 case 0x0b: /* VT */ 5176 case 0x0c: /* FF */ 5177 case 0x0d: /* CR */ 5178 case 0x85: /* NEL */ 5179 case 0x2028: /* LINE SEPARATOR */ 5180 case 0x2029: /* PARAGRAPH SEPARATOR */ 5181 break; 5182 } 5183 break; 5184 5185 case OP_NOT_DIGIT: 5186 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) 5187 RRETURN(MATCH_NOMATCH); 5188 break; 5189 5190 case OP_DIGIT: 5191 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) 5192 RRETURN(MATCH_NOMATCH); 5193 break; 5194 5195 case OP_NOT_WHITESPACE: 5196 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 5197 RRETURN(MATCH_NOMATCH); 5198 break; 5199 5200 case OP_WHITESPACE: 5201 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) 5202 RRETURN(MATCH_NOMATCH); 5203 break; 5204 5205 case OP_NOT_WORDCHAR: 5206 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) 5207 RRETURN(MATCH_NOMATCH); 5208 break; 5209 5210 case OP_WORDCHAR: 5211 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) 5212 RRETURN(MATCH_NOMATCH); 5213 break; 5214 5215 default: 5216 RRETURN(PCRE_ERROR_INTERNAL); 5217 } 5218 } 5219 } 5220 else 5221#endif 5222 /* Not UTF mode */ 5223 { 5224 for (fi = min;; fi++) 5225 { 5226 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43); 5227 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5228 if (fi >= max) RRETURN(MATCH_NOMATCH); 5229 if (eptr >= md->end_subject) 5230 { 5231 SCHECK_PARTIAL(); 5232 RRETURN(MATCH_NOMATCH); 5233 } 5234 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 5235 RRETURN(MATCH_NOMATCH); 5236 c = *eptr++; 5237 switch(ctype) 5238 { 5239 case OP_ANY: /* This is the non-NL case */ 5240 if (md->partial != 0 && /* Take care with CRLF partial */ 5241 eptr >= md->end_subject && 5242 NLBLOCK->nltype == NLTYPE_FIXED && 5243 NLBLOCK->nllen == 2 && 5244 c == NLBLOCK->nl[0]) 5245 { 5246 md->hitend = TRUE; 5247 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5248 } 5249 break; 5250 5251 case OP_ALLANY: 5252 case OP_ANYBYTE: 5253 break; 5254 5255 case OP_ANYNL: 5256 switch(c) 5257 { 5258 default: 
RRETURN(MATCH_NOMATCH); 5259 case 0x000d: 5260 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 5261 break; 5262 5263 case 0x000a: 5264 break; 5265 5266 case 0x000b: 5267 case 0x000c: 5268 case 0x0085: 5269#ifdef COMPILE_PCRE16 5270 case 0x2028: 5271 case 0x2029: 5272#endif 5273 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 5274 break; 5275 } 5276 break; 5277 5278 case OP_NOT_HSPACE: 5279 switch(c) 5280 { 5281 default: break; 5282 case 0x09: /* HT */ 5283 case 0x20: /* SPACE */ 5284 case 0xa0: /* NBSP */ 5285#ifdef COMPILE_PCRE16 5286 case 0x1680: /* OGHAM SPACE MARK */ 5287 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 5288 case 0x2000: /* EN QUAD */ 5289 case 0x2001: /* EM QUAD */ 5290 case 0x2002: /* EN SPACE */ 5291 case 0x2003: /* EM SPACE */ 5292 case 0x2004: /* THREE-PER-EM SPACE */ 5293 case 0x2005: /* FOUR-PER-EM SPACE */ 5294 case 0x2006: /* SIX-PER-EM SPACE */ 5295 case 0x2007: /* FIGURE SPACE */ 5296 case 0x2008: /* PUNCTUATION SPACE */ 5297 case 0x2009: /* THIN SPACE */ 5298 case 0x200A: /* HAIR SPACE */ 5299 case 0x202f: /* NARROW NO-BREAK SPACE */ 5300 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 5301 case 0x3000: /* IDEOGRAPHIC SPACE */ 5302#endif 5303 RRETURN(MATCH_NOMATCH); 5304 } 5305 break; 5306 5307 case OP_HSPACE: 5308 switch(c) 5309 { 5310 default: RRETURN(MATCH_NOMATCH); 5311 case 0x09: /* HT */ 5312 case 0x20: /* SPACE */ 5313 case 0xa0: /* NBSP */ 5314#ifdef COMPILE_PCRE16 5315 case 0x1680: /* OGHAM SPACE MARK */ 5316 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 5317 case 0x2000: /* EN QUAD */ 5318 case 0x2001: /* EM QUAD */ 5319 case 0x2002: /* EN SPACE */ 5320 case 0x2003: /* EM SPACE */ 5321 case 0x2004: /* THREE-PER-EM SPACE */ 5322 case 0x2005: /* FOUR-PER-EM SPACE */ 5323 case 0x2006: /* SIX-PER-EM SPACE */ 5324 case 0x2007: /* FIGURE SPACE */ 5325 case 0x2008: /* PUNCTUATION SPACE */ 5326 case 0x2009: /* THIN SPACE */ 5327 case 0x200A: /* HAIR SPACE */ 5328 case 0x202f: /* NARROW NO-BREAK SPACE */ 5329 case 0x205f: /* MEDIUM 
MATHEMATICAL SPACE */ 5330 case 0x3000: /* IDEOGRAPHIC SPACE */ 5331#endif 5332 break; 5333 } 5334 break; 5335 5336 case OP_NOT_VSPACE: 5337 switch(c) 5338 { 5339 default: break; 5340 case 0x0a: /* LF */ 5341 case 0x0b: /* VT */ 5342 case 0x0c: /* FF */ 5343 case 0x0d: /* CR */ 5344 case 0x85: /* NEL */ 5345#ifdef COMPILE_PCRE16 5346 case 0x2028: /* LINE SEPARATOR */ 5347 case 0x2029: /* PARAGRAPH SEPARATOR */ 5348#endif 5349 RRETURN(MATCH_NOMATCH); 5350 } 5351 break; 5352 5353 case OP_VSPACE: 5354 switch(c) 5355 { 5356 default: RRETURN(MATCH_NOMATCH); 5357 case 0x0a: /* LF */ 5358 case 0x0b: /* VT */ 5359 case 0x0c: /* FF */ 5360 case 0x0d: /* CR */ 5361 case 0x85: /* NEL */ 5362#ifdef COMPILE_PCRE16 5363 case 0x2028: /* LINE SEPARATOR */ 5364 case 0x2029: /* PARAGRAPH SEPARATOR */ 5365#endif 5366 break; 5367 } 5368 break; 5369 5370 case OP_NOT_DIGIT: 5371 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 5372 break; 5373 5374 case OP_DIGIT: 5375 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 5376 break; 5377 5378 case OP_NOT_WHITESPACE: 5379 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 5380 break; 5381 5382 case OP_WHITESPACE: 5383 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 5384 break; 5385 5386 case OP_NOT_WORDCHAR: 5387 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 5388 break; 5389 5390 case OP_WORDCHAR: 5391 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 5392 break; 5393 5394 default: 5395 RRETURN(PCRE_ERROR_INTERNAL); 5396 } 5397 } 5398 } 5399 /* Control never gets here */ 5400 } 5401 5402 /* If maximizing, it is worth using inline code for speed, doing the type 5403 test once at the start (i.e. keep it out of the loop). Again, keep the 5404 UTF-8 and UCP stuff separate. 
*/ 5405 5406 else 5407 { 5408 pp = eptr; /* Remember where we started */ 5409 5410#ifdef SUPPORT_UCP 5411 if (prop_type >= 0) 5412 { 5413 switch(prop_type) 5414 { 5415 case PT_ANY: 5416 for (i = min; i < max; i++) 5417 { 5418 int len = 1; 5419 if (eptr >= md->end_subject) 5420 { 5421 SCHECK_PARTIAL(); 5422 break; 5423 } 5424 GETCHARLENTEST(c, eptr, len); 5425 if (prop_fail_result) break; 5426 eptr+= len; 5427 } 5428 break; 5429 5430 case PT_LAMP: 5431 for (i = min; i < max; i++) 5432 { 5433 int chartype; 5434 int len = 1; 5435 if (eptr >= md->end_subject) 5436 { 5437 SCHECK_PARTIAL(); 5438 break; 5439 } 5440 GETCHARLENTEST(c, eptr, len); 5441 chartype = UCD_CHARTYPE(c); 5442 if ((chartype == ucp_Lu || 5443 chartype == ucp_Ll || 5444 chartype == ucp_Lt) == prop_fail_result) 5445 break; 5446 eptr+= len; 5447 } 5448 break; 5449 5450 case PT_GC: 5451 for (i = min; i < max; i++) 5452 { 5453 int len = 1; 5454 if (eptr >= md->end_subject) 5455 { 5456 SCHECK_PARTIAL(); 5457 break; 5458 } 5459 GETCHARLENTEST(c, eptr, len); 5460 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break; 5461 eptr+= len; 5462 } 5463 break; 5464 5465 case PT_PC: 5466 for (i = min; i < max; i++) 5467 { 5468 int len = 1; 5469 if (eptr >= md->end_subject) 5470 { 5471 SCHECK_PARTIAL(); 5472 break; 5473 } 5474 GETCHARLENTEST(c, eptr, len); 5475 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break; 5476 eptr+= len; 5477 } 5478 break; 5479 5480 case PT_SC: 5481 for (i = min; i < max; i++) 5482 { 5483 int len = 1; 5484 if (eptr >= md->end_subject) 5485 { 5486 SCHECK_PARTIAL(); 5487 break; 5488 } 5489 GETCHARLENTEST(c, eptr, len); 5490 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break; 5491 eptr+= len; 5492 } 5493 break; 5494 5495 case PT_ALNUM: 5496 for (i = min; i < max; i++) 5497 { 5498 int category; 5499 int len = 1; 5500 if (eptr >= md->end_subject) 5501 { 5502 SCHECK_PARTIAL(); 5503 break; 5504 } 5505 GETCHARLENTEST(c, eptr, len); 5506 category = UCD_CATEGORY(c); 5507 if 
((category == ucp_L || category == ucp_N) == prop_fail_result) 5508 break; 5509 eptr+= len; 5510 } 5511 break; 5512 5513 case PT_SPACE: /* Perl space */ 5514 for (i = min; i < max; i++) 5515 { 5516 int len = 1; 5517 if (eptr >= md->end_subject) 5518 { 5519 SCHECK_PARTIAL(); 5520 break; 5521 } 5522 GETCHARLENTEST(c, eptr, len); 5523 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || 5524 c == CHAR_FF || c == CHAR_CR) 5525 == prop_fail_result) 5526 break; 5527 eptr+= len; 5528 } 5529 break; 5530 5531 case PT_PXSPACE: /* POSIX space */ 5532 for (i = min; i < max; i++) 5533 { 5534 int len = 1; 5535 if (eptr >= md->end_subject) 5536 { 5537 SCHECK_PARTIAL(); 5538 break; 5539 } 5540 GETCHARLENTEST(c, eptr, len); 5541 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL || 5542 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) 5543 == prop_fail_result) 5544 break; 5545 eptr+= len; 5546 } 5547 break; 5548 5549 case PT_WORD: 5550 for (i = min; i < max; i++) 5551 { 5552 int category; 5553 int len = 1; 5554 if (eptr >= md->end_subject) 5555 { 5556 SCHECK_PARTIAL(); 5557 break; 5558 } 5559 GETCHARLENTEST(c, eptr, len); 5560 category = UCD_CATEGORY(c); 5561 if ((category == ucp_L || category == ucp_N || 5562 c == CHAR_UNDERSCORE) == prop_fail_result) 5563 break; 5564 eptr+= len; 5565 } 5566 break; 5567 5568 default: 5569 RRETURN(PCRE_ERROR_INTERNAL); 5570 } 5571 5572 /* eptr is now past the end of the maximum run */ 5573 5574 if (possessive) continue; 5575 for(;;) 5576 { 5577 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); 5578 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5579 if (eptr-- == pp) break; /* Stop if tried at original pos */ 5580 if (utf) BACKCHAR(eptr); 5581 } 5582 } 5583 5584 /* Match extended Unicode sequences. We will get here only if the 5585 support is in the binary; otherwise a compile-time error occurs. 
*/ 5586 5587 else if (ctype == OP_EXTUNI) 5588 { 5589 for (i = min; i < max; i++) 5590 { 5591 int len = 1; 5592 if (eptr >= md->end_subject) 5593 { 5594 SCHECK_PARTIAL(); 5595 break; 5596 } 5597 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5598 if (UCD_CATEGORY(c) == ucp_M) break; 5599 eptr += len; 5600 while (eptr < md->end_subject) 5601 { 5602 len = 1; 5603 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } 5604 if (UCD_CATEGORY(c) != ucp_M) break; 5605 eptr += len; 5606 } 5607 CHECK_PARTIAL(); 5608 } 5609 5610 /* eptr is now past the end of the maximum run */ 5611 5612 if (possessive) continue; 5613 5614 for(;;) 5615 { 5616 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45); 5617 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5618 if (eptr-- == pp) break; /* Stop if tried at original pos */ 5619 for (;;) /* Move back over one extended */ 5620 { 5621 if (!utf) c = *eptr; else 5622 { 5623 BACKCHAR(eptr); 5624 GETCHAR(c, eptr); 5625 } 5626 if (UCD_CATEGORY(c) != ucp_M) break; 5627 eptr--; 5628 } 5629 } 5630 } 5631 5632 else 5633#endif /* SUPPORT_UCP */ 5634 5635#ifdef SUPPORT_UTF 5636 if (utf) 5637 { 5638 switch(ctype) 5639 { 5640 case OP_ANY: 5641 if (max < INT_MAX) 5642 { 5643 for (i = min; i < max; i++) 5644 { 5645 if (eptr >= md->end_subject) 5646 { 5647 SCHECK_PARTIAL(); 5648 break; 5649 } 5650 if (IS_NEWLINE(eptr)) break; 5651 if (md->partial != 0 && /* Take care with CRLF partial */ 5652 eptr + 1 >= md->end_subject && 5653 NLBLOCK->nltype == NLTYPE_FIXED && 5654 NLBLOCK->nllen == 2 && 5655 *eptr == NLBLOCK->nl[0]) 5656 { 5657 md->hitend = TRUE; 5658 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5659 } 5660 eptr++; 5661 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5662 } 5663 } 5664 5665 /* Handle unlimited UTF-8 repeat */ 5666 5667 else 5668 { 5669 for (i = min; i < max; i++) 5670 { 5671 if (eptr >= md->end_subject) 5672 { 5673 SCHECK_PARTIAL(); 5674 break; 5675 } 5676 if (IS_NEWLINE(eptr)) break; 5677 if (md->partial != 0 && /* Take care 
with CRLF partial */ 5678 eptr + 1 >= md->end_subject && 5679 NLBLOCK->nltype == NLTYPE_FIXED && 5680 NLBLOCK->nllen == 2 && 5681 *eptr == NLBLOCK->nl[0]) 5682 { 5683 md->hitend = TRUE; 5684 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5685 } 5686 eptr++; 5687 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5688 } 5689 } 5690 break; 5691 5692 case OP_ALLANY: 5693 if (max < INT_MAX) 5694 { 5695 for (i = min; i < max; i++) 5696 { 5697 if (eptr >= md->end_subject) 5698 { 5699 SCHECK_PARTIAL(); 5700 break; 5701 } 5702 eptr++; 5703 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); 5704 } 5705 } 5706 else 5707 { 5708 eptr = md->end_subject; /* Unlimited UTF-8 repeat */ 5709 SCHECK_PARTIAL(); 5710 } 5711 break; 5712 5713 /* The byte case is the same as non-UTF8 */ 5714 5715 case OP_ANYBYTE: 5716 c = max - min; 5717 if (c > (unsigned int)(md->end_subject - eptr)) 5718 { 5719 eptr = md->end_subject; 5720 SCHECK_PARTIAL(); 5721 } 5722 else eptr += c; 5723 break; 5724 5725 case OP_ANYNL: 5726 for (i = min; i < max; i++) 5727 { 5728 int len = 1; 5729 if (eptr >= md->end_subject) 5730 { 5731 SCHECK_PARTIAL(); 5732 break; 5733 } 5734 GETCHARLEN(c, eptr, len); 5735 if (c == 0x000d) 5736 { 5737 if (++eptr >= md->end_subject) break; 5738 if (*eptr == 0x000a) eptr++; 5739 } 5740 else 5741 { 5742 if (c != 0x000a && 5743 (md->bsr_anycrlf || 5744 (c != 0x000b && c != 0x000c && 5745 c != 0x0085 && c != 0x2028 && c != 0x2029))) 5746 break; 5747 eptr += len; 5748 } 5749 } 5750 break; 5751 5752 case OP_NOT_HSPACE: 5753 case OP_HSPACE: 5754 for (i = min; i < max; i++) 5755 { 5756 BOOL gotspace; 5757 int len = 1; 5758 if (eptr >= md->end_subject) 5759 { 5760 SCHECK_PARTIAL(); 5761 break; 5762 } 5763 GETCHARLEN(c, eptr, len); 5764 switch(c) 5765 { 5766 default: gotspace = FALSE; break; 5767 case 0x09: /* HT */ 5768 case 0x20: /* SPACE */ 5769 case 0xa0: /* NBSP */ 5770 case 0x1680: /* OGHAM SPACE MARK */ 5771 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 5772 case 0x2000: /* EN QUAD 
*/ 5773 case 0x2001: /* EM QUAD */ 5774 case 0x2002: /* EN SPACE */ 5775 case 0x2003: /* EM SPACE */ 5776 case 0x2004: /* THREE-PER-EM SPACE */ 5777 case 0x2005: /* FOUR-PER-EM SPACE */ 5778 case 0x2006: /* SIX-PER-EM SPACE */ 5779 case 0x2007: /* FIGURE SPACE */ 5780 case 0x2008: /* PUNCTUATION SPACE */ 5781 case 0x2009: /* THIN SPACE */ 5782 case 0x200A: /* HAIR SPACE */ 5783 case 0x202f: /* NARROW NO-BREAK SPACE */ 5784 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 5785 case 0x3000: /* IDEOGRAPHIC SPACE */ 5786 gotspace = TRUE; 5787 break; 5788 } 5789 if (gotspace == (ctype == OP_NOT_HSPACE)) break; 5790 eptr += len; 5791 } 5792 break; 5793 5794 case OP_NOT_VSPACE: 5795 case OP_VSPACE: 5796 for (i = min; i < max; i++) 5797 { 5798 BOOL gotspace; 5799 int len = 1; 5800 if (eptr >= md->end_subject) 5801 { 5802 SCHECK_PARTIAL(); 5803 break; 5804 } 5805 GETCHARLEN(c, eptr, len); 5806 switch(c) 5807 { 5808 default: gotspace = FALSE; break; 5809 case 0x0a: /* LF */ 5810 case 0x0b: /* VT */ 5811 case 0x0c: /* FF */ 5812 case 0x0d: /* CR */ 5813 case 0x85: /* NEL */ 5814 case 0x2028: /* LINE SEPARATOR */ 5815 case 0x2029: /* PARAGRAPH SEPARATOR */ 5816 gotspace = TRUE; 5817 break; 5818 } 5819 if (gotspace == (ctype == OP_NOT_VSPACE)) break; 5820 eptr += len; 5821 } 5822 break; 5823 5824 case OP_NOT_DIGIT: 5825 for (i = min; i < max; i++) 5826 { 5827 int len = 1; 5828 if (eptr >= md->end_subject) 5829 { 5830 SCHECK_PARTIAL(); 5831 break; 5832 } 5833 GETCHARLEN(c, eptr, len); 5834 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; 5835 eptr+= len; 5836 } 5837 break; 5838 5839 case OP_DIGIT: 5840 for (i = min; i < max; i++) 5841 { 5842 int len = 1; 5843 if (eptr >= md->end_subject) 5844 { 5845 SCHECK_PARTIAL(); 5846 break; 5847 } 5848 GETCHARLEN(c, eptr, len); 5849 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; 5850 eptr+= len; 5851 } 5852 break; 5853 5854 case OP_NOT_WHITESPACE: 5855 for (i = min; i < max; i++) 5856 { 5857 int len = 1; 5858 if (eptr >= 
md->end_subject) 5859 { 5860 SCHECK_PARTIAL(); 5861 break; 5862 } 5863 GETCHARLEN(c, eptr, len); 5864 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; 5865 eptr+= len; 5866 } 5867 break; 5868 5869 case OP_WHITESPACE: 5870 for (i = min; i < max; i++) 5871 { 5872 int len = 1; 5873 if (eptr >= md->end_subject) 5874 { 5875 SCHECK_PARTIAL(); 5876 break; 5877 } 5878 GETCHARLEN(c, eptr, len); 5879 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; 5880 eptr+= len; 5881 } 5882 break; 5883 5884 case OP_NOT_WORDCHAR: 5885 for (i = min; i < max; i++) 5886 { 5887 int len = 1; 5888 if (eptr >= md->end_subject) 5889 { 5890 SCHECK_PARTIAL(); 5891 break; 5892 } 5893 GETCHARLEN(c, eptr, len); 5894 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; 5895 eptr+= len; 5896 } 5897 break; 5898 5899 case OP_WORDCHAR: 5900 for (i = min; i < max; i++) 5901 { 5902 int len = 1; 5903 if (eptr >= md->end_subject) 5904 { 5905 SCHECK_PARTIAL(); 5906 break; 5907 } 5908 GETCHARLEN(c, eptr, len); 5909 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; 5910 eptr+= len; 5911 } 5912 break; 5913 5914 default: 5915 RRETURN(PCRE_ERROR_INTERNAL); 5916 } 5917 5918 /* eptr is now past the end of the maximum run. If possessive, we are 5919 done (no backing up). Otherwise, match at this position; anything other 5920 than no match is immediately returned. For nomatch, back up one 5921 character, unless we are matching \R and the last thing matched was 5922 \r\n, in which case, back up two bytes. 
*/ 5923 5924 if (possessive) continue; 5925 for(;;) 5926 { 5927 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46); 5928 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5929 if (eptr-- == pp) break; /* Stop if tried at original pos */ 5930 BACKCHAR(eptr); 5931 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && 5932 eptr[-1] == '\r') eptr--; 5933 } 5934 } 5935 else 5936#endif /* SUPPORT_UTF */ 5937 /* Not UTF mode */ 5938 { 5939 switch(ctype) 5940 { 5941 case OP_ANY: 5942 for (i = min; i < max; i++) 5943 { 5944 if (eptr >= md->end_subject) 5945 { 5946 SCHECK_PARTIAL(); 5947 break; 5948 } 5949 if (IS_NEWLINE(eptr)) break; 5950 if (md->partial != 0 && /* Take care with CRLF partial */ 5951 eptr + 1 >= md->end_subject && 5952 NLBLOCK->nltype == NLTYPE_FIXED && 5953 NLBLOCK->nllen == 2 && 5954 *eptr == NLBLOCK->nl[0]) 5955 { 5956 md->hitend = TRUE; 5957 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); 5958 } 5959 eptr++; 5960 } 5961 break; 5962 5963 case OP_ALLANY: 5964 case OP_ANYBYTE: 5965 c = max - min; 5966 if (c > (unsigned int)(md->end_subject - eptr)) 5967 { 5968 eptr = md->end_subject; 5969 SCHECK_PARTIAL(); 5970 } 5971 else eptr += c; 5972 break; 5973 5974 case OP_ANYNL: 5975 for (i = min; i < max; i++) 5976 { 5977 if (eptr >= md->end_subject) 5978 { 5979 SCHECK_PARTIAL(); 5980 break; 5981 } 5982 c = *eptr; 5983 if (c == 0x000d) 5984 { 5985 if (++eptr >= md->end_subject) break; 5986 if (*eptr == 0x000a) eptr++; 5987 } 5988 else 5989 { 5990 if (c != 0x000a && (md->bsr_anycrlf || 5991 (c != 0x000b && c != 0x000c && c != 0x0085 5992#ifdef COMPILE_PCRE16 5993 && c != 0x2028 && c != 0x2029 5994#endif 5995 ))) break; 5996 eptr++; 5997 } 5998 } 5999 break; 6000 6001 case OP_NOT_HSPACE: 6002 for (i = min; i < max; i++) 6003 { 6004 if (eptr >= md->end_subject) 6005 { 6006 SCHECK_PARTIAL(); 6007 break; 6008 } 6009 c = *eptr; 6010 if (c == 0x09 || c == 0x20 || c == 0xa0 6011#ifdef COMPILE_PCRE16 6012 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A) 6013 || c == 
0x202f || c == 0x205f || c == 0x3000 6014#endif 6015 ) break; 6016 eptr++; 6017 } 6018 break; 6019 6020 case OP_HSPACE: 6021 for (i = min; i < max; i++) 6022 { 6023 if (eptr >= md->end_subject) 6024 { 6025 SCHECK_PARTIAL(); 6026 break; 6027 } 6028 c = *eptr; 6029 if (c != 0x09 && c != 0x20 && c != 0xa0 6030#ifdef COMPILE_PCRE16 6031 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A) 6032 && c != 0x202f && c != 0x205f && c != 0x3000 6033#endif 6034 ) break; 6035 eptr++; 6036 } 6037 break; 6038 6039 case OP_NOT_VSPACE: 6040 for (i = min; i < max; i++) 6041 { 6042 if (eptr >= md->end_subject) 6043 { 6044 SCHECK_PARTIAL(); 6045 break; 6046 } 6047 c = *eptr; 6048 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85 6049#ifdef COMPILE_PCRE16 6050 || c == 0x2028 || c == 0x2029 6051#endif 6052 ) break; 6053 eptr++; 6054 } 6055 break; 6056 6057 case OP_VSPACE: 6058 for (i = min; i < max; i++) 6059 { 6060 if (eptr >= md->end_subject) 6061 { 6062 SCHECK_PARTIAL(); 6063 break; 6064 } 6065 c = *eptr; 6066 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85 6067#ifdef COMPILE_PCRE16 6068 && c != 0x2028 && c != 0x2029 6069#endif 6070 ) break; 6071 eptr++; 6072 } 6073 break; 6074 6075 case OP_NOT_DIGIT: 6076 for (i = min; i < max; i++) 6077 { 6078 if (eptr >= md->end_subject) 6079 { 6080 SCHECK_PARTIAL(); 6081 break; 6082 } 6083 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break; 6084 eptr++; 6085 } 6086 break; 6087 6088 case OP_DIGIT: 6089 for (i = min; i < max; i++) 6090 { 6091 if (eptr >= md->end_subject) 6092 { 6093 SCHECK_PARTIAL(); 6094 break; 6095 } 6096 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break; 6097 eptr++; 6098 } 6099 break; 6100 6101 case OP_NOT_WHITESPACE: 6102 for (i = min; i < max; i++) 6103 { 6104 if (eptr >= md->end_subject) 6105 { 6106 SCHECK_PARTIAL(); 6107 break; 6108 } 6109 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break; 6110 eptr++; 6111 } 6112 break; 6113 6114 
case OP_WHITESPACE: 6115 for (i = min; i < max; i++) 6116 { 6117 if (eptr >= md->end_subject) 6118 { 6119 SCHECK_PARTIAL(); 6120 break; 6121 } 6122 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break; 6123 eptr++; 6124 } 6125 break; 6126 6127 case OP_NOT_WORDCHAR: 6128 for (i = min; i < max; i++) 6129 { 6130 if (eptr >= md->end_subject) 6131 { 6132 SCHECK_PARTIAL(); 6133 break; 6134 } 6135 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break; 6136 eptr++; 6137 } 6138 break; 6139 6140 case OP_WORDCHAR: 6141 for (i = min; i < max; i++) 6142 { 6143 if (eptr >= md->end_subject) 6144 { 6145 SCHECK_PARTIAL(); 6146 break; 6147 } 6148 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break; 6149 eptr++; 6150 } 6151 break; 6152 6153 default: 6154 RRETURN(PCRE_ERROR_INTERNAL); 6155 } 6156 6157 /* eptr is now past the end of the maximum run. If possessive, we are 6158 done (no backing up). Otherwise, match at this position; anything other 6159 than no match is immediately returned. For nomatch, back up one 6160 character (byte), unless we are matching \R and the last thing matched 6161 was \r\n, in which case, back up two bytes. */ 6162 6163 if (possessive) continue; 6164 while (eptr >= pp) 6165 { 6166 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47); 6167 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6168 eptr--; 6169 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' && 6170 eptr[-1] == '\r') eptr--; 6171 } 6172 } 6173 6174 /* Get here if we can't make it match with any permitted repetitions */ 6175 6176 RRETURN(MATCH_NOMATCH); 6177 } 6178 /* Control never gets here */ 6179 6180 /* There's been some horrible disaster. Arrival here can only mean there is 6181 something seriously wrong in the code above or the OP_xxx definitions. 
*/ 6182 6183 default: 6184 DPRINTF(("Unknown opcode %d\n", *ecode)); 6185 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); 6186 } 6187 6188 /* Do not stick any code in here without much thought; it is assumed 6189 that "continue" in the code above comes out to here to repeat the main 6190 loop. */ 6191 6192 } /* End of main loop */ 6193/* Control never reaches here */ 6194 6195 6196/* When compiling to use the heap rather than the stack for recursive calls to 6197match(), the RRETURN() macro jumps here. The number that is saved in 6198frame->Xwhere indicates which label we actually want to return to. */ 6199 6200#ifdef NO_RECURSE 6201#define LBL(val) case val: goto L_RM##val; 6202HEAP_RETURN: 6203switch (frame->Xwhere) 6204 { 6205 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 6206 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) 6207 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) 6208 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) 6209 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) 6210 LBL(65) LBL(66) 6211#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 6212 LBL(21) 6213#endif 6214#ifdef SUPPORT_UTF 6215 LBL(16) LBL(18) LBL(20) 6216 LBL(22) LBL(23) LBL(28) LBL(30) 6217 LBL(32) LBL(34) LBL(42) LBL(46) 6218#ifdef SUPPORT_UCP 6219 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) 6220 LBL(59) LBL(60) LBL(61) LBL(62) 6221#endif /* SUPPORT_UCP */ 6222#endif /* SUPPORT_UTF */ 6223 default: 6224 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); 6225 6226printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere); 6227 6228 return PCRE_ERROR_INTERNAL; 6229 } 6230#undef LBL 6231#endif /* NO_RECURSE */ 6232} 6233 6234 6235/*************************************************************************** 6236**************************************************************************** 6237 RECURSION IN THE match() FUNCTION 6238 6239Undefine all the 
macros that were defined above to handle this. */ 6240 6241#ifdef NO_RECURSE 6242#undef eptr 6243#undef ecode 6244#undef mstart 6245#undef offset_top 6246#undef eptrb 6247#undef flags 6248 6249#undef callpat 6250#undef charptr 6251#undef data 6252#undef next 6253#undef pp 6254#undef prev 6255#undef saved_eptr 6256 6257#undef new_recursive 6258 6259#undef cur_is_word 6260#undef condition 6261#undef prev_is_word 6262 6263#undef ctype 6264#undef length 6265#undef max 6266#undef min 6267#undef number 6268#undef offset 6269#undef op 6270#undef save_capture_last 6271#undef save_offset1 6272#undef save_offset2 6273#undef save_offset3 6274#undef stacksave 6275 6276#undef newptrb 6277 6278#endif 6279 6280/* These two are defined as macros in both cases */ 6281 6282#undef fc 6283#undef fi 6284 6285/*************************************************************************** 6286***************************************************************************/ 6287 6288 6289#ifdef NO_RECURSE 6290/************************************************* 6291* Release allocated heap frames * 6292*************************************************/ 6293 6294/* This function releases all the allocated frames. The base frame is on the 6295machine stack, and so must not be freed. 6296 6297Argument: the address of the base frame 6298Returns: nothing 6299*/ 6300 6301static void 6302release_match_heapframes (heapframe *frame_base) 6303{ 6304heapframe *nextframe = frame_base->Xnextframe; 6305while (nextframe != NULL) 6306 { 6307 heapframe *oldframe = nextframe; 6308 nextframe = nextframe->Xnextframe; 6309 (PUBL(stack_free))(oldframe); 6310 } 6311} 6312#endif 6313 6314 6315/************************************************* 6316* Execute a Regular Expression * 6317*************************************************/ 6318 6319/* This function applies a compiled re to a subject string and picks out 6320portions of the string if it matches. 
Two elements in the vector are set for 6321each substring: the offsets to the start and end of the substring. 6322 6323Arguments: 6324 argument_re points to the compiled expression 6325 extra_data points to extra data or is NULL 6326 subject points to the subject string 6327 length length of subject string (may contain binary zeros) 6328 start_offset where to start in the subject string 6329 options option bits 6330 offsets points to a vector of ints to be filled in with offsets 6331 offsetcount the number of elements in the vector 6332 6333Returns: > 0 => success; value is the number of elements filled in 6334 = 0 => success, but offsets is not big enough 6335 -1 => failed to match 6336 < -1 => some kind of unexpected problem 6337*/ 6338 6339#ifdef COMPILE_PCRE8 6340PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6341pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, 6342 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, 6343 int offsetcount) 6344#else 6345PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 6346pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, 6347 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, 6348 int offsetcount) 6349#endif 6350{ 6351int rc, ocount, arg_offset_max; 6352int newline; 6353BOOL using_temporary_offsets = FALSE; 6354BOOL anchored; 6355BOOL startline; 6356BOOL firstline; 6357BOOL utf; 6358BOOL has_first_char = FALSE; 6359BOOL has_req_char = FALSE; 6360pcre_uchar first_char = 0; 6361pcre_uchar first_char2 = 0; 6362pcre_uchar req_char = 0; 6363pcre_uchar req_char2 = 0; 6364match_data match_block; 6365match_data *md = &match_block; 6366const pcre_uint8 *tables; 6367const pcre_uint8 *start_bits = NULL; 6368PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; 6369PCRE_PUCHAR end_subject; 6370PCRE_PUCHAR start_partial = NULL; 6371PCRE_PUCHAR req_char_ptr = start_match - 1; 6372 6373const pcre_study_data *study; 6374const REAL_PCRE *re = (const REAL_PCRE 
*)argument_re; 6375 6376#ifdef NO_RECURSE 6377heapframe frame_zero; 6378frame_zero.Xprevframe = NULL; /* Marks the top level */ 6379frame_zero.Xnextframe = NULL; /* None are allocated yet */ 6380md->match_frames_base = &frame_zero; 6381#endif 6382 6383/* Check for the special magic call that measures the size of the stack used 6384per recursive call of match(). Without the funny casting for sizeof, a Windows 6385compiler gave this error: "unary minus operator applied to unsigned type, 6386result still unsigned". Hopefully the cast fixes that. */ 6387 6388if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && 6389 start_offset == -999) 6390#ifdef NO_RECURSE 6391 return -((int)sizeof(heapframe)); 6392#else 6393 return match(NULL, NULL, NULL, 0, NULL, NULL, 0); 6394#endif 6395 6396/* Plausibility checks */ 6397 6398if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 6399if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) 6400 return PCRE_ERROR_NULL; 6401if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 6402if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; 6403 6404/* Check that the first field in the block is the magic number. If it is not, 6405return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to 6406REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which 6407means that the pattern is likely compiled with different endianness. */ 6408 6409if (re->magic_number != MAGIC_NUMBER) 6410 return re->magic_number == REVERSED_MAGIC_NUMBER? 6411 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; 6412if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; 6413 6414/* These two settings are used in the code for checking a UTF-8 string that 6415follows immediately afterwards. Other values in the md block are used only 6416during "normal" pcre_exec() processing, not when the JIT support is in use, 6417so they are set up later. 
*/ 6418 6419/* PCRE_UTF16 has the same value as PCRE_UTF8. */ 6420utf = md->utf = (re->options & PCRE_UTF8) != 0; 6421md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : 6422 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; 6423 6424/* Check a UTF-8 string if required. Pass back the character offset and error 6425code for an invalid string if a results vector is available. */ 6426 6427#ifdef SUPPORT_UTF 6428if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) 6429 { 6430 int erroroffset; 6431 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); 6432 if (errorcode != 0) 6433 { 6434 if (offsetcount >= 2) 6435 { 6436 offsets[0] = erroroffset; 6437 offsets[1] = errorcode; 6438 } 6439#ifdef COMPILE_PCRE16 6440 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? 6441 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; 6442#else 6443 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? 6444 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; 6445#endif 6446 } 6447 6448 /* Check that a start_offset points to the start of a UTF character. */ 6449 if (start_offset > 0 && start_offset < length && 6450 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) 6451 return PCRE_ERROR_BADUTF8_OFFSET; 6452 } 6453#endif 6454 6455/* If the pattern was successfully studied with JIT support, run the JIT 6456executable instead of the rest of this function. Most options must be set at 6457compile time for the JIT code to be usable. Fallback to the normal code path if 6458an unsupported flag is set. 
*/ 6459 6460#ifdef SUPPORT_JIT 6461if (extra_data != NULL 6462 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT | 6463 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT 6464 && extra_data->executable_jit != NULL 6465 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL | 6466 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | 6467 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0) 6468 { 6469 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length, 6470 start_offset, options, offsets, offsetcount); 6471 6472 /* PCRE_ERROR_NULL means that the selected normal or partial matching 6473 mode is not compiled. In this case we simply fallback to interpreter. */ 6474 6475 if (rc != PCRE_ERROR_NULL) return rc; 6476 } 6477#endif 6478 6479/* Carry on with non-JIT matching. This information is for finding all the 6480numbers associated with a given name, for condition testing. */ 6481 6482md->name_table = (pcre_uchar *)re + re->name_table_offset; 6483md->name_count = re->name_count; 6484md->name_entry_size = re->name_entry_size; 6485 6486/* Fish out the optional data from the extra_data structure, first setting 6487the default values. */ 6488 6489study = NULL; 6490md->match_limit = MATCH_LIMIT; 6491md->match_limit_recursion = MATCH_LIMIT_RECURSION; 6492md->callout_data = NULL; 6493 6494/* The table pointer is always in native byte order. 
*/ 6495 6496tables = re->tables; 6497 6498if (extra_data != NULL) 6499 { 6500 register unsigned int flags = extra_data->flags; 6501 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 6502 study = (const pcre_study_data *)extra_data->study_data; 6503 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 6504 md->match_limit = extra_data->match_limit; 6505 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 6506 md->match_limit_recursion = extra_data->match_limit_recursion; 6507 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 6508 md->callout_data = extra_data->callout_data; 6509 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; 6510 } 6511 6512/* If the exec call supplied NULL for tables, use the inbuilt ones. This 6513is a feature that makes it possible to save compiled regex and re-use them 6514in other programs later. */ 6515 6516if (tables == NULL) tables = PRIV(default_tables); 6517 6518/* Set up other data */ 6519 6520anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 6521startline = (re->flags & PCRE_STARTLINE) != 0; 6522firstline = (re->options & PCRE_FIRSTLINE) != 0; 6523 6524/* The code starts after the real_pcre block and the capture name table. */ 6525 6526md->start_code = (const pcre_uchar *)re + re->name_table_offset + 6527 re->name_count * re->name_entry_size; 6528 6529md->start_subject = (PCRE_PUCHAR)subject; 6530md->start_offset = start_offset; 6531md->end_subject = md->start_subject + length; 6532end_subject = md->end_subject; 6533 6534md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 6535md->use_ucp = (re->options & PCRE_UCP) != 0; 6536md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; 6537md->ignore_skip_arg = FALSE; 6538 6539/* Some options are unpacked into BOOL variables in the hope that testing 6540them will be faster than individual option bits. 
*/ 6541 6542md->notbol = (options & PCRE_NOTBOL) != 0; 6543md->noteol = (options & PCRE_NOTEOL) != 0; 6544md->notempty = (options & PCRE_NOTEMPTY) != 0; 6545md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; 6546 6547md->hitend = FALSE; 6548md->mark = md->nomatch_mark = NULL; /* In case never set */ 6549 6550md->recursive = NULL; /* No recursion at top level */ 6551md->hasthen = (re->flags & PCRE_HASTHEN) != 0; 6552 6553md->lcc = tables + lcc_offset; 6554md->fcc = tables + fcc_offset; 6555md->ctypes = tables + ctypes_offset; 6556 6557/* Handle different \R options. */ 6558 6559switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 6560 { 6561 case 0: 6562 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 6563 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; 6564 else 6565#ifdef BSR_ANYCRLF 6566 md->bsr_anycrlf = TRUE; 6567#else 6568 md->bsr_anycrlf = FALSE; 6569#endif 6570 break; 6571 6572 case PCRE_BSR_ANYCRLF: 6573 md->bsr_anycrlf = TRUE; 6574 break; 6575 6576 case PCRE_BSR_UNICODE: 6577 md->bsr_anycrlf = FALSE; 6578 break; 6579 6580 default: return PCRE_ERROR_BADNEWLINE; 6581 } 6582 6583/* Handle different types of newline. The three bits give eight cases. If 6584nothing is set at run time, whatever was used at compile time applies. */ 6585 6586switch ((((options & PCRE_NEWLINE_BITS) == 0)? 
re->options : 6587 (pcre_uint32)options) & PCRE_NEWLINE_BITS) 6588 { 6589 case 0: newline = NEWLINE; break; /* Compile-time default */ 6590 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 6591 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 6592 case PCRE_NEWLINE_CR+ 6593 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 6594 case PCRE_NEWLINE_ANY: newline = -1; break; 6595 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 6596 default: return PCRE_ERROR_BADNEWLINE; 6597 } 6598 6599if (newline == -2) 6600 { 6601 md->nltype = NLTYPE_ANYCRLF; 6602 } 6603else if (newline < 0) 6604 { 6605 md->nltype = NLTYPE_ANY; 6606 } 6607else 6608 { 6609 md->nltype = NLTYPE_FIXED; 6610 if (newline > 255) 6611 { 6612 md->nllen = 2; 6613 md->nl[0] = (newline >> 8) & 255; 6614 md->nl[1] = newline & 255; 6615 } 6616 else 6617 { 6618 md->nllen = 1; 6619 md->nl[0] = newline; 6620 } 6621 } 6622 6623/* Partial matching was originally supported only for a restricted set of 6624regexes; from release 8.00 there are no restrictions, but the bits are still 6625defined (though never set). So there's no harm in leaving this code. */ 6626 6627if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) 6628 return PCRE_ERROR_BADPARTIAL; 6629 6630/* If the expression has got more back references than the offsets supplied can 6631hold, we get a temporary chunk of working store to use during the matching. 6632Otherwise, we can use the vector supplied, rounding down its size to a multiple 6633of 3. 
*/ 6634 6635ocount = offsetcount - (offsetcount % 3); 6636arg_offset_max = (2*ocount)/3; 6637 6638if (re->top_backref > 0 && re->top_backref >= ocount/3) 6639 { 6640 ocount = re->top_backref * 3 + 3; 6641 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int)); 6642 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 6643 using_temporary_offsets = TRUE; 6644 DPRINTF(("Got memory to hold back references\n")); 6645 } 6646else md->offset_vector = offsets; 6647 6648md->offset_end = ocount; 6649md->offset_max = (2*ocount)/3; 6650md->offset_overflow = FALSE; 6651md->capture_last = -1; 6652 6653/* Reset the working variable associated with each extraction. These should 6654never be used unless previously set, but they get saved and restored, and so we 6655initialize them to avoid reading uninitialized locations. Also, unset the 6656offsets for the matched string. This is really just for tidiness with callouts, 6657in case they inspect these fields. */ 6658 6659if (md->offset_vector != NULL) 6660 { 6661 register int *iptr = md->offset_vector + ocount; 6662 register int *iend = iptr - re->top_bracket; 6663 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2; 6664 while (--iptr >= iend) *iptr = -1; 6665 md->offset_vector[0] = md->offset_vector[1] = -1; 6666 } 6667 6668/* Set up the first character to match, if available. The first_char value is 6669never set for an anchored regular expression, but the anchoring may be forced 6670at run time, so we have to test for anchoring. The first char may be unset for 6671an unanchored pattern, of course. If there's no first char and the pattern was 6672studied, there may be a bitmap of possible first characters. 
*/ 6673 6674if (!anchored) 6675 { 6676 if ((re->flags & PCRE_FIRSTSET) != 0) 6677 { 6678 has_first_char = TRUE; 6679 first_char = first_char2 = (pcre_uchar)(re->first_char); 6680 if ((re->flags & PCRE_FCH_CASELESS) != 0) 6681 { 6682 first_char2 = TABLE_GET(first_char, md->fcc, first_char); 6683#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6684 if (utf && first_char > 127) 6685 first_char2 = UCD_OTHERCASE(first_char); 6686#endif 6687 } 6688 } 6689 else 6690 if (!startline && study != NULL && 6691 (study->flags & PCRE_STUDY_MAPPED) != 0) 6692 start_bits = study->start_bits; 6693 } 6694 6695/* For anchored or unanchored matches, there may be a "last known required 6696character" set. */ 6697 6698if ((re->flags & PCRE_REQCHSET) != 0) 6699 { 6700 has_req_char = TRUE; 6701 req_char = req_char2 = (pcre_uchar)(re->req_char); 6702 if ((re->flags & PCRE_RCH_CASELESS) != 0) 6703 { 6704 req_char2 = TABLE_GET(req_char, md->fcc, req_char); 6705#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 6706 if (utf && req_char > 127) 6707 req_char2 = UCD_OTHERCASE(req_char); 6708#endif 6709 } 6710 } 6711 6712 6713/* ==========================================================================*/ 6714 6715/* Loop for handling unanchored repeated matching attempts; for anchored regexs 6716the loop runs just once. */ 6717 6718for(;;) 6719 { 6720 PCRE_PUCHAR save_end_subject = end_subject; 6721 PCRE_PUCHAR new_start_match; 6722 6723 /* If firstline is TRUE, the start of the match is constrained to the first 6724 line of a multiline string. That is, the match must be before or at the first 6725 newline. Implement this by temporarily adjusting end_subject so that we stop 6726 scanning at a newline. If the match fails at the newline, later code breaks 6727 this loop. 
*/ 6728 6729 if (firstline) 6730 { 6731 PCRE_PUCHAR t = start_match; 6732#ifdef SUPPORT_UTF 6733 if (utf) 6734 { 6735 while (t < md->end_subject && !IS_NEWLINE(t)) 6736 { 6737 t++; 6738 ACROSSCHAR(t < end_subject, *t, t++); 6739 } 6740 } 6741 else 6742#endif 6743 while (t < md->end_subject && !IS_NEWLINE(t)) t++; 6744 end_subject = t; 6745 } 6746 6747 /* There are some optimizations that avoid running the match if a known 6748 starting point is not found, or if a known later character is not present. 6749 However, there is an option that disables these, for testing and for ensuring 6750 that all callouts do actually occur. The option can be set in the regex by 6751 (*NO_START_OPT) or passed in match-time options. */ 6752 6753 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) 6754 { 6755 /* Advance to a unique first char if there is one. */ 6756 6757 if (has_first_char) 6758 { 6759 if (first_char != first_char2) 6760 while (start_match < end_subject && 6761 *start_match != first_char && *start_match != first_char2) 6762 start_match++; 6763 else 6764 while (start_match < end_subject && *start_match != first_char) 6765 start_match++; 6766 } 6767 6768 /* Or to just after a linebreak for a multiline match */ 6769 6770 else if (startline) 6771 { 6772 if (start_match > md->start_subject + start_offset) 6773 { 6774#ifdef SUPPORT_UTF 6775 if (utf) 6776 { 6777 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6778 { 6779 start_match++; 6780 ACROSSCHAR(start_match < end_subject, *start_match, 6781 start_match++); 6782 } 6783 } 6784 else 6785#endif 6786 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 6787 start_match++; 6788 6789 /* If we have just passed a CR and the newline option is ANY or ANYCRLF, 6790 and we are now at a LF, advance the match position by one more character. 
6791 */ 6792 6793 if (start_match[-1] == CHAR_CR && 6794 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 6795 start_match < end_subject && 6796 *start_match == CHAR_NL) 6797 start_match++; 6798 } 6799 } 6800 6801 /* Or to a non-unique first byte after study */ 6802 6803 else if (start_bits != NULL) 6804 { 6805 while (start_match < end_subject) 6806 { 6807 register unsigned int c = *start_match; 6808#ifndef COMPILE_PCRE8 6809 if (c > 255) c = 255; 6810#endif 6811 if ((start_bits[c/8] & (1 << (c&7))) == 0) 6812 { 6813 start_match++; 6814#if defined SUPPORT_UTF && defined COMPILE_PCRE8 6815 /* In non 8-bit mode, the iteration will stop for 6816 characters > 255 at the beginning or not stop at all. */ 6817 if (utf) 6818 ACROSSCHAR(start_match < end_subject, *start_match, 6819 start_match++); 6820#endif 6821 } 6822 else break; 6823 } 6824 } 6825 } /* Starting optimizations */ 6826 6827 /* Restore fudged end_subject */ 6828 6829 end_subject = save_end_subject; 6830 6831 /* The following two optimizations are disabled for partial matching or if 6832 disabling is explicitly requested. */ 6833 6834 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) 6835 { 6836 /* If the pattern was studied, a minimum subject length may be set. This is 6837 a lower bound; no actual string of that length may actually match the 6838 pattern. Although the value is, strictly, in characters, we treat it as 6839 bytes to avoid spending too much time in this optimization. */ 6840 6841 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 6842 (pcre_uint32)(end_subject - start_match) < study->minlength) 6843 { 6844 rc = MATCH_NOMATCH; 6845 break; 6846 } 6847 6848 /* If req_char is set, we know that that character must appear in the 6849 subject for the match to succeed. If the first character is set, req_char 6850 must be later in the subject; otherwise the test starts at the match point. 
6851 This optimization can save a huge amount of backtracking in patterns with 6852 nested unlimited repeats that aren't going to match. Writing separate code 6853 for cased/caseless versions makes it go faster, as does using an 6854 autoincrement and backing off on a match. 6855 6856 HOWEVER: when the subject string is very, very long, searching to its end 6857 can take a long time, and give bad performance on quite ordinary patterns. 6858 This showed up when somebody was matching something like /^\d+C/ on a 6859 32-megabyte string... so we don't do this when the string is sufficiently 6860 long. */ 6861 6862 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX) 6863 { 6864 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0); 6865 6866 /* We don't need to repeat the search if we haven't yet reached the 6867 place we found it at last time. */ 6868 6869 if (p > req_char_ptr) 6870 { 6871 if (req_char != req_char2) 6872 { 6873 while (p < end_subject) 6874 { 6875 register int pp = *p++; 6876 if (pp == req_char || pp == req_char2) { p--; break; } 6877 } 6878 } 6879 else 6880 { 6881 while (p < end_subject) 6882 { 6883 if (*p++ == req_char) { p--; break; } 6884 } 6885 } 6886 6887 /* If we can't find the required character, break the matching loop, 6888 forcing a match failure. */ 6889 6890 if (p >= end_subject) 6891 { 6892 rc = MATCH_NOMATCH; 6893 break; 6894 } 6895 6896 /* If we have found the required character, save the point where we 6897 found it, so that we don't search again next time round the loop if 6898 the start hasn't passed this character yet. */ 6899 6900 req_char_ptr = p; 6901 } 6902 } 6903 } 6904 6905#ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */ 6906 printf(">>>> Match against: "); 6907 pchars(start_match, end_subject - start_match, TRUE, md); 6908 printf("\n"); 6909#endif 6910 6911 /* OK, we can now run the match. If "hitend" is set afterwards, remember the 6912 first starting point for which a partial match was found. 
*/ 6913 6914 md->start_match_ptr = start_match; 6915 md->start_used_ptr = start_match; 6916 md->match_call_count = 0; 6917 md->match_function_type = 0; 6918 md->end_offset_top = 0; 6919 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0); 6920 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr; 6921 6922 switch(rc) 6923 { 6924 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched 6925 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP 6926 entirely. The only way we can do that is to re-do the match at the same 6927 point, with a flag to force SKIP with an argument to be ignored. Just 6928 treating this case as NOMATCH does not work because it does not check other 6929 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */ 6930 6931 case MATCH_SKIP_ARG: 6932 new_start_match = start_match; 6933 md->ignore_skip_arg = TRUE; 6934 break; 6935 6936 /* SKIP passes back the next starting point explicitly, but if it is the 6937 same as the match we have just done, treat it as NOMATCH. */ 6938 6939 case MATCH_SKIP: 6940 if (md->start_match_ptr != start_match) 6941 { 6942 new_start_match = md->start_match_ptr; 6943 break; 6944 } 6945 /* Fall through */ 6946 6947 /* NOMATCH and PRUNE advance by one character. THEN at this level acts 6948 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */ 6949 6950 case MATCH_NOMATCH: 6951 case MATCH_PRUNE: 6952 case MATCH_THEN: 6953 md->ignore_skip_arg = FALSE; 6954 new_start_match = start_match + 1; 6955#ifdef SUPPORT_UTF 6956 if (utf) 6957 ACROSSCHAR(new_start_match < end_subject, *new_start_match, 6958 new_start_match++); 6959#endif 6960 break; 6961 6962 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ 6963 6964 case MATCH_COMMIT: 6965 rc = MATCH_NOMATCH; 6966 goto ENDLOOP; 6967 6968 /* Any other return is either a match, or some kind of error. 
*/ 6969 6970 default: 6971 goto ENDLOOP; 6972 } 6973 6974 /* Control reaches here for the various types of "no match at this point" 6975 result. Reset the code to MATCH_NOMATCH for subsequent checking. */ 6976 6977 rc = MATCH_NOMATCH; 6978 6979 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first 6980 newline in the subject (though it may continue over the newline). Therefore, 6981 if we have just failed to match, starting at a newline, do not continue. */ 6982 6983 if (firstline && IS_NEWLINE(start_match)) break; 6984 6985 /* Advance to new matching position */ 6986 6987 start_match = new_start_match; 6988 6989 /* Break the loop if the pattern is anchored or if we have passed the end of 6990 the subject. */ 6991 6992 if (anchored || start_match > end_subject) break; 6993 6994 /* If we have just passed a CR and we are now at a LF, and the pattern does 6995 not contain any explicit matches for \r or \n, and the newline option is CRLF 6996 or ANY or ANYCRLF, advance the match position by one more character. In 6997 normal matching start_match will always be greater than the first position at 6998 this stage, but a failed *SKIP can cause a return at the same point, which is 6999 why the first test exists. 
*/ 7000 7001 if (start_match > (PCRE_PUCHAR)subject + start_offset && 7002 start_match[-1] == CHAR_CR && 7003 start_match < end_subject && 7004 *start_match == CHAR_NL && 7005 (re->flags & PCRE_HASCRORLF) == 0 && 7006 (md->nltype == NLTYPE_ANY || 7007 md->nltype == NLTYPE_ANYCRLF || 7008 md->nllen == 2)) 7009 start_match++; 7010 7011 md->mark = NULL; /* Reset for start of next match attempt */ 7012 } /* End of for(;;) "bumpalong" loop */ 7013 7014/* ==========================================================================*/ 7015 7016/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping 7017conditions is true: 7018 7019(1) The pattern is anchored or the match was failed by (*COMMIT); 7020 7021(2) We are past the end of the subject; 7022 7023(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because 7024 this option requests that a match occur at or before the first newline in 7025 the subject. 7026 7027When we have a match and the offset vector is big enough to deal with any 7028backreferences, captured substring offsets will already be set up. In the case 7029where we had to get some local store to hold offsets for backreference 7030processing, copy those that we can. In this case there need not be overflow if 7031certain parts of the pattern were not used, even though there are more 7032capturing parentheses than vector slots. */ 7033 7034ENDLOOP: 7035 7036if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) 7037 { 7038 if (using_temporary_offsets) 7039 { 7040 if (arg_offset_max >= 4) 7041 { 7042 memcpy(offsets + 2, md->offset_vector + 2, 7043 (arg_offset_max - 2) * sizeof(int)); 7044 DPRINTF(("Copied offsets from temporary memory\n")); 7045 } 7046 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE; 7047 DPRINTF(("Freeing temporary memory\n")); 7048 (PUBL(free))(md->offset_vector); 7049 } 7050 7051 /* Set the return code to the number of captured strings, or 0 if there were 7052 too many to fit into the vector. 
*/ 7053 7054 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)? 7055 0 : md->end_offset_top/2; 7056 7057 /* If there is space in the offset vector, set any unused pairs at the end of 7058 the pattern to -1 for backwards compatibility. It is documented that this 7059 happens. In earlier versions, the whole set of potential capturing offsets 7060 was set to -1 each time round the loop, but this is handled differently now. 7061 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only 7062 those at the end that need unsetting here. We can't just unset them all at 7063 the start of the whole thing because they may get set in one branch that is 7064 not the final matching branch. */ 7065 7066 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL) 7067 { 7068 register int *iptr, *iend; 7069 int resetcount = 2 + re->top_bracket * 2; 7070 if (resetcount > offsetcount) resetcount = offsetcount; 7071 iptr = offsets + md->end_offset_top; 7072 iend = offsets + resetcount; 7073 while (iptr < iend) *iptr++ = -1; 7074 } 7075 7076 /* If there is space, set up the whole thing as substring 0. The value of 7077 md->start_match_ptr might be modified if \K was encountered on the success 7078 matching path. */ 7079 7080 if (offsetcount < 2) rc = 0; else 7081 { 7082 offsets[0] = (int)(md->start_match_ptr - md->start_subject); 7083 offsets[1] = (int)(md->end_match_ptr - md->start_subject); 7084 } 7085 7086 /* Return MARK data if requested */ 7087 7088 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7089 *(extra_data->mark) = (pcre_uchar *)md->mark; 7090 DPRINTF((">>>> returning %d\n", rc)); 7091#ifdef NO_RECURSE 7092 release_match_heapframes(&frame_zero); 7093#endif 7094 return rc; 7095 } 7096 7097/* Control gets here if there has been an error, or if the overall match 7098attempt has failed at all permitted starting positions. 
*/ 7099 7100if (using_temporary_offsets) 7101 { 7102 DPRINTF(("Freeing temporary memory\n")); 7103 (PUBL(free))(md->offset_vector); 7104 } 7105 7106/* For anything other than nomatch or partial match, just return the code. */ 7107 7108if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) 7109 { 7110 DPRINTF((">>>> error: returning %d\n", rc)); 7111#ifdef NO_RECURSE 7112 release_match_heapframes(&frame_zero); 7113#endif 7114 return rc; 7115 } 7116 7117/* Handle partial matches - disable any mark data */ 7118 7119if (start_partial != NULL) 7120 { 7121 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); 7122 md->mark = NULL; 7123 if (offsetcount > 1) 7124 { 7125 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); 7126 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); 7127 } 7128 rc = PCRE_ERROR_PARTIAL; 7129 } 7130 7131/* This is the classic nomatch case */ 7132 7133else 7134 { 7135 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 7136 rc = PCRE_ERROR_NOMATCH; 7137 } 7138 7139/* Return the MARK data if it has been requested. */ 7140 7141if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) 7142 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark; 7143#ifdef NO_RECURSE 7144 release_match_heapframes(&frame_zero); 7145#endif 7146return rc; 7147} 7148 7149/* End of pcre_exec.c */ 7150