1/************************************************* 2* Perl-Compatible Regular Expressions * 3*************************************************/ 4 5/* PCRE is a library of functions to support regular expressions whose syntax 6and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Copyright (c) 1997-2010 University of Cambridge 10 11----------------------------------------------------------------------------- 12Redistribution and use in source and binary forms, with or without 13modification, are permitted provided that the following conditions are met: 14 15 * Redistributions of source code must retain the above copyright notice, 16 this list of conditions and the following disclaimer. 17 18 * Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in the 20 documentation and/or other materials provided with the distribution. 21 22 * Neither the name of the University of Cambridge nor the names of its 23 contributors may be used to endorse or promote products derived from 24 this software without specific prior written permission. 25 26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36POSSIBILITY OF SUCH DAMAGE. 
37----------------------------------------------------------------------------- 38*/ 39 40 41/* This module contains pcre_exec(), the externally visible function that does 42pattern matching using an NFA algorithm, trying to mimic Perl as closely as 43possible. There are also some static supporting functions. */ 44 45#ifdef HAVE_CONFIG_H 46#include "config.h" 47#endif 48 49#define NLBLOCK md /* Block containing newline information */ 50#define PSSTART start_subject /* Field containing processed string start */ 51#define PSEND end_subject /* Field containing processed string end */ 52 53#include "pcre_internal.h" 54 55/* Undefine some potentially clashing cpp symbols */ 56 57#undef min 58#undef max 59 60/* Flag bits for the match() function */ 61 62#define match_condassert 0x01 /* Called to check a condition assertion */ 63#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ 64 65/* Non-error returns from the match() function. Error returns are externally 66defined PCRE_ERROR_xxx codes, which are all negative. */ 67 68#define MATCH_MATCH 1 69#define MATCH_NOMATCH 0 70 71/* Special internal returns from the match() function. Make them sufficiently 72negative to avoid the external error codes. */ 73 74#define MATCH_COMMIT (-999) 75#define MATCH_PRUNE (-998) 76#define MATCH_SKIP (-997) 77#define MATCH_THEN (-996) 78 79/* Maximum number of ints of offset to save on the stack for recursive calls. 80If the offset vector is bigger, malloc is used. This should be a multiple of 3, 81because the offset vector is always a multiple of 3 long. 
*/ 82 83#define REC_STACK_SAVE_MAX 30 84 85/* Min and max values for the common repeats; for the maxima, 0 => infinity */ 86 87static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; 88static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; 89 90 91 92#ifdef PCRE_DEBUG 93/************************************************* 94* Debugging function to print chars * 95*************************************************/ 96 97/* Print a sequence of chars in printable format, stopping at the end of the 98subject if the requested. 99 100Arguments: 101 p points to characters 102 length number to print 103 is_subject TRUE if printing from within md->start_subject 104 md pointer to matching data block, if is_subject is TRUE 105 106Returns: nothing 107*/ 108 109static void 110pchars(const uschar *p, int length, BOOL is_subject, match_data *md) 111{ 112unsigned int c; 113if (is_subject && length > md->end_subject - p) length = md->end_subject - p; 114while (length-- > 0) 115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); 116} 117#endif 118 119 120 121/************************************************* 122* Match a back-reference * 123*************************************************/ 124 125/* If a back reference hasn't been set, the length that is passed is greater 126than the number of characters left in the string, so the match fails. 
127 128Arguments: 129 offset index into the offset vector 130 eptr points into the subject 131 length length to be matched 132 md points to match data block 133 ims the ims flags 134 135Returns: TRUE if matched 136*/ 137 138static BOOL 139match_ref(int offset, register USPTR eptr, int length, match_data *md, 140 unsigned long int ims) 141{ 142USPTR p = md->start_subject + md->offset_vector[offset]; 143 144#ifdef PCRE_DEBUG 145if (eptr >= md->end_subject) 146 printf("matching subject <null>"); 147else 148 { 149 printf("matching subject "); 150 pchars(eptr, length, TRUE, md); 151 } 152printf(" against backref "); 153pchars(p, length, FALSE, md); 154printf("\n"); 155#endif 156 157/* Always fail if not enough characters left */ 158 159if (length > md->end_subject - eptr) return FALSE; 160 161/* Separate the caseless case for speed. In UTF-8 mode we can only do this 162properly if Unicode properties are supported. Otherwise, we can check only 163ASCII characters. */ 164 165if ((ims & PCRE_CASELESS) != 0) 166 { 167#ifdef SUPPORT_UTF8 168#ifdef SUPPORT_UCP 169 if (md->utf8) 170 { 171 USPTR endptr = eptr + length; 172 while (eptr < endptr) 173 { 174 int c, d; 175 GETCHARINC(c, eptr); 176 GETCHARINC(d, p); 177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE; 178 } 179 } 180 else 181#endif 182#endif 183 184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there 185 is no UCP support. */ 186 187 while (length-- > 0) 188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } 189 } 190 191/* In the caseful case, we can just compare the bytes, whether or not we 192are in UTF-8 mode. 
*/ 193 194else 195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } 196 197return TRUE; 198} 199 200 201 202/*************************************************************************** 203**************************************************************************** 204 RECURSION IN THE match() FUNCTION 205 206The match() function is highly recursive, though not every recursive call 207increases the recursive depth. Nevertheless, some regular expressions can cause 208it to recurse to a great depth. I was writing for Unix, so I just let it call 209itself recursively. This uses the stack for saving everything that has to be 210saved for a recursive call. On Unix, the stack can be large, and this works 211fine. 212 213It turns out that on some non-Unix-like systems there are problems with 214programs that use a lot of stack. (This despite the fact that every last chip 215has oodles of memory these days, and techniques for extending the stack have 216been known for decades.) So.... 217 218There is a fudge, triggered by defining NO_RECURSE, which avoids recursive 219calls by keeping local variables that need to be preserved in blocks of memory 220obtained from malloc() instead instead of on the stack. Macros are used to 221achieve this so that the actual code doesn't look very different to what it 222always used to. 223 224The original heap-recursive code used longjmp(). However, it seems that this 225can be very slow on some operating systems. Following a suggestion from Stan 226Switzer, the use of longjmp() has been abolished, at the cost of having to 227provide a unique number for each call to RMATCH. There is no way of generating 228a sequence of numbers at compile time in C. I have given them names, to make 229them stand out more clearly. 230 231Crude tests on x86 Linux show a small speedup of around 5-8%. However, on 232FreeBSD, avoiding longjmp() more than halves the time taken to run the standard 233tests. 
Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/

/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync. Numbering starts at 1 (RM1=1) and currently runs
to RM54. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54 };

/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition.

As can be seen from the expansions, the RMATCH arguments map onto the
parameters of match() like this: ra => eptr, rb => ecode, rc => offset_top,
rd => md, re => ims, rf => eptrb, rg => flags. The mstart and markptr arguments
are not passed to the macro; they are picked up from the variables of those
names at the point of use, and the depth handed down is rdepth+1. RMATCH leaves
the result of the call in rrc. RRETURN(ra) returns ra from match().

The debugging versions expand to a braced block that also prints the source
line number before and after the call (RMATCH) or before returning (RRETURN);
the production versions expand to a bare assignment and a bare return. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.
*/ 281 282#define REGISTER 283 284#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\ 285 {\ 286 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ 287 frame->Xwhere = rw; \ 288 newframe->Xeptr = ra;\ 289 newframe->Xecode = rb;\ 290 newframe->Xmstart = mstart;\ 291 newframe->Xmarkptr = markptr;\ 292 newframe->Xoffset_top = rc;\ 293 newframe->Xims = re;\ 294 newframe->Xeptrb = rf;\ 295 newframe->Xflags = rg;\ 296 newframe->Xrdepth = frame->Xrdepth + 1;\ 297 newframe->Xprevframe = frame;\ 298 frame = newframe;\ 299 DPRINTF(("restarting from line %d\n", __LINE__));\ 300 goto HEAP_RECURSE;\ 301 L_##rw:\ 302 DPRINTF(("jumped back to line %d\n", __LINE__));\ 303 } 304 305#define RRETURN(ra)\ 306 {\ 307 heapframe *newframe = frame;\ 308 frame = newframe->Xprevframe;\ 309 (pcre_stack_free)(newframe);\ 310 if (frame != NULL)\ 311 {\ 312 rrc = ra;\ 313 goto HEAP_RETURN;\ 314 }\ 315 return ra;\ 316 } 317 318 319/* Structure for remembering the local variables in a private frame */ 320 321typedef struct heapframe { 322 struct heapframe *Xprevframe; 323 324 /* Function arguments that may change */ 325 326 USPTR Xeptr; 327 const uschar *Xecode; 328 USPTR Xmstart; 329 USPTR Xmarkptr; 330 int Xoffset_top; 331 long int Xims; 332 eptrblock *Xeptrb; 333 int Xflags; 334 unsigned int Xrdepth; 335 336 /* Function local variables */ 337 338 USPTR Xcallpat; 339#ifdef SUPPORT_UTF8 340 USPTR Xcharptr; 341#endif 342 USPTR Xdata; 343 USPTR Xnext; 344 USPTR Xpp; 345 USPTR Xprev; 346 USPTR Xsaved_eptr; 347 348 recursion_info Xnew_recursive; 349 350 BOOL Xcur_is_word; 351 BOOL Xcondition; 352 BOOL Xprev_is_word; 353 354 unsigned long int Xoriginal_ims; 355 356#ifdef SUPPORT_UCP 357 int Xprop_type; 358 int Xprop_value; 359 int Xprop_fail_result; 360 int Xprop_category; 361 int Xprop_chartype; 362 int Xprop_script; 363 int Xoclength; 364 uschar Xocchars[8]; 365#endif 366 367 int Xcodelink; 368 int Xctype; 369 unsigned int Xfc; 370 int Xfi; 371 int Xlength; 372 int Xmax; 373 int Xmin; 374 int 
Xnumber; 375 int Xoffset; 376 int Xop; 377 int Xsave_capture_last; 378 int Xsave_offset1, Xsave_offset2, Xsave_offset3; 379 int Xstacksave[REC_STACK_SAVE_MAX]; 380 381 eptrblock Xnewptrb; 382 383 /* Where to jump back to */ 384 385 int Xwhere; 386 387} heapframe; 388 389#endif 390 391 392/*************************************************************************** 393***************************************************************************/ 394 395 396 397/************************************************* 398* Match from current position * 399*************************************************/ 400 401/* This function is called recursively in many circumstances. Whenever it 402returns a negative (error) response, the outer incarnation must also return the 403same response. */ 404 405/* These macros pack up tests that are used for partial matching, and which 406appears several times in the code. We set the "hit end" flag if the pointer is 407at the end of the subject and also past the start of the subject (i.e. 408something has been matched). For hard partial matching, we then return 409immediately. The second one is used when we already know we are past the end of 410the subject. */ 411 412#define CHECK_PARTIAL()\ 413 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\ 414 {\ 415 md->hitend = TRUE;\ 416 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ 417 } 418 419#define SCHECK_PARTIAL()\ 420 if (md->partial != 0 && eptr > mstart)\ 421 {\ 422 md->hitend = TRUE;\ 423 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\ 424 } 425 426 427/* Performance note: It might be tempting to extract commonly used fields from 428the md structure (e.g. utf8, end_subject) into individual variables to improve 429performance. Tests using gcc on a SPARC disproved this; in the first case, it 430made performance worse. 
431 432Arguments: 433 eptr pointer to current character in subject 434 ecode pointer to current position in compiled code 435 mstart pointer to the current match start position (can be modified 436 by encountering \K) 437 markptr pointer to the most recent MARK name, or NULL 438 offset_top current top pointer 439 md pointer to "static" info for the match 440 ims current /i, /m, and /s options 441 eptrb pointer to chain of blocks containing eptr at start of 442 brackets - for testing for empty matches 443 flags can contain 444 match_condassert - this is an assertion condition 445 match_cbegroup - this is the start of an unlimited repeat 446 group that can match an empty string 447 rdepth the recursion depth 448 449Returns: MATCH_MATCH if matched ) these values are >= 0 450 MATCH_NOMATCH if failed to match ) 451 a negative PCRE_ERROR_xxx value if aborted by an error condition 452 (e.g. stopped by repeated call or recursion limit) 453*/ 454 455static int 456match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR 457 markptr, int offset_top, match_data *md, unsigned long int ims, 458 eptrblock *eptrb, int flags, unsigned int rdepth) 459{ 460/* These variables do not need to be preserved over recursion in this function, 461so they can be ordinary variables in all cases. Mark some of them with 462"register" because they are used a lot in loops. */ 463 464register int rrc; /* Returns from recursive calls */ 465register int i; /* Used for loops not involving calls to RMATCH() */ 466register unsigned int c; /* Character values not kept over RMATCH() calls */ 467register BOOL utf8; /* Local copy of UTF-8 flag for speed */ 468 469BOOL minimize, possessive; /* Quantifier options */ 470int condcode; 471 472/* When recursion is not being used, all "local" variables that have to be 473preserved over calls to RMATCH() are part of a "frame" which is obtained from 474heap storage. 
Set up the top-level frame here; others are obtained from the 475heap whenever RMATCH() does a "recursion". See the macro definitions above. */ 476 477#ifdef NO_RECURSE 478heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); 479frame->Xprevframe = NULL; /* Marks the top level */ 480 481/* Copy in the original argument variables */ 482 483frame->Xeptr = eptr; 484frame->Xecode = ecode; 485frame->Xmstart = mstart; 486frame->Xmarkptr = markptr; 487frame->Xoffset_top = offset_top; 488frame->Xims = ims; 489frame->Xeptrb = eptrb; 490frame->Xflags = flags; 491frame->Xrdepth = rdepth; 492 493/* This is where control jumps back to to effect "recursion" */ 494 495HEAP_RECURSE: 496 497/* Macros make the argument variables come from the current frame */ 498 499#define eptr frame->Xeptr 500#define ecode frame->Xecode 501#define mstart frame->Xmstart 502#define markptr frame->Xmarkptr 503#define offset_top frame->Xoffset_top 504#define ims frame->Xims 505#define eptrb frame->Xeptrb 506#define flags frame->Xflags 507#define rdepth frame->Xrdepth 508 509/* Ditto for the local variables */ 510 511#ifdef SUPPORT_UTF8 512#define charptr frame->Xcharptr 513#endif 514#define callpat frame->Xcallpat 515#define codelink frame->Xcodelink 516#define data frame->Xdata 517#define next frame->Xnext 518#define pp frame->Xpp 519#define prev frame->Xprev 520#define saved_eptr frame->Xsaved_eptr 521 522#define new_recursive frame->Xnew_recursive 523 524#define cur_is_word frame->Xcur_is_word 525#define condition frame->Xcondition 526#define prev_is_word frame->Xprev_is_word 527 528#define original_ims frame->Xoriginal_ims 529 530#ifdef SUPPORT_UCP 531#define prop_type frame->Xprop_type 532#define prop_value frame->Xprop_value 533#define prop_fail_result frame->Xprop_fail_result 534#define prop_category frame->Xprop_category 535#define prop_chartype frame->Xprop_chartype 536#define prop_script frame->Xprop_script 537#define oclength frame->Xoclength 538#define occhars frame->Xocchars 
539#endif 540 541#define ctype frame->Xctype 542#define fc frame->Xfc 543#define fi frame->Xfi 544#define length frame->Xlength 545#define max frame->Xmax 546#define min frame->Xmin 547#define number frame->Xnumber 548#define offset frame->Xoffset 549#define op frame->Xop 550#define save_capture_last frame->Xsave_capture_last 551#define save_offset1 frame->Xsave_offset1 552#define save_offset2 frame->Xsave_offset2 553#define save_offset3 frame->Xsave_offset3 554#define stacksave frame->Xstacksave 555 556#define newptrb frame->Xnewptrb 557 558/* When recursion is being used, local variables are allocated on the stack and 559get preserved during recursion in the normal way. In this environment, fi and 560i, and fc and c, can be the same variables. */ 561 562#else /* NO_RECURSE not defined */ 563#define fi i 564#define fc c 565 566 567#ifdef SUPPORT_UTF8 /* Many of these variables are used only */ 568const uschar *charptr; /* in small blocks of the code. My normal */ 569#endif /* style of coding would have declared */ 570const uschar *callpat; /* them within each of those blocks. */ 571const uschar *data; /* However, in order to accommodate the */ 572const uschar *next; /* version of this code that uses an */ 573USPTR pp; /* external "stack" implemented on the */ 574const uschar *prev; /* heap, it is easier to declare them all */ 575USPTR saved_eptr; /* here, so the declarations can be cut */ 576 /* out in a block. The only declarations */ 577recursion_info new_recursive; /* within blocks below are for variables */ 578 /* that do not have to be preserved over */ 579BOOL cur_is_word; /* a recursive call to RMATCH(). 
*/ 580BOOL condition; 581BOOL prev_is_word; 582 583unsigned long int original_ims; 584 585#ifdef SUPPORT_UCP 586int prop_type; 587int prop_value; 588int prop_fail_result; 589int prop_category; 590int prop_chartype; 591int prop_script; 592int oclength; 593uschar occhars[8]; 594#endif 595 596int codelink; 597int ctype; 598int length; 599int max; 600int min; 601int number; 602int offset; 603int op; 604int save_capture_last; 605int save_offset1, save_offset2, save_offset3; 606int stacksave[REC_STACK_SAVE_MAX]; 607 608eptrblock newptrb; 609#endif /* NO_RECURSE */ 610 611/* These statements are here to stop the compiler complaining about unitialized 612variables. */ 613 614#ifdef SUPPORT_UCP 615prop_value = 0; 616prop_fail_result = 0; 617#endif 618 619 620/* This label is used for tail recursion, which is used in a few cases even 621when NO_RECURSE is not defined, in order to reduce the amount of stack that is 622used. Thanks to Ian Taylor for noticing this possibility and sending the 623original patch. */ 624 625TAIL_RECURSE: 626 627/* OK, now we can get on with the real code of the function. Recursive calls 628are specified by the macro RMATCH and RRETURN is used to return. When 629NO_RECURSE is *not* defined, these just turn into a recursive call to match() 630and a "return", respectively (possibly with some debugging if PCRE_DEBUG is 631defined). However, RMATCH isn't like a function call because it's quite a 632complicated macro. It has to be used in one particular way. This shouldn't, 633however, impact performance when true recursion is being used. */ 634 635#ifdef SUPPORT_UTF8 636utf8 = md->utf8; /* Local copy of the flag */ 637#else 638utf8 = FALSE; 639#endif 640 641/* First check that we haven't called match() too many times, or that we 642haven't exceeded the recursive call limit. 
*/ 643 644if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 645if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); 646 647original_ims = ims; /* Save for resetting on ')' */ 648 649/* At the start of a group with an unlimited repeat that may match an empty 650string, the match_cbegroup flag is set. When this is the case, add the current 651subject pointer to the chain of such remembered pointers, to be checked when we 652hit the closing ket, in order to break infinite loops that match no characters. 653When match() is called in other circumstances, don't add to the chain. The 654match_cbegroup flag must NOT be used with tail recursion, because the memory 655block that is used is on the stack, so a new one may be required for each 656match(). */ 657 658if ((flags & match_cbegroup) != 0) 659 { 660 newptrb.epb_saved_eptr = eptr; 661 newptrb.epb_prev = eptrb; 662 eptrb = &newptrb; 663 } 664 665/* Now start processing the opcodes. */ 666 667for (;;) 668 { 669 minimize = possessive = FALSE; 670 op = *ecode; 671 672 switch(op) 673 { 674 case OP_FAIL: 675 RRETURN(MATCH_NOMATCH); 676 677 case OP_PRUNE: 678 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 679 ims, eptrb, flags, RM51); 680 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 681 RRETURN(MATCH_PRUNE); 682 683 case OP_COMMIT: 684 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 685 ims, eptrb, flags, RM52); 686 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 687 RRETURN(MATCH_COMMIT); 688 689 case OP_SKIP: 690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 691 ims, eptrb, flags, RM53); 692 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 693 md->start_match_ptr = eptr; /* Pass back current position */ 694 RRETURN(MATCH_SKIP); 695 696 case OP_THEN: 697 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 698 ims, eptrb, flags, RM54); 699 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 700 RRETURN(MATCH_THEN); 701 702 /* Handle a capturing 
bracket. If there is space in the offset vector, save 703 the current subject position in the working slot at the top of the vector. 704 We mustn't change the current values of the data slot, because they may be 705 set from a previous iteration of this group, and be referred to by a 706 reference inside the group. 707 708 If the bracket fails to match, we need to restore this value and also the 709 values of the final offsets, in case they were set by a previous iteration 710 of the same bracket. 711 712 If there isn't enough space in the offset vector, treat this as if it were 713 a non-capturing bracket. Don't worry about setting the flag for the error 714 case here; that is handled in the code for KET. */ 715 716 case OP_CBRA: 717 case OP_SCBRA: 718 number = GET2(ecode, 1+LINK_SIZE); 719 offset = number << 1; 720 721#ifdef PCRE_DEBUG 722 printf("start bracket %d\n", number); 723 printf("subject="); 724 pchars(eptr, 16, TRUE, md); 725 printf("\n"); 726#endif 727 728 if (offset < md->offset_max) 729 { 730 save_offset1 = md->offset_vector[offset]; 731 save_offset2 = md->offset_vector[offset+1]; 732 save_offset3 = md->offset_vector[md->offset_end - number]; 733 save_capture_last = md->capture_last; 734 735 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 736 md->offset_vector[md->offset_end - number] = eptr - md->start_subject; 737 738 flags = (op == OP_SCBRA)? match_cbegroup : 0; 739 do 740 { 741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, 742 ims, eptrb, flags, RM1); 743 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 744 md->capture_last = save_capture_last; 745 ecode += GET(ecode, 1); 746 } 747 while (*ecode == OP_ALT); 748 749 DPRINTF(("bracket %d failed\n", number)); 750 751 md->offset_vector[offset] = save_offset1; 752 md->offset_vector[offset+1] = save_offset2; 753 md->offset_vector[md->offset_end - number] = save_offset3; 754 755 RRETURN(MATCH_NOMATCH); 756 } 757 758 /* FALL THROUGH ... 
Insufficient room for saving captured contents. Treat 759 as a non-capturing bracket. */ 760 761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 763 764 DPRINTF(("insufficient capture room: treat as non-capturing\n")); 765 766 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 767 /* VVVVVVVVVVVVVVVVVVVVVVVVV */ 768 769 /* Non-capturing bracket. Loop for all the alternatives. When we get to the 770 final alternative within the brackets, we would return the result of a 771 recursive call to match() whatever happened. We can reduce stack usage by 772 turning this into a tail recursion, except in the case when match_cbegroup 773 is set.*/ 774 775 case OP_BRA: 776 case OP_SBRA: 777 DPRINTF(("start non-capturing bracket\n")); 778 flags = (op >= OP_SBRA)? match_cbegroup : 0; 779 for (;;) 780 { 781 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ 782 { 783 if (flags == 0) /* Not a possibly empty group */ 784 { 785 ecode += _pcre_OP_lengths[*ecode]; 786 DPRINTF(("bracket 0 tail recursion\n")); 787 goto TAIL_RECURSE; 788 } 789 790 /* Possibly empty group; can't use tail recursion. */ 791 792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, 793 eptrb, flags, RM48); 794 RRETURN(rrc); 795 } 796 797 /* For non-final alternatives, continue the loop for a NOMATCH result; 798 otherwise return. */ 799 800 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, 801 eptrb, flags, RM2); 802 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 803 ecode += GET(ecode, 1); 804 } 805 /* Control never reaches here. */ 806 807 /* Conditional group: compilation checked that there are no more than 808 two branches. If the condition is false, skipping the first branch takes us 809 past the end if there is only one branch, but that's OK because that is 810 exactly what going to the ket would do. As there is only one branch to be 811 obeyed, we can use tail recursion to avoid using another stack frame. 
*/ 812 813 case OP_COND: 814 case OP_SCOND: 815 codelink= GET(ecode, 1); 816 817 /* Because of the way auto-callout works during compile, a callout item is 818 inserted between OP_COND and an assertion condition. */ 819 820 if (ecode[LINK_SIZE+1] == OP_CALLOUT) 821 { 822 if (pcre_callout != NULL) 823 { 824 pcre_callout_block cb; 825 cb.version = 1; /* Version 1 of the callout block */ 826 cb.callout_number = ecode[LINK_SIZE+2]; 827 cb.offset_vector = md->offset_vector; 828 cb.subject = (PCRE_SPTR)md->start_subject; 829 cb.subject_length = md->end_subject - md->start_subject; 830 cb.start_match = mstart - md->start_subject; 831 cb.current_position = eptr - md->start_subject; 832 cb.pattern_position = GET(ecode, LINK_SIZE + 3); 833 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); 834 cb.capture_top = offset_top/2; 835 cb.capture_last = md->capture_last; 836 cb.callout_data = md->callout_data; 837 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); 838 if (rrc < 0) RRETURN(rrc); 839 } 840 ecode += _pcre_OP_lengths[OP_CALLOUT]; 841 } 842 843 condcode = ecode[LINK_SIZE+1]; 844 845 /* Now see what the actual condition is */ 846 847 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */ 848 { 849 if (md->recursive == NULL) /* Not recursing => FALSE */ 850 { 851 condition = FALSE; 852 ecode += GET(ecode, 1); 853 } 854 else 855 { 856 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ 857 condition = (recno == RREF_ANY || recno == md->recursive->group_num); 858 859 /* If the test is for recursion into a specific subpattern, and it is 860 false, but the test was set up by name, scan the table to see if the 861 name refers to any other numbers, and test them. The condition is true 862 if any one is set. 
*/ 863 864 if (!condition && condcode == OP_NRREF && recno != RREF_ANY) 865 { 866 uschar *slotA = md->name_table; 867 for (i = 0; i < md->name_count; i++) 868 { 869 if (GET2(slotA, 0) == recno) break; 870 slotA += md->name_entry_size; 871 } 872 873 /* Found a name for the number - there can be only one; duplicate 874 names for different numbers are allowed, but not vice versa. First 875 scan down for duplicates. */ 876 877 if (i < md->name_count) 878 { 879 uschar *slotB = slotA; 880 while (slotB > md->name_table) 881 { 882 slotB -= md->name_entry_size; 883 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 884 { 885 condition = GET2(slotB, 0) == md->recursive->group_num; 886 if (condition) break; 887 } 888 else break; 889 } 890 891 /* Scan up for duplicates */ 892 893 if (!condition) 894 { 895 slotB = slotA; 896 for (i++; i < md->name_count; i++) 897 { 898 slotB += md->name_entry_size; 899 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 900 { 901 condition = GET2(slotB, 0) == md->recursive->group_num; 902 if (condition) break; 903 } 904 else break; 905 } 906 } 907 } 908 } 909 910 /* Chose branch according to the condition */ 911 912 ecode += condition? 3 : GET(ecode, 1); 913 } 914 } 915 916 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */ 917 { 918 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ 919 condition = offset < offset_top && md->offset_vector[offset] >= 0; 920 921 /* If the numbered capture is unset, but the reference was by name, 922 scan the table to see if the name refers to any other numbers, and test 923 them. The condition is true if any one is set. This is tediously similar 924 to the code above, but not close enough to try to amalgamate. 
*/ 925 926 if (!condition && condcode == OP_NCREF) 927 { 928 int refno = offset >> 1; 929 uschar *slotA = md->name_table; 930 931 for (i = 0; i < md->name_count; i++) 932 { 933 if (GET2(slotA, 0) == refno) break; 934 slotA += md->name_entry_size; 935 } 936 937 /* Found a name for the number - there can be only one; duplicate names 938 for different numbers are allowed, but not vice versa. First scan down 939 for duplicates. */ 940 941 if (i < md->name_count) 942 { 943 uschar *slotB = slotA; 944 while (slotB > md->name_table) 945 { 946 slotB -= md->name_entry_size; 947 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 948 { 949 offset = GET2(slotB, 0) << 1; 950 condition = offset < offset_top && 951 md->offset_vector[offset] >= 0; 952 if (condition) break; 953 } 954 else break; 955 } 956 957 /* Scan up for duplicates */ 958 959 if (!condition) 960 { 961 slotB = slotA; 962 for (i++; i < md->name_count; i++) 963 { 964 slotB += md->name_entry_size; 965 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) 966 { 967 offset = GET2(slotB, 0) << 1; 968 condition = offset < offset_top && 969 md->offset_vector[offset] >= 0; 970 if (condition) break; 971 } 972 else break; 973 } 974 } 975 } 976 } 977 978 /* Chose branch according to the condition */ 979 980 ecode += condition? 3 : GET(ecode, 1); 981 } 982 983 else if (condcode == OP_DEF) /* DEFINE - always false */ 984 { 985 condition = FALSE; 986 ecode += GET(ecode, 1); 987 } 988 989 /* The condition is an assertion. Call match() to evaluate it - setting 990 the final argument match_condassert causes it to stop at the end of an 991 assertion. 
*/ 992 993 else 994 { 995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 996 match_condassert, RM3); 997 if (rrc == MATCH_MATCH) 998 { 999 condition = TRUE; 1000 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); 1001 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 1002 } 1003 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) 1004 { 1005 RRETURN(rrc); /* Need braces because of following else */ 1006 } 1007 else 1008 { 1009 condition = FALSE; 1010 ecode += codelink; 1011 } 1012 } 1013 1014 /* We are now at the branch that is to be obeyed. As there is only one, 1015 we can use tail recursion to avoid using another stack frame, except when 1016 match_cbegroup is required for an unlimited repeat of a possibly empty 1017 group. If the second alternative doesn't exist, we can just plough on. */ 1018 1019 if (condition || *ecode == OP_ALT) 1020 { 1021 ecode += 1 + LINK_SIZE; 1022 if (op == OP_SCOND) /* Possibly empty group */ 1023 { 1024 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); 1025 RRETURN(rrc); 1026 } 1027 else /* Group must match something */ 1028 { 1029 flags = 0; 1030 goto TAIL_RECURSE; 1031 } 1032 } 1033 else /* Condition false & no alternative */ 1034 { 1035 ecode += 1 + LINK_SIZE; 1036 } 1037 break; 1038 1039 1040 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, 1041 to close any currently open capturing brackets. 
*/ 1042 1043 case OP_CLOSE: 1044 number = GET2(ecode, 1); 1045 offset = number << 1; 1046 1047#ifdef PCRE_DEBUG 1048 printf("end bracket %d at *ACCEPT", number); 1049 printf("\n"); 1050#endif 1051 1052 md->capture_last = number; 1053 if (offset >= md->offset_max) md->offset_overflow = TRUE; else 1054 { 1055 md->offset_vector[offset] = 1056 md->offset_vector[md->offset_end - number]; 1057 md->offset_vector[offset+1] = eptr - md->start_subject; 1058 if (offset_top <= offset) offset_top = offset + 2; 1059 } 1060 ecode += 3; 1061 break; 1062 1063 1064 /* End of the pattern, either real or forced. If we are in a top-level 1065 recursion, we should restore the offsets appropriately and continue from 1066 after the call. */ 1067 1068 case OP_ACCEPT: 1069 case OP_END: 1070 if (md->recursive != NULL && md->recursive->group_num == 0) 1071 { 1072 recursion_info *rec = md->recursive; 1073 DPRINTF(("End of pattern in a (?0) recursion\n")); 1074 md->recursive = rec->prevrec; 1075 memmove(md->offset_vector, rec->offset_save, 1076 rec->saved_max * sizeof(int)); 1077 offset_top = rec->save_offset_top; 1078 ims = original_ims; 1079 ecode = rec->after_call; 1080 break; 1081 } 1082 1083 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is 1084 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of 1085 the subject. In both cases, backtracking will then try other alternatives, 1086 if any. */ 1087 1088 if (eptr == mstart && 1089 (md->notempty || 1090 (md->notempty_atstart && 1091 mstart == md->start_subject + md->start_offset))) 1092 RRETURN(MATCH_NOMATCH); 1093 1094 /* Otherwise, we have a match. 
*/ 1095 1096 md->end_match_ptr = eptr; /* Record where we ended */ 1097 md->end_offset_top = offset_top; /* and how many extracts were taken */ 1098 md->start_match_ptr = mstart; /* and the start (\K can modify) */ 1099 RRETURN(MATCH_MATCH); 1100 1101 /* Change option settings */ 1102 1103 case OP_OPT: 1104 ims = ecode[1]; 1105 ecode += 2; 1106 DPRINTF(("ims set to %02lx\n", ims)); 1107 break; 1108 1109 /* Assertion brackets. Check the alternative branches in turn - the 1110 matching won't pass the KET for an assertion. If any one branch matches, 1111 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 1112 start of each branch to move the current point backwards, so the code at 1113 this level is identical to the lookahead case. */ 1114 1115 case OP_ASSERT: 1116 case OP_ASSERTBACK: 1117 do 1118 { 1119 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, 1120 RM4); 1121 if (rrc == MATCH_MATCH) 1122 { 1123 mstart = md->start_match_ptr; /* In case \K reset it */ 1124 break; 1125 } 1126 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 1127 ecode += GET(ecode, 1); 1128 } 1129 while (*ecode == OP_ALT); 1130 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 1131 1132 /* If checking an assertion for a condition, return MATCH_MATCH. */ 1133 1134 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 1135 1136 /* Continue from after the assertion, updating the offsets high water 1137 mark, since extracts may have been taken during the assertion. */ 1138 1139 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1140 ecode += 1 + LINK_SIZE; 1141 offset_top = md->end_offset_top; 1142 continue; 1143 1144 /* Negative assertion: all branches must fail to match. Encountering SKIP, 1145 PRUNE, or COMMIT means we must assume failure without checking subsequent 1146 branches. 
*/ 1147 1148 case OP_ASSERT_NOT: 1149 case OP_ASSERTBACK_NOT: 1150 do 1151 { 1152 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, 1153 RM5); 1154 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); 1155 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) 1156 { 1157 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1158 break; 1159 } 1160 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 1161 ecode += GET(ecode,1); 1162 } 1163 while (*ecode == OP_ALT); 1164 1165 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 1166 1167 ecode += 1 + LINK_SIZE; 1168 continue; 1169 1170 /* Move the subject pointer back. This occurs only at the start of 1171 each branch of a lookbehind assertion. If we are too close to the start to 1172 move back, this match function fails. When working with UTF-8 we move 1173 back a number of characters, not bytes. */ 1174 1175 case OP_REVERSE: 1176#ifdef SUPPORT_UTF8 1177 if (utf8) 1178 { 1179 i = GET(ecode, 1); 1180 while (i-- > 0) 1181 { 1182 eptr--; 1183 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1184 BACKCHAR(eptr); 1185 } 1186 } 1187 else 1188#endif 1189 1190 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 1191 1192 { 1193 eptr -= GET(ecode, 1); 1194 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 1195 } 1196 1197 /* Save the earliest consulted character, then skip to next op code */ 1198 1199 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr; 1200 ecode += 1 + LINK_SIZE; 1201 break; 1202 1203 /* The callout item calls an external function, if one is provided, passing 1204 details of the match so far. This is mainly for debugging, though the 1205 function is able to force a failure. 
*/ 1206 1207 case OP_CALLOUT: 1208 if (pcre_callout != NULL) 1209 { 1210 pcre_callout_block cb; 1211 cb.version = 1; /* Version 1 of the callout block */ 1212 cb.callout_number = ecode[1]; 1213 cb.offset_vector = md->offset_vector; 1214 cb.subject = (PCRE_SPTR)md->start_subject; 1215 cb.subject_length = md->end_subject - md->start_subject; 1216 cb.start_match = mstart - md->start_subject; 1217 cb.current_position = eptr - md->start_subject; 1218 cb.pattern_position = GET(ecode, 2); 1219 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 1220 cb.capture_top = offset_top/2; 1221 cb.capture_last = md->capture_last; 1222 cb.callout_data = md->callout_data; 1223 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); 1224 if (rrc < 0) RRETURN(rrc); 1225 } 1226 ecode += 2 + 2*LINK_SIZE; 1227 break; 1228 1229 /* Recursion either matches the current regex, or some subexpression. The 1230 offset data is the offset to the starting bracket from the start of the 1231 whole pattern. (This is so that it works from duplicated subpatterns.) 1232 1233 If there are any capturing brackets started but not finished, we have to 1234 save their starting points and reinstate them after the recursion. However, 1235 we don't know how many such there are (offset_top records the completed 1236 total) so we just have to save all the potential data. There may be up to 1237 65535 such values, which is too large to put on the stack, but using malloc 1238 for small numbers seems expensive. As a compromise, the stack is used when 1239 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc 1240 is used. A problem is what to do if the malloc fails ... there is no way of 1241 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX 1242 values on the stack, and accept that the rest may be wrong. 1243 1244 There are also other values that have to be saved. We use a chained 1245 sequence of blocks that actually live on the stack. 
Thanks to Robin Houston 1246 for the original version of this logic. */ 1247 1248 case OP_RECURSE: 1249 { 1250 callpat = md->start_code + GET(ecode, 1); 1251 new_recursive.group_num = (callpat == md->start_code)? 0 : 1252 GET2(callpat, 1 + LINK_SIZE); 1253 1254 /* Add to "recursing stack" */ 1255 1256 new_recursive.prevrec = md->recursive; 1257 md->recursive = &new_recursive; 1258 1259 /* Find where to continue from afterwards */ 1260 1261 ecode += 1 + LINK_SIZE; 1262 new_recursive.after_call = ecode; 1263 1264 /* Now save the offset data. */ 1265 1266 new_recursive.saved_max = md->offset_end; 1267 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 1268 new_recursive.offset_save = stacksave; 1269 else 1270 { 1271 new_recursive.offset_save = 1272 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); 1273 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 1274 } 1275 1276 memcpy(new_recursive.offset_save, md->offset_vector, 1277 new_recursive.saved_max * sizeof(int)); 1278 new_recursive.save_offset_top = offset_top; 1279 1280 /* OK, now we can do the recursion. For each top-level alternative we 1281 restore the offset and recursion data. */ 1282 1283 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 1284 flags = (*callpat >= OP_SBRA)? 
match_cbegroup : 0; 1285 do 1286 { 1287 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, 1288 md, ims, eptrb, flags, RM6); 1289 if (rrc == MATCH_MATCH) 1290 { 1291 DPRINTF(("Recursion matched\n")); 1292 md->recursive = new_recursive.prevrec; 1293 if (new_recursive.offset_save != stacksave) 1294 (pcre_free)(new_recursive.offset_save); 1295 RRETURN(MATCH_MATCH); 1296 } 1297 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) 1298 { 1299 DPRINTF(("Recursion gave error %d\n", rrc)); 1300 if (new_recursive.offset_save != stacksave) 1301 (pcre_free)(new_recursive.offset_save); 1302 RRETURN(rrc); 1303 } 1304 1305 md->recursive = &new_recursive; 1306 memcpy(md->offset_vector, new_recursive.offset_save, 1307 new_recursive.saved_max * sizeof(int)); 1308 callpat += GET(callpat, 1); 1309 } 1310 while (*callpat == OP_ALT); 1311 1312 DPRINTF(("Recursion didn't match\n")); 1313 md->recursive = new_recursive.prevrec; 1314 if (new_recursive.offset_save != stacksave) 1315 (pcre_free)(new_recursive.offset_save); 1316 RRETURN(MATCH_NOMATCH); 1317 } 1318 /* Control never reaches here */ 1319 1320 /* "Once" brackets are like assertion brackets except that after a match, 1321 the point in the subject string is not moved back. Thus there can never be 1322 a move back into the brackets. Friedl calls these "atomic" subpatterns. 1323 Check the alternative branches in turn - the matching won't pass the KET 1324 for this kind of subpattern. If any one branch matches, we carry on as at 1325 the end of a normal bracket, leaving the subject pointer, but resetting 1326 the start-of-match value in case it was changed by \K. 
*/ 1327 1328 case OP_ONCE: 1329 prev = ecode; 1330 saved_eptr = eptr; 1331 1332 do 1333 { 1334 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); 1335 if (rrc == MATCH_MATCH) 1336 { 1337 mstart = md->start_match_ptr; 1338 break; 1339 } 1340 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); 1341 ecode += GET(ecode,1); 1342 } 1343 while (*ecode == OP_ALT); 1344 1345 /* If hit the end of the group (which could be repeated), fail */ 1346 1347 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 1348 1349 /* Continue as from after the assertion, updating the offsets high water 1350 mark, since extracts may have been taken. */ 1351 1352 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); 1353 1354 offset_top = md->end_offset_top; 1355 eptr = md->end_match_ptr; 1356 1357 /* For a non-repeating ket, just continue at this level. This also 1358 happens for a repeating ket if no characters were matched in the group. 1359 This is the forcible breaking of infinite loops as implemented in Perl 1360 5.005. If there is an options reset, it will get obeyed in the normal 1361 course of events. */ 1362 1363 if (*ecode == OP_KET || eptr == saved_eptr) 1364 { 1365 ecode += 1+LINK_SIZE; 1366 break; 1367 } 1368 1369 /* The repeating kets try the rest of the pattern or restart from the 1370 preceding bracket, in the appropriate order. The second "call" of match() 1371 uses tail recursion, to avoid using another stack frame. We need to reset 1372 any options that changed within the bracket before re-running it, so 1373 check the next opcode. 
*/ 1374 1375 if (ecode[1+LINK_SIZE] == OP_OPT) 1376 { 1377 ims = (ims & ~PCRE_IMS) | ecode[4]; 1378 DPRINTF(("ims set to %02lx at group repeat\n", ims)); 1379 } 1380 1381 if (*ecode == OP_KETRMIN) 1382 { 1383 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); 1384 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1385 ecode = prev; 1386 flags = 0; 1387 goto TAIL_RECURSE; 1388 } 1389 else /* OP_KETRMAX */ 1390 { 1391 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); 1392 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1393 ecode += 1 + LINK_SIZE; 1394 flags = 0; 1395 goto TAIL_RECURSE; 1396 } 1397 /* Control never gets here */ 1398 1399 /* An alternation is the end of a branch; scan along to find the end of the 1400 bracketed group and go to there. */ 1401 1402 case OP_ALT: 1403 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 1404 break; 1405 1406 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, 1407 indicating that it may occur zero times. It may repeat infinitely, or not 1408 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets 1409 with fixed upper repeat limits are compiled as a number of copies, with the 1410 optional ones preceded by BRAZERO or BRAMINZERO. 
*/ 1411 1412 case OP_BRAZERO: 1413 { 1414 next = ecode+1; 1415 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); 1416 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1417 do next += GET(next,1); while (*next == OP_ALT); 1418 ecode = next + 1 + LINK_SIZE; 1419 } 1420 break; 1421 1422 case OP_BRAMINZERO: 1423 { 1424 next = ecode+1; 1425 do next += GET(next, 1); while (*next == OP_ALT); 1426 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11); 1427 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1428 ecode++; 1429 } 1430 break; 1431 1432 case OP_SKIPZERO: 1433 { 1434 next = ecode+1; 1435 do next += GET(next,1); while (*next == OP_ALT); 1436 ecode = next + 1 + LINK_SIZE; 1437 } 1438 break; 1439 1440 /* End of a group, repeated or non-repeating. */ 1441 1442 case OP_KET: 1443 case OP_KETRMIN: 1444 case OP_KETRMAX: 1445 prev = ecode - GET(ecode, 1); 1446 1447 /* If this was a group that remembered the subject start, in order to break 1448 infinite repeats of empty string matches, retrieve the subject start from 1449 the chain. Otherwise, set it NULL. */ 1450 1451 if (*prev >= OP_SBRA) 1452 { 1453 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ 1454 eptrb = eptrb->epb_prev; /* Backup to previous group */ 1455 } 1456 else saved_eptr = NULL; 1457 1458 /* If we are at the end of an assertion group or an atomic group, stop 1459 matching and return MATCH_MATCH, but record the current high water mark for 1460 use by positive assertions. We also need to record the match start in case 1461 it was changed by \K. 
*/ 1462 1463 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || 1464 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || 1465 *prev == OP_ONCE) 1466 { 1467 md->end_match_ptr = eptr; /* For ONCE */ 1468 md->end_offset_top = offset_top; 1469 md->start_match_ptr = mstart; 1470 RRETURN(MATCH_MATCH); 1471 } 1472 1473 /* For capturing groups we have to check the group number back at the start 1474 and if necessary complete handling an extraction by setting the offsets and 1475 bumping the high water mark. Note that whole-pattern recursion is coded as 1476 a recurse into group 0, so it won't be picked up here. Instead, we catch it 1477 when the OP_END is reached. Other recursion is handled here. */ 1478 1479 if (*prev == OP_CBRA || *prev == OP_SCBRA) 1480 { 1481 number = GET2(prev, 1+LINK_SIZE); 1482 offset = number << 1; 1483 1484#ifdef PCRE_DEBUG 1485 printf("end bracket %d", number); 1486 printf("\n"); 1487#endif 1488 1489 md->capture_last = number; 1490 if (offset >= md->offset_max) md->offset_overflow = TRUE; else 1491 { 1492 md->offset_vector[offset] = 1493 md->offset_vector[md->offset_end - number]; 1494 md->offset_vector[offset+1] = eptr - md->start_subject; 1495 if (offset_top <= offset) offset_top = offset + 2; 1496 } 1497 1498 /* Handle a recursively called group. Restore the offsets 1499 appropriately and continue from after the call. */ 1500 1501 if (md->recursive != NULL && md->recursive->group_num == number) 1502 { 1503 recursion_info *rec = md->recursive; 1504 DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); 1505 md->recursive = rec->prevrec; 1506 memcpy(md->offset_vector, rec->offset_save, 1507 rec->saved_max * sizeof(int)); 1508 offset_top = rec->save_offset_top; 1509 ecode = rec->after_call; 1510 ims = original_ims; 1511 break; 1512 } 1513 } 1514 1515 /* For both capturing and non-capturing groups, reset the value of the ims 1516 flags, in case they got changed during the group. 
*/ 1517 1518 ims = original_ims; 1519 DPRINTF(("ims reset to %02lx\n", ims)); 1520 1521 /* For a non-repeating ket, just continue at this level. This also 1522 happens for a repeating ket if no characters were matched in the group. 1523 This is the forcible breaking of infinite loops as implemented in Perl 1524 5.005. If there is an options reset, it will get obeyed in the normal 1525 course of events. */ 1526 1527 if (*ecode == OP_KET || eptr == saved_eptr) 1528 { 1529 ecode += 1 + LINK_SIZE; 1530 break; 1531 } 1532 1533 /* The repeating kets try the rest of the pattern or restart from the 1534 preceding bracket, in the appropriate order. In the second case, we can use 1535 tail recursion to avoid using another stack frame, unless we have an 1536 unlimited repeat of a group that can match an empty string. */ 1537 1538 flags = (*prev >= OP_SBRA)? match_cbegroup : 0; 1539 1540 if (*ecode == OP_KETRMIN) 1541 { 1542 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12); 1543 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1544 if (flags != 0) /* Could match an empty string */ 1545 { 1546 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50); 1547 RRETURN(rrc); 1548 } 1549 ecode = prev; 1550 goto TAIL_RECURSE; 1551 } 1552 else /* OP_KETRMAX */ 1553 { 1554 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); 1555 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 1556 ecode += 1 + LINK_SIZE; 1557 flags = 0; 1558 goto TAIL_RECURSE; 1559 } 1560 /* Control never gets here */ 1561 1562 /* Start of subject unless notbol, or after internal newline if multiline */ 1563 1564 case OP_CIRC: 1565 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 1566 if ((ims & PCRE_MULTILINE) != 0) 1567 { 1568 if (eptr != md->start_subject && 1569 (eptr == md->end_subject || !WAS_NEWLINE(eptr))) 1570 RRETURN(MATCH_NOMATCH); 1571 ecode++; 1572 break; 1573 } 1574 /* ... 
else fall through */ 1575 1576 /* Start of subject assertion */ 1577 1578 case OP_SOD: 1579 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); 1580 ecode++; 1581 break; 1582 1583 /* Start of match assertion */ 1584 1585 case OP_SOM: 1586 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); 1587 ecode++; 1588 break; 1589 1590 /* Reset the start of match point */ 1591 1592 case OP_SET_SOM: 1593 mstart = eptr; 1594 ecode++; 1595 break; 1596 1597 /* Assert before internal newline if multiline, or before a terminating 1598 newline unless endonly is set, else end of subject unless noteol is set. */ 1599 1600 case OP_DOLL: 1601 if ((ims & PCRE_MULTILINE) != 0) 1602 { 1603 if (eptr < md->end_subject) 1604 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } 1605 else 1606 { if (md->noteol) RRETURN(MATCH_NOMATCH); } 1607 ecode++; 1608 break; 1609 } 1610 else 1611 { 1612 if (md->noteol) RRETURN(MATCH_NOMATCH); 1613 if (!md->endonly) 1614 { 1615 if (eptr != md->end_subject && 1616 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) 1617 RRETURN(MATCH_NOMATCH); 1618 ecode++; 1619 break; 1620 } 1621 } 1622 /* ... else fall through for endonly */ 1623 1624 /* End of subject assertion (\z) */ 1625 1626 case OP_EOD: 1627 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); 1628 ecode++; 1629 break; 1630 1631 /* End of subject or ending \n assertion (\Z) */ 1632 1633 case OP_EODN: 1634 if (eptr != md->end_subject && 1635 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) 1636 RRETURN(MATCH_NOMATCH); 1637 ecode++; 1638 break; 1639 1640 /* Word boundary assertions */ 1641 1642 case OP_NOT_WORD_BOUNDARY: 1643 case OP_WORD_BOUNDARY: 1644 { 1645 1646 /* Find out if the previous and current characters are "word" characters. 1647 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 1648 be "non-word" characters. Remember the earliest consulted character for 1649 partial matching. 
*/ 1650 1651#ifdef SUPPORT_UTF8 1652 if (utf8) 1653 { 1654 if (eptr == md->start_subject) prev_is_word = FALSE; else 1655 { 1656 USPTR lastptr = eptr - 1; 1657 while((*lastptr & 0xc0) == 0x80) lastptr--; 1658 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; 1659 GETCHAR(c, lastptr); 1660 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 1661 } 1662 if (eptr >= md->end_subject) 1663 { 1664 SCHECK_PARTIAL(); 1665 cur_is_word = FALSE; 1666 } 1667 else 1668 { 1669 GETCHAR(c, eptr); 1670 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 1671 } 1672 } 1673 else 1674#endif 1675 1676 /* Not in UTF-8 mode */ 1677 1678 { 1679 if (eptr == md->start_subject) prev_is_word = FALSE; else 1680 { 1681 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1; 1682 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0); 1683 } 1684 if (eptr >= md->end_subject) 1685 { 1686 SCHECK_PARTIAL(); 1687 cur_is_word = FALSE; 1688 } 1689 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); 1690 } 1691 1692 /* Now see if the situation is what we want */ 1693 1694 if ((*ecode++ == OP_WORD_BOUNDARY)? 1695 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 1696 RRETURN(MATCH_NOMATCH); 1697 } 1698 break; 1699 1700 /* Match a single character type; inline for speed */ 1701 1702 case OP_ANY: 1703 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 1704 /* Fall through */ 1705 1706 case OP_ALLANY: 1707 if (eptr++ >= md->end_subject) 1708 { 1709 SCHECK_PARTIAL(); 1710 RRETURN(MATCH_NOMATCH); 1711 } 1712 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 1713 ecode++; 1714 break; 1715 1716 /* Match a single byte, even in UTF-8 mode. This opcode really does match 1717 any byte, even newline, independent of the setting of PCRE_DOTALL. 
*/ 1718 1719 case OP_ANYBYTE: 1720 if (eptr++ >= md->end_subject) 1721 { 1722 SCHECK_PARTIAL(); 1723 RRETURN(MATCH_NOMATCH); 1724 } 1725 ecode++; 1726 break; 1727 1728 case OP_NOT_DIGIT: 1729 if (eptr >= md->end_subject) 1730 { 1731 SCHECK_PARTIAL(); 1732 RRETURN(MATCH_NOMATCH); 1733 } 1734 GETCHARINCTEST(c, eptr); 1735 if ( 1736#ifdef SUPPORT_UTF8 1737 c < 256 && 1738#endif 1739 (md->ctypes[c] & ctype_digit) != 0 1740 ) 1741 RRETURN(MATCH_NOMATCH); 1742 ecode++; 1743 break; 1744 1745 case OP_DIGIT: 1746 if (eptr >= md->end_subject) 1747 { 1748 SCHECK_PARTIAL(); 1749 RRETURN(MATCH_NOMATCH); 1750 } 1751 GETCHARINCTEST(c, eptr); 1752 if ( 1753#ifdef SUPPORT_UTF8 1754 c >= 256 || 1755#endif 1756 (md->ctypes[c] & ctype_digit) == 0 1757 ) 1758 RRETURN(MATCH_NOMATCH); 1759 ecode++; 1760 break; 1761 1762 case OP_NOT_WHITESPACE: 1763 if (eptr >= md->end_subject) 1764 { 1765 SCHECK_PARTIAL(); 1766 RRETURN(MATCH_NOMATCH); 1767 } 1768 GETCHARINCTEST(c, eptr); 1769 if ( 1770#ifdef SUPPORT_UTF8 1771 c < 256 && 1772#endif 1773 (md->ctypes[c] & ctype_space) != 0 1774 ) 1775 RRETURN(MATCH_NOMATCH); 1776 ecode++; 1777 break; 1778 1779 case OP_WHITESPACE: 1780 if (eptr >= md->end_subject) 1781 { 1782 SCHECK_PARTIAL(); 1783 RRETURN(MATCH_NOMATCH); 1784 } 1785 GETCHARINCTEST(c, eptr); 1786 if ( 1787#ifdef SUPPORT_UTF8 1788 c >= 256 || 1789#endif 1790 (md->ctypes[c] & ctype_space) == 0 1791 ) 1792 RRETURN(MATCH_NOMATCH); 1793 ecode++; 1794 break; 1795 1796 case OP_NOT_WORDCHAR: 1797 if (eptr >= md->end_subject) 1798 { 1799 SCHECK_PARTIAL(); 1800 RRETURN(MATCH_NOMATCH); 1801 } 1802 GETCHARINCTEST(c, eptr); 1803 if ( 1804#ifdef SUPPORT_UTF8 1805 c < 256 && 1806#endif 1807 (md->ctypes[c] & ctype_word) != 0 1808 ) 1809 RRETURN(MATCH_NOMATCH); 1810 ecode++; 1811 break; 1812 1813 case OP_WORDCHAR: 1814 if (eptr >= md->end_subject) 1815 { 1816 SCHECK_PARTIAL(); 1817 RRETURN(MATCH_NOMATCH); 1818 } 1819 GETCHARINCTEST(c, eptr); 1820 if ( 1821#ifdef SUPPORT_UTF8 1822 c >= 256 || 1823#endif 1824 
(md->ctypes[c] & ctype_word) == 0 1825 ) 1826 RRETURN(MATCH_NOMATCH); 1827 ecode++; 1828 break; 1829 1830 case OP_ANYNL: 1831 if (eptr >= md->end_subject) 1832 { 1833 SCHECK_PARTIAL(); 1834 RRETURN(MATCH_NOMATCH); 1835 } 1836 GETCHARINCTEST(c, eptr); 1837 switch(c) 1838 { 1839 default: RRETURN(MATCH_NOMATCH); 1840 case 0x000d: 1841 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 1842 break; 1843 1844 case 0x000a: 1845 break; 1846 1847 case 0x000b: 1848 case 0x000c: 1849 case 0x0085: 1850 case 0x2028: 1851 case 0x2029: 1852 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 1853 break; 1854 } 1855 ecode++; 1856 break; 1857 1858 case OP_NOT_HSPACE: 1859 if (eptr >= md->end_subject) 1860 { 1861 SCHECK_PARTIAL(); 1862 RRETURN(MATCH_NOMATCH); 1863 } 1864 GETCHARINCTEST(c, eptr); 1865 switch(c) 1866 { 1867 default: break; 1868 case 0x09: /* HT */ 1869 case 0x20: /* SPACE */ 1870 case 0xa0: /* NBSP */ 1871 case 0x1680: /* OGHAM SPACE MARK */ 1872 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1873 case 0x2000: /* EN QUAD */ 1874 case 0x2001: /* EM QUAD */ 1875 case 0x2002: /* EN SPACE */ 1876 case 0x2003: /* EM SPACE */ 1877 case 0x2004: /* THREE-PER-EM SPACE */ 1878 case 0x2005: /* FOUR-PER-EM SPACE */ 1879 case 0x2006: /* SIX-PER-EM SPACE */ 1880 case 0x2007: /* FIGURE SPACE */ 1881 case 0x2008: /* PUNCTUATION SPACE */ 1882 case 0x2009: /* THIN SPACE */ 1883 case 0x200A: /* HAIR SPACE */ 1884 case 0x202f: /* NARROW NO-BREAK SPACE */ 1885 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1886 case 0x3000: /* IDEOGRAPHIC SPACE */ 1887 RRETURN(MATCH_NOMATCH); 1888 } 1889 ecode++; 1890 break; 1891 1892 case OP_HSPACE: 1893 if (eptr >= md->end_subject) 1894 { 1895 SCHECK_PARTIAL(); 1896 RRETURN(MATCH_NOMATCH); 1897 } 1898 GETCHARINCTEST(c, eptr); 1899 switch(c) 1900 { 1901 default: RRETURN(MATCH_NOMATCH); 1902 case 0x09: /* HT */ 1903 case 0x20: /* SPACE */ 1904 case 0xa0: /* NBSP */ 1905 case 0x1680: /* OGHAM SPACE MARK */ 1906 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1907 case 
0x2000: /* EN QUAD */ 1908 case 0x2001: /* EM QUAD */ 1909 case 0x2002: /* EN SPACE */ 1910 case 0x2003: /* EM SPACE */ 1911 case 0x2004: /* THREE-PER-EM SPACE */ 1912 case 0x2005: /* FOUR-PER-EM SPACE */ 1913 case 0x2006: /* SIX-PER-EM SPACE */ 1914 case 0x2007: /* FIGURE SPACE */ 1915 case 0x2008: /* PUNCTUATION SPACE */ 1916 case 0x2009: /* THIN SPACE */ 1917 case 0x200A: /* HAIR SPACE */ 1918 case 0x202f: /* NARROW NO-BREAK SPACE */ 1919 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1920 case 0x3000: /* IDEOGRAPHIC SPACE */ 1921 break; 1922 } 1923 ecode++; 1924 break; 1925 1926 case OP_NOT_VSPACE: 1927 if (eptr >= md->end_subject) 1928 { 1929 SCHECK_PARTIAL(); 1930 RRETURN(MATCH_NOMATCH); 1931 } 1932 GETCHARINCTEST(c, eptr); 1933 switch(c) 1934 { 1935 default: break; 1936 case 0x0a: /* LF */ 1937 case 0x0b: /* VT */ 1938 case 0x0c: /* FF */ 1939 case 0x0d: /* CR */ 1940 case 0x85: /* NEL */ 1941 case 0x2028: /* LINE SEPARATOR */ 1942 case 0x2029: /* PARAGRAPH SEPARATOR */ 1943 RRETURN(MATCH_NOMATCH); 1944 } 1945 ecode++; 1946 break; 1947 1948 case OP_VSPACE: 1949 if (eptr >= md->end_subject) 1950 { 1951 SCHECK_PARTIAL(); 1952 RRETURN(MATCH_NOMATCH); 1953 } 1954 GETCHARINCTEST(c, eptr); 1955 switch(c) 1956 { 1957 default: RRETURN(MATCH_NOMATCH); 1958 case 0x0a: /* LF */ 1959 case 0x0b: /* VT */ 1960 case 0x0c: /* FF */ 1961 case 0x0d: /* CR */ 1962 case 0x85: /* NEL */ 1963 case 0x2028: /* LINE SEPARATOR */ 1964 case 0x2029: /* PARAGRAPH SEPARATOR */ 1965 break; 1966 } 1967 ecode++; 1968 break; 1969 1970#ifdef SUPPORT_UCP 1971 /* Check the next character by Unicode property. We will get here only 1972 if the support is in the binary; otherwise a compile-time error occurs. 
*/ 1973 1974 case OP_PROP: 1975 case OP_NOTPROP: 1976 if (eptr >= md->end_subject) 1977 { 1978 SCHECK_PARTIAL(); 1979 RRETURN(MATCH_NOMATCH); 1980 } 1981 GETCHARINCTEST(c, eptr); 1982 { 1983 const ucd_record *prop = GET_UCD(c); 1984 1985 switch(ecode[1]) 1986 { 1987 case PT_ANY: 1988 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); 1989 break; 1990 1991 case PT_LAMP: 1992 if ((prop->chartype == ucp_Lu || 1993 prop->chartype == ucp_Ll || 1994 prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) 1995 RRETURN(MATCH_NOMATCH); 1996 break; 1997 1998 case PT_GC: 1999 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) 2000 RRETURN(MATCH_NOMATCH); 2001 break; 2002 2003 case PT_PC: 2004 if ((ecode[2] != prop->chartype) == (op == OP_PROP)) 2005 RRETURN(MATCH_NOMATCH); 2006 break; 2007 2008 case PT_SC: 2009 if ((ecode[2] != prop->script) == (op == OP_PROP)) 2010 RRETURN(MATCH_NOMATCH); 2011 break; 2012 2013 default: 2014 RRETURN(PCRE_ERROR_INTERNAL); 2015 } 2016 2017 ecode += 3; 2018 } 2019 break; 2020 2021 /* Match an extended Unicode sequence. We will get here only if the support 2022 is in the binary; otherwise a compile-time error occurs. */ 2023 2024 case OP_EXTUNI: 2025 if (eptr >= md->end_subject) 2026 { 2027 SCHECK_PARTIAL(); 2028 RRETURN(MATCH_NOMATCH); 2029 } 2030 GETCHARINCTEST(c, eptr); 2031 { 2032 int category = UCD_CATEGORY(c); 2033 if (category == ucp_M) RRETURN(MATCH_NOMATCH); 2034 while (eptr < md->end_subject) 2035 { 2036 int len = 1; 2037 if (!utf8) c = *eptr; else 2038 { 2039 GETCHARLEN(c, eptr, len); 2040 } 2041 category = UCD_CATEGORY(c); 2042 if (category != ucp_M) break; 2043 eptr += len; 2044 } 2045 } 2046 ecode++; 2047 break; 2048#endif 2049 2050 2051 /* Match a back reference, possibly repeatedly. Look past the end of the 2052 item to see if there is repeat information following. The code is similar 2053 to that for character classes, but repeated for efficiency. 
Then obey 2054 similar code to character type repeats - written out again for speed. 2055 However, if the referenced string is the empty string, always treat 2056 it as matched, any number of times (otherwise there could be infinite 2057 loops). */ 2058 2059 case OP_REF: 2060 { 2061 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 2062 ecode += 3; 2063 2064 /* If the reference is unset, there are two possibilities: 2065 2066 (a) In the default, Perl-compatible state, set the length to be longer 2067 than the amount of subject left; this ensures that every attempt at a 2068 match fails. We can't just fail here, because of the possibility of 2069 quantifiers with zero minima. 2070 2071 (b) If the JavaScript compatibility flag is set, set the length to zero 2072 so that the back reference matches an empty string. 2073 2074 Otherwise, set the length to the length of what was matched by the 2075 referenced subpattern. */ 2076 2077 if (offset >= offset_top || md->offset_vector[offset] < 0) 2078 length = (md->jscript_compat)? 
0 : md->end_subject - eptr + 1; 2079 else 2080 length = md->offset_vector[offset+1] - md->offset_vector[offset]; 2081 2082 /* Set up for repetition, or handle the non-repeated case */ 2083 2084 switch (*ecode) 2085 { 2086 case OP_CRSTAR: 2087 case OP_CRMINSTAR: 2088 case OP_CRPLUS: 2089 case OP_CRMINPLUS: 2090 case OP_CRQUERY: 2091 case OP_CRMINQUERY: 2092 c = *ecode++ - OP_CRSTAR; 2093 minimize = (c & 1) != 0; 2094 min = rep_min[c]; /* Pick up values from tables; */ 2095 max = rep_max[c]; /* zero for max => infinity */ 2096 if (max == 0) max = INT_MAX; 2097 break; 2098 2099 case OP_CRRANGE: 2100 case OP_CRMINRANGE: 2101 minimize = (*ecode == OP_CRMINRANGE); 2102 min = GET2(ecode, 1); 2103 max = GET2(ecode, 3); 2104 if (max == 0) max = INT_MAX; 2105 ecode += 5; 2106 break; 2107 2108 default: /* No repeat follows */ 2109 if (!match_ref(offset, eptr, length, md, ims)) 2110 { 2111 CHECK_PARTIAL(); 2112 RRETURN(MATCH_NOMATCH); 2113 } 2114 eptr += length; 2115 continue; /* With the main loop */ 2116 } 2117 2118 /* If the length of the reference is zero, just continue with the 2119 main loop. */ 2120 2121 if (length == 0) continue; 2122 2123 /* First, ensure the minimum number of matches are present. We get back 2124 the length of the reference string explicitly rather than passing the 2125 address of eptr, so that eptr can be a register variable. */ 2126 2127 for (i = 1; i <= min; i++) 2128 { 2129 if (!match_ref(offset, eptr, length, md, ims)) 2130 { 2131 CHECK_PARTIAL(); 2132 RRETURN(MATCH_NOMATCH); 2133 } 2134 eptr += length; 2135 } 2136 2137 /* If min = max, continue at the same level without recursion. 2138 They are not both allowed to be zero. 
*/ 2139 2140 if (min == max) continue; 2141 2142 /* If minimizing, keep trying and advancing the pointer */ 2143 2144 if (minimize) 2145 { 2146 for (fi = min;; fi++) 2147 { 2148 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); 2149 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2150 if (fi >= max) RRETURN(MATCH_NOMATCH); 2151 if (!match_ref(offset, eptr, length, md, ims)) 2152 { 2153 CHECK_PARTIAL(); 2154 RRETURN(MATCH_NOMATCH); 2155 } 2156 eptr += length; 2157 } 2158 /* Control never gets here */ 2159 } 2160 2161 /* If maximizing, find the longest string and work backwards */ 2162 2163 else 2164 { 2165 pp = eptr; 2166 for (i = min; i < max; i++) 2167 { 2168 if (!match_ref(offset, eptr, length, md, ims)) 2169 { 2170 CHECK_PARTIAL(); 2171 break; 2172 } 2173 eptr += length; 2174 } 2175 while (eptr >= pp) 2176 { 2177 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); 2178 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2179 eptr -= length; 2180 } 2181 RRETURN(MATCH_NOMATCH); 2182 } 2183 } 2184 /* Control never gets here */ 2185 2186 /* Match a bit-mapped character class, possibly repeatedly. This op code is 2187 used when all the characters in the class have values in the range 0-255, 2188 and either the matching is caseful, or the characters are in the range 2189 0-127 when UTF-8 processing is enabled. The only difference between 2190 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 2191 encountered. 2192 2193 First, look past the end of the item to see if there is repeat information 2194 following. Then obey similar code to character type repeats - written out 2195 again for speed. 
*/ 2196 2197 case OP_NCLASS: 2198 case OP_CLASS: 2199 { 2200 data = ecode + 1; /* Save for matching */ 2201 ecode += 33; /* Advance past the item */ 2202 2203 switch (*ecode) 2204 { 2205 case OP_CRSTAR: 2206 case OP_CRMINSTAR: 2207 case OP_CRPLUS: 2208 case OP_CRMINPLUS: 2209 case OP_CRQUERY: 2210 case OP_CRMINQUERY: 2211 c = *ecode++ - OP_CRSTAR; 2212 minimize = (c & 1) != 0; 2213 min = rep_min[c]; /* Pick up values from tables; */ 2214 max = rep_max[c]; /* zero for max => infinity */ 2215 if (max == 0) max = INT_MAX; 2216 break; 2217 2218 case OP_CRRANGE: 2219 case OP_CRMINRANGE: 2220 minimize = (*ecode == OP_CRMINRANGE); 2221 min = GET2(ecode, 1); 2222 max = GET2(ecode, 3); 2223 if (max == 0) max = INT_MAX; 2224 ecode += 5; 2225 break; 2226 2227 default: /* No repeat follows */ 2228 min = max = 1; 2229 break; 2230 } 2231 2232 /* First, ensure the minimum number of matches are present. */ 2233 2234#ifdef SUPPORT_UTF8 2235 /* UTF-8 mode */ 2236 if (utf8) 2237 { 2238 for (i = 1; i <= min; i++) 2239 { 2240 if (eptr >= md->end_subject) 2241 { 2242 SCHECK_PARTIAL(); 2243 RRETURN(MATCH_NOMATCH); 2244 } 2245 GETCHARINC(c, eptr); 2246 if (c > 255) 2247 { 2248 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2249 } 2250 else 2251 { 2252 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2253 } 2254 } 2255 } 2256 else 2257#endif 2258 /* Not UTF-8 mode */ 2259 { 2260 for (i = 1; i <= min; i++) 2261 { 2262 if (eptr >= md->end_subject) 2263 { 2264 SCHECK_PARTIAL(); 2265 RRETURN(MATCH_NOMATCH); 2266 } 2267 c = *eptr++; 2268 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2269 } 2270 } 2271 2272 /* If max == min we can continue with the main loop without the 2273 need to recurse. */ 2274 2275 if (min == max) continue; 2276 2277 /* If minimizing, keep testing the rest of the expression and advancing 2278 the pointer while it matches the class. 
*/ 2279 2280 if (minimize) 2281 { 2282#ifdef SUPPORT_UTF8 2283 /* UTF-8 mode */ 2284 if (utf8) 2285 { 2286 for (fi = min;; fi++) 2287 { 2288 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); 2289 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2290 if (fi >= max) RRETURN(MATCH_NOMATCH); 2291 if (eptr >= md->end_subject) 2292 { 2293 SCHECK_PARTIAL(); 2294 RRETURN(MATCH_NOMATCH); 2295 } 2296 GETCHARINC(c, eptr); 2297 if (c > 255) 2298 { 2299 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 2300 } 2301 else 2302 { 2303 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2304 } 2305 } 2306 } 2307 else 2308#endif 2309 /* Not UTF-8 mode */ 2310 { 2311 for (fi = min;; fi++) 2312 { 2313 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); 2314 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2315 if (fi >= max) RRETURN(MATCH_NOMATCH); 2316 if (eptr >= md->end_subject) 2317 { 2318 SCHECK_PARTIAL(); 2319 RRETURN(MATCH_NOMATCH); 2320 } 2321 c = *eptr++; 2322 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 2323 } 2324 } 2325 /* Control never gets here */ 2326 } 2327 2328 /* If maximizing, find the longest possible run, then work backwards. 
*/ 2329 2330 else 2331 { 2332 pp = eptr; 2333 2334#ifdef SUPPORT_UTF8 2335 /* UTF-8 mode */ 2336 if (utf8) 2337 { 2338 for (i = min; i < max; i++) 2339 { 2340 int len = 1; 2341 if (eptr >= md->end_subject) 2342 { 2343 SCHECK_PARTIAL(); 2344 break; 2345 } 2346 GETCHARLEN(c, eptr, len); 2347 if (c > 255) 2348 { 2349 if (op == OP_CLASS) break; 2350 } 2351 else 2352 { 2353 if ((data[c/8] & (1 << (c&7))) == 0) break; 2354 } 2355 eptr += len; 2356 } 2357 for (;;) 2358 { 2359 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18); 2360 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2361 if (eptr-- == pp) break; /* Stop if tried at original pos */ 2362 BACKCHAR(eptr); 2363 } 2364 } 2365 else 2366#endif 2367 /* Not UTF-8 mode */ 2368 { 2369 for (i = min; i < max; i++) 2370 { 2371 if (eptr >= md->end_subject) 2372 { 2373 SCHECK_PARTIAL(); 2374 break; 2375 } 2376 c = *eptr; 2377 if ((data[c/8] & (1 << (c&7))) == 0) break; 2378 eptr++; 2379 } 2380 while (eptr >= pp) 2381 { 2382 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19); 2383 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2384 eptr--; 2385 } 2386 } 2387 2388 RRETURN(MATCH_NOMATCH); 2389 } 2390 } 2391 /* Control never gets here */ 2392 2393 2394 /* Match an extended character class. This opcode is encountered only 2395 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8 2396 mode, because Unicode properties are supported in non-UTF-8 mode. 
*/ 2397 2398#ifdef SUPPORT_UTF8 2399 case OP_XCLASS: 2400 { 2401 data = ecode + 1 + LINK_SIZE; /* Save for matching */ 2402 ecode += GET(ecode, 1); /* Advance past the item */ 2403 2404 switch (*ecode) 2405 { 2406 case OP_CRSTAR: 2407 case OP_CRMINSTAR: 2408 case OP_CRPLUS: 2409 case OP_CRMINPLUS: 2410 case OP_CRQUERY: 2411 case OP_CRMINQUERY: 2412 c = *ecode++ - OP_CRSTAR; 2413 minimize = (c & 1) != 0; 2414 min = rep_min[c]; /* Pick up values from tables; */ 2415 max = rep_max[c]; /* zero for max => infinity */ 2416 if (max == 0) max = INT_MAX; 2417 break; 2418 2419 case OP_CRRANGE: 2420 case OP_CRMINRANGE: 2421 minimize = (*ecode == OP_CRMINRANGE); 2422 min = GET2(ecode, 1); 2423 max = GET2(ecode, 3); 2424 if (max == 0) max = INT_MAX; 2425 ecode += 5; 2426 break; 2427 2428 default: /* No repeat follows */ 2429 min = max = 1; 2430 break; 2431 } 2432 2433 /* First, ensure the minimum number of matches are present. */ 2434 2435 for (i = 1; i <= min; i++) 2436 { 2437 if (eptr >= md->end_subject) 2438 { 2439 SCHECK_PARTIAL(); 2440 RRETURN(MATCH_NOMATCH); 2441 } 2442 GETCHARINCTEST(c, eptr); 2443 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); 2444 } 2445 2446 /* If max == min we can continue with the main loop without the 2447 need to recurse. */ 2448 2449 if (min == max) continue; 2450 2451 /* If minimizing, keep testing the rest of the expression and advancing 2452 the pointer while it matches the class. */ 2453 2454 if (minimize) 2455 { 2456 for (fi = min;; fi++) 2457 { 2458 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); 2459 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2460 if (fi >= max) RRETURN(MATCH_NOMATCH); 2461 if (eptr >= md->end_subject) 2462 { 2463 SCHECK_PARTIAL(); 2464 RRETURN(MATCH_NOMATCH); 2465 } 2466 GETCHARINCTEST(c, eptr); 2467 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); 2468 } 2469 /* Control never gets here */ 2470 } 2471 2472 /* If maximizing, find the longest possible run, then work backwards. 
*/ 2473 2474 else 2475 { 2476 pp = eptr; 2477 for (i = min; i < max; i++) 2478 { 2479 int len = 1; 2480 if (eptr >= md->end_subject) 2481 { 2482 SCHECK_PARTIAL(); 2483 break; 2484 } 2485 GETCHARLENTEST(c, eptr, len); 2486 if (!_pcre_xclass(c, data)) break; 2487 eptr += len; 2488 } 2489 for(;;) 2490 { 2491 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21); 2492 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2493 if (eptr-- == pp) break; /* Stop if tried at original pos */ 2494 if (utf8) BACKCHAR(eptr); 2495 } 2496 RRETURN(MATCH_NOMATCH); 2497 } 2498 2499 /* Control never gets here */ 2500 } 2501#endif /* End of XCLASS */ 2502 2503 /* Match a single character, casefully */ 2504 2505 case OP_CHAR: 2506#ifdef SUPPORT_UTF8 2507 if (utf8) 2508 { 2509 length = 1; 2510 ecode++; 2511 GETCHARLEN(fc, ecode, length); 2512 if (length > md->end_subject - eptr) 2513 { 2514 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 2515 RRETURN(MATCH_NOMATCH); 2516 } 2517 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); 2518 } 2519 else 2520#endif 2521 2522 /* Non-UTF-8 mode */ 2523 { 2524 if (md->end_subject - eptr < 1) 2525 { 2526 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 2527 RRETURN(MATCH_NOMATCH); 2528 } 2529 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); 2530 ecode += 2; 2531 } 2532 break; 2533 2534 /* Match a single character, caselessly */ 2535 2536 case OP_CHARNC: 2537#ifdef SUPPORT_UTF8 2538 if (utf8) 2539 { 2540 length = 1; 2541 ecode++; 2542 GETCHARLEN(fc, ecode, length); 2543 2544 if (length > md->end_subject - eptr) 2545 { 2546 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ 2547 RRETURN(MATCH_NOMATCH); 2548 } 2549 2550 /* If the pattern character's value is < 128, we have only one byte, and 2551 can use the fast lookup table. 
*/ 2552 2553 if (fc < 128) 2554 { 2555 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 2556 } 2557 2558 /* Otherwise we must pick up the subject character */ 2559 2560 else 2561 { 2562 unsigned int dc; 2563 GETCHARINC(dc, eptr); 2564 ecode += length; 2565 2566 /* If we have Unicode property support, we can use it to test the other 2567 case of the character, if there is one. */ 2568 2569 if (fc != dc) 2570 { 2571#ifdef SUPPORT_UCP 2572 if (dc != UCD_OTHERCASE(fc)) 2573#endif 2574 RRETURN(MATCH_NOMATCH); 2575 } 2576 } 2577 } 2578 else 2579#endif /* SUPPORT_UTF8 */ 2580 2581 /* Non-UTF-8 mode */ 2582 { 2583 if (md->end_subject - eptr < 1) 2584 { 2585 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */ 2586 RRETURN(MATCH_NOMATCH); 2587 } 2588 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 2589 ecode += 2; 2590 } 2591 break; 2592 2593 /* Match a single character repeatedly. */ 2594 2595 case OP_EXACT: 2596 min = max = GET2(ecode, 1); 2597 ecode += 3; 2598 goto REPEATCHAR; 2599 2600 case OP_POSUPTO: 2601 possessive = TRUE; 2602 /* Fall through */ 2603 2604 case OP_UPTO: 2605 case OP_MINUPTO: 2606 min = 0; 2607 max = GET2(ecode, 1); 2608 minimize = *ecode == OP_MINUPTO; 2609 ecode += 3; 2610 goto REPEATCHAR; 2611 2612 case OP_POSSTAR: 2613 possessive = TRUE; 2614 min = 0; 2615 max = INT_MAX; 2616 ecode++; 2617 goto REPEATCHAR; 2618 2619 case OP_POSPLUS: 2620 possessive = TRUE; 2621 min = 1; 2622 max = INT_MAX; 2623 ecode++; 2624 goto REPEATCHAR; 2625 2626 case OP_POSQUERY: 2627 possessive = TRUE; 2628 min = 0; 2629 max = 1; 2630 ecode++; 2631 goto REPEATCHAR; 2632 2633 case OP_STAR: 2634 case OP_MINSTAR: 2635 case OP_PLUS: 2636 case OP_MINPLUS: 2637 case OP_QUERY: 2638 case OP_MINQUERY: 2639 c = *ecode++ - OP_STAR; 2640 minimize = (c & 1) != 0; 2641 2642 min = rep_min[c]; /* Pick up values from tables; */ 2643 max = rep_max[c]; /* zero for max => infinity */ 2644 if (max == 0) max = INT_MAX; 2645 2646 /* Common code for all 
repeated single-character matches. */ 2647 2648 REPEATCHAR: 2649#ifdef SUPPORT_UTF8 2650 if (utf8) 2651 { 2652 length = 1; 2653 charptr = ecode; 2654 GETCHARLEN(fc, ecode, length); 2655 ecode += length; 2656 2657 /* Handle multibyte character matching specially here. There is 2658 support for caseless matching if UCP support is present. */ 2659 2660 if (length > 1) 2661 { 2662#ifdef SUPPORT_UCP 2663 unsigned int othercase; 2664 if ((ims & PCRE_CASELESS) != 0 && 2665 (othercase = UCD_OTHERCASE(fc)) != fc) 2666 oclength = _pcre_ord2utf8(othercase, occhars); 2667 else oclength = 0; 2668#endif /* SUPPORT_UCP */ 2669 2670 for (i = 1; i <= min; i++) 2671 { 2672 if (eptr <= md->end_subject - length && 2673 memcmp(eptr, charptr, length) == 0) eptr += length; 2674#ifdef SUPPORT_UCP 2675 else if (oclength > 0 && 2676 eptr <= md->end_subject - oclength && 2677 memcmp(eptr, occhars, oclength) == 0) eptr += oclength; 2678#endif /* SUPPORT_UCP */ 2679 else 2680 { 2681 CHECK_PARTIAL(); 2682 RRETURN(MATCH_NOMATCH); 2683 } 2684 } 2685 2686 if (min == max) continue; 2687 2688 if (minimize) 2689 { 2690 for (fi = min;; fi++) 2691 { 2692 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); 2693 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2694 if (fi >= max) RRETURN(MATCH_NOMATCH); 2695 if (eptr <= md->end_subject - length && 2696 memcmp(eptr, charptr, length) == 0) eptr += length; 2697#ifdef SUPPORT_UCP 2698 else if (oclength > 0 && 2699 eptr <= md->end_subject - oclength && 2700 memcmp(eptr, occhars, oclength) == 0) eptr += oclength; 2701#endif /* SUPPORT_UCP */ 2702 else 2703 { 2704 CHECK_PARTIAL(); 2705 RRETURN(MATCH_NOMATCH); 2706 } 2707 } 2708 /* Control never gets here */ 2709 } 2710 2711 else /* Maximize */ 2712 { 2713 pp = eptr; 2714 for (i = min; i < max; i++) 2715 { 2716 if (eptr <= md->end_subject - length && 2717 memcmp(eptr, charptr, length) == 0) eptr += length; 2718#ifdef SUPPORT_UCP 2719 else if (oclength > 0 && 2720 eptr <= md->end_subject - oclength && 2721 
memcmp(eptr, occhars, oclength) == 0) eptr += oclength; 2722#endif /* SUPPORT_UCP */ 2723 else 2724 { 2725 CHECK_PARTIAL(); 2726 break; 2727 } 2728 } 2729 2730 if (possessive) continue; 2731 2732 for(;;) 2733 { 2734 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); 2735 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2736 if (eptr == pp) { RRETURN(MATCH_NOMATCH); } 2737#ifdef SUPPORT_UCP 2738 eptr--; 2739 BACKCHAR(eptr); 2740#else /* without SUPPORT_UCP */ 2741 eptr -= length; 2742#endif /* SUPPORT_UCP */ 2743 } 2744 } 2745 /* Control never gets here */ 2746 } 2747 2748 /* If the length of a UTF-8 character is 1, we fall through here, and 2749 obey the code as for non-UTF-8 characters below, though in this case the 2750 value of fc will always be < 128. */ 2751 } 2752 else 2753#endif /* SUPPORT_UTF8 */ 2754 2755 /* When not in UTF-8 mode, load a single-byte character. */ 2756 2757 fc = *ecode++; 2758 2759 /* The value of fc at this point is always less than 256, though we may or 2760 may not be in UTF-8 mode. The code is duplicated for the caseless and 2761 caseful cases, for speed, since matching characters is likely to be quite 2762 common. First, ensure the minimum number of matches are present. If min = 2763 max, continue at the same level without recursing. Otherwise, if 2764 minimizing, keep trying the rest of the expression and advancing one 2765 matching character if failing, up to the maximum. Alternatively, if 2766 maximizing, find the maximum number of characters and work backwards. 
*/ 2767 2768 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 2769 max, eptr)); 2770 2771 if ((ims & PCRE_CASELESS) != 0) 2772 { 2773 fc = md->lcc[fc]; 2774 for (i = 1; i <= min; i++) 2775 { 2776 if (eptr >= md->end_subject) 2777 { 2778 SCHECK_PARTIAL(); 2779 RRETURN(MATCH_NOMATCH); 2780 } 2781 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 2782 } 2783 if (min == max) continue; 2784 if (minimize) 2785 { 2786 for (fi = min;; fi++) 2787 { 2788 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); 2789 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2790 if (fi >= max) RRETURN(MATCH_NOMATCH); 2791 if (eptr >= md->end_subject) 2792 { 2793 SCHECK_PARTIAL(); 2794 RRETURN(MATCH_NOMATCH); 2795 } 2796 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 2797 } 2798 /* Control never gets here */ 2799 } 2800 else /* Maximize */ 2801 { 2802 pp = eptr; 2803 for (i = min; i < max; i++) 2804 { 2805 if (eptr >= md->end_subject) 2806 { 2807 SCHECK_PARTIAL(); 2808 break; 2809 } 2810 if (fc != md->lcc[*eptr]) break; 2811 eptr++; 2812 } 2813 2814 if (possessive) continue; 2815 2816 while (eptr >= pp) 2817 { 2818 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25); 2819 eptr--; 2820 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2821 } 2822 RRETURN(MATCH_NOMATCH); 2823 } 2824 /* Control never gets here */ 2825 } 2826 2827 /* Caseful comparisons (includes all multi-byte characters) */ 2828 2829 else 2830 { 2831 for (i = 1; i <= min; i++) 2832 { 2833 if (eptr >= md->end_subject) 2834 { 2835 SCHECK_PARTIAL(); 2836 RRETURN(MATCH_NOMATCH); 2837 } 2838 if (fc != *eptr++) RRETURN(MATCH_NOMATCH); 2839 } 2840 2841 if (min == max) continue; 2842 2843 if (minimize) 2844 { 2845 for (fi = min;; fi++) 2846 { 2847 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); 2848 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2849 if (fi >= max) RRETURN(MATCH_NOMATCH); 2850 if (eptr >= md->end_subject) 2851 { 2852 SCHECK_PARTIAL(); 2853 RRETURN(MATCH_NOMATCH); 2854 } 2855 if (fc != *eptr++) 
RRETURN(MATCH_NOMATCH); 2856 } 2857 /* Control never gets here */ 2858 } 2859 else /* Maximize */ 2860 { 2861 pp = eptr; 2862 for (i = min; i < max; i++) 2863 { 2864 if (eptr >= md->end_subject) 2865 { 2866 SCHECK_PARTIAL(); 2867 break; 2868 } 2869 if (fc != *eptr) break; 2870 eptr++; 2871 } 2872 if (possessive) continue; 2873 2874 while (eptr >= pp) 2875 { 2876 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27); 2877 eptr--; 2878 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 2879 } 2880 RRETURN(MATCH_NOMATCH); 2881 } 2882 } 2883 /* Control never gets here */ 2884 2885 /* Match a negated single one-byte character. The character we are 2886 checking can be multibyte. */ 2887 2888 case OP_NOT: 2889 if (eptr >= md->end_subject) 2890 { 2891 SCHECK_PARTIAL(); 2892 RRETURN(MATCH_NOMATCH); 2893 } 2894 ecode++; 2895 GETCHARINCTEST(c, eptr); 2896 if ((ims & PCRE_CASELESS) != 0) 2897 { 2898#ifdef SUPPORT_UTF8 2899 if (c < 256) 2900#endif 2901 c = md->lcc[c]; 2902 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); 2903 } 2904 else 2905 { 2906 if (*ecode++ == c) RRETURN(MATCH_NOMATCH); 2907 } 2908 break; 2909 2910 /* Match a negated single one-byte character repeatedly. This is almost a 2911 repeat of the code for a repeated single character, but I haven't found a 2912 nice way of commoning these up that doesn't require a test of the 2913 positive/negative option for each character match. Maybe that wouldn't add 2914 very much to the time taken, but character matching *is* what this is all 2915 about... 
*/ 2916 2917 case OP_NOTEXACT: 2918 min = max = GET2(ecode, 1); 2919 ecode += 3; 2920 goto REPEATNOTCHAR; 2921 2922 case OP_NOTUPTO: 2923 case OP_NOTMINUPTO: 2924 min = 0; 2925 max = GET2(ecode, 1); 2926 minimize = *ecode == OP_NOTMINUPTO; 2927 ecode += 3; 2928 goto REPEATNOTCHAR; 2929 2930 case OP_NOTPOSSTAR: 2931 possessive = TRUE; 2932 min = 0; 2933 max = INT_MAX; 2934 ecode++; 2935 goto REPEATNOTCHAR; 2936 2937 case OP_NOTPOSPLUS: 2938 possessive = TRUE; 2939 min = 1; 2940 max = INT_MAX; 2941 ecode++; 2942 goto REPEATNOTCHAR; 2943 2944 case OP_NOTPOSQUERY: 2945 possessive = TRUE; 2946 min = 0; 2947 max = 1; 2948 ecode++; 2949 goto REPEATNOTCHAR; 2950 2951 case OP_NOTPOSUPTO: 2952 possessive = TRUE; 2953 min = 0; 2954 max = GET2(ecode, 1); 2955 ecode += 3; 2956 goto REPEATNOTCHAR; 2957 2958 case OP_NOTSTAR: 2959 case OP_NOTMINSTAR: 2960 case OP_NOTPLUS: 2961 case OP_NOTMINPLUS: 2962 case OP_NOTQUERY: 2963 case OP_NOTMINQUERY: 2964 c = *ecode++ - OP_NOTSTAR; 2965 minimize = (c & 1) != 0; 2966 min = rep_min[c]; /* Pick up values from tables; */ 2967 max = rep_max[c]; /* zero for max => infinity */ 2968 if (max == 0) max = INT_MAX; 2969 2970 /* Common code for all repeated single-byte matches. */ 2971 2972 REPEATNOTCHAR: 2973 fc = *ecode++; 2974 2975 /* The code is duplicated for the caseless and caseful cases, for speed, 2976 since matching characters is likely to be quite common. First, ensure the 2977 minimum number of matches are present. If min = max, continue at the same 2978 level without recursing. Otherwise, if minimizing, keep trying the rest of 2979 the expression and advancing one matching character if failing, up to the 2980 maximum. Alternatively, if maximizing, find the maximum number of 2981 characters and work backwards. 
*/ 2982 2983 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 2984 max, eptr)); 2985 2986 if ((ims & PCRE_CASELESS) != 0) 2987 { 2988 fc = md->lcc[fc]; 2989 2990#ifdef SUPPORT_UTF8 2991 /* UTF-8 mode */ 2992 if (utf8) 2993 { 2994 register unsigned int d; 2995 for (i = 1; i <= min; i++) 2996 { 2997 if (eptr >= md->end_subject) 2998 { 2999 SCHECK_PARTIAL(); 3000 RRETURN(MATCH_NOMATCH); 3001 } 3002 GETCHARINC(d, eptr); 3003 if (d < 256) d = md->lcc[d]; 3004 if (fc == d) RRETURN(MATCH_NOMATCH); 3005 } 3006 } 3007 else 3008#endif 3009 3010 /* Not UTF-8 mode */ 3011 { 3012 for (i = 1; i <= min; i++) 3013 { 3014 if (eptr >= md->end_subject) 3015 { 3016 SCHECK_PARTIAL(); 3017 RRETURN(MATCH_NOMATCH); 3018 } 3019 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 3020 } 3021 } 3022 3023 if (min == max) continue; 3024 3025 if (minimize) 3026 { 3027#ifdef SUPPORT_UTF8 3028 /* UTF-8 mode */ 3029 if (utf8) 3030 { 3031 register unsigned int d; 3032 for (fi = min;; fi++) 3033 { 3034 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); 3035 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3036 if (fi >= max) RRETURN(MATCH_NOMATCH); 3037 if (eptr >= md->end_subject) 3038 { 3039 SCHECK_PARTIAL(); 3040 RRETURN(MATCH_NOMATCH); 3041 } 3042 GETCHARINC(d, eptr); 3043 if (d < 256) d = md->lcc[d]; 3044 if (fc == d) RRETURN(MATCH_NOMATCH); 3045 } 3046 } 3047 else 3048#endif 3049 /* Not UTF-8 mode */ 3050 { 3051 for (fi = min;; fi++) 3052 { 3053 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); 3054 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3055 if (fi >= max) RRETURN(MATCH_NOMATCH); 3056 if (eptr >= md->end_subject) 3057 { 3058 SCHECK_PARTIAL(); 3059 RRETURN(MATCH_NOMATCH); 3060 } 3061 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 3062 } 3063 } 3064 /* Control never gets here */ 3065 } 3066 3067 /* Maximize case */ 3068 3069 else 3070 { 3071 pp = eptr; 3072 3073#ifdef SUPPORT_UTF8 3074 /* UTF-8 mode */ 3075 if (utf8) 3076 { 3077 register unsigned int d; 
3078 for (i = min; i < max; i++) 3079 { 3080 int len = 1; 3081 if (eptr >= md->end_subject) 3082 { 3083 SCHECK_PARTIAL(); 3084 break; 3085 } 3086 GETCHARLEN(d, eptr, len); 3087 if (d < 256) d = md->lcc[d]; 3088 if (fc == d) break; 3089 eptr += len; 3090 } 3091 if (possessive) continue; 3092 for(;;) 3093 { 3094 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30); 3095 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3096 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3097 BACKCHAR(eptr); 3098 } 3099 } 3100 else 3101#endif 3102 /* Not UTF-8 mode */ 3103 { 3104 for (i = min; i < max; i++) 3105 { 3106 if (eptr >= md->end_subject) 3107 { 3108 SCHECK_PARTIAL(); 3109 break; 3110 } 3111 if (fc == md->lcc[*eptr]) break; 3112 eptr++; 3113 } 3114 if (possessive) continue; 3115 while (eptr >= pp) 3116 { 3117 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31); 3118 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3119 eptr--; 3120 } 3121 } 3122 3123 RRETURN(MATCH_NOMATCH); 3124 } 3125 /* Control never gets here */ 3126 } 3127 3128 /* Caseful comparisons */ 3129 3130 else 3131 { 3132#ifdef SUPPORT_UTF8 3133 /* UTF-8 mode */ 3134 if (utf8) 3135 { 3136 register unsigned int d; 3137 for (i = 1; i <= min; i++) 3138 { 3139 if (eptr >= md->end_subject) 3140 { 3141 SCHECK_PARTIAL(); 3142 RRETURN(MATCH_NOMATCH); 3143 } 3144 GETCHARINC(d, eptr); 3145 if (fc == d) RRETURN(MATCH_NOMATCH); 3146 } 3147 } 3148 else 3149#endif 3150 /* Not UTF-8 mode */ 3151 { 3152 for (i = 1; i <= min; i++) 3153 { 3154 if (eptr >= md->end_subject) 3155 { 3156 SCHECK_PARTIAL(); 3157 RRETURN(MATCH_NOMATCH); 3158 } 3159 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3160 } 3161 } 3162 3163 if (min == max) continue; 3164 3165 if (minimize) 3166 { 3167#ifdef SUPPORT_UTF8 3168 /* UTF-8 mode */ 3169 if (utf8) 3170 { 3171 register unsigned int d; 3172 for (fi = min;; fi++) 3173 { 3174 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); 3175 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3176 if (fi >= max) 
RRETURN(MATCH_NOMATCH); 3177 if (eptr >= md->end_subject) 3178 { 3179 SCHECK_PARTIAL(); 3180 RRETURN(MATCH_NOMATCH); 3181 } 3182 GETCHARINC(d, eptr); 3183 if (fc == d) RRETURN(MATCH_NOMATCH); 3184 } 3185 } 3186 else 3187#endif 3188 /* Not UTF-8 mode */ 3189 { 3190 for (fi = min;; fi++) 3191 { 3192 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); 3193 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3194 if (fi >= max) RRETURN(MATCH_NOMATCH); 3195 if (eptr >= md->end_subject) 3196 { 3197 SCHECK_PARTIAL(); 3198 RRETURN(MATCH_NOMATCH); 3199 } 3200 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 3201 } 3202 } 3203 /* Control never gets here */ 3204 } 3205 3206 /* Maximize case */ 3207 3208 else 3209 { 3210 pp = eptr; 3211 3212#ifdef SUPPORT_UTF8 3213 /* UTF-8 mode */ 3214 if (utf8) 3215 { 3216 register unsigned int d; 3217 for (i = min; i < max; i++) 3218 { 3219 int len = 1; 3220 if (eptr >= md->end_subject) 3221 { 3222 SCHECK_PARTIAL(); 3223 break; 3224 } 3225 GETCHARLEN(d, eptr, len); 3226 if (fc == d) break; 3227 eptr += len; 3228 } 3229 if (possessive) continue; 3230 for(;;) 3231 { 3232 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34); 3233 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3234 if (eptr-- == pp) break; /* Stop if tried at original pos */ 3235 BACKCHAR(eptr); 3236 } 3237 } 3238 else 3239#endif 3240 /* Not UTF-8 mode */ 3241 { 3242 for (i = min; i < max; i++) 3243 { 3244 if (eptr >= md->end_subject) 3245 { 3246 SCHECK_PARTIAL(); 3247 break; 3248 } 3249 if (fc == *eptr) break; 3250 eptr++; 3251 } 3252 if (possessive) continue; 3253 while (eptr >= pp) 3254 { 3255 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35); 3256 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3257 eptr--; 3258 } 3259 } 3260 3261 RRETURN(MATCH_NOMATCH); 3262 } 3263 } 3264 /* Control never gets here */ 3265 3266 /* Match a single character type repeatedly; several different opcodes 3267 share code. 
This is very similar to the code for single characters, but we 3268 repeat it in the interests of efficiency. */ 3269 3270 case OP_TYPEEXACT: 3271 min = max = GET2(ecode, 1); 3272 minimize = TRUE; 3273 ecode += 3; 3274 goto REPEATTYPE; 3275 3276 case OP_TYPEUPTO: 3277 case OP_TYPEMINUPTO: 3278 min = 0; 3279 max = GET2(ecode, 1); 3280 minimize = *ecode == OP_TYPEMINUPTO; 3281 ecode += 3; 3282 goto REPEATTYPE; 3283 3284 case OP_TYPEPOSSTAR: 3285 possessive = TRUE; 3286 min = 0; 3287 max = INT_MAX; 3288 ecode++; 3289 goto REPEATTYPE; 3290 3291 case OP_TYPEPOSPLUS: 3292 possessive = TRUE; 3293 min = 1; 3294 max = INT_MAX; 3295 ecode++; 3296 goto REPEATTYPE; 3297 3298 case OP_TYPEPOSQUERY: 3299 possessive = TRUE; 3300 min = 0; 3301 max = 1; 3302 ecode++; 3303 goto REPEATTYPE; 3304 3305 case OP_TYPEPOSUPTO: 3306 possessive = TRUE; 3307 min = 0; 3308 max = GET2(ecode, 1); 3309 ecode += 3; 3310 goto REPEATTYPE; 3311 3312 case OP_TYPESTAR: 3313 case OP_TYPEMINSTAR: 3314 case OP_TYPEPLUS: 3315 case OP_TYPEMINPLUS: 3316 case OP_TYPEQUERY: 3317 case OP_TYPEMINQUERY: 3318 c = *ecode++ - OP_TYPESTAR; 3319 minimize = (c & 1) != 0; 3320 min = rep_min[c]; /* Pick up values from tables; */ 3321 max = rep_max[c]; /* zero for max => infinity */ 3322 if (max == 0) max = INT_MAX; 3323 3324 /* Common code for all repeated single character type matches. Note that 3325 in UTF-8 mode, '.' matches a character of any length, but for the other 3326 character types, the valid characters are all one-byte long. */ 3327 3328 REPEATTYPE: 3329 ctype = *ecode++; /* Code for the character type */ 3330 3331#ifdef SUPPORT_UCP 3332 if (ctype == OP_PROP || ctype == OP_NOTPROP) 3333 { 3334 prop_fail_result = ctype == OP_NOTPROP; 3335 prop_type = *ecode++; 3336 prop_value = *ecode++; 3337 } 3338 else prop_type = -1; 3339#endif 3340 3341 /* First, ensure the minimum number of matches are present. Use inline 3342 code for maximizing the speed, and do the type test once at the start 3343 (i.e. 
keep it out of the loop). Separate the UTF-8 code completely as that 3344 is tidier. Also separate the UCP code, which can be the same for both UTF-8 3345 and single-bytes. */ 3346 3347 if (min > 0) 3348 { 3349#ifdef SUPPORT_UCP 3350 if (prop_type >= 0) 3351 { 3352 switch(prop_type) 3353 { 3354 case PT_ANY: 3355 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 3356 for (i = 1; i <= min; i++) 3357 { 3358 if (eptr >= md->end_subject) 3359 { 3360 SCHECK_PARTIAL(); 3361 RRETURN(MATCH_NOMATCH); 3362 } 3363 GETCHARINCTEST(c, eptr); 3364 } 3365 break; 3366 3367 case PT_LAMP: 3368 for (i = 1; i <= min; i++) 3369 { 3370 if (eptr >= md->end_subject) 3371 { 3372 SCHECK_PARTIAL(); 3373 RRETURN(MATCH_NOMATCH); 3374 } 3375 GETCHARINCTEST(c, eptr); 3376 prop_chartype = UCD_CHARTYPE(c); 3377 if ((prop_chartype == ucp_Lu || 3378 prop_chartype == ucp_Ll || 3379 prop_chartype == ucp_Lt) == prop_fail_result) 3380 RRETURN(MATCH_NOMATCH); 3381 } 3382 break; 3383 3384 case PT_GC: 3385 for (i = 1; i <= min; i++) 3386 { 3387 if (eptr >= md->end_subject) 3388 { 3389 SCHECK_PARTIAL(); 3390 RRETURN(MATCH_NOMATCH); 3391 } 3392 GETCHARINCTEST(c, eptr); 3393 prop_category = UCD_CATEGORY(c); 3394 if ((prop_category == prop_value) == prop_fail_result) 3395 RRETURN(MATCH_NOMATCH); 3396 } 3397 break; 3398 3399 case PT_PC: 3400 for (i = 1; i <= min; i++) 3401 { 3402 if (eptr >= md->end_subject) 3403 { 3404 SCHECK_PARTIAL(); 3405 RRETURN(MATCH_NOMATCH); 3406 } 3407 GETCHARINCTEST(c, eptr); 3408 prop_chartype = UCD_CHARTYPE(c); 3409 if ((prop_chartype == prop_value) == prop_fail_result) 3410 RRETURN(MATCH_NOMATCH); 3411 } 3412 break; 3413 3414 case PT_SC: 3415 for (i = 1; i <= min; i++) 3416 { 3417 if (eptr >= md->end_subject) 3418 { 3419 SCHECK_PARTIAL(); 3420 RRETURN(MATCH_NOMATCH); 3421 } 3422 GETCHARINCTEST(c, eptr); 3423 prop_script = UCD_SCRIPT(c); 3424 if ((prop_script == prop_value) == prop_fail_result) 3425 RRETURN(MATCH_NOMATCH); 3426 } 3427 break; 3428 3429 default: 3430 
RRETURN(PCRE_ERROR_INTERNAL); 3431 } 3432 } 3433 3434 /* Match extended Unicode sequences. We will get here only if the 3435 support is in the binary; otherwise a compile-time error occurs. */ 3436 3437 else if (ctype == OP_EXTUNI) 3438 { 3439 for (i = 1; i <= min; i++) 3440 { 3441 if (eptr >= md->end_subject) 3442 { 3443 SCHECK_PARTIAL(); 3444 RRETURN(MATCH_NOMATCH); 3445 } 3446 GETCHARINCTEST(c, eptr); 3447 prop_category = UCD_CATEGORY(c); 3448 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); 3449 while (eptr < md->end_subject) 3450 { 3451 int len = 1; 3452 if (!utf8) c = *eptr; 3453 else { GETCHARLEN(c, eptr, len); } 3454 prop_category = UCD_CATEGORY(c); 3455 if (prop_category != ucp_M) break; 3456 eptr += len; 3457 } 3458 } 3459 } 3460 3461 else 3462#endif /* SUPPORT_UCP */ 3463 3464/* Handle all other cases when the coding is UTF-8 */ 3465 3466#ifdef SUPPORT_UTF8 3467 if (utf8) switch(ctype) 3468 { 3469 case OP_ANY: 3470 for (i = 1; i <= min; i++) 3471 { 3472 if (eptr >= md->end_subject) 3473 { 3474 SCHECK_PARTIAL(); 3475 RRETURN(MATCH_NOMATCH); 3476 } 3477 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 3478 eptr++; 3479 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 3480 } 3481 break; 3482 3483 case OP_ALLANY: 3484 for (i = 1; i <= min; i++) 3485 { 3486 if (eptr >= md->end_subject) 3487 { 3488 SCHECK_PARTIAL(); 3489 RRETURN(MATCH_NOMATCH); 3490 } 3491 eptr++; 3492 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 3493 } 3494 break; 3495 3496 case OP_ANYBYTE: 3497 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH); 3498 eptr += min; 3499 break; 3500 3501 case OP_ANYNL: 3502 for (i = 1; i <= min; i++) 3503 { 3504 if (eptr >= md->end_subject) 3505 { 3506 SCHECK_PARTIAL(); 3507 RRETURN(MATCH_NOMATCH); 3508 } 3509 GETCHARINC(c, eptr); 3510 switch(c) 3511 { 3512 default: RRETURN(MATCH_NOMATCH); 3513 case 0x000d: 3514 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 3515 break; 3516 3517 case 0x000a: 3518 break; 3519 3520 
case 0x000b: 3521 case 0x000c: 3522 case 0x0085: 3523 case 0x2028: 3524 case 0x2029: 3525 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 3526 break; 3527 } 3528 } 3529 break; 3530 3531 case OP_NOT_HSPACE: 3532 for (i = 1; i <= min; i++) 3533 { 3534 if (eptr >= md->end_subject) 3535 { 3536 SCHECK_PARTIAL(); 3537 RRETURN(MATCH_NOMATCH); 3538 } 3539 GETCHARINC(c, eptr); 3540 switch(c) 3541 { 3542 default: break; 3543 case 0x09: /* HT */ 3544 case 0x20: /* SPACE */ 3545 case 0xa0: /* NBSP */ 3546 case 0x1680: /* OGHAM SPACE MARK */ 3547 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 3548 case 0x2000: /* EN QUAD */ 3549 case 0x2001: /* EM QUAD */ 3550 case 0x2002: /* EN SPACE */ 3551 case 0x2003: /* EM SPACE */ 3552 case 0x2004: /* THREE-PER-EM SPACE */ 3553 case 0x2005: /* FOUR-PER-EM SPACE */ 3554 case 0x2006: /* SIX-PER-EM SPACE */ 3555 case 0x2007: /* FIGURE SPACE */ 3556 case 0x2008: /* PUNCTUATION SPACE */ 3557 case 0x2009: /* THIN SPACE */ 3558 case 0x200A: /* HAIR SPACE */ 3559 case 0x202f: /* NARROW NO-BREAK SPACE */ 3560 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 3561 case 0x3000: /* IDEOGRAPHIC SPACE */ 3562 RRETURN(MATCH_NOMATCH); 3563 } 3564 } 3565 break; 3566 3567 case OP_HSPACE: 3568 for (i = 1; i <= min; i++) 3569 { 3570 if (eptr >= md->end_subject) 3571 { 3572 SCHECK_PARTIAL(); 3573 RRETURN(MATCH_NOMATCH); 3574 } 3575 GETCHARINC(c, eptr); 3576 switch(c) 3577 { 3578 default: RRETURN(MATCH_NOMATCH); 3579 case 0x09: /* HT */ 3580 case 0x20: /* SPACE */ 3581 case 0xa0: /* NBSP */ 3582 case 0x1680: /* OGHAM SPACE MARK */ 3583 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 3584 case 0x2000: /* EN QUAD */ 3585 case 0x2001: /* EM QUAD */ 3586 case 0x2002: /* EN SPACE */ 3587 case 0x2003: /* EM SPACE */ 3588 case 0x2004: /* THREE-PER-EM SPACE */ 3589 case 0x2005: /* FOUR-PER-EM SPACE */ 3590 case 0x2006: /* SIX-PER-EM SPACE */ 3591 case 0x2007: /* FIGURE SPACE */ 3592 case 0x2008: /* PUNCTUATION SPACE */ 3593 case 0x2009: /* THIN SPACE */ 3594 case 0x200A: /* HAIR 
SPACE */ 3595 case 0x202f: /* NARROW NO-BREAK SPACE */ 3596 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 3597 case 0x3000: /* IDEOGRAPHIC SPACE */ 3598 break; 3599 } 3600 } 3601 break; 3602 3603 case OP_NOT_VSPACE: 3604 for (i = 1; i <= min; i++) 3605 { 3606 if (eptr >= md->end_subject) 3607 { 3608 SCHECK_PARTIAL(); 3609 RRETURN(MATCH_NOMATCH); 3610 } 3611 GETCHARINC(c, eptr); 3612 switch(c) 3613 { 3614 default: break; 3615 case 0x0a: /* LF */ 3616 case 0x0b: /* VT */ 3617 case 0x0c: /* FF */ 3618 case 0x0d: /* CR */ 3619 case 0x85: /* NEL */ 3620 case 0x2028: /* LINE SEPARATOR */ 3621 case 0x2029: /* PARAGRAPH SEPARATOR */ 3622 RRETURN(MATCH_NOMATCH); 3623 } 3624 } 3625 break; 3626 3627 case OP_VSPACE: 3628 for (i = 1; i <= min; i++) 3629 { 3630 if (eptr >= md->end_subject) 3631 { 3632 SCHECK_PARTIAL(); 3633 RRETURN(MATCH_NOMATCH); 3634 } 3635 GETCHARINC(c, eptr); 3636 switch(c) 3637 { 3638 default: RRETURN(MATCH_NOMATCH); 3639 case 0x0a: /* LF */ 3640 case 0x0b: /* VT */ 3641 case 0x0c: /* FF */ 3642 case 0x0d: /* CR */ 3643 case 0x85: /* NEL */ 3644 case 0x2028: /* LINE SEPARATOR */ 3645 case 0x2029: /* PARAGRAPH SEPARATOR */ 3646 break; 3647 } 3648 } 3649 break; 3650 3651 case OP_NOT_DIGIT: 3652 for (i = 1; i <= min; i++) 3653 { 3654 if (eptr >= md->end_subject) 3655 { 3656 SCHECK_PARTIAL(); 3657 RRETURN(MATCH_NOMATCH); 3658 } 3659 GETCHARINC(c, eptr); 3660 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) 3661 RRETURN(MATCH_NOMATCH); 3662 } 3663 break; 3664 3665 case OP_DIGIT: 3666 for (i = 1; i <= min; i++) 3667 { 3668 if (eptr >= md->end_subject) 3669 { 3670 SCHECK_PARTIAL(); 3671 RRETURN(MATCH_NOMATCH); 3672 } 3673 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) 3674 RRETURN(MATCH_NOMATCH); 3675 /* No need to skip more bytes - we know it's a 1-byte character */ 3676 } 3677 break; 3678 3679 case OP_NOT_WHITESPACE: 3680 for (i = 1; i <= min; i++) 3681 { 3682 if (eptr >= md->end_subject) 3683 { 3684 SCHECK_PARTIAL(); 3685 RRETURN(MATCH_NOMATCH); 
3686 } 3687 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) 3688 RRETURN(MATCH_NOMATCH); 3689 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); 3690 } 3691 break; 3692 3693 case OP_WHITESPACE: 3694 for (i = 1; i <= min; i++) 3695 { 3696 if (eptr >= md->end_subject) 3697 { 3698 SCHECK_PARTIAL(); 3699 RRETURN(MATCH_NOMATCH); 3700 } 3701 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) 3702 RRETURN(MATCH_NOMATCH); 3703 /* No need to skip more bytes - we know it's a 1-byte character */ 3704 } 3705 break; 3706 3707 case OP_NOT_WORDCHAR: 3708 for (i = 1; i <= min; i++) 3709 { 3710 if (eptr >= md->end_subject) 3711 { 3712 SCHECK_PARTIAL(); 3713 RRETURN(MATCH_NOMATCH); 3714 } 3715 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) 3716 RRETURN(MATCH_NOMATCH); 3717 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); 3718 } 3719 break; 3720 3721 case OP_WORDCHAR: 3722 for (i = 1; i <= min; i++) 3723 { 3724 if (eptr >= md->end_subject) 3725 { 3726 SCHECK_PARTIAL(); 3727 RRETURN(MATCH_NOMATCH); 3728 } 3729 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) 3730 RRETURN(MATCH_NOMATCH); 3731 /* No need to skip more bytes - we know it's a 1-byte character */ 3732 } 3733 break; 3734 3735 default: 3736 RRETURN(PCRE_ERROR_INTERNAL); 3737 } /* End switch(ctype) */ 3738 3739 else 3740#endif /* SUPPORT_UTF8 */ 3741 3742 /* Code for the non-UTF-8 case for minimum matching of operators other 3743 than OP_PROP and OP_NOTPROP. 
*/ 3744 3745 switch(ctype) 3746 { 3747 case OP_ANY: 3748 for (i = 1; i <= min; i++) 3749 { 3750 if (eptr >= md->end_subject) 3751 { 3752 SCHECK_PARTIAL(); 3753 RRETURN(MATCH_NOMATCH); 3754 } 3755 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); 3756 eptr++; 3757 } 3758 break; 3759 3760 case OP_ALLANY: 3761 if (eptr > md->end_subject - min) 3762 { 3763 SCHECK_PARTIAL(); 3764 RRETURN(MATCH_NOMATCH); 3765 } 3766 eptr += min; 3767 break; 3768 3769 case OP_ANYBYTE: 3770 if (eptr > md->end_subject - min) 3771 { 3772 SCHECK_PARTIAL(); 3773 RRETURN(MATCH_NOMATCH); 3774 } 3775 eptr += min; 3776 break; 3777 3778 case OP_ANYNL: 3779 for (i = 1; i <= min; i++) 3780 { 3781 if (eptr >= md->end_subject) 3782 { 3783 SCHECK_PARTIAL(); 3784 RRETURN(MATCH_NOMATCH); 3785 } 3786 switch(*eptr++) 3787 { 3788 default: RRETURN(MATCH_NOMATCH); 3789 case 0x000d: 3790 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 3791 break; 3792 case 0x000a: 3793 break; 3794 3795 case 0x000b: 3796 case 0x000c: 3797 case 0x0085: 3798 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 3799 break; 3800 } 3801 } 3802 break; 3803 3804 case OP_NOT_HSPACE: 3805 for (i = 1; i <= min; i++) 3806 { 3807 if (eptr >= md->end_subject) 3808 { 3809 SCHECK_PARTIAL(); 3810 RRETURN(MATCH_NOMATCH); 3811 } 3812 switch(*eptr++) 3813 { 3814 default: break; 3815 case 0x09: /* HT */ 3816 case 0x20: /* SPACE */ 3817 case 0xa0: /* NBSP */ 3818 RRETURN(MATCH_NOMATCH); 3819 } 3820 } 3821 break; 3822 3823 case OP_HSPACE: 3824 for (i = 1; i <= min; i++) 3825 { 3826 if (eptr >= md->end_subject) 3827 { 3828 SCHECK_PARTIAL(); 3829 RRETURN(MATCH_NOMATCH); 3830 } 3831 switch(*eptr++) 3832 { 3833 default: RRETURN(MATCH_NOMATCH); 3834 case 0x09: /* HT */ 3835 case 0x20: /* SPACE */ 3836 case 0xa0: /* NBSP */ 3837 break; 3838 } 3839 } 3840 break; 3841 3842 case OP_NOT_VSPACE: 3843 for (i = 1; i <= min; i++) 3844 { 3845 if (eptr >= md->end_subject) 3846 { 3847 SCHECK_PARTIAL(); 3848 RRETURN(MATCH_NOMATCH); 3849 } 3850 switch(*eptr++) 3851 { 3852 
default: break; 3853 case 0x0a: /* LF */ 3854 case 0x0b: /* VT */ 3855 case 0x0c: /* FF */ 3856 case 0x0d: /* CR */ 3857 case 0x85: /* NEL */ 3858 RRETURN(MATCH_NOMATCH); 3859 } 3860 } 3861 break; 3862 3863 case OP_VSPACE: 3864 for (i = 1; i <= min; i++) 3865 { 3866 if (eptr >= md->end_subject) 3867 { 3868 SCHECK_PARTIAL(); 3869 RRETURN(MATCH_NOMATCH); 3870 } 3871 switch(*eptr++) 3872 { 3873 default: RRETURN(MATCH_NOMATCH); 3874 case 0x0a: /* LF */ 3875 case 0x0b: /* VT */ 3876 case 0x0c: /* FF */ 3877 case 0x0d: /* CR */ 3878 case 0x85: /* NEL */ 3879 break; 3880 } 3881 } 3882 break; 3883 3884 case OP_NOT_DIGIT: 3885 for (i = 1; i <= min; i++) 3886 { 3887 if (eptr >= md->end_subject) 3888 { 3889 SCHECK_PARTIAL(); 3890 RRETURN(MATCH_NOMATCH); 3891 } 3892 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 3893 } 3894 break; 3895 3896 case OP_DIGIT: 3897 for (i = 1; i <= min; i++) 3898 { 3899 if (eptr >= md->end_subject) 3900 { 3901 SCHECK_PARTIAL(); 3902 RRETURN(MATCH_NOMATCH); 3903 } 3904 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 3905 } 3906 break; 3907 3908 case OP_NOT_WHITESPACE: 3909 for (i = 1; i <= min; i++) 3910 { 3911 if (eptr >= md->end_subject) 3912 { 3913 SCHECK_PARTIAL(); 3914 RRETURN(MATCH_NOMATCH); 3915 } 3916 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 3917 } 3918 break; 3919 3920 case OP_WHITESPACE: 3921 for (i = 1; i <= min; i++) 3922 { 3923 if (eptr >= md->end_subject) 3924 { 3925 SCHECK_PARTIAL(); 3926 RRETURN(MATCH_NOMATCH); 3927 } 3928 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 3929 } 3930 break; 3931 3932 case OP_NOT_WORDCHAR: 3933 for (i = 1; i <= min; i++) 3934 { 3935 if (eptr >= md->end_subject) 3936 { 3937 SCHECK_PARTIAL(); 3938 RRETURN(MATCH_NOMATCH); 3939 } 3940 if ((md->ctypes[*eptr++] & ctype_word) != 0) 3941 RRETURN(MATCH_NOMATCH); 3942 } 3943 break; 3944 3945 case OP_WORDCHAR: 3946 for (i = 1; i <= min; i++) 3947 { 3948 if (eptr >= 
md->end_subject) 3949 { 3950 SCHECK_PARTIAL(); 3951 RRETURN(MATCH_NOMATCH); 3952 } 3953 if ((md->ctypes[*eptr++] & ctype_word) == 0) 3954 RRETURN(MATCH_NOMATCH); 3955 } 3956 break; 3957 3958 default: 3959 RRETURN(PCRE_ERROR_INTERNAL); 3960 } 3961 } 3962 3963 /* If min = max, continue at the same level without recursing */ 3964 3965 if (min == max) continue; 3966 3967 /* If minimizing, we have to test the rest of the pattern before each 3968 subsequent match. Again, separate the UTF-8 case for speed, and also 3969 separate the UCP cases. */ 3970 3971 if (minimize) 3972 { 3973#ifdef SUPPORT_UCP 3974 if (prop_type >= 0) 3975 { 3976 switch(prop_type) 3977 { 3978 case PT_ANY: 3979 for (fi = min;; fi++) 3980 { 3981 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); 3982 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3983 if (fi >= max) RRETURN(MATCH_NOMATCH); 3984 if (eptr >= md->end_subject) 3985 { 3986 SCHECK_PARTIAL(); 3987 RRETURN(MATCH_NOMATCH); 3988 } 3989 GETCHARINC(c, eptr); 3990 if (prop_fail_result) RRETURN(MATCH_NOMATCH); 3991 } 3992 /* Control never gets here */ 3993 3994 case PT_LAMP: 3995 for (fi = min;; fi++) 3996 { 3997 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); 3998 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 3999 if (fi >= max) RRETURN(MATCH_NOMATCH); 4000 if (eptr >= md->end_subject) 4001 { 4002 SCHECK_PARTIAL(); 4003 RRETURN(MATCH_NOMATCH); 4004 } 4005 GETCHARINC(c, eptr); 4006 prop_chartype = UCD_CHARTYPE(c); 4007 if ((prop_chartype == ucp_Lu || 4008 prop_chartype == ucp_Ll || 4009 prop_chartype == ucp_Lt) == prop_fail_result) 4010 RRETURN(MATCH_NOMATCH); 4011 } 4012 /* Control never gets here */ 4013 4014 case PT_GC: 4015 for (fi = min;; fi++) 4016 { 4017 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); 4018 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4019 if (fi >= max) RRETURN(MATCH_NOMATCH); 4020 if (eptr >= md->end_subject) 4021 { 4022 SCHECK_PARTIAL(); 4023 RRETURN(MATCH_NOMATCH); 4024 } 4025 GETCHARINC(c, eptr); 4026 
prop_category = UCD_CATEGORY(c); 4027 if ((prop_category == prop_value) == prop_fail_result) 4028 RRETURN(MATCH_NOMATCH); 4029 } 4030 /* Control never gets here */ 4031 4032 case PT_PC: 4033 for (fi = min;; fi++) 4034 { 4035 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); 4036 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4037 if (fi >= max) RRETURN(MATCH_NOMATCH); 4038 if (eptr >= md->end_subject) 4039 { 4040 SCHECK_PARTIAL(); 4041 RRETURN(MATCH_NOMATCH); 4042 } 4043 GETCHARINC(c, eptr); 4044 prop_chartype = UCD_CHARTYPE(c); 4045 if ((prop_chartype == prop_value) == prop_fail_result) 4046 RRETURN(MATCH_NOMATCH); 4047 } 4048 /* Control never gets here */ 4049 4050 case PT_SC: 4051 for (fi = min;; fi++) 4052 { 4053 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); 4054 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4055 if (fi >= max) RRETURN(MATCH_NOMATCH); 4056 if (eptr >= md->end_subject) 4057 { 4058 SCHECK_PARTIAL(); 4059 RRETURN(MATCH_NOMATCH); 4060 } 4061 GETCHARINC(c, eptr); 4062 prop_script = UCD_SCRIPT(c); 4063 if ((prop_script == prop_value) == prop_fail_result) 4064 RRETURN(MATCH_NOMATCH); 4065 } 4066 /* Control never gets here */ 4067 4068 default: 4069 RRETURN(PCRE_ERROR_INTERNAL); 4070 } 4071 } 4072 4073 /* Match extended Unicode sequences. We will get here only if the 4074 support is in the binary; otherwise a compile-time error occurs. 
*/ 4075 4076 else if (ctype == OP_EXTUNI) 4077 { 4078 for (fi = min;; fi++) 4079 { 4080 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); 4081 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4082 if (fi >= max) RRETURN(MATCH_NOMATCH); 4083 if (eptr >= md->end_subject) 4084 { 4085 SCHECK_PARTIAL(); 4086 RRETURN(MATCH_NOMATCH); 4087 } 4088 GETCHARINCTEST(c, eptr); 4089 prop_category = UCD_CATEGORY(c); 4090 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); 4091 while (eptr < md->end_subject) 4092 { 4093 int len = 1; 4094 if (!utf8) c = *eptr; 4095 else { GETCHARLEN(c, eptr, len); } 4096 prop_category = UCD_CATEGORY(c); 4097 if (prop_category != ucp_M) break; 4098 eptr += len; 4099 } 4100 } 4101 } 4102 4103 else 4104#endif /* SUPPORT_UCP */ 4105 4106#ifdef SUPPORT_UTF8 4107 /* UTF-8 mode */ 4108 if (utf8) 4109 { 4110 for (fi = min;; fi++) 4111 { 4112 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); 4113 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4114 if (fi >= max) RRETURN(MATCH_NOMATCH); 4115 if (eptr >= md->end_subject) 4116 { 4117 SCHECK_PARTIAL(); 4118 RRETURN(MATCH_NOMATCH); 4119 } 4120 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 4121 RRETURN(MATCH_NOMATCH); 4122 GETCHARINC(c, eptr); 4123 switch(ctype) 4124 { 4125 case OP_ANY: /* This is the non-NL case */ 4126 case OP_ALLANY: 4127 case OP_ANYBYTE: 4128 break; 4129 4130 case OP_ANYNL: 4131 switch(c) 4132 { 4133 default: RRETURN(MATCH_NOMATCH); 4134 case 0x000d: 4135 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 4136 break; 4137 case 0x000a: 4138 break; 4139 4140 case 0x000b: 4141 case 0x000c: 4142 case 0x0085: 4143 case 0x2028: 4144 case 0x2029: 4145 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4146 break; 4147 } 4148 break; 4149 4150 case OP_NOT_HSPACE: 4151 switch(c) 4152 { 4153 default: break; 4154 case 0x09: /* HT */ 4155 case 0x20: /* SPACE */ 4156 case 0xa0: /* NBSP */ 4157 case 0x1680: /* OGHAM SPACE MARK */ 4158 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4159 case 0x2000: /* EN QUAD */ 4160 
case 0x2001: /* EM QUAD */ 4161 case 0x2002: /* EN SPACE */ 4162 case 0x2003: /* EM SPACE */ 4163 case 0x2004: /* THREE-PER-EM SPACE */ 4164 case 0x2005: /* FOUR-PER-EM SPACE */ 4165 case 0x2006: /* SIX-PER-EM SPACE */ 4166 case 0x2007: /* FIGURE SPACE */ 4167 case 0x2008: /* PUNCTUATION SPACE */ 4168 case 0x2009: /* THIN SPACE */ 4169 case 0x200A: /* HAIR SPACE */ 4170 case 0x202f: /* NARROW NO-BREAK SPACE */ 4171 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4172 case 0x3000: /* IDEOGRAPHIC SPACE */ 4173 RRETURN(MATCH_NOMATCH); 4174 } 4175 break; 4176 4177 case OP_HSPACE: 4178 switch(c) 4179 { 4180 default: RRETURN(MATCH_NOMATCH); 4181 case 0x09: /* HT */ 4182 case 0x20: /* SPACE */ 4183 case 0xa0: /* NBSP */ 4184 case 0x1680: /* OGHAM SPACE MARK */ 4185 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4186 case 0x2000: /* EN QUAD */ 4187 case 0x2001: /* EM QUAD */ 4188 case 0x2002: /* EN SPACE */ 4189 case 0x2003: /* EM SPACE */ 4190 case 0x2004: /* THREE-PER-EM SPACE */ 4191 case 0x2005: /* FOUR-PER-EM SPACE */ 4192 case 0x2006: /* SIX-PER-EM SPACE */ 4193 case 0x2007: /* FIGURE SPACE */ 4194 case 0x2008: /* PUNCTUATION SPACE */ 4195 case 0x2009: /* THIN SPACE */ 4196 case 0x200A: /* HAIR SPACE */ 4197 case 0x202f: /* NARROW NO-BREAK SPACE */ 4198 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4199 case 0x3000: /* IDEOGRAPHIC SPACE */ 4200 break; 4201 } 4202 break; 4203 4204 case OP_NOT_VSPACE: 4205 switch(c) 4206 { 4207 default: break; 4208 case 0x0a: /* LF */ 4209 case 0x0b: /* VT */ 4210 case 0x0c: /* FF */ 4211 case 0x0d: /* CR */ 4212 case 0x85: /* NEL */ 4213 case 0x2028: /* LINE SEPARATOR */ 4214 case 0x2029: /* PARAGRAPH SEPARATOR */ 4215 RRETURN(MATCH_NOMATCH); 4216 } 4217 break; 4218 4219 case OP_VSPACE: 4220 switch(c) 4221 { 4222 default: RRETURN(MATCH_NOMATCH); 4223 case 0x0a: /* LF */ 4224 case 0x0b: /* VT */ 4225 case 0x0c: /* FF */ 4226 case 0x0d: /* CR */ 4227 case 0x85: /* NEL */ 4228 case 0x2028: /* LINE SEPARATOR */ 4229 case 0x2029: /* PARAGRAPH 
SEPARATOR */ 4230 break; 4231 } 4232 break; 4233 4234 case OP_NOT_DIGIT: 4235 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) 4236 RRETURN(MATCH_NOMATCH); 4237 break; 4238 4239 case OP_DIGIT: 4240 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) 4241 RRETURN(MATCH_NOMATCH); 4242 break; 4243 4244 case OP_NOT_WHITESPACE: 4245 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 4246 RRETURN(MATCH_NOMATCH); 4247 break; 4248 4249 case OP_WHITESPACE: 4250 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) 4251 RRETURN(MATCH_NOMATCH); 4252 break; 4253 4254 case OP_NOT_WORDCHAR: 4255 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) 4256 RRETURN(MATCH_NOMATCH); 4257 break; 4258 4259 case OP_WORDCHAR: 4260 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) 4261 RRETURN(MATCH_NOMATCH); 4262 break; 4263 4264 default: 4265 RRETURN(PCRE_ERROR_INTERNAL); 4266 } 4267 } 4268 } 4269 else 4270#endif 4271 /* Not UTF-8 mode */ 4272 { 4273 for (fi = min;; fi++) 4274 { 4275 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); 4276 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4277 if (fi >= max) RRETURN(MATCH_NOMATCH); 4278 if (eptr >= md->end_subject) 4279 { 4280 SCHECK_PARTIAL(); 4281 RRETURN(MATCH_NOMATCH); 4282 } 4283 if (ctype == OP_ANY && IS_NEWLINE(eptr)) 4284 RRETURN(MATCH_NOMATCH); 4285 c = *eptr++; 4286 switch(ctype) 4287 { 4288 case OP_ANY: /* This is the non-NL case */ 4289 case OP_ALLANY: 4290 case OP_ANYBYTE: 4291 break; 4292 4293 case OP_ANYNL: 4294 switch(c) 4295 { 4296 default: RRETURN(MATCH_NOMATCH); 4297 case 0x000d: 4298 if (eptr < md->end_subject && *eptr == 0x0a) eptr++; 4299 break; 4300 4301 case 0x000a: 4302 break; 4303 4304 case 0x000b: 4305 case 0x000c: 4306 case 0x0085: 4307 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); 4308 break; 4309 } 4310 break; 4311 4312 case OP_NOT_HSPACE: 4313 switch(c) 4314 { 4315 default: break; 4316 case 0x09: /* HT */ 4317 case 0x20: /* SPACE */ 4318 case 0xa0: /* NBSP */ 4319 RRETURN(MATCH_NOMATCH); 4320 } 4321 break; 4322 
4323 case OP_HSPACE: 4324 switch(c) 4325 { 4326 default: RRETURN(MATCH_NOMATCH); 4327 case 0x09: /* HT */ 4328 case 0x20: /* SPACE */ 4329 case 0xa0: /* NBSP */ 4330 break; 4331 } 4332 break; 4333 4334 case OP_NOT_VSPACE: 4335 switch(c) 4336 { 4337 default: break; 4338 case 0x0a: /* LF */ 4339 case 0x0b: /* VT */ 4340 case 0x0c: /* FF */ 4341 case 0x0d: /* CR */ 4342 case 0x85: /* NEL */ 4343 RRETURN(MATCH_NOMATCH); 4344 } 4345 break; 4346 4347 case OP_VSPACE: 4348 switch(c) 4349 { 4350 default: RRETURN(MATCH_NOMATCH); 4351 case 0x0a: /* LF */ 4352 case 0x0b: /* VT */ 4353 case 0x0c: /* FF */ 4354 case 0x0d: /* CR */ 4355 case 0x85: /* NEL */ 4356 break; 4357 } 4358 break; 4359 4360 case OP_NOT_DIGIT: 4361 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 4362 break; 4363 4364 case OP_DIGIT: 4365 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 4366 break; 4367 4368 case OP_NOT_WHITESPACE: 4369 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 4370 break; 4371 4372 case OP_WHITESPACE: 4373 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 4374 break; 4375 4376 case OP_NOT_WORDCHAR: 4377 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 4378 break; 4379 4380 case OP_WORDCHAR: 4381 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 4382 break; 4383 4384 default: 4385 RRETURN(PCRE_ERROR_INTERNAL); 4386 } 4387 } 4388 } 4389 /* Control never gets here */ 4390 } 4391 4392 /* If maximizing, it is worth using inline code for speed, doing the type 4393 test once at the start (i.e. keep it out of the loop). Again, keep the 4394 UTF-8 and UCP stuff separate. 
*/ 4395 4396 else 4397 { 4398 pp = eptr; /* Remember where we started */ 4399 4400#ifdef SUPPORT_UCP 4401 if (prop_type >= 0) 4402 { 4403 switch(prop_type) 4404 { 4405 case PT_ANY: 4406 for (i = min; i < max; i++) 4407 { 4408 int len = 1; 4409 if (eptr >= md->end_subject) 4410 { 4411 SCHECK_PARTIAL(); 4412 break; 4413 } 4414 GETCHARLEN(c, eptr, len); 4415 if (prop_fail_result) break; 4416 eptr+= len; 4417 } 4418 break; 4419 4420 case PT_LAMP: 4421 for (i = min; i < max; i++) 4422 { 4423 int len = 1; 4424 if (eptr >= md->end_subject) 4425 { 4426 SCHECK_PARTIAL(); 4427 break; 4428 } 4429 GETCHARLEN(c, eptr, len); 4430 prop_chartype = UCD_CHARTYPE(c); 4431 if ((prop_chartype == ucp_Lu || 4432 prop_chartype == ucp_Ll || 4433 prop_chartype == ucp_Lt) == prop_fail_result) 4434 break; 4435 eptr+= len; 4436 } 4437 break; 4438 4439 case PT_GC: 4440 for (i = min; i < max; i++) 4441 { 4442 int len = 1; 4443 if (eptr >= md->end_subject) 4444 { 4445 SCHECK_PARTIAL(); 4446 break; 4447 } 4448 GETCHARLEN(c, eptr, len); 4449 prop_category = UCD_CATEGORY(c); 4450 if ((prop_category == prop_value) == prop_fail_result) 4451 break; 4452 eptr+= len; 4453 } 4454 break; 4455 4456 case PT_PC: 4457 for (i = min; i < max; i++) 4458 { 4459 int len = 1; 4460 if (eptr >= md->end_subject) 4461 { 4462 SCHECK_PARTIAL(); 4463 break; 4464 } 4465 GETCHARLEN(c, eptr, len); 4466 prop_chartype = UCD_CHARTYPE(c); 4467 if ((prop_chartype == prop_value) == prop_fail_result) 4468 break; 4469 eptr+= len; 4470 } 4471 break; 4472 4473 case PT_SC: 4474 for (i = min; i < max; i++) 4475 { 4476 int len = 1; 4477 if (eptr >= md->end_subject) 4478 { 4479 SCHECK_PARTIAL(); 4480 break; 4481 } 4482 GETCHARLEN(c, eptr, len); 4483 prop_script = UCD_SCRIPT(c); 4484 if ((prop_script == prop_value) == prop_fail_result) 4485 break; 4486 eptr+= len; 4487 } 4488 break; 4489 } 4490 4491 /* eptr is now past the end of the maximum run */ 4492 4493 if (possessive) continue; 4494 for(;;) 4495 { 4496 RMATCH(eptr, ecode, offset_top, 
md, ims, eptrb, 0, RM44); 4497 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4498 if (eptr-- == pp) break; /* Stop if tried at original pos */ 4499 if (utf8) BACKCHAR(eptr); 4500 } 4501 } 4502 4503 /* Match extended Unicode sequences. We will get here only if the 4504 support is in the binary; otherwise a compile-time error occurs. */ 4505 4506 else if (ctype == OP_EXTUNI) 4507 { 4508 for (i = min; i < max; i++) 4509 { 4510 if (eptr >= md->end_subject) 4511 { 4512 SCHECK_PARTIAL(); 4513 break; 4514 } 4515 GETCHARINCTEST(c, eptr); 4516 prop_category = UCD_CATEGORY(c); 4517 if (prop_category == ucp_M) break; 4518 while (eptr < md->end_subject) 4519 { 4520 int len = 1; 4521 if (!utf8) c = *eptr; else 4522 { 4523 GETCHARLEN(c, eptr, len); 4524 } 4525 prop_category = UCD_CATEGORY(c); 4526 if (prop_category != ucp_M) break; 4527 eptr += len; 4528 } 4529 } 4530 4531 /* eptr is now past the end of the maximum run */ 4532 4533 if (possessive) continue; 4534 4535 for(;;) 4536 { 4537 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45); 4538 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4539 if (eptr-- == pp) break; /* Stop if tried at original pos */ 4540 for (;;) /* Move back over one extended */ 4541 { 4542 int len = 1; 4543 if (!utf8) c = *eptr; else 4544 { 4545 BACKCHAR(eptr); 4546 GETCHARLEN(c, eptr, len); 4547 } 4548 prop_category = UCD_CATEGORY(c); 4549 if (prop_category != ucp_M) break; 4550 eptr--; 4551 } 4552 } 4553 } 4554 4555 else 4556#endif /* SUPPORT_UCP */ 4557 4558#ifdef SUPPORT_UTF8 4559 /* UTF-8 mode */ 4560 4561 if (utf8) 4562 { 4563 switch(ctype) 4564 { 4565 case OP_ANY: 4566 if (max < INT_MAX) 4567 { 4568 for (i = min; i < max; i++) 4569 { 4570 if (eptr >= md->end_subject) 4571 { 4572 SCHECK_PARTIAL(); 4573 break; 4574 } 4575 if (IS_NEWLINE(eptr)) break; 4576 eptr++; 4577 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 4578 } 4579 } 4580 4581 /* Handle unlimited UTF-8 repeat */ 4582 4583 else 4584 { 4585 for (i = min; i < max; i++) 4586 { 4587 
if (eptr >= md->end_subject) 4588 { 4589 SCHECK_PARTIAL(); 4590 break; 4591 } 4592 if (IS_NEWLINE(eptr)) break; 4593 eptr++; 4594 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 4595 } 4596 } 4597 break; 4598 4599 case OP_ALLANY: 4600 if (max < INT_MAX) 4601 { 4602 for (i = min; i < max; i++) 4603 { 4604 if (eptr >= md->end_subject) 4605 { 4606 SCHECK_PARTIAL(); 4607 break; 4608 } 4609 eptr++; 4610 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 4611 } 4612 } 4613 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */ 4614 break; 4615 4616 /* The byte case is the same as non-UTF8 */ 4617 4618 case OP_ANYBYTE: 4619 c = max - min; 4620 if (c > (unsigned int)(md->end_subject - eptr)) 4621 { 4622 eptr = md->end_subject; 4623 SCHECK_PARTIAL(); 4624 } 4625 else eptr += c; 4626 break; 4627 4628 case OP_ANYNL: 4629 for (i = min; i < max; i++) 4630 { 4631 int len = 1; 4632 if (eptr >= md->end_subject) 4633 { 4634 SCHECK_PARTIAL(); 4635 break; 4636 } 4637 GETCHARLEN(c, eptr, len); 4638 if (c == 0x000d) 4639 { 4640 if (++eptr >= md->end_subject) break; 4641 if (*eptr == 0x000a) eptr++; 4642 } 4643 else 4644 { 4645 if (c != 0x000a && 4646 (md->bsr_anycrlf || 4647 (c != 0x000b && c != 0x000c && 4648 c != 0x0085 && c != 0x2028 && c != 0x2029))) 4649 break; 4650 eptr += len; 4651 } 4652 } 4653 break; 4654 4655 case OP_NOT_HSPACE: 4656 case OP_HSPACE: 4657 for (i = min; i < max; i++) 4658 { 4659 BOOL gotspace; 4660 int len = 1; 4661 if (eptr >= md->end_subject) 4662 { 4663 SCHECK_PARTIAL(); 4664 break; 4665 } 4666 GETCHARLEN(c, eptr, len); 4667 switch(c) 4668 { 4669 default: gotspace = FALSE; break; 4670 case 0x09: /* HT */ 4671 case 0x20: /* SPACE */ 4672 case 0xa0: /* NBSP */ 4673 case 0x1680: /* OGHAM SPACE MARK */ 4674 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 4675 case 0x2000: /* EN QUAD */ 4676 case 0x2001: /* EM QUAD */ 4677 case 0x2002: /* EN SPACE */ 4678 case 0x2003: /* EM SPACE */ 4679 case 0x2004: /* THREE-PER-EM SPACE */ 4680 
case 0x2005: /* FOUR-PER-EM SPACE */ 4681 case 0x2006: /* SIX-PER-EM SPACE */ 4682 case 0x2007: /* FIGURE SPACE */ 4683 case 0x2008: /* PUNCTUATION SPACE */ 4684 case 0x2009: /* THIN SPACE */ 4685 case 0x200A: /* HAIR SPACE */ 4686 case 0x202f: /* NARROW NO-BREAK SPACE */ 4687 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 4688 case 0x3000: /* IDEOGRAPHIC SPACE */ 4689 gotspace = TRUE; 4690 break; 4691 } 4692 if (gotspace == (ctype == OP_NOT_HSPACE)) break; 4693 eptr += len; 4694 } 4695 break; 4696 4697 case OP_NOT_VSPACE: 4698 case OP_VSPACE: 4699 for (i = min; i < max; i++) 4700 { 4701 BOOL gotspace; 4702 int len = 1; 4703 if (eptr >= md->end_subject) 4704 { 4705 SCHECK_PARTIAL(); 4706 break; 4707 } 4708 GETCHARLEN(c, eptr, len); 4709 switch(c) 4710 { 4711 default: gotspace = FALSE; break; 4712 case 0x0a: /* LF */ 4713 case 0x0b: /* VT */ 4714 case 0x0c: /* FF */ 4715 case 0x0d: /* CR */ 4716 case 0x85: /* NEL */ 4717 case 0x2028: /* LINE SEPARATOR */ 4718 case 0x2029: /* PARAGRAPH SEPARATOR */ 4719 gotspace = TRUE; 4720 break; 4721 } 4722 if (gotspace == (ctype == OP_NOT_VSPACE)) break; 4723 eptr += len; 4724 } 4725 break; 4726 4727 case OP_NOT_DIGIT: 4728 for (i = min; i < max; i++) 4729 { 4730 int len = 1; 4731 if (eptr >= md->end_subject) 4732 { 4733 SCHECK_PARTIAL(); 4734 break; 4735 } 4736 GETCHARLEN(c, eptr, len); 4737 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; 4738 eptr+= len; 4739 } 4740 break; 4741 4742 case OP_DIGIT: 4743 for (i = min; i < max; i++) 4744 { 4745 int len = 1; 4746 if (eptr >= md->end_subject) 4747 { 4748 SCHECK_PARTIAL(); 4749 break; 4750 } 4751 GETCHARLEN(c, eptr, len); 4752 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; 4753 eptr+= len; 4754 } 4755 break; 4756 4757 case OP_NOT_WHITESPACE: 4758 for (i = min; i < max; i++) 4759 { 4760 int len = 1; 4761 if (eptr >= md->end_subject) 4762 { 4763 SCHECK_PARTIAL(); 4764 break; 4765 } 4766 GETCHARLEN(c, eptr, len); 4767 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 
break; 4768 eptr+= len; 4769 } 4770 break; 4771 4772 case OP_WHITESPACE: 4773 for (i = min; i < max; i++) 4774 { 4775 int len = 1; 4776 if (eptr >= md->end_subject) 4777 { 4778 SCHECK_PARTIAL(); 4779 break; 4780 } 4781 GETCHARLEN(c, eptr, len); 4782 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; 4783 eptr+= len; 4784 } 4785 break; 4786 4787 case OP_NOT_WORDCHAR: 4788 for (i = min; i < max; i++) 4789 { 4790 int len = 1; 4791 if (eptr >= md->end_subject) 4792 { 4793 SCHECK_PARTIAL(); 4794 break; 4795 } 4796 GETCHARLEN(c, eptr, len); 4797 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; 4798 eptr+= len; 4799 } 4800 break; 4801 4802 case OP_WORDCHAR: 4803 for (i = min; i < max; i++) 4804 { 4805 int len = 1; 4806 if (eptr >= md->end_subject) 4807 { 4808 SCHECK_PARTIAL(); 4809 break; 4810 } 4811 GETCHARLEN(c, eptr, len); 4812 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; 4813 eptr+= len; 4814 } 4815 break; 4816 4817 default: 4818 RRETURN(PCRE_ERROR_INTERNAL); 4819 } 4820 4821 /* eptr is now past the end of the maximum run */ 4822 4823 if (possessive) continue; 4824 for(;;) 4825 { 4826 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46); 4827 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 4828 if (eptr-- == pp) break; /* Stop if tried at original pos */ 4829 BACKCHAR(eptr); 4830 } 4831 } 4832 else 4833#endif /* SUPPORT_UTF8 */ 4834 4835 /* Not UTF-8 mode */ 4836 { 4837 switch(ctype) 4838 { 4839 case OP_ANY: 4840 for (i = min; i < max; i++) 4841 { 4842 if (eptr >= md->end_subject) 4843 { 4844 SCHECK_PARTIAL(); 4845 break; 4846 } 4847 if (IS_NEWLINE(eptr)) break; 4848 eptr++; 4849 } 4850 break; 4851 4852 case OP_ALLANY: 4853 case OP_ANYBYTE: 4854 c = max - min; 4855 if (c > (unsigned int)(md->end_subject - eptr)) 4856 { 4857 eptr = md->end_subject; 4858 SCHECK_PARTIAL(); 4859 } 4860 else eptr += c; 4861 break; 4862 4863 case OP_ANYNL: 4864 for (i = min; i < max; i++) 4865 { 4866 if (eptr >= md->end_subject) 4867 { 4868 SCHECK_PARTIAL(); 4869 
break; 4870 } 4871 c = *eptr; 4872 if (c == 0x000d) 4873 { 4874 if (++eptr >= md->end_subject) break; 4875 if (*eptr == 0x000a) eptr++; 4876 } 4877 else 4878 { 4879 if (c != 0x000a && 4880 (md->bsr_anycrlf || 4881 (c != 0x000b && c != 0x000c && c != 0x0085))) 4882 break; 4883 eptr++; 4884 } 4885 } 4886 break; 4887 4888 case OP_NOT_HSPACE: 4889 for (i = min; i < max; i++) 4890 { 4891 if (eptr >= md->end_subject) 4892 { 4893 SCHECK_PARTIAL(); 4894 break; 4895 } 4896 c = *eptr; 4897 if (c == 0x09 || c == 0x20 || c == 0xa0) break; 4898 eptr++; 4899 } 4900 break; 4901 4902 case OP_HSPACE: 4903 for (i = min; i < max; i++) 4904 { 4905 if (eptr >= md->end_subject) 4906 { 4907 SCHECK_PARTIAL(); 4908 break; 4909 } 4910 c = *eptr; 4911 if (c != 0x09 && c != 0x20 && c != 0xa0) break; 4912 eptr++; 4913 } 4914 break; 4915 4916 case OP_NOT_VSPACE: 4917 for (i = min; i < max; i++) 4918 { 4919 if (eptr >= md->end_subject) 4920 { 4921 SCHECK_PARTIAL(); 4922 break; 4923 } 4924 c = *eptr; 4925 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85) 4926 break; 4927 eptr++; 4928 } 4929 break; 4930 4931 case OP_VSPACE: 4932 for (i = min; i < max; i++) 4933 { 4934 if (eptr >= md->end_subject) 4935 { 4936 SCHECK_PARTIAL(); 4937 break; 4938 } 4939 c = *eptr; 4940 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) 4941 break; 4942 eptr++; 4943 } 4944 break; 4945 4946 case OP_NOT_DIGIT: 4947 for (i = min; i < max; i++) 4948 { 4949 if (eptr >= md->end_subject) 4950 { 4951 SCHECK_PARTIAL(); 4952 break; 4953 } 4954 if ((md->ctypes[*eptr] & ctype_digit) != 0) break; 4955 eptr++; 4956 } 4957 break; 4958 4959 case OP_DIGIT: 4960 for (i = min; i < max; i++) 4961 { 4962 if (eptr >= md->end_subject) 4963 { 4964 SCHECK_PARTIAL(); 4965 break; 4966 } 4967 if ((md->ctypes[*eptr] & ctype_digit) == 0) break; 4968 eptr++; 4969 } 4970 break; 4971 4972 case OP_NOT_WHITESPACE: 4973 for (i = min; i < max; i++) 4974 { 4975 if (eptr >= md->end_subject) 4976 { 4977 SCHECK_PARTIAL(); 4978 
break; 4979 } 4980 if ((md->ctypes[*eptr] & ctype_space) != 0) break; 4981 eptr++; 4982 } 4983 break; 4984 4985 case OP_WHITESPACE: 4986 for (i = min; i < max; i++) 4987 { 4988 if (eptr >= md->end_subject) 4989 { 4990 SCHECK_PARTIAL(); 4991 break; 4992 } 4993 if ((md->ctypes[*eptr] & ctype_space) == 0) break; 4994 eptr++; 4995 } 4996 break; 4997 4998 case OP_NOT_WORDCHAR: 4999 for (i = min; i < max; i++) 5000 { 5001 if (eptr >= md->end_subject) 5002 { 5003 SCHECK_PARTIAL(); 5004 break; 5005 } 5006 if ((md->ctypes[*eptr] & ctype_word) != 0) break; 5007 eptr++; 5008 } 5009 break; 5010 5011 case OP_WORDCHAR: 5012 for (i = min; i < max; i++) 5013 { 5014 if (eptr >= md->end_subject) 5015 { 5016 SCHECK_PARTIAL(); 5017 break; 5018 } 5019 if ((md->ctypes[*eptr] & ctype_word) == 0) break; 5020 eptr++; 5021 } 5022 break; 5023 5024 default: 5025 RRETURN(PCRE_ERROR_INTERNAL); 5026 } 5027 5028 /* eptr is now past the end of the maximum run */ 5029 5030 if (possessive) continue; 5031 while (eptr >= pp) 5032 { 5033 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47); 5034 eptr--; 5035 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 5036 } 5037 } 5038 5039 /* Get here if we can't make it match with any permitted repetitions */ 5040 5041 RRETURN(MATCH_NOMATCH); 5042 } 5043 /* Control never gets here */ 5044 5045 /* There's been some horrible disaster. Arrival here can only mean there is 5046 something seriously wrong in the code above or the OP_xxx definitions. */ 5047 5048 default: 5049 DPRINTF(("Unknown opcode %d\n", *ecode)); 5050 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); 5051 } 5052 5053 /* Do not stick any code in here without much thought; it is assumed 5054 that "continue" in the code above comes out to here to repeat the main 5055 loop. */ 5056 5057 } /* End of main loop */ 5058/* Control never reaches here */ 5059 5060 5061/* When compiling to use the heap rather than the stack for recursive calls to 5062match(), the RRETURN() macro jumps here. 
The number that is saved in 5063frame->Xwhere indicates which label we actually want to return to. */ 5064 5065#ifdef NO_RECURSE 5066#define LBL(val) case val: goto L_RM##val; 5067HEAP_RETURN: 5068switch (frame->Xwhere) 5069 { 5070 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) 5071 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) 5072 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) 5073 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) 5074 LBL(53) LBL(54) 5075#ifdef SUPPORT_UTF8 5076 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) 5077 LBL(32) LBL(34) LBL(42) LBL(46) 5078#ifdef SUPPORT_UCP 5079 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) 5080#endif /* SUPPORT_UCP */ 5081#endif /* SUPPORT_UTF8 */ 5082 default: 5083 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); 5084 return PCRE_ERROR_INTERNAL; 5085 } 5086#undef LBL 5087#endif /* NO_RECURSE */ 5088} 5089 5090 5091/*************************************************************************** 5092**************************************************************************** 5093 RECURSION IN THE match() FUNCTION 5094 5095Undefine all the macros that were defined above to handle this. 
*/ 5096 5097#ifdef NO_RECURSE 5098#undef eptr 5099#undef ecode 5100#undef mstart 5101#undef offset_top 5102#undef ims 5103#undef eptrb 5104#undef flags 5105 5106#undef callpat 5107#undef charptr 5108#undef data 5109#undef next 5110#undef pp 5111#undef prev 5112#undef saved_eptr 5113 5114#undef new_recursive 5115 5116#undef cur_is_word 5117#undef condition 5118#undef prev_is_word 5119 5120#undef original_ims 5121 5122#undef ctype 5123#undef length 5124#undef max 5125#undef min 5126#undef number 5127#undef offset 5128#undef op 5129#undef save_capture_last 5130#undef save_offset1 5131#undef save_offset2 5132#undef save_offset3 5133#undef stacksave 5134 5135#undef newptrb 5136 5137#endif 5138 5139/* These two are defined as macros in both cases */ 5140 5141#undef fc 5142#undef fi 5143 5144/*************************************************************************** 5145***************************************************************************/ 5146 5147 5148 5149/************************************************* 5150* Execute a Regular Expression * 5151*************************************************/ 5152 5153/* This function applies a compiled re to a subject string and picks out 5154portions of the string if it matches. Two elements in the vector are set for 5155each substring: the offsets to the start and end of the substring. 
5156 5157Arguments: 5158 argument_re points to the compiled expression 5159 extra_data points to extra data or is NULL 5160 subject points to the subject string 5161 length length of subject string (may contain binary zeros) 5162 start_offset where to start in the subject string 5163 options option bits 5164 offsets points to a vector of ints to be filled in with offsets 5165 offsetcount the number of elements in the vector 5166 5167Returns: > 0 => success; value is the number of elements filled in 5168 = 0 => success, but offsets is not big enough 5169 -1 => failed to match 5170 < -1 => some kind of unexpected problem 5171*/ 5172 5173PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 5174pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, 5175 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, 5176 int offsetcount) 5177{ 5178int rc, resetcount, ocount; 5179int first_byte = -1; 5180int req_byte = -1; 5181int req_byte2 = -1; 5182int newline; 5183unsigned long int ims; 5184BOOL using_temporary_offsets = FALSE; 5185BOOL anchored; 5186BOOL startline; 5187BOOL firstline; 5188BOOL first_byte_caseless = FALSE; 5189BOOL req_byte_caseless = FALSE; 5190BOOL utf8; 5191match_data match_block; 5192match_data *md = &match_block; 5193const uschar *tables; 5194const uschar *start_bits = NULL; 5195USPTR start_match = (USPTR)subject + start_offset; 5196USPTR end_subject; 5197USPTR start_partial = NULL; 5198USPTR req_byte_ptr = start_match - 1; 5199 5200pcre_study_data internal_study; 5201const pcre_study_data *study; 5202 5203real_pcre internal_re; 5204const real_pcre *external_re = (const real_pcre *)argument_re; 5205const real_pcre *re = external_re; 5206 5207/* Plausibility checks */ 5208 5209if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 5210if (re == NULL || subject == NULL || 5211 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 5212if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 5213 5214/* This information is 
for finding all the numbers associated with a given 5215name, for condition testing. */ 5216 5217md->name_table = (uschar *)re + re->name_table_offset; 5218md->name_count = re->name_count; 5219md->name_entry_size = re->name_entry_size; 5220 5221/* Fish out the optional data from the extra_data structure, first setting 5222the default values. */ 5223 5224study = NULL; 5225md->match_limit = MATCH_LIMIT; 5226md->match_limit_recursion = MATCH_LIMIT_RECURSION; 5227md->callout_data = NULL; 5228 5229/* The table pointer is always in native byte order. */ 5230 5231tables = external_re->tables; 5232 5233if (extra_data != NULL) 5234 { 5235 register unsigned int flags = extra_data->flags; 5236 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 5237 study = (const pcre_study_data *)extra_data->study_data; 5238 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 5239 md->match_limit = extra_data->match_limit; 5240 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 5241 md->match_limit_recursion = extra_data->match_limit_recursion; 5242 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 5243 md->callout_data = extra_data->callout_data; 5244 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; 5245 } 5246 5247/* If the exec call supplied NULL for tables, use the inbuilt ones. This 5248is a feature that makes it possible to save compiled regex and re-use them 5249in other programs later. */ 5250 5251if (tables == NULL) tables = _pcre_default_tables; 5252 5253/* Check that the first field in the block is the magic number. If it is not, 5254test for a regex that was compiled on a host of opposite endianness. If this is 5255the case, flipped values are put in internal_re and internal_study if there was 5256study data too. 
*/ 5257 5258if (re->magic_number != MAGIC_NUMBER) 5259 { 5260 re = _pcre_try_flipped(re, &internal_re, study, &internal_study); 5261 if (re == NULL) return PCRE_ERROR_BADMAGIC; 5262 if (study != NULL) study = &internal_study; 5263 } 5264 5265/* Set up other data */ 5266 5267anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 5268startline = (re->flags & PCRE_STARTLINE) != 0; 5269firstline = (re->options & PCRE_FIRSTLINE) != 0; 5270 5271/* The code starts after the real_pcre block and the capture name table. */ 5272 5273md->start_code = (const uschar *)external_re + re->name_table_offset + 5274 re->name_count * re->name_entry_size; 5275 5276md->start_subject = (USPTR)subject; 5277md->start_offset = start_offset; 5278md->end_subject = md->start_subject + length; 5279end_subject = md->end_subject; 5280 5281md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 5282utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; 5283md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; 5284 5285md->notbol = (options & PCRE_NOTBOL) != 0; 5286md->noteol = (options & PCRE_NOTEOL) != 0; 5287md->notempty = (options & PCRE_NOTEMPTY) != 0; 5288md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0; 5289md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : 5290 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; 5291md->hitend = FALSE; 5292 5293md->recursive = NULL; /* No recursion at top level */ 5294 5295md->lcc = tables + lcc_offset; 5296md->ctypes = tables + ctypes_offset; 5297 5298/* Handle different \R options. 
*/ 5299 5300switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 5301 { 5302 case 0: 5303 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 5304 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; 5305 else 5306#ifdef BSR_ANYCRLF 5307 md->bsr_anycrlf = TRUE; 5308#else 5309 md->bsr_anycrlf = FALSE; 5310#endif 5311 break; 5312 5313 case PCRE_BSR_ANYCRLF: 5314 md->bsr_anycrlf = TRUE; 5315 break; 5316 5317 case PCRE_BSR_UNICODE: 5318 md->bsr_anycrlf = FALSE; 5319 break; 5320 5321 default: return PCRE_ERROR_BADNEWLINE; 5322 } 5323 5324/* Handle different types of newline. The three bits give eight cases. If 5325nothing is set at run time, whatever was used at compile time applies. */ 5326 5327switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : 5328 (pcre_uint32)options) & PCRE_NEWLINE_BITS) 5329 { 5330 case 0: newline = NEWLINE; break; /* Compile-time default */ 5331 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 5332 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 5333 case PCRE_NEWLINE_CR+ 5334 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 5335 case PCRE_NEWLINE_ANY: newline = -1; break; 5336 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 5337 default: return PCRE_ERROR_BADNEWLINE; 5338 } 5339 5340if (newline == -2) 5341 { 5342 md->nltype = NLTYPE_ANYCRLF; 5343 } 5344else if (newline < 0) 5345 { 5346 md->nltype = NLTYPE_ANY; 5347 } 5348else 5349 { 5350 md->nltype = NLTYPE_FIXED; 5351 if (newline > 255) 5352 { 5353 md->nllen = 2; 5354 md->nl[0] = (newline >> 8) & 255; 5355 md->nl[1] = newline & 255; 5356 } 5357 else 5358 { 5359 md->nllen = 1; 5360 md->nl[0] = newline; 5361 } 5362 } 5363 5364/* Partial matching was originally supported only for a restricted set of 5365regexes; from release 8.00 there are no restrictions, but the bits are still 5366defined (though never set). So there's no harm in leaving this code. 
*/ 5367 5368if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) 5369 return PCRE_ERROR_BADPARTIAL; 5370 5371/* Check a UTF-8 string if required. Unfortunately there's no way of passing 5372back the character offset. */ 5373 5374#ifdef SUPPORT_UTF8 5375if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) 5376 { 5377 if (_pcre_valid_utf8((USPTR)subject, length) >= 0) 5378 return PCRE_ERROR_BADUTF8; 5379 if (start_offset > 0 && start_offset < length) 5380 { 5381 int tb = ((USPTR)subject)[start_offset]; 5382 if (tb > 127) 5383 { 5384 tb &= 0xc0; 5385 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; 5386 } 5387 } 5388 } 5389#endif 5390 5391/* The ims options can vary during the matching as a result of the presence 5392of (?ims) items in the pattern. They are kept in a local variable so that 5393restoring at the exit of a group is easy. */ 5394 5395ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); 5396 5397/* If the expression has got more back references than the offsets supplied can 5398hold, we get a temporary chunk of working store to use during the matching. 5399Otherwise, we can use the vector supplied, rounding down its size to a multiple 5400of 3. */ 5401 5402ocount = offsetcount - (offsetcount % 3); 5403 5404if (re->top_backref > 0 && re->top_backref >= ocount/3) 5405 { 5406 ocount = re->top_backref * 3 + 3; 5407 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); 5408 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 5409 using_temporary_offsets = TRUE; 5410 DPRINTF(("Got memory to hold back references\n")); 5411 } 5412else md->offset_vector = offsets; 5413 5414md->offset_end = ocount; 5415md->offset_max = (2*ocount)/3; 5416md->offset_overflow = FALSE; 5417md->capture_last = -1; 5418 5419/* Compute the minimum number of offsets that we need to reset each time. Doing 5420this makes a huge difference to execution time when there aren't many brackets 5421in the pattern. 
*/ 5422 5423resetcount = 2 + re->top_bracket * 2; 5424if (resetcount > offsetcount) resetcount = ocount; 5425 5426/* Reset the working variable associated with each extraction. These should 5427never be used unless previously set, but they get saved and restored, and so we 5428initialize them to avoid reading uninitialized locations. */ 5429 5430if (md->offset_vector != NULL) 5431 { 5432 register int *iptr = md->offset_vector + ocount; 5433 register int *iend = iptr - resetcount/2 + 1; 5434 while (--iptr >= iend) *iptr = -1; 5435 } 5436 5437/* Set up the first character to match, if available. The first_byte value is 5438never set for an anchored regular expression, but the anchoring may be forced 5439at run time, so we have to test for anchoring. The first char may be unset for 5440an unanchored pattern, of course. If there's no first char and the pattern was 5441studied, there may be a bitmap of possible first characters. */ 5442 5443if (!anchored) 5444 { 5445 if ((re->flags & PCRE_FIRSTSET) != 0) 5446 { 5447 first_byte = re->first_byte & 255; 5448 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) 5449 first_byte = md->lcc[first_byte]; 5450 } 5451 else 5452 if (!startline && study != NULL && 5453 (study->flags & PCRE_STUDY_MAPPED) != 0) 5454 start_bits = study->start_bits; 5455 } 5456 5457/* For anchored or unanchored matches, there may be a "last known required 5458character" set. */ 5459 5460if ((re->flags & PCRE_REQCHSET) != 0) 5461 { 5462 req_byte = re->req_byte & 255; 5463 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 5464 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ 5465 } 5466 5467 5468/* ==========================================================================*/ 5469 5470/* Loop for handling unanchored repeated matching attempts; for anchored regexs 5471the loop runs just once. 
*/ 5472 5473for(;;) 5474 { 5475 USPTR save_end_subject = end_subject; 5476 USPTR new_start_match; 5477 5478 /* Reset the maximum number of extractions we might see. */ 5479 5480 if (md->offset_vector != NULL) 5481 { 5482 register int *iptr = md->offset_vector; 5483 register int *iend = iptr + resetcount; 5484 while (iptr < iend) *iptr++ = -1; 5485 } 5486 5487 /* If firstline is TRUE, the start of the match is constrained to the first 5488 line of a multiline string. That is, the match must be before or at the first 5489 newline. Implement this by temporarily adjusting end_subject so that we stop 5490 scanning at a newline. If the match fails at the newline, later code breaks 5491 this loop. */ 5492 5493 if (firstline) 5494 { 5495 USPTR t = start_match; 5496#ifdef SUPPORT_UTF8 5497 if (utf8) 5498 { 5499 while (t < md->end_subject && !IS_NEWLINE(t)) 5500 { 5501 t++; 5502 while (t < end_subject && (*t & 0xc0) == 0x80) t++; 5503 } 5504 } 5505 else 5506#endif 5507 while (t < md->end_subject && !IS_NEWLINE(t)) t++; 5508 end_subject = t; 5509 } 5510 5511 /* There are some optimizations that avoid running the match if a known 5512 starting point is not found, or if a known later character is not present. 5513 However, there is an option that disables these, for testing and for ensuring 5514 that all callouts do actually occur. */ 5515 5516 if ((options & PCRE_NO_START_OPTIMIZE) == 0) 5517 { 5518 /* Advance to a unique first byte if there is one. 
*/ 5519 5520 if (first_byte >= 0) 5521 { 5522 if (first_byte_caseless) 5523 while (start_match < end_subject && md->lcc[*start_match] != first_byte) 5524 start_match++; 5525 else 5526 while (start_match < end_subject && *start_match != first_byte) 5527 start_match++; 5528 } 5529 5530 /* Or to just after a linebreak for a multiline match */ 5531 5532 else if (startline) 5533 { 5534 if (start_match > md->start_subject + start_offset) 5535 { 5536#ifdef SUPPORT_UTF8 5537 if (utf8) 5538 { 5539 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 5540 { 5541 start_match++; 5542 while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 5543 start_match++; 5544 } 5545 } 5546 else 5547#endif 5548 while (start_match < end_subject && !WAS_NEWLINE(start_match)) 5549 start_match++; 5550 5551 /* If we have just passed a CR and the newline option is ANY or ANYCRLF, 5552 and we are now at a LF, advance the match position by one more character. 5553 */ 5554 5555 if (start_match[-1] == CHAR_CR && 5556 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 5557 start_match < end_subject && 5558 *start_match == CHAR_NL) 5559 start_match++; 5560 } 5561 } 5562 5563 /* Or to a non-unique first byte after study */ 5564 5565 else if (start_bits != NULL) 5566 { 5567 while (start_match < end_subject) 5568 { 5569 register unsigned int c = *start_match; 5570 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; 5571 else break; 5572 } 5573 } 5574 } /* Starting optimizations */ 5575 5576 /* Restore fudged end_subject */ 5577 5578 end_subject = save_end_subject; 5579 5580 /* The following two optimizations are disabled for partial matching or if 5581 disabling is explicitly requested. */ 5582 5583 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) 5584 { 5585 /* If the pattern was studied, a minimum subject length may be set. This is 5586 a lower bound; no actual string of that length may actually match the 5587 pattern. 
Although the value is, strictly, in characters, we treat it as 5588 bytes to avoid spending too much time in this optimization. */ 5589 5590 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 5591 (pcre_uint32)(end_subject - start_match) < study->minlength) 5592 { 5593 rc = MATCH_NOMATCH; 5594 break; 5595 } 5596 5597 /* If req_byte is set, we know that that character must appear in the 5598 subject for the match to succeed. If the first character is set, req_byte 5599 must be later in the subject; otherwise the test starts at the match point. 5600 This optimization can save a huge amount of backtracking in patterns with 5601 nested unlimited repeats that aren't going to match. Writing separate code 5602 for cased/caseless versions makes it go faster, as does using an 5603 autoincrement and backing off on a match. 5604 5605 HOWEVER: when the subject string is very, very long, searching to its end 5606 can take a long time, and give bad performance on quite ordinary patterns. 5607 This showed up when somebody was matching something like /^\d+C/ on a 5608 32-megabyte string... so we don't do this when the string is sufficiently 5609 long. */ 5610 5611 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) 5612 { 5613 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); 5614 5615 /* We don't need to repeat the search if we haven't yet reached the 5616 place we found it at last time. */ 5617 5618 if (p > req_byte_ptr) 5619 { 5620 if (req_byte_caseless) 5621 { 5622 while (p < end_subject) 5623 { 5624 register int pp = *p++; 5625 if (pp == req_byte || pp == req_byte2) { p--; break; } 5626 } 5627 } 5628 else 5629 { 5630 while (p < end_subject) 5631 { 5632 if (*p++ == req_byte) { p--; break; } 5633 } 5634 } 5635 5636 /* If we can't find the required character, break the matching loop, 5637 forcing a match failure. 
*/ 5638 5639 if (p >= end_subject) 5640 { 5641 rc = MATCH_NOMATCH; 5642 break; 5643 } 5644 5645 /* If we have found the required character, save the point where we 5646 found it, so that we don't search again next time round the loop if 5647 the start hasn't passed this character yet. */ 5648 5649 req_byte_ptr = p; 5650 } 5651 } 5652 } 5653 5654#ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */ 5655 printf(">>>> Match against: "); 5656 pchars(start_match, end_subject - start_match, TRUE, md); 5657 printf("\n"); 5658#endif 5659 5660 /* OK, we can now run the match. If "hitend" is set afterwards, remember the 5661 first starting point for which a partial match was found. */ 5662 5663 md->start_match_ptr = start_match; 5664 md->start_used_ptr = start_match; 5665 md->match_call_count = 0; 5666 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL, 5667 0, 0); 5668 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr; 5669 5670 switch(rc) 5671 { 5672 /* NOMATCH and PRUNE advance by one character. THEN at this level acts 5673 exactly like PRUNE. */ 5674 5675 case MATCH_NOMATCH: 5676 case MATCH_PRUNE: 5677 case MATCH_THEN: 5678 new_start_match = start_match + 1; 5679#ifdef SUPPORT_UTF8 5680 if (utf8) 5681 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) 5682 new_start_match++; 5683#endif 5684 break; 5685 5686 /* SKIP passes back the next starting point explicitly. */ 5687 5688 case MATCH_SKIP: 5689 new_start_match = md->start_match_ptr; 5690 break; 5691 5692 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ 5693 5694 case MATCH_COMMIT: 5695 rc = MATCH_NOMATCH; 5696 goto ENDLOOP; 5697 5698 /* Any other return is either a match, or some kind of error. */ 5699 5700 default: 5701 goto ENDLOOP; 5702 } 5703 5704 /* Control reaches here for the various types of "no match at this point" 5705 result. Reset the code to MATCH_NOMATCH for subsequent checking. 
*/ 5706 5707 rc = MATCH_NOMATCH; 5708 5709 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first 5710 newline in the subject (though it may continue over the newline). Therefore, 5711 if we have just failed to match, starting at a newline, do not continue. */ 5712 5713 if (firstline && IS_NEWLINE(start_match)) break; 5714 5715 /* Advance to new matching position */ 5716 5717 start_match = new_start_match; 5718 5719 /* Break the loop if the pattern is anchored or if we have passed the end of 5720 the subject. */ 5721 5722 if (anchored || start_match > end_subject) break; 5723 5724 /* If we have just passed a CR and we are now at a LF, and the pattern does 5725 not contain any explicit matches for \r or \n, and the newline option is CRLF 5726 or ANY or ANYCRLF, advance the match position by one more character. */ 5727 5728 if (start_match[-1] == CHAR_CR && 5729 start_match < end_subject && 5730 *start_match == CHAR_NL && 5731 (re->flags & PCRE_HASCRORLF) == 0 && 5732 (md->nltype == NLTYPE_ANY || 5733 md->nltype == NLTYPE_ANYCRLF || 5734 md->nllen == 2)) 5735 start_match++; 5736 5737 } /* End of for(;;) "bumpalong" loop */ 5738 5739/* ==========================================================================*/ 5740 5741/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping 5742conditions is true: 5743 5744(1) The pattern is anchored or the match was failed by (*COMMIT); 5745 5746(2) We are past the end of the subject; 5747 5748(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because 5749 this option requests that a match occur at or before the first newline in 5750 the subject. 5751 5752When we have a match and the offset vector is big enough to deal with any 5753backreferences, captured substring offsets will already be set up. In the case 5754where we had to get some local store to hold offsets for backreference 5755processing, copy those that we can. 
In this case there need not be overflow if 5756certain parts of the pattern were not used, even though there are more 5757capturing parentheses than vector slots. */ 5758 5759ENDLOOP: 5760 5761if (rc == MATCH_MATCH) 5762 { 5763 if (using_temporary_offsets) 5764 { 5765 if (offsetcount >= 4) 5766 { 5767 memcpy(offsets + 2, md->offset_vector + 2, 5768 (offsetcount - 2) * sizeof(int)); 5769 DPRINTF(("Copied offsets from temporary memory\n")); 5770 } 5771 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; 5772 DPRINTF(("Freeing temporary memory\n")); 5773 (pcre_free)(md->offset_vector); 5774 } 5775 5776 /* Set the return code to the number of captured strings, or 0 if there are 5777 too many to fit into the vector. */ 5778 5779 rc = md->offset_overflow? 0 : md->end_offset_top/2; 5780 5781 /* If there is space, set up the whole thing as substring 0. The value of 5782 md->start_match_ptr might be modified if \K was encountered on the success 5783 matching path. */ 5784 5785 if (offsetcount < 2) rc = 0; else 5786 { 5787 offsets[0] = md->start_match_ptr - md->start_subject; 5788 offsets[1] = md->end_match_ptr - md->start_subject; 5789 } 5790 5791 DPRINTF((">>>> returning %d\n", rc)); 5792 return rc; 5793 } 5794 5795/* Control gets here if there has been an error, or if the overall match 5796attempt has failed at all permitted starting positions. 
*/ 5797 5798if (using_temporary_offsets) 5799 { 5800 DPRINTF(("Freeing temporary memory\n")); 5801 (pcre_free)(md->offset_vector); 5802 } 5803 5804if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL) 5805 { 5806 DPRINTF((">>>> error: returning %d\n", rc)); 5807 return rc; 5808 } 5809else if (start_partial != NULL) 5810 { 5811 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); 5812 if (offsetcount > 1) 5813 { 5814 offsets[0] = start_partial - (USPTR)subject; 5815 offsets[1] = end_subject - (USPTR)subject; 5816 } 5817 return PCRE_ERROR_PARTIAL; 5818 } 5819else 5820 { 5821 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 5822 return PCRE_ERROR_NOMATCH; 5823 } 5824} 5825 5826/* End of pcre_exec.c */ 5827