1/************************************************* 2* Perl-Compatible Regular Expressions * 3*************************************************/ 4 5/* PCRE is a library of functions to support regular expressions whose syntax 6and semantics are as close as possible to those of the Perl 5 language (but see 7below for why this module is different). 8 9 Written by Philip Hazel 10 Copyright (c) 1997-2010 University of Cambridge 11 12----------------------------------------------------------------------------- 13Redistribution and use in source and binary forms, with or without 14modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37POSSIBILITY OF SUCH DAMAGE. 
-----------------------------------------------------------------------------
*/


/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */


/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_exec(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code.
*/ 73 74 75 76#ifdef HAVE_CONFIG_H 77#include "config.h" 78#endif 79 80#define NLBLOCK md /* Block containing newline information */ 81#define PSSTART start_subject /* Field containing processed string start */ 82#define PSEND end_subject /* Field containing processed string end */ 83 84#include "pcre_internal.h" 85 86 87/* For use to indent debugging output */ 88 89#define SP " " 90 91 92/************************************************* 93* Code parameters and static tables * 94*************************************************/ 95 96/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes 97into others, under special conditions. A gap of 20 between the blocks should be 98enough. The resulting opcodes don't have to be less than 256 because they are 99never stored, so we push them well clear of the normal opcodes. */ 100 101#define OP_PROP_EXTRA 300 102#define OP_EXTUNI_EXTRA 320 103#define OP_ANYNL_EXTRA 340 104#define OP_HSPACE_EXTRA 360 105#define OP_VSPACE_EXTRA 380 106 107 108/* This table identifies those opcodes that are followed immediately by a 109character that is to be tested in some way. This makes is possible to 110centralize the loading of these characters. In the case of Type * etc, the 111"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a 112small value. Non-zero values in the table are the offsets from the opcode where 113the character is to be found. ***NOTE*** If the start of this table is 114modified, the three tables that follow must also be modified. 
*/

/* One entry per opcode, in opcode order; each group of entries is preceded by
a comment naming the opcodes it covers. */

static const uschar coptable[] = {
  /* End */
  0,
  /* \A, \G, \K, \B, \b */
  0, 0, 0, 0, 0,
  /* \D, \d, \S, \s, \W, \w */
  0, 0, 0, 0, 0, 0,
  /* Any, AllAny, Anybyte */
  0, 0, 0,
  /* \P, \p */
  0, 0,
  /* \R, \H, \h, \V, \v */
  0, 0, 0, 0, 0,
  /* \X */
  0,
  /* \Z, \z, Opt, ^, $ */
  0, 0, 0, 0, 0,
  /* Char, Charnc, not */
  1, 1, 1,

  /* Positive single-char repeats */
  /* *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* upto, minupto, exact */
  3, 3, 3,
  /* *+, ++, ?+, upto+ */
  1, 1, 1, 3,

  /* Negative single-char repeats - only for chars < 256 */
  /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* NOT upto, minupto, exact */
  3, 3, 3,
  /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 3,

  /* Positive type repeats */
  /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* Type upto, minupto, exact */
  3, 3, 3,
  /* Type *+, ++, ?+, upto+ */
  1, 1, 1, 3,

  /* Character class & ref repeats */
  /* *, *?, +, +?, ?, ?? */
  0, 0, 0, 0, 0, 0,
  /* CRRANGE, CRMINRANGE */
  0, 0,
  /* CLASS, NCLASS */
  0, 0,
  /* XCLASS - variable length */
  0,
  /* REF, RECURSE, CALLOUT */
  0, 0, 0,
  /* Alt, Ket, KetRmax, KetRmin */
  0, 0, 0, 0,
  /* Assert, Assert not, Assert behind, Assert behind not */
  0, 0, 0, 0,
  /* Reverse */
  0,
  /* ONCE, BRA, CBRA, COND */
  0, 0, 0, 0,
  /* SBRA, SCBRA, SCOND */
  0, 0, 0,
  /* CREF, NCREF */
  0, 0,
  /* RREF, NRREF */
  0, 0,
  /* DEF */
  0,
  /* BRAZERO, BRAMINZERO */
  0, 0,
  /* PRUNE, SKIP, THEN, COMMIT */
  0, 0, 0, 0,
  /* FAIL, ACCEPT, CLOSE, SKIPZERO */
  0, 0, 0, 0
};

/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified.
*/ 172 173static const uschar poptable[] = { 174 0, /* End */ 175 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ 176 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ 177 1, 1, 1, /* Any, AllAny, Anybyte */ 178 1, 1, /* \P, \p */ 179 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ 180 1, /* \X */ 181 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ 182 1, /* Char */ 183 1, /* Charnc */ 184 1, /* not */ 185 /* Positive single-char repeats */ 186 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 187 1, 1, 1, /* upto, minupto, exact */ 188 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ 189 /* Negative single-char repeats - only for chars < 256 */ 190 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 191 1, 1, 1, /* NOT upto, minupto, exact */ 192 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ 193 /* Positive type repeats */ 194 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 195 1, 1, 1, /* Type upto, minupto, exact */ 196 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ 197 /* Character class & ref repeats */ 198 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ 199 1, 1, /* CRRANGE, CRMINRANGE */ 200 1, /* CLASS */ 201 1, /* NCLASS */ 202 1, /* XCLASS - variable length */ 203 0, /* REF */ 204 0, /* RECURSE */ 205 0, /* CALLOUT */ 206 0, /* Alt */ 207 0, /* Ket */ 208 0, /* KetRmax */ 209 0, /* KetRmin */ 210 0, /* Assert */ 211 0, /* Assert not */ 212 0, /* Assert behind */ 213 0, /* Assert behind not */ 214 0, /* Reverse */ 215 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */ 216 0, 0, 0, /* SBRA, SCBRA, SCOND */ 217 0, 0, /* CREF, NCREF */ 218 0, 0, /* RREF, NRREF */ 219 0, /* DEF */ 220 0, 0, /* BRAZERO, BRAMINZERO */ 221 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ 222 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */ 223}; 224 225/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, 226and \w */ 227 228static const uschar toptable1[] = { 229 0, 0, 0, 0, 0, 0, 230 ctype_digit, ctype_digit, 231 ctype_space, ctype_space, 232 ctype_word, ctype_word, 233 0, 0 /* OP_ANY, OP_ALLANY */ 234}; 235 236static const uschar toptable2[] = { 237 0, 0, 0, 0, 0, 0, 238 ctype_digit, 0, 239 ctype_space, 0, 240 ctype_word, 0, 241 1, 1 /* OP_ANY, OP_ALLANY */ 242}; 243 244 245/* Structure for holding data about a particular state, which is in effect the 246current data for an active path through the match tree. It must consist 247entirely of ints because the working vector we are passed, and which we put 248these structures in, is a vector of ints. */ 249 250typedef struct stateblock { 251 int offset; /* Offset to opcode */ 252 int count; /* Count for repeats */ 253 int ims; /* ims flag bits */ 254 int data; /* Some use extra data */ 255} stateblock; 256 257#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) 258 259 260#ifdef PCRE_DEBUG 261/************************************************* 262* Print character string * 263*************************************************/ 264 265/* Character string printing function for debugging. 
266 267Arguments: 268 p points to string 269 length number of bytes 270 f where to print 271 272Returns: nothing 273*/ 274 275static void 276pchars(unsigned char *p, int length, FILE *f) 277{ 278int c; 279while (length-- > 0) 280 { 281 if (isprint(c = *(p++))) 282 fprintf(f, "%c", c); 283 else 284 fprintf(f, "\\x%02x", c); 285 } 286} 287#endif 288 289 290 291/************************************************* 292* Execute a Regular Expression - DFA engine * 293*************************************************/ 294 295/* This internal function applies a compiled pattern to a subject string, 296starting at a given point, using a DFA engine. This function is called from the 297external one, possibly multiple times if the pattern is not anchored. The 298function calls itself recursively for some kinds of subpattern. 299 300Arguments: 301 md the match_data block with fixed information 302 this_start_code the opening bracket of this subexpression's code 303 current_subject where we currently are in the subject string 304 start_offset start offset in the subject string 305 offsets vector to contain the matching string offsets 306 offsetcount size of same 307 workspace vector of workspace 308 wscount size of same 309 ims the current ims flags 310 rlevel function call recursion level 311 recursing regex recursive call level 312 313Returns: > 0 => number of match offset pairs placed in offsets 314 = 0 => offsets overflowed; longest matches are present 315 -1 => failed to match 316 < -1 => some kind of unexpected problem 317 318The following macros are used for adding states to the two state vectors (one 319for the current character, one for the following character). 
*/ 320 321#define ADD_ACTIVE(x,y) \ 322 if (active_count++ < wscount) \ 323 { \ 324 next_active_state->offset = (x); \ 325 next_active_state->count = (y); \ 326 next_active_state->ims = ims; \ 327 next_active_state++; \ 328 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 329 } \ 330 else return PCRE_ERROR_DFA_WSSIZE 331 332#define ADD_ACTIVE_DATA(x,y,z) \ 333 if (active_count++ < wscount) \ 334 { \ 335 next_active_state->offset = (x); \ 336 next_active_state->count = (y); \ 337 next_active_state->ims = ims; \ 338 next_active_state->data = (z); \ 339 next_active_state++; \ 340 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ 341 } \ 342 else return PCRE_ERROR_DFA_WSSIZE 343 344#define ADD_NEW(x,y) \ 345 if (new_count++ < wscount) \ 346 { \ 347 next_new_state->offset = (x); \ 348 next_new_state->count = (y); \ 349 next_new_state->ims = ims; \ 350 next_new_state++; \ 351 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 352 } \ 353 else return PCRE_ERROR_DFA_WSSIZE 354 355#define ADD_NEW_DATA(x,y,z) \ 356 if (new_count++ < wscount) \ 357 { \ 358 next_new_state->offset = (x); \ 359 next_new_state->count = (y); \ 360 next_new_state->ims = ims; \ 361 next_new_state->data = (z); \ 362 next_new_state++; \ 363 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ 364 } \ 365 else return PCRE_ERROR_DFA_WSSIZE 366 367/* And now, here is the code */ 368 369static int 370internal_dfa_exec( 371 dfa_match_data *md, 372 const uschar *this_start_code, 373 const uschar *current_subject, 374 int start_offset, 375 int *offsets, 376 int offsetcount, 377 int *workspace, 378 int wscount, 379 int ims, 380 int rlevel, 381 int recursing) 382{ 383stateblock *active_states, *new_states, *temp_states; 384stateblock *next_active_state, *next_new_state; 385 386const uschar *ctypes, *lcc, *fcc; 387const uschar *ptr; 388const uschar *end_code, *first_op; 389 390int active_count, new_count, match_count; 391 392/* 
Some fields in the md block are frequently referenced, so we load them into 393independent variables in the hope that this will perform better. */ 394 395const uschar *start_subject = md->start_subject; 396const uschar *end_subject = md->end_subject; 397const uschar *start_code = md->start_code; 398 399#ifdef SUPPORT_UTF8 400BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; 401#else 402BOOL utf8 = FALSE; 403#endif 404 405rlevel++; 406offsetcount &= (-2); 407 408wscount -= 2; 409wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / 410 (2 * INTS_PER_STATEBLOCK); 411 412DPRINTF(("\n%.*s---------------------\n" 413 "%.*sCall to internal_dfa_exec f=%d r=%d\n", 414 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing)); 415 416ctypes = md->tables + ctypes_offset; 417lcc = md->tables + lcc_offset; 418fcc = md->tables + fcc_offset; 419 420match_count = PCRE_ERROR_NOMATCH; /* A negative number */ 421 422active_states = (stateblock *)(workspace + 2); 423next_new_state = new_states = active_states + wscount; 424new_count = 0; 425 426first_op = this_start_code + 1 + LINK_SIZE + 427 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); 428 429/* The first thing in any (sub) pattern is a bracket of some sort. Push all 430the alternative states onto the list, and find out where the end is. This 431makes is possible to use this function recursively, when we want to stop at a 432matching internal ket rather than at the end. 433 434If the first opcode in the first alternative is OP_REVERSE, we are dealing with 435a backward assertion. In that case, we have to find out the maximum amount to 436move back, and set up each alternative appropriately. 
*/ 437 438if (*first_op == OP_REVERSE) 439 { 440 int max_back = 0; 441 int gone_back; 442 443 end_code = this_start_code; 444 do 445 { 446 int back = GET(end_code, 2+LINK_SIZE); 447 if (back > max_back) max_back = back; 448 end_code += GET(end_code, 1); 449 } 450 while (*end_code == OP_ALT); 451 452 /* If we can't go back the amount required for the longest lookbehind 453 pattern, go back as far as we can; some alternatives may still be viable. */ 454 455#ifdef SUPPORT_UTF8 456 /* In character mode we have to step back character by character */ 457 458 if (utf8) 459 { 460 for (gone_back = 0; gone_back < max_back; gone_back++) 461 { 462 if (current_subject <= start_subject) break; 463 current_subject--; 464 while (current_subject > start_subject && 465 (*current_subject & 0xc0) == 0x80) 466 current_subject--; 467 } 468 } 469 else 470#endif 471 472 /* In byte-mode we can do this quickly. */ 473 474 { 475 gone_back = (current_subject - max_back < start_subject)? 476 current_subject - start_subject : max_back; 477 current_subject -= gone_back; 478 } 479 480 /* Save the earliest consulted character */ 481 482 if (current_subject < md->start_used_ptr) 483 md->start_used_ptr = current_subject; 484 485 /* Now we can process the individual branches. */ 486 487 end_code = this_start_code; 488 do 489 { 490 int back = GET(end_code, 2+LINK_SIZE); 491 if (back <= gone_back) 492 { 493 int bstate = end_code - start_code + 2 + 2*LINK_SIZE; 494 ADD_NEW_DATA(-bstate, 0, gone_back - back); 495 } 496 end_code += GET(end_code, 1); 497 } 498 while (*end_code == OP_ALT); 499 } 500 501/* This is the code for a "normal" subpattern (not a backward assertion). The 502start of a whole pattern is always one of these. If we are at the top level, 503we may be asked to restart matching from the same point that we reached for a 504previous partial match. We still have to scan through the top-level branches to 505find the end state. 
*/ 506 507else 508 { 509 end_code = this_start_code; 510 511 /* Restarting */ 512 513 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) 514 { 515 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); 516 new_count = workspace[1]; 517 if (!workspace[0]) 518 memcpy(new_states, active_states, new_count * sizeof(stateblock)); 519 } 520 521 /* Not restarting */ 522 523 else 524 { 525 int length = 1 + LINK_SIZE + 526 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); 527 do 528 { 529 ADD_NEW(end_code - start_code + length, 0); 530 end_code += GET(end_code, 1); 531 length = 1 + LINK_SIZE; 532 } 533 while (*end_code == OP_ALT); 534 } 535 } 536 537workspace[0] = 0; /* Bit indicating which vector is current */ 538 539DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code)); 540 541/* Loop for scanning the subject */ 542 543ptr = current_subject; 544for (;;) 545 { 546 int i, j; 547 int clen, dlen; 548 unsigned int c, d; 549 int forced_fail = 0; 550 BOOL could_continue = FALSE; 551 552 /* Make the new state list into the active state list and empty the 553 new state list. 
*/ 554 555 temp_states = active_states; 556 active_states = new_states; 557 new_states = temp_states; 558 active_count = new_count; 559 new_count = 0; 560 561 workspace[0] ^= 1; /* Remember for the restarting feature */ 562 workspace[1] = active_count; 563 564#ifdef PCRE_DEBUG 565 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); 566 pchars((uschar *)ptr, strlen((char *)ptr), stdout); 567 printf("\"\n"); 568 569 printf("%.*sActive states: ", rlevel*2-2, SP); 570 for (i = 0; i < active_count; i++) 571 printf("%d/%d ", active_states[i].offset, active_states[i].count); 572 printf("\n"); 573#endif 574 575 /* Set the pointers for adding new states */ 576 577 next_active_state = active_states + active_count; 578 next_new_state = new_states; 579 580 /* Load the current character from the subject outside the loop, as many 581 different states may want to look at it, and we assume that at least one 582 will. */ 583 584 if (ptr < end_subject) 585 { 586 clen = 1; /* Number of bytes in the character */ 587#ifdef SUPPORT_UTF8 588 if (utf8) { GETCHARLEN(c, ptr, clen); } else 589#endif /* SUPPORT_UTF8 */ 590 c = *ptr; 591 } 592 else 593 { 594 clen = 0; /* This indicates the end of the subject */ 595 c = NOTACHAR; /* This value should never actually be used */ 596 } 597 598 /* Scan up the active states and act on each one. The result of an action 599 may be to add more states to the currently active list (e.g. on hitting a 600 parenthesis) or it may be to put states on the new list, for considering 601 when we move the character pointer on. 
*/ 602 603 for (i = 0; i < active_count; i++) 604 { 605 stateblock *current_state = active_states + i; 606 const uschar *code; 607 int state_offset = current_state->offset; 608 int count, codevalue, rrc; 609 610#ifdef PCRE_DEBUG 611 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); 612 if (clen == 0) printf("EOL\n"); 613 else if (c > 32 && c < 127) printf("'%c'\n", c); 614 else printf("0x%02x\n", c); 615#endif 616 617 /* This variable is referred to implicity in the ADD_xxx macros. */ 618 619 ims = current_state->ims; 620 621 /* A negative offset is a special case meaning "hold off going to this 622 (negated) state until the number of characters in the data field have 623 been skipped". */ 624 625 if (state_offset < 0) 626 { 627 if (current_state->data > 0) 628 { 629 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); 630 ADD_NEW_DATA(state_offset, current_state->count, 631 current_state->data - 1); 632 continue; 633 } 634 else 635 { 636 current_state->offset = state_offset = -state_offset; 637 } 638 } 639 640 /* Check for a duplicate state with the same count, and skip if found. 641 See the note at the head of this module about the possibility of improving 642 performance here. */ 643 644 for (j = 0; j < i; j++) 645 { 646 if (active_states[j].offset == state_offset && 647 active_states[j].count == current_state->count) 648 { 649 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); 650 goto NEXT_ACTIVE_STATE; 651 } 652 } 653 654 /* The state offset is the offset to the opcode */ 655 656 code = start_code + state_offset; 657 codevalue = *code; 658 659 /* If this opcode inspects a character, but we are at the end of the 660 subject, remember the fact for use when testing for a partial match. */ 661 662 if (clen == 0 && poptable[codevalue] != 0) 663 could_continue = TRUE; 664 665 /* If this opcode is followed by an inline character, load it. 
It is 666 tempting to test for the presence of a subject character here, but that 667 is wrong, because sometimes zero repetitions of the subject are 668 permitted. 669 670 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an 671 argument that is not a data character - but is always one byte long. We 672 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in 673 this case. To keep the other cases fast, convert these ones to new opcodes. 674 */ 675 676 if (coptable[codevalue] > 0) 677 { 678 dlen = 1; 679#ifdef SUPPORT_UTF8 680 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else 681#endif /* SUPPORT_UTF8 */ 682 d = code[coptable[codevalue]]; 683 if (codevalue >= OP_TYPESTAR) 684 { 685 switch(d) 686 { 687 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; 688 case OP_NOTPROP: 689 case OP_PROP: codevalue += OP_PROP_EXTRA; break; 690 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; 691 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; 692 case OP_NOT_HSPACE: 693 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; 694 case OP_NOT_VSPACE: 695 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; 696 default: break; 697 } 698 } 699 } 700 else 701 { 702 dlen = 0; /* Not strictly necessary, but compilers moan */ 703 d = NOTACHAR; /* if these variables are not set. */ 704 } 705 706 707 /* Now process the individual opcodes */ 708 709 switch (codevalue) 710 { 711/* ========================================================================== */ 712 /* These cases are never obeyed. This is a fudge that causes a compile- 713 time error if the vectors coptable or poptable, which are indexed by 714 opcode, are not the correct length. It seems to be the only way to do 715 such a check at compile time, as the sizeof() operator does not work 716 in the C preprocessor. 
*/ 717 718 case OP_TABLE_LENGTH: 719 case OP_TABLE_LENGTH + 720 ((sizeof(coptable) == OP_TABLE_LENGTH) && 721 (sizeof(poptable) == OP_TABLE_LENGTH)): 722 break; 723 724/* ========================================================================== */ 725 /* Reached a closing bracket. If not at the end of the pattern, carry 726 on with the next opcode. Otherwise, unless we have an empty string and 727 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the 728 start of the subject, save the match data, shifting up all previous 729 matches so we always have the longest first. */ 730 731 case OP_KET: 732 case OP_KETRMIN: 733 case OP_KETRMAX: 734 if (code != end_code) 735 { 736 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); 737 if (codevalue != OP_KET) 738 { 739 ADD_ACTIVE(state_offset - GET(code, 1), 0); 740 } 741 } 742 else 743 { 744 if (ptr > current_subject || 745 ((md->moptions & PCRE_NOTEMPTY) == 0 && 746 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || 747 current_subject > start_subject + md->start_offset))) 748 { 749 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; 750 else if (match_count > 0 && ++match_count * 2 >= offsetcount) 751 match_count = 0; 752 count = ((match_count == 0)? 
offsetcount : match_count * 2) - 2; 753 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); 754 if (offsetcount >= 2) 755 { 756 offsets[0] = current_subject - start_subject; 757 offsets[1] = ptr - start_subject; 758 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, 759 offsets[1] - offsets[0], current_subject)); 760 } 761 if ((md->moptions & PCRE_DFA_SHORTEST) != 0) 762 { 763 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 764 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, 765 match_count, rlevel*2-2, SP)); 766 return match_count; 767 } 768 } 769 } 770 break; 771 772/* ========================================================================== */ 773 /* These opcodes add to the current list of states without looking 774 at the current character. */ 775 776 /*-----------------------------------------------------------------*/ 777 case OP_ALT: 778 do { code += GET(code, 1); } while (*code == OP_ALT); 779 ADD_ACTIVE(code - start_code, 0); 780 break; 781 782 /*-----------------------------------------------------------------*/ 783 case OP_BRA: 784 case OP_SBRA: 785 do 786 { 787 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); 788 code += GET(code, 1); 789 } 790 while (*code == OP_ALT); 791 break; 792 793 /*-----------------------------------------------------------------*/ 794 case OP_CBRA: 795 case OP_SCBRA: 796 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); 797 code += GET(code, 1); 798 while (*code == OP_ALT) 799 { 800 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); 801 code += GET(code, 1); 802 } 803 break; 804 805 /*-----------------------------------------------------------------*/ 806 case OP_BRAZERO: 807 case OP_BRAMINZERO: 808 ADD_ACTIVE(state_offset + 1, 0); 809 code += 1 + GET(code, 2); 810 while (*code == OP_ALT) code += GET(code, 1); 811 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); 812 break; 813 814 /*-----------------------------------------------------------------*/ 815 case OP_SKIPZERO: 816 
code += 1 + GET(code, 2); 817 while (*code == OP_ALT) code += GET(code, 1); 818 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); 819 break; 820 821 /*-----------------------------------------------------------------*/ 822 case OP_CIRC: 823 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || 824 ((ims & PCRE_MULTILINE) != 0 && 825 ptr != end_subject && 826 WAS_NEWLINE(ptr))) 827 { ADD_ACTIVE(state_offset + 1, 0); } 828 break; 829 830 /*-----------------------------------------------------------------*/ 831 case OP_EOD: 832 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); } 833 break; 834 835 /*-----------------------------------------------------------------*/ 836 case OP_OPT: 837 ims = code[1]; 838 ADD_ACTIVE(state_offset + 2, 0); 839 break; 840 841 /*-----------------------------------------------------------------*/ 842 case OP_SOD: 843 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } 844 break; 845 846 /*-----------------------------------------------------------------*/ 847 case OP_SOM: 848 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } 849 break; 850 851 852/* ========================================================================== */ 853 /* These opcodes inspect the next subject character, and sometimes 854 the previous one as well, but do not have an argument. The variable 855 clen contains the length of the current character and is zero if we are 856 at the end of the subject. 
*/ 857 858 /*-----------------------------------------------------------------*/ 859 case OP_ANY: 860 if (clen > 0 && !IS_NEWLINE(ptr)) 861 { ADD_NEW(state_offset + 1, 0); } 862 break; 863 864 /*-----------------------------------------------------------------*/ 865 case OP_ALLANY: 866 if (clen > 0) 867 { ADD_NEW(state_offset + 1, 0); } 868 break; 869 870 /*-----------------------------------------------------------------*/ 871 case OP_EODN: 872 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) 873 { ADD_ACTIVE(state_offset + 1, 0); } 874 break; 875 876 /*-----------------------------------------------------------------*/ 877 case OP_DOLL: 878 if ((md->moptions & PCRE_NOTEOL) == 0) 879 { 880 if (clen == 0 || 881 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && 882 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) 883 )) 884 { ADD_ACTIVE(state_offset + 1, 0); } 885 } 886 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr)) 887 { ADD_ACTIVE(state_offset + 1, 0); } 888 break; 889 890 /*-----------------------------------------------------------------*/ 891 892 case OP_DIGIT: 893 case OP_WHITESPACE: 894 case OP_WORDCHAR: 895 if (clen > 0 && c < 256 && 896 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) 897 { ADD_NEW(state_offset + 1, 0); } 898 break; 899 900 /*-----------------------------------------------------------------*/ 901 case OP_NOT_DIGIT: 902 case OP_NOT_WHITESPACE: 903 case OP_NOT_WORDCHAR: 904 if (clen > 0 && (c >= 256 || 905 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) 906 { ADD_NEW(state_offset + 1, 0); } 907 break; 908 909 /*-----------------------------------------------------------------*/ 910 case OP_WORD_BOUNDARY: 911 case OP_NOT_WORD_BOUNDARY: 912 { 913 int left_word, right_word; 914 915 if (ptr > start_subject) 916 { 917 const uschar *temp = ptr - 1; 918 if (temp < md->start_used_ptr) md->start_used_ptr = temp; 919#ifdef SUPPORT_UTF8 920 if (utf8) 
BACKCHAR(temp); 921#endif 922 GETCHARTEST(d, temp); 923 left_word = d < 256 && (ctypes[d] & ctype_word) != 0; 924 } 925 else left_word = 0; 926 927 if (clen > 0) 928 right_word = c < 256 && (ctypes[c] & ctype_word) != 0; 929 else right_word = 0; 930 931 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) 932 { ADD_ACTIVE(state_offset + 1, 0); } 933 } 934 break; 935 936 937 /*-----------------------------------------------------------------*/ 938 /* Check the next character by Unicode property. We will get here only 939 if the support is in the binary; otherwise a compile-time error occurs. 940 */ 941 942#ifdef SUPPORT_UCP 943 case OP_PROP: 944 case OP_NOTPROP: 945 if (clen > 0) 946 { 947 BOOL OK; 948 const ucd_record * prop = GET_UCD(c); 949 switch(code[1]) 950 { 951 case PT_ANY: 952 OK = TRUE; 953 break; 954 955 case PT_LAMP: 956 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; 957 break; 958 959 case PT_GC: 960 OK = _pcre_ucp_gentype[prop->chartype] == code[2]; 961 break; 962 963 case PT_PC: 964 OK = prop->chartype == code[2]; 965 break; 966 967 case PT_SC: 968 OK = prop->script == code[2]; 969 break; 970 971 /* Should never occur, but keep compilers from grumbling. */ 972 973 default: 974 OK = codevalue != OP_PROP; 975 break; 976 } 977 978 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } 979 } 980 break; 981#endif 982 983 984 985/* ========================================================================== */ 986 /* These opcodes likewise inspect the subject character, but have an 987 argument that is not a data character. It is one of these opcodes: 988 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, 989 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. 
*/ 990 991 case OP_TYPEPLUS: 992 case OP_TYPEMINPLUS: 993 case OP_TYPEPOSPLUS: 994 count = current_state->count; /* Already matched */ 995 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 996 if (clen > 0) 997 { 998 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 999 (c < 256 && 1000 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1001 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1002 { 1003 if (count > 0 && codevalue == OP_TYPEPOSPLUS) 1004 { 1005 active_count--; /* Remove non-match possibility */ 1006 next_active_state--; 1007 } 1008 count++; 1009 ADD_NEW(state_offset, count); 1010 } 1011 } 1012 break; 1013 1014 /*-----------------------------------------------------------------*/ 1015 case OP_TYPEQUERY: 1016 case OP_TYPEMINQUERY: 1017 case OP_TYPEPOSQUERY: 1018 ADD_ACTIVE(state_offset + 2, 0); 1019 if (clen > 0) 1020 { 1021 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1022 (c < 256 && 1023 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1024 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1025 { 1026 if (codevalue == OP_TYPEPOSQUERY) 1027 { 1028 active_count--; /* Remove non-match possibility */ 1029 next_active_state--; 1030 } 1031 ADD_NEW(state_offset + 2, 0); 1032 } 1033 } 1034 break; 1035 1036 /*-----------------------------------------------------------------*/ 1037 case OP_TYPESTAR: 1038 case OP_TYPEMINSTAR: 1039 case OP_TYPEPOSSTAR: 1040 ADD_ACTIVE(state_offset + 2, 0); 1041 if (clen > 0) 1042 { 1043 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1044 (c < 256 && 1045 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1046 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1047 { 1048 if (codevalue == OP_TYPEPOSSTAR) 1049 { 1050 active_count--; /* Remove non-match possibility */ 1051 next_active_state--; 1052 } 1053 ADD_NEW(state_offset, 0); 1054 } 1055 } 1056 break; 1057 1058 /*-----------------------------------------------------------------*/ 1059 case OP_TYPEEXACT: 1060 count = 
current_state->count; /* Number already matched */ 1061 if (clen > 0) 1062 { 1063 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1064 (c < 256 && 1065 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1066 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1067 { 1068 if (++count >= GET2(code, 1)) 1069 { ADD_NEW(state_offset + 4, 0); } 1070 else 1071 { ADD_NEW(state_offset, count); } 1072 } 1073 } 1074 break; 1075 1076 /*-----------------------------------------------------------------*/ 1077 case OP_TYPEUPTO: 1078 case OP_TYPEMINUPTO: 1079 case OP_TYPEPOSUPTO: 1080 ADD_ACTIVE(state_offset + 4, 0); 1081 count = current_state->count; /* Number already matched */ 1082 if (clen > 0) 1083 { 1084 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1085 (c < 256 && 1086 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1087 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1088 { 1089 if (codevalue == OP_TYPEPOSUPTO) 1090 { 1091 active_count--; /* Remove non-match possibility */ 1092 next_active_state--; 1093 } 1094 if (++count >= GET2(code, 1)) 1095 { ADD_NEW(state_offset + 4, 0); } 1096 else 1097 { ADD_NEW(state_offset, count); } 1098 } 1099 } 1100 break; 1101 1102/* ========================================================================== */ 1103 /* These are virtual opcodes that are used when something like 1104 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its 1105 argument. It keeps the code above fast for the other cases. The argument 1106 is in the d variable. 
*/ 1107 1108#ifdef SUPPORT_UCP 1109 case OP_PROP_EXTRA + OP_TYPEPLUS: 1110 case OP_PROP_EXTRA + OP_TYPEMINPLUS: 1111 case OP_PROP_EXTRA + OP_TYPEPOSPLUS: 1112 count = current_state->count; /* Already matched */ 1113 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } 1114 if (clen > 0) 1115 { 1116 BOOL OK; 1117 const ucd_record * prop = GET_UCD(c); 1118 switch(code[2]) 1119 { 1120 case PT_ANY: 1121 OK = TRUE; 1122 break; 1123 1124 case PT_LAMP: 1125 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; 1126 break; 1127 1128 case PT_GC: 1129 OK = _pcre_ucp_gentype[prop->chartype] == code[3]; 1130 break; 1131 1132 case PT_PC: 1133 OK = prop->chartype == code[3]; 1134 break; 1135 1136 case PT_SC: 1137 OK = prop->script == code[3]; 1138 break; 1139 1140 /* Should never occur, but keep compilers from grumbling. */ 1141 1142 default: 1143 OK = codevalue != OP_PROP; 1144 break; 1145 } 1146 1147 if (OK == (d == OP_PROP)) 1148 { 1149 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) 1150 { 1151 active_count--; /* Remove non-match possibility */ 1152 next_active_state--; 1153 } 1154 count++; 1155 ADD_NEW(state_offset, count); 1156 } 1157 } 1158 break; 1159 1160 /*-----------------------------------------------------------------*/ 1161 case OP_EXTUNI_EXTRA + OP_TYPEPLUS: 1162 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: 1163 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: 1164 count = current_state->count; /* Already matched */ 1165 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1166 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 1167 { 1168 const uschar *nptr = ptr + clen; 1169 int ncount = 0; 1170 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) 1171 { 1172 active_count--; /* Remove non-match possibility */ 1173 next_active_state--; 1174 } 1175 while (nptr < end_subject) 1176 { 1177 int nd; 1178 int ndlen = 1; 1179 GETCHARLEN(nd, nptr, ndlen); 1180 if (UCD_CATEGORY(nd) != ucp_M) break; 1181 ncount++; 1182 nptr += ndlen; 1183 } 
1184 count++; 1185 ADD_NEW_DATA(-state_offset, count, ncount); 1186 } 1187 break; 1188#endif 1189 1190 /*-----------------------------------------------------------------*/ 1191 case OP_ANYNL_EXTRA + OP_TYPEPLUS: 1192 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: 1193 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: 1194 count = current_state->count; /* Already matched */ 1195 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1196 if (clen > 0) 1197 { 1198 int ncount = 0; 1199 switch (c) 1200 { 1201 case 0x000b: 1202 case 0x000c: 1203 case 0x0085: 1204 case 0x2028: 1205 case 0x2029: 1206 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1207 goto ANYNL01; 1208 1209 case 0x000d: 1210 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; 1211 /* Fall through */ 1212 1213 ANYNL01: 1214 case 0x000a: 1215 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) 1216 { 1217 active_count--; /* Remove non-match possibility */ 1218 next_active_state--; 1219 } 1220 count++; 1221 ADD_NEW_DATA(-state_offset, count, ncount); 1222 break; 1223 1224 default: 1225 break; 1226 } 1227 } 1228 break; 1229 1230 /*-----------------------------------------------------------------*/ 1231 case OP_VSPACE_EXTRA + OP_TYPEPLUS: 1232 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: 1233 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: 1234 count = current_state->count; /* Already matched */ 1235 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1236 if (clen > 0) 1237 { 1238 BOOL OK; 1239 switch (c) 1240 { 1241 case 0x000a: 1242 case 0x000b: 1243 case 0x000c: 1244 case 0x000d: 1245 case 0x0085: 1246 case 0x2028: 1247 case 0x2029: 1248 OK = TRUE; 1249 break; 1250 1251 default: 1252 OK = FALSE; 1253 break; 1254 } 1255 1256 if (OK == (d == OP_VSPACE)) 1257 { 1258 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) 1259 { 1260 active_count--; /* Remove non-match possibility */ 1261 next_active_state--; 1262 } 1263 count++; 1264 ADD_NEW_DATA(-state_offset, count, 0); 1265 } 1266 } 1267 break; 1268 1269 
/*-----------------------------------------------------------------*/ 1270 case OP_HSPACE_EXTRA + OP_TYPEPLUS: 1271 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: 1272 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: 1273 count = current_state->count; /* Already matched */ 1274 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1275 if (clen > 0) 1276 { 1277 BOOL OK; 1278 switch (c) 1279 { 1280 case 0x09: /* HT */ 1281 case 0x20: /* SPACE */ 1282 case 0xa0: /* NBSP */ 1283 case 0x1680: /* OGHAM SPACE MARK */ 1284 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1285 case 0x2000: /* EN QUAD */ 1286 case 0x2001: /* EM QUAD */ 1287 case 0x2002: /* EN SPACE */ 1288 case 0x2003: /* EM SPACE */ 1289 case 0x2004: /* THREE-PER-EM SPACE */ 1290 case 0x2005: /* FOUR-PER-EM SPACE */ 1291 case 0x2006: /* SIX-PER-EM SPACE */ 1292 case 0x2007: /* FIGURE SPACE */ 1293 case 0x2008: /* PUNCTUATION SPACE */ 1294 case 0x2009: /* THIN SPACE */ 1295 case 0x200A: /* HAIR SPACE */ 1296 case 0x202f: /* NARROW NO-BREAK SPACE */ 1297 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1298 case 0x3000: /* IDEOGRAPHIC SPACE */ 1299 OK = TRUE; 1300 break; 1301 1302 default: 1303 OK = FALSE; 1304 break; 1305 } 1306 1307 if (OK == (d == OP_HSPACE)) 1308 { 1309 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) 1310 { 1311 active_count--; /* Remove non-match possibility */ 1312 next_active_state--; 1313 } 1314 count++; 1315 ADD_NEW_DATA(-state_offset, count, 0); 1316 } 1317 } 1318 break; 1319 1320 /*-----------------------------------------------------------------*/ 1321#ifdef SUPPORT_UCP 1322 case OP_PROP_EXTRA + OP_TYPEQUERY: 1323 case OP_PROP_EXTRA + OP_TYPEMINQUERY: 1324 case OP_PROP_EXTRA + OP_TYPEPOSQUERY: 1325 count = 4; 1326 goto QS1; 1327 1328 case OP_PROP_EXTRA + OP_TYPESTAR: 1329 case OP_PROP_EXTRA + OP_TYPEMINSTAR: 1330 case OP_PROP_EXTRA + OP_TYPEPOSSTAR: 1331 count = 0; 1332 1333 QS1: 1334 1335 ADD_ACTIVE(state_offset + 4, 0); 1336 if (clen > 0) 1337 { 1338 BOOL OK; 1339 const ucd_record * prop = 
GET_UCD(c); 1340 switch(code[2]) 1341 { 1342 case PT_ANY: 1343 OK = TRUE; 1344 break; 1345 1346 case PT_LAMP: 1347 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; 1348 break; 1349 1350 case PT_GC: 1351 OK = _pcre_ucp_gentype[prop->chartype] == code[3]; 1352 break; 1353 1354 case PT_PC: 1355 OK = prop->chartype == code[3]; 1356 break; 1357 1358 case PT_SC: 1359 OK = prop->script == code[3]; 1360 break; 1361 1362 /* Should never occur, but keep compilers from grumbling. */ 1363 1364 default: 1365 OK = codevalue != OP_PROP; 1366 break; 1367 } 1368 1369 if (OK == (d == OP_PROP)) 1370 { 1371 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || 1372 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) 1373 { 1374 active_count--; /* Remove non-match possibility */ 1375 next_active_state--; 1376 } 1377 ADD_NEW(state_offset + count, 0); 1378 } 1379 } 1380 break; 1381 1382 /*-----------------------------------------------------------------*/ 1383 case OP_EXTUNI_EXTRA + OP_TYPEQUERY: 1384 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: 1385 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: 1386 count = 2; 1387 goto QS2; 1388 1389 case OP_EXTUNI_EXTRA + OP_TYPESTAR: 1390 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: 1391 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: 1392 count = 0; 1393 1394 QS2: 1395 1396 ADD_ACTIVE(state_offset + 2, 0); 1397 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 1398 { 1399 const uschar *nptr = ptr + clen; 1400 int ncount = 0; 1401 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || 1402 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) 1403 { 1404 active_count--; /* Remove non-match possibility */ 1405 next_active_state--; 1406 } 1407 while (nptr < end_subject) 1408 { 1409 int nd; 1410 int ndlen = 1; 1411 GETCHARLEN(nd, nptr, ndlen); 1412 if (UCD_CATEGORY(nd) != ucp_M) break; 1413 ncount++; 1414 nptr += ndlen; 1415 } 1416 ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1417 } 1418 break; 1419#endif 1420 1421 
/*-----------------------------------------------------------------*/ 1422 case OP_ANYNL_EXTRA + OP_TYPEQUERY: 1423 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: 1424 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: 1425 count = 2; 1426 goto QS3; 1427 1428 case OP_ANYNL_EXTRA + OP_TYPESTAR: 1429 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: 1430 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: 1431 count = 0; 1432 1433 QS3: 1434 ADD_ACTIVE(state_offset + 2, 0); 1435 if (clen > 0) 1436 { 1437 int ncount = 0; 1438 switch (c) 1439 { 1440 case 0x000b: 1441 case 0x000c: 1442 case 0x0085: 1443 case 0x2028: 1444 case 0x2029: 1445 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1446 goto ANYNL02; 1447 1448 case 0x000d: 1449 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; 1450 /* Fall through */ 1451 1452 ANYNL02: 1453 case 0x000a: 1454 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || 1455 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) 1456 { 1457 active_count--; /* Remove non-match possibility */ 1458 next_active_state--; 1459 } 1460 ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1461 break; 1462 1463 default: 1464 break; 1465 } 1466 } 1467 break; 1468 1469 /*-----------------------------------------------------------------*/ 1470 case OP_VSPACE_EXTRA + OP_TYPEQUERY: 1471 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: 1472 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: 1473 count = 2; 1474 goto QS4; 1475 1476 case OP_VSPACE_EXTRA + OP_TYPESTAR: 1477 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: 1478 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: 1479 count = 0; 1480 1481 QS4: 1482 ADD_ACTIVE(state_offset + 2, 0); 1483 if (clen > 0) 1484 { 1485 BOOL OK; 1486 switch (c) 1487 { 1488 case 0x000a: 1489 case 0x000b: 1490 case 0x000c: 1491 case 0x000d: 1492 case 0x0085: 1493 case 0x2028: 1494 case 0x2029: 1495 OK = TRUE; 1496 break; 1497 1498 default: 1499 OK = FALSE; 1500 break; 1501 } 1502 if (OK == (d == OP_VSPACE)) 1503 { 1504 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || 1505 codevalue == OP_VSPACE_EXTRA + 
OP_TYPEPOSQUERY) 1506 { 1507 active_count--; /* Remove non-match possibility */ 1508 next_active_state--; 1509 } 1510 ADD_NEW_DATA(-(state_offset + count), 0, 0); 1511 } 1512 } 1513 break; 1514 1515 /*-----------------------------------------------------------------*/ 1516 case OP_HSPACE_EXTRA + OP_TYPEQUERY: 1517 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: 1518 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: 1519 count = 2; 1520 goto QS5; 1521 1522 case OP_HSPACE_EXTRA + OP_TYPESTAR: 1523 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: 1524 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: 1525 count = 0; 1526 1527 QS5: 1528 ADD_ACTIVE(state_offset + 2, 0); 1529 if (clen > 0) 1530 { 1531 BOOL OK; 1532 switch (c) 1533 { 1534 case 0x09: /* HT */ 1535 case 0x20: /* SPACE */ 1536 case 0xa0: /* NBSP */ 1537 case 0x1680: /* OGHAM SPACE MARK */ 1538 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1539 case 0x2000: /* EN QUAD */ 1540 case 0x2001: /* EM QUAD */ 1541 case 0x2002: /* EN SPACE */ 1542 case 0x2003: /* EM SPACE */ 1543 case 0x2004: /* THREE-PER-EM SPACE */ 1544 case 0x2005: /* FOUR-PER-EM SPACE */ 1545 case 0x2006: /* SIX-PER-EM SPACE */ 1546 case 0x2007: /* FIGURE SPACE */ 1547 case 0x2008: /* PUNCTUATION SPACE */ 1548 case 0x2009: /* THIN SPACE */ 1549 case 0x200A: /* HAIR SPACE */ 1550 case 0x202f: /* NARROW NO-BREAK SPACE */ 1551 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1552 case 0x3000: /* IDEOGRAPHIC SPACE */ 1553 OK = TRUE; 1554 break; 1555 1556 default: 1557 OK = FALSE; 1558 break; 1559 } 1560 1561 if (OK == (d == OP_HSPACE)) 1562 { 1563 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || 1564 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) 1565 { 1566 active_count--; /* Remove non-match possibility */ 1567 next_active_state--; 1568 } 1569 ADD_NEW_DATA(-(state_offset + count), 0, 0); 1570 } 1571 } 1572 break; 1573 1574 /*-----------------------------------------------------------------*/ 1575#ifdef SUPPORT_UCP 1576 case OP_PROP_EXTRA + OP_TYPEEXACT: 1577 case OP_PROP_EXTRA + 
OP_TYPEUPTO: 1578 case OP_PROP_EXTRA + OP_TYPEMINUPTO: 1579 case OP_PROP_EXTRA + OP_TYPEPOSUPTO: 1580 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) 1581 { ADD_ACTIVE(state_offset + 6, 0); } 1582 count = current_state->count; /* Number already matched */ 1583 if (clen > 0) 1584 { 1585 BOOL OK; 1586 const ucd_record * prop = GET_UCD(c); 1587 switch(code[4]) 1588 { 1589 case PT_ANY: 1590 OK = TRUE; 1591 break; 1592 1593 case PT_LAMP: 1594 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; 1595 break; 1596 1597 case PT_GC: 1598 OK = _pcre_ucp_gentype[prop->chartype] == code[5]; 1599 break; 1600 1601 case PT_PC: 1602 OK = prop->chartype == code[5]; 1603 break; 1604 1605 case PT_SC: 1606 OK = prop->script == code[5]; 1607 break; 1608 1609 /* Should never occur, but keep compilers from grumbling. */ 1610 1611 default: 1612 OK = codevalue != OP_PROP; 1613 break; 1614 } 1615 1616 if (OK == (d == OP_PROP)) 1617 { 1618 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) 1619 { 1620 active_count--; /* Remove non-match possibility */ 1621 next_active_state--; 1622 } 1623 if (++count >= GET2(code, 1)) 1624 { ADD_NEW(state_offset + 6, 0); } 1625 else 1626 { ADD_NEW(state_offset, count); } 1627 } 1628 } 1629 break; 1630 1631 /*-----------------------------------------------------------------*/ 1632 case OP_EXTUNI_EXTRA + OP_TYPEEXACT: 1633 case OP_EXTUNI_EXTRA + OP_TYPEUPTO: 1634 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: 1635 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: 1636 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) 1637 { ADD_ACTIVE(state_offset + 4, 0); } 1638 count = current_state->count; /* Number already matched */ 1639 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 1640 { 1641 const uschar *nptr = ptr + clen; 1642 int ncount = 0; 1643 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) 1644 { 1645 active_count--; /* Remove non-match possibility */ 1646 next_active_state--; 1647 } 1648 while (nptr < end_subject) 1649 { 1650 int nd; 1651 int ndlen = 
1; 1652 GETCHARLEN(nd, nptr, ndlen); 1653 if (UCD_CATEGORY(nd) != ucp_M) break; 1654 ncount++; 1655 nptr += ndlen; 1656 } 1657 if (++count >= GET2(code, 1)) 1658 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } 1659 else 1660 { ADD_NEW_DATA(-state_offset, count, ncount); } 1661 } 1662 break; 1663#endif 1664 1665 /*-----------------------------------------------------------------*/ 1666 case OP_ANYNL_EXTRA + OP_TYPEEXACT: 1667 case OP_ANYNL_EXTRA + OP_TYPEUPTO: 1668 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: 1669 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: 1670 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) 1671 { ADD_ACTIVE(state_offset + 4, 0); } 1672 count = current_state->count; /* Number already matched */ 1673 if (clen > 0) 1674 { 1675 int ncount = 0; 1676 switch (c) 1677 { 1678 case 0x000b: 1679 case 0x000c: 1680 case 0x0085: 1681 case 0x2028: 1682 case 0x2029: 1683 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1684 goto ANYNL03; 1685 1686 case 0x000d: 1687 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; 1688 /* Fall through */ 1689 1690 ANYNL03: 1691 case 0x000a: 1692 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) 1693 { 1694 active_count--; /* Remove non-match possibility */ 1695 next_active_state--; 1696 } 1697 if (++count >= GET2(code, 1)) 1698 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } 1699 else 1700 { ADD_NEW_DATA(-state_offset, count, ncount); } 1701 break; 1702 1703 default: 1704 break; 1705 } 1706 } 1707 break; 1708 1709 /*-----------------------------------------------------------------*/ 1710 case OP_VSPACE_EXTRA + OP_TYPEEXACT: 1711 case OP_VSPACE_EXTRA + OP_TYPEUPTO: 1712 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: 1713 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: 1714 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) 1715 { ADD_ACTIVE(state_offset + 4, 0); } 1716 count = current_state->count; /* Number already matched */ 1717 if (clen > 0) 1718 { 1719 BOOL OK; 1720 switch (c) 1721 { 1722 case 0x000a: 1723 case 0x000b: 1724 case 0x000c: 1725 
case 0x000d: 1726 case 0x0085: 1727 case 0x2028: 1728 case 0x2029: 1729 OK = TRUE; 1730 break; 1731 1732 default: 1733 OK = FALSE; 1734 } 1735 1736 if (OK == (d == OP_VSPACE)) 1737 { 1738 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) 1739 { 1740 active_count--; /* Remove non-match possibility */ 1741 next_active_state--; 1742 } 1743 if (++count >= GET2(code, 1)) 1744 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } 1745 else 1746 { ADD_NEW_DATA(-state_offset, count, 0); } 1747 } 1748 } 1749 break; 1750 1751 /*-----------------------------------------------------------------*/ 1752 case OP_HSPACE_EXTRA + OP_TYPEEXACT: 1753 case OP_HSPACE_EXTRA + OP_TYPEUPTO: 1754 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: 1755 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: 1756 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) 1757 { ADD_ACTIVE(state_offset + 4, 0); } 1758 count = current_state->count; /* Number already matched */ 1759 if (clen > 0) 1760 { 1761 BOOL OK; 1762 switch (c) 1763 { 1764 case 0x09: /* HT */ 1765 case 0x20: /* SPACE */ 1766 case 0xa0: /* NBSP */ 1767 case 0x1680: /* OGHAM SPACE MARK */ 1768 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1769 case 0x2000: /* EN QUAD */ 1770 case 0x2001: /* EM QUAD */ 1771 case 0x2002: /* EN SPACE */ 1772 case 0x2003: /* EM SPACE */ 1773 case 0x2004: /* THREE-PER-EM SPACE */ 1774 case 0x2005: /* FOUR-PER-EM SPACE */ 1775 case 0x2006: /* SIX-PER-EM SPACE */ 1776 case 0x2007: /* FIGURE SPACE */ 1777 case 0x2008: /* PUNCTUATION SPACE */ 1778 case 0x2009: /* THIN SPACE */ 1779 case 0x200A: /* HAIR SPACE */ 1780 case 0x202f: /* NARROW NO-BREAK SPACE */ 1781 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1782 case 0x3000: /* IDEOGRAPHIC SPACE */ 1783 OK = TRUE; 1784 break; 1785 1786 default: 1787 OK = FALSE; 1788 break; 1789 } 1790 1791 if (OK == (d == OP_HSPACE)) 1792 { 1793 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) 1794 { 1795 active_count--; /* Remove non-match possibility */ 1796 next_active_state--; 1797 } 1798 if (++count >= 
GET2(code, 1)) 1799 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } 1800 else 1801 { ADD_NEW_DATA(-state_offset, count, 0); } 1802 } 1803 } 1804 break; 1805 1806/* ========================================================================== */ 1807 /* These opcodes are followed by a character that is usually compared 1808 to the current subject character; it is loaded into d. We still get 1809 here even if there is no subject character, because in some cases zero 1810 repetitions are permitted. */ 1811 1812 /*-----------------------------------------------------------------*/ 1813 case OP_CHAR: 1814 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } 1815 break; 1816 1817 /*-----------------------------------------------------------------*/ 1818 case OP_CHARNC: 1819 if (clen == 0) break; 1820 1821#ifdef SUPPORT_UTF8 1822 if (utf8) 1823 { 1824 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else 1825 { 1826 unsigned int othercase; 1827 if (c < 128) othercase = fcc[c]; else 1828 1829 /* If we have Unicode property support, we can use it to test the 1830 other case of the character. */ 1831 1832#ifdef SUPPORT_UCP 1833 othercase = UCD_OTHERCASE(c); 1834#else 1835 othercase = NOTACHAR; 1836#endif 1837 1838 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } 1839 } 1840 } 1841 else 1842#endif /* SUPPORT_UTF8 */ 1843 1844 /* Non-UTF-8 mode */ 1845 { 1846 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } 1847 } 1848 break; 1849 1850 1851#ifdef SUPPORT_UCP 1852 /*-----------------------------------------------------------------*/ 1853 /* This is a tricky one because it can match more than one character. 1854 Find out how many characters to skip, and then set up a negative state 1855 to wait for them to pass before continuing. 
*/ 1856 1857 case OP_EXTUNI: 1858 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 1859 { 1860 const uschar *nptr = ptr + clen; 1861 int ncount = 0; 1862 while (nptr < end_subject) 1863 { 1864 int nclen = 1; 1865 GETCHARLEN(c, nptr, nclen); 1866 if (UCD_CATEGORY(c) != ucp_M) break; 1867 ncount++; 1868 nptr += nclen; 1869 } 1870 ADD_NEW_DATA(-(state_offset + 1), 0, ncount); 1871 } 1872 break; 1873#endif 1874 1875 /*-----------------------------------------------------------------*/ 1876 /* This is a tricky like EXTUNI because it too can match more than one 1877 character (when CR is followed by LF). In this case, set up a negative 1878 state to wait for one character to pass before continuing. */ 1879 1880 case OP_ANYNL: 1881 if (clen > 0) switch(c) 1882 { 1883 case 0x000b: 1884 case 0x000c: 1885 case 0x0085: 1886 case 0x2028: 1887 case 0x2029: 1888 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1889 1890 case 0x000a: 1891 ADD_NEW(state_offset + 1, 0); 1892 break; 1893 1894 case 0x000d: 1895 if (ptr + 1 < end_subject && ptr[1] == 0x0a) 1896 { 1897 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 1898 } 1899 else 1900 { 1901 ADD_NEW(state_offset + 1, 0); 1902 } 1903 break; 1904 } 1905 break; 1906 1907 /*-----------------------------------------------------------------*/ 1908 case OP_NOT_VSPACE: 1909 if (clen > 0) switch(c) 1910 { 1911 case 0x000a: 1912 case 0x000b: 1913 case 0x000c: 1914 case 0x000d: 1915 case 0x0085: 1916 case 0x2028: 1917 case 0x2029: 1918 break; 1919 1920 default: 1921 ADD_NEW(state_offset + 1, 0); 1922 break; 1923 } 1924 break; 1925 1926 /*-----------------------------------------------------------------*/ 1927 case OP_VSPACE: 1928 if (clen > 0) switch(c) 1929 { 1930 case 0x000a: 1931 case 0x000b: 1932 case 0x000c: 1933 case 0x000d: 1934 case 0x0085: 1935 case 0x2028: 1936 case 0x2029: 1937 ADD_NEW(state_offset + 1, 0); 1938 break; 1939 1940 default: break; 1941 } 1942 break; 1943 1944 /*-----------------------------------------------------------------*/ 
1945 case OP_NOT_HSPACE: 1946 if (clen > 0) switch(c) 1947 { 1948 case 0x09: /* HT */ 1949 case 0x20: /* SPACE */ 1950 case 0xa0: /* NBSP */ 1951 case 0x1680: /* OGHAM SPACE MARK */ 1952 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1953 case 0x2000: /* EN QUAD */ 1954 case 0x2001: /* EM QUAD */ 1955 case 0x2002: /* EN SPACE */ 1956 case 0x2003: /* EM SPACE */ 1957 case 0x2004: /* THREE-PER-EM SPACE */ 1958 case 0x2005: /* FOUR-PER-EM SPACE */ 1959 case 0x2006: /* SIX-PER-EM SPACE */ 1960 case 0x2007: /* FIGURE SPACE */ 1961 case 0x2008: /* PUNCTUATION SPACE */ 1962 case 0x2009: /* THIN SPACE */ 1963 case 0x200A: /* HAIR SPACE */ 1964 case 0x202f: /* NARROW NO-BREAK SPACE */ 1965 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1966 case 0x3000: /* IDEOGRAPHIC SPACE */ 1967 break; 1968 1969 default: 1970 ADD_NEW(state_offset + 1, 0); 1971 break; 1972 } 1973 break; 1974 1975 /*-----------------------------------------------------------------*/ 1976 case OP_HSPACE: 1977 if (clen > 0) switch(c) 1978 { 1979 case 0x09: /* HT */ 1980 case 0x20: /* SPACE */ 1981 case 0xa0: /* NBSP */ 1982 case 0x1680: /* OGHAM SPACE MARK */ 1983 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1984 case 0x2000: /* EN QUAD */ 1985 case 0x2001: /* EM QUAD */ 1986 case 0x2002: /* EN SPACE */ 1987 case 0x2003: /* EM SPACE */ 1988 case 0x2004: /* THREE-PER-EM SPACE */ 1989 case 0x2005: /* FOUR-PER-EM SPACE */ 1990 case 0x2006: /* SIX-PER-EM SPACE */ 1991 case 0x2007: /* FIGURE SPACE */ 1992 case 0x2008: /* PUNCTUATION SPACE */ 1993 case 0x2009: /* THIN SPACE */ 1994 case 0x200A: /* HAIR SPACE */ 1995 case 0x202f: /* NARROW NO-BREAK SPACE */ 1996 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1997 case 0x3000: /* IDEOGRAPHIC SPACE */ 1998 ADD_NEW(state_offset + 1, 0); 1999 break; 2000 } 2001 break; 2002 2003 /*-----------------------------------------------------------------*/ 2004 /* Match a negated single character. This is only used for one-byte 2005 characters, that is, we know that d < 256. 
The character we are 2006 checking (c) can be multibyte. */ 2007 2008 case OP_NOT: 2009 if (clen > 0) 2010 { 2011 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d; 2012 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); } 2013 } 2014 break; 2015 2016 /*-----------------------------------------------------------------*/ 2017 case OP_PLUS: 2018 case OP_MINPLUS: 2019 case OP_POSPLUS: 2020 case OP_NOTPLUS: 2021 case OP_NOTMINPLUS: 2022 case OP_NOTPOSPLUS: 2023 count = current_state->count; /* Already matched */ 2024 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } 2025 if (clen > 0) 2026 { 2027 unsigned int otherd = NOTACHAR; 2028 if ((ims & PCRE_CASELESS) != 0) 2029 { 2030#ifdef SUPPORT_UTF8 2031 if (utf8 && d >= 128) 2032 { 2033#ifdef SUPPORT_UCP 2034 otherd = UCD_OTHERCASE(d); 2035#endif /* SUPPORT_UCP */ 2036 } 2037 else 2038#endif /* SUPPORT_UTF8 */ 2039 otherd = fcc[d]; 2040 } 2041 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2042 { 2043 if (count > 0 && 2044 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) 2045 { 2046 active_count--; /* Remove non-match possibility */ 2047 next_active_state--; 2048 } 2049 count++; 2050 ADD_NEW(state_offset, count); 2051 } 2052 } 2053 break; 2054 2055 /*-----------------------------------------------------------------*/ 2056 case OP_QUERY: 2057 case OP_MINQUERY: 2058 case OP_POSQUERY: 2059 case OP_NOTQUERY: 2060 case OP_NOTMINQUERY: 2061 case OP_NOTPOSQUERY: 2062 ADD_ACTIVE(state_offset + dlen + 1, 0); 2063 if (clen > 0) 2064 { 2065 unsigned int otherd = NOTACHAR; 2066 if ((ims & PCRE_CASELESS) != 0) 2067 { 2068#ifdef SUPPORT_UTF8 2069 if (utf8 && d >= 128) 2070 { 2071#ifdef SUPPORT_UCP 2072 otherd = UCD_OTHERCASE(d); 2073#endif /* SUPPORT_UCP */ 2074 } 2075 else 2076#endif /* SUPPORT_UTF8 */ 2077 otherd = fcc[d]; 2078 } 2079 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2080 { 2081 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) 2082 { 2083 
active_count--; /* Remove non-match possibility */ 2084 next_active_state--; 2085 } 2086 ADD_NEW(state_offset + dlen + 1, 0); 2087 } 2088 } 2089 break; 2090 2091 /*-----------------------------------------------------------------*/ 2092 case OP_STAR: 2093 case OP_MINSTAR: 2094 case OP_POSSTAR: 2095 case OP_NOTSTAR: 2096 case OP_NOTMINSTAR: 2097 case OP_NOTPOSSTAR: 2098 ADD_ACTIVE(state_offset + dlen + 1, 0); 2099 if (clen > 0) 2100 { 2101 unsigned int otherd = NOTACHAR; 2102 if ((ims & PCRE_CASELESS) != 0) 2103 { 2104#ifdef SUPPORT_UTF8 2105 if (utf8 && d >= 128) 2106 { 2107#ifdef SUPPORT_UCP 2108 otherd = UCD_OTHERCASE(d); 2109#endif /* SUPPORT_UCP */ 2110 } 2111 else 2112#endif /* SUPPORT_UTF8 */ 2113 otherd = fcc[d]; 2114 } 2115 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2116 { 2117 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) 2118 { 2119 active_count--; /* Remove non-match possibility */ 2120 next_active_state--; 2121 } 2122 ADD_NEW(state_offset, 0); 2123 } 2124 } 2125 break; 2126 2127 /*-----------------------------------------------------------------*/ 2128 case OP_EXACT: 2129 case OP_NOTEXACT: 2130 count = current_state->count; /* Number already matched */ 2131 if (clen > 0) 2132 { 2133 unsigned int otherd = NOTACHAR; 2134 if ((ims & PCRE_CASELESS) != 0) 2135 { 2136#ifdef SUPPORT_UTF8 2137 if (utf8 && d >= 128) 2138 { 2139#ifdef SUPPORT_UCP 2140 otherd = UCD_OTHERCASE(d); 2141#endif /* SUPPORT_UCP */ 2142 } 2143 else 2144#endif /* SUPPORT_UTF8 */ 2145 otherd = fcc[d]; 2146 } 2147 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2148 { 2149 if (++count >= GET2(code, 1)) 2150 { ADD_NEW(state_offset + dlen + 3, 0); } 2151 else 2152 { ADD_NEW(state_offset, count); } 2153 } 2154 } 2155 break; 2156 2157 /*-----------------------------------------------------------------*/ 2158 case OP_UPTO: 2159 case OP_MINUPTO: 2160 case OP_POSUPTO: 2161 case OP_NOTUPTO: 2162 case OP_NOTMINUPTO: 2163 case OP_NOTPOSUPTO: 2164 
ADD_ACTIVE(state_offset + dlen + 3, 0); 2165 count = current_state->count; /* Number already matched */ 2166 if (clen > 0) 2167 { 2168 unsigned int otherd = NOTACHAR; 2169 if ((ims & PCRE_CASELESS) != 0) 2170 { 2171#ifdef SUPPORT_UTF8 2172 if (utf8 && d >= 128) 2173 { 2174#ifdef SUPPORT_UCP 2175 otherd = UCD_OTHERCASE(d); 2176#endif /* SUPPORT_UCP */ 2177 } 2178 else 2179#endif /* SUPPORT_UTF8 */ 2180 otherd = fcc[d]; 2181 } 2182 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2183 { 2184 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) 2185 { 2186 active_count--; /* Remove non-match possibility */ 2187 next_active_state--; 2188 } 2189 if (++count >= GET2(code, 1)) 2190 { ADD_NEW(state_offset + dlen + 3, 0); } 2191 else 2192 { ADD_NEW(state_offset, count); } 2193 } 2194 } 2195 break; 2196 2197 2198/* ========================================================================== */ 2199 /* These are the class-handling opcodes */ 2200 2201 case OP_CLASS: 2202 case OP_NCLASS: 2203 case OP_XCLASS: 2204 { 2205 BOOL isinclass = FALSE; 2206 int next_state_offset; 2207 const uschar *ecode; 2208 2209 /* For a simple class, there is always just a 32-byte table, and we 2210 can set isinclass from it. */ 2211 2212 if (codevalue != OP_XCLASS) 2213 { 2214 ecode = code + 33; 2215 if (clen > 0) 2216 { 2217 isinclass = (c > 255)? (codevalue == OP_NCLASS) : 2218 ((code[1 + c/8] & (1 << (c&7))) != 0); 2219 } 2220 } 2221 2222 /* An extended class may have a table or a list of single characters, 2223 ranges, or both, and it may be positive or negative. There's a 2224 function that sorts all this out. */ 2225 2226 else 2227 { 2228 ecode = code + GET(code, 1); 2229 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE); 2230 } 2231 2232 /* At this point, isinclass is set for all kinds of class, and ecode 2233 points to the byte after the end of the class. If there is a 2234 quantifier, this is where it will be. 
*/ 2235 2236 next_state_offset = ecode - start_code; 2237 2238 switch (*ecode) 2239 { 2240 case OP_CRSTAR: 2241 case OP_CRMINSTAR: 2242 ADD_ACTIVE(next_state_offset + 1, 0); 2243 if (isinclass) { ADD_NEW(state_offset, 0); } 2244 break; 2245 2246 case OP_CRPLUS: 2247 case OP_CRMINPLUS: 2248 count = current_state->count; /* Already matched */ 2249 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } 2250 if (isinclass) { count++; ADD_NEW(state_offset, count); } 2251 break; 2252 2253 case OP_CRQUERY: 2254 case OP_CRMINQUERY: 2255 ADD_ACTIVE(next_state_offset + 1, 0); 2256 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } 2257 break; 2258 2259 case OP_CRRANGE: 2260 case OP_CRMINRANGE: 2261 count = current_state->count; /* Already matched */ 2262 if (count >= GET2(ecode, 1)) 2263 { ADD_ACTIVE(next_state_offset + 5, 0); } 2264 if (isinclass) 2265 { 2266 int max = GET2(ecode, 3); 2267 if (++count >= max && max != 0) /* Max 0 => no limit */ 2268 { ADD_NEW(next_state_offset + 5, 0); } 2269 else 2270 { ADD_NEW(state_offset, count); } 2271 } 2272 break; 2273 2274 default: 2275 if (isinclass) { ADD_NEW(next_state_offset, 0); } 2276 break; 2277 } 2278 } 2279 break; 2280 2281/* ========================================================================== */ 2282 /* These are the opcodes for fancy brackets of various kinds. We have 2283 to use recursion in order to handle them. The "always failing" assertion 2284 (?!) is optimised to OP_FAIL when compiling, so we have to support that, 2285 though the other "backtracking verbs" are not supported. 
*/ 2286 2287 case OP_FAIL: 2288 forced_fail++; /* Count FAILs for multiple states */ 2289 break; 2290 2291 case OP_ASSERT: 2292 case OP_ASSERT_NOT: 2293 case OP_ASSERTBACK: 2294 case OP_ASSERTBACK_NOT: 2295 { 2296 int rc; 2297 int local_offsets[2]; 2298 int local_workspace[1000]; 2299 const uschar *endasscode = code + GET(code, 1); 2300 2301 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2302 2303 rc = internal_dfa_exec( 2304 md, /* static match data */ 2305 code, /* this subexpression's code */ 2306 ptr, /* where we currently are */ 2307 ptr - start_subject, /* start offset */ 2308 local_offsets, /* offset vector */ 2309 sizeof(local_offsets)/sizeof(int), /* size of same */ 2310 local_workspace, /* workspace vector */ 2311 sizeof(local_workspace)/sizeof(int), /* size of same */ 2312 ims, /* the current ims flags */ 2313 rlevel, /* function recursion level */ 2314 recursing); /* pass on regex recursion */ 2315 2316 if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2317 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) 2318 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } 2319 } 2320 break; 2321 2322 /*-----------------------------------------------------------------*/ 2323 case OP_COND: 2324 case OP_SCOND: 2325 { 2326 int local_offsets[1000]; 2327 int local_workspace[1000]; 2328 int codelink = GET(code, 1); 2329 int condcode; 2330 2331 /* Because of the way auto-callout works during compile, a callout item 2332 is inserted between OP_COND and an assertion condition. This does not 2333 happen for the other conditions. 
*/ 2334 2335 if (code[LINK_SIZE+1] == OP_CALLOUT) 2336 { 2337 rrc = 0; 2338 if (pcre_callout != NULL) 2339 { 2340 pcre_callout_block cb; 2341 cb.version = 1; /* Version 1 of the callout block */ 2342 cb.callout_number = code[LINK_SIZE+2]; 2343 cb.offset_vector = offsets; 2344 cb.subject = (PCRE_SPTR)start_subject; 2345 cb.subject_length = end_subject - start_subject; 2346 cb.start_match = current_subject - start_subject; 2347 cb.current_position = ptr - start_subject; 2348 cb.pattern_position = GET(code, LINK_SIZE + 3); 2349 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); 2350 cb.capture_top = 1; 2351 cb.capture_last = -1; 2352 cb.callout_data = md->callout_data; 2353 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ 2354 } 2355 if (rrc > 0) break; /* Fail this thread */ 2356 code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */ 2357 } 2358 2359 condcode = code[LINK_SIZE+1]; 2360 2361 /* Back reference conditions are not supported */ 2362 2363 if (condcode == OP_CREF || condcode == OP_NCREF) 2364 return PCRE_ERROR_DFA_UCOND; 2365 2366 /* The DEFINE condition is always false */ 2367 2368 if (condcode == OP_DEF) 2369 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2370 2371 /* The only supported version of OP_RREF is for the value RREF_ANY, 2372 which means "test if in any recursion". We can't test for specifically 2373 recursed groups. 
*/ 2374 2375 else if (condcode == OP_RREF || condcode == OP_NRREF) 2376 { 2377 int value = GET2(code, LINK_SIZE+2); 2378 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; 2379 if (recursing > 0) 2380 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } 2381 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2382 } 2383 2384 /* Otherwise, the condition is an assertion */ 2385 2386 else 2387 { 2388 int rc; 2389 const uschar *asscode = code + LINK_SIZE + 1; 2390 const uschar *endasscode = asscode + GET(asscode, 1); 2391 2392 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2393 2394 rc = internal_dfa_exec( 2395 md, /* fixed match data */ 2396 asscode, /* this subexpression's code */ 2397 ptr, /* where we currently are */ 2398 ptr - start_subject, /* start offset */ 2399 local_offsets, /* offset vector */ 2400 sizeof(local_offsets)/sizeof(int), /* size of same */ 2401 local_workspace, /* workspace vector */ 2402 sizeof(local_workspace)/sizeof(int), /* size of same */ 2403 ims, /* the current ims flags */ 2404 rlevel, /* function recursion level */ 2405 recursing); /* pass on regex recursion */ 2406 2407 if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2408 if ((rc >= 0) == 2409 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) 2410 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } 2411 else 2412 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2413 } 2414 } 2415 break; 2416 2417 /*-----------------------------------------------------------------*/ 2418 case OP_RECURSE: 2419 { 2420 int local_offsets[1000]; 2421 int local_workspace[1000]; 2422 int rc; 2423 2424 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP, 2425 recursing + 1)); 2426 2427 rc = internal_dfa_exec( 2428 md, /* fixed match data */ 2429 start_code + GET(code, 1), /* this subexpression's code */ 2430 ptr, /* where we currently are */ 2431 ptr - start_subject, /* start offset */ 2432 local_offsets, /* offset vector */ 2433 
sizeof(local_offsets)/sizeof(int), /* size of same */ 2434 local_workspace, /* workspace vector */ 2435 sizeof(local_workspace)/sizeof(int), /* size of same */ 2436 ims, /* the current ims flags */ 2437 rlevel, /* function recursion level */ 2438 recursing + 1); /* regex recurse level */ 2439 2440 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP, 2441 recursing + 1, rc)); 2442 2443 /* Ran out of internal offsets */ 2444 2445 if (rc == 0) return PCRE_ERROR_DFA_RECURSE; 2446 2447 /* For each successful matched substring, set up the next state with a 2448 count of characters to skip before trying it. Note that the count is in 2449 characters, not bytes. */ 2450 2451 if (rc > 0) 2452 { 2453 for (rc = rc*2 - 2; rc >= 0; rc -= 2) 2454 { 2455 const uschar *p = start_subject + local_offsets[rc]; 2456 const uschar *pp = start_subject + local_offsets[rc+1]; 2457 int charcount = local_offsets[rc+1] - local_offsets[rc]; 2458 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; 2459 if (charcount > 0) 2460 { 2461 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); 2462 } 2463 else 2464 { 2465 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); 2466 } 2467 } 2468 } 2469 else if (rc != PCRE_ERROR_NOMATCH) return rc; 2470 } 2471 break; 2472 2473 /*-----------------------------------------------------------------*/ 2474 case OP_ONCE: 2475 { 2476 int local_offsets[2]; 2477 int local_workspace[1000]; 2478 2479 int rc = internal_dfa_exec( 2480 md, /* fixed match data */ 2481 code, /* this subexpression's code */ 2482 ptr, /* where we currently are */ 2483 ptr - start_subject, /* start offset */ 2484 local_offsets, /* offset vector */ 2485 sizeof(local_offsets)/sizeof(int), /* size of same */ 2486 local_workspace, /* workspace vector */ 2487 sizeof(local_workspace)/sizeof(int), /* size of same */ 2488 ims, /* the current ims flags */ 2489 rlevel, /* function recursion level */ 2490 recursing); /* pass on regex recursion */ 2491 2492 if (rc >= 0) 2493 { 
2494 const uschar *end_subpattern = code; 2495 int charcount = local_offsets[1] - local_offsets[0]; 2496 int next_state_offset, repeat_state_offset; 2497 2498 do { end_subpattern += GET(end_subpattern, 1); } 2499 while (*end_subpattern == OP_ALT); 2500 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; 2501 2502 /* If the end of this subpattern is KETRMAX or KETRMIN, we must 2503 arrange for the repeat state also to be added to the relevant list. 2504 Calculate the offset, or set -1 for no repeat. */ 2505 2506 repeat_state_offset = (*end_subpattern == OP_KETRMAX || 2507 *end_subpattern == OP_KETRMIN)? 2508 end_subpattern - start_code - GET(end_subpattern, 1) : -1; 2509 2510 /* If we have matched an empty string, add the next state at the 2511 current character pointer. This is important so that the duplicate 2512 checking kicks in, which is what breaks infinite loops that match an 2513 empty string. */ 2514 2515 if (charcount == 0) 2516 { 2517 ADD_ACTIVE(next_state_offset, 0); 2518 } 2519 2520 /* Optimization: if there are no more active states, and there 2521 are no new states yet set up, then skip over the subject string 2522 right here, to save looping. Otherwise, set up the new state to swing 2523 into action when the end of the substring is reached. */ 2524 2525 else if (i + 1 >= active_count && new_count == 0) 2526 { 2527 ptr += charcount; 2528 clen = 0; 2529 ADD_NEW(next_state_offset, 0); 2530 2531 /* If we are adding a repeat state at the new character position, 2532 we must fudge things so that it is the only current state. 2533 Otherwise, it might be a duplicate of one we processed before, and 2534 that would cause it to be skipped. 
*/ 2535 2536 if (repeat_state_offset >= 0) 2537 { 2538 next_active_state = active_states; 2539 active_count = 0; 2540 i = -1; 2541 ADD_ACTIVE(repeat_state_offset, 0); 2542 } 2543 } 2544 else 2545 { 2546 const uschar *p = start_subject + local_offsets[0]; 2547 const uschar *pp = start_subject + local_offsets[1]; 2548 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; 2549 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); 2550 if (repeat_state_offset >= 0) 2551 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } 2552 } 2553 2554 } 2555 else if (rc != PCRE_ERROR_NOMATCH) return rc; 2556 } 2557 break; 2558 2559 2560/* ========================================================================== */ 2561 /* Handle callouts */ 2562 2563 case OP_CALLOUT: 2564 rrc = 0; 2565 if (pcre_callout != NULL) 2566 { 2567 pcre_callout_block cb; 2568 cb.version = 1; /* Version 1 of the callout block */ 2569 cb.callout_number = code[1]; 2570 cb.offset_vector = offsets; 2571 cb.subject = (PCRE_SPTR)start_subject; 2572 cb.subject_length = end_subject - start_subject; 2573 cb.start_match = current_subject - start_subject; 2574 cb.current_position = ptr - start_subject; 2575 cb.pattern_position = GET(code, 2); 2576 cb.next_item_length = GET(code, 2 + LINK_SIZE); 2577 cb.capture_top = 1; 2578 cb.capture_last = -1; 2579 cb.callout_data = md->callout_data; 2580 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ 2581 } 2582 if (rrc == 0) 2583 { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); } 2584 break; 2585 2586 2587/* ========================================================================== */ 2588 default: /* Unsupported opcode */ 2589 return PCRE_ERROR_DFA_UITEM; 2590 } 2591 2592 NEXT_ACTIVE_STATE: continue; 2593 2594 } /* End of loop scanning active states */ 2595 2596 /* We have finished the processing at the current subject character. 
If no 2597 new states have been set for the next character, we have found all the 2598 matches that we are going to find. If we are at the top level and partial 2599 matching has been requested, check for appropriate conditions. 2600 2601 The "forced_ fail" variable counts the number of (*F) encountered for the 2602 character. If it is equal to the original active_count (saved in 2603 workspace[1]) it means that (*F) was found on every active state. In this 2604 case we don't want to give a partial match. 2605 2606 The "could_continue" variable is true if a state could have continued but 2607 for the fact that the end of the subject was reached. */ 2608 2609 if (new_count <= 0) 2610 { 2611 if (rlevel == 1 && /* Top level, and */ 2612 could_continue && /* Some could go on */ 2613 forced_fail != workspace[1] && /* Not all forced fail & */ 2614 ( /* either... */ 2615 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ 2616 || /* or... */ 2617 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */ 2618 match_count < 0) /* no matches */ 2619 ) && /* And... */ 2620 ptr >= end_subject && /* Reached end of subject */ 2621 ptr > current_subject) /* Matched non-empty string */ 2622 { 2623 if (offsetcount >= 2) 2624 { 2625 offsets[0] = md->start_used_ptr - start_subject; 2626 offsets[1] = end_subject - start_subject; 2627 } 2628 match_count = PCRE_ERROR_PARTIAL; 2629 } 2630 2631 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 2632 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, 2633 rlevel*2-2, SP)); 2634 break; /* In effect, "return", but see the comment below */ 2635 } 2636 2637 /* One or more states are active for the next character. */ 2638 2639 ptr += clen; /* Advance to next subject character */ 2640 } /* Loop to move along the subject string */ 2641 2642/* Control gets here from "break" a few lines above. We do it this way because 2643if we use "return" above, we have compiler trouble. 
Some compilers warn if 2644there's nothing here because they think the function doesn't return a value. On 2645the other hand, if we put a dummy statement here, some more clever compilers 2646complain that it can't be reached. Sigh. */ 2647 2648return match_count; 2649} 2650 2651 2652 2653 2654/************************************************* 2655* Execute a Regular Expression - DFA engine * 2656*************************************************/ 2657 2658/* This external function applies a compiled re to a subject string using a DFA 2659engine. This function calls the internal function multiple times if the pattern 2660is not anchored. 2661 2662Arguments: 2663 argument_re points to the compiled expression 2664 extra_data points to extra data or is NULL 2665 subject points to the subject string 2666 length length of subject string (may contain binary zeros) 2667 start_offset where to start in the subject string 2668 options option bits 2669 offsets vector of match offsets 2670 offsetcount size of same 2671 workspace workspace vector 2672 wscount size of same 2673 2674Returns: > 0 => number of match offset pairs placed in offsets 2675 = 0 => offsets overflowed; longest matches are present 2676 -1 => failed to match 2677 < -1 => some kind of unexpected problem 2678*/ 2679 2680PCRE_EXP_DEFN int PCRE_CALL_CONVENTION 2681pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, 2682 const char *subject, int length, int start_offset, int options, int *offsets, 2683 int offsetcount, int *workspace, int wscount) 2684{ 2685real_pcre *re = (real_pcre *)argument_re; 2686dfa_match_data match_block; 2687dfa_match_data *md = &match_block; 2688BOOL utf8, anchored, startline, firstline; 2689const uschar *current_subject, *end_subject, *lcc; 2690 2691pcre_study_data internal_study; 2692const pcre_study_data *study = NULL; 2693real_pcre internal_re; 2694 2695const uschar *req_byte_ptr; 2696const uschar *start_bits = NULL; 2697BOOL first_byte_caseless = FALSE; 2698BOOL 
req_byte_caseless = FALSE; 2699int first_byte = -1; 2700int req_byte = -1; 2701int req_byte2 = -1; 2702int newline; 2703 2704/* Plausibility checks */ 2705 2706if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 2707if (re == NULL || subject == NULL || workspace == NULL || 2708 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 2709if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 2710if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; 2711 2712/* We need to find the pointer to any study data before we test for byte 2713flipping, so we scan the extra_data block first. This may set two fields in the 2714match block, so we must initialize them beforehand. However, the other fields 2715in the match block must not be set until after the byte flipping. */ 2716 2717md->tables = re->tables; 2718md->callout_data = NULL; 2719 2720if (extra_data != NULL) 2721 { 2722 unsigned int flags = extra_data->flags; 2723 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 2724 study = (const pcre_study_data *)extra_data->study_data; 2725 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; 2726 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) 2727 return PCRE_ERROR_DFA_UMLIMIT; 2728 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 2729 md->callout_data = extra_data->callout_data; 2730 if ((flags & PCRE_EXTRA_TABLES) != 0) 2731 md->tables = extra_data->tables; 2732 } 2733 2734/* Check that the first field in the block is the magic number. If it is not, 2735test for a regex that was compiled on a host of opposite endianness. If this is 2736the case, flipped values are put in internal_re and internal_study if there was 2737study data too. 
*/ 2738 2739if (re->magic_number != MAGIC_NUMBER) 2740 { 2741 re = _pcre_try_flipped(re, &internal_re, study, &internal_study); 2742 if (re == NULL) return PCRE_ERROR_BADMAGIC; 2743 if (study != NULL) study = &internal_study; 2744 } 2745 2746/* Set some local values */ 2747 2748current_subject = (const unsigned char *)subject + start_offset; 2749end_subject = (const unsigned char *)subject + length; 2750req_byte_ptr = current_subject - 1; 2751 2752#ifdef SUPPORT_UTF8 2753utf8 = (re->options & PCRE_UTF8) != 0; 2754#else 2755utf8 = FALSE; 2756#endif 2757 2758anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || 2759 (re->options & PCRE_ANCHORED) != 0; 2760 2761/* The remaining fixed data for passing around. */ 2762 2763md->start_code = (const uschar *)argument_re + 2764 re->name_table_offset + re->name_count * re->name_entry_size; 2765md->start_subject = (const unsigned char *)subject; 2766md->end_subject = end_subject; 2767md->start_offset = start_offset; 2768md->moptions = options; 2769md->poptions = re->options; 2770 2771/* If the BSR option is not set at match time, copy what was set 2772at compile time. */ 2773 2774if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0) 2775 { 2776 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) 2777 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE); 2778#ifdef BSR_ANYCRLF 2779 else md->moptions |= PCRE_BSR_ANYCRLF; 2780#endif 2781 } 2782 2783/* Handle different types of newline. The three bits give eight cases. If 2784nothing is set at run time, whatever was used at compile time applies. */ 2785 2786switch ((((options & PCRE_NEWLINE_BITS) == 0)? 
re->options : (pcre_uint32)options) & 2787 PCRE_NEWLINE_BITS) 2788 { 2789 case 0: newline = NEWLINE; break; /* Compile-time default */ 2790 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 2791 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 2792 case PCRE_NEWLINE_CR+ 2793 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 2794 case PCRE_NEWLINE_ANY: newline = -1; break; 2795 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 2796 default: return PCRE_ERROR_BADNEWLINE; 2797 } 2798 2799if (newline == -2) 2800 { 2801 md->nltype = NLTYPE_ANYCRLF; 2802 } 2803else if (newline < 0) 2804 { 2805 md->nltype = NLTYPE_ANY; 2806 } 2807else 2808 { 2809 md->nltype = NLTYPE_FIXED; 2810 if (newline > 255) 2811 { 2812 md->nllen = 2; 2813 md->nl[0] = (newline >> 8) & 255; 2814 md->nl[1] = newline & 255; 2815 } 2816 else 2817 { 2818 md->nllen = 1; 2819 md->nl[0] = newline; 2820 } 2821 } 2822 2823/* Check a UTF-8 string if required. Unfortunately there's no way of passing 2824back the character offset. */ 2825 2826#ifdef SUPPORT_UTF8 2827if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) 2828 { 2829 if (_pcre_valid_utf8((uschar *)subject, length) >= 0) 2830 return PCRE_ERROR_BADUTF8; 2831 if (start_offset > 0 && start_offset < length) 2832 { 2833 int tb = ((uschar *)subject)[start_offset]; 2834 if (tb > 127) 2835 { 2836 tb &= 0xc0; 2837 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; 2838 } 2839 } 2840 } 2841#endif 2842 2843/* If the exec call supplied NULL for tables, use the inbuilt ones. This 2844is a feature that makes it possible to save compiled regex and re-use them 2845in other programs later. */ 2846 2847if (md->tables == NULL) md->tables = _pcre_default_tables; 2848 2849/* The lower casing table and the "must be at the start of a line" flag are 2850used in a loop when finding where to start. 
*/ 2851 2852lcc = md->tables + lcc_offset; 2853startline = (re->flags & PCRE_STARTLINE) != 0; 2854firstline = (re->options & PCRE_FIRSTLINE) != 0; 2855 2856/* Set up the first character to match, if available. The first_byte value is 2857never set for an anchored regular expression, but the anchoring may be forced 2858at run time, so we have to test for anchoring. The first char may be unset for 2859an unanchored pattern, of course. If there's no first char and the pattern was 2860studied, there may be a bitmap of possible first characters. */ 2861 2862if (!anchored) 2863 { 2864 if ((re->flags & PCRE_FIRSTSET) != 0) 2865 { 2866 first_byte = re->first_byte & 255; 2867 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) 2868 first_byte = lcc[first_byte]; 2869 } 2870 else 2871 { 2872 if (!startline && study != NULL && 2873 (study->flags & PCRE_STUDY_MAPPED) != 0) 2874 start_bits = study->start_bits; 2875 } 2876 } 2877 2878/* For anchored or unanchored matches, there may be a "last known required 2879character" set. */ 2880 2881if ((re->flags & PCRE_REQCHSET) != 0) 2882 { 2883 req_byte = re->req_byte & 255; 2884 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 2885 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ 2886 } 2887 2888/* Call the main matching function, looping for a non-anchored regex after a 2889failed match. If not restarting, perform certain optimizations at the start of 2890a match. */ 2891 2892for (;;) 2893 { 2894 int rc; 2895 2896 if ((options & PCRE_DFA_RESTART) == 0) 2897 { 2898 const uschar *save_end_subject = end_subject; 2899 2900 /* If firstline is TRUE, the start of the match is constrained to the first 2901 line of a multiline string. Implement this by temporarily adjusting 2902 end_subject so that we stop scanning at a newline. If the match fails at 2903 the newline, later code breaks this loop. 
*/ 2904 2905 if (firstline) 2906 { 2907 USPTR t = current_subject; 2908#ifdef SUPPORT_UTF8 2909 if (utf8) 2910 { 2911 while (t < md->end_subject && !IS_NEWLINE(t)) 2912 { 2913 t++; 2914 while (t < end_subject && (*t & 0xc0) == 0x80) t++; 2915 } 2916 } 2917 else 2918#endif 2919 while (t < md->end_subject && !IS_NEWLINE(t)) t++; 2920 end_subject = t; 2921 } 2922 2923 /* There are some optimizations that avoid running the match if a known 2924 starting point is not found. However, there is an option that disables 2925 these, for testing and for ensuring that all callouts do actually occur. */ 2926 2927 if ((options & PCRE_NO_START_OPTIMIZE) == 0) 2928 { 2929 /* Advance to a known first byte. */ 2930 2931 if (first_byte >= 0) 2932 { 2933 if (first_byte_caseless) 2934 while (current_subject < end_subject && 2935 lcc[*current_subject] != first_byte) 2936 current_subject++; 2937 else 2938 while (current_subject < end_subject && 2939 *current_subject != first_byte) 2940 current_subject++; 2941 } 2942 2943 /* Or to just after a linebreak for a multiline match if possible */ 2944 2945 else if (startline) 2946 { 2947 if (current_subject > md->start_subject + start_offset) 2948 { 2949#ifdef SUPPORT_UTF8 2950 if (utf8) 2951 { 2952 while (current_subject < end_subject && 2953 !WAS_NEWLINE(current_subject)) 2954 { 2955 current_subject++; 2956 while(current_subject < end_subject && 2957 (*current_subject & 0xc0) == 0x80) 2958 current_subject++; 2959 } 2960 } 2961 else 2962#endif 2963 while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) 2964 current_subject++; 2965 2966 /* If we have just passed a CR and the newline option is ANY or 2967 ANYCRLF, and we are now at a LF, advance the match position by one 2968 more character. 
*/ 2969 2970 if (current_subject[-1] == CHAR_CR && 2971 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && 2972 current_subject < end_subject && 2973 *current_subject == CHAR_NL) 2974 current_subject++; 2975 } 2976 } 2977 2978 /* Or to a non-unique first char after study */ 2979 2980 else if (start_bits != NULL) 2981 { 2982 while (current_subject < end_subject) 2983 { 2984 register unsigned int c = *current_subject; 2985 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; 2986 else break; 2987 } 2988 } 2989 } 2990 2991 /* Restore fudged end_subject */ 2992 2993 end_subject = save_end_subject; 2994 2995 /* The following two optimizations are disabled for partial matching or if 2996 disabling is explicitly requested (and of course, by the test above, this 2997 code is not obeyed when restarting after a partial match). */ 2998 2999 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && 3000 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0) 3001 { 3002 /* If the pattern was studied, a minimum subject length may be set. This 3003 is a lower bound; no actual string of that length may actually match the 3004 pattern. Although the value is, strictly, in characters, we treat it as 3005 bytes to avoid spending too much time in this optimization. */ 3006 3007 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && 3008 (pcre_uint32)(end_subject - current_subject) < study->minlength) 3009 return PCRE_ERROR_NOMATCH; 3010 3011 /* If req_byte is set, we know that that character must appear in the 3012 subject for the match to succeed. If the first character is set, req_byte 3013 must be later in the subject; otherwise the test starts at the match 3014 point. This optimization can save a huge amount of work in patterns with 3015 nested unlimited repeats that aren't going to match. Writing separate 3016 code for cased/caseless versions makes it go faster, as does using an 3017 autoincrement and backing off on a match. 
3018 3019 HOWEVER: when the subject string is very, very long, searching to its end 3020 can take a long time, and give bad performance on quite ordinary 3021 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte 3022 string... so we don't do this when the string is sufficiently long. */ 3023 3024 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX) 3025 { 3026 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); 3027 3028 /* We don't need to repeat the search if we haven't yet reached the 3029 place we found it at last time. */ 3030 3031 if (p > req_byte_ptr) 3032 { 3033 if (req_byte_caseless) 3034 { 3035 while (p < end_subject) 3036 { 3037 register int pp = *p++; 3038 if (pp == req_byte || pp == req_byte2) { p--; break; } 3039 } 3040 } 3041 else 3042 { 3043 while (p < end_subject) 3044 { 3045 if (*p++ == req_byte) { p--; break; } 3046 } 3047 } 3048 3049 /* If we can't find the required character, break the matching loop, 3050 which will cause a return or PCRE_ERROR_NOMATCH. */ 3051 3052 if (p >= end_subject) break; 3053 3054 /* If we have found the required character, save the point where we 3055 found it, so that we don't search again next time round the loop if 3056 the start hasn't passed this character yet. 
*/ 3057 3058 req_byte_ptr = p; 3059 } 3060 } 3061 } 3062 } /* End of optimizations that are done when not restarting */ 3063 3064 /* OK, now we can do the business */ 3065 3066 md->start_used_ptr = current_subject; 3067 3068 rc = internal_dfa_exec( 3069 md, /* fixed match data */ 3070 md->start_code, /* this subexpression's code */ 3071 current_subject, /* where we currently are */ 3072 start_offset, /* start offset in subject */ 3073 offsets, /* offset vector */ 3074 offsetcount, /* size of same */ 3075 workspace, /* workspace vector */ 3076 wscount, /* size of same */ 3077 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ 3078 0, /* function recurse level */ 3079 0); /* regex recurse level */ 3080 3081 /* Anything other than "no match" means we are done, always; otherwise, carry 3082 on only if not anchored. */ 3083 3084 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc; 3085 3086 /* Advance to the next subject character unless we are at the end of a line 3087 and firstline is set. */ 3088 3089 if (firstline && IS_NEWLINE(current_subject)) break; 3090 current_subject++; 3091 if (utf8) 3092 { 3093 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) 3094 current_subject++; 3095 } 3096 if (current_subject > end_subject) break; 3097 3098 /* If we have just passed a CR and we are now at a LF, and the pattern does 3099 not contain any explicit matches for \r or \n, and the newline option is CRLF 3100 or ANY or ANYCRLF, advance the match position by one more character. */ 3101 3102 if (current_subject[-1] == CHAR_CR && 3103 current_subject < end_subject && 3104 *current_subject == CHAR_NL && 3105 (re->flags & PCRE_HASCRORLF) == 0 && 3106 (md->nltype == NLTYPE_ANY || 3107 md->nltype == NLTYPE_ANYCRLF || 3108 md->nllen == 2)) 3109 current_subject++; 3110 3111 } /* "Bumpalong" loop */ 3112 3113return PCRE_ERROR_NOMATCH; 3114} 3115 3116/* End of pcre_dfa_exec.c */ 3117