/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language (but see
below for why this module is different).

                       Written by Philip Hazel
           Copyright (c) 1997-2012 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */


/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_exec(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code.
*/ 72 73 74 75#ifdef HAVE_CONFIG_H 76#include "config.h" 77#endif 78 79#define NLBLOCK md /* Block containing newline information */ 80#define PSSTART start_subject /* Field containing processed string start */ 81#define PSEND end_subject /* Field containing processed string end */ 82 83#include "pcre_internal.h" 84 85 86/* For use to indent debugging output */ 87 88#define SP " " 89 90 91/************************************************* 92* Code parameters and static tables * 93*************************************************/ 94 95/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes 96into others, under special conditions. A gap of 20 between the blocks should be 97enough. The resulting opcodes don't have to be less than 256 because they are 98never stored, so we push them well clear of the normal opcodes. */ 99 100#define OP_PROP_EXTRA 300 101#define OP_EXTUNI_EXTRA 320 102#define OP_ANYNL_EXTRA 340 103#define OP_HSPACE_EXTRA 360 104#define OP_VSPACE_EXTRA 380 105 106 107/* This table identifies those opcodes that are followed immediately by a 108character that is to be tested in some way. This makes it possible to 109centralize the loading of these characters. In the case of Type * etc, the 110"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a 111small value. Non-zero values in the table are the offsets from the opcode where 112the character is to be found. ***NOTE*** If the start of this table is 113modified, the three tables that follow must also be modified. 
*/ 114 115static const pcre_uint8 coptable[] = { 116 0, /* End */ 117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ 119 0, 0, 0, /* Any, AllAny, Anybyte */ 120 0, 0, /* \P, \p */ 121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 122 0, /* \X */ 123 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */ 124 1, /* Char */ 125 1, /* Chari */ 126 1, /* not */ 127 1, /* noti */ 128 /* Positive single-char repeats */ 129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */ 131 1+IMM2_SIZE, /* exact */ 132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */ 133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */ 135 1+IMM2_SIZE, /* exact I */ 136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ 137 /* Negative single-char repeats - only for chars < 256 */ 138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */ 140 1+IMM2_SIZE, /* NOT exact */ 141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */ 142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */ 144 1+IMM2_SIZE, /* NOT exact I */ 145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */ 146 /* Positive type repeats */ 147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */ 149 1+IMM2_SIZE, /* Type exact */ 150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */ 151 /* Character class & ref repeats */ 152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? 
*/ 153 0, 0, /* CRRANGE, CRMINRANGE */ 154 0, /* CLASS */ 155 0, /* NCLASS */ 156 0, /* XCLASS - variable length */ 157 0, /* REF */ 158 0, /* REFI */ 159 0, /* RECURSE */ 160 0, /* CALLOUT */ 161 0, /* Alt */ 162 0, /* Ket */ 163 0, /* KetRmax */ 164 0, /* KetRmin */ 165 0, /* KetRpos */ 166 0, /* Reverse */ 167 0, /* Assert */ 168 0, /* Assert not */ 169 0, /* Assert behind */ 170 0, /* Assert behind not */ 171 0, 0, /* ONCE, ONCE_NC */ 172 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 173 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 174 0, 0, /* CREF, NCREF */ 175 0, 0, /* RREF, NRREF */ 176 0, /* DEF */ 177 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 178 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 179 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 180 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 181 0, 0 /* CLOSE, SKIPZERO */ 182}; 183 184/* This table identifies those opcodes that inspect a character. It is used to 185remember the fact that a character could have been inspected when the end of 186the subject is reached. ***NOTE*** If the start of this table is modified, the 187two tables that follow must also be modified. */ 188 189static const pcre_uint8 poptable[] = { 190 0, /* End */ 191 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */ 192 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ 193 1, 1, 1, /* Any, AllAny, Anybyte */ 194 1, 1, /* \P, \p */ 195 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ 196 1, /* \X */ 197 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */ 198 1, /* Char */ 199 1, /* Chari */ 200 1, /* not */ 201 1, /* noti */ 202 /* Positive single-char repeats */ 203 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? 
*/ 204 1, 1, 1, /* upto, minupto, exact */ 205 1, 1, 1, 1, /* *+, ++, ?+, upto+ */ 206 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */ 207 1, 1, 1, /* upto I, minupto I, exact I */ 208 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */ 209 /* Negative single-char repeats - only for chars < 256 */ 210 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */ 211 1, 1, 1, /* NOT upto, minupto, exact */ 212 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */ 213 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */ 214 1, 1, 1, /* NOT upto I, minupto I, exact I */ 215 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */ 216 /* Positive type repeats */ 217 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */ 218 1, 1, 1, /* Type upto, minupto, exact */ 219 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */ 220 /* Character class & ref repeats */ 221 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 222 1, 1, /* CRRANGE, CRMINRANGE */ 223 1, /* CLASS */ 224 1, /* NCLASS */ 225 1, /* XCLASS - variable length */ 226 0, /* REF */ 227 0, /* REFI */ 228 0, /* RECURSE */ 229 0, /* CALLOUT */ 230 0, /* Alt */ 231 0, /* Ket */ 232 0, /* KetRmax */ 233 0, /* KetRmin */ 234 0, /* KetRpos */ 235 0, /* Reverse */ 236 0, /* Assert */ 237 0, /* Assert not */ 238 0, /* Assert behind */ 239 0, /* Assert behind not */ 240 0, 0, /* ONCE, ONCE_NC */ 241 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ 242 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 243 0, 0, /* CREF, NCREF */ 244 0, 0, /* RREF, NRREF */ 245 0, /* DEF */ 246 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 247 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 248 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 249 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ 250 0, 0 /* CLOSE, SKIPZERO */ 251}; 252 253/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, 254and \w */ 255 256static const pcre_uint8 toptable1[] = { 257 0, 0, 0, 0, 0, 0, 258 ctype_digit, ctype_digit, 259 ctype_space, ctype_space, 260 ctype_word, ctype_word, 261 0, 0 /* OP_ANY, 
OP_ALLANY */ 262}; 263 264static const pcre_uint8 toptable2[] = { 265 0, 0, 0, 0, 0, 0, 266 ctype_digit, 0, 267 ctype_space, 0, 268 ctype_word, 0, 269 1, 1 /* OP_ANY, OP_ALLANY */ 270}; 271 272 273/* Structure for holding data about a particular state, which is in effect the 274current data for an active path through the match tree. It must consist 275entirely of ints because the working vector we are passed, and which we put 276these structures in, is a vector of ints. */ 277 278typedef struct stateblock { 279 int offset; /* Offset to opcode */ 280 int count; /* Count for repeats */ 281 int data; /* Some use extra data */ 282} stateblock; 283 284#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) 285 286 287#ifdef PCRE_DEBUG 288/************************************************* 289* Print character string * 290*************************************************/ 291 292/* Character string printing function for debugging. 293 294Arguments: 295 p points to string 296 length number of bytes 297 f where to print 298 299Returns: nothing 300*/ 301 302static void 303pchars(const pcre_uchar *p, int length, FILE *f) 304{ 305int c; 306while (length-- > 0) 307 { 308 if (isprint(c = *(p++))) 309 fprintf(f, "%c", c); 310 else 311 fprintf(f, "\\x%02x", c); 312 } 313} 314#endif 315 316 317 318/************************************************* 319* Execute a Regular Expression - DFA engine * 320*************************************************/ 321 322/* This internal function applies a compiled pattern to a subject string, 323starting at a given point, using a DFA engine. This function is called from the 324external one, possibly multiple times if the pattern is not anchored. The 325function calls itself recursively for some kinds of subpattern. 
326 327Arguments: 328 md the match_data block with fixed information 329 this_start_code the opening bracket of this subexpression's code 330 current_subject where we currently are in the subject string 331 start_offset start offset in the subject string 332 offsets vector to contain the matching string offsets 333 offsetcount size of same 334 workspace vector of workspace 335 wscount size of same 336 rlevel function call recursion level 337 338Returns: > 0 => number of match offset pairs placed in offsets 339 = 0 => offsets overflowed; longest matches are present 340 -1 => failed to match 341 < -1 => some kind of unexpected problem 342 343The following macros are used for adding states to the two state vectors (one 344for the current character, one for the following character). */ 345 346#define ADD_ACTIVE(x,y) \ 347 if (active_count++ < wscount) \ 348 { \ 349 next_active_state->offset = (x); \ 350 next_active_state->count = (y); \ 351 next_active_state++; \ 352 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 353 } \ 354 else return PCRE_ERROR_DFA_WSSIZE 355 356#define ADD_ACTIVE_DATA(x,y,z) \ 357 if (active_count++ < wscount) \ 358 { \ 359 next_active_state->offset = (x); \ 360 next_active_state->count = (y); \ 361 next_active_state->data = (z); \ 362 next_active_state++; \ 363 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ 364 } \ 365 else return PCRE_ERROR_DFA_WSSIZE 366 367#define ADD_NEW(x,y) \ 368 if (new_count++ < wscount) \ 369 { \ 370 next_new_state->offset = (x); \ 371 next_new_state->count = (y); \ 372 next_new_state++; \ 373 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ 374 } \ 375 else return PCRE_ERROR_DFA_WSSIZE 376 377#define ADD_NEW_DATA(x,y,z) \ 378 if (new_count++ < wscount) \ 379 { \ 380 next_new_state->offset = (x); \ 381 next_new_state->count = (y); \ 382 next_new_state->data = (z); \ 383 next_new_state++; \ 384 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, 
\ 385 (x), (y), (z), __LINE__)); \ 386 } \ 387 else return PCRE_ERROR_DFA_WSSIZE 388 389/* And now, here is the code */ 390 391static int 392internal_dfa_exec( 393 dfa_match_data *md, 394 const pcre_uchar *this_start_code, 395 const pcre_uchar *current_subject, 396 int start_offset, 397 int *offsets, 398 int offsetcount, 399 int *workspace, 400 int wscount, 401 int rlevel) 402{ 403stateblock *active_states, *new_states, *temp_states; 404stateblock *next_active_state, *next_new_state; 405 406const pcre_uint8 *ctypes, *lcc, *fcc; 407const pcre_uchar *ptr; 408const pcre_uchar *end_code, *first_op; 409 410dfa_recursion_info new_recursive; 411 412int active_count, new_count, match_count; 413 414/* Some fields in the md block are frequently referenced, so we load them into 415independent variables in the hope that this will perform better. */ 416 417const pcre_uchar *start_subject = md->start_subject; 418const pcre_uchar *end_subject = md->end_subject; 419const pcre_uchar *start_code = md->start_code; 420 421#ifdef SUPPORT_UTF 422BOOL utf = (md->poptions & PCRE_UTF8) != 0; 423#else 424BOOL utf = FALSE; 425#endif 426 427BOOL reset_could_continue = FALSE; 428 429rlevel++; 430offsetcount &= (-2); 431 432wscount -= 2; 433wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / 434 (2 * INTS_PER_STATEBLOCK); 435 436DPRINTF(("\n%.*s---------------------\n" 437 "%.*sCall to internal_dfa_exec f=%d\n", 438 rlevel*2-2, SP, rlevel*2-2, SP, rlevel)); 439 440ctypes = md->tables + ctypes_offset; 441lcc = md->tables + lcc_offset; 442fcc = md->tables + fcc_offset; 443 444match_count = PCRE_ERROR_NOMATCH; /* A negative number */ 445 446active_states = (stateblock *)(workspace + 2); 447next_new_state = new_states = active_states + wscount; 448new_count = 0; 449 450first_op = this_start_code + 1 + LINK_SIZE + 451 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 452 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 453 ? 
IMM2_SIZE:0); 454 455/* The first thing in any (sub) pattern is a bracket of some sort. Push all 456the alternative states onto the list, and find out where the end is. This 457makes is possible to use this function recursively, when we want to stop at a 458matching internal ket rather than at the end. 459 460If the first opcode in the first alternative is OP_REVERSE, we are dealing with 461a backward assertion. In that case, we have to find out the maximum amount to 462move back, and set up each alternative appropriately. */ 463 464if (*first_op == OP_REVERSE) 465 { 466 int max_back = 0; 467 int gone_back; 468 469 end_code = this_start_code; 470 do 471 { 472 int back = GET(end_code, 2+LINK_SIZE); 473 if (back > max_back) max_back = back; 474 end_code += GET(end_code, 1); 475 } 476 while (*end_code == OP_ALT); 477 478 /* If we can't go back the amount required for the longest lookbehind 479 pattern, go back as far as we can; some alternatives may still be viable. */ 480 481#ifdef SUPPORT_UTF 482 /* In character mode we have to step back character by character */ 483 484 if (utf) 485 { 486 for (gone_back = 0; gone_back < max_back; gone_back++) 487 { 488 if (current_subject <= start_subject) break; 489 current_subject--; 490 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--); 491 } 492 } 493 else 494#endif 495 496 /* In byte-mode we can do this quickly. */ 497 498 { 499 gone_back = (current_subject - max_back < start_subject)? 500 (int)(current_subject - start_subject) : max_back; 501 current_subject -= gone_back; 502 } 503 504 /* Save the earliest consulted character */ 505 506 if (current_subject < md->start_used_ptr) 507 md->start_used_ptr = current_subject; 508 509 /* Now we can process the individual branches. 
*/ 510 511 end_code = this_start_code; 512 do 513 { 514 int back = GET(end_code, 2+LINK_SIZE); 515 if (back <= gone_back) 516 { 517 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE); 518 ADD_NEW_DATA(-bstate, 0, gone_back - back); 519 } 520 end_code += GET(end_code, 1); 521 } 522 while (*end_code == OP_ALT); 523 } 524 525/* This is the code for a "normal" subpattern (not a backward assertion). The 526start of a whole pattern is always one of these. If we are at the top level, 527we may be asked to restart matching from the same point that we reached for a 528previous partial match. We still have to scan through the top-level branches to 529find the end state. */ 530 531else 532 { 533 end_code = this_start_code; 534 535 /* Restarting */ 536 537 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) 538 { 539 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); 540 new_count = workspace[1]; 541 if (!workspace[0]) 542 memcpy(new_states, active_states, new_count * sizeof(stateblock)); 543 } 544 545 /* Not restarting */ 546 547 else 548 { 549 int length = 1 + LINK_SIZE + 550 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA || 551 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS) 552 ? IMM2_SIZE:0); 553 do 554 { 555 ADD_NEW((int)(end_code - start_code + length), 0); 556 end_code += GET(end_code, 1); 557 length = 1 + LINK_SIZE; 558 } 559 while (*end_code == OP_ALT); 560 } 561 } 562 563workspace[0] = 0; /* Bit indicating which vector is current */ 564 565DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code))); 566 567/* Loop for scanning the subject */ 568 569ptr = current_subject; 570for (;;) 571 { 572 int i, j; 573 int clen, dlen; 574 unsigned int c, d; 575 int forced_fail = 0; 576 BOOL partial_newline = FALSE; 577 BOOL could_continue = reset_could_continue; 578 reset_could_continue = FALSE; 579 580 /* Make the new state list into the active state list and empty the 581 new state list. 
*/ 582 583 temp_states = active_states; 584 active_states = new_states; 585 new_states = temp_states; 586 active_count = new_count; 587 new_count = 0; 588 589 workspace[0] ^= 1; /* Remember for the restarting feature */ 590 workspace[1] = active_count; 591 592#ifdef PCRE_DEBUG 593 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); 594 pchars(ptr, STRLEN_UC(ptr), stdout); 595 printf("\"\n"); 596 597 printf("%.*sActive states: ", rlevel*2-2, SP); 598 for (i = 0; i < active_count; i++) 599 printf("%d/%d ", active_states[i].offset, active_states[i].count); 600 printf("\n"); 601#endif 602 603 /* Set the pointers for adding new states */ 604 605 next_active_state = active_states + active_count; 606 next_new_state = new_states; 607 608 /* Load the current character from the subject outside the loop, as many 609 different states may want to look at it, and we assume that at least one 610 will. */ 611 612 if (ptr < end_subject) 613 { 614 clen = 1; /* Number of data items in the character */ 615#ifdef SUPPORT_UTF 616 if (utf) { GETCHARLEN(c, ptr, clen); } else 617#endif /* SUPPORT_UTF */ 618 c = *ptr; 619 } 620 else 621 { 622 clen = 0; /* This indicates the end of the subject */ 623 c = NOTACHAR; /* This value should never actually be used */ 624 } 625 626 /* Scan up the active states and act on each one. The result of an action 627 may be to add more states to the currently active list (e.g. on hitting a 628 parenthesis) or it may be to put states on the new list, for considering 629 when we move the character pointer on. 
*/ 630 631 for (i = 0; i < active_count; i++) 632 { 633 stateblock *current_state = active_states + i; 634 BOOL caseless = FALSE; 635 const pcre_uchar *code; 636 int state_offset = current_state->offset; 637 int count, codevalue, rrc; 638 639#ifdef PCRE_DEBUG 640 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); 641 if (clen == 0) printf("EOL\n"); 642 else if (c > 32 && c < 127) printf("'%c'\n", c); 643 else printf("0x%02x\n", c); 644#endif 645 646 /* A negative offset is a special case meaning "hold off going to this 647 (negated) state until the number of characters in the data field have 648 been skipped". If the could_continue flag was passed over from a previous 649 state, arrange for it to passed on. */ 650 651 if (state_offset < 0) 652 { 653 if (current_state->data > 0) 654 { 655 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); 656 ADD_NEW_DATA(state_offset, current_state->count, 657 current_state->data - 1); 658 if (could_continue) reset_could_continue = TRUE; 659 continue; 660 } 661 else 662 { 663 current_state->offset = state_offset = -state_offset; 664 } 665 } 666 667 /* Check for a duplicate state with the same count, and skip if found. 668 See the note at the head of this module about the possibility of improving 669 performance here. */ 670 671 for (j = 0; j < i; j++) 672 { 673 if (active_states[j].offset == state_offset && 674 active_states[j].count == current_state->count) 675 { 676 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); 677 goto NEXT_ACTIVE_STATE; 678 } 679 } 680 681 /* The state offset is the offset to the opcode */ 682 683 code = start_code + state_offset; 684 codevalue = *code; 685 686 /* If this opcode inspects a character, but we are at the end of the 687 subject, remember the fact for use when testing for a partial match. */ 688 689 if (clen == 0 && poptable[codevalue] != 0) 690 could_continue = TRUE; 691 692 /* If this opcode is followed by an inline character, load it. 
It is 693 tempting to test for the presence of a subject character here, but that 694 is wrong, because sometimes zero repetitions of the subject are 695 permitted. 696 697 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an 698 argument that is not a data character - but is always one byte long because 699 the values are small. We have to take special action to deal with \P, \p, 700 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert 701 these ones to new opcodes. */ 702 703 if (coptable[codevalue] > 0) 704 { 705 dlen = 1; 706#ifdef SUPPORT_UTF 707 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else 708#endif /* SUPPORT_UTF */ 709 d = code[coptable[codevalue]]; 710 if (codevalue >= OP_TYPESTAR) 711 { 712 switch(d) 713 { 714 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; 715 case OP_NOTPROP: 716 case OP_PROP: codevalue += OP_PROP_EXTRA; break; 717 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; 718 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; 719 case OP_NOT_HSPACE: 720 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; 721 case OP_NOT_VSPACE: 722 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; 723 default: break; 724 } 725 } 726 } 727 else 728 { 729 dlen = 0; /* Not strictly necessary, but compilers moan */ 730 d = NOTACHAR; /* if these variables are not set. */ 731 } 732 733 734 /* Now process the individual opcodes */ 735 736 switch (codevalue) 737 { 738/* ========================================================================== */ 739 /* These cases are never obeyed. This is a fudge that causes a compile- 740 time error if the vectors coptable or poptable, which are indexed by 741 opcode, are not the correct length. It seems to be the only way to do 742 such a check at compile time, as the sizeof() operator does not work 743 in the C preprocessor. 
*/ 744 745 case OP_TABLE_LENGTH: 746 case OP_TABLE_LENGTH + 747 ((sizeof(coptable) == OP_TABLE_LENGTH) && 748 (sizeof(poptable) == OP_TABLE_LENGTH)): 749 break; 750 751/* ========================================================================== */ 752 /* Reached a closing bracket. If not at the end of the pattern, carry 753 on with the next opcode. For repeating opcodes, also add the repeat 754 state. Note that KETRPOS will always be encountered at the end of the 755 subpattern, because the possessive subpattern repeats are always handled 756 using recursive calls. Thus, it never adds any new states. 757 758 At the end of the (sub)pattern, unless we have an empty string and 759 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the 760 start of the subject, save the match data, shifting up all previous 761 matches so we always have the longest first. */ 762 763 case OP_KET: 764 case OP_KETRMIN: 765 case OP_KETRMAX: 766 case OP_KETRPOS: 767 if (code != end_code) 768 { 769 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); 770 if (codevalue != OP_KET) 771 { 772 ADD_ACTIVE(state_offset - GET(code, 1), 0); 773 } 774 } 775 else 776 { 777 if (ptr > current_subject || 778 ((md->moptions & PCRE_NOTEMPTY) == 0 && 779 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 || 780 current_subject > start_subject + md->start_offset))) 781 { 782 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; 783 else if (match_count > 0 && ++match_count * 2 > offsetcount) 784 match_count = 0; 785 count = ((match_count == 0)? 
offsetcount : match_count * 2) - 2; 786 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); 787 if (offsetcount >= 2) 788 { 789 offsets[0] = (int)(current_subject - start_subject); 790 offsets[1] = (int)(ptr - start_subject); 791 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, 792 offsets[1] - offsets[0], (char *)current_subject)); 793 } 794 if ((md->moptions & PCRE_DFA_SHORTEST) != 0) 795 { 796 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 797 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, 798 match_count, rlevel*2-2, SP)); 799 return match_count; 800 } 801 } 802 } 803 break; 804 805/* ========================================================================== */ 806 /* These opcodes add to the current list of states without looking 807 at the current character. */ 808 809 /*-----------------------------------------------------------------*/ 810 case OP_ALT: 811 do { code += GET(code, 1); } while (*code == OP_ALT); 812 ADD_ACTIVE((int)(code - start_code), 0); 813 break; 814 815 /*-----------------------------------------------------------------*/ 816 case OP_BRA: 817 case OP_SBRA: 818 do 819 { 820 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 821 code += GET(code, 1); 822 } 823 while (*code == OP_ALT); 824 break; 825 826 /*-----------------------------------------------------------------*/ 827 case OP_CBRA: 828 case OP_SCBRA: 829 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0); 830 code += GET(code, 1); 831 while (*code == OP_ALT) 832 { 833 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 834 code += GET(code, 1); 835 } 836 break; 837 838 /*-----------------------------------------------------------------*/ 839 case OP_BRAZERO: 840 case OP_BRAMINZERO: 841 ADD_ACTIVE(state_offset + 1, 0); 842 code += 1 + GET(code, 2); 843 while (*code == OP_ALT) code += GET(code, 1); 844 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 845 break; 846 847 
/*-----------------------------------------------------------------*/ 848 case OP_SKIPZERO: 849 code += 1 + GET(code, 2); 850 while (*code == OP_ALT) code += GET(code, 1); 851 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0); 852 break; 853 854 /*-----------------------------------------------------------------*/ 855 case OP_CIRC: 856 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) 857 { ADD_ACTIVE(state_offset + 1, 0); } 858 break; 859 860 /*-----------------------------------------------------------------*/ 861 case OP_CIRCM: 862 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || 863 (ptr != end_subject && WAS_NEWLINE(ptr))) 864 { ADD_ACTIVE(state_offset + 1, 0); } 865 break; 866 867 /*-----------------------------------------------------------------*/ 868 case OP_EOD: 869 if (ptr >= end_subject) 870 { 871 if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 872 could_continue = TRUE; 873 else { ADD_ACTIVE(state_offset + 1, 0); } 874 } 875 break; 876 877 /*-----------------------------------------------------------------*/ 878 case OP_SOD: 879 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } 880 break; 881 882 /*-----------------------------------------------------------------*/ 883 case OP_SOM: 884 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } 885 break; 886 887 888/* ========================================================================== */ 889 /* These opcodes inspect the next subject character, and sometimes 890 the previous one as well, but do not have an argument. The variable 891 clen contains the length of the current character and is zero if we are 892 at the end of the subject. 
*/ 893 894 /*-----------------------------------------------------------------*/ 895 case OP_ANY: 896 if (clen > 0 && !IS_NEWLINE(ptr)) 897 { 898 if (ptr + 1 >= md->end_subject && 899 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 900 NLBLOCK->nltype == NLTYPE_FIXED && 901 NLBLOCK->nllen == 2 && 902 c == NLBLOCK->nl[0]) 903 { 904 could_continue = partial_newline = TRUE; 905 } 906 else 907 { 908 ADD_NEW(state_offset + 1, 0); 909 } 910 } 911 break; 912 913 /*-----------------------------------------------------------------*/ 914 case OP_ALLANY: 915 if (clen > 0) 916 { ADD_NEW(state_offset + 1, 0); } 917 break; 918 919 /*-----------------------------------------------------------------*/ 920 case OP_EODN: 921 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 922 could_continue = TRUE; 923 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) 924 { ADD_ACTIVE(state_offset + 1, 0); } 925 break; 926 927 /*-----------------------------------------------------------------*/ 928 case OP_DOLL: 929 if ((md->moptions & PCRE_NOTEOL) == 0) 930 { 931 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 932 could_continue = TRUE; 933 else if (clen == 0 || 934 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) && 935 (ptr == end_subject - md->nllen) 936 )) 937 { ADD_ACTIVE(state_offset + 1, 0); } 938 else if (ptr + 1 >= md->end_subject && 939 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && 940 NLBLOCK->nltype == NLTYPE_FIXED && 941 NLBLOCK->nllen == 2 && 942 c == NLBLOCK->nl[0]) 943 { 944 if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 945 { 946 reset_could_continue = TRUE; 947 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 948 } 949 else could_continue = partial_newline = TRUE; 950 } 951 } 952 break; 953 954 /*-----------------------------------------------------------------*/ 955 case OP_DOLLM: 956 if ((md->moptions & PCRE_NOTEOL) == 0) 957 { 958 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0) 959 could_continue = TRUE; 
960 else if (clen == 0 || 961 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr))) 962 { ADD_ACTIVE(state_offset + 1, 0); } 963 else if (ptr + 1 >= md->end_subject && 964 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 && 965 NLBLOCK->nltype == NLTYPE_FIXED && 966 NLBLOCK->nllen == 2 && 967 c == NLBLOCK->nl[0]) 968 { 969 if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 970 { 971 reset_could_continue = TRUE; 972 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 973 } 974 else could_continue = partial_newline = TRUE; 975 } 976 } 977 else if (IS_NEWLINE(ptr)) 978 { ADD_ACTIVE(state_offset + 1, 0); } 979 break; 980 981 /*-----------------------------------------------------------------*/ 982 983 case OP_DIGIT: 984 case OP_WHITESPACE: 985 case OP_WORDCHAR: 986 if (clen > 0 && c < 256 && 987 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) 988 { ADD_NEW(state_offset + 1, 0); } 989 break; 990 991 /*-----------------------------------------------------------------*/ 992 case OP_NOT_DIGIT: 993 case OP_NOT_WHITESPACE: 994 case OP_NOT_WORDCHAR: 995 if (clen > 0 && (c >= 256 || 996 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) 997 { ADD_NEW(state_offset + 1, 0); } 998 break; 999 1000 /*-----------------------------------------------------------------*/ 1001 case OP_WORD_BOUNDARY: 1002 case OP_NOT_WORD_BOUNDARY: 1003 { 1004 int left_word, right_word; 1005 1006 if (ptr > start_subject) 1007 { 1008 const pcre_uchar *temp = ptr - 1; 1009 if (temp < md->start_used_ptr) md->start_used_ptr = temp; 1010#ifdef SUPPORT_UTF 1011 if (utf) { BACKCHAR(temp); } 1012#endif 1013 GETCHARTEST(d, temp); 1014#ifdef SUPPORT_UCP 1015 if ((md->poptions & PCRE_UCP) != 0) 1016 { 1017 if (d == '_') left_word = TRUE; else 1018 { 1019 int cat = UCD_CATEGORY(d); 1020 left_word = (cat == ucp_L || cat == ucp_N); 1021 } 1022 } 1023 else 1024#endif 1025 left_word = d < 256 && (ctypes[d] & ctype_word) != 0; 1026 } 1027 else left_word = FALSE; 1028 1029 if (clen > 
0) 1030 { 1031#ifdef SUPPORT_UCP 1032 if ((md->poptions & PCRE_UCP) != 0) 1033 { 1034 if (c == '_') right_word = TRUE; else 1035 { 1036 int cat = UCD_CATEGORY(c); 1037 right_word = (cat == ucp_L || cat == ucp_N); 1038 } 1039 } 1040 else 1041#endif 1042 right_word = c < 256 && (ctypes[c] & ctype_word) != 0; 1043 } 1044 else right_word = FALSE; 1045 1046 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) 1047 { ADD_ACTIVE(state_offset + 1, 0); } 1048 } 1049 break; 1050 1051 1052 /*-----------------------------------------------------------------*/ 1053 /* Check the next character by Unicode property. We will get here only 1054 if the support is in the binary; otherwise a compile-time error occurs. 1055 */ 1056 1057#ifdef SUPPORT_UCP 1058 case OP_PROP: 1059 case OP_NOTPROP: 1060 if (clen > 0) 1061 { 1062 BOOL OK; 1063 const ucd_record * prop = GET_UCD(c); 1064 switch(code[1]) 1065 { 1066 case PT_ANY: 1067 OK = TRUE; 1068 break; 1069 1070 case PT_LAMP: 1071 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1072 prop->chartype == ucp_Lt; 1073 break; 1074 1075 case PT_GC: 1076 OK = PRIV(ucp_gentype)[prop->chartype] == code[2]; 1077 break; 1078 1079 case PT_PC: 1080 OK = prop->chartype == code[2]; 1081 break; 1082 1083 case PT_SC: 1084 OK = prop->script == code[2]; 1085 break; 1086 1087 /* These are specials for combination cases. 
*/ 1088 1089 case PT_ALNUM: 1090 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1091 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1092 break; 1093 1094 case PT_SPACE: /* Perl space */ 1095 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1096 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1097 break; 1098 1099 case PT_PXSPACE: /* POSIX space */ 1100 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1101 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1102 c == CHAR_FF || c == CHAR_CR; 1103 break; 1104 1105 case PT_WORD: 1106 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1107 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1108 c == CHAR_UNDERSCORE; 1109 break; 1110 1111 /* Should never occur, but keep compilers from grumbling. */ 1112 1113 default: 1114 OK = codevalue != OP_PROP; 1115 break; 1116 } 1117 1118 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } 1119 } 1120 break; 1121#endif 1122 1123 1124 1125/* ========================================================================== */ 1126 /* These opcodes likewise inspect the subject character, but have an 1127 argument that is not a data character. It is one of these opcodes: 1128 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, 1129 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. 
*/ 1130 1131 case OP_TYPEPLUS: 1132 case OP_TYPEMINPLUS: 1133 case OP_TYPEPOSPLUS: 1134 count = current_state->count; /* Already matched */ 1135 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1136 if (clen > 0) 1137 { 1138 if (d == OP_ANY && ptr + 1 >= md->end_subject && 1139 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1140 NLBLOCK->nltype == NLTYPE_FIXED && 1141 NLBLOCK->nllen == 2 && 1142 c == NLBLOCK->nl[0]) 1143 { 1144 could_continue = partial_newline = TRUE; 1145 } 1146 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1147 (c < 256 && 1148 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1149 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1150 { 1151 if (count > 0 && codevalue == OP_TYPEPOSPLUS) 1152 { 1153 active_count--; /* Remove non-match possibility */ 1154 next_active_state--; 1155 } 1156 count++; 1157 ADD_NEW(state_offset, count); 1158 } 1159 } 1160 break; 1161 1162 /*-----------------------------------------------------------------*/ 1163 case OP_TYPEQUERY: 1164 case OP_TYPEMINQUERY: 1165 case OP_TYPEPOSQUERY: 1166 ADD_ACTIVE(state_offset + 2, 0); 1167 if (clen > 0) 1168 { 1169 if (d == OP_ANY && ptr + 1 >= md->end_subject && 1170 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1171 NLBLOCK->nltype == NLTYPE_FIXED && 1172 NLBLOCK->nllen == 2 && 1173 c == NLBLOCK->nl[0]) 1174 { 1175 could_continue = partial_newline = TRUE; 1176 } 1177 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1178 (c < 256 && 1179 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1180 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1181 { 1182 if (codevalue == OP_TYPEPOSQUERY) 1183 { 1184 active_count--; /* Remove non-match possibility */ 1185 next_active_state--; 1186 } 1187 ADD_NEW(state_offset + 2, 0); 1188 } 1189 } 1190 break; 1191 1192 /*-----------------------------------------------------------------*/ 1193 case OP_TYPESTAR: 1194 case OP_TYPEMINSTAR: 1195 case OP_TYPEPOSSTAR: 1196 ADD_ACTIVE(state_offset + 2, 0); 1197 if 
(clen > 0) 1198 { 1199 if (d == OP_ANY && ptr + 1 >= md->end_subject && 1200 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1201 NLBLOCK->nltype == NLTYPE_FIXED && 1202 NLBLOCK->nllen == 2 && 1203 c == NLBLOCK->nl[0]) 1204 { 1205 could_continue = partial_newline = TRUE; 1206 } 1207 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1208 (c < 256 && 1209 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1210 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1211 { 1212 if (codevalue == OP_TYPEPOSSTAR) 1213 { 1214 active_count--; /* Remove non-match possibility */ 1215 next_active_state--; 1216 } 1217 ADD_NEW(state_offset, 0); 1218 } 1219 } 1220 break; 1221 1222 /*-----------------------------------------------------------------*/ 1223 case OP_TYPEEXACT: 1224 count = current_state->count; /* Number already matched */ 1225 if (clen > 0) 1226 { 1227 if (d == OP_ANY && ptr + 1 >= md->end_subject && 1228 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1229 NLBLOCK->nltype == NLTYPE_FIXED && 1230 NLBLOCK->nllen == 2 && 1231 c == NLBLOCK->nl[0]) 1232 { 1233 could_continue = partial_newline = TRUE; 1234 } 1235 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1236 (c < 256 && 1237 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1238 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1239 { 1240 if (++count >= GET2(code, 1)) 1241 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); } 1242 else 1243 { ADD_NEW(state_offset, count); } 1244 } 1245 } 1246 break; 1247 1248 /*-----------------------------------------------------------------*/ 1249 case OP_TYPEUPTO: 1250 case OP_TYPEMINUPTO: 1251 case OP_TYPEPOSUPTO: 1252 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); 1253 count = current_state->count; /* Number already matched */ 1254 if (clen > 0) 1255 { 1256 if (d == OP_ANY && ptr + 1 >= md->end_subject && 1257 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 && 1258 NLBLOCK->nltype == NLTYPE_FIXED && 1259 NLBLOCK->nllen == 2 && 1260 c == NLBLOCK->nl[0]) 1261 
{ 1262 could_continue = partial_newline = TRUE; 1263 } 1264 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || 1265 (c < 256 && 1266 (d != OP_ANY || !IS_NEWLINE(ptr)) && 1267 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) 1268 { 1269 if (codevalue == OP_TYPEPOSUPTO) 1270 { 1271 active_count--; /* Remove non-match possibility */ 1272 next_active_state--; 1273 } 1274 if (++count >= GET2(code, 1)) 1275 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); } 1276 else 1277 { ADD_NEW(state_offset, count); } 1278 } 1279 } 1280 break; 1281 1282/* ========================================================================== */ 1283 /* These are virtual opcodes that are used when something like 1284 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its 1285 argument. It keeps the code above fast for the other cases. The argument 1286 is in the d variable. */ 1287 1288#ifdef SUPPORT_UCP 1289 case OP_PROP_EXTRA + OP_TYPEPLUS: 1290 case OP_PROP_EXTRA + OP_TYPEMINPLUS: 1291 case OP_PROP_EXTRA + OP_TYPEPOSPLUS: 1292 count = current_state->count; /* Already matched */ 1293 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } 1294 if (clen > 0) 1295 { 1296 BOOL OK; 1297 const ucd_record * prop = GET_UCD(c); 1298 switch(code[2]) 1299 { 1300 case PT_ANY: 1301 OK = TRUE; 1302 break; 1303 1304 case PT_LAMP: 1305 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1306 prop->chartype == ucp_Lt; 1307 break; 1308 1309 case PT_GC: 1310 OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1311 break; 1312 1313 case PT_PC: 1314 OK = prop->chartype == code[3]; 1315 break; 1316 1317 case PT_SC: 1318 OK = prop->script == code[3]; 1319 break; 1320 1321 /* These are specials for combination cases. 
*/ 1322 1323 case PT_ALNUM: 1324 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1325 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1326 break; 1327 1328 case PT_SPACE: /* Perl space */ 1329 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1330 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1331 break; 1332 1333 case PT_PXSPACE: /* POSIX space */ 1334 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1335 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1336 c == CHAR_FF || c == CHAR_CR; 1337 break; 1338 1339 case PT_WORD: 1340 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1341 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1342 c == CHAR_UNDERSCORE; 1343 break; 1344 1345 /* Should never occur, but keep compilers from grumbling. */ 1346 1347 default: 1348 OK = codevalue != OP_PROP; 1349 break; 1350 } 1351 1352 if (OK == (d == OP_PROP)) 1353 { 1354 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) 1355 { 1356 active_count--; /* Remove non-match possibility */ 1357 next_active_state--; 1358 } 1359 count++; 1360 ADD_NEW(state_offset, count); 1361 } 1362 } 1363 break; 1364 1365 /*-----------------------------------------------------------------*/ 1366 case OP_EXTUNI_EXTRA + OP_TYPEPLUS: 1367 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: 1368 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: 1369 count = current_state->count; /* Already matched */ 1370 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1371 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 1372 { 1373 const pcre_uchar *nptr = ptr + clen; 1374 int ncount = 0; 1375 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) 1376 { 1377 active_count--; /* Remove non-match possibility */ 1378 next_active_state--; 1379 } 1380 while (nptr < end_subject) 1381 { 1382 int nd; 1383 int ndlen = 1; 1384 GETCHARLEN(nd, nptr, ndlen); 1385 if (UCD_CATEGORY(nd) != ucp_M) break; 1386 ncount++; 1387 nptr += ndlen; 1388 } 1389 count++; 1390 ADD_NEW_DATA(-state_offset, count, ncount); 1391 } 1392 break; 
1393#endif 1394 1395 /*-----------------------------------------------------------------*/ 1396 case OP_ANYNL_EXTRA + OP_TYPEPLUS: 1397 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: 1398 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: 1399 count = current_state->count; /* Already matched */ 1400 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1401 if (clen > 0) 1402 { 1403 int ncount = 0; 1404 switch (c) 1405 { 1406 case 0x000b: 1407 case 0x000c: 1408 case 0x0085: 1409 case 0x2028: 1410 case 0x2029: 1411 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1412 goto ANYNL01; 1413 1414 case 0x000d: 1415 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; 1416 /* Fall through */ 1417 1418 ANYNL01: 1419 case 0x000a: 1420 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) 1421 { 1422 active_count--; /* Remove non-match possibility */ 1423 next_active_state--; 1424 } 1425 count++; 1426 ADD_NEW_DATA(-state_offset, count, ncount); 1427 break; 1428 1429 default: 1430 break; 1431 } 1432 } 1433 break; 1434 1435 /*-----------------------------------------------------------------*/ 1436 case OP_VSPACE_EXTRA + OP_TYPEPLUS: 1437 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: 1438 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: 1439 count = current_state->count; /* Already matched */ 1440 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1441 if (clen > 0) 1442 { 1443 BOOL OK; 1444 switch (c) 1445 { 1446 case 0x000a: 1447 case 0x000b: 1448 case 0x000c: 1449 case 0x000d: 1450 case 0x0085: 1451 case 0x2028: 1452 case 0x2029: 1453 OK = TRUE; 1454 break; 1455 1456 default: 1457 OK = FALSE; 1458 break; 1459 } 1460 1461 if (OK == (d == OP_VSPACE)) 1462 { 1463 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) 1464 { 1465 active_count--; /* Remove non-match possibility */ 1466 next_active_state--; 1467 } 1468 count++; 1469 ADD_NEW_DATA(-state_offset, count, 0); 1470 } 1471 } 1472 break; 1473 1474 /*-----------------------------------------------------------------*/ 1475 case 
OP_HSPACE_EXTRA + OP_TYPEPLUS: 1476 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: 1477 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: 1478 count = current_state->count; /* Already matched */ 1479 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } 1480 if (clen > 0) 1481 { 1482 BOOL OK; 1483 switch (c) 1484 { 1485 case 0x09: /* HT */ 1486 case 0x20: /* SPACE */ 1487 case 0xa0: /* NBSP */ 1488 case 0x1680: /* OGHAM SPACE MARK */ 1489 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1490 case 0x2000: /* EN QUAD */ 1491 case 0x2001: /* EM QUAD */ 1492 case 0x2002: /* EN SPACE */ 1493 case 0x2003: /* EM SPACE */ 1494 case 0x2004: /* THREE-PER-EM SPACE */ 1495 case 0x2005: /* FOUR-PER-EM SPACE */ 1496 case 0x2006: /* SIX-PER-EM SPACE */ 1497 case 0x2007: /* FIGURE SPACE */ 1498 case 0x2008: /* PUNCTUATION SPACE */ 1499 case 0x2009: /* THIN SPACE */ 1500 case 0x200A: /* HAIR SPACE */ 1501 case 0x202f: /* NARROW NO-BREAK SPACE */ 1502 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1503 case 0x3000: /* IDEOGRAPHIC SPACE */ 1504 OK = TRUE; 1505 break; 1506 1507 default: 1508 OK = FALSE; 1509 break; 1510 } 1511 1512 if (OK == (d == OP_HSPACE)) 1513 { 1514 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) 1515 { 1516 active_count--; /* Remove non-match possibility */ 1517 next_active_state--; 1518 } 1519 count++; 1520 ADD_NEW_DATA(-state_offset, count, 0); 1521 } 1522 } 1523 break; 1524 1525 /*-----------------------------------------------------------------*/ 1526#ifdef SUPPORT_UCP 1527 case OP_PROP_EXTRA + OP_TYPEQUERY: 1528 case OP_PROP_EXTRA + OP_TYPEMINQUERY: 1529 case OP_PROP_EXTRA + OP_TYPEPOSQUERY: 1530 count = 4; 1531 goto QS1; 1532 1533 case OP_PROP_EXTRA + OP_TYPESTAR: 1534 case OP_PROP_EXTRA + OP_TYPEMINSTAR: 1535 case OP_PROP_EXTRA + OP_TYPEPOSSTAR: 1536 count = 0; 1537 1538 QS1: 1539 1540 ADD_ACTIVE(state_offset + 4, 0); 1541 if (clen > 0) 1542 { 1543 BOOL OK; 1544 const ucd_record * prop = GET_UCD(c); 1545 switch(code[2]) 1546 { 1547 case PT_ANY: 1548 OK = TRUE; 1549 
break; 1550 1551 case PT_LAMP: 1552 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1553 prop->chartype == ucp_Lt; 1554 break; 1555 1556 case PT_GC: 1557 OK = PRIV(ucp_gentype)[prop->chartype] == code[3]; 1558 break; 1559 1560 case PT_PC: 1561 OK = prop->chartype == code[3]; 1562 break; 1563 1564 case PT_SC: 1565 OK = prop->script == code[3]; 1566 break; 1567 1568 /* These are specials for combination cases. */ 1569 1570 case PT_ALNUM: 1571 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1572 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1573 break; 1574 1575 case PT_SPACE: /* Perl space */ 1576 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1577 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1578 break; 1579 1580 case PT_PXSPACE: /* POSIX space */ 1581 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1582 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1583 c == CHAR_FF || c == CHAR_CR; 1584 break; 1585 1586 case PT_WORD: 1587 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1588 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1589 c == CHAR_UNDERSCORE; 1590 break; 1591 1592 /* Should never occur, but keep compilers from grumbling. 
*/ 1593 1594 default: 1595 OK = codevalue != OP_PROP; 1596 break; 1597 } 1598 1599 if (OK == (d == OP_PROP)) 1600 { 1601 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || 1602 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) 1603 { 1604 active_count--; /* Remove non-match possibility */ 1605 next_active_state--; 1606 } 1607 ADD_NEW(state_offset + count, 0); 1608 } 1609 } 1610 break; 1611 1612 /*-----------------------------------------------------------------*/ 1613 case OP_EXTUNI_EXTRA + OP_TYPEQUERY: 1614 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: 1615 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: 1616 count = 2; 1617 goto QS2; 1618 1619 case OP_EXTUNI_EXTRA + OP_TYPESTAR: 1620 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: 1621 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: 1622 count = 0; 1623 1624 QS2: 1625 1626 ADD_ACTIVE(state_offset + 2, 0); 1627 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 1628 { 1629 const pcre_uchar *nptr = ptr + clen; 1630 int ncount = 0; 1631 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || 1632 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) 1633 { 1634 active_count--; /* Remove non-match possibility */ 1635 next_active_state--; 1636 } 1637 while (nptr < end_subject) 1638 { 1639 int nd; 1640 int ndlen = 1; 1641 GETCHARLEN(nd, nptr, ndlen); 1642 if (UCD_CATEGORY(nd) != ucp_M) break; 1643 ncount++; 1644 nptr += ndlen; 1645 } 1646 ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1647 } 1648 break; 1649#endif 1650 1651 /*-----------------------------------------------------------------*/ 1652 case OP_ANYNL_EXTRA + OP_TYPEQUERY: 1653 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: 1654 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: 1655 count = 2; 1656 goto QS3; 1657 1658 case OP_ANYNL_EXTRA + OP_TYPESTAR: 1659 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: 1660 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: 1661 count = 0; 1662 1663 QS3: 1664 ADD_ACTIVE(state_offset + 2, 0); 1665 if (clen > 0) 1666 { 1667 int ncount = 0; 1668 switch (c) 1669 { 1670 case 0x000b: 1671 case 0x000c: 1672 case 0x0085: 
1673 case 0x2028: 1674 case 0x2029: 1675 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1676 goto ANYNL02; 1677 1678 case 0x000d: 1679 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; 1680 /* Fall through */ 1681 1682 ANYNL02: 1683 case 0x000a: 1684 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || 1685 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) 1686 { 1687 active_count--; /* Remove non-match possibility */ 1688 next_active_state--; 1689 } 1690 ADD_NEW_DATA(-(state_offset + count), 0, ncount); 1691 break; 1692 1693 default: 1694 break; 1695 } 1696 } 1697 break; 1698 1699 /*-----------------------------------------------------------------*/ 1700 case OP_VSPACE_EXTRA + OP_TYPEQUERY: 1701 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: 1702 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: 1703 count = 2; 1704 goto QS4; 1705 1706 case OP_VSPACE_EXTRA + OP_TYPESTAR: 1707 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: 1708 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: 1709 count = 0; 1710 1711 QS4: 1712 ADD_ACTIVE(state_offset + 2, 0); 1713 if (clen > 0) 1714 { 1715 BOOL OK; 1716 switch (c) 1717 { 1718 case 0x000a: 1719 case 0x000b: 1720 case 0x000c: 1721 case 0x000d: 1722 case 0x0085: 1723 case 0x2028: 1724 case 0x2029: 1725 OK = TRUE; 1726 break; 1727 1728 default: 1729 OK = FALSE; 1730 break; 1731 } 1732 if (OK == (d == OP_VSPACE)) 1733 { 1734 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || 1735 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) 1736 { 1737 active_count--; /* Remove non-match possibility */ 1738 next_active_state--; 1739 } 1740 ADD_NEW_DATA(-(state_offset + count), 0, 0); 1741 } 1742 } 1743 break; 1744 1745 /*-----------------------------------------------------------------*/ 1746 case OP_HSPACE_EXTRA + OP_TYPEQUERY: 1747 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: 1748 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: 1749 count = 2; 1750 goto QS5; 1751 1752 case OP_HSPACE_EXTRA + OP_TYPESTAR: 1753 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: 1754 case OP_HSPACE_EXTRA + 
OP_TYPEPOSSTAR: 1755 count = 0; 1756 1757 QS5: 1758 ADD_ACTIVE(state_offset + 2, 0); 1759 if (clen > 0) 1760 { 1761 BOOL OK; 1762 switch (c) 1763 { 1764 case 0x09: /* HT */ 1765 case 0x20: /* SPACE */ 1766 case 0xa0: /* NBSP */ 1767 case 0x1680: /* OGHAM SPACE MARK */ 1768 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 1769 case 0x2000: /* EN QUAD */ 1770 case 0x2001: /* EM QUAD */ 1771 case 0x2002: /* EN SPACE */ 1772 case 0x2003: /* EM SPACE */ 1773 case 0x2004: /* THREE-PER-EM SPACE */ 1774 case 0x2005: /* FOUR-PER-EM SPACE */ 1775 case 0x2006: /* SIX-PER-EM SPACE */ 1776 case 0x2007: /* FIGURE SPACE */ 1777 case 0x2008: /* PUNCTUATION SPACE */ 1778 case 0x2009: /* THIN SPACE */ 1779 case 0x200A: /* HAIR SPACE */ 1780 case 0x202f: /* NARROW NO-BREAK SPACE */ 1781 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 1782 case 0x3000: /* IDEOGRAPHIC SPACE */ 1783 OK = TRUE; 1784 break; 1785 1786 default: 1787 OK = FALSE; 1788 break; 1789 } 1790 1791 if (OK == (d == OP_HSPACE)) 1792 { 1793 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || 1794 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) 1795 { 1796 active_count--; /* Remove non-match possibility */ 1797 next_active_state--; 1798 } 1799 ADD_NEW_DATA(-(state_offset + count), 0, 0); 1800 } 1801 } 1802 break; 1803 1804 /*-----------------------------------------------------------------*/ 1805#ifdef SUPPORT_UCP 1806 case OP_PROP_EXTRA + OP_TYPEEXACT: 1807 case OP_PROP_EXTRA + OP_TYPEUPTO: 1808 case OP_PROP_EXTRA + OP_TYPEMINUPTO: 1809 case OP_PROP_EXTRA + OP_TYPEPOSUPTO: 1810 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) 1811 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); } 1812 count = current_state->count; /* Number already matched */ 1813 if (clen > 0) 1814 { 1815 BOOL OK; 1816 const ucd_record * prop = GET_UCD(c); 1817 switch(code[1 + IMM2_SIZE + 1]) 1818 { 1819 case PT_ANY: 1820 OK = TRUE; 1821 break; 1822 1823 case PT_LAMP: 1824 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || 1825 prop->chartype == 
ucp_Lt; 1826 break; 1827 1828 case PT_GC: 1829 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2]; 1830 break; 1831 1832 case PT_PC: 1833 OK = prop->chartype == code[1 + IMM2_SIZE + 2]; 1834 break; 1835 1836 case PT_SC: 1837 OK = prop->script == code[1 + IMM2_SIZE + 2]; 1838 break; 1839 1840 /* These are specials for combination cases. */ 1841 1842 case PT_ALNUM: 1843 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1844 PRIV(ucp_gentype)[prop->chartype] == ucp_N; 1845 break; 1846 1847 case PT_SPACE: /* Perl space */ 1848 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1849 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR; 1850 break; 1851 1852 case PT_PXSPACE: /* POSIX space */ 1853 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z || 1854 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || 1855 c == CHAR_FF || c == CHAR_CR; 1856 break; 1857 1858 case PT_WORD: 1859 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L || 1860 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 1861 c == CHAR_UNDERSCORE; 1862 break; 1863 1864 /* Should never occur, but keep compilers from grumbling. 
*/ 1865 1866 default: 1867 OK = codevalue != OP_PROP; 1868 break; 1869 } 1870 1871 if (OK == (d == OP_PROP)) 1872 { 1873 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) 1874 { 1875 active_count--; /* Remove non-match possibility */ 1876 next_active_state--; 1877 } 1878 if (++count >= GET2(code, 1)) 1879 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); } 1880 else 1881 { ADD_NEW(state_offset, count); } 1882 } 1883 } 1884 break; 1885 1886 /*-----------------------------------------------------------------*/ 1887 case OP_EXTUNI_EXTRA + OP_TYPEEXACT: 1888 case OP_EXTUNI_EXTRA + OP_TYPEUPTO: 1889 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: 1890 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: 1891 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) 1892 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1893 count = current_state->count; /* Number already matched */ 1894 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 1895 { 1896 const pcre_uchar *nptr = ptr + clen; 1897 int ncount = 0; 1898 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) 1899 { 1900 active_count--; /* Remove non-match possibility */ 1901 next_active_state--; 1902 } 1903 while (nptr < end_subject) 1904 { 1905 int nd; 1906 int ndlen = 1; 1907 GETCHARLEN(nd, nptr, ndlen); 1908 if (UCD_CATEGORY(nd) != ucp_M) break; 1909 ncount++; 1910 nptr += ndlen; 1911 } 1912 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 1913 reset_could_continue = TRUE; 1914 if (++count >= GET2(code, 1)) 1915 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 1916 else 1917 { ADD_NEW_DATA(-state_offset, count, ncount); } 1918 } 1919 break; 1920#endif 1921 1922 /*-----------------------------------------------------------------*/ 1923 case OP_ANYNL_EXTRA + OP_TYPEEXACT: 1924 case OP_ANYNL_EXTRA + OP_TYPEUPTO: 1925 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: 1926 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: 1927 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) 1928 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1929 count = 
current_state->count; /* Number already matched */ 1930 if (clen > 0) 1931 { 1932 int ncount = 0; 1933 switch (c) 1934 { 1935 case 0x000b: 1936 case 0x000c: 1937 case 0x0085: 1938 case 0x2028: 1939 case 0x2029: 1940 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 1941 goto ANYNL03; 1942 1943 case 0x000d: 1944 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; 1945 /* Fall through */ 1946 1947 ANYNL03: 1948 case 0x000a: 1949 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) 1950 { 1951 active_count--; /* Remove non-match possibility */ 1952 next_active_state--; 1953 } 1954 if (++count >= GET2(code, 1)) 1955 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); } 1956 else 1957 { ADD_NEW_DATA(-state_offset, count, ncount); } 1958 break; 1959 1960 default: 1961 break; 1962 } 1963 } 1964 break; 1965 1966 /*-----------------------------------------------------------------*/ 1967 case OP_VSPACE_EXTRA + OP_TYPEEXACT: 1968 case OP_VSPACE_EXTRA + OP_TYPEUPTO: 1969 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: 1970 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: 1971 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) 1972 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 1973 count = current_state->count; /* Number already matched */ 1974 if (clen > 0) 1975 { 1976 BOOL OK; 1977 switch (c) 1978 { 1979 case 0x000a: 1980 case 0x000b: 1981 case 0x000c: 1982 case 0x000d: 1983 case 0x0085: 1984 case 0x2028: 1985 case 0x2029: 1986 OK = TRUE; 1987 break; 1988 1989 default: 1990 OK = FALSE; 1991 } 1992 1993 if (OK == (d == OP_VSPACE)) 1994 { 1995 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) 1996 { 1997 active_count--; /* Remove non-match possibility */ 1998 next_active_state--; 1999 } 2000 if (++count >= GET2(code, 1)) 2001 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2002 else 2003 { ADD_NEW_DATA(-state_offset, count, 0); } 2004 } 2005 } 2006 break; 2007 2008 /*-----------------------------------------------------------------*/ 2009 case OP_HSPACE_EXTRA + OP_TYPEEXACT: 
2010 case OP_HSPACE_EXTRA + OP_TYPEUPTO: 2011 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: 2012 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: 2013 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) 2014 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); } 2015 count = current_state->count; /* Number already matched */ 2016 if (clen > 0) 2017 { 2018 BOOL OK; 2019 switch (c) 2020 { 2021 case 0x09: /* HT */ 2022 case 0x20: /* SPACE */ 2023 case 0xa0: /* NBSP */ 2024 case 0x1680: /* OGHAM SPACE MARK */ 2025 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 2026 case 0x2000: /* EN QUAD */ 2027 case 0x2001: /* EM QUAD */ 2028 case 0x2002: /* EN SPACE */ 2029 case 0x2003: /* EM SPACE */ 2030 case 0x2004: /* THREE-PER-EM SPACE */ 2031 case 0x2005: /* FOUR-PER-EM SPACE */ 2032 case 0x2006: /* SIX-PER-EM SPACE */ 2033 case 0x2007: /* FIGURE SPACE */ 2034 case 0x2008: /* PUNCTUATION SPACE */ 2035 case 0x2009: /* THIN SPACE */ 2036 case 0x200A: /* HAIR SPACE */ 2037 case 0x202f: /* NARROW NO-BREAK SPACE */ 2038 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 2039 case 0x3000: /* IDEOGRAPHIC SPACE */ 2040 OK = TRUE; 2041 break; 2042 2043 default: 2044 OK = FALSE; 2045 break; 2046 } 2047 2048 if (OK == (d == OP_HSPACE)) 2049 { 2050 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) 2051 { 2052 active_count--; /* Remove non-match possibility */ 2053 next_active_state--; 2054 } 2055 if (++count >= GET2(code, 1)) 2056 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); } 2057 else 2058 { ADD_NEW_DATA(-state_offset, count, 0); } 2059 } 2060 } 2061 break; 2062 2063/* ========================================================================== */ 2064 /* These opcodes are followed by a character that is usually compared 2065 to the current subject character; it is loaded into d. We still get 2066 here even if there is no subject character, because in some cases zero 2067 repetitions are permitted. 
*/ 2068 2069 /*-----------------------------------------------------------------*/ 2070 case OP_CHAR: 2071 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } 2072 break; 2073 2074 /*-----------------------------------------------------------------*/ 2075 case OP_CHARI: 2076 if (clen == 0) break; 2077 2078#ifdef SUPPORT_UTF 2079 if (utf) 2080 { 2081 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else 2082 { 2083 unsigned int othercase; 2084 if (c < 128) 2085 othercase = fcc[c]; 2086 else 2087 /* If we have Unicode property support, we can use it to test the 2088 other case of the character. */ 2089#ifdef SUPPORT_UCP 2090 othercase = UCD_OTHERCASE(c); 2091#else 2092 othercase = NOTACHAR; 2093#endif 2094 2095 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } 2096 } 2097 } 2098 else 2099#endif /* SUPPORT_UTF */ 2100 /* Not UTF mode */ 2101 { 2102 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) 2103 { ADD_NEW(state_offset + 2, 0); } 2104 } 2105 break; 2106 2107 2108#ifdef SUPPORT_UCP 2109 /*-----------------------------------------------------------------*/ 2110 /* This is a tricky one because it can match more than one character. 2111 Find out how many characters to skip, and then set up a negative state 2112 to wait for them to pass before continuing. 
*/ 2113 2114 case OP_EXTUNI: 2115 if (clen > 0 && UCD_CATEGORY(c) != ucp_M) 2116 { 2117 const pcre_uchar *nptr = ptr + clen; 2118 int ncount = 0; 2119 while (nptr < end_subject) 2120 { 2121 int nclen = 1; 2122 GETCHARLEN(c, nptr, nclen); 2123 if (UCD_CATEGORY(c) != ucp_M) break; 2124 ncount++; 2125 nptr += nclen; 2126 } 2127 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0) 2128 reset_could_continue = TRUE; 2129 ADD_NEW_DATA(-(state_offset + 1), 0, ncount); 2130 } 2131 break; 2132#endif 2133 2134 /*-----------------------------------------------------------------*/ 2135 /* This is a tricky like EXTUNI because it too can match more than one 2136 character (when CR is followed by LF). In this case, set up a negative 2137 state to wait for one character to pass before continuing. */ 2138 2139 case OP_ANYNL: 2140 if (clen > 0) switch(c) 2141 { 2142 case 0x000b: 2143 case 0x000c: 2144 case 0x0085: 2145 case 0x2028: 2146 case 0x2029: 2147 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; 2148 2149 case 0x000a: 2150 ADD_NEW(state_offset + 1, 0); 2151 break; 2152 2153 case 0x000d: 2154 if (ptr + 1 >= end_subject) 2155 { 2156 ADD_NEW(state_offset + 1, 0); 2157 if ((md->moptions & PCRE_PARTIAL_HARD) != 0) 2158 reset_could_continue = TRUE; 2159 } 2160 else if (ptr[1] == 0x0a) 2161 { 2162 ADD_NEW_DATA(-(state_offset + 1), 0, 1); 2163 } 2164 else 2165 { 2166 ADD_NEW(state_offset + 1, 0); 2167 } 2168 break; 2169 } 2170 break; 2171 2172 /*-----------------------------------------------------------------*/ 2173 case OP_NOT_VSPACE: 2174 if (clen > 0) switch(c) 2175 { 2176 case 0x000a: 2177 case 0x000b: 2178 case 0x000c: 2179 case 0x000d: 2180 case 0x0085: 2181 case 0x2028: 2182 case 0x2029: 2183 break; 2184 2185 default: 2186 ADD_NEW(state_offset + 1, 0); 2187 break; 2188 } 2189 break; 2190 2191 /*-----------------------------------------------------------------*/ 2192 case OP_VSPACE: 2193 if (clen > 0) switch(c) 2194 { 2195 case 0x000a: 2196 case 0x000b: 2197 
case 0x000c: 2198 case 0x000d: 2199 case 0x0085: 2200 case 0x2028: 2201 case 0x2029: 2202 ADD_NEW(state_offset + 1, 0); 2203 break; 2204 2205 default: break; 2206 } 2207 break; 2208 2209 /*-----------------------------------------------------------------*/ 2210 case OP_NOT_HSPACE: 2211 if (clen > 0) switch(c) 2212 { 2213 case 0x09: /* HT */ 2214 case 0x20: /* SPACE */ 2215 case 0xa0: /* NBSP */ 2216 case 0x1680: /* OGHAM SPACE MARK */ 2217 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 2218 case 0x2000: /* EN QUAD */ 2219 case 0x2001: /* EM QUAD */ 2220 case 0x2002: /* EN SPACE */ 2221 case 0x2003: /* EM SPACE */ 2222 case 0x2004: /* THREE-PER-EM SPACE */ 2223 case 0x2005: /* FOUR-PER-EM SPACE */ 2224 case 0x2006: /* SIX-PER-EM SPACE */ 2225 case 0x2007: /* FIGURE SPACE */ 2226 case 0x2008: /* PUNCTUATION SPACE */ 2227 case 0x2009: /* THIN SPACE */ 2228 case 0x200A: /* HAIR SPACE */ 2229 case 0x202f: /* NARROW NO-BREAK SPACE */ 2230 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 2231 case 0x3000: /* IDEOGRAPHIC SPACE */ 2232 break; 2233 2234 default: 2235 ADD_NEW(state_offset + 1, 0); 2236 break; 2237 } 2238 break; 2239 2240 /*-----------------------------------------------------------------*/ 2241 case OP_HSPACE: 2242 if (clen > 0) switch(c) 2243 { 2244 case 0x09: /* HT */ 2245 case 0x20: /* SPACE */ 2246 case 0xa0: /* NBSP */ 2247 case 0x1680: /* OGHAM SPACE MARK */ 2248 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ 2249 case 0x2000: /* EN QUAD */ 2250 case 0x2001: /* EM QUAD */ 2251 case 0x2002: /* EN SPACE */ 2252 case 0x2003: /* EM SPACE */ 2253 case 0x2004: /* THREE-PER-EM SPACE */ 2254 case 0x2005: /* FOUR-PER-EM SPACE */ 2255 case 0x2006: /* SIX-PER-EM SPACE */ 2256 case 0x2007: /* FIGURE SPACE */ 2257 case 0x2008: /* PUNCTUATION SPACE */ 2258 case 0x2009: /* THIN SPACE */ 2259 case 0x200A: /* HAIR SPACE */ 2260 case 0x202f: /* NARROW NO-BREAK SPACE */ 2261 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ 2262 case 0x3000: /* IDEOGRAPHIC SPACE */ 2263 
ADD_NEW(state_offset + 1, 0); 2264 break; 2265 } 2266 break; 2267 2268 /*-----------------------------------------------------------------*/ 2269 /* Match a negated single character casefully. */ 2270 2271 case OP_NOT: 2272 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); } 2273 break; 2274 2275 /*-----------------------------------------------------------------*/ 2276 /* Match a negated single character caselessly. */ 2277 2278 case OP_NOTI: 2279 if (clen > 0) 2280 { 2281 unsigned int otherd; 2282#ifdef SUPPORT_UTF 2283 if (utf && d >= 128) 2284 { 2285#ifdef SUPPORT_UCP 2286 otherd = UCD_OTHERCASE(d); 2287#endif /* SUPPORT_UCP */ 2288 } 2289 else 2290#endif /* SUPPORT_UTF */ 2291 otherd = TABLE_GET(d, fcc, d); 2292 if (c != d && c != otherd) 2293 { ADD_NEW(state_offset + dlen + 1, 0); } 2294 } 2295 break; 2296 2297 /*-----------------------------------------------------------------*/ 2298 case OP_PLUSI: 2299 case OP_MINPLUSI: 2300 case OP_POSPLUSI: 2301 case OP_NOTPLUSI: 2302 case OP_NOTMINPLUSI: 2303 case OP_NOTPOSPLUSI: 2304 caseless = TRUE; 2305 codevalue -= OP_STARI - OP_STAR; 2306 2307 /* Fall through */ 2308 case OP_PLUS: 2309 case OP_MINPLUS: 2310 case OP_POSPLUS: 2311 case OP_NOTPLUS: 2312 case OP_NOTMINPLUS: 2313 case OP_NOTPOSPLUS: 2314 count = current_state->count; /* Already matched */ 2315 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } 2316 if (clen > 0) 2317 { 2318 unsigned int otherd = NOTACHAR; 2319 if (caseless) 2320 { 2321#ifdef SUPPORT_UTF 2322 if (utf && d >= 128) 2323 { 2324#ifdef SUPPORT_UCP 2325 otherd = UCD_OTHERCASE(d); 2326#endif /* SUPPORT_UCP */ 2327 } 2328 else 2329#endif /* SUPPORT_UTF */ 2330 otherd = TABLE_GET(d, fcc, d); 2331 } 2332 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2333 { 2334 if (count > 0 && 2335 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) 2336 { 2337 active_count--; /* Remove non-match possibility */ 2338 next_active_state--; 2339 } 2340 count++; 2341 
ADD_NEW(state_offset, count); 2342 } 2343 } 2344 break; 2345 2346 /*-----------------------------------------------------------------*/ 2347 case OP_QUERYI: 2348 case OP_MINQUERYI: 2349 case OP_POSQUERYI: 2350 case OP_NOTQUERYI: 2351 case OP_NOTMINQUERYI: 2352 case OP_NOTPOSQUERYI: 2353 caseless = TRUE; 2354 codevalue -= OP_STARI - OP_STAR; 2355 /* Fall through */ 2356 case OP_QUERY: 2357 case OP_MINQUERY: 2358 case OP_POSQUERY: 2359 case OP_NOTQUERY: 2360 case OP_NOTMINQUERY: 2361 case OP_NOTPOSQUERY: 2362 ADD_ACTIVE(state_offset + dlen + 1, 0); 2363 if (clen > 0) 2364 { 2365 unsigned int otherd = NOTACHAR; 2366 if (caseless) 2367 { 2368#ifdef SUPPORT_UTF 2369 if (utf && d >= 128) 2370 { 2371#ifdef SUPPORT_UCP 2372 otherd = UCD_OTHERCASE(d); 2373#endif /* SUPPORT_UCP */ 2374 } 2375 else 2376#endif /* SUPPORT_UTF */ 2377 otherd = TABLE_GET(d, fcc, d); 2378 } 2379 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2380 { 2381 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) 2382 { 2383 active_count--; /* Remove non-match possibility */ 2384 next_active_state--; 2385 } 2386 ADD_NEW(state_offset + dlen + 1, 0); 2387 } 2388 } 2389 break; 2390 2391 /*-----------------------------------------------------------------*/ 2392 case OP_STARI: 2393 case OP_MINSTARI: 2394 case OP_POSSTARI: 2395 case OP_NOTSTARI: 2396 case OP_NOTMINSTARI: 2397 case OP_NOTPOSSTARI: 2398 caseless = TRUE; 2399 codevalue -= OP_STARI - OP_STAR; 2400 /* Fall through */ 2401 case OP_STAR: 2402 case OP_MINSTAR: 2403 case OP_POSSTAR: 2404 case OP_NOTSTAR: 2405 case OP_NOTMINSTAR: 2406 case OP_NOTPOSSTAR: 2407 ADD_ACTIVE(state_offset + dlen + 1, 0); 2408 if (clen > 0) 2409 { 2410 unsigned int otherd = NOTACHAR; 2411 if (caseless) 2412 { 2413#ifdef SUPPORT_UTF 2414 if (utf && d >= 128) 2415 { 2416#ifdef SUPPORT_UCP 2417 otherd = UCD_OTHERCASE(d); 2418#endif /* SUPPORT_UCP */ 2419 } 2420 else 2421#endif /* SUPPORT_UTF */ 2422 otherd = TABLE_GET(d, fcc, d); 2423 } 2424 if ((c == d || c == 
otherd) == (codevalue < OP_NOTSTAR)) 2425 { 2426 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) 2427 { 2428 active_count--; /* Remove non-match possibility */ 2429 next_active_state--; 2430 } 2431 ADD_NEW(state_offset, 0); 2432 } 2433 } 2434 break; 2435 2436 /*-----------------------------------------------------------------*/ 2437 case OP_EXACTI: 2438 case OP_NOTEXACTI: 2439 caseless = TRUE; 2440 codevalue -= OP_STARI - OP_STAR; 2441 /* Fall through */ 2442 case OP_EXACT: 2443 case OP_NOTEXACT: 2444 count = current_state->count; /* Number already matched */ 2445 if (clen > 0) 2446 { 2447 unsigned int otherd = NOTACHAR; 2448 if (caseless) 2449 { 2450#ifdef SUPPORT_UTF 2451 if (utf && d >= 128) 2452 { 2453#ifdef SUPPORT_UCP 2454 otherd = UCD_OTHERCASE(d); 2455#endif /* SUPPORT_UCP */ 2456 } 2457 else 2458#endif /* SUPPORT_UTF */ 2459 otherd = TABLE_GET(d, fcc, d); 2460 } 2461 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2462 { 2463 if (++count >= GET2(code, 1)) 2464 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2465 else 2466 { ADD_NEW(state_offset, count); } 2467 } 2468 } 2469 break; 2470 2471 /*-----------------------------------------------------------------*/ 2472 case OP_UPTOI: 2473 case OP_MINUPTOI: 2474 case OP_POSUPTOI: 2475 case OP_NOTUPTOI: 2476 case OP_NOTMINUPTOI: 2477 case OP_NOTPOSUPTOI: 2478 caseless = TRUE; 2479 codevalue -= OP_STARI - OP_STAR; 2480 /* Fall through */ 2481 case OP_UPTO: 2482 case OP_MINUPTO: 2483 case OP_POSUPTO: 2484 case OP_NOTUPTO: 2485 case OP_NOTMINUPTO: 2486 case OP_NOTPOSUPTO: 2487 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0); 2488 count = current_state->count; /* Number already matched */ 2489 if (clen > 0) 2490 { 2491 unsigned int otherd = NOTACHAR; 2492 if (caseless) 2493 { 2494#ifdef SUPPORT_UTF 2495 if (utf && d >= 128) 2496 { 2497#ifdef SUPPORT_UCP 2498 otherd = UCD_OTHERCASE(d); 2499#endif /* SUPPORT_UCP */ 2500 } 2501 else 2502#endif /* SUPPORT_UTF */ 2503 otherd = TABLE_GET(d, fcc, 
d); 2504 } 2505 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) 2506 { 2507 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) 2508 { 2509 active_count--; /* Remove non-match possibility */ 2510 next_active_state--; 2511 } 2512 if (++count >= GET2(code, 1)) 2513 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); } 2514 else 2515 { ADD_NEW(state_offset, count); } 2516 } 2517 } 2518 break; 2519 2520 2521/* ========================================================================== */ 2522 /* These are the class-handling opcodes */ 2523 2524 case OP_CLASS: 2525 case OP_NCLASS: 2526 case OP_XCLASS: 2527 { 2528 BOOL isinclass = FALSE; 2529 int next_state_offset; 2530 const pcre_uchar *ecode; 2531 2532 /* For a simple class, there is always just a 32-byte table, and we 2533 can set isinclass from it. */ 2534 2535 if (codevalue != OP_XCLASS) 2536 { 2537 ecode = code + 1 + (32 / sizeof(pcre_uchar)); 2538 if (clen > 0) 2539 { 2540 isinclass = (c > 255)? (codevalue == OP_NCLASS) : 2541 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0); 2542 } 2543 } 2544 2545 /* An extended class may have a table or a list of single characters, 2546 ranges, or both, and it may be positive or negative. There's a 2547 function that sorts all this out. */ 2548 2549 else 2550 { 2551 ecode = code + GET(code, 1); 2552 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf); 2553 } 2554 2555 /* At this point, isinclass is set for all kinds of class, and ecode 2556 points to the byte after the end of the class. If there is a 2557 quantifier, this is where it will be. 
*/ 2558 2559 next_state_offset = (int)(ecode - start_code); 2560 2561 switch (*ecode) 2562 { 2563 case OP_CRSTAR: 2564 case OP_CRMINSTAR: 2565 ADD_ACTIVE(next_state_offset + 1, 0); 2566 if (isinclass) { ADD_NEW(state_offset, 0); } 2567 break; 2568 2569 case OP_CRPLUS: 2570 case OP_CRMINPLUS: 2571 count = current_state->count; /* Already matched */ 2572 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } 2573 if (isinclass) { count++; ADD_NEW(state_offset, count); } 2574 break; 2575 2576 case OP_CRQUERY: 2577 case OP_CRMINQUERY: 2578 ADD_ACTIVE(next_state_offset + 1, 0); 2579 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } 2580 break; 2581 2582 case OP_CRRANGE: 2583 case OP_CRMINRANGE: 2584 count = current_state->count; /* Already matched */ 2585 if (count >= GET2(ecode, 1)) 2586 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2587 if (isinclass) 2588 { 2589 int max = GET2(ecode, 1 + IMM2_SIZE); 2590 if (++count >= max && max != 0) /* Max 0 => no limit */ 2591 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } 2592 else 2593 { ADD_NEW(state_offset, count); } 2594 } 2595 break; 2596 2597 default: 2598 if (isinclass) { ADD_NEW(next_state_offset, 0); } 2599 break; 2600 } 2601 } 2602 break; 2603 2604/* ========================================================================== */ 2605 /* These are the opcodes for fancy brackets of various kinds. We have 2606 to use recursion in order to handle them. The "always failing" assertion 2607 (?!) is optimised to OP_FAIL when compiling, so we have to support that, 2608 though the other "backtracking verbs" are not supported. 
*/ 2609 2610 case OP_FAIL: 2611 forced_fail++; /* Count FAILs for multiple states */ 2612 break; 2613 2614 case OP_ASSERT: 2615 case OP_ASSERT_NOT: 2616 case OP_ASSERTBACK: 2617 case OP_ASSERTBACK_NOT: 2618 { 2619 int rc; 2620 int local_offsets[2]; 2621 int local_workspace[1000]; 2622 const pcre_uchar *endasscode = code + GET(code, 1); 2623 2624 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2625 2626 rc = internal_dfa_exec( 2627 md, /* static match data */ 2628 code, /* this subexpression's code */ 2629 ptr, /* where we currently are */ 2630 (int)(ptr - start_subject), /* start offset */ 2631 local_offsets, /* offset vector */ 2632 sizeof(local_offsets)/sizeof(int), /* size of same */ 2633 local_workspace, /* workspace vector */ 2634 sizeof(local_workspace)/sizeof(int), /* size of same */ 2635 rlevel); /* function recursion level */ 2636 2637 if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2638 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) 2639 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2640 } 2641 break; 2642 2643 /*-----------------------------------------------------------------*/ 2644 case OP_COND: 2645 case OP_SCOND: 2646 { 2647 int local_offsets[1000]; 2648 int local_workspace[1000]; 2649 int codelink = GET(code, 1); 2650 int condcode; 2651 2652 /* Because of the way auto-callout works during compile, a callout item 2653 is inserted between OP_COND and an assertion condition. This does not 2654 happen for the other conditions. 
*/ 2655 2656 if (code[LINK_SIZE+1] == OP_CALLOUT) 2657 { 2658 rrc = 0; 2659 if (PUBL(callout) != NULL) 2660 { 2661 PUBL(callout_block) cb; 2662 cb.version = 1; /* Version 1 of the callout block */ 2663 cb.callout_number = code[LINK_SIZE+2]; 2664 cb.offset_vector = offsets; 2665#ifdef COMPILE_PCRE8 2666 cb.subject = (PCRE_SPTR)start_subject; 2667#else 2668 cb.subject = (PCRE_SPTR16)start_subject; 2669#endif 2670 cb.subject_length = (int)(end_subject - start_subject); 2671 cb.start_match = (int)(current_subject - start_subject); 2672 cb.current_position = (int)(ptr - start_subject); 2673 cb.pattern_position = GET(code, LINK_SIZE + 3); 2674 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); 2675 cb.capture_top = 1; 2676 cb.capture_last = -1; 2677 cb.callout_data = md->callout_data; 2678 cb.mark = NULL; /* No (*MARK) support */ 2679 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ 2680 } 2681 if (rrc > 0) break; /* Fail this thread */ 2682 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */ 2683 } 2684 2685 condcode = code[LINK_SIZE+1]; 2686 2687 /* Back reference conditions are not supported */ 2688 2689 if (condcode == OP_CREF || condcode == OP_NCREF) 2690 return PCRE_ERROR_DFA_UCOND; 2691 2692 /* The DEFINE condition is always false */ 2693 2694 if (condcode == OP_DEF) 2695 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2696 2697 /* The only supported version of OP_RREF is for the value RREF_ANY, 2698 which means "test if in any recursion". We can't test for specifically 2699 recursed groups. 
*/ 2700 2701 else if (condcode == OP_RREF || condcode == OP_NRREF) 2702 { 2703 int value = GET2(code, LINK_SIZE + 2); 2704 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; 2705 if (md->recursive != NULL) 2706 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } 2707 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2708 } 2709 2710 /* Otherwise, the condition is an assertion */ 2711 2712 else 2713 { 2714 int rc; 2715 const pcre_uchar *asscode = code + LINK_SIZE + 1; 2716 const pcre_uchar *endasscode = asscode + GET(asscode, 1); 2717 2718 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); 2719 2720 rc = internal_dfa_exec( 2721 md, /* fixed match data */ 2722 asscode, /* this subexpression's code */ 2723 ptr, /* where we currently are */ 2724 (int)(ptr - start_subject), /* start offset */ 2725 local_offsets, /* offset vector */ 2726 sizeof(local_offsets)/sizeof(int), /* size of same */ 2727 local_workspace, /* workspace vector */ 2728 sizeof(local_workspace)/sizeof(int), /* size of same */ 2729 rlevel); /* function recursion level */ 2730 2731 if (rc == PCRE_ERROR_DFA_UITEM) return rc; 2732 if ((rc >= 0) == 2733 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) 2734 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); } 2735 else 2736 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } 2737 } 2738 } 2739 break; 2740 2741 /*-----------------------------------------------------------------*/ 2742 case OP_RECURSE: 2743 { 2744 dfa_recursion_info *ri; 2745 int local_offsets[1000]; 2746 int local_workspace[1000]; 2747 const pcre_uchar *callpat = start_code + GET(code, 1); 2748 int recno = (callpat == md->start_code)? 0 : 2749 GET2(callpat, 1 + LINK_SIZE); 2750 int rc; 2751 2752 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP)); 2753 2754 /* Check for repeating a recursion without advancing the subject 2755 pointer. This should catch convoluted mutual recursions. 
(Some simple 2756 cases are caught at compile time.) */ 2757 2758 for (ri = md->recursive; ri != NULL; ri = ri->prevrec) 2759 if (recno == ri->group_num && ptr == ri->subject_position) 2760 return PCRE_ERROR_RECURSELOOP; 2761 2762 /* Remember this recursion and where we started it so as to 2763 catch infinite loops. */ 2764 2765 new_recursive.group_num = recno; 2766 new_recursive.subject_position = ptr; 2767 new_recursive.prevrec = md->recursive; 2768 md->recursive = &new_recursive; 2769 2770 rc = internal_dfa_exec( 2771 md, /* fixed match data */ 2772 callpat, /* this subexpression's code */ 2773 ptr, /* where we currently are */ 2774 (int)(ptr - start_subject), /* start offset */ 2775 local_offsets, /* offset vector */ 2776 sizeof(local_offsets)/sizeof(int), /* size of same */ 2777 local_workspace, /* workspace vector */ 2778 sizeof(local_workspace)/sizeof(int), /* size of same */ 2779 rlevel); /* function recursion level */ 2780 2781 md->recursive = new_recursive.prevrec; /* Done this recursion */ 2782 2783 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP, 2784 rc)); 2785 2786 /* Ran out of internal offsets */ 2787 2788 if (rc == 0) return PCRE_ERROR_DFA_RECURSE; 2789 2790 /* For each successful matched substring, set up the next state with a 2791 count of characters to skip before trying it. Note that the count is in 2792 characters, not bytes. 
*/ 2793 2794 if (rc > 0) 2795 { 2796 for (rc = rc*2 - 2; rc >= 0; rc -= 2) 2797 { 2798 int charcount = local_offsets[rc+1] - local_offsets[rc]; 2799#ifdef SUPPORT_UTF 2800 if (utf) 2801 { 2802 const pcre_uchar *p = start_subject + local_offsets[rc]; 2803 const pcre_uchar *pp = start_subject + local_offsets[rc+1]; 2804 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2805 } 2806#endif 2807 if (charcount > 0) 2808 { 2809 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); 2810 } 2811 else 2812 { 2813 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); 2814 } 2815 } 2816 } 2817 else if (rc != PCRE_ERROR_NOMATCH) return rc; 2818 } 2819 break; 2820 2821 /*-----------------------------------------------------------------*/ 2822 case OP_BRAPOS: 2823 case OP_SBRAPOS: 2824 case OP_CBRAPOS: 2825 case OP_SCBRAPOS: 2826 case OP_BRAPOSZERO: 2827 { 2828 int charcount, matched_count; 2829 const pcre_uchar *local_ptr = ptr; 2830 BOOL allow_zero; 2831 2832 if (codevalue == OP_BRAPOSZERO) 2833 { 2834 allow_zero = TRUE; 2835 codevalue = *(++code); /* Codevalue will be one of above BRAs */ 2836 } 2837 else allow_zero = FALSE; 2838 2839 /* Loop to match the subpattern as many times as possible as if it were 2840 a complete pattern. 
*/ 2841 2842 for (matched_count = 0;; matched_count++) 2843 { 2844 int local_offsets[2]; 2845 int local_workspace[1000]; 2846 2847 int rc = internal_dfa_exec( 2848 md, /* fixed match data */ 2849 code, /* this subexpression's code */ 2850 local_ptr, /* where we currently are */ 2851 (int)(ptr - start_subject), /* start offset */ 2852 local_offsets, /* offset vector */ 2853 sizeof(local_offsets)/sizeof(int), /* size of same */ 2854 local_workspace, /* workspace vector */ 2855 sizeof(local_workspace)/sizeof(int), /* size of same */ 2856 rlevel); /* function recursion level */ 2857 2858 /* Failed to match */ 2859 2860 if (rc < 0) 2861 { 2862 if (rc != PCRE_ERROR_NOMATCH) return rc; 2863 break; 2864 } 2865 2866 /* Matched: break the loop if zero characters matched. */ 2867 2868 charcount = local_offsets[1] - local_offsets[0]; 2869 if (charcount == 0) break; 2870 local_ptr += charcount; /* Advance temporary position ptr */ 2871 } 2872 2873 /* At this point we have matched the subpattern matched_count 2874 times, and local_ptr is pointing to the character after the end of the 2875 last match. */ 2876 2877 if (matched_count > 0 || allow_zero) 2878 { 2879 const pcre_uchar *end_subpattern = code; 2880 int next_state_offset; 2881 2882 do { end_subpattern += GET(end_subpattern, 1); } 2883 while (*end_subpattern == OP_ALT); 2884 next_state_offset = 2885 (int)(end_subpattern - start_code + LINK_SIZE + 1); 2886 2887 /* Optimization: if there are no more active states, and there 2888 are no new states yet set up, then skip over the subject string 2889 right here, to save looping. Otherwise, set up the new state to swing 2890 into action when the end of the matched substring is reached. 
*/ 2891 2892 if (i + 1 >= active_count && new_count == 0) 2893 { 2894 ptr = local_ptr; 2895 clen = 0; 2896 ADD_NEW(next_state_offset, 0); 2897 } 2898 else 2899 { 2900 const pcre_uchar *p = ptr; 2901 const pcre_uchar *pp = local_ptr; 2902 charcount = (int)(pp - p); 2903#ifdef SUPPORT_UTF 2904 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2905#endif 2906 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); 2907 } 2908 } 2909 } 2910 break; 2911 2912 /*-----------------------------------------------------------------*/ 2913 case OP_ONCE: 2914 case OP_ONCE_NC: 2915 { 2916 int local_offsets[2]; 2917 int local_workspace[1000]; 2918 2919 int rc = internal_dfa_exec( 2920 md, /* fixed match data */ 2921 code, /* this subexpression's code */ 2922 ptr, /* where we currently are */ 2923 (int)(ptr - start_subject), /* start offset */ 2924 local_offsets, /* offset vector */ 2925 sizeof(local_offsets)/sizeof(int), /* size of same */ 2926 local_workspace, /* workspace vector */ 2927 sizeof(local_workspace)/sizeof(int), /* size of same */ 2928 rlevel); /* function recursion level */ 2929 2930 if (rc >= 0) 2931 { 2932 const pcre_uchar *end_subpattern = code; 2933 int charcount = local_offsets[1] - local_offsets[0]; 2934 int next_state_offset, repeat_state_offset; 2935 2936 do { end_subpattern += GET(end_subpattern, 1); } 2937 while (*end_subpattern == OP_ALT); 2938 next_state_offset = 2939 (int)(end_subpattern - start_code + LINK_SIZE + 1); 2940 2941 /* If the end of this subpattern is KETRMAX or KETRMIN, we must 2942 arrange for the repeat state also to be added to the relevant list. 2943 Calculate the offset, or set -1 for no repeat. */ 2944 2945 repeat_state_offset = (*end_subpattern == OP_KETRMAX || 2946 *end_subpattern == OP_KETRMIN)? 2947 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1; 2948 2949 /* If we have matched an empty string, add the next state at the 2950 current character pointer. 
This is important so that the duplicate 2951 checking kicks in, which is what breaks infinite loops that match an 2952 empty string. */ 2953 2954 if (charcount == 0) 2955 { 2956 ADD_ACTIVE(next_state_offset, 0); 2957 } 2958 2959 /* Optimization: if there are no more active states, and there 2960 are no new states yet set up, then skip over the subject string 2961 right here, to save looping. Otherwise, set up the new state to swing 2962 into action when the end of the matched substring is reached. */ 2963 2964 else if (i + 1 >= active_count && new_count == 0) 2965 { 2966 ptr += charcount; 2967 clen = 0; 2968 ADD_NEW(next_state_offset, 0); 2969 2970 /* If we are adding a repeat state at the new character position, 2971 we must fudge things so that it is the only current state. 2972 Otherwise, it might be a duplicate of one we processed before, and 2973 that would cause it to be skipped. */ 2974 2975 if (repeat_state_offset >= 0) 2976 { 2977 next_active_state = active_states; 2978 active_count = 0; 2979 i = -1; 2980 ADD_ACTIVE(repeat_state_offset, 0); 2981 } 2982 } 2983 else 2984 { 2985#ifdef SUPPORT_UTF 2986 if (utf) 2987 { 2988 const pcre_uchar *p = start_subject + local_offsets[0]; 2989 const pcre_uchar *pp = start_subject + local_offsets[1]; 2990 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--; 2991 } 2992#endif 2993 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); 2994 if (repeat_state_offset >= 0) 2995 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } 2996 } 2997 } 2998 else if (rc != PCRE_ERROR_NOMATCH) return rc; 2999 } 3000 break; 3001 3002 3003/* ========================================================================== */ 3004 /* Handle callouts */ 3005 3006 case OP_CALLOUT: 3007 rrc = 0; 3008 if (PUBL(callout) != NULL) 3009 { 3010 PUBL(callout_block) cb; 3011 cb.version = 1; /* Version 1 of the callout block */ 3012 cb.callout_number = code[1]; 3013 cb.offset_vector = offsets; 3014#ifdef COMPILE_PCRE8 3015 cb.subject = 
(PCRE_SPTR)start_subject; 3016#else 3017 cb.subject = (PCRE_SPTR16)start_subject; 3018#endif 3019 cb.subject_length = (int)(end_subject - start_subject); 3020 cb.start_match = (int)(current_subject - start_subject); 3021 cb.current_position = (int)(ptr - start_subject); 3022 cb.pattern_position = GET(code, 2); 3023 cb.next_item_length = GET(code, 2 + LINK_SIZE); 3024 cb.capture_top = 1; 3025 cb.capture_last = -1; 3026 cb.callout_data = md->callout_data; 3027 cb.mark = NULL; /* No (*MARK) support */ 3028 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */ 3029 } 3030 if (rrc == 0) 3031 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); } 3032 break; 3033 3034 3035/* ========================================================================== */ 3036 default: /* Unsupported opcode */ 3037 return PCRE_ERROR_DFA_UITEM; 3038 } 3039 3040 NEXT_ACTIVE_STATE: continue; 3041 3042 } /* End of loop scanning active states */ 3043 3044 /* We have finished the processing at the current subject character. If no 3045 new states have been set for the next character, we have found all the 3046 matches that we are going to find. If we are at the top level and partial 3047 matching has been requested, check for appropriate conditions. 3048 3049 The "forced_ fail" variable counts the number of (*F) encountered for the 3050 character. If it is equal to the original active_count (saved in 3051 workspace[1]) it means that (*F) was found on every active state. In this 3052 case we don't want to give a partial match. 3053 3054 The "could_continue" variable is true if a state could have continued but 3055 for the fact that the end of the subject was reached. */ 3056 3057 if (new_count <= 0) 3058 { 3059 if (rlevel == 1 && /* Top level, and */ 3060 could_continue && /* Some could go on, and */ 3061 forced_fail != workspace[1] && /* Not all forced fail & */ 3062 ( /* either... */ 3063 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */ 3064 || /* or... 
*/ 3065 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */ 3066 match_count < 0) /* no matches */ 3067 ) && /* And... */ 3068 ( 3069 partial_newline || /* Either partial NL */ 3070 ( /* or ... */ 3071 ptr >= end_subject && /* End of subject and */ 3072 ptr > md->start_used_ptr) /* Inspected non-empty string */ 3073 ) 3074 ) 3075 { 3076 if (offsetcount >= 2) 3077 { 3078 offsets[0] = (int)(md->start_used_ptr - start_subject); 3079 offsets[1] = (int)(end_subject - start_subject); 3080 } 3081 match_count = PCRE_ERROR_PARTIAL; 3082 } 3083 3084 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" 3085 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, 3086 rlevel*2-2, SP)); 3087 break; /* In effect, "return", but see the comment below */ 3088 } 3089 3090 /* One or more states are active for the next character. */ 3091 3092 ptr += clen; /* Advance to next subject character */ 3093 } /* Loop to move along the subject string */ 3094 3095/* Control gets here from "break" a few lines above. We do it this way because 3096if we use "return" above, we have compiler trouble. Some compilers warn if 3097there's nothing here because they think the function doesn't return a value. On 3098the other hand, if we put a dummy statement here, some more clever compilers 3099complain that it can't be reached. Sigh. */ 3100 3101return match_count; 3102} 3103 3104 3105 3106 3107/************************************************* 3108* Execute a Regular Expression - DFA engine * 3109*************************************************/ 3110 3111/* This external function applies a compiled re to a subject string using a DFA 3112engine. This function calls the internal function multiple times if the pattern 3113is not anchored. 
Arguments:
  argument_re     points to the compiled expression
  extra_data      points to extra data or is NULL
  subject         points to the subject string
  length          length of subject string (may contain binary zeros)
  start_offset    where to start in the subject string
  options         option bits
  offsets         vector of match offsets
  offsetcount     size of same
  workspace       workspace vector
  wscount         size of same

Returns:          > 0 => number of match offset pairs placed in offsets
                  = 0 => offsets overflowed; longest matches are present
                   -1 => failed to match
                 < -1 => some kind of unexpected problem

The "unexpected problem" values generated directly by this function (as
opposed to those passed back from internal_dfa_exec()) are:

  PCRE_ERROR_BADOPTION       an option bit outside PUBLIC_DFA_EXEC_OPTIONS
  PCRE_ERROR_NULL            argument_re, subject or workspace is NULL, or
                               offsets is NULL while offsetcount > 0
  PCRE_ERROR_BADCOUNT        offsetcount is negative
  PCRE_ERROR_DFA_WSSIZE      wscount is less than 20
  PCRE_ERROR_BADOFFSET       start_offset is negative or greater than length
  PCRE_ERROR_BADMAGIC        the magic number in the compiled data is wrong
  PCRE_ERROR_BADENDIANNESS   the magic number is byte-reversed
  PCRE_ERROR_BADMODE         the PCRE_MODE flag is not set in the pattern
  PCRE_ERROR_DFA_BADRESTART  PCRE_DFA_RESTART with an implausible workspace
  PCRE_ERROR_DFA_UMLIMIT     extra_data specifies a match or recursion limit
  PCRE_ERROR_BADNEWLINE      unrecognized combination of newline option bits
  PCRE_ERROR_SHORTUTF8,
  PCRE_ERROR_BADUTF8,
  PCRE_ERROR_BADUTF8_OFFSET  the UTF validity check failed (SUPPORT_UTF only)
*/

#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
  const char *subject, int length, int start_offset, int options, int *offsets,
  int offsetcount, int *workspace, int wscount)
#else
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
  PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
  int offsetcount, int *workspace, int wscount)
#endif
{
REAL_PCRE *re = (REAL_PCRE *)argument_re;
dfa_match_data match_block;
dfa_match_data *md = &match_block;
BOOL utf, anchored, startline, firstline;
const pcre_uchar *current_subject, *end_subject;
const pcre_study_data *study = NULL;

/* Data for the start-of-match optimizations; the defaults mean "not
available" and are overridden further down from the compiled pattern and the
study data. */

const pcre_uchar *req_char_ptr;
const pcre_uint8 *start_bits = NULL;
BOOL has_first_char = FALSE;
BOOL has_req_char = FALSE;
pcre_uchar first_char = 0;
pcre_uchar first_char2 = 0;
pcre_uchar req_char = 0;
pcre_uchar req_char2 = 0;
int newline;

/* Plausibility checks. Note that there is no separate test for a negative
length: by the time the last test is reached start_offset is known to be
non-negative, so a negative length yields PCRE_ERROR_BADOFFSET. */

if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL || workspace == NULL ||
   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;

/* Check that the first field in the block is the magic number. If it is not,
return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
means that the pattern is likely compiled with different endianness. */

if (re->magic_number != MAGIC_NUMBER)
  return re->magic_number == REVERSED_MAGIC_NUMBER?
    PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;

/* If restarting after a partial match, do some sanity checks on the contents
of the workspace: no bit other than the least significant one may be set in
workspace[0], and workspace[1] must lie between 1 and the number of state
blocks that fit into the workspace after its first two slots. */

if ((options & PCRE_DFA_RESTART) != 0)
  {
  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
    workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
      return PCRE_ERROR_DFA_BADRESTART;
  }

/* Set up study, callout, and table data. Values in extra_data are used only
when the corresponding flag bit is set. Match limits and recursion limits are
rejected outright with PCRE_ERROR_DFA_UMLIMIT. */

md->tables = re->tables;
md->callout_data = NULL;

if (extra_data != NULL)
  {
  unsigned int flags = extra_data->flags;
  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
    study = (const pcre_study_data *)extra_data->study_data;
  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
    return PCRE_ERROR_DFA_UMLIMIT;
  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
    md->callout_data = extra_data->callout_data;
  if ((flags & PCRE_EXTRA_TABLES) != 0)
    md->tables = extra_data->tables;
  }

/* Set some local values. req_char_ptr starts one unit before the first
starting position so that the "p > req_char_ptr" test in the required
character optimization below succeeds the first time it is reached. */

current_subject = (const pcre_uchar *)subject + start_offset;
end_subject = (const pcre_uchar *)subject + length;
req_char_ptr = current_subject - 1;

#ifdef SUPPORT_UTF
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
utf = (re->options & PCRE_UTF8) != 0;
#else
utf = FALSE;
#endif

/* The match is treated as anchored if anchoring is requested at match time or
at compile time, and also whenever PCRE_DFA_RESTART is set. */

anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
  (re->options & PCRE_ANCHORED) != 0;

/* The remaining fixed data for passing around. The compiled code is located
after the name table, which has name_count entries of name_entry_size each. */

md->start_code = (const pcre_uchar *)argument_re +
    re->name_table_offset + re->name_count * re->name_entry_size;
md->start_subject = (const pcre_uchar *)subject;
md->end_subject = end_subject;
md->start_offset = start_offset;
md->moptions = options;
md->poptions = re->options;

/* If the BSR option is not set at match time, copy what was set
at compile time. If it was not set then either, and BSR_ANYCRLF is defined at
build time, default to PCRE_BSR_ANYCRLF. */

if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
  {
  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
    md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
#ifdef BSR_ANYCRLF
  else md->moptions |= PCRE_BSR_ANYCRLF;
#endif
  }

/* Handle different types of newline. The three bits give eight cases. If
nothing is set at run time, whatever was used at compile time applies. The
local variable "newline" is set to a single character value, or to a two
character sequence packed as (first << 8) | second, or to -1 for "any" or -2
for "anycrlf". */

switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
         PCRE_NEWLINE_BITS)
  {
  case 0: newline = NEWLINE; break;   /* Compile-time default */
  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
  case PCRE_NEWLINE_CR+
       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
  case PCRE_NEWLINE_ANY: newline = -1; break;
  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
  default: return PCRE_ERROR_BADNEWLINE;
  }

/* Convert the local value into the form held in the match data. Note that
md->nllen and md->nl[] are set here only for the fixed newline type. */

if (newline == -2)
  {
  md->nltype = NLTYPE_ANYCRLF;
  }
else if (newline < 0)
  {
  md->nltype = NLTYPE_ANY;
  }
else
  {
  md->nltype = NLTYPE_FIXED;
  if (newline > 255)
    {
    md->nllen = 2;
    md->nl[0] = (newline >> 8) & 255;
    md->nl[1] = newline & 255;
    }
  else
    {
    md->nllen = 1;
    md->nl[0] = newline;
    }
  }

/* Check a UTF string if required. When the check fails and there is room for
at least two values in the offsets vector, the error offset is passed back in
offsets[0] and the detailed reason code in offsets[1]. A reason code no
greater than PCRE_UTF8_ERR5 is reported as PCRE_ERROR_SHORTUTF8 instead of
PCRE_ERROR_BADUTF8 when PCRE_PARTIAL_HARD is set. A starting offset that lies
strictly inside the subject must not point into the middle of a character. */

#ifdef SUPPORT_UTF
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
  {
  int erroroffset;
  int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
  if (errorcode != 0)
    {
    if (offsetcount >= 2)
      {
      offsets[0] = erroroffset;
      offsets[1] = errorcode;
      }
    return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
      PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
    }
  if (start_offset > 0 && start_offset < length &&
        NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
    return PCRE_ERROR_BADUTF8_OFFSET;
  }
#endif

/* If the exec call supplied NULL for tables, use the inbuilt ones. This
is a feature that makes it possible to save compiled regex and re-use them
in other programs later. This must be done before the tables are consulted for
the other case of a caseless first or required character below. */

if (md->tables == NULL) md->tables = PRIV(default_tables);

/* The "must be at the start of a line" flags are used in a loop when finding
where to start. */

startline = (re->flags & PCRE_STARTLINE) != 0;
firstline = (re->options & PCRE_FIRSTLINE) != 0;

/* Set up the first character to match, if available. The first_char value is
never set for an anchored regular expression, but the anchoring may be forced
at run time, so we have to test for anchoring. The first char may be unset for
an unanchored pattern, of course. If there's no first char and the pattern was
studied, there may be a bitmap of possible first characters. For a caseless
first character, first_char2 holds the other case; otherwise it is the same as
first_char. */

if (!anchored)
  {
  if ((re->flags & PCRE_FIRSTSET) != 0)
    {
    has_first_char = TRUE;
    first_char = first_char2 = (pcre_uchar)(re->first_char);
    if ((re->flags & PCRE_FCH_CASELESS) != 0)
      {
      first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
      if (utf && first_char > 127)
        first_char2 = UCD_OTHERCASE(first_char);
#endif
      }
    }
  else
    {
    if (!startline && study != NULL &&
         (study->flags & PCRE_STUDY_MAPPED) != 0)
      start_bits = study->start_bits;
    }
  }

/* For anchored or unanchored matches, there may be a "last known required
character" set. As for the first character, req_char2 holds the other case
when the required character is caseless. */

if ((re->flags & PCRE_REQCHSET) != 0)
  {
  has_req_char = TRUE;
  req_char = req_char2 = (pcre_uchar)(re->req_char);
  if ((re->flags & PCRE_RCH_CASELESS) != 0)
    {
    req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
    if (utf && req_char > 127)
      req_char2 = UCD_OTHERCASE(req_char);
#endif
    }
  }

/* Call the main matching function, looping for a non-anchored regex after a
failed match. If not restarting, perform certain optimizations at the start of
a match. */

for (;;)
  {
  int rc;

  if ((options & PCRE_DFA_RESTART) == 0)
    {
    const pcre_uchar *save_end_subject = end_subject;

    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. Implement this by temporarily adjusting
    end_subject so that we stop scanning at a newline. If the match fails at
    the newline, later code breaks this loop. */

    if (firstline)
      {
      PCRE_PUCHAR t = current_subject;
#ifdef SUPPORT_UTF
      if (utf)
        {
        while (t < md->end_subject && !IS_NEWLINE(t))
          {
          t++;
          ACROSSCHAR(t < end_subject, *t, t++);
          }
        }
      else
#endif
      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
      end_subject = t;
      }

    /* There are some optimizations that avoid running the match if a known
    starting point is not found. However, there is an option that disables
    these, for testing and for ensuring that all callouts do actually occur.
    The option can be set in the regex by (*NO_START_OPT) or passed in
    match-time options. */

    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
      {
      /* Advance to a known first char. */

      if (has_first_char)
        {
        if (first_char != first_char2)
          while (current_subject < end_subject &&
              *current_subject != first_char && *current_subject != first_char2)
            current_subject++;
        else
          while (current_subject < end_subject &&
                 *current_subject != first_char)
            current_subject++;
        }

      /* Or to just after a linebreak for a multiline match if possible. This
      is done only once the start point has moved beyond the original starting
      offset, which also guarantees that current_subject[-1] below refers to a
      position at or after that offset. */

      else if (startline)
        {
        if (current_subject > md->start_subject + start_offset)
          {
#ifdef SUPPORT_UTF
          if (utf)
            {
            while (current_subject < end_subject &&
                   !WAS_NEWLINE(current_subject))
              {
              current_subject++;
              ACROSSCHAR(current_subject < end_subject, *current_subject,
                current_subject++);
              }
            }
          else
#endif
          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
            current_subject++;

          /* If we have just passed a CR and the newline option is ANY or
          ANYCRLF, and we are now at a LF, advance the match position by one
          more character. */

          if (current_subject[-1] == CHAR_CR &&
               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
               current_subject < end_subject &&
               *current_subject == CHAR_NL)
            current_subject++;
          }
        }

      /* Or to a non-unique first char after study. The bitmap is indexed by
      character value, one bit per value; when not compiling the 8-bit library,
      every value above 255 is looked up as 255. */

      else if (start_bits != NULL)
        {
        while (current_subject < end_subject)
          {
          register unsigned int c = *current_subject;
#ifndef COMPILE_PCRE8
          if (c > 255) c = 255;
#endif
          if ((start_bits[c/8] & (1 << (c&7))) == 0)
            {
            current_subject++;
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
            /* In non 8-bit mode, the iteration will stop for
            characters > 255 at the beginning or not stop at all. */
            if (utf)
              ACROSSCHAR(current_subject < end_subject, *current_subject,
                current_subject++);
#endif
            }
          else break;
          }
        }
      }

    /* Restore fudged end_subject */

    end_subject = save_end_subject;

    /* The following two optimizations are disabled for partial matching or if
    disabling is explicitly requested (and of course, by the test above, this
    code is not obeyed when restarting after a partial match). */

    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
      {
      /* If the pattern was studied, a minimum subject length may be set. This
      is a lower bound; no actual string of that length may actually match the
      pattern. Although the value is, strictly, in characters, we treat it as
      bytes to avoid spending too much time in this optimization. */

      if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
          (pcre_uint32)(end_subject - current_subject) < study->minlength)
        return PCRE_ERROR_NOMATCH;

      /* If req_char is set, we know that that character must appear in the
      subject for the match to succeed. If the first character is set, req_char
      must be later in the subject; otherwise the test starts at the match
      point. This optimization can save a huge amount of work in patterns with
      nested unlimited repeats that aren't going to match. Writing separate
      code for cased/caseless versions makes it go faster, as does using an
      autoincrement and backing off on a match.

      HOWEVER: when the subject string is very, very long, searching to its end
      can take a long time, and give bad performance on quite ordinary
      patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
      string... so we don't do this when the string is sufficiently long. */

      if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
        {
        register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);

        /* We don't need to repeat the search if we haven't yet reached the
        place we found it at last time. */

        if (p > req_char_ptr)
          {
          if (req_char != req_char2)
            {
            while (p < end_subject)
              {
              register int pp = *p++;
              if (pp == req_char || pp == req_char2) { p--; break; }
              }
            }
          else
            {
            while (p < end_subject)
              {
              if (*p++ == req_char) { p--; break; }
              }
            }

          /* If we can't find the required character, break the matching loop,
          which will cause a return of PCRE_ERROR_NOMATCH. */

          if (p >= end_subject) break;

          /* If we have found the required character, save the point where we
          found it, so that we don't search again next time round the loop if
          the start hasn't passed this character yet. */

          req_char_ptr = p;
          }
        }
      }
    }   /* End of optimizations that are done when not restarting */

  /* OK, now we can do the business */

  md->start_used_ptr = current_subject;
  md->recursive = NULL;

  rc = internal_dfa_exec(
    md,                                /* fixed match data */
    md->start_code,                    /* this subexpression's code */
    current_subject,                   /* where we currently are */
    start_offset,                      /* start offset in subject */
    offsets,                           /* offset vector */
    offsetcount,                       /* size of same */
    workspace,                         /* workspace vector */
    wscount,                           /* size of same */
    0);                                /* function recurse level */

  /* Anything other than "no match" means we are done, always; otherwise, carry
  on only if not anchored. */

  if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;

  /* Advance to the next subject character unless we are at the end of a line
  and firstline is set. In UTF mode the advance also skips the remaining code
  units of the character. The loop ends once the start point has moved beyond
  the end of the subject; a start point exactly at the end is still tried. */

  if (firstline && IS_NEWLINE(current_subject)) break;
  current_subject++;
#ifdef SUPPORT_UTF
  if (utf)
    {
    ACROSSCHAR(current_subject < end_subject, *current_subject,
      current_subject++);
    }
#endif
  if (current_subject > end_subject) break;

  /* If we have just passed a CR and we are now at a LF, and the pattern does
  not contain any explicit matches for \r or \n, and the newline option is CRLF
  or ANY or ANYCRLF, advance the match position by one more character. Note
  that md->nllen is inspected only when the newline type is neither ANY nor
  ANYCRLF, that is, in the fixed case where it has been set. */

  if (current_subject[-1] == CHAR_CR &&
      current_subject < end_subject &&
      *current_subject == CHAR_NL &&
      (re->flags & PCRE_HASCRORLF) == 0 &&
        (md->nltype == NLTYPE_ANY ||
         md->nltype == NLTYPE_ANYCRLF ||
         md->nllen == 2))
    current_subject++;

  }   /* "Bumpalong" loop */

return PCRE_ERROR_NOMATCH;
}

/* End of pcre_dfa_exec.c */