1/* regexec.c 2 */ 3 4/* 5 * "One Ring to rule them all, One Ring to find them..." 6 */ 7 8/* NOTE: this is derived from Henry Spencer's regexp code, and should not 9 * confused with the original package (see point 3 below). Thanks, Henry! 10 */ 11 12/* Additional note: this code is very heavily munged from Henry's version 13 * in places. In some spots I've traded clarity for efficiency, so don't 14 * blame Henry for some of the lack of readability. 15 */ 16 17/* The names of the functions have been changed from regcomp and 18 * regexec to pregcomp and pregexec in order to avoid conflicts 19 * with the POSIX routines of the same names. 20*/ 21 22#ifdef PERL_EXT_RE_BUILD 23/* need to replace pregcomp et al, so enable that */ 24# ifndef PERL_IN_XSUB_RE 25# define PERL_IN_XSUB_RE 26# endif 27/* need access to debugger hooks */ 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING) 29# define DEBUGGING 30# endif 31#endif 32 33#ifdef PERL_IN_XSUB_RE 34/* We *really* need to overwrite these symbols: */ 35# define Perl_regexec_flags my_regexec 36# define Perl_regdump my_regdump 37# define Perl_regprop my_regprop 38# define Perl_re_intuit_start my_re_intuit_start 39/* *These* symbols are masked to allow static link. */ 40# define Perl_pregexec my_pregexec 41# define Perl_reginitcolors my_reginitcolors 42# define Perl_regclass_swash my_regclass_swash 43 44# define PERL_NO_GET_CONTEXT 45#endif 46 47/*SUPPRESS 112*/ 48/* 49 * pregcomp and pregexec -- regsub and regerror are not used in perl 50 * 51 * Copyright (c) 1986 by University of Toronto. 52 * Written by Henry Spencer. Not derived from licensed software. 53 * 54 * Permission is granted to anyone to use this software for any 55 * purpose on any computer system, and to redistribute it freely, 56 * subject to the following restrictions: 57 * 58 * 1. The author is not responsible for the consequences of use of 59 * this software, no matter how awful, even if they arise 60 * from defects in it. 61 * 62 * 2. The origin of this software must not be misrepresented, either 63 * by explicit claim or by omission. 64 * 65 * 3. Altered versions must be plainly marked as such, and must not 66 * be misrepresented as being the original software. 67 * 68 **** Alterations to Henry's code are... 69 **** 70 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 71 **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others 72 **** 73 **** You may distribute under the terms of either the GNU General Public 74 **** License or the Artistic License, as specified in the README file. 75 * 76 * Beware that some of this code is subtly aware of the way operator 77 * precedence is structured in regular expressions. Serious changes in 78 * regular-expression syntax might require a total rethink. 79 */ 80#include "EXTERN.h" 81#define PERL_IN_REGEXEC_C 82#include "perl.h" 83 84#include "regcomp.h" 85 86#define RF_tainted 1 /* tainted information used? */ 87#define RF_warned 2 /* warned about big count? */ 88#define RF_evaled 4 /* Did an EVAL with setting? */ 89#define RF_utf8 8 /* String contains multibyte chars? */ 90#define RF_false 16 /* odd number of nested negatives */ 91 92#define UTF ((PL_reg_flags & RF_utf8) != 0) 93 94#define RS_init 1 /* eval environment created */ 95#define RS_set 2 /* replsv value is set */ 96 97#ifndef STATIC 98#define STATIC static 99#endif 100 101#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c))) 102 103/* 104 * Forwards. 105 */ 106 107#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv)) 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b) 109 110#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off)) 111#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off)) 112#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off)) 113#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off)) 114#define HOPc(pos,off) ((char*)HOP(pos,off)) 115#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off)) 116 117#define HOPBACK(pos, off) ( \ 118 (PL_reg_match_utf8) \ 119 ? reghopmaybe((U8*)pos, -off) \ 120 : (pos - off >= PL_bostr) \ 121 ? (U8*)(pos - off) \ 122 : (U8*)NULL \ 123) 124#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off) 125 126#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim)) 127#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim)) 128#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off)) 129#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off)) 130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 131#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim)) 132 133#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END 134 135/* for use after a quantifier and before an EXACT-like node -- japhy */ 136#define JUMPABLE(rn) ( \ 137 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \ 138 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 139 OP(rn) == PLUS || OP(rn) == MINMOD || \ 140 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \ 141) 142 143#define HAS_TEXT(rn) ( \ 144 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \ 145) 146 147/* 148 Search for mandatory following text node; for lookahead, the text must 149 follow but for lookbehind (rn->flags != 0) we skip to the next step. 150*/ 151#define FIND_NEXT_IMPT(rn) STMT_START { \ 152 while (JUMPABLE(rn)) \ 153 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \ 154 rn = NEXTOPER(NEXTOPER(rn)); \ 155 else if (OP(rn) == PLUS) \ 156 rn = NEXTOPER(rn); \ 157 else if (OP(rn) == IFMATCH) \ 158 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ 159 else rn += NEXT_OFF(rn); \ 160} STMT_END 161 162static void restore_pos(pTHX_ void *arg); 163 164STATIC CHECKPOINT 165S_regcppush(pTHX_ I32 parenfloor) 166{ 167 int retval = PL_savestack_ix; 168#define REGCP_PAREN_ELEMS 4 169 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS; 170 int p; 171 172 if (paren_elems_to_push < 0) 173 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); 174 175#define REGCP_OTHER_ELEMS 6 176 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS); 177 for (p = PL_regsize; p > parenfloor; p--) { 178/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ 179 SSPUSHINT(PL_regendp[p]); 180 SSPUSHINT(PL_regstartp[p]); 181 SSPUSHPTR(PL_reg_start_tmp[p]); 182 SSPUSHINT(p); 183 } 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 185 SSPUSHINT(PL_regsize); 186 SSPUSHINT(*PL_reglastparen); 187 SSPUSHINT(*PL_reglastcloseparen); 188 SSPUSHPTR(PL_reginput); 189#define REGCP_FRAME_ELEMS 2 190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and 191 * are needed for the regexp context stack bookkeeping. */ 192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */ 194 195 return retval; 196} 197 198/* These are needed since we do not localize EVAL nodes: */ 199# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \ 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \ 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix 202 203# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \ 204 PerlIO_printf(Perl_debug_log, \ 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp) 207 208STATIC char * 209S_regcppop(pTHX) 210{ 211 I32 i; 212 U32 paren = 0; 213 char *input; 214 I32 tmps; 215 216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 217 i = SSPOPINT; 218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 219 i = SSPOPINT; /* Parentheses elements to pop. */ 220 input = (char *) SSPOPPTR; 221 *PL_reglastcloseparen = SSPOPINT; 222 *PL_reglastparen = SSPOPINT; 223 PL_regsize = SSPOPINT; 224 225 /* Now restore the parentheses context. */ 226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 227 i > 0; i -= REGCP_PAREN_ELEMS) { 228 paren = (U32)SSPOPINT; 229 PL_reg_start_tmp[paren] = (char *) SSPOPPTR; 230 PL_regstartp[paren] = SSPOPINT; 231 tmps = SSPOPINT; 232 if (paren <= *PL_reglastparen) 233 PL_regendp[paren] = tmps; 234 DEBUG_r( 235 PerlIO_printf(Perl_debug_log, 236 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", 237 (UV)paren, (IV)PL_regstartp[paren], 238 (IV)(PL_reg_start_tmp[paren] - PL_bostr), 239 (IV)PL_regendp[paren], 240 (paren > *PL_reglastparen ? "(no)" : "")); 241 ); 242 } 243 DEBUG_r( 244 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) { 245 PerlIO_printf(Perl_debug_log, 246 " restoring \\%"IVdf"..\\%"IVdf" to undef\n", 247 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar); 248 } 249 ); 250#if 1 251 /* It would seem that the similar code in regtry() 252 * already takes care of this, and in fact it is in 253 * a better location to since this code can #if 0-ed out 254 * but the code in regtry() is needed or otherwise tests 255 * requiring null fields (pat.t#187 and split.t#{13,14} 256 * (as of patchlevel 7877) will fail. Then again, 257 * this code seems to be necessary or otherwise 258 * building DynaLoader will fail: 259 * "Error: '*' not in typemap in DynaLoader.xs, line 164" 260 * --jhi */ 261 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) { 262 if ((I32)paren > PL_regsize) 263 PL_regstartp[paren] = -1; 264 PL_regendp[paren] = -1; 265 } 266#endif 267 return input; 268} 269 270STATIC char * 271S_regcp_set_to(pTHX_ I32 ss) 272{ 273 I32 tmp = PL_savestack_ix; 274 275 PL_savestack_ix = ss; 276 regcppop(); 277 PL_savestack_ix = tmp; 278 return Nullch; 279} 280 281typedef struct re_cc_state 282{ 283 I32 ss; 284 regnode *node; 285 struct re_cc_state *prev; 286 CURCUR *cc; 287 regexp *re; 288} re_cc_state; 289 290#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 291 292#define TRYPAREN(paren, n, input) { \ 293 if (paren) { \ 294 if (n) { \ 295 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \ 296 PL_regendp[paren] = input - PL_bostr; \ 297 } \ 298 else \ 299 PL_regendp[paren] = -1; \ 300 } \ 301 if (regmatch(next)) \ 302 sayYES; \ 303 if (paren && n) \ 304 PL_regendp[paren] = -1; \ 305} 306 307 308/* 309 * pregexec and friends 310 */ 311 312/* 313 - pregexec - match a regexp against a string 314 */ 315I32 316Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend, 317 char *strbeg, I32 minend, SV *screamer, U32 nosave) 318/* strend: pointer to null at end of string */ 319/* strbeg: real beginning of string */ 320/* minend: end of match must be >=minend after stringarg. */ 321/* nosave: For optimizations. */ 322{ 323 return 324 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 325 nosave ? 0 : REXEC_COPY_STR); 326} 327 328STATIC void 329S_cache_re(pTHX_ regexp *prog) 330{ 331 PL_regprecomp = prog->precomp; /* Needed for FAIL. */ 332#ifdef DEBUGGING 333 PL_regprogram = prog->program; 334#endif 335 PL_regnpar = prog->nparens; 336 PL_regdata = prog->data; 337 PL_reg_re = prog; 338} 339 340/* 341 * Need to implement the following flags for reg_anch: 342 * 343 * USE_INTUIT_NOML - Useful to call re_intuit_start() first 344 * USE_INTUIT_ML 345 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer 346 * INTUIT_AUTORITATIVE_ML 347 * INTUIT_ONCE_NOML - Intuit can match in one location only. 348 * INTUIT_ONCE_ML 349 * 350 * Another flag for this function: SECOND_TIME (so that float substrs 351 * with giant delta may be not rechecked). 352 */ 353 354/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ 355 356/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend. 357 Otherwise, only SvCUR(sv) is used to get strbeg. */ 358 359/* XXXX We assume that strpos is strbeg unless sv. */ 360 361/* XXXX Some places assume that there is a fixed substring. 362 An update may be needed if optimizer marks as "INTUITable" 363 RExen without fixed substrings. Similarly, it is assumed that 364 lengths of all the strings are no more than minlen, thus they 365 cannot come from lookahead. 366 (Or minlen should take into account lookahead.) */ 367 368/* A failure to find a constant substring means that there is no need to make 369 an expensive call to REx engine, thus we celebrate a failure. Similarly, 370 finding a substring too deep into the string means that less calls to 371 regtry() should be needed. 372 373 REx compiler's optimizer found 4 possible hints: 374 a) Anchored substring; 375 b) Fixed substring; 376 c) Whether we are anchored (beginning-of-line or \G); 377 d) First node (of those at offset 0) which may distingush positions; 378 We use a)b)d) and multiline-part of c), and try to find a position in the 379 string which does not contradict any of them. 380 */ 381 382/* Most of decisions we do here should have been done at compile time. 383 The nodes of the REx which we used for the search should have been 384 deleted from the finite automaton. */ 385 386char * 387Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, 388 char *strend, U32 flags, re_scream_pos_data *data) 389{ 390 register I32 start_shift = 0; 391 /* Should be nonnegative! */ 392 register I32 end_shift = 0; 393 register char *s; 394 register SV *check; 395 char *strbeg; 396 char *t; 397 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */ 398 I32 ml_anch; 399 register char *other_last = Nullch; /* other substr checked before this */ 400 char *check_at = Nullch; /* check substr found at this pos */ 401#ifdef DEBUGGING 402 char *i_strpos = strpos; 403 SV *dsv = PERL_DEBUG_PAD_ZERO(0); 404#endif 405 RX_MATCH_UTF8_set(prog,do_utf8); 406 407 if (prog->reganch & ROPT_UTF8) { 408 DEBUG_r(PerlIO_printf(Perl_debug_log, 409 "UTF-8 regex...\n")); 410 PL_reg_flags |= RF_utf8; 411 } 412 413 DEBUG_r({ 414 char *s = PL_reg_match_utf8 ? 415 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) : 416 strpos; 417 int len = PL_reg_match_utf8 ? 418 strlen(s) : strend - strpos; 419 if (!PL_colorset) 420 reginitcolors(); 421 if (PL_reg_match_utf8) 422 DEBUG_r(PerlIO_printf(Perl_debug_log, 423 "UTF-8 target...\n")); 424 PerlIO_printf(Perl_debug_log, 425 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n", 426 PL_colors[4],PL_colors[5],PL_colors[0], 427 prog->precomp, 428 PL_colors[1], 429 (strlen(prog->precomp) > 60 ? "..." : ""), 430 PL_colors[0], 431 (int)(len > 60 ? 60 : len), 432 s, PL_colors[1], 433 (len > 60 ? "..." : "") 434 ); 435 }); 436 437 /* CHR_DIST() would be more correct here but it makes things slow. */ 438 if (prog->minlen > strend - strpos) { 439 DEBUG_r(PerlIO_printf(Perl_debug_log, 440 "String too short... [re_intuit_start]\n")); 441 goto fail; 442 } 443 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; 444 PL_regeol = strend; 445 if (do_utf8) { 446 if (!prog->check_utf8 && prog->check_substr) 447 to_utf8_substr(prog); 448 check = prog->check_utf8; 449 } else { 450 if (!prog->check_substr && prog->check_utf8) 451 to_byte_substr(prog); 452 check = prog->check_substr; 453 } 454 if (check == &PL_sv_undef) { 455 DEBUG_r(PerlIO_printf(Perl_debug_log, 456 "Non-utf string cannot match utf check string\n")); 457 goto fail; 458 } 459 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */ 460 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE) 461 || ( (prog->reganch & ROPT_ANCH_BOL) 462 && !PL_multiline ) ); /* Check after \n? */ 463 464 if (!ml_anch) { 465 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */ 466 | ROPT_IMPLICIT)) /* not a real BOL */ 467 /* SvCUR is not set on references: SvRV and SvPVX overlap */ 468 && sv && !SvROK(sv) 469 && (strpos != strbeg)) { 470 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); 471 goto fail; 472 } 473 if (prog->check_offset_min == prog->check_offset_max && 474 !(prog->reganch & ROPT_CANY_SEEN)) { 475 /* Substring at constant offset from beg-of-str... */ 476 I32 slen; 477 478 s = HOP3c(strpos, prog->check_offset_min, strend); 479 if (SvTAIL(check)) { 480 slen = SvCUR(check); /* >= 1 */ 481 482 if ( strend - s > slen || strend - s < slen - 1 483 || (strend - s == slen && strend[-1] != '\n')) { 484 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); 485 goto fail_finish; 486 } 487 /* Now should match s[0..slen-2] */ 488 slen--; 489 if (slen && (*SvPVX(check) != *s 490 || (slen > 1 491 && memNE(SvPVX(check), s, slen)))) { 492 report_neq: 493 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); 494 goto fail_finish; 495 } 496 } 497 else if (*SvPVX(check) != *s 498 || ((slen = SvCUR(check)) > 1 499 && memNE(SvPVX(check), s, slen))) 500 goto report_neq; 501 goto success_at_start; 502 } 503 } 504 /* Match is anchored, but substr is not anchored wrt beg-of-str. */ 505 s = strpos; 506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 507 end_shift = prog->minlen - start_shift - 508 CHR_SVLEN(check) + (SvTAIL(check) != 0); 509 if (!ml_anch) { 510 I32 end = prog->check_offset_max + CHR_SVLEN(check) 511 - (SvTAIL(check) != 0); 512 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; 513 514 if (end_shift < eshift) 515 end_shift = eshift; 516 } 517 } 518 else { /* Can match at random position */ 519 ml_anch = 0; 520 s = strpos; 521 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 522 /* Should be nonnegative! */ 523 end_shift = prog->minlen - start_shift - 524 CHR_SVLEN(check) + (SvTAIL(check) != 0); 525 } 526 527#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ 528 if (end_shift < 0) 529 Perl_croak(aTHX_ "panic: end_shift"); 530#endif 531 532 restart: 533 /* Find a possible match in the region s..strend by looking for 534 the "check" substring in the region corrected by start/end_shift. */ 535 if (flags & REXEC_SCREAM) { 536 I32 p = -1; /* Internal iterator of scream. */ 537 I32 *pp = data ? data->scream_pos : &p; 538 539 if (PL_screamfirst[BmRARE(check)] >= 0 540 || ( BmRARE(check) == '\n' 541 && (BmPREVIOUS(check) == SvCUR(check) - 1) 542 && SvTAIL(check) )) 543 s = screaminstr(sv, check, 544 start_shift + (s - strbeg), end_shift, pp, 0); 545 else 546 goto fail_finish; 547 /* we may be pointing at the wrong string */ 548 if (s && RX_MATCH_COPIED(prog)) 549 s = strbeg + (s - SvPVX(sv)); 550 if (data) 551 *data->scream_olds = s; 552 } 553 else if (prog->reganch & ROPT_CANY_SEEN) 554 s = fbm_instr((U8*)(s + start_shift), 555 (U8*)(strend - end_shift), 556 check, PL_multiline ? FBMrf_MULTILINE : 0); 557 else 558 s = fbm_instr(HOP3(s, start_shift, strend), 559 HOP3(strend, -end_shift, strbeg), 560 check, PL_multiline ? FBMrf_MULTILINE : 0); 561 562 /* Update the count-of-usability, remove useless subpatterns, 563 unshift s. */ 564 565 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s", 566 (s ? "Found" : "Did not find"), 567 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"), 568 PL_colors[0], 569 (int)(SvCUR(check) - (SvTAIL(check)!=0)), 570 SvPVX(check), 571 PL_colors[1], (SvTAIL(check) ? "$" : ""), 572 (s ? " at offset " : "...\n") ) ); 573 574 if (!s) 575 goto fail_finish; 576 577 check_at = s; 578 579 /* Finish the diagnostic message */ 580 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); 581 582 /* Got a candidate. Check MBOL anchoring, and the *other* substr. 583 Start with the other substr. 584 XXXX no SCREAM optimization yet - and a very coarse implementation 585 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will 586 *always* match. Probably should be marked during compile... 587 Probably it is right to do no SCREAM here... 588 */ 589 590 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) { 591 /* Take into account the "other" substring. */ 592 /* XXXX May be hopelessly wrong for UTF... */ 593 if (!other_last) 594 other_last = strpos; 595 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) { 596 do_other_anchored: 597 { 598 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2; 599 char *s1 = s; 600 SV* must; 601 602 t = s - prog->check_offset_max; 603 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 604 && (!do_utf8 605 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos)) 606 && t > strpos))) 607 /* EMPTY */; 608 else 609 t = strpos; 610 t = HOP3c(t, prog->anchored_offset, strend); 611 if (t < other_last) /* These positions already checked */ 612 t = other_last; 613 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); 614 if (last < last1) 615 last1 = last; 616 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ 617 /* On end-of-str: see comment below. */ 618 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 619 if (must == &PL_sv_undef) { 620 s = (char*)NULL; 621 DEBUG_r(must = prog->anchored_utf8); /* for debug */ 622 } 623 else 624 s = fbm_instr( 625 (unsigned char*)t, 626 HOP3(HOP3(last1, prog->anchored_offset, strend) 627 + SvCUR(must), -(SvTAIL(must)!=0), strbeg), 628 must, 629 PL_multiline ? FBMrf_MULTILINE : 0 630 ); 631 DEBUG_r(PerlIO_printf(Perl_debug_log, 632 "%s anchored substr `%s%.*s%s'%s", 633 (s ? "Found" : "Contradicts"), 634 PL_colors[0], 635 (int)(SvCUR(must) 636 - (SvTAIL(must)!=0)), 637 SvPVX(must), 638 PL_colors[1], (SvTAIL(must) ? "$" : ""))); 639 if (!s) { 640 if (last1 >= last2) { 641 DEBUG_r(PerlIO_printf(Perl_debug_log, 642 ", giving up...\n")); 643 goto fail_finish; 644 } 645 DEBUG_r(PerlIO_printf(Perl_debug_log, 646 ", trying floating at offset %ld...\n", 647 (long)(HOP3c(s1, 1, strend) - i_strpos))); 648 other_last = HOP3c(last1, prog->anchored_offset+1, strend); 649 s = HOP3c(last, 1, strend); 650 goto restart; 651 } 652 else { 653 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 654 (long)(s - i_strpos))); 655 t = HOP3c(s, -prog->anchored_offset, strbeg); 656 other_last = HOP3c(s, 1, strend); 657 s = s1; 658 if (t == strpos) 659 goto try_at_start; 660 goto try_at_offset; 661 } 662 } 663 } 664 else { /* Take into account the floating substring. */ 665 char *last, *last1; 666 char *s1 = s; 667 SV* must; 668 669 t = HOP3c(s, -start_shift, strbeg); 670 last1 = last = 671 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); 672 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) 673 last = HOP3c(t, prog->float_max_offset, strend); 674 s = HOP3c(t, prog->float_min_offset, strend); 675 if (s < other_last) 676 s = other_last; 677 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ 678 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 679 /* fbm_instr() takes into account exact value of end-of-str 680 if the check is SvTAIL(ed). Since false positives are OK, 681 and end-of-str is not later than strend we are OK. */ 682 if (must == &PL_sv_undef) { 683 s = (char*)NULL; 684 DEBUG_r(must = prog->float_utf8); /* for debug message */ 685 } 686 else 687 s = fbm_instr((unsigned char*)s, 688 (unsigned char*)last + SvCUR(must) 689 - (SvTAIL(must)!=0), 690 must, PL_multiline ? FBMrf_MULTILINE : 0); 691 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s", 692 (s ? "Found" : "Contradicts"), 693 PL_colors[0], 694 (int)(SvCUR(must) - (SvTAIL(must)!=0)), 695 SvPVX(must), 696 PL_colors[1], (SvTAIL(must) ? "$" : ""))); 697 if (!s) { 698 if (last1 == last) { 699 DEBUG_r(PerlIO_printf(Perl_debug_log, 700 ", giving up...\n")); 701 goto fail_finish; 702 } 703 DEBUG_r(PerlIO_printf(Perl_debug_log, 704 ", trying anchored starting at offset %ld...\n", 705 (long)(s1 + 1 - i_strpos))); 706 other_last = last; 707 s = HOP3c(t, 1, strend); 708 goto restart; 709 } 710 else { 711 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 712 (long)(s - i_strpos))); 713 other_last = s; /* Fix this later. --Hugo */ 714 s = s1; 715 if (t == strpos) 716 goto try_at_start; 717 goto try_at_offset; 718 } 719 } 720 } 721 722 t = s - prog->check_offset_max; 723 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 724 && (!do_utf8 725 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos)) 726 && t > strpos))) { 727 /* Fixed substring is found far enough so that the match 728 cannot start at strpos. */ 729 try_at_offset: 730 if (ml_anch && t[-1] != '\n') { 731 /* Eventually fbm_*() should handle this, but often 732 anchored_offset is not 0, so this check will not be wasted. */ 733 /* XXXX In the code below we prefer to look for "^" even in 734 presence of anchored substrings. And we search even 735 beyond the found float position. These pessimizations 736 are historical artefacts only. */ 737 find_anchor: 738 while (t < strend - prog->minlen) { 739 if (*t == '\n') { 740 if (t < check_at - prog->check_offset_min) { 741 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) { 742 /* Since we moved from the found position, 743 we definitely contradict the found anchored 744 substr. Due to the above check we do not 745 contradict "check" substr. 746 Thus we can arrive here only if check substr 747 is float. Redo checking for "other"=="fixed". 748 */ 749 strpos = t + 1; 750 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", 751 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); 752 goto do_other_anchored; 753 } 754 /* We don't contradict the found floating substring. */ 755 /* XXXX Why not check for STCLASS? */ 756 s = t + 1; 757 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", 758 PL_colors[0],PL_colors[1], (long)(s - i_strpos))); 759 goto set_useful; 760 } 761 /* Position contradicts check-string */ 762 /* XXXX probably better to look for check-string 763 than for "\n", so one should lower the limit for t? */ 764 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", 765 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos))); 766 other_last = strpos = s = t + 1; 767 goto restart; 768 } 769 t++; 770 } 771 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", 772 PL_colors[0],PL_colors[1])); 773 goto fail_finish; 774 } 775 else { 776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", 777 PL_colors[0],PL_colors[1])); 778 } 779 s = t; 780 set_useful: 781 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 782 } 783 else { 784 /* The found string does not prohibit matching at strpos, 785 - no optimization of calling REx engine can be performed, 786 unless it was an MBOL and we are not after MBOL, 787 or a future STCLASS check will fail this. */ 788 try_at_start: 789 /* Even in this situation we may use MBOL flag if strpos is offset 790 wrt the start of the string. */ 791 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */ 792 && (strpos != strbeg) && strpos[-1] != '\n' 793 /* May be due to an implicit anchor of m{.*foo} */ 794 && !(prog->reganch & ROPT_IMPLICIT)) 795 { 796 t = strpos; 797 goto find_anchor; 798 } 799 DEBUG_r( if (ml_anch) 800 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", 801 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]); 802 ); 803 success_at_start: 804 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */ 805 && (do_utf8 ? ( 806 prog->check_utf8 /* Could be deleted already */ 807 && --BmUSEFUL(prog->check_utf8) < 0 808 && (prog->check_utf8 == prog->float_utf8) 809 ) : ( 810 prog->check_substr /* Could be deleted already */ 811 && --BmUSEFUL(prog->check_substr) < 0 812 && (prog->check_substr == prog->float_substr) 813 ))) 814 { 815 /* If flags & SOMETHING - do not do it many times on the same match */ 816 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); 817 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr); 818 if (do_utf8 ? prog->check_substr : prog->check_utf8) 819 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8); 820 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */ 821 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */ 822 check = Nullsv; /* abort */ 823 s = strpos; 824 /* XXXX This is a remnant of the old implementation. It 825 looks wasteful, since now INTUIT can use many 826 other heuristics. */ 827 prog->reganch &= ~RE_USE_INTUIT; 828 } 829 else 830 s = strpos; 831 } 832 833 /* Last resort... */ 834 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ 835 if (prog->regstclass) { 836 /* minlen == 0 is possible if regstclass is \b or \B, 837 and the fixed substr is ''$. 838 Since minlen is already taken into account, s+1 is before strend; 839 accidentally, minlen >= 1 guaranties no false positives at s + 1 840 even for \b or \B. But (minlen? 1 : 0) below assumes that 841 regstclass does not come from lookahead... */ 842 /* If regstclass takes bytelength more than 1: If charlength==1, OK. 843 This leaves EXACTF only, which is dealt with in find_byclass(). */ 844 U8* str = (U8*)STRING(prog->regstclass); 845 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT 846 ? CHR_DIST(str+STR_LEN(prog->regstclass), str) 847 : 1); 848 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 849 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend) 850 : (prog->float_substr || prog->float_utf8 851 ? HOP3c(HOP3c(check_at, -start_shift, strbeg), 852 cl_l, strend) 853 : strend); 854 char *startpos = strbeg; 855 856 t = s; 857 cache_re(prog); 858 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1); 859 if (!s) { 860#ifdef DEBUGGING 861 char *what = 0; 862#endif 863 if (endpos == strend) { 864 DEBUG_r( PerlIO_printf(Perl_debug_log, 865 "Could not match STCLASS...\n") ); 866 goto fail; 867 } 868 DEBUG_r( PerlIO_printf(Perl_debug_log, 869 "This position contradicts STCLASS...\n") ); 870 if ((prog->reganch & ROPT_ANCH) && !ml_anch) 871 goto fail; 872 /* Contradict one of substrings */ 873 if (prog->anchored_substr || prog->anchored_utf8) { 874 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) { 875 DEBUG_r( what = "anchored" ); 876 hop_and_restart: 877 s = HOP3c(t, 1, strend); 878 if (s + start_shift + end_shift > strend) { 879 /* XXXX Should be taken into account earlier? */ 880 DEBUG_r( PerlIO_printf(Perl_debug_log, 881 "Could not match STCLASS...\n") ); 882 goto fail; 883 } 884 if (!check) 885 goto giveup; 886 DEBUG_r( PerlIO_printf(Perl_debug_log, 887 "Looking for %s substr starting at offset %ld...\n", 888 what, (long)(s + start_shift - i_strpos)) ); 889 goto restart; 890 } 891 /* Have both, check_string is floating */ 892 if (t + start_shift >= check_at) /* Contradicts floating=check */ 893 goto retry_floating_check; 894 /* Recheck anchored substring, but not floating... */ 895 s = check_at; 896 if (!check) 897 goto giveup; 898 DEBUG_r( PerlIO_printf(Perl_debug_log, 899 "Looking for anchored substr starting at offset %ld...\n", 900 (long)(other_last - i_strpos)) ); 901 goto do_other_anchored; 902 } 903 /* Another way we could have checked stclass at the 904 current position only: */ 905 if (ml_anch) { 906 s = t = t + 1; 907 if (!check) 908 goto giveup; 909 DEBUG_r( PerlIO_printf(Perl_debug_log, 910 "Looking for /%s^%s/m starting at offset %ld...\n", 911 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) ); 912 goto try_at_offset; 913 } 914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ 915 goto fail; 916 /* Check is floating subtring. */ 917 retry_floating_check: 918 t = check_at - start_shift; 919 DEBUG_r( what = "floating" ); 920 goto hop_and_restart; 921 } 922 if (t != s) { 923 DEBUG_r(PerlIO_printf(Perl_debug_log, 924 "By STCLASS: moving %ld --> %ld\n", 925 (long)(t - i_strpos), (long)(s - i_strpos)) 926 ); 927 } 928 else { 929 DEBUG_r(PerlIO_printf(Perl_debug_log, 930 "Does not contradict STCLASS...\n"); 931 ); 932 } 933 } 934 giveup: 935 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", 936 PL_colors[4], (check ? "Guessed" : "Giving up"), 937 PL_colors[5], (long)(s - i_strpos)) ); 938 return s; 939 940 fail_finish: /* Substring not found */ 941 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 942 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 943 fail: 944 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", 945 PL_colors[4],PL_colors[5])); 946 return Nullch; 947} 948 949/* We know what class REx starts with. Try to find this position... */ 950STATIC char * 951S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun) 952{ 953 I32 doevery = (prog->reganch & ROPT_SKIP) == 0; 954 char *m; 955 STRLEN ln; 956 STRLEN lnc; 957 register STRLEN uskip; 958 unsigned int c1; 959 unsigned int c2; 960 char *e; 961 register I32 tmp = 1; /* Scratch variable? */ 962 register bool do_utf8 = PL_reg_match_utf8; 963 964 /* We know what class it must start with. */ 965 switch (OP(c)) { 966 case ANYOF: 967 if (do_utf8) { 968 while (s + (uskip = UTF8SKIP(s)) <= strend) { 969 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) || 970 !UTF8_IS_INVARIANT((U8)s[0]) ? 971 reginclass(c, (U8*)s, 0, do_utf8) : 972 REGINCLASS(c, (U8*)s)) { 973 if (tmp && (norun || regtry(prog, s))) 974 goto got_it; 975 else 976 tmp = doevery; 977 } 978 else 979 tmp = 1; 980 s += uskip; 981 } 982 } 983 else { 984 while (s < strend) { 985 STRLEN skip = 1; 986 987 if (REGINCLASS(c, (U8*)s) || 988 (ANYOF_FOLD_SHARP_S(c, s, strend) && 989 /* The assignment of 2 is intentional: 990 * for the folded sharp s, the skip is 2. */ 991 (skip = SHARP_S_SKIP))) { 992 if (tmp && (norun || regtry(prog, s))) 993 goto got_it; 994 else 995 tmp = doevery; 996 } 997 else 998 tmp = 1; 999 s += skip; 1000 } 1001 } 1002 break; 1003 case CANY: 1004 while (s < strend) { 1005 if (tmp && (norun || regtry(prog, s))) 1006 goto got_it; 1007 else 1008 tmp = doevery; 1009 s++; 1010 } 1011 break; 1012 case EXACTF: 1013 m = STRING(c); 1014 ln = STR_LEN(c); /* length to match in octets/bytes */ 1015 lnc = (I32) ln; /* length to match in characters */ 1016 if (UTF) { 1017 STRLEN ulen1, ulen2; 1018 U8 *sm = (U8 *) m; 1019 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; 1020 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; 1021 1022 to_utf8_lower((U8*)m, tmpbuf1, &ulen1); 1023 to_utf8_upper((U8*)m, tmpbuf2, &ulen2); 1024 1025 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 1026 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1027 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, 1028 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1029 lnc = 0; 1030 while (sm < ((U8 *) m + ln)) { 1031 lnc++; 1032 sm += UTF8SKIP(sm); 1033 } 1034 } 1035 else { 1036 c1 = *(U8*)m; 1037 c2 = PL_fold[c1]; 1038 } 1039 goto do_exactf; 1040 case EXACTFL: 1041 m = STRING(c); 1042 ln = STR_LEN(c); 1043 lnc = (I32) ln; 1044 c1 = *(U8*)m; 1045 c2 = PL_fold_locale[c1]; 1046 do_exactf: 1047 e = HOP3c(strend, -((I32)lnc), s); 1048 1049 if (norun && e < s) 1050 e = s; /* Due to minlen logic of intuit() */ 1051 1052 /* The idea in the EXACTF* cases is to first find the 1053 * first character of the EXACTF* node and then, if 1054 * necessary, case-insensitively compare the full 1055 * text of the node. The c1 and c2 are the first 1056 * characters (though in Unicode it gets a bit 1057 * more complicated because there are more cases 1058 * than just upper and lower: one needs to use 1059 * the so-called folding case for case-insensitive 1060 * matching (called "loose matching" in Unicode). 1061 * ibcmp_utf8() will do just that. */ 1062 1063 if (do_utf8) { 1064 UV c, f; 1065 U8 tmpbuf [UTF8_MAXLEN+1]; 1066 U8 foldbuf[UTF8_MAXLEN_FOLD+1]; 1067 STRLEN len, foldlen; 1068 1069 if (c1 == c2) { 1070 /* Upper and lower of 1st char are equal - 1071 * probably not a "letter". */ 1072 while (s <= e) { 1073 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, 1074 ckWARN(WARN_UTF8) ? 1075 0 : UTF8_ALLOW_ANY); 1076 if ( c == c1 1077 && (ln == len || 1078 ibcmp_utf8(s, (char **)0, 0, do_utf8, 1079 m, (char **)0, ln, (bool)UTF)) 1080 && (norun || regtry(prog, s)) ) 1081 goto got_it; 1082 else { 1083 uvchr_to_utf8(tmpbuf, c); 1084 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); 1085 if ( f != c 1086 && (f == c1 || f == c2) 1087 && (ln == foldlen || 1088 !ibcmp_utf8((char *) foldbuf, 1089 (char **)0, foldlen, do_utf8, 1090 m, 1091 (char **)0, ln, (bool)UTF)) 1092 && (norun || regtry(prog, s)) ) 1093 goto got_it; 1094 } 1095 s += len; 1096 } 1097 } 1098 else { 1099 while (s <= e) { 1100 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, 1101 ckWARN(WARN_UTF8) ? 1102 0 : UTF8_ALLOW_ANY); 1103 1104 /* Handle some of the three Greek sigmas cases. 1105 * Note that not all the possible combinations 1106 * are handled here: some of them are handled 1107 * by the standard folding rules, and some of 1108 * them (the character class or ANYOF cases) 1109 * are handled during compiletime in 1110 * regexec.c:S_regclass(). */ 1111 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA || 1112 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) 1113 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA; 1114 1115 if ( (c == c1 || c == c2) 1116 && (ln == len || 1117 ibcmp_utf8(s, (char **)0, 0, do_utf8, 1118 m, (char **)0, ln, (bool)UTF)) 1119 && (norun || regtry(prog, s)) ) 1120 goto got_it; 1121 else { 1122 uvchr_to_utf8(tmpbuf, c); 1123 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); 1124 if ( f != c 1125 && (f == c1 || f == c2) 1126 && (ln == foldlen || 1127 !ibcmp_utf8((char *) foldbuf, 1128 (char **)0, foldlen, do_utf8, 1129 m, 1130 (char **)0, ln, (bool)UTF)) 1131 && (norun || regtry(prog, s)) ) 1132 goto got_it; 1133 } 1134 s += len; 1135 } 1136 } 1137 } 1138 else { 1139 if (c1 == c2) 1140 while (s <= e) { 1141 if ( *(U8*)s == c1 1142 && (ln == 1 || !(OP(c) == EXACTF 1143 ? ibcmp(s, m, ln) 1144 : ibcmp_locale(s, m, ln))) 1145 && (norun || regtry(prog, s)) ) 1146 goto got_it; 1147 s++; 1148 } 1149 else 1150 while (s <= e) { 1151 if ( (*(U8*)s == c1 || *(U8*)s == c2) 1152 && (ln == 1 || !(OP(c) == EXACTF 1153 ? ibcmp(s, m, ln) 1154 : ibcmp_locale(s, m, ln))) 1155 && (norun || regtry(prog, s)) ) 1156 goto got_it; 1157 s++; 1158 } 1159 } 1160 break; 1161 case BOUNDL: 1162 PL_reg_flags |= RF_tainted; 1163 /* FALL THROUGH */ 1164 case BOUND: 1165 if (do_utf8) { 1166 if (s == PL_bostr) 1167 tmp = '\n'; 1168 else { 1169 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1170 1171 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); 1172 } 1173 tmp = ((OP(c) == BOUND ? 1174 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1175 LOAD_UTF8_CHARCLASS(alnum,"a"); 1176 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1177 if (tmp == !(OP(c) == BOUND ? 1178 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1179 isALNUM_LC_utf8((U8*)s))) 1180 { 1181 tmp = !tmp; 1182 if ((norun || regtry(prog, s))) 1183 goto got_it; 1184 } 1185 s += uskip; 1186 } 1187 } 1188 else { 1189 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1190 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1191 while (s < strend) { 1192 if (tmp == 1193 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { 1194 tmp = !tmp; 1195 if ((norun || regtry(prog, s))) 1196 goto got_it; 1197 } 1198 s++; 1199 } 1200 } 1201 if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) 1202 goto got_it; 1203 break; 1204 case NBOUNDL: 1205 PL_reg_flags |= RF_tainted; 1206 /* FALL THROUGH */ 1207 case NBOUND: 1208 if (do_utf8) { 1209 if (s == PL_bostr) 1210 tmp = '\n'; 1211 else { 1212 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1213 1214 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); 1215 } 1216 tmp = ((OP(c) == NBOUND ? 1217 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1218 LOAD_UTF8_CHARCLASS(alnum,"a"); 1219 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1220 if (tmp == !(OP(c) == NBOUND ? 1221 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1222 isALNUM_LC_utf8((U8*)s))) 1223 tmp = !tmp; 1224 else if ((norun || regtry(prog, s))) 1225 goto got_it; 1226 s += uskip; 1227 } 1228 } 1229 else { 1230 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1231 tmp = ((OP(c) == NBOUND ? 1232 isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1233 while (s < strend) { 1234 if (tmp == 1235 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) 1236 tmp = !tmp; 1237 else if ((norun || regtry(prog, s))) 1238 goto got_it; 1239 s++; 1240 } 1241 } 1242 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) 1243 goto got_it; 1244 break; 1245 case ALNUM: 1246 if (do_utf8) { 1247 LOAD_UTF8_CHARCLASS(alnum,"a"); 1248 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1249 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { 1250 if (tmp && (norun || regtry(prog, s))) 1251 goto got_it; 1252 else 1253 tmp = doevery; 1254 } 1255 else 1256 tmp = 1; 1257 s += uskip; 1258 } 1259 } 1260 else { 1261 while (s < strend) { 1262 if (isALNUM(*s)) { 1263 if (tmp && (norun || regtry(prog, s))) 1264 goto got_it; 1265 else 1266 tmp = doevery; 1267 } 1268 else 1269 tmp = 1; 1270 s++; 1271 } 1272 } 1273 break; 1274 case ALNUML: 1275 PL_reg_flags |= RF_tainted; 1276 if (do_utf8) { 1277 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1278 if (isALNUM_LC_utf8((U8*)s)) { 1279 if (tmp && (norun || regtry(prog, s))) 1280 goto got_it; 1281 else 1282 tmp = doevery; 1283 } 1284 else 1285 tmp = 1; 1286 s += uskip; 1287 } 1288 } 1289 else { 1290 while (s < strend) { 1291 if (isALNUM_LC(*s)) { 1292 if (tmp && (norun || regtry(prog, s))) 1293 goto got_it; 1294 else 1295 tmp = doevery; 1296 } 1297 else 1298 tmp = 1; 1299 s++; 1300 } 1301 } 1302 break; 1303 case NALNUM: 1304 if (do_utf8) { 1305 LOAD_UTF8_CHARCLASS(alnum,"a"); 1306 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1307 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) { 1308 if (tmp && (norun || regtry(prog, s))) 1309 goto got_it; 1310 else 1311 tmp = doevery; 1312 } 1313 else 1314 tmp = 1; 1315 s += uskip; 1316 } 1317 } 1318 else { 1319 while (s < strend) { 1320 if (!isALNUM(*s)) { 1321 if (tmp && (norun || regtry(prog, s))) 1322 goto got_it; 1323 else 1324 tmp = doevery; 1325 } 1326 else 1327 tmp = 1; 1328 s++; 1329 } 1330 } 1331 break; 1332 case NALNUML: 1333 PL_reg_flags |= RF_tainted; 1334 if (do_utf8) { 1335 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1336 if (!isALNUM_LC_utf8((U8*)s)) { 1337 if (tmp && (norun || regtry(prog, s))) 1338 goto got_it; 1339 else 1340 tmp = doevery; 1341 } 1342 else 1343 tmp = 1; 1344 s += uskip; 1345 } 1346 } 1347 else { 1348 while (s < strend) { 1349 if (!isALNUM_LC(*s)) { 1350 if (tmp && (norun || regtry(prog, s))) 1351 goto got_it; 1352 else 1353 tmp = doevery; 1354 } 1355 else 1356 tmp = 1; 1357 s++; 1358 } 1359 } 1360 break; 1361 case SPACE: 1362 if (do_utf8) { 1363 LOAD_UTF8_CHARCLASS(space," "); 1364 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1365 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) { 1366 if (tmp && (norun || regtry(prog, s))) 1367 goto got_it; 1368 else 1369 tmp = doevery; 1370 } 1371 else 1372 tmp = 1; 1373 s += uskip; 1374 } 1375 } 1376 else { 1377 while (s < strend) { 1378 if (isSPACE(*s)) { 1379 if (tmp && (norun || regtry(prog, s))) 1380 goto got_it; 1381 else 1382 tmp = doevery; 1383 } 1384 else 1385 tmp = 1; 1386 s++; 1387 } 1388 } 1389 break; 1390 case SPACEL: 1391 PL_reg_flags |= RF_tainted; 1392 if (do_utf8) { 1393 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1394 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { 1395 if (tmp && (norun || regtry(prog, s))) 1396 goto got_it; 1397 else 1398 tmp = doevery; 1399 } 1400 else 1401 tmp = 1; 1402 s += uskip; 1403 } 1404 } 1405 else { 1406 while (s < strend) { 1407 if (isSPACE_LC(*s)) { 1408 if (tmp && (norun || regtry(prog, s))) 1409 goto got_it; 1410 else 1411 tmp = doevery; 1412 } 1413 else 1414 tmp = 1; 1415 s++; 1416 } 1417 } 1418 break; 1419 case NSPACE: 1420 if (do_utf8) { 1421 LOAD_UTF8_CHARCLASS(space," "); 1422 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1423 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) { 1424 if (tmp && (norun || regtry(prog, s))) 1425 goto got_it; 1426 else 1427 tmp = doevery; 1428 } 1429 else 1430 tmp = 1; 1431 s += uskip; 1432 } 1433 } 1434 else { 1435 while (s < strend) { 1436 if (!isSPACE(*s)) { 1437 if (tmp && (norun || regtry(prog, s))) 1438 goto got_it; 1439 else 1440 tmp = doevery; 1441 } 1442 else 1443 tmp = 1; 1444 s++; 1445 } 1446 } 1447 break; 1448 case NSPACEL: 1449 PL_reg_flags |= RF_tainted; 1450 if (do_utf8) { 1451 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1452 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { 1453 if (tmp && (norun || regtry(prog, s))) 1454 goto got_it; 1455 else 1456 tmp = doevery; 1457 } 1458 else 1459 tmp = 1; 1460 s += uskip; 1461 } 1462 } 1463 else { 1464 while (s < strend) { 1465 if (!isSPACE_LC(*s)) { 1466 if (tmp && (norun || regtry(prog, s))) 1467 goto got_it; 1468 else 1469 tmp = doevery; 1470 } 1471 else 1472 tmp = 1; 1473 s++; 1474 } 1475 } 1476 break; 1477 case DIGIT: 1478 if (do_utf8) { 1479 LOAD_UTF8_CHARCLASS(digit,"0"); 1480 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1481 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { 1482 if (tmp && (norun || regtry(prog, s))) 1483 goto got_it; 1484 else 1485 tmp = doevery; 1486 } 1487 else 1488 tmp = 1; 1489 s += uskip; 1490 } 1491 } 1492 else { 1493 while (s < strend) { 1494 if (isDIGIT(*s)) { 1495 if (tmp && (norun || regtry(prog, s))) 1496 goto got_it; 1497 else 1498 tmp = doevery; 1499 } 1500 else 1501 tmp = 1; 1502 s++; 1503 } 1504 } 1505 break; 1506 case DIGITL: 1507 PL_reg_flags |= RF_tainted; 1508 if (do_utf8) { 1509 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1510 if (isDIGIT_LC_utf8((U8*)s)) { 1511 if (tmp && (norun || regtry(prog, s))) 1512 goto got_it; 1513 else 1514 tmp = doevery; 1515 } 1516 else 1517 tmp = 1; 1518 s += uskip; 1519 } 1520 } 1521 else { 1522 while (s < strend) { 1523 if (isDIGIT_LC(*s)) { 1524 if (tmp && (norun || regtry(prog, s))) 1525 goto got_it; 1526 else 1527 tmp = doevery; 1528 } 1529 else 1530 tmp = 1; 1531 s++; 1532 } 1533 } 1534 break; 1535 case NDIGIT: 1536 if (do_utf8) { 1537 LOAD_UTF8_CHARCLASS(digit,"0"); 1538 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1539 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) { 1540 if (tmp && (norun || regtry(prog, s))) 1541 goto got_it; 1542 else 1543 tmp = doevery; 1544 } 1545 else 1546 tmp = 1; 1547 s += uskip; 1548 } 1549 } 1550 else { 1551 while (s < strend) { 1552 if (!isDIGIT(*s)) { 1553 if (tmp && (norun || regtry(prog, s))) 1554 goto got_it; 1555 else 1556 tmp = doevery; 1557 } 1558 else 1559 tmp = 1; 1560 s++; 1561 } 1562 } 1563 break; 1564 case NDIGITL: 1565 PL_reg_flags |= RF_tainted; 1566 if (do_utf8) { 1567 while (s + (uskip = UTF8SKIP(s)) <= strend) { 1568 if (!isDIGIT_LC_utf8((U8*)s)) { 1569 if (tmp && (norun || regtry(prog, s))) 1570 goto got_it; 1571 else 1572 tmp = doevery; 1573 } 1574 else 1575 tmp = 1; 1576 s += uskip; 1577 } 1578 } 1579 else { 1580 while (s < strend) { 1581 if (!isDIGIT_LC(*s)) { 1582 if (tmp && (norun || regtry(prog, s))) 1583 goto got_it; 1584 else 1585 tmp = doevery; 1586 } 1587 else 1588 tmp = 1; 1589 s++; 1590 } 1591 } 1592 break; 1593 default: 1594 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 1595 break; 1596 } 1597 return 0; 1598 got_it: 1599 return s; 1600} 1601 1602/* 1603 - regexec_flags - match a regexp against a string 1604 */ 1605I32 1606Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend, 1607 char *strbeg, I32 minend, SV *sv, void *data, U32 flags) 1608/* strend: pointer to null at end of string */ 1609/* strbeg: real beginning of string */ 1610/* minend: end of match must be >=minend after stringarg. */ 1611/* data: May be used for some additional optimizations. */ 1612/* nosave: For optimizations. */ 1613{ 1614 register char *s; 1615 register regnode *c; 1616 register char *startpos = stringarg; 1617 I32 minlen; /* must match at least this many chars */ 1618 I32 dontbother = 0; /* how many characters not to try at end */ 1619 /* I32 start_shift = 0; */ /* Offset of the start to find 1620 constant substr. */ /* CC */ 1621 I32 end_shift = 0; /* Same for the end. */ /* CC */ 1622 I32 scream_pos = -1; /* Internal iterator of scream. */ 1623 char *scream_olds; 1624 SV* oreplsv = GvSV(PL_replgv); 1625 bool do_utf8 = DO_UTF8(sv); 1626#ifdef DEBUGGING 1627 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0); 1628 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1); 1629#endif 1630 RX_MATCH_UTF8_set(prog,do_utf8); 1631 1632 PL_regcc = 0; 1633 1634 cache_re(prog); 1635#ifdef DEBUGGING 1636 PL_regnarrate = DEBUG_r_TEST; 1637#endif 1638 1639 /* Be paranoid... */ 1640 if (prog == NULL || startpos == NULL) { 1641 Perl_croak(aTHX_ "NULL regexp parameter"); 1642 return 0; 1643 } 1644 1645 minlen = prog->minlen; 1646 if (strend - startpos < minlen) { 1647 DEBUG_r(PerlIO_printf(Perl_debug_log, 1648 "String too short [regexec_flags]...\n")); 1649 goto phooey; 1650 } 1651 1652 /* Check validity of program. */ 1653 if (UCHARAT(prog->program) != REG_MAGIC) { 1654 Perl_croak(aTHX_ "corrupted regexp program"); 1655 } 1656 1657 PL_reg_flags = 0; 1658 PL_reg_eval_set = 0; 1659 PL_reg_maxiter = 0; 1660 1661 if (prog->reganch & ROPT_UTF8) 1662 PL_reg_flags |= RF_utf8; 1663 1664 /* Mark beginning of line for ^ and lookbehind. */ 1665 PL_regbol = startpos; 1666 PL_bostr = strbeg; 1667 PL_reg_sv = sv; 1668 1669 /* Mark end of line for $ (and such) */ 1670 PL_regeol = strend; 1671 1672 /* see how far we have to get to not match where we matched before */ 1673 PL_regtill = startpos+minend; 1674 1675 /* We start without call_cc context. */ 1676 PL_reg_call_cc = 0; 1677 1678 /* If there is a "must appear" string, look for it. */ 1679 s = startpos; 1680 1681 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */ 1682 MAGIC *mg; 1683 1684 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ 1685 PL_reg_ganch = startpos; 1686 else if (sv && SvTYPE(sv) >= SVt_PVMG 1687 && SvMAGIC(sv) 1688 && (mg = mg_find(sv, PERL_MAGIC_regex_global)) 1689 && mg->mg_len >= 0) { 1690 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */ 1691 if (prog->reganch & ROPT_ANCH_GPOS) { 1692 if (s > PL_reg_ganch) 1693 goto phooey; 1694 s = PL_reg_ganch; 1695 } 1696 } 1697 else /* pos() not defined */ 1698 PL_reg_ganch = strbeg; 1699 } 1700 1701 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) { 1702 re_scream_pos_data d; 1703 1704 d.scream_olds = &scream_olds; 1705 d.scream_pos = &scream_pos; 1706 s = re_intuit_start(prog, sv, s, strend, flags, &d); 1707 if (!s) { 1708 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); 1709 goto phooey; /* not present */ 1710 } 1711 } 1712 1713 DEBUG_r({ 1714 char *s0 = UTF ? 1715 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60, 1716 UNI_DISPLAY_REGEX) : 1717 prog->precomp; 1718 int len0 = UTF ? SvCUR(dsv0) : prog->prelen; 1719 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60, 1720 UNI_DISPLAY_REGEX) : startpos; 1721 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos; 1722 if (!PL_colorset) 1723 reginitcolors(); 1724 PerlIO_printf(Perl_debug_log, 1725 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n", 1726 PL_colors[4],PL_colors[5],PL_colors[0], 1727 len0, len0, s0, 1728 PL_colors[1], 1729 len0 > 60 ? "..." : "", 1730 PL_colors[0], 1731 (int)(len1 > 60 ? 60 : len1), 1732 s1, PL_colors[1], 1733 (len1 > 60 ? "..." : "") 1734 ); 1735 }); 1736 1737 /* Simplest case: anchored match need be tried only once. */ 1738 /* [unless only anchor is BOL and multiline is set] */ 1739 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) { 1740 if (s == startpos && regtry(prog, startpos)) 1741 goto got_it; 1742 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT) 1743 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */ 1744 { 1745 char *end; 1746 1747 if (minlen) 1748 dontbother = minlen - 1; 1749 end = HOP3c(strend, -dontbother, strbeg) - 1; 1750 /* for multiline we only have to try after newlines */ 1751 if (prog->check_substr || prog->check_utf8) { 1752 if (s == startpos) 1753 goto after_try; 1754 while (1) { 1755 if (regtry(prog, s)) 1756 goto got_it; 1757 after_try: 1758 if (s >= end) 1759 goto phooey; 1760 if (prog->reganch & RE_USE_INTUIT) { 1761 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL); 1762 if (!s) 1763 goto phooey; 1764 } 1765 else 1766 s++; 1767 } 1768 } else { 1769 if (s > startpos) 1770 s--; 1771 while (s < end) { 1772 if (*s++ == '\n') { /* don't need PL_utf8skip here */ 1773 if (regtry(prog, s)) 1774 goto got_it; 1775 } 1776 } 1777 } 1778 } 1779 goto phooey; 1780 } else if (prog->reganch & ROPT_ANCH_GPOS) { 1781 if (regtry(prog, PL_reg_ganch)) 1782 goto got_it; 1783 goto phooey; 1784 } 1785 1786 /* Messy cases: unanchored match. */ 1787 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) { 1788 /* we have /x+whatever/ */ 1789 /* it must be a one character string (XXXX Except UTF?) */ 1790 char ch; 1791#ifdef DEBUGGING 1792 int did_match = 0; 1793#endif 1794 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 1795 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1796 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0]; 1797 1798 if (do_utf8) { 1799 while (s < strend) { 1800 if (*s == ch) { 1801 DEBUG_r( did_match = 1 ); 1802 if (regtry(prog, s)) goto got_it; 1803 s += UTF8SKIP(s); 1804 while (s < strend && *s == ch) 1805 s += UTF8SKIP(s); 1806 } 1807 s += UTF8SKIP(s); 1808 } 1809 } 1810 else { 1811 while (s < strend) { 1812 if (*s == ch) { 1813 DEBUG_r( did_match = 1 ); 1814 if (regtry(prog, s)) goto got_it; 1815 s++; 1816 while (s < strend && *s == ch) 1817 s++; 1818 } 1819 s++; 1820 } 1821 } 1822 DEBUG_r(if (!did_match) 1823 PerlIO_printf(Perl_debug_log, 1824 "Did not find anchored character...\n") 1825 ); 1826 } 1827 /*SUPPRESS 560*/ 1828 else if (prog->anchored_substr != Nullsv 1829 || prog->anchored_utf8 != Nullsv 1830 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) 1831 && prog->float_max_offset < strend - s)) { 1832 SV *must; 1833 I32 back_max; 1834 I32 back_min; 1835 char *last; 1836 char *last1; /* Last position checked before */ 1837#ifdef DEBUGGING 1838 int did_match = 0; 1839#endif 1840 if (prog->anchored_substr || prog->anchored_utf8) { 1841 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 1842 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1843 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 1844 back_max = back_min = prog->anchored_offset; 1845 } else { 1846 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 1847 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1848 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 1849 back_max = prog->float_max_offset; 1850 back_min = prog->float_min_offset; 1851 } 1852 if (must == &PL_sv_undef) 1853 /* could not downgrade utf8 check substring, so must fail */ 1854 goto phooey; 1855 1856 last = HOP3c(strend, /* Cannot start after this */ 1857 -(I32)(CHR_SVLEN(must) 1858 - (SvTAIL(must) != 0) + back_min), strbeg); 1859 1860 if (s > PL_bostr) 1861 last1 = HOPc(s, -1); 1862 else 1863 last1 = s - 1; /* bogus */ 1864 1865 /* XXXX check_substr already used to find `s', can optimize if 1866 check_substr==must. */ 1867 scream_pos = -1; 1868 dontbother = end_shift; 1869 strend = HOPc(strend, -dontbother); 1870 while ( (s <= last) && 1871 ((flags & REXEC_SCREAM) 1872 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg, 1873 end_shift, &scream_pos, 0)) 1874 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend), 1875 (unsigned char*)strend, must, 1876 PL_multiline ? FBMrf_MULTILINE : 0))) ) { 1877 /* we may be pointing at the wrong string */ 1878 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog)) 1879 s = strbeg + (s - SvPVX(sv)); 1880 DEBUG_r( did_match = 1 ); 1881 if (HOPc(s, -back_max) > last1) { 1882 last1 = HOPc(s, -back_min); 1883 s = HOPc(s, -back_max); 1884 } 1885 else { 1886 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1; 1887 1888 last1 = HOPc(s, -back_min); 1889 s = t; 1890 } 1891 if (do_utf8) { 1892 while (s <= last1) { 1893 if (regtry(prog, s)) 1894 goto got_it; 1895 s += UTF8SKIP(s); 1896 } 1897 } 1898 else { 1899 while (s <= last1) { 1900 if (regtry(prog, s)) 1901 goto got_it; 1902 s++; 1903 } 1904 } 1905 } 1906 DEBUG_r(if (!did_match) 1907 PerlIO_printf(Perl_debug_log, 1908 "Did not find %s substr `%s%.*s%s'%s...\n", 1909 ((must == prog->anchored_substr || must == prog->anchored_utf8) 1910 ? "anchored" : "floating"), 1911 PL_colors[0], 1912 (int)(SvCUR(must) - (SvTAIL(must)!=0)), 1913 SvPVX(must), 1914 PL_colors[1], (SvTAIL(must) ? "$" : "")) 1915 ); 1916 goto phooey; 1917 } 1918 else if ((c = prog->regstclass)) { 1919 if (minlen) { 1920 I32 op = (U8)OP(prog->regstclass); 1921 /* don't bother with what can't match */ 1922 if (PL_regkind[op] != EXACT && op != CANY) 1923 strend = HOPc(strend, -(minlen - 1)); 1924 } 1925 DEBUG_r({ 1926 SV *prop = sv_newmortal(); 1927 char *s0; 1928 char *s1; 1929 int len0; 1930 int len1; 1931 1932 regprop(prop, c); 1933 s0 = UTF ? 1934 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60, 1935 UNI_DISPLAY_REGEX) : 1936 SvPVX(prop); 1937 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop); 1938 s1 = UTF ? 1939 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s; 1940 len1 = UTF ? SvCUR(dsv1) : strend - s; 1941 PerlIO_printf(Perl_debug_log, 1942 "Matching stclass `%*.*s' against `%*.*s'\n", 1943 len0, len0, s0, 1944 len1, len1, s1); 1945 }); 1946 if (find_byclass(prog, c, s, strend, startpos, 0)) 1947 goto got_it; 1948 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n")); 1949 } 1950 else { 1951 dontbother = 0; 1952 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) { 1953 /* Trim the end. */ 1954 char *last; 1955 SV* float_real; 1956 1957 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 1958 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 1959 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr; 1960 1961 if (flags & REXEC_SCREAM) { 1962 last = screaminstr(sv, float_real, s - strbeg, 1963 end_shift, &scream_pos, 1); /* last one */ 1964 if (!last) 1965 last = scream_olds; /* Only one occurrence. */ 1966 /* we may be pointing at the wrong string */ 1967 else if (RX_MATCH_COPIED(prog)) 1968 s = strbeg + (s - SvPVX(sv)); 1969 } 1970 else { 1971 STRLEN len; 1972 char *little = SvPV(float_real, len); 1973 1974 if (SvTAIL(float_real)) { 1975 if (memEQ(strend - len + 1, little, len - 1)) 1976 last = strend - len + 1; 1977 else if (!PL_multiline) 1978 last = memEQ(strend - len, little, len) 1979 ? strend - len : Nullch; 1980 else 1981 goto find_last; 1982 } else { 1983 find_last: 1984 if (len) 1985 last = rninstr(s, strend, little, little + len); 1986 else 1987 last = strend; /* matching `$' */ 1988 } 1989 } 1990 if (last == NULL) { 1991 DEBUG_r(PerlIO_printf(Perl_debug_log, 1992 "%sCan't trim the tail, match fails (should not happen)%s\n", 1993 PL_colors[4],PL_colors[5])); 1994 goto phooey; /* Should not happen! */ 1995 } 1996 dontbother = strend - last + prog->float_min_offset; 1997 } 1998 if (minlen && (dontbother < minlen)) 1999 dontbother = minlen - 1; 2000 strend -= dontbother; /* this one's always in bytes! */ 2001 /* We don't know much -- general case. */ 2002 if (do_utf8) { 2003 for (;;) { 2004 if (regtry(prog, s)) 2005 goto got_it; 2006 if (s >= strend) 2007 break; 2008 s += UTF8SKIP(s); 2009 }; 2010 } 2011 else { 2012 do { 2013 if (regtry(prog, s)) 2014 goto got_it; 2015 } while (s++ < strend); 2016 } 2017 } 2018 2019 /* Failure. */ 2020 goto phooey; 2021 2022got_it: 2023 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); 2024 2025 if (PL_reg_eval_set) { 2026 /* Preserve the current value of $^R */ 2027 if (oreplsv != GvSV(PL_replgv)) 2028 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is 2029 restored, the value remains 2030 the same. */ 2031 restore_pos(aTHX_ 0); 2032 } 2033 2034 /* make sure $`, $&, $', and $digit will work later */ 2035 if ( !(flags & REXEC_NOT_FIRST) ) { 2036 if (RX_MATCH_COPIED(prog)) { 2037 Safefree(prog->subbeg); 2038 RX_MATCH_COPIED_off(prog); 2039 } 2040 if (flags & REXEC_COPY_STR) { 2041 I32 i = PL_regeol - startpos + (stringarg - strbeg); 2042 2043 s = savepvn(strbeg, i); 2044 prog->subbeg = s; 2045 prog->sublen = i; 2046 RX_MATCH_COPIED_on(prog); 2047 } 2048 else { 2049 prog->subbeg = strbeg; 2050 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ 2051 } 2052 } 2053 2054 return 1; 2055 2056phooey: 2057 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", 2058 PL_colors[4],PL_colors[5])); 2059 if (PL_reg_eval_set) 2060 restore_pos(aTHX_ 0); 2061 return 0; 2062} 2063 2064/* 2065 - regtry - try match at specific point 2066 */ 2067STATIC I32 /* 0 failure, 1 success */ 2068S_regtry(pTHX_ regexp *prog, char *startpos) 2069{ 2070 register I32 i; 2071 register I32 *sp; 2072 register I32 *ep; 2073 CHECKPOINT lastcp; 2074 2075#ifdef DEBUGGING 2076 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */ 2077#endif 2078 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { 2079 MAGIC *mg; 2080 2081 PL_reg_eval_set = RS_init; 2082 DEBUG_r(DEBUG_s( 2083 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n", 2084 (IV)(PL_stack_sp - PL_stack_base)); 2085 )); 2086 SAVEI32(cxstack[cxstack_ix].blk_oldsp); 2087 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; 2088 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ 2089 SAVETMPS; 2090 /* Apparently this is not needed, judging by wantarray. */ 2091 /* SAVEI8(cxstack[cxstack_ix].blk_gimme); 2092 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ 2093 2094 if (PL_reg_sv) { 2095 /* Make $_ available to executed code. */ 2096 if (PL_reg_sv != DEFSV) { 2097 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */ 2098 SAVESPTR(DEFSV); 2099 DEFSV = PL_reg_sv; 2100 } 2101 2102 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) 2103 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) { 2104 /* prepare for quick setting of pos */ 2105 sv_magic(PL_reg_sv, (SV*)0, 2106 PERL_MAGIC_regex_global, Nullch, 0); 2107 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global); 2108 mg->mg_len = -1; 2109 } 2110 PL_reg_magic = mg; 2111 PL_reg_oldpos = mg->mg_len; 2112 SAVEDESTRUCTOR_X(restore_pos, 0); 2113 } 2114 if (!PL_reg_curpm) { 2115 Newz(22,PL_reg_curpm, 1, PMOP); 2116#ifdef USE_ITHREADS 2117 { 2118 SV* repointer = newSViv(0); 2119 /* so we know which PL_regex_padav element is PL_reg_curpm */ 2120 SvFLAGS(repointer) |= SVf_BREAK; 2121 av_push(PL_regex_padav,repointer); 2122 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); 2123 PL_regex_pad = AvARRAY(PL_regex_padav); 2124 } 2125#endif 2126 } 2127 PM_SETRE(PL_reg_curpm, prog); 2128 PL_reg_oldcurpm = PL_curpm; 2129 PL_curpm = PL_reg_curpm; 2130 if (RX_MATCH_COPIED(prog)) { 2131 /* Here is a serious problem: we cannot rewrite subbeg, 2132 since it may be needed if this match fails. Thus 2133 $` inside (?{}) could fail... */ 2134 PL_reg_oldsaved = prog->subbeg; 2135 PL_reg_oldsavedlen = prog->sublen; 2136 RX_MATCH_COPIED_off(prog); 2137 } 2138 else 2139 PL_reg_oldsaved = Nullch; 2140 prog->subbeg = PL_bostr; 2141 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ 2142 } 2143 prog->startp[0] = startpos - PL_bostr; 2144 PL_reginput = startpos; 2145 PL_regstartp = prog->startp; 2146 PL_regendp = prog->endp; 2147 PL_reglastparen = &prog->lastparen; 2148 PL_reglastcloseparen = &prog->lastcloseparen; 2149 prog->lastparen = 0; 2150 prog->lastcloseparen = 0; 2151 PL_regsize = 0; 2152 DEBUG_r(PL_reg_starttry = startpos); 2153 if (PL_reg_start_tmpl <= prog->nparens) { 2154 PL_reg_start_tmpl = prog->nparens*3/2 + 3; 2155 if(PL_reg_start_tmp) 2156 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2157 else 2158 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2159 } 2160 2161 /* XXXX What this code is doing here?!!! There should be no need 2162 to do this again and again, PL_reglastparen should take care of 2163 this! --ilya*/ 2164 2165 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 2166 * Actually, the code in regcppop() (which Ilya may be meaning by 2167 * PL_reglastparen), is not needed at all by the test suite 2168 * (op/regexp, op/pat, op/split), but that code is needed, oddly 2169 * enough, for building DynaLoader, or otherwise this 2170 * "Error: '*' not in typemap in DynaLoader.xs, line 164" 2171 * will happen. Meanwhile, this code *is* needed for the 2172 * above-mentioned test suite tests to succeed. The common theme 2173 * on those tests seems to be returning null fields from matches. 2174 * --jhi */ 2175#if 1 2176 sp = prog->startp; 2177 ep = prog->endp; 2178 if (prog->nparens) { 2179 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { 2180 *++sp = -1; 2181 *++ep = -1; 2182 } 2183 } 2184#endif 2185 REGCP_SET(lastcp); 2186 if (regmatch(prog->program + 1)) { 2187 prog->endp[0] = PL_reginput - PL_bostr; 2188 return 1; 2189 } 2190 REGCP_UNWIND(lastcp); 2191 return 0; 2192} 2193 2194#define RE_UNWIND_BRANCH 1 2195#define RE_UNWIND_BRANCHJ 2 2196 2197union re_unwind_t; 2198 2199typedef struct { /* XX: makes sense to enlarge it... */ 2200 I32 type; 2201 I32 prev; 2202 CHECKPOINT lastcp; 2203} re_unwind_generic_t; 2204 2205typedef struct { 2206 I32 type; 2207 I32 prev; 2208 CHECKPOINT lastcp; 2209 I32 lastparen; 2210 regnode *next; 2211 char *locinput; 2212 I32 nextchr; 2213#ifdef DEBUGGING 2214 int regindent; 2215#endif 2216} re_unwind_branch_t; 2217 2218typedef union re_unwind_t { 2219 I32 type; 2220 re_unwind_generic_t generic; 2221 re_unwind_branch_t branch; 2222} re_unwind_t; 2223 2224#define sayYES goto yes 2225#define sayNO goto no 2226#define sayNO_ANYOF goto no_anyof 2227#define sayYES_FINAL goto yes_final 2228#define sayYES_LOUD goto yes_loud 2229#define sayNO_FINAL goto no_final 2230#define sayNO_SILENT goto do_no 2231#define saySAME(x) if (x) goto yes; else goto no 2232 2233#define REPORT_CODE_OFF 24 2234 2235/* 2236 - regmatch - main matching routine 2237 * 2238 * Conceptually the strategy is simple: check to see whether the current 2239 * node matches, call self recursively to see whether the rest matches, 2240 * and then act accordingly. In practice we make some effort to avoid 2241 * recursion, in particular by going through "ordinary" nodes (that don't 2242 * need to know whether the rest of the match failed) by a loop instead of 2243 * by recursion. 2244 */ 2245/* [lwall] I've hoisted the register declarations to the outer block in order to 2246 * maybe save a little bit of pushing and popping on the stack. It also takes 2247 * advantage of machines that use a register save mask on subroutine entry. 2248 */ 2249STATIC I32 /* 0 failure, 1 success */ 2250S_regmatch(pTHX_ regnode *prog) 2251{ 2252 register regnode *scan; /* Current node. */ 2253 regnode *next; /* Next node. */ 2254 regnode *inner; /* Next node in internal branch. */ 2255 register I32 nextchr; /* renamed nextchr - nextchar colides with 2256 function of same name */ 2257 register I32 n; /* no or next */ 2258 register I32 ln = 0; /* len or last */ 2259 register char *s = Nullch; /* operand or save */ 2260 register char *locinput = PL_reginput; 2261 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */ 2262 int minmod = 0, sw = 0, logical = 0; 2263 I32 unwind = 0; 2264#if 0 2265 I32 firstcp = PL_savestack_ix; 2266#endif 2267 register bool do_utf8 = PL_reg_match_utf8; 2268#ifdef DEBUGGING 2269 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0); 2270 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1); 2271 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2); 2272#endif 2273 2274#ifdef DEBUGGING 2275 PL_regindent++; 2276#endif 2277 2278 /* Note that nextchr is a byte even in UTF */ 2279 nextchr = UCHARAT(locinput); 2280 scan = prog; 2281 while (scan != NULL) { 2282 2283 DEBUG_r( { 2284 SV *prop = sv_newmortal(); 2285 int docolor = *PL_colors[0]; 2286 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 2287 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput); 2288 /* The part of the string before starttry has one color 2289 (pref0_len chars), between starttry and current 2290 position another one (pref_len - pref0_len chars), 2291 after the current position the third one. 2292 We assume that pref0_len <= pref_len, otherwise we 2293 decrease pref0_len. */ 2294 int pref_len = (locinput - PL_bostr) > (5 + taill) - l 2295 ? (5 + taill) - l : locinput - PL_bostr; 2296 int pref0_len; 2297 2298 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) 2299 pref_len++; 2300 pref0_len = pref_len - (locinput - PL_reg_starttry); 2301 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput) 2302 l = ( PL_regeol - locinput > (5 + taill) - pref_len 2303 ? (5 + taill) - pref_len : PL_regeol - locinput); 2304 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) 2305 l--; 2306 if (pref0_len < 0) 2307 pref0_len = 0; 2308 if (pref0_len > pref_len) 2309 pref0_len = pref_len; 2310 regprop(prop, scan); 2311 { 2312 char *s0 = 2313 do_utf8 && OP(scan) != CANY ? 2314 pv_uni_display(dsv0, (U8*)(locinput - pref_len), 2315 pref0_len, 60, UNI_DISPLAY_REGEX) : 2316 locinput - pref_len; 2317 int len0 = do_utf8 ? strlen(s0) : pref0_len; 2318 char *s1 = do_utf8 && OP(scan) != CANY ? 2319 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len), 2320 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) : 2321 locinput - pref_len + pref0_len; 2322 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len; 2323 char *s2 = do_utf8 && OP(scan) != CANY ? 2324 pv_uni_display(dsv2, (U8*)locinput, 2325 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) : 2326 locinput; 2327 int len2 = do_utf8 ? strlen(s2) : l; 2328 PerlIO_printf(Perl_debug_log, 2329 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n", 2330 (IV)(locinput - PL_bostr), 2331 PL_colors[4], 2332 len0, s0, 2333 PL_colors[5], 2334 PL_colors[2], 2335 len1, s1, 2336 PL_colors[3], 2337 (docolor ? "" : "> <"), 2338 PL_colors[0], 2339 len2, s2, 2340 PL_colors[1], 2341 15 - l - pref_len + 1, 2342 "", 2343 (IV)(scan - PL_regprogram), PL_regindent*2, "", 2344 SvPVX(prop)); 2345 } 2346 }); 2347 2348 next = scan + NEXT_OFF(scan); 2349 if (next == scan) 2350 next = NULL; 2351 2352 switch (OP(scan)) { 2353 case BOL: 2354 if (locinput == PL_bostr || (PL_multiline && 2355 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) 2356 { 2357 /* regtill = regbol; */ 2358 break; 2359 } 2360 sayNO; 2361 case MBOL: 2362 if (locinput == PL_bostr || 2363 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n')) 2364 { 2365 break; 2366 } 2367 sayNO; 2368 case SBOL: 2369 if (locinput == PL_bostr) 2370 break; 2371 sayNO; 2372 case GPOS: 2373 if (locinput == PL_reg_ganch) 2374 break; 2375 sayNO; 2376 case EOL: 2377 if (PL_multiline) 2378 goto meol; 2379 else 2380 goto seol; 2381 case MEOL: 2382 meol: 2383 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 2384 sayNO; 2385 break; 2386 case SEOL: 2387 seol: 2388 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 2389 sayNO; 2390 if (PL_regeol - locinput > 1) 2391 sayNO; 2392 break; 2393 case EOS: 2394 if (PL_regeol != locinput) 2395 sayNO; 2396 break; 2397 case SANY: 2398 if (!nextchr && locinput >= PL_regeol) 2399 sayNO; 2400 if (do_utf8) { 2401 locinput += PL_utf8skip[nextchr]; 2402 if (locinput > PL_regeol) 2403 sayNO; 2404 nextchr = UCHARAT(locinput); 2405 } 2406 else 2407 nextchr = UCHARAT(++locinput); 2408 break; 2409 case CANY: 2410 if (!nextchr && locinput >= PL_regeol) 2411 sayNO; 2412 nextchr = UCHARAT(++locinput); 2413 break; 2414 case REG_ANY: 2415 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') 2416 sayNO; 2417 if (do_utf8) { 2418 locinput += PL_utf8skip[nextchr]; 2419 if (locinput > PL_regeol) 2420 sayNO; 2421 nextchr = UCHARAT(locinput); 2422 } 2423 else 2424 nextchr = UCHARAT(++locinput); 2425 break; 2426 case EXACT: 2427 s = STRING(scan); 2428 ln = STR_LEN(scan); 2429 if (do_utf8 != UTF) { 2430 /* The target and the pattern have differing utf8ness. */ 2431 char *l = locinput; 2432 char *e = s + ln; 2433 STRLEN ulen; 2434 2435 if (do_utf8) { 2436 /* The target is utf8, the pattern is not utf8. */ 2437 while (s < e) { 2438 if (l >= PL_regeol) 2439 sayNO; 2440 if (NATIVE_TO_UNI(*(U8*)s) != 2441 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen, 2442 ckWARN(WARN_UTF8) ? 2443 0 : UTF8_ALLOW_ANY)) 2444 sayNO; 2445 l += ulen; 2446 s ++; 2447 } 2448 } 2449 else { 2450 /* The target is not utf8, the pattern is utf8. */ 2451 while (s < e) { 2452 if (l >= PL_regeol) 2453 sayNO; 2454 if (NATIVE_TO_UNI(*((U8*)l)) != 2455 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen, 2456 ckWARN(WARN_UTF8) ? 2457 0 : UTF8_ALLOW_ANY)) 2458 sayNO; 2459 s += ulen; 2460 l ++; 2461 } 2462 } 2463 locinput = l; 2464 nextchr = UCHARAT(locinput); 2465 break; 2466 } 2467 /* The target and the pattern have the same utf8ness. */ 2468 /* Inline the first character, for speed. */ 2469 if (UCHARAT(s) != nextchr) 2470 sayNO; 2471 if (PL_regeol - locinput < ln) 2472 sayNO; 2473 if (ln > 1 && memNE(s, locinput, ln)) 2474 sayNO; 2475 locinput += ln; 2476 nextchr = UCHARAT(locinput); 2477 break; 2478 case EXACTFL: 2479 PL_reg_flags |= RF_tainted; 2480 /* FALL THROUGH */ 2481 case EXACTF: 2482 s = STRING(scan); 2483 ln = STR_LEN(scan); 2484 2485 if (do_utf8 || UTF) { 2486 /* Either target or the pattern are utf8. */ 2487 char *l = locinput; 2488 char *e = PL_regeol; 2489 2490 if (ibcmp_utf8(s, 0, ln, (bool)UTF, 2491 l, &e, 0, do_utf8)) { 2492 /* One more case for the sharp s: 2493 * pack("U0U*", 0xDF) =~ /ss/i, 2494 * the 0xC3 0x9F are the UTF-8 2495 * byte sequence for the U+00DF. */ 2496 if (!(do_utf8 && 2497 toLOWER(s[0]) == 's' && 2498 ln >= 2 && 2499 toLOWER(s[1]) == 's' && 2500 (U8)l[0] == 0xC3 && 2501 e - l >= 2 && 2502 (U8)l[1] == 0x9F)) 2503 sayNO; 2504 } 2505 locinput = e; 2506 nextchr = UCHARAT(locinput); 2507 break; 2508 } 2509 2510 /* Neither the target and the pattern are utf8. */ 2511 2512 /* Inline the first character, for speed. */ 2513 if (UCHARAT(s) != nextchr && 2514 UCHARAT(s) != ((OP(scan) == EXACTF) 2515 ? PL_fold : PL_fold_locale)[nextchr]) 2516 sayNO; 2517 if (PL_regeol - locinput < ln) 2518 sayNO; 2519 if (ln > 1 && (OP(scan) == EXACTF 2520 ? ibcmp(s, locinput, ln) 2521 : ibcmp_locale(s, locinput, ln))) 2522 sayNO; 2523 locinput += ln; 2524 nextchr = UCHARAT(locinput); 2525 break; 2526 case ANYOF: 2527 if (do_utf8) { 2528 STRLEN inclasslen = PL_regeol - locinput; 2529 2530 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8)) 2531 sayNO_ANYOF; 2532 if (locinput >= PL_regeol) 2533 sayNO; 2534 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput); 2535 nextchr = UCHARAT(locinput); 2536 break; 2537 } 2538 else { 2539 if (nextchr < 0) 2540 nextchr = UCHARAT(locinput); 2541 if (!REGINCLASS(scan, (U8*)locinput)) 2542 sayNO_ANYOF; 2543 if (!nextchr && locinput >= PL_regeol) 2544 sayNO; 2545 nextchr = UCHARAT(++locinput); 2546 break; 2547 } 2548 no_anyof: 2549 /* If we might have the case of the German sharp s 2550 * in a casefolding Unicode character class. */ 2551 2552 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) { 2553 locinput += SHARP_S_SKIP; 2554 nextchr = UCHARAT(locinput); 2555 } 2556 else 2557 sayNO; 2558 break; 2559 case ALNUML: 2560 PL_reg_flags |= RF_tainted; 2561 /* FALL THROUGH */ 2562 case ALNUM: 2563 if (!nextchr) 2564 sayNO; 2565 if (do_utf8) { 2566 LOAD_UTF8_CHARCLASS(alnum,"a"); 2567 if (!(OP(scan) == ALNUM 2568 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) 2569 : isALNUM_LC_utf8((U8*)locinput))) 2570 { 2571 sayNO; 2572 } 2573 locinput += PL_utf8skip[nextchr]; 2574 nextchr = UCHARAT(locinput); 2575 break; 2576 } 2577 if (!(OP(scan) == ALNUM 2578 ? isALNUM(nextchr) : isALNUM_LC(nextchr))) 2579 sayNO; 2580 nextchr = UCHARAT(++locinput); 2581 break; 2582 case NALNUML: 2583 PL_reg_flags |= RF_tainted; 2584 /* FALL THROUGH */ 2585 case NALNUM: 2586 if (!nextchr && locinput >= PL_regeol) 2587 sayNO; 2588 if (do_utf8) { 2589 LOAD_UTF8_CHARCLASS(alnum,"a"); 2590 if (OP(scan) == NALNUM 2591 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8) 2592 : isALNUM_LC_utf8((U8*)locinput)) 2593 { 2594 sayNO; 2595 } 2596 locinput += PL_utf8skip[nextchr]; 2597 nextchr = UCHARAT(locinput); 2598 break; 2599 } 2600 if (OP(scan) == NALNUM 2601 ? isALNUM(nextchr) : isALNUM_LC(nextchr)) 2602 sayNO; 2603 nextchr = UCHARAT(++locinput); 2604 break; 2605 case BOUNDL: 2606 case NBOUNDL: 2607 PL_reg_flags |= RF_tainted; 2608 /* FALL THROUGH */ 2609 case BOUND: 2610 case NBOUND: 2611 /* was last char in word? */ 2612 if (do_utf8) { 2613 if (locinput == PL_bostr) 2614 ln = '\n'; 2615 else { 2616 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr); 2617 2618 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0); 2619 } 2620 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 2621 ln = isALNUM_uni(ln); 2622 LOAD_UTF8_CHARCLASS(alnum,"a"); 2623 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8); 2624 } 2625 else { 2626 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); 2627 n = isALNUM_LC_utf8((U8*)locinput); 2628 } 2629 } 2630 else { 2631 ln = (locinput != PL_bostr) ? 2632 UCHARAT(locinput - 1) : '\n'; 2633 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 2634 ln = isALNUM(ln); 2635 n = isALNUM(nextchr); 2636 } 2637 else { 2638 ln = isALNUM_LC(ln); 2639 n = isALNUM_LC(nextchr); 2640 } 2641 } 2642 if (((!ln) == (!n)) == (OP(scan) == BOUND || 2643 OP(scan) == BOUNDL)) 2644 sayNO; 2645 break; 2646 case SPACEL: 2647 PL_reg_flags |= RF_tainted; 2648 /* FALL THROUGH */ 2649 case SPACE: 2650 if (!nextchr) 2651 sayNO; 2652 if (do_utf8) { 2653 if (UTF8_IS_CONTINUED(nextchr)) { 2654 LOAD_UTF8_CHARCLASS(space," "); 2655 if (!(OP(scan) == SPACE 2656 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) 2657 : isSPACE_LC_utf8((U8*)locinput))) 2658 { 2659 sayNO; 2660 } 2661 locinput += PL_utf8skip[nextchr]; 2662 nextchr = UCHARAT(locinput); 2663 break; 2664 } 2665 if (!(OP(scan) == SPACE 2666 ? isSPACE(nextchr) : isSPACE_LC(nextchr))) 2667 sayNO; 2668 nextchr = UCHARAT(++locinput); 2669 } 2670 else { 2671 if (!(OP(scan) == SPACE 2672 ? isSPACE(nextchr) : isSPACE_LC(nextchr))) 2673 sayNO; 2674 nextchr = UCHARAT(++locinput); 2675 } 2676 break; 2677 case NSPACEL: 2678 PL_reg_flags |= RF_tainted; 2679 /* FALL THROUGH */ 2680 case NSPACE: 2681 if (!nextchr && locinput >= PL_regeol) 2682 sayNO; 2683 if (do_utf8) { 2684 LOAD_UTF8_CHARCLASS(space," "); 2685 if (OP(scan) == NSPACE 2686 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8) 2687 : isSPACE_LC_utf8((U8*)locinput)) 2688 { 2689 sayNO; 2690 } 2691 locinput += PL_utf8skip[nextchr]; 2692 nextchr = UCHARAT(locinput); 2693 break; 2694 } 2695 if (OP(scan) == NSPACE 2696 ? isSPACE(nextchr) : isSPACE_LC(nextchr)) 2697 sayNO; 2698 nextchr = UCHARAT(++locinput); 2699 break; 2700 case DIGITL: 2701 PL_reg_flags |= RF_tainted; 2702 /* FALL THROUGH */ 2703 case DIGIT: 2704 if (!nextchr) 2705 sayNO; 2706 if (do_utf8) { 2707 LOAD_UTF8_CHARCLASS(digit,"0"); 2708 if (!(OP(scan) == DIGIT 2709 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) 2710 : isDIGIT_LC_utf8((U8*)locinput))) 2711 { 2712 sayNO; 2713 } 2714 locinput += PL_utf8skip[nextchr]; 2715 nextchr = UCHARAT(locinput); 2716 break; 2717 } 2718 if (!(OP(scan) == DIGIT 2719 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) 2720 sayNO; 2721 nextchr = UCHARAT(++locinput); 2722 break; 2723 case NDIGITL: 2724 PL_reg_flags |= RF_tainted; 2725 /* FALL THROUGH */ 2726 case NDIGIT: 2727 if (!nextchr && locinput >= PL_regeol) 2728 sayNO; 2729 if (do_utf8) { 2730 LOAD_UTF8_CHARCLASS(digit,"0"); 2731 if (OP(scan) == NDIGIT 2732 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8) 2733 : isDIGIT_LC_utf8((U8*)locinput)) 2734 { 2735 sayNO; 2736 } 2737 locinput += PL_utf8skip[nextchr]; 2738 nextchr = UCHARAT(locinput); 2739 break; 2740 } 2741 if (OP(scan) == NDIGIT 2742 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) 2743 sayNO; 2744 nextchr = UCHARAT(++locinput); 2745 break; 2746 case CLUMP: 2747 if (locinput >= PL_regeol) 2748 sayNO; 2749 if (do_utf8) { 2750 LOAD_UTF8_CHARCLASS(mark,"~"); 2751 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) 2752 sayNO; 2753 locinput += PL_utf8skip[nextchr]; 2754 while (locinput < PL_regeol && 2755 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) 2756 locinput += UTF8SKIP(locinput); 2757 if (locinput > PL_regeol) 2758 sayNO; 2759 } 2760 else 2761 locinput++; 2762 nextchr = UCHARAT(locinput); 2763 break; 2764 case REFFL: 2765 PL_reg_flags |= RF_tainted; 2766 /* FALL THROUGH */ 2767 case REF: 2768 case REFF: 2769 n = ARG(scan); /* which paren pair */ 2770 ln = PL_regstartp[n]; 2771 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 2772 if ((I32)*PL_reglastparen < n || ln == -1) 2773 sayNO; /* Do not match unless seen CLOSEn. */ 2774 if (ln == PL_regendp[n]) 2775 break; 2776 2777 s = PL_bostr + ln; 2778 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */ 2779 char *l = locinput; 2780 char *e = PL_bostr + PL_regendp[n]; 2781 /* 2782 * Note that we can't do the "other character" lookup trick as 2783 * in the 8-bit case (no pun intended) because in Unicode we 2784 * have to map both upper and title case to lower case. 2785 */ 2786 if (OP(scan) == REFF) { 2787 STRLEN ulen1, ulen2; 2788 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; 2789 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; 2790 while (s < e) { 2791 if (l >= PL_regeol) 2792 sayNO; 2793 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1); 2794 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2); 2795 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) 2796 sayNO; 2797 s += ulen1; 2798 l += ulen2; 2799 } 2800 } 2801 locinput = l; 2802 nextchr = UCHARAT(locinput); 2803 break; 2804 } 2805 2806 /* Inline the first character, for speed. */ 2807 if (UCHARAT(s) != nextchr && 2808 (OP(scan) == REF || 2809 (UCHARAT(s) != ((OP(scan) == REFF 2810 ? PL_fold : PL_fold_locale)[nextchr])))) 2811 sayNO; 2812 ln = PL_regendp[n] - ln; 2813 if (locinput + ln > PL_regeol) 2814 sayNO; 2815 if (ln > 1 && (OP(scan) == REF 2816 ? memNE(s, locinput, ln) 2817 : (OP(scan) == REFF 2818 ? ibcmp(s, locinput, ln) 2819 : ibcmp_locale(s, locinput, ln)))) 2820 sayNO; 2821 locinput += ln; 2822 nextchr = UCHARAT(locinput); 2823 break; 2824 2825 case NOTHING: 2826 case TAIL: 2827 break; 2828 case BACK: 2829 break; 2830 case EVAL: 2831 { 2832 dSP; 2833 OP_4tree *oop = PL_op; 2834 COP *ocurcop = PL_curcop; 2835 PAD *old_comppad; 2836 SV *ret; 2837 struct regexp *oreg = PL_reg_re; 2838 2839 n = ARG(scan); 2840 PL_op = (OP_4tree*)PL_regdata->data[n]; 2841 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); 2842 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]); 2843 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr; 2844 2845 { 2846 SV **before = SP; 2847 CALLRUNOPS(aTHX); /* Scalar context. */ 2848 SPAGAIN; 2849 if (SP == before) 2850 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 2851 else { 2852 ret = POPs; 2853 PUTBACK; 2854 } 2855 } 2856 2857 PL_op = oop; 2858 PAD_RESTORE_LOCAL(old_comppad); 2859 PL_curcop = ocurcop; 2860 if (logical) { 2861 if (logical == 2) { /* Postponed subexpression. */ 2862 regexp *re; 2863 MAGIC *mg = Null(MAGIC*); 2864 re_cc_state state; 2865 CHECKPOINT cp, lastcp; 2866 int toggleutf; 2867 register SV *sv; 2868 2869 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret))) 2870 mg = mg_find(sv, PERL_MAGIC_qr); 2871 else if (SvSMAGICAL(ret)) { 2872 if (SvGMAGICAL(ret)) 2873 sv_unmagic(ret, PERL_MAGIC_qr); 2874 else 2875 mg = mg_find(ret, PERL_MAGIC_qr); 2876 } 2877 2878 if (mg) { 2879 re = (regexp *)mg->mg_obj; 2880 (void)ReREFCNT_inc(re); 2881 } 2882 else { 2883 STRLEN len; 2884 char *t = SvPV(ret, len); 2885 PMOP pm; 2886 char *oprecomp = PL_regprecomp; 2887 I32 osize = PL_regsize; 2888 I32 onpar = PL_regnpar; 2889 2890 Zero(&pm, 1, PMOP); 2891 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8; 2892 re = CALLREGCOMP(aTHX_ t, t + len, &pm); 2893 if (!(SvFLAGS(ret) 2894 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY 2895 | SVs_GMG))) 2896 sv_magic(ret,(SV*)ReREFCNT_inc(re), 2897 PERL_MAGIC_qr,0,0); 2898 PL_regprecomp = oprecomp; 2899 PL_regsize = osize; 2900 PL_regnpar = onpar; 2901 } 2902 DEBUG_r( 2903 PerlIO_printf(Perl_debug_log, 2904 "Entering embedded `%s%.60s%s%s'\n", 2905 PL_colors[0], 2906 re->precomp, 2907 PL_colors[1], 2908 (strlen(re->precomp) > 60 ? "..." : "")) 2909 ); 2910 state.node = next; 2911 state.prev = PL_reg_call_cc; 2912 state.cc = PL_regcc; 2913 state.re = PL_reg_re; 2914 2915 PL_regcc = 0; 2916 2917 cp = regcppush(0); /* Save *all* the positions. */ 2918 REGCP_SET(lastcp); 2919 cache_re(re); 2920 state.ss = PL_savestack_ix; 2921 *PL_reglastparen = 0; 2922 *PL_reglastcloseparen = 0; 2923 PL_reg_call_cc = &state; 2924 PL_reginput = locinput; 2925 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^ 2926 ((re->reganch & ROPT_UTF8) != 0); 2927 if (toggleutf) PL_reg_flags ^= RF_utf8; 2928 2929 /* XXXX This is too dramatic a measure... */ 2930 PL_reg_maxiter = 0; 2931 2932 if (regmatch(re->program + 1)) { 2933 /* Even though we succeeded, we need to restore 2934 global variables, since we may be wrapped inside 2935 SUSPEND, thus the match may be not finished yet. */ 2936 2937 /* XXXX Do this only if SUSPENDed? */ 2938 PL_reg_call_cc = state.prev; 2939 PL_regcc = state.cc; 2940 PL_reg_re = state.re; 2941 cache_re(PL_reg_re); 2942 if (toggleutf) PL_reg_flags ^= RF_utf8; 2943 2944 /* XXXX This is too dramatic a measure... */ 2945 PL_reg_maxiter = 0; 2946 2947 /* These are needed even if not SUSPEND. */ 2948 ReREFCNT_dec(re); 2949 regcpblow(cp); 2950 sayYES; 2951 } 2952 ReREFCNT_dec(re); 2953 REGCP_UNWIND(lastcp); 2954 regcppop(); 2955 PL_reg_call_cc = state.prev; 2956 PL_regcc = state.cc; 2957 PL_reg_re = state.re; 2958 cache_re(PL_reg_re); 2959 if (toggleutf) PL_reg_flags ^= RF_utf8; 2960 2961 /* XXXX This is too dramatic a measure... */ 2962 PL_reg_maxiter = 0; 2963 2964 logical = 0; 2965 sayNO; 2966 } 2967 sw = SvTRUE(ret); 2968 logical = 0; 2969 } 2970 else { 2971 sv_setsv(save_scalar(PL_replgv), ret); 2972 cache_re(oreg); 2973 } 2974 break; 2975 } 2976 case OPEN: 2977 n = ARG(scan); /* which paren pair */ 2978 PL_reg_start_tmp[n] = locinput; 2979 if (n > PL_regsize) 2980 PL_regsize = n; 2981 break; 2982 case CLOSE: 2983 n = ARG(scan); /* which paren pair */ 2984 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr; 2985 PL_regendp[n] = locinput - PL_bostr; 2986 if (n > (I32)*PL_reglastparen) 2987 *PL_reglastparen = n; 2988 *PL_reglastcloseparen = n; 2989 break; 2990 case GROUPP: 2991 n = ARG(scan); /* which paren pair */ 2992 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1); 2993 break; 2994 case IFTHEN: 2995 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 2996 if (sw) 2997 next = NEXTOPER(NEXTOPER(scan)); 2998 else { 2999 next = scan + ARG(scan); 3000 if (OP(next) == IFTHEN) /* Fake one. */ 3001 next = NEXTOPER(NEXTOPER(next)); 3002 } 3003 break; 3004 case LOGICAL: 3005 logical = scan->flags; 3006 break; 3007/******************************************************************* 3008 PL_regcc contains infoblock about the innermost (...)* loop, and 3009 a pointer to the next outer infoblock. 3010 3011 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM): 3012 3013 1) After matching X, regnode for CURLYX is processed; 3014 3015 2) This regnode creates infoblock on the stack, and calls 3016 regmatch() recursively with the starting point at WHILEM node; 3017 3018 3) Each hit of WHILEM node tries to match A and Z (in the order 3019 depending on the current iteration, min/max of {min,max} and 3020 greediness). The information about where are nodes for "A" 3021 and "Z" is read from the infoblock, as is info on how many times "A" 3022 was already matched, and greediness. 3023 3024 4) After A matches, the same WHILEM node is hit again. 3025 3026 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX 3027 of the same pair. Thus when WHILEM tries to match Z, it temporarily 3028 resets PL_regcc, since this Y(A)*Z can be a part of some other loop: 3029 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node 3030 of the external loop. 3031 3032 Currently present infoblocks form a tree with a stem formed by PL_curcc 3033 and whatever it mentions via ->next, and additional attached trees 3034 corresponding to temporarily unset infoblocks as in "5" above. 3035 3036 In the following picture infoblocks for outer loop of 3037 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block 3038 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed 3039 infoblocks are drawn below the "reset" infoblock. 3040 3041 In fact in the picture below we do not show failed matches for Z and T 3042 by WHILEM blocks. [We illustrate minimal matches, since for them it is 3043 more obvious *why* one needs to *temporary* unset infoblocks.] 3044 3045 Matched REx position InfoBlocks Comment 3046 (Y(A)*?Z)*?T x 3047 Y(A)*?Z)*?T x <- O 3048 Y (A)*?Z)*?T x <- O 3049 Y A)*?Z)*?T x <- O <- I 3050 YA )*?Z)*?T x <- O <- I 3051 YA A)*?Z)*?T x <- O <- I 3052 YAA )*?Z)*?T x <- O <- I 3053 YAA Z)*?T x <- O # Temporary unset I 3054 I 3055 3056 YAAZ Y(A)*?Z)*?T x <- O 3057 I 3058 3059 YAAZY (A)*?Z)*?T x <- O 3060 I 3061 3062 YAAZY A)*?Z)*?T x <- O <- I 3063 I 3064 3065 YAAZYA )*?Z)*?T x <- O <- I 3066 I 3067 3068 YAAZYA Z)*?T x <- O # Temporary unset I 3069 I,I 3070 3071 YAAZYAZ )*?T x <- O 3072 I,I 3073 3074 YAAZYAZ T x # Temporary unset O 3075 O 3076 I,I 3077 3078 YAAZYAZT x 3079 O 3080 I,I 3081 *******************************************************************/ 3082 case CURLYX: { 3083 CURCUR cc; 3084 CHECKPOINT cp = PL_savestack_ix; 3085 /* No need to save/restore up to this paren */ 3086 I32 parenfloor = scan->flags; 3087 3088 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ 3089 next += ARG(next); 3090 cc.oldcc = PL_regcc; 3091 PL_regcc = &cc; 3092 /* XXXX Probably it is better to teach regpush to support 3093 parenfloor > PL_regsize... */ 3094 if (parenfloor > (I32)*PL_reglastparen) 3095 parenfloor = *PL_reglastparen; /* Pessimization... */ 3096 cc.parenfloor = parenfloor; 3097 cc.cur = -1; 3098 cc.min = ARG1(scan); 3099 cc.max = ARG2(scan); 3100 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 3101 cc.next = next; 3102 cc.minmod = minmod; 3103 cc.lastloc = 0; 3104 PL_reginput = locinput; 3105 n = regmatch(PREVOPER(next)); /* start on the WHILEM */ 3106 regcpblow(cp); 3107 PL_regcc = cc.oldcc; 3108 saySAME(n); 3109 } 3110 /* NOT REACHED */ 3111 case WHILEM: { 3112 /* 3113 * This is really hard to understand, because after we match 3114 * what we're trying to match, we must make sure the rest of 3115 * the REx is going to match for sure, and to do that we have 3116 * to go back UP the parse tree by recursing ever deeper. And 3117 * if it fails, we have to reset our parent's current state 3118 * that we can try again after backing off. 3119 */ 3120 3121 CHECKPOINT cp, lastcp; 3122 CURCUR* cc = PL_regcc; 3123 char *lastloc = cc->lastloc; /* Detection of 0-len. */ 3124 3125 n = cc->cur + 1; /* how many we know we matched */ 3126 PL_reginput = locinput; 3127 3128 DEBUG_r( 3129 PerlIO_printf(Perl_debug_log, 3130 "%*s %ld out of %ld..%ld cc=%"UVxf"\n", 3131 REPORT_CODE_OFF+PL_regindent*2, "", 3132 (long)n, (long)cc->min, 3133 (long)cc->max, PTR2UV(cc)) 3134 ); 3135 3136 /* If degenerate scan matches "", assume scan done. */ 3137 3138 if (locinput == cc->lastloc && n >= cc->min) { 3139 PL_regcc = cc->oldcc; 3140 if (PL_regcc) 3141 ln = PL_regcc->cur; 3142 DEBUG_r( 3143 PerlIO_printf(Perl_debug_log, 3144 "%*s empty match detected, try continuation...\n", 3145 REPORT_CODE_OFF+PL_regindent*2, "") 3146 ); 3147 if (regmatch(cc->next)) 3148 sayYES; 3149 if (PL_regcc) 3150 PL_regcc->cur = ln; 3151 PL_regcc = cc; 3152 sayNO; 3153 } 3154 3155 /* First just match a string of min scans. */ 3156 3157 if (n < cc->min) { 3158 cc->cur = n; 3159 cc->lastloc = locinput; 3160 if (regmatch(cc->scan)) 3161 sayYES; 3162 cc->cur = n - 1; 3163 cc->lastloc = lastloc; 3164 sayNO; 3165 } 3166 3167 if (scan->flags) { 3168 /* Check whether we already were at this position. 3169 Postpone detection until we know the match is not 3170 *that* much linear. */ 3171 if (!PL_reg_maxiter) { 3172 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4); 3173 PL_reg_leftiter = PL_reg_maxiter; 3174 } 3175 if (PL_reg_leftiter-- == 0) { 3176 I32 size = (PL_reg_maxiter + 7)/8; 3177 if (PL_reg_poscache) { 3178 if ((I32)PL_reg_poscache_size < size) { 3179 Renew(PL_reg_poscache, size, char); 3180 PL_reg_poscache_size = size; 3181 } 3182 Zero(PL_reg_poscache, size, char); 3183 } 3184 else { 3185 PL_reg_poscache_size = size; 3186 Newz(29, PL_reg_poscache, size, char); 3187 } 3188 DEBUG_r( 3189 PerlIO_printf(Perl_debug_log, 3190 "%sDetected a super-linear match, switching on caching%s...\n", 3191 PL_colors[4], PL_colors[5]) 3192 ); 3193 } 3194 if (PL_reg_leftiter < 0) { 3195 I32 o = locinput - PL_bostr, b; 3196 3197 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4); 3198 b = o % 8; 3199 o /= 8; 3200 if (PL_reg_poscache[o] & (1<<b)) { 3201 DEBUG_r( 3202 PerlIO_printf(Perl_debug_log, 3203 "%*s already tried at this position...\n", 3204 REPORT_CODE_OFF+PL_regindent*2, "") 3205 ); 3206 if (PL_reg_flags & RF_false) 3207 sayYES; 3208 else 3209 sayNO_SILENT; 3210 } 3211 PL_reg_poscache[o] |= (1<<b); 3212 } 3213 } 3214 3215 /* Prefer next over scan for minimal matching. */ 3216 3217 if (cc->minmod) { 3218 PL_regcc = cc->oldcc; 3219 if (PL_regcc) 3220 ln = PL_regcc->cur; 3221 cp = regcppush(cc->parenfloor); 3222 REGCP_SET(lastcp); 3223 if (regmatch(cc->next)) { 3224 regcpblow(cp); 3225 sayYES; /* All done. */ 3226 } 3227 REGCP_UNWIND(lastcp); 3228 regcppop(); 3229 if (PL_regcc) 3230 PL_regcc->cur = ln; 3231 PL_regcc = cc; 3232 3233 if (n >= cc->max) { /* Maximum greed exceeded? */ 3234 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY 3235 && !(PL_reg_flags & RF_warned)) { 3236 PL_reg_flags |= RF_warned; 3237 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", 3238 "Complex regular subexpression recursion", 3239 REG_INFTY - 1); 3240 } 3241 sayNO; 3242 } 3243 3244 DEBUG_r( 3245 PerlIO_printf(Perl_debug_log, 3246 "%*s trying longer...\n", 3247 REPORT_CODE_OFF+PL_regindent*2, "") 3248 ); 3249 /* Try scanning more and see if it helps. */ 3250 PL_reginput = locinput; 3251 cc->cur = n; 3252 cc->lastloc = locinput; 3253 cp = regcppush(cc->parenfloor); 3254 REGCP_SET(lastcp); 3255 if (regmatch(cc->scan)) { 3256 regcpblow(cp); 3257 sayYES; 3258 } 3259 REGCP_UNWIND(lastcp); 3260 regcppop(); 3261 cc->cur = n - 1; 3262 cc->lastloc = lastloc; 3263 sayNO; 3264 } 3265 3266 /* Prefer scan over next for maximal matching. */ 3267 3268 if (n < cc->max) { /* More greed allowed? */ 3269 cp = regcppush(cc->parenfloor); 3270 cc->cur = n; 3271 cc->lastloc = locinput; 3272 REGCP_SET(lastcp); 3273 if (regmatch(cc->scan)) { 3274 regcpblow(cp); 3275 sayYES; 3276 } 3277 REGCP_UNWIND(lastcp); 3278 regcppop(); /* Restore some previous $<digit>s? */ 3279 PL_reginput = locinput; 3280 DEBUG_r( 3281 PerlIO_printf(Perl_debug_log, 3282 "%*s failed, try continuation...\n", 3283 REPORT_CODE_OFF+PL_regindent*2, "") 3284 ); 3285 } 3286 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY 3287 && !(PL_reg_flags & RF_warned)) { 3288 PL_reg_flags |= RF_warned; 3289 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", 3290 "Complex regular subexpression recursion", 3291 REG_INFTY - 1); 3292 } 3293 3294 /* Failed deeper matches of scan, so see if this one works. */ 3295 PL_regcc = cc->oldcc; 3296 if (PL_regcc) 3297 ln = PL_regcc->cur; 3298 if (regmatch(cc->next)) 3299 sayYES; 3300 if (PL_regcc) 3301 PL_regcc->cur = ln; 3302 PL_regcc = cc; 3303 cc->cur = n - 1; 3304 cc->lastloc = lastloc; 3305 sayNO; 3306 } 3307 /* NOT REACHED */ 3308 case BRANCHJ: 3309 next = scan + ARG(scan); 3310 if (next == scan) 3311 next = NULL; 3312 inner = NEXTOPER(NEXTOPER(scan)); 3313 goto do_branch; 3314 case BRANCH: 3315 inner = NEXTOPER(scan); 3316 do_branch: 3317 { 3318 c1 = OP(scan); 3319 if (OP(next) != c1) /* No choice. */ 3320 next = inner; /* Avoid recursion. */ 3321 else { 3322 I32 lastparen = *PL_reglastparen; 3323 I32 unwind1; 3324 re_unwind_branch_t *uw; 3325 3326 /* Put unwinding data on stack */ 3327 unwind1 = SSNEWt(1,re_unwind_branch_t); 3328 uw = SSPTRt(unwind1,re_unwind_branch_t); 3329 uw->prev = unwind; 3330 unwind = unwind1; 3331 uw->type = ((c1 == BRANCH) 3332 ? RE_UNWIND_BRANCH 3333 : RE_UNWIND_BRANCHJ); 3334 uw->lastparen = lastparen; 3335 uw->next = next; 3336 uw->locinput = locinput; 3337 uw->nextchr = nextchr; 3338#ifdef DEBUGGING 3339 uw->regindent = ++PL_regindent; 3340#endif 3341 3342 REGCP_SET(uw->lastcp); 3343 3344 /* Now go into the first branch */ 3345 next = inner; 3346 } 3347 } 3348 break; 3349 case MINMOD: 3350 minmod = 1; 3351 break; 3352 case CURLYM: 3353 { 3354 I32 l = 0; 3355 CHECKPOINT lastcp; 3356 3357 /* We suppose that the next guy does not need 3358 backtracking: in particular, it is of constant length, 3359 and has no parenths to influence future backrefs. */ 3360 ln = ARG1(scan); /* min to match */ 3361 n = ARG2(scan); /* max to match */ 3362 paren = scan->flags; 3363 if (paren) { 3364 if (paren > PL_regsize) 3365 PL_regsize = paren; 3366 if (paren > (I32)*PL_reglastparen) 3367 *PL_reglastparen = paren; 3368 } 3369 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 3370 if (paren) 3371 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 3372 PL_reginput = locinput; 3373 if (minmod) { 3374 minmod = 0; 3375 if (ln && regrepeat_hard(scan, ln, &l) < ln) 3376 sayNO; 3377 /* if we matched something zero-length we don't need to 3378 backtrack - capturing parens are already defined, so 3379 the caveat in the maximal case doesn't apply 3380 3381 XXXX if ln == 0, we can redo this check first time 3382 through the following loop 3383 */ 3384 if (ln && l == 0) 3385 n = ln; /* don't backtrack */ 3386 locinput = PL_reginput; 3387 if (HAS_TEXT(next) || JUMPABLE(next)) { 3388 regnode *text_node = next; 3389 3390 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); 3391 3392 if (! HAS_TEXT(text_node)) c1 = c2 = -1000; 3393 else { 3394 if (PL_regkind[(U8)OP(text_node)] == REF) { 3395 c1 = c2 = -1000; 3396 goto assume_ok_MM; 3397 } 3398 else { c1 = (U8)*STRING(text_node); } 3399 if (OP(text_node) == EXACTF || OP(text_node) == REFF) 3400 c2 = PL_fold[c1]; 3401 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL) 3402 c2 = PL_fold_locale[c1]; 3403 else 3404 c2 = c1; 3405 } 3406 } 3407 else 3408 c1 = c2 = -1000; 3409 assume_ok_MM: 3410 REGCP_SET(lastcp); 3411 /* This may be improved if l == 0. */ 3412 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */ 3413 /* If it could work, try it. */ 3414 if (c1 == -1000 || 3415 UCHARAT(PL_reginput) == c1 || 3416 UCHARAT(PL_reginput) == c2) 3417 { 3418 if (paren) { 3419 if (ln) { 3420 PL_regstartp[paren] = 3421 HOPc(PL_reginput, -l) - PL_bostr; 3422 PL_regendp[paren] = PL_reginput - PL_bostr; 3423 } 3424 else 3425 PL_regendp[paren] = -1; 3426 } 3427 if (regmatch(next)) 3428 sayYES; 3429 REGCP_UNWIND(lastcp); 3430 } 3431 /* Couldn't or didn't -- move forward. */ 3432 PL_reginput = locinput; 3433 if (regrepeat_hard(scan, 1, &l)) { 3434 ln++; 3435 locinput = PL_reginput; 3436 } 3437 else 3438 sayNO; 3439 } 3440 } 3441 else { 3442 n = regrepeat_hard(scan, n, &l); 3443 /* if we matched something zero-length we don't need to 3444 backtrack, unless the minimum count is zero and we 3445 are capturing the result - in that case the capture 3446 being defined or not may affect later execution 3447 */ 3448 if (n != 0 && l == 0 && !(paren && ln == 0)) 3449 ln = n; /* don't backtrack */ 3450 locinput = PL_reginput; 3451 DEBUG_r( 3452 PerlIO_printf(Perl_debug_log, 3453 "%*s matched %"IVdf" times, len=%"IVdf"...\n", 3454 (int)(REPORT_CODE_OFF+PL_regindent*2), "", 3455 (IV) n, (IV)l) 3456 ); 3457 if (n >= ln) { 3458 if (HAS_TEXT(next) || JUMPABLE(next)) { 3459 regnode *text_node = next; 3460 3461 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); 3462 3463 if (! HAS_TEXT(text_node)) c1 = c2 = -1000; 3464 else { 3465 if (PL_regkind[(U8)OP(text_node)] == REF) { 3466 c1 = c2 = -1000; 3467 goto assume_ok_REG; 3468 } 3469 else { c1 = (U8)*STRING(text_node); } 3470 3471 if (OP(text_node) == EXACTF || OP(text_node) == REFF) 3472 c2 = PL_fold[c1]; 3473 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL) 3474 c2 = PL_fold_locale[c1]; 3475 else 3476 c2 = c1; 3477 } 3478 } 3479 else 3480 c1 = c2 = -1000; 3481 } 3482 assume_ok_REG: 3483 REGCP_SET(lastcp); 3484 while (n >= ln) { 3485 /* If it could work, try it. */ 3486 if (c1 == -1000 || 3487 UCHARAT(PL_reginput) == c1 || 3488 UCHARAT(PL_reginput) == c2) 3489 { 3490 DEBUG_r( 3491 PerlIO_printf(Perl_debug_log, 3492 "%*s trying tail with n=%"IVdf"...\n", 3493 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n) 3494 ); 3495 if (paren) { 3496 if (n) { 3497 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr; 3498 PL_regendp[paren] = PL_reginput - PL_bostr; 3499 } 3500 else 3501 PL_regendp[paren] = -1; 3502 } 3503 if (regmatch(next)) 3504 sayYES; 3505 REGCP_UNWIND(lastcp); 3506 } 3507 /* Couldn't or didn't -- back up. */ 3508 n--; 3509 locinput = HOPc(locinput, -l); 3510 PL_reginput = locinput; 3511 } 3512 } 3513 sayNO; 3514 break; 3515 } 3516 case CURLYN: 3517 paren = scan->flags; /* Which paren to set */ 3518 if (paren > PL_regsize) 3519 PL_regsize = paren; 3520 if (paren > (I32)*PL_reglastparen) 3521 *PL_reglastparen = paren; 3522 ln = ARG1(scan); /* min to match */ 3523 n = ARG2(scan); /* max to match */ 3524 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); 3525 goto repeat; 3526 case CURLY: 3527 paren = 0; 3528 ln = ARG1(scan); /* min to match */ 3529 n = ARG2(scan); /* max to match */ 3530 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 3531 goto repeat; 3532 case STAR: 3533 ln = 0; 3534 n = REG_INFTY; 3535 scan = NEXTOPER(scan); 3536 paren = 0; 3537 goto repeat; 3538 case PLUS: 3539 ln = 1; 3540 n = REG_INFTY; 3541 scan = NEXTOPER(scan); 3542 paren = 0; 3543 repeat: 3544 /* 3545 * Lookahead to avoid useless match attempts 3546 * when we know what character comes next. 3547 */ 3548 3549 /* 3550 * Used to only do .*x and .*?x, but now it allows 3551 * for )'s, ('s and (?{ ... })'s to be in the way 3552 * of the quantifier and the EXACT-like node. -- japhy 3553 */ 3554 3555 if (HAS_TEXT(next) || JUMPABLE(next)) { 3556 U8 *s; 3557 regnode *text_node = next; 3558 3559 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); 3560 3561 if (! HAS_TEXT(text_node)) c1 = c2 = -1000; 3562 else { 3563 if (PL_regkind[(U8)OP(text_node)] == REF) { 3564 c1 = c2 = -1000; 3565 goto assume_ok_easy; 3566 } 3567 else { s = (U8*)STRING(text_node); } 3568 3569 if (!UTF) { 3570 c2 = c1 = *s; 3571 if (OP(text_node) == EXACTF || OP(text_node) == REFF) 3572 c2 = PL_fold[c1]; 3573 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL) 3574 c2 = PL_fold_locale[c1]; 3575 } 3576 else { /* UTF */ 3577 if (OP(text_node) == EXACTF || OP(text_node) == REFF) { 3578 STRLEN ulen1, ulen2; 3579 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1]; 3580 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1]; 3581 3582 to_utf8_lower((U8*)s, tmpbuf1, &ulen1); 3583 to_utf8_upper((U8*)s, tmpbuf2, &ulen2); 3584 3585 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0, 3586 ckWARN(WARN_UTF8) ? 3587 0 : UTF8_ALLOW_ANY); 3588 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0, 3589 ckWARN(WARN_UTF8) ? 3590 0 : UTF8_ALLOW_ANY); 3591 } 3592 else { 3593 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0, 3594 ckWARN(WARN_UTF8) ? 3595 0 : UTF8_ALLOW_ANY); 3596 } 3597 } 3598 } 3599 } 3600 else 3601 c1 = c2 = -1000; 3602 assume_ok_easy: 3603 PL_reginput = locinput; 3604 if (minmod) { 3605 CHECKPOINT lastcp; 3606 minmod = 0; 3607 if (ln && regrepeat(scan, ln) < ln) 3608 sayNO; 3609 locinput = PL_reginput; 3610 REGCP_SET(lastcp); 3611 if (c1 != -1000) { 3612 char *e; /* Should not check after this */ 3613 char *old = locinput; 3614 int count = 0; 3615 3616 if (n == REG_INFTY) { 3617 e = PL_regeol - 1; 3618 if (do_utf8) 3619 while (UTF8_IS_CONTINUATION(*(U8*)e)) 3620 e--; 3621 } 3622 else if (do_utf8) { 3623 int m = n - ln; 3624 for (e = locinput; 3625 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--) 3626 e += UTF8SKIP(e); 3627 } 3628 else { 3629 e = locinput + n - ln; 3630 if (e >= PL_regeol) 3631 e = PL_regeol - 1; 3632 } 3633 while (1) { 3634 /* Find place 'next' could work */ 3635 if (!do_utf8) { 3636 if (c1 == c2) { 3637 while (locinput <= e && 3638 UCHARAT(locinput) != c1) 3639 locinput++; 3640 } else { 3641 while (locinput <= e 3642 && UCHARAT(locinput) != c1 3643 && UCHARAT(locinput) != c2) 3644 locinput++; 3645 } 3646 count = locinput - old; 3647 } 3648 else { 3649 STRLEN len; 3650 if (c1 == c2) { 3651 /* count initialised to 3652 * utf8_distance(old, locinput) */ 3653 while (locinput <= e && 3654 utf8n_to_uvchr((U8*)locinput, 3655 UTF8_MAXLEN, &len, 3656 ckWARN(WARN_UTF8) ? 3657 0 : UTF8_ALLOW_ANY) != (UV)c1) { 3658 locinput += len; 3659 count++; 3660 } 3661 } else { 3662 /* count initialised to 3663 * utf8_distance(old, locinput) */ 3664 while (locinput <= e) { 3665 UV c = utf8n_to_uvchr((U8*)locinput, 3666 UTF8_MAXLEN, &len, 3667 ckWARN(WARN_UTF8) ? 3668 0 : UTF8_ALLOW_ANY); 3669 if (c == (UV)c1 || c == (UV)c2) 3670 break; 3671 locinput += len; 3672 count++; 3673 } 3674 } 3675 } 3676 if (locinput > e) 3677 sayNO; 3678 /* PL_reginput == old now */ 3679 if (locinput != old) { 3680 ln = 1; /* Did some */ 3681 if (regrepeat(scan, count) < count) 3682 sayNO; 3683 } 3684 /* PL_reginput == locinput now */ 3685 TRYPAREN(paren, ln, locinput); 3686 PL_reginput = locinput; /* Could be reset... */ 3687 REGCP_UNWIND(lastcp); 3688 /* Couldn't or didn't -- move forward. */ 3689 old = locinput; 3690 if (do_utf8) 3691 locinput += UTF8SKIP(locinput); 3692 else 3693 locinput++; 3694 count = 1; 3695 } 3696 } 3697 else 3698 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */ 3699 UV c; 3700 if (c1 != -1000) { 3701 if (do_utf8) 3702 c = utf8n_to_uvchr((U8*)PL_reginput, 3703 UTF8_MAXLEN, 0, 3704 ckWARN(WARN_UTF8) ? 3705 0 : UTF8_ALLOW_ANY); 3706 else 3707 c = UCHARAT(PL_reginput); 3708 /* If it could work, try it. */ 3709 if (c == (UV)c1 || c == (UV)c2) 3710 { 3711 TRYPAREN(paren, ln, PL_reginput); 3712 REGCP_UNWIND(lastcp); 3713 } 3714 } 3715 /* If it could work, try it. */ 3716 else if (c1 == -1000) 3717 { 3718 TRYPAREN(paren, ln, PL_reginput); 3719 REGCP_UNWIND(lastcp); 3720 } 3721 /* Couldn't or didn't -- move forward. */ 3722 PL_reginput = locinput; 3723 if (regrepeat(scan, 1)) { 3724 ln++; 3725 locinput = PL_reginput; 3726 } 3727 else 3728 sayNO; 3729 } 3730 } 3731 else { 3732 CHECKPOINT lastcp; 3733 n = regrepeat(scan, n); 3734 locinput = PL_reginput; 3735 if (ln < n && PL_regkind[(U8)OP(next)] == EOL && 3736 ((!PL_multiline && OP(next) != MEOL) || 3737 OP(next) == SEOL || OP(next) == EOS)) 3738 { 3739 ln = n; /* why back off? */ 3740 /* ...because $ and \Z can match before *and* after 3741 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 3742 We should back off by one in this case. */ 3743 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS) 3744 ln--; 3745 } 3746 REGCP_SET(lastcp); 3747 if (paren) { 3748 UV c = 0; 3749 while (n >= ln) { 3750 if (c1 != -1000) { 3751 if (do_utf8) 3752 c = utf8n_to_uvchr((U8*)PL_reginput, 3753 UTF8_MAXLEN, 0, 3754 ckWARN(WARN_UTF8) ? 3755 0 : UTF8_ALLOW_ANY); 3756 else 3757 c = UCHARAT(PL_reginput); 3758 } 3759 /* If it could work, try it. */ 3760 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) 3761 { 3762 TRYPAREN(paren, n, PL_reginput); 3763 REGCP_UNWIND(lastcp); 3764 } 3765 /* Couldn't or didn't -- back up. */ 3766 n--; 3767 PL_reginput = locinput = HOPc(locinput, -1); 3768 } 3769 } 3770 else { 3771 UV c = 0; 3772 while (n >= ln) { 3773 if (c1 != -1000) { 3774 if (do_utf8) 3775 c = utf8n_to_uvchr((U8*)PL_reginput, 3776 UTF8_MAXLEN, 0, 3777 ckWARN(WARN_UTF8) ? 3778 0 : UTF8_ALLOW_ANY); 3779 else 3780 c = UCHARAT(PL_reginput); 3781 } 3782 /* If it could work, try it. */ 3783 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2) 3784 { 3785 TRYPAREN(paren, n, PL_reginput); 3786 REGCP_UNWIND(lastcp); 3787 } 3788 /* Couldn't or didn't -- back up. */ 3789 n--; 3790 PL_reginput = locinput = HOPc(locinput, -1); 3791 } 3792 } 3793 } 3794 sayNO; 3795 break; 3796 case END: 3797 if (PL_reg_call_cc) { 3798 re_cc_state *cur_call_cc = PL_reg_call_cc; 3799 CURCUR *cctmp = PL_regcc; 3800 regexp *re = PL_reg_re; 3801 CHECKPOINT cp, lastcp; 3802 3803 cp = regcppush(0); /* Save *all* the positions. */ 3804 REGCP_SET(lastcp); 3805 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of 3806 the caller. */ 3807 PL_reginput = locinput; /* Make position available to 3808 the callcc. */ 3809 cache_re(PL_reg_call_cc->re); 3810 PL_regcc = PL_reg_call_cc->cc; 3811 PL_reg_call_cc = PL_reg_call_cc->prev; 3812 if (regmatch(cur_call_cc->node)) { 3813 PL_reg_call_cc = cur_call_cc; 3814 regcpblow(cp); 3815 sayYES; 3816 } 3817 REGCP_UNWIND(lastcp); 3818 regcppop(); 3819 PL_reg_call_cc = cur_call_cc; 3820 PL_regcc = cctmp; 3821 PL_reg_re = re; 3822 cache_re(re); 3823 3824 DEBUG_r( 3825 PerlIO_printf(Perl_debug_log, 3826 "%*s continuation failed...\n", 3827 REPORT_CODE_OFF+PL_regindent*2, "") 3828 ); 3829 sayNO_SILENT; 3830 } 3831 if (locinput < PL_regtill) { 3832 DEBUG_r(PerlIO_printf(Perl_debug_log, 3833 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 3834 PL_colors[4], 3835 (long)(locinput - PL_reg_starttry), 3836 (long)(PL_regtill - PL_reg_starttry), 3837 PL_colors[5])); 3838 sayNO_FINAL; /* Cannot match: too short. */ 3839 } 3840 PL_reginput = locinput; /* put where regtry can find it */ 3841 sayYES_FINAL; /* Success! */ 3842 case SUCCEED: 3843 PL_reginput = locinput; /* put where regtry can find it */ 3844 sayYES_LOUD; /* Success! */ 3845 case SUSPEND: 3846 n = 1; 3847 PL_reginput = locinput; 3848 goto do_ifmatch; 3849 case UNLESSM: 3850 n = 0; 3851 if (scan->flags) { 3852 s = HOPBACKc(locinput, scan->flags); 3853 if (!s) 3854 goto say_yes; 3855 PL_reginput = s; 3856 } 3857 else 3858 PL_reginput = locinput; 3859 PL_reg_flags ^= RF_false; 3860 goto do_ifmatch; 3861 case IFMATCH: 3862 n = 1; 3863 if (scan->flags) { 3864 s = HOPBACKc(locinput, scan->flags); 3865 if (!s) 3866 goto say_no; 3867 PL_reginput = s; 3868 } 3869 else 3870 PL_reginput = locinput; 3871 3872 do_ifmatch: 3873 inner = NEXTOPER(NEXTOPER(scan)); 3874 if (regmatch(inner) != n) { 3875 if (n == 0) 3876 PL_reg_flags ^= RF_false; 3877 say_no: 3878 if (logical) { 3879 logical = 0; 3880 sw = 0; 3881 goto do_longjump; 3882 } 3883 else 3884 sayNO; 3885 } 3886 if (n == 0) 3887 PL_reg_flags ^= RF_false; 3888 say_yes: 3889 if (logical) { 3890 logical = 0; 3891 sw = 1; 3892 } 3893 if (OP(scan) == SUSPEND) { 3894 locinput = PL_reginput; 3895 nextchr = UCHARAT(locinput); 3896 } 3897 /* FALL THROUGH. */ 3898 case LONGJMP: 3899 do_longjump: 3900 next = scan + ARG(scan); 3901 if (next == scan) 3902 next = NULL; 3903 break; 3904 default: 3905 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", 3906 PTR2UV(scan), OP(scan)); 3907 Perl_croak(aTHX_ "regexp memory corruption"); 3908 } 3909 reenter: 3910 scan = next; 3911 } 3912 3913 /* 3914 * We get here only if there's trouble -- normally "case END" is 3915 * the terminating point. 3916 */ 3917 Perl_croak(aTHX_ "corrupted regexp pointers"); 3918 /*NOTREACHED*/ 3919 sayNO; 3920 3921yes_loud: 3922 DEBUG_r( 3923 PerlIO_printf(Perl_debug_log, 3924 "%*s %scould match...%s\n", 3925 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5]) 3926 ); 3927 goto yes; 3928yes_final: 3929 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", 3930 PL_colors[4],PL_colors[5])); 3931yes: 3932#ifdef DEBUGGING 3933 PL_regindent--; 3934#endif 3935 3936#if 0 /* Breaks $^R */ 3937 if (unwind) 3938 regcpblow(firstcp); 3939#endif 3940 return 1; 3941 3942no: 3943 DEBUG_r( 3944 PerlIO_printf(Perl_debug_log, 3945 "%*s %sfailed...%s\n", 3946 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5]) 3947 ); 3948 goto do_no; 3949no_final: 3950do_no: 3951 if (unwind) { 3952 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t); 3953 3954 switch (uw->type) { 3955 case RE_UNWIND_BRANCH: 3956 case RE_UNWIND_BRANCHJ: 3957 { 3958 re_unwind_branch_t *uwb = &(uw->branch); 3959 I32 lastparen = uwb->lastparen; 3960 3961 REGCP_UNWIND(uwb->lastcp); 3962 for (n = *PL_reglastparen; n > lastparen; n--) 3963 PL_regendp[n] = -1; 3964 *PL_reglastparen = n; 3965 scan = next = uwb->next; 3966 if ( !scan || 3967 OP(scan) != (uwb->type == RE_UNWIND_BRANCH 3968 ? BRANCH : BRANCHJ) ) { /* Failure */ 3969 unwind = uwb->prev; 3970#ifdef DEBUGGING 3971 PL_regindent--; 3972#endif 3973 goto do_no; 3974 } 3975 /* Have more choice yet. Reuse the same uwb. */ 3976 /*SUPPRESS 560*/ 3977 if ((n = (uwb->type == RE_UNWIND_BRANCH 3978 ? NEXT_OFF(next) : ARG(next)))) 3979 next += n; 3980 else 3981 next = NULL; /* XXXX Needn't unwinding in this case... */ 3982 uwb->next = next; 3983 next = NEXTOPER(scan); 3984 if (uwb->type == RE_UNWIND_BRANCHJ) 3985 next = NEXTOPER(next); 3986 locinput = uwb->locinput; 3987 nextchr = uwb->nextchr; 3988#ifdef DEBUGGING 3989 PL_regindent = uwb->regindent; 3990#endif 3991 3992 goto reenter; 3993 } 3994 /* NOT REACHED */ 3995 default: 3996 Perl_croak(aTHX_ "regexp unwind memory corruption"); 3997 } 3998 /* NOT REACHED */ 3999 } 4000#ifdef DEBUGGING 4001 PL_regindent--; 4002#endif 4003 return 0; 4004} 4005 4006/* 4007 - regrepeat - repeatedly match something simple, report how many 4008 */ 4009/* 4010 * [This routine now assumes that it will only match on things of length 1. 4011 * That was true before, but now we assume scan - reginput is the count, 4012 * rather than incrementing count on every character. [Er, except utf8.]] 4013 */ 4014STATIC I32 4015S_regrepeat(pTHX_ regnode *p, I32 max) 4016{ 4017 register char *scan; 4018 register I32 c; 4019 register char *loceol = PL_regeol; 4020 register I32 hardcount = 0; 4021 register bool do_utf8 = PL_reg_match_utf8; 4022 4023 scan = PL_reginput; 4024 if (max == REG_INFTY) 4025 max = I32_MAX; 4026 else if (max < loceol - scan) 4027 loceol = scan + max; 4028 switch (OP(p)) { 4029 case REG_ANY: 4030 if (do_utf8) { 4031 loceol = PL_regeol; 4032 while (scan < loceol && hardcount < max && *scan != '\n') { 4033 scan += UTF8SKIP(scan); 4034 hardcount++; 4035 } 4036 } else { 4037 while (scan < loceol && *scan != '\n') 4038 scan++; 4039 } 4040 break; 4041 case SANY: 4042 if (do_utf8) { 4043 loceol = PL_regeol; 4044 while (scan < loceol && hardcount < max) { 4045 scan += UTF8SKIP(scan); 4046 hardcount++; 4047 } 4048 } 4049 else 4050 scan = loceol; 4051 break; 4052 case CANY: 4053 scan = loceol; 4054 break; 4055 case EXACT: /* length of string is 1 */ 4056 c = (U8)*STRING(p); 4057 while (scan < loceol && UCHARAT(scan) == c) 4058 scan++; 4059 break; 4060 case EXACTF: /* length of string is 1 */ 4061 c = (U8)*STRING(p); 4062 while (scan < loceol && 4063 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) 4064 scan++; 4065 break; 4066 case EXACTFL: /* length of string is 1 */ 4067 PL_reg_flags |= RF_tainted; 4068 c = (U8)*STRING(p); 4069 while (scan < loceol && 4070 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) 4071 scan++; 4072 break; 4073 case ANYOF: 4074 if (do_utf8) { 4075 loceol = PL_regeol; 4076 while (hardcount < max && scan < loceol && 4077 reginclass(p, (U8*)scan, 0, do_utf8)) { 4078 scan += UTF8SKIP(scan); 4079 hardcount++; 4080 } 4081 } else { 4082 while (scan < loceol && REGINCLASS(p, (U8*)scan)) 4083 scan++; 4084 } 4085 break; 4086 case ALNUM: 4087 if (do_utf8) { 4088 loceol = PL_regeol; 4089 LOAD_UTF8_CHARCLASS(alnum,"a"); 4090 while (hardcount < max && scan < loceol && 4091 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 4092 scan += UTF8SKIP(scan); 4093 hardcount++; 4094 } 4095 } else { 4096 while (scan < loceol && isALNUM(*scan)) 4097 scan++; 4098 } 4099 break; 4100 case ALNUML: 4101 PL_reg_flags |= RF_tainted; 4102 if (do_utf8) { 4103 loceol = PL_regeol; 4104 while (hardcount < max && scan < loceol && 4105 isALNUM_LC_utf8((U8*)scan)) { 4106 scan += UTF8SKIP(scan); 4107 hardcount++; 4108 } 4109 } else { 4110 while (scan < loceol && isALNUM_LC(*scan)) 4111 scan++; 4112 } 4113 break; 4114 case NALNUM: 4115 if (do_utf8) { 4116 loceol = PL_regeol; 4117 LOAD_UTF8_CHARCLASS(alnum,"a"); 4118 while (hardcount < max && scan < loceol && 4119 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 4120 scan += UTF8SKIP(scan); 4121 hardcount++; 4122 } 4123 } else { 4124 while (scan < loceol && !isALNUM(*scan)) 4125 scan++; 4126 } 4127 break; 4128 case NALNUML: 4129 PL_reg_flags |= RF_tainted; 4130 if (do_utf8) { 4131 loceol = PL_regeol; 4132 while (hardcount < max && scan < loceol && 4133 !isALNUM_LC_utf8((U8*)scan)) { 4134 scan += UTF8SKIP(scan); 4135 hardcount++; 4136 } 4137 } else { 4138 while (scan < loceol && !isALNUM_LC(*scan)) 4139 scan++; 4140 } 4141 break; 4142 case SPACE: 4143 if (do_utf8) { 4144 loceol = PL_regeol; 4145 LOAD_UTF8_CHARCLASS(space," "); 4146 while (hardcount < max && scan < loceol && 4147 (*scan == ' ' || 4148 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 4149 scan += UTF8SKIP(scan); 4150 hardcount++; 4151 } 4152 } else { 4153 while (scan < loceol && isSPACE(*scan)) 4154 scan++; 4155 } 4156 break; 4157 case SPACEL: 4158 PL_reg_flags |= RF_tainted; 4159 if (do_utf8) { 4160 loceol = PL_regeol; 4161 while (hardcount < max && scan < loceol && 4162 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 4163 scan += UTF8SKIP(scan); 4164 hardcount++; 4165 } 4166 } else { 4167 while (scan < loceol && isSPACE_LC(*scan)) 4168 scan++; 4169 } 4170 break; 4171 case NSPACE: 4172 if (do_utf8) { 4173 loceol = PL_regeol; 4174 LOAD_UTF8_CHARCLASS(space," "); 4175 while (hardcount < max && scan < loceol && 4176 !(*scan == ' ' || 4177 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 4178 scan += UTF8SKIP(scan); 4179 hardcount++; 4180 } 4181 } else { 4182 while (scan < loceol && !isSPACE(*scan)) 4183 scan++; 4184 break; 4185 } 4186 case NSPACEL: 4187 PL_reg_flags |= RF_tainted; 4188 if (do_utf8) { 4189 loceol = PL_regeol; 4190 while (hardcount < max && scan < loceol && 4191 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 4192 scan += UTF8SKIP(scan); 4193 hardcount++; 4194 } 4195 } else { 4196 while (scan < loceol && !isSPACE_LC(*scan)) 4197 scan++; 4198 } 4199 break; 4200 case DIGIT: 4201 if (do_utf8) { 4202 loceol = PL_regeol; 4203 LOAD_UTF8_CHARCLASS(digit,"0"); 4204 while (hardcount < max && scan < loceol && 4205 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 4206 scan += UTF8SKIP(scan); 4207 hardcount++; 4208 } 4209 } else { 4210 while (scan < loceol && isDIGIT(*scan)) 4211 scan++; 4212 } 4213 break; 4214 case NDIGIT: 4215 if (do_utf8) { 4216 loceol = PL_regeol; 4217 LOAD_UTF8_CHARCLASS(digit,"0"); 4218 while (hardcount < max && scan < loceol && 4219 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 4220 scan += UTF8SKIP(scan); 4221 hardcount++; 4222 } 4223 } else { 4224 while (scan < loceol && !isDIGIT(*scan)) 4225 scan++; 4226 } 4227 break; 4228 default: /* Called on something of 0 width. */ 4229 break; /* So match right here or not at all. */ 4230 } 4231 4232 if (hardcount) 4233 c = hardcount; 4234 else 4235 c = scan - PL_reginput; 4236 PL_reginput = scan; 4237 4238 DEBUG_r( 4239 { 4240 SV *prop = sv_newmortal(); 4241 4242 regprop(prop, p); 4243 PerlIO_printf(Perl_debug_log, 4244 "%*s %s can match %"IVdf" times out of %"IVdf"...\n", 4245 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max); 4246 }); 4247 4248 return(c); 4249} 4250 4251/* 4252 - regrepeat_hard - repeatedly match something, report total lenth and length 4253 * 4254 * The repeater is supposed to have constant length. 4255 */ 4256 4257STATIC I32 4258S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp) 4259{ 4260 register char *scan = Nullch; 4261 register char *start; 4262 register char *loceol = PL_regeol; 4263 I32 l = 0; 4264 I32 count = 0, res = 1; 4265 4266 if (!max) 4267 return 0; 4268 4269 start = PL_reginput; 4270 if (PL_reg_match_utf8) { 4271 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) { 4272 if (!count++) { 4273 l = 0; 4274 while (start < PL_reginput) { 4275 l++; 4276 start += UTF8SKIP(start); 4277 } 4278 *lp = l; 4279 if (l == 0) 4280 return max; 4281 } 4282 if (count == max) 4283 return count; 4284 } 4285 } 4286 else { 4287 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) { 4288 if (!count++) { 4289 *lp = l = PL_reginput - start; 4290 if (max != REG_INFTY && l*max < loceol - scan) 4291 loceol = scan + l*max; 4292 if (l == 0) 4293 return max; 4294 } 4295 } 4296 } 4297 if (!res) 4298 PL_reginput = scan; 4299 4300 return count; 4301} 4302 4303/* 4304- regclass_swash - prepare the utf8 swash 4305*/ 4306 4307SV * 4308Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp) 4309{ 4310 SV *sw = NULL; 4311 SV *si = NULL; 4312 SV *alt = NULL; 4313 4314 if (PL_regdata && PL_regdata->count) { 4315 U32 n = ARG(node); 4316 4317 if (PL_regdata->what[n] == 's') { 4318 SV *rv = (SV*)PL_regdata->data[n]; 4319 AV *av = (AV*)SvRV((SV*)rv); 4320 SV **ary = AvARRAY(av); 4321 SV **a, **b; 4322 4323 /* See the end of regcomp.c:S_reglass() for 4324 * documentation of these array elements. */ 4325 4326 si = *ary; 4327 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0; 4328 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0; 4329 4330 if (a) 4331 sw = *a; 4332 else if (si && doinit) { 4333 sw = swash_init("utf8", "", si, 1, 0); 4334 (void)av_store(av, 1, sw); 4335 } 4336 if (b) 4337 alt = *b; 4338 } 4339 } 4340 4341 if (listsvp) 4342 *listsvp = si; 4343 if (altsvp) 4344 *altsvp = alt; 4345 4346 return sw; 4347} 4348 4349/* 4350 - reginclass - determine if a character falls into a character class 4351 4352 The n is the ANYOF regnode, the p is the target string, lenp 4353 is pointer to the maximum length of how far to go in the p 4354 (if the lenp is zero, UTF8SKIP(p) is used), 4355 do_utf8 tells whether the target string is in UTF-8. 4356 4357 */ 4358 4359STATIC bool 4360S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8) 4361{ 4362 char flags = ANYOF_FLAGS(n); 4363 bool match = FALSE; 4364 UV c = *p; 4365 STRLEN len = 0; 4366 STRLEN plen; 4367 4368 if (do_utf8 && !UTF8_IS_INVARIANT(c)) 4369 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len, 4370 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 4371 4372 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); 4373 if (do_utf8 || (flags & ANYOF_UNICODE)) { 4374 if (lenp) 4375 *lenp = 0; 4376 if (do_utf8 && !ANYOF_RUNTIME(n)) { 4377 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) 4378 match = TRUE; 4379 } 4380 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) 4381 match = TRUE; 4382 if (!match) { 4383 AV *av; 4384 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av); 4385 4386 if (sw) { 4387 if (swash_fetch(sw, p, do_utf8)) 4388 match = TRUE; 4389 else if (flags & ANYOF_FOLD) { 4390 if (!match && lenp && av) { 4391 I32 i; 4392 4393 for (i = 0; i <= av_len(av); i++) { 4394 SV* sv = *av_fetch(av, i, FALSE); 4395 STRLEN len; 4396 char *s = SvPV(sv, len); 4397 4398 if (len <= plen && memEQ(s, (char*)p, len)) { 4399 *lenp = len; 4400 match = TRUE; 4401 break; 4402 } 4403 } 4404 } 4405 if (!match) { 4406 U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; 4407 STRLEN tmplen; 4408 4409 to_utf8_fold(p, tmpbuf, &tmplen); 4410 if (swash_fetch(sw, tmpbuf, do_utf8)) 4411 match = TRUE; 4412 } 4413 } 4414 } 4415 } 4416 if (match && lenp && *lenp == 0) 4417 *lenp = UNISKIP(NATIVE_TO_UNI(c)); 4418 } 4419 if (!match && c < 256) { 4420 if (ANYOF_BITMAP_TEST(n, c)) 4421 match = TRUE; 4422 else if (flags & ANYOF_FOLD) { 4423 U8 f; 4424 4425 if (flags & ANYOF_LOCALE) { 4426 PL_reg_flags |= RF_tainted; 4427 f = PL_fold_locale[c]; 4428 } 4429 else 4430 f = PL_fold[c]; 4431 if (f != c && ANYOF_BITMAP_TEST(n, f)) 4432 match = TRUE; 4433 } 4434 4435 if (!match && (flags & ANYOF_CLASS)) { 4436 PL_reg_flags |= RF_tainted; 4437 if ( 4438 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || 4439 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || 4440 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || 4441 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || 4442 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || 4443 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || 4444 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || 4445 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || 4446 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || 4447 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || 4448 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || 4449 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || 4450 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || 4451 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || 4452 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || 4453 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || 4454 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || 4455 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || 4456 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || 4457 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || 4458 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || 4459 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || 4460 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || 4461 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || 4462 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || 4463 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || 4464 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || 4465 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || 4466 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || 4467 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) 4468 ) /* How's that for a conditional? */ 4469 { 4470 match = TRUE; 4471 } 4472 } 4473 } 4474 4475 return (flags & ANYOF_INVERT) ? !match : match; 4476} 4477 4478STATIC U8 * 4479S_reghop(pTHX_ U8 *s, I32 off) 4480{ 4481 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); 4482} 4483 4484STATIC U8 * 4485S_reghop3(pTHX_ U8 *s, I32 off, U8* lim) 4486{ 4487 if (off >= 0) { 4488 while (off-- && s < lim) { 4489 /* XXX could check well-formedness here */ 4490 s += UTF8SKIP(s); 4491 } 4492 } 4493 else { 4494 while (off++) { 4495 if (s > lim) { 4496 s--; 4497 if (UTF8_IS_CONTINUED(*s)) { 4498 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) 4499 s--; 4500 } 4501 /* XXX could check well-formedness here */ 4502 } 4503 } 4504 } 4505 return s; 4506} 4507 4508STATIC U8 * 4509S_reghopmaybe(pTHX_ U8 *s, I32 off) 4510{ 4511 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); 4512} 4513 4514STATIC U8 * 4515S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim) 4516{ 4517 if (off >= 0) { 4518 while (off-- && s < lim) { 4519 /* XXX could check well-formedness here */ 4520 s += UTF8SKIP(s); 4521 } 4522 if (off >= 0) 4523 return 0; 4524 } 4525 else { 4526 while (off++) { 4527 if (s > lim) { 4528 s--; 4529 if (UTF8_IS_CONTINUED(*s)) { 4530 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) 4531 s--; 4532 } 4533 /* XXX could check well-formedness here */ 4534 } 4535 else 4536 break; 4537 } 4538 if (off <= 0) 4539 return 0; 4540 } 4541 return s; 4542} 4543 4544static void 4545restore_pos(pTHX_ void *arg) 4546{ 4547 if (PL_reg_eval_set) { 4548 if (PL_reg_oldsaved) { 4549 PL_reg_re->subbeg = PL_reg_oldsaved; 4550 PL_reg_re->sublen = PL_reg_oldsavedlen; 4551 RX_MATCH_COPIED_on(PL_reg_re); 4552 } 4553 PL_reg_magic->mg_len = PL_reg_oldpos; 4554 PL_reg_eval_set = 0; 4555 PL_curpm = PL_reg_oldcurpm; 4556 } 4557} 4558 4559STATIC void 4560S_to_utf8_substr(pTHX_ register regexp *prog) 4561{ 4562 SV* sv; 4563 if (prog->float_substr && !prog->float_utf8) { 4564 prog->float_utf8 = sv = NEWSV(117, 0); 4565 SvSetSV(sv, prog->float_substr); 4566 sv_utf8_upgrade(sv); 4567 if (SvTAIL(prog->float_substr)) 4568 SvTAIL_on(sv); 4569 if (prog->float_substr == prog->check_substr) 4570 prog->check_utf8 = sv; 4571 } 4572 if (prog->anchored_substr && !prog->anchored_utf8) { 4573 prog->anchored_utf8 = sv = NEWSV(118, 0); 4574 SvSetSV(sv, prog->anchored_substr); 4575 sv_utf8_upgrade(sv); 4576 if (SvTAIL(prog->anchored_substr)) 4577 SvTAIL_on(sv); 4578 if (prog->anchored_substr == prog->check_substr) 4579 prog->check_utf8 = sv; 4580 } 4581} 4582 4583STATIC void 4584S_to_byte_substr(pTHX_ register regexp *prog) 4585{ 4586 SV* sv; 4587 if (prog->float_utf8 && !prog->float_substr) { 4588 prog->float_substr = sv = NEWSV(117, 0); 4589 SvSetSV(sv, prog->float_utf8); 4590 if (sv_utf8_downgrade(sv, TRUE)) { 4591 if (SvTAIL(prog->float_utf8)) 4592 SvTAIL_on(sv); 4593 } else { 4594 SvREFCNT_dec(sv); 4595 prog->float_substr = sv = &PL_sv_undef; 4596 } 4597 if (prog->float_utf8 == prog->check_utf8) 4598 prog->check_substr = sv; 4599 } 4600 if (prog->anchored_utf8 && !prog->anchored_substr) { 4601 prog->anchored_substr = sv = NEWSV(118, 0); 4602 SvSetSV(sv, prog->anchored_utf8); 4603 if (sv_utf8_downgrade(sv, TRUE)) { 4604 if (SvTAIL(prog->anchored_utf8)) 4605 SvTAIL_on(sv); 4606 } else { 4607 SvREFCNT_dec(sv); 4608 prog->anchored_substr = sv = &PL_sv_undef; 4609 } 4610 if (prog->anchored_utf8 == prog->check_utf8) 4611 prog->check_substr = sv; 4612 } 4613} 4614