regexec.c revision 1.14
1/* regexec.c 2 */ 3 4/* 5 * One Ring to rule them all, One Ring to find them 6 & 7 * [p.v of _The Lord of the Rings_, opening poem] 8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] 9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] 10 */ 11 12/* This file contains functions for executing a regular expression. See 13 * also regcomp.c which funnily enough, contains functions for compiling 14 * a regular expression. 15 * 16 * This file is also copied at build time to ext/re/re_exec.c, where 17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 18 * This causes the main functions to be compiled under new names and with 19 * debugging support added, which makes "use re 'debug'" work. 20 */ 21 22/* NOTE: this is derived from Henry Spencer's regexp code, and should not 23 * confused with the original package (see point 3 below). Thanks, Henry! 24 */ 25 26/* Additional note: this code is very heavily munged from Henry's version 27 * in places. In some spots I've traded clarity for efficiency, so don't 28 * blame Henry for some of the lack of readability. 29 */ 30 31/* The names of the functions have been changed from regcomp and 32 * regexec to pregcomp and pregexec in order to avoid conflicts 33 * with the POSIX routines of the same names. 34*/ 35 36#ifdef PERL_EXT_RE_BUILD 37#include "re_top.h" 38#endif 39 40/* 41 * pregcomp and pregexec -- regsub and regerror are not used in perl 42 * 43 * Copyright (c) 1986 by University of Toronto. 44 * Written by Henry Spencer. Not derived from licensed software. 45 * 46 * Permission is granted to anyone to use this software for any 47 * purpose on any computer system, and to redistribute it freely, 48 * subject to the following restrictions: 49 * 50 * 1. The author is not responsible for the consequences of use of 51 * this software, no matter how awful, even if they arise 52 * from defects in it. 53 * 54 * 2. The origin of this software must not be misrepresented, either 55 * by explicit claim or by omission. 56 * 57 * 3. Altered versions must be plainly marked as such, and must not 58 * be misrepresented as being the original software. 59 * 60 **** Alterations to Henry's code are... 61 **** 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 64 **** by Larry Wall and others 65 **** 66 **** You may distribute under the terms of either the GNU General Public 67 **** License or the Artistic License, as specified in the README file. 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73#include "EXTERN.h" 74#define PERL_IN_REGEXEC_C 75#include "perl.h" 76 77#ifdef PERL_IN_XSUB_RE 78# include "re_comp.h" 79#else 80# include "regcomp.h" 81#endif 82 83#define RF_tainted 1 /* tainted information used? */ 84#define RF_warned 2 /* warned about big count? */ 85 86#define RF_utf8 8 /* Pattern contains multibyte chars? */ 87 88#define UTF ((PL_reg_flags & RF_utf8) != 0) 89 90#define RS_init 1 /* eval environment created */ 91#define RS_set 2 /* replsv value is set */ 92 93#ifndef STATIC 94#define STATIC static 95#endif 96 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c))) 98 99/* 100 * Forwards. 101 */ 102 103#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv)) 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b) 105 106#define HOPc(pos,off) \ 107 (char *)(PL_reg_match_utf8 \ 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \ 109 : (U8*)(pos + off)) 110#define HOPBACKc(pos, off) \ 111 (char*)(PL_reg_match_utf8\ 112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \ 113 : (pos - off >= PL_bostr) \ 114 ? (U8*)pos - off \ 115 : NULL) 116 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 119 120/* these are unrolled below in the CCC_TRY_XXX defined */ 121#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ 122 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END 123 124/* Doesn't do an assert to verify that is correct */ 125#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ 126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END 127 128#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") 129#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") 130#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") 131 132#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ 133 LOAD_UTF8_CHARCLASS(X_begin, " "); \ 134 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \ 135 /* These are utf8 constants, and not utf-ebcdic constants, so the \ 136 * assert should likely and hopefully fail on an EBCDIC machine */ \ 137 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \ 138 \ 139 /* No asserts are done for these, in case called on an early \ 140 * Unicode version in which they map to nothing */ \ 141 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \ 142 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \ 143 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \ 144 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \ 145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\ 146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \ 147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ 148 149/* 150 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test 151 so that it is possible to override the option here without having to 152 rebuild the entire core. as we are required to do if we change regcomp.h 153 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. 154*/ 155#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 156#define BROKEN_UNICODE_CHARCLASS_MAPPINGS 157#endif 158 159#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS 160#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM() 161#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE() 162#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT() 163#define RE_utf8_perl_word PL_utf8_alnum 164#define RE_utf8_perl_space PL_utf8_space 165#define RE_utf8_posix_digit PL_utf8_digit 166#define perl_word alnum 167#define perl_space space 168#define posix_digit digit 169#else 170#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a") 171#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ") 172#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0") 173#define RE_utf8_perl_word PL_utf8_perl_word 174#define RE_utf8_perl_space PL_utf8_perl_space 175#define RE_utf8_posix_digit PL_utf8_posix_digit 176#endif 177 178 179#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ 180 case NAMEL: \ 181 PL_reg_flags |= RF_tainted; \ 182 /* FALL THROUGH */ \ 183 case NAME: \ 184 if (!nextchr) \ 185 sayNO; \ 186 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \ 187 if (!CAT2(PL_utf8_,CLASS)) { \ 188 bool ok; \ 189 ENTER; \ 190 save_re_context(); \ 191 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ 192 assert(ok); \ 193 LEAVE; \ 194 } \ 195 if (!(OP(scan) == NAME \ 196 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \ 197 : LCFUNC_utf8((U8*)locinput))) \ 198 { \ 199 sayNO; \ 200 } \ 201 locinput += PL_utf8skip[nextchr]; \ 202 nextchr = UCHARAT(locinput); \ 203 break; \ 204 } \ 205 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ 206 sayNO; \ 207 nextchr = UCHARAT(++locinput); \ 208 break 209 210#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ 211 case NAMEL: \ 212 PL_reg_flags |= RF_tainted; \ 213 /* FALL THROUGH */ \ 214 case NAME : \ 215 if (!nextchr && locinput >= PL_regeol) \ 216 sayNO; \ 217 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \ 218 if (!CAT2(PL_utf8_,CLASS)) { \ 219 bool ok; \ 220 ENTER; \ 221 save_re_context(); \ 222 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ 223 assert(ok); \ 224 LEAVE; \ 225 } \ 226 if ((OP(scan) == NAME \ 227 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \ 228 : LCFUNC_utf8((U8*)locinput))) \ 229 { \ 230 sayNO; \ 231 } \ 232 locinput += PL_utf8skip[nextchr]; \ 233 nextchr = UCHARAT(locinput); \ 234 break; \ 235 } \ 236 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ 237 sayNO; \ 238 nextchr = UCHARAT(++locinput); \ 239 break 240 241 242 243 244 245/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ 246 247/* for use after a quantifier and before an EXACT-like node -- japhy */ 248/* it would be nice to rework regcomp.sym to generate this stuff. sigh */ 249#define JUMPABLE(rn) ( \ 250 OP(rn) == OPEN || \ 251 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ 252 OP(rn) == EVAL || \ 253 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 254 OP(rn) == PLUS || OP(rn) == MINMOD || \ 255 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \ 256 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ 257) 258#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) 259 260#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) 261 262#if 0 263/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so 264 we don't need this definition. */ 265#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) 266#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) 267#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) 268 269#else 270/* ... so we use this as its faster. */ 271#define IS_TEXT(rn) ( OP(rn)==EXACT ) 272#define IS_TEXTF(rn) ( OP(rn)==EXACTF ) 273#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) 274 275#endif 276 277/* 278 Search for mandatory following text node; for lookahead, the text must 279 follow but for lookbehind (rn->flags != 0) we skip to the next step. 280*/ 281#define FIND_NEXT_IMPT(rn) STMT_START { \ 282 while (JUMPABLE(rn)) { \ 283 const OPCODE type = OP(rn); \ 284 if (type == SUSPEND || PL_regkind[type] == CURLY) \ 285 rn = NEXTOPER(NEXTOPER(rn)); \ 286 else if (type == PLUS) \ 287 rn = NEXTOPER(rn); \ 288 else if (type == IFMATCH) \ 289 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ 290 else rn += NEXT_OFF(rn); \ 291 } \ 292} STMT_END 293 294 295static void restore_pos(pTHX_ void *arg); 296 297STATIC CHECKPOINT 298S_regcppush(pTHX_ I32 parenfloor) 299{ 300 dVAR; 301 const int retval = PL_savestack_ix; 302#define REGCP_PAREN_ELEMS 4 303 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS; 304 int p; 305 GET_RE_DEBUG_FLAGS_DECL; 306 307 if (paren_elems_to_push < 0) 308 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); 309 310#define REGCP_OTHER_ELEMS 7 311 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS); 312 313 for (p = PL_regsize; p > parenfloor; p--) { 314/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ 315 SSPUSHINT(PL_regoffs[p].end); 316 SSPUSHINT(PL_regoffs[p].start); 317 SSPUSHPTR(PL_reg_start_tmp[p]); 318 SSPUSHINT(p); 319 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, 320 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n", 321 (UV)p, (IV)PL_regoffs[p].start, 322 (IV)(PL_reg_start_tmp[p] - PL_bostr), 323 (IV)PL_regoffs[p].end 324 )); 325 } 326/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 327 SSPUSHPTR(PL_regoffs); 328 SSPUSHINT(PL_regsize); 329 SSPUSHINT(*PL_reglastparen); 330 SSPUSHINT(*PL_reglastcloseparen); 331 SSPUSHPTR(PL_reginput); 332#define REGCP_FRAME_ELEMS 2 333/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and 334 * are needed for the regexp context stack bookkeeping. */ 335 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 336 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */ 337 338 return retval; 339} 340 341/* These are needed since we do not localize EVAL nodes: */ 342#define REGCP_SET(cp) \ 343 DEBUG_STATE_r( \ 344 PerlIO_printf(Perl_debug_log, \ 345 " Setting an EVAL scope, savestack=%"IVdf"\n", \ 346 (IV)PL_savestack_ix)); \ 347 cp = PL_savestack_ix 348 349#define REGCP_UNWIND(cp) \ 350 DEBUG_STATE_r( \ 351 if (cp != PL_savestack_ix) \ 352 PerlIO_printf(Perl_debug_log, \ 353 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ 354 (IV)(cp), (IV)PL_savestack_ix)); \ 355 regcpblow(cp) 356 357STATIC char * 358S_regcppop(pTHX_ const regexp *rex) 359{ 360 dVAR; 361 U32 i; 362 char *input; 363 GET_RE_DEBUG_FLAGS_DECL; 364 365 PERL_ARGS_ASSERT_REGCPPOP; 366 367 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 368 i = SSPOPINT; 369 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 370 i = SSPOPINT; /* Parentheses elements to pop. */ 371 input = (char *) SSPOPPTR; 372 *PL_reglastcloseparen = SSPOPINT; 373 *PL_reglastparen = SSPOPINT; 374 PL_regsize = SSPOPINT; 375 PL_regoffs=(regexp_paren_pair *) SSPOPPTR; 376 377 378 /* Now restore the parentheses context. */ 379 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS); 380 i > 0; i -= REGCP_PAREN_ELEMS) { 381 I32 tmps; 382 U32 paren = (U32)SSPOPINT; 383 PL_reg_start_tmp[paren] = (char *) SSPOPPTR; 384 PL_regoffs[paren].start = SSPOPINT; 385 tmps = SSPOPINT; 386 if (paren <= *PL_reglastparen) 387 PL_regoffs[paren].end = tmps; 388 DEBUG_BUFFERS_r( 389 PerlIO_printf(Perl_debug_log, 390 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", 391 (UV)paren, (IV)PL_regoffs[paren].start, 392 (IV)(PL_reg_start_tmp[paren] - PL_bostr), 393 (IV)PL_regoffs[paren].end, 394 (paren > *PL_reglastparen ? "(no)" : "")); 395 ); 396 } 397 DEBUG_BUFFERS_r( 398 if (*PL_reglastparen + 1 <= rex->nparens) { 399 PerlIO_printf(Perl_debug_log, 400 " restoring \\%"IVdf"..\\%"IVdf" to undef\n", 401 (IV)(*PL_reglastparen + 1), (IV)rex->nparens); 402 } 403 ); 404#if 1 405 /* It would seem that the similar code in regtry() 406 * already takes care of this, and in fact it is in 407 * a better location to since this code can #if 0-ed out 408 * but the code in regtry() is needed or otherwise tests 409 * requiring null fields (pat.t#187 and split.t#{13,14} 410 * (as of patchlevel 7877) will fail. Then again, 411 * this code seems to be necessary or otherwise 412 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 413 * --jhi updated by dapm */ 414 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) { 415 if (i > PL_regsize) 416 PL_regoffs[i].start = -1; 417 PL_regoffs[i].end = -1; 418 } 419#endif 420 return input; 421} 422 423#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 424 425/* 426 * pregexec and friends 427 */ 428 429#ifndef PERL_IN_XSUB_RE 430/* 431 - pregexec - match a regexp against a string 432 */ 433I32 434Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend, 435 char *strbeg, I32 minend, SV *screamer, U32 nosave) 436/* strend: pointer to null at end of string */ 437/* strbeg: real beginning of string */ 438/* minend: end of match must be >=minend after stringarg. */ 439/* nosave: For optimizations. */ 440{ 441 PERL_ARGS_ASSERT_PREGEXEC; 442 443 return 444 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 445 nosave ? 0 : REXEC_COPY_STR); 446} 447#endif 448 449/* 450 * Need to implement the following flags for reg_anch: 451 * 452 * USE_INTUIT_NOML - Useful to call re_intuit_start() first 453 * USE_INTUIT_ML 454 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer 455 * INTUIT_AUTORITATIVE_ML 456 * INTUIT_ONCE_NOML - Intuit can match in one location only. 457 * INTUIT_ONCE_ML 458 * 459 * Another flag for this function: SECOND_TIME (so that float substrs 460 * with giant delta may be not rechecked). 461 */ 462 463/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ 464 465/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. 466 Otherwise, only SvCUR(sv) is used to get strbeg. */ 467 468/* XXXX We assume that strpos is strbeg unless sv. */ 469 470/* XXXX Some places assume that there is a fixed substring. 471 An update may be needed if optimizer marks as "INTUITable" 472 RExen without fixed substrings. Similarly, it is assumed that 473 lengths of all the strings are no more than minlen, thus they 474 cannot come from lookahead. 475 (Or minlen should take into account lookahead.) 476 NOTE: Some of this comment is not correct. minlen does now take account 477 of lookahead/behind. Further research is required. -- demerphq 478 479*/ 480 481/* A failure to find a constant substring means that there is no need to make 482 an expensive call to REx engine, thus we celebrate a failure. Similarly, 483 finding a substring too deep into the string means that less calls to 484 regtry() should be needed. 485 486 REx compiler's optimizer found 4 possible hints: 487 a) Anchored substring; 488 b) Fixed substring; 489 c) Whether we are anchored (beginning-of-line or \G); 490 d) First node (of those at offset 0) which may distingush positions; 491 We use a)b)d) and multiline-part of c), and try to find a position in the 492 string which does not contradict any of them. 493 */ 494 495/* Most of decisions we do here should have been done at compile time. 496 The nodes of the REx which we used for the search should have been 497 deleted from the finite automaton. */ 498 499char * 500Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, 501 char *strend, const U32 flags, re_scream_pos_data *data) 502{ 503 dVAR; 504 struct regexp *const prog = (struct regexp *)SvANY(rx); 505 register I32 start_shift = 0; 506 /* Should be nonnegative! */ 507 register I32 end_shift = 0; 508 register char *s; 509 register SV *check; 510 char *strbeg; 511 char *t; 512 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ 513 I32 ml_anch; 514 register char *other_last = NULL; /* other substr checked before this */ 515 char *check_at = NULL; /* check substr found at this pos */ 516 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; 517 RXi_GET_DECL(prog,progi); 518#ifdef DEBUGGING 519 const char * const i_strpos = strpos; 520#endif 521 GET_RE_DEBUG_FLAGS_DECL; 522 523 PERL_ARGS_ASSERT_RE_INTUIT_START; 524 525 RX_MATCH_UTF8_set(rx,do_utf8); 526 527 if (RX_UTF8(rx)) { 528 PL_reg_flags |= RF_utf8; 529 } 530 DEBUG_EXECUTE_r( 531 debug_start_match(rx, do_utf8, strpos, strend, 532 sv ? "Guessing start of match in sv for" 533 : "Guessing start of match in string for"); 534 ); 535 536 /* CHR_DIST() would be more correct here but it makes things slow. */ 537 if (prog->minlen > strend - strpos) { 538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 539 "String too short... [re_intuit_start]\n")); 540 goto fail; 541 } 542 543 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; 544 PL_regeol = strend; 545 if (do_utf8) { 546 if (!prog->check_utf8 && prog->check_substr) 547 to_utf8_substr(prog); 548 check = prog->check_utf8; 549 } else { 550 if (!prog->check_substr && prog->check_utf8) 551 to_byte_substr(prog); 552 check = prog->check_substr; 553 } 554 if (check == &PL_sv_undef) { 555 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 556 "Non-utf8 string cannot match utf8 check string\n")); 557 goto fail; 558 } 559 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ 560 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) 561 || ( (prog->extflags & RXf_ANCH_BOL) 562 && !multiline ) ); /* Check after \n? */ 563 564 if (!ml_anch) { 565 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ 566 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ 567 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */ 568 && sv && !SvROK(sv) 569 && (strpos != strbeg)) { 570 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); 571 goto fail; 572 } 573 if (prog->check_offset_min == prog->check_offset_max && 574 !(prog->extflags & RXf_CANY_SEEN)) { 575 /* Substring at constant offset from beg-of-str... */ 576 I32 slen; 577 578 s = HOP3c(strpos, prog->check_offset_min, strend); 579 580 if (SvTAIL(check)) { 581 slen = SvCUR(check); /* >= 1 */ 582 583 if ( strend - s > slen || strend - s < slen - 1 584 || (strend - s == slen && strend[-1] != '\n')) { 585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); 586 goto fail_finish; 587 } 588 /* Now should match s[0..slen-2] */ 589 slen--; 590 if (slen && (*SvPVX_const(check) != *s 591 || (slen > 1 592 && memNE(SvPVX_const(check), s, slen)))) { 593 report_neq: 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); 595 goto fail_finish; 596 } 597 } 598 else if (*SvPVX_const(check) != *s 599 || ((slen = SvCUR(check)) > 1 600 && memNE(SvPVX_const(check), s, slen))) 601 goto report_neq; 602 check_at = s; 603 goto success_at_start; 604 } 605 } 606 /* Match is anchored, but substr is not anchored wrt beg-of-str. */ 607 s = strpos; 608 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 609 end_shift = prog->check_end_shift; 610 611 if (!ml_anch) { 612 const I32 end = prog->check_offset_max + CHR_SVLEN(check) 613 - (SvTAIL(check) != 0); 614 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; 615 616 if (end_shift < eshift) 617 end_shift = eshift; 618 } 619 } 620 else { /* Can match at random position */ 621 ml_anch = 0; 622 s = strpos; 623 start_shift = prog->check_offset_min; /* okay to underestimate on CC */ 624 end_shift = prog->check_end_shift; 625 626 /* end shift should be non negative here */ 627 } 628 629#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ 630 if (end_shift < 0) 631 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", 632 (IV)end_shift, RX_PRECOMP(prog)); 633#endif 634 635 restart: 636 /* Find a possible match in the region s..strend by looking for 637 the "check" substring in the region corrected by start/end_shift. */ 638 639 { 640 I32 srch_start_shift = start_shift; 641 I32 srch_end_shift = end_shift; 642 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { 643 srch_end_shift -= ((strbeg - s) - srch_start_shift); 644 srch_start_shift = strbeg - s; 645 } 646 DEBUG_OPTIMISE_MORE_r({ 647 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", 648 (IV)prog->check_offset_min, 649 (IV)srch_start_shift, 650 (IV)srch_end_shift, 651 (IV)prog->check_end_shift); 652 }); 653 654 if (flags & REXEC_SCREAM) { 655 I32 p = -1; /* Internal iterator of scream. */ 656 I32 * const pp = data ? data->scream_pos : &p; 657 658 if (PL_screamfirst[BmRARE(check)] >= 0 659 || ( BmRARE(check) == '\n' 660 && (BmPREVIOUS(check) == SvCUR(check) - 1) 661 && SvTAIL(check) )) 662 s = screaminstr(sv, check, 663 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0); 664 else 665 goto fail_finish; 666 /* we may be pointing at the wrong string */ 667 if (s && RXp_MATCH_COPIED(prog)) 668 s = strbeg + (s - SvPVX_const(sv)); 669 if (data) 670 *data->scream_olds = s; 671 } 672 else { 673 U8* start_point; 674 U8* end_point; 675 if (prog->extflags & RXf_CANY_SEEN) { 676 start_point= (U8*)(s + srch_start_shift); 677 end_point= (U8*)(strend - srch_end_shift); 678 } else { 679 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); 680 end_point= HOP3(strend, -srch_end_shift, strbeg); 681 } 682 DEBUG_OPTIMISE_MORE_r({ 683 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", 684 (int)(end_point - start_point), 685 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), 686 start_point); 687 }); 688 689 s = fbm_instr( start_point, end_point, 690 check, multiline ? FBMrf_MULTILINE : 0); 691 } 692 } 693 /* Update the count-of-usability, remove useless subpatterns, 694 unshift s. */ 695 696 DEBUG_EXECUTE_r({ 697 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 698 SvPVX_const(check), RE_SV_DUMPLEN(check), 30); 699 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", 700 (s ? "Found" : "Did not find"), 701 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) 702 ? "anchored" : "floating"), 703 quoted, 704 RE_SV_TAIL(check), 705 (s ? " at offset " : "...\n") ); 706 }); 707 708 if (!s) 709 goto fail_finish; 710 /* Finish the diagnostic message */ 711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); 712 713 /* XXX dmq: first branch is for positive lookbehind... 714 Our check string is offset from the beginning of the pattern. 715 So we need to do any stclass tests offset forward from that 716 point. I think. :-( 717 */ 718 719 720 721 check_at=s; 722 723 724 /* Got a candidate. Check MBOL anchoring, and the *other* substr. 725 Start with the other substr. 726 XXXX no SCREAM optimization yet - and a very coarse implementation 727 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will 728 *always* match. Probably should be marked during compile... 729 Probably it is right to do no SCREAM here... 730 */ 731 732 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) 733 : (prog->float_substr && prog->anchored_substr)) 734 { 735 /* Take into account the "other" substring. */ 736 /* XXXX May be hopelessly wrong for UTF... */ 737 if (!other_last) 738 other_last = strpos; 739 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) { 740 do_other_anchored: 741 { 742 char * const last = HOP3c(s, -start_shift, strbeg); 743 char *last1, *last2; 744 char * const saved_s = s; 745 SV* must; 746 747 t = s - prog->check_offset_max; 748 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 749 && (!do_utf8 750 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) 751 && t > strpos))) 752 NOOP; 753 else 754 t = strpos; 755 t = HOP3c(t, prog->anchored_offset, strend); 756 if (t < other_last) /* These positions already checked */ 757 t = other_last; 758 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); 759 if (last < last1) 760 last1 = last; 761 /* XXXX It is not documented what units *_offsets are in. 762 We assume bytes, but this is clearly wrong. 763 Meaning this code needs to be carefully reviewed for errors. 764 dmq. 765 */ 766 767 /* On end-of-str: see comment below. */ 768 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 769 if (must == &PL_sv_undef) { 770 s = (char*)NULL; 771 DEBUG_r(must = prog->anchored_utf8); /* for debug */ 772 } 773 else 774 s = fbm_instr( 775 (unsigned char*)t, 776 HOP3(HOP3(last1, prog->anchored_offset, strend) 777 + SvCUR(must), -(SvTAIL(must)!=0), strbeg), 778 must, 779 multiline ? FBMrf_MULTILINE : 0 780 ); 781 DEBUG_EXECUTE_r({ 782 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 783 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 784 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", 785 (s ? "Found" : "Contradicts"), 786 quoted, RE_SV_TAIL(must)); 787 }); 788 789 790 if (!s) { 791 if (last1 >= last2) { 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 793 ", giving up...\n")); 794 goto fail_finish; 795 } 796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 797 ", trying floating at offset %ld...\n", 798 (long)(HOP3c(saved_s, 1, strend) - i_strpos))); 799 other_last = HOP3c(last1, prog->anchored_offset+1, strend); 800 s = HOP3c(last, 1, strend); 801 goto restart; 802 } 803 else { 804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 805 (long)(s - i_strpos))); 806 t = HOP3c(s, -prog->anchored_offset, strbeg); 807 other_last = HOP3c(s, 1, strend); 808 s = saved_s; 809 if (t == strpos) 810 goto try_at_start; 811 goto try_at_offset; 812 } 813 } 814 } 815 else { /* Take into account the floating substring. */ 816 char *last, *last1; 817 char * const saved_s = s; 818 SV* must; 819 820 t = HOP3c(s, -start_shift, strbeg); 821 last1 = last = 822 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); 823 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) 824 last = HOP3c(t, prog->float_max_offset, strend); 825 s = HOP3c(t, prog->float_min_offset, strend); 826 if (s < other_last) 827 s = other_last; 828 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ 829 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 830 /* fbm_instr() takes into account exact value of end-of-str 831 if the check is SvTAIL(ed). Since false positives are OK, 832 and end-of-str is not later than strend we are OK. */ 833 if (must == &PL_sv_undef) { 834 s = (char*)NULL; 835 DEBUG_r(must = prog->float_utf8); /* for debug message */ 836 } 837 else 838 s = fbm_instr((unsigned char*)s, 839 (unsigned char*)last + SvCUR(must) 840 - (SvTAIL(must)!=0), 841 must, multiline ? FBMrf_MULTILINE : 0); 842 DEBUG_EXECUTE_r({ 843 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 844 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 845 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", 846 (s ? "Found" : "Contradicts"), 847 quoted, RE_SV_TAIL(must)); 848 }); 849 if (!s) { 850 if (last1 == last) { 851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 852 ", giving up...\n")); 853 goto fail_finish; 854 } 855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 856 ", trying anchored starting at offset %ld...\n", 857 (long)(saved_s + 1 - i_strpos))); 858 other_last = last; 859 s = HOP3c(t, 1, strend); 860 goto restart; 861 } 862 else { 863 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", 864 (long)(s - i_strpos))); 865 other_last = s; /* Fix this later. --Hugo */ 866 s = saved_s; 867 if (t == strpos) 868 goto try_at_start; 869 goto try_at_offset; 870 } 871 } 872 } 873 874 875 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); 876 877 DEBUG_OPTIMISE_MORE_r( 878 PerlIO_printf(Perl_debug_log, 879 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", 880 (IV)prog->check_offset_min, 881 (IV)prog->check_offset_max, 882 (IV)(s-strpos), 883 (IV)(t-strpos), 884 (IV)(t-s), 885 (IV)(strend-strpos) 886 ) 887 ); 888 889 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ 890 && (!do_utf8 891 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) 892 && t > strpos))) 893 { 894 /* Fixed substring is found far enough so that the match 895 cannot start at strpos. */ 896 try_at_offset: 897 if (ml_anch && t[-1] != '\n') { 898 /* Eventually fbm_*() should handle this, but often 899 anchored_offset is not 0, so this check will not be wasted. */ 900 /* XXXX In the code below we prefer to look for "^" even in 901 presence of anchored substrings. And we search even 902 beyond the found float position. These pessimizations 903 are historical artefacts only. */ 904 find_anchor: 905 while (t < strend - prog->minlen) { 906 if (*t == '\n') { 907 if (t < check_at - prog->check_offset_min) { 908 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) { 909 /* Since we moved from the found position, 910 we definitely contradict the found anchored 911 substr. Due to the above check we do not 912 contradict "check" substr. 913 Thus we can arrive here only if check substr 914 is float. Redo checking for "other"=="fixed". 915 */ 916 strpos = t + 1; 917 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", 918 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); 919 goto do_other_anchored; 920 } 921 /* We don't contradict the found floating substring. */ 922 /* XXXX Why not check for STCLASS? */ 923 s = t + 1; 924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", 925 PL_colors[0], PL_colors[1], (long)(s - i_strpos))); 926 goto set_useful; 927 } 928 /* Position contradicts check-string */ 929 /* XXXX probably better to look for check-string 930 than for "\n", so one should lower the limit for t? */ 931 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", 932 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); 933 other_last = strpos = s = t + 1; 934 goto restart; 935 } 936 t++; 937 } 938 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", 939 PL_colors[0], PL_colors[1])); 940 goto fail_finish; 941 } 942 else { 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", 944 PL_colors[0], PL_colors[1])); 945 } 946 s = t; 947 set_useful: 948 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 949 } 950 else { 951 /* The found string does not prohibit matching at strpos, 952 - no optimization of calling REx engine can be performed, 953 unless it was an MBOL and we are not after MBOL, 954 or a future STCLASS check will fail this. */ 955 try_at_start: 956 /* Even in this situation we may use MBOL flag if strpos is offset 957 wrt the start of the string. */ 958 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */ 959 && (strpos != strbeg) && strpos[-1] != '\n' 960 /* May be due to an implicit anchor of m{.*foo} */ 961 && !(prog->intflags & PREGf_IMPLICIT)) 962 { 963 t = strpos; 964 goto find_anchor; 965 } 966 DEBUG_EXECUTE_r( if (ml_anch) 967 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", 968 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); 969 ); 970 success_at_start: 971 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ 972 && (do_utf8 ? ( 973 prog->check_utf8 /* Could be deleted already */ 974 && --BmUSEFUL(prog->check_utf8) < 0 975 && (prog->check_utf8 == prog->float_utf8) 976 ) : ( 977 prog->check_substr /* Could be deleted already */ 978 && --BmUSEFUL(prog->check_substr) < 0 979 && (prog->check_substr == prog->float_substr) 980 ))) 981 { 982 /* If flags & SOMETHING - do not do it many times on the same match */ 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); 984 /* XXX Does the destruction order has to change with do_utf8? */ 985 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr); 986 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8); 987 prog->check_substr = prog->check_utf8 = NULL; /* disable */ 988 prog->float_substr = prog->float_utf8 = NULL; /* clear */ 989 check = NULL; /* abort */ 990 s = strpos; 991 /* XXXX This is a remnant of the old implementation. It 992 looks wasteful, since now INTUIT can use many 993 other heuristics. */ 994 prog->extflags &= ~RXf_USE_INTUIT; 995 } 996 else 997 s = strpos; 998 } 999 1000 /* Last resort... */ 1001 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ 1002 /* trie stclasses are too expensive to use here, we are better off to 1003 leave it to regmatch itself */ 1004 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { 1005 /* minlen == 0 is possible if regstclass is \b or \B, 1006 and the fixed substr is ''$. 1007 Since minlen is already taken into account, s+1 is before strend; 1008 accidentally, minlen >= 1 guaranties no false positives at s + 1 1009 even for \b or \B. But (minlen? 1 : 0) below assumes that 1010 regstclass does not come from lookahead... */ 1011 /* If regstclass takes bytelength more than 1: If charlength==1, OK. 1012 This leaves EXACTF only, which is dealt with in find_byclass(). */ 1013 const U8* const str = (U8*)STRING(progi->regstclass); 1014 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT 1015 ? CHR_DIST(str+STR_LEN(progi->regstclass), str) 1016 : 1); 1017 char * endpos; 1018 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 1019 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); 1020 else if (prog->float_substr || prog->float_utf8) 1021 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); 1022 else 1023 endpos= strend; 1024 1025 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n", 1026 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg))); 1027 1028 t = s; 1029 s = find_byclass(prog, progi->regstclass, s, endpos, NULL); 1030 if (!s) { 1031#ifdef DEBUGGING 1032 const char *what = NULL; 1033#endif 1034 if (endpos == strend) { 1035 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1036 "Could not match STCLASS...\n") ); 1037 goto fail; 1038 } 1039 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1040 "This position contradicts STCLASS...\n") ); 1041 if ((prog->extflags & RXf_ANCH) && !ml_anch) 1042 goto fail; 1043 /* Contradict one of substrings */ 1044 if (prog->anchored_substr || prog->anchored_utf8) { 1045 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) { 1046 DEBUG_EXECUTE_r( what = "anchored" ); 1047 hop_and_restart: 1048 s = HOP3c(t, 1, strend); 1049 if (s + start_shift + end_shift > strend) { 1050 /* XXXX Should be taken into account earlier? */ 1051 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1052 "Could not match STCLASS...\n") ); 1053 goto fail; 1054 } 1055 if (!check) 1056 goto giveup; 1057 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1058 "Looking for %s substr starting at offset %ld...\n", 1059 what, (long)(s + start_shift - i_strpos)) ); 1060 goto restart; 1061 } 1062 /* Have both, check_string is floating */ 1063 if (t + start_shift >= check_at) /* Contradicts floating=check */ 1064 goto retry_floating_check; 1065 /* Recheck anchored substring, but not floating... */ 1066 s = check_at; 1067 if (!check) 1068 goto giveup; 1069 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1070 "Looking for anchored substr starting at offset %ld...\n", 1071 (long)(other_last - i_strpos)) ); 1072 goto do_other_anchored; 1073 } 1074 /* Another way we could have checked stclass at the 1075 current position only: */ 1076 if (ml_anch) { 1077 s = t = t + 1; 1078 if (!check) 1079 goto giveup; 1080 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 1081 "Looking for /%s^%s/m starting at offset %ld...\n", 1082 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); 1083 goto try_at_offset; 1084 } 1085 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ 1086 goto fail; 1087 /* Check is floating subtring. */ 1088 retry_floating_check: 1089 t = check_at - start_shift; 1090 DEBUG_EXECUTE_r( what = "floating" ); 1091 goto hop_and_restart; 1092 } 1093 if (t != s) { 1094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1095 "By STCLASS: moving %ld --> %ld\n", 1096 (long)(t - i_strpos), (long)(s - i_strpos)) 1097 ); 1098 } 1099 else { 1100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1101 "Does not contradict STCLASS...\n"); 1102 ); 1103 } 1104 } 1105 giveup: 1106 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", 1107 PL_colors[4], (check ? "Guessed" : "Giving up"), 1108 PL_colors[5], (long)(s - i_strpos)) ); 1109 return s; 1110 1111 fail_finish: /* Substring not found */ 1112 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 1113 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 1114 fail: 1115 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", 1116 PL_colors[4], PL_colors[5])); 1117 return NULL; 1118} 1119 1120#define DECL_TRIE_TYPE(scan) \ 1121 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ 1122 trie_type = (scan->flags != EXACT) \ 1123 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \ 1124 : (do_utf8 ? trie_utf8 : trie_plain) 1125 1126#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ 1127uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ 1128 switch (trie_type) { \ 1129 case trie_utf8_fold: \ 1130 if ( foldlen>0 ) { \ 1131 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ 1132 foldlen -= len; \ 1133 uscan += len; \ 1134 len=0; \ 1135 } else { \ 1136 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ 1137 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ 1138 foldlen -= UNISKIP( uvc ); \ 1139 uscan = foldbuf + UNISKIP( uvc ); \ 1140 } \ 1141 break; \ 1142 case trie_latin_utf8_fold: \ 1143 if ( foldlen>0 ) { \ 1144 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ 1145 foldlen -= len; \ 1146 uscan += len; \ 1147 len=0; \ 1148 } else { \ 1149 len = 1; \ 1150 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \ 1151 foldlen -= UNISKIP( uvc ); \ 1152 uscan = foldbuf + UNISKIP( uvc ); \ 1153 } \ 1154 break; \ 1155 case trie_utf8: \ 1156 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ 1157 break; \ 1158 case trie_plain: \ 1159 uvc = (UV)*uc; \ 1160 len = 1; \ 1161 } \ 1162 if (uvc < 256) { \ 1163 charid = trie->charmap[ uvc ]; \ 1164 } \ 1165 else { \ 1166 charid = 0; \ 1167 if (widecharmap) { \ 1168 SV** const svpp = hv_fetch(widecharmap, \ 1169 (char*)&uvc, sizeof(UV), 0); \ 1170 if (svpp) \ 1171 charid = (U16)SvIV(*svpp); \ 1172 } \ 1173 } \ 1174} STMT_END 1175 1176#define REXEC_FBC_EXACTISH_CHECK(CoNd) \ 1177{ \ 1178 char *my_strend= (char *)strend; \ 1179 if ( (CoNd) \ 1180 && (ln == len || \ 1181 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \ 1182 m, NULL, ln, (bool)UTF)) \ 1183 && (!reginfo || regtry(reginfo, &s)) ) \ 1184 goto got_it; \ 1185 else { \ 1186 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \ 1187 uvchr_to_utf8(tmpbuf, c); \ 1188 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \ 1189 if ( f != c \ 1190 && (f == c1 || f == c2) \ 1191 && (ln == len || \ 1192 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\ 1193 m, NULL, ln, (bool)UTF)) \ 1194 && (!reginfo || regtry(reginfo, &s)) ) \ 1195 goto got_it; \ 1196 } \ 1197} \ 1198s += len 1199 1200#define REXEC_FBC_EXACTISH_SCAN(CoNd) \ 1201STMT_START { \ 1202 while (s <= e) { \ 1203 if ( (CoNd) \ 1204 && (ln == 1 || !(OP(c) == EXACTF \ 1205 ? ibcmp(s, m, ln) \ 1206 : ibcmp_locale(s, m, ln))) \ 1207 && (!reginfo || regtry(reginfo, &s)) ) \ 1208 goto got_it; \ 1209 s++; \ 1210 } \ 1211} STMT_END 1212 1213#define REXEC_FBC_UTF8_SCAN(CoDe) \ 1214STMT_START { \ 1215 while (s + (uskip = UTF8SKIP(s)) <= strend) { \ 1216 CoDe \ 1217 s += uskip; \ 1218 } \ 1219} STMT_END 1220 1221#define REXEC_FBC_SCAN(CoDe) \ 1222STMT_START { \ 1223 while (s < strend) { \ 1224 CoDe \ 1225 s++; \ 1226 } \ 1227} STMT_END 1228 1229#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \ 1230REXEC_FBC_UTF8_SCAN( \ 1231 if (CoNd) { \ 1232 if (tmp && (!reginfo || regtry(reginfo, &s))) \ 1233 goto got_it; \ 1234 else \ 1235 tmp = doevery; \ 1236 } \ 1237 else \ 1238 tmp = 1; \ 1239) 1240 1241#define REXEC_FBC_CLASS_SCAN(CoNd) \ 1242REXEC_FBC_SCAN( \ 1243 if (CoNd) { \ 1244 if (tmp && (!reginfo || regtry(reginfo, &s))) \ 1245 goto got_it; \ 1246 else \ 1247 tmp = doevery; \ 1248 } \ 1249 else \ 1250 tmp = 1; \ 1251) 1252 1253#define REXEC_FBC_TRYIT \ 1254if ((!reginfo || regtry(reginfo, &s))) \ 1255 goto got_it 1256 1257#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ 1258 if (do_utf8) { \ 1259 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ 1260 } \ 1261 else { \ 1262 REXEC_FBC_CLASS_SCAN(CoNd); \ 1263 } \ 1264 break 1265 1266#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \ 1267 if (do_utf8) { \ 1268 UtFpReLoAd; \ 1269 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ 1270 } \ 1271 else { \ 1272 REXEC_FBC_CLASS_SCAN(CoNd); \ 1273 } \ 1274 break 1275 1276#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \ 1277 PL_reg_flags |= RF_tainted; \ 1278 if (do_utf8) { \ 1279 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ 1280 } \ 1281 else { \ 1282 REXEC_FBC_CLASS_SCAN(CoNd); \ 1283 } \ 1284 break 1285 1286#define DUMP_EXEC_POS(li,s,doutf8) \ 1287 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8) 1288 1289/* We know what class REx starts with. Try to find this position... */ 1290/* if reginfo is NULL, its a dryrun */ 1291/* annoyingly all the vars in this routine have different names from their counterparts 1292 in regmatch. /grrr */ 1293 1294STATIC char * 1295S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 1296 const char *strend, regmatch_info *reginfo) 1297{ 1298 dVAR; 1299 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; 1300 char *m; 1301 STRLEN ln; 1302 STRLEN lnc; 1303 register STRLEN uskip; 1304 unsigned int c1; 1305 unsigned int c2; 1306 char *e; 1307 register I32 tmp = 1; /* Scratch variable? */ 1308 register const bool do_utf8 = PL_reg_match_utf8; 1309 RXi_GET_DECL(prog,progi); 1310 1311 PERL_ARGS_ASSERT_FIND_BYCLASS; 1312 1313 /* We know what class it must start with. */ 1314 switch (OP(c)) { 1315 case ANYOF: 1316 if (do_utf8) { 1317 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) || 1318 !UTF8_IS_INVARIANT((U8)s[0]) ? 1319 reginclass(prog, c, (U8*)s, 0, do_utf8) : 1320 REGINCLASS(prog, c, (U8*)s)); 1321 } 1322 else { 1323 while (s < strend) { 1324 STRLEN skip = 1; 1325 1326 if (REGINCLASS(prog, c, (U8*)s) || 1327 (ANYOF_FOLD_SHARP_S(c, s, strend) && 1328 /* The assignment of 2 is intentional: 1329 * for the folded sharp s, the skip is 2. */ 1330 (skip = SHARP_S_SKIP))) { 1331 if (tmp && (!reginfo || regtry(reginfo, &s))) 1332 goto got_it; 1333 else 1334 tmp = doevery; 1335 } 1336 else 1337 tmp = 1; 1338 s += skip; 1339 } 1340 } 1341 break; 1342 case CANY: 1343 REXEC_FBC_SCAN( 1344 if (tmp && (!reginfo || regtry(reginfo, &s))) 1345 goto got_it; 1346 else 1347 tmp = doevery; 1348 ); 1349 break; 1350 case EXACTF: 1351 m = STRING(c); 1352 ln = STR_LEN(c); /* length to match in octets/bytes */ 1353 lnc = (I32) ln; /* length to match in characters */ 1354 if (UTF) { 1355 STRLEN ulen1, ulen2; 1356 U8 *sm = (U8 *) m; 1357 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; 1358 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; 1359 /* used by commented-out code below */ 1360 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/ 1361 1362 /* XXX: Since the node will be case folded at compile 1363 time this logic is a little odd, although im not 1364 sure that its actually wrong. --dmq */ 1365 1366 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1); 1367 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2); 1368 1369 /* XXX: This is kinda strange. to_utf8_XYZ returns the 1370 codepoint of the first character in the converted 1371 form, yet originally we did the extra step. 1372 No tests fail by commenting this code out however 1373 so Ive left it out. -- dmq. 1374 1375 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 1376 0, uniflags); 1377 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE, 1378 0, uniflags); 1379 */ 1380 1381 lnc = 0; 1382 while (sm < ((U8 *) m + ln)) { 1383 lnc++; 1384 sm += UTF8SKIP(sm); 1385 } 1386 } 1387 else { 1388 c1 = *(U8*)m; 1389 c2 = PL_fold[c1]; 1390 } 1391 goto do_exactf; 1392 case EXACTFL: 1393 m = STRING(c); 1394 ln = STR_LEN(c); 1395 lnc = (I32) ln; 1396 c1 = *(U8*)m; 1397 c2 = PL_fold_locale[c1]; 1398 do_exactf: 1399 e = HOP3c(strend, -((I32)lnc), s); 1400 1401 if (!reginfo && e < s) 1402 e = s; /* Due to minlen logic of intuit() */ 1403 1404 /* The idea in the EXACTF* cases is to first find the 1405 * first character of the EXACTF* node and then, if 1406 * necessary, case-insensitively compare the full 1407 * text of the node. The c1 and c2 are the first 1408 * characters (though in Unicode it gets a bit 1409 * more complicated because there are more cases 1410 * than just upper and lower: one needs to use 1411 * the so-called folding case for case-insensitive 1412 * matching (called "loose matching" in Unicode). 1413 * ibcmp_utf8() will do just that. */ 1414 1415 if (do_utf8 || UTF) { 1416 UV c, f; 1417 U8 tmpbuf [UTF8_MAXBYTES+1]; 1418 STRLEN len = 1; 1419 STRLEN foldlen; 1420 const U32 uniflags = UTF8_ALLOW_DEFAULT; 1421 if (c1 == c2) { 1422 /* Upper and lower of 1st char are equal - 1423 * probably not a "letter". */ 1424 while (s <= e) { 1425 if (do_utf8) { 1426 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, 1427 uniflags); 1428 } else { 1429 c = *((U8*)s); 1430 } 1431 REXEC_FBC_EXACTISH_CHECK(c == c1); 1432 } 1433 } 1434 else { 1435 while (s <= e) { 1436 if (do_utf8) { 1437 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, 1438 uniflags); 1439 } else { 1440 c = *((U8*)s); 1441 } 1442 1443 /* Handle some of the three Greek sigmas cases. 1444 * Note that not all the possible combinations 1445 * are handled here: some of them are handled 1446 * by the standard folding rules, and some of 1447 * them (the character class or ANYOF cases) 1448 * are handled during compiletime in 1449 * regexec.c:S_regclass(). */ 1450 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA || 1451 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) 1452 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA; 1453 1454 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2); 1455 } 1456 } 1457 } 1458 else { 1459 /* Neither pattern nor string are UTF8 */ 1460 if (c1 == c2) 1461 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); 1462 else 1463 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); 1464 } 1465 break; 1466 case BOUNDL: 1467 PL_reg_flags |= RF_tainted; 1468 /* FALL THROUGH */ 1469 case BOUND: 1470 if (do_utf8) { 1471 if (s == PL_bostr) 1472 tmp = '\n'; 1473 else { 1474 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1475 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); 1476 } 1477 tmp = ((OP(c) == BOUND ? 1478 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1479 LOAD_UTF8_CHARCLASS_ALNUM(); 1480 REXEC_FBC_UTF8_SCAN( 1481 if (tmp == !(OP(c) == BOUND ? 1482 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1483 isALNUM_LC_utf8((U8*)s))) 1484 { 1485 tmp = !tmp; 1486 REXEC_FBC_TRYIT; 1487 } 1488 ); 1489 } 1490 else { 1491 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1492 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1493 REXEC_FBC_SCAN( 1494 if (tmp == 1495 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { 1496 tmp = !tmp; 1497 REXEC_FBC_TRYIT; 1498 } 1499 ); 1500 } 1501 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) 1502 goto got_it; 1503 break; 1504 case NBOUNDL: 1505 PL_reg_flags |= RF_tainted; 1506 /* FALL THROUGH */ 1507 case NBOUND: 1508 if (do_utf8) { 1509 if (s == PL_bostr) 1510 tmp = '\n'; 1511 else { 1512 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); 1513 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); 1514 } 1515 tmp = ((OP(c) == NBOUND ? 1516 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); 1517 LOAD_UTF8_CHARCLASS_ALNUM(); 1518 REXEC_FBC_UTF8_SCAN( 1519 if (tmp == !(OP(c) == NBOUND ? 1520 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) : 1521 isALNUM_LC_utf8((U8*)s))) 1522 tmp = !tmp; 1523 else REXEC_FBC_TRYIT; 1524 ); 1525 } 1526 else { 1527 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; 1528 tmp = ((OP(c) == NBOUND ? 1529 isALNUM(tmp) : isALNUM_LC(tmp)) != 0); 1530 REXEC_FBC_SCAN( 1531 if (tmp == 1532 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) 1533 tmp = !tmp; 1534 else REXEC_FBC_TRYIT; 1535 ); 1536 } 1537 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s))) 1538 goto got_it; 1539 break; 1540 case ALNUM: 1541 REXEC_FBC_CSCAN_PRELOAD( 1542 LOAD_UTF8_CHARCLASS_PERL_WORD(), 1543 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), 1544 isALNUM(*s) 1545 ); 1546 case ALNUML: 1547 REXEC_FBC_CSCAN_TAINT( 1548 isALNUM_LC_utf8((U8*)s), 1549 isALNUM_LC(*s) 1550 ); 1551 case NALNUM: 1552 REXEC_FBC_CSCAN_PRELOAD( 1553 LOAD_UTF8_CHARCLASS_PERL_WORD(), 1554 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), 1555 !isALNUM(*s) 1556 ); 1557 case NALNUML: 1558 REXEC_FBC_CSCAN_TAINT( 1559 !isALNUM_LC_utf8((U8*)s), 1560 !isALNUM_LC(*s) 1561 ); 1562 case SPACE: 1563 REXEC_FBC_CSCAN_PRELOAD( 1564 LOAD_UTF8_CHARCLASS_PERL_SPACE(), 1565 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8), 1566 isSPACE(*s) 1567 ); 1568 case SPACEL: 1569 REXEC_FBC_CSCAN_TAINT( 1570 *s == ' ' || isSPACE_LC_utf8((U8*)s), 1571 isSPACE_LC(*s) 1572 ); 1573 case NSPACE: 1574 REXEC_FBC_CSCAN_PRELOAD( 1575 LOAD_UTF8_CHARCLASS_PERL_SPACE(), 1576 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)), 1577 !isSPACE(*s) 1578 ); 1579 case NSPACEL: 1580 REXEC_FBC_CSCAN_TAINT( 1581 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)), 1582 !isSPACE_LC(*s) 1583 ); 1584 case DIGIT: 1585 REXEC_FBC_CSCAN_PRELOAD( 1586 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), 1587 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), 1588 isDIGIT(*s) 1589 ); 1590 case DIGITL: 1591 REXEC_FBC_CSCAN_TAINT( 1592 isDIGIT_LC_utf8((U8*)s), 1593 isDIGIT_LC(*s) 1594 ); 1595 case NDIGIT: 1596 REXEC_FBC_CSCAN_PRELOAD( 1597 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), 1598 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), 1599 !isDIGIT(*s) 1600 ); 1601 case NDIGITL: 1602 REXEC_FBC_CSCAN_TAINT( 1603 !isDIGIT_LC_utf8((U8*)s), 1604 !isDIGIT_LC(*s) 1605 ); 1606 case LNBREAK: 1607 REXEC_FBC_CSCAN( 1608 is_LNBREAK_utf8(s), 1609 is_LNBREAK_latin1(s) 1610 ); 1611 case VERTWS: 1612 REXEC_FBC_CSCAN( 1613 is_VERTWS_utf8(s), 1614 is_VERTWS_latin1(s) 1615 ); 1616 case NVERTWS: 1617 REXEC_FBC_CSCAN( 1618 !is_VERTWS_utf8(s), 1619 !is_VERTWS_latin1(s) 1620 ); 1621 case HORIZWS: 1622 REXEC_FBC_CSCAN( 1623 is_HORIZWS_utf8(s), 1624 is_HORIZWS_latin1(s) 1625 ); 1626 case NHORIZWS: 1627 REXEC_FBC_CSCAN( 1628 !is_HORIZWS_utf8(s), 1629 !is_HORIZWS_latin1(s) 1630 ); 1631 case AHOCORASICKC: 1632 case AHOCORASICK: 1633 { 1634 DECL_TRIE_TYPE(c); 1635 /* what trie are we using right now */ 1636 reg_ac_data *aho 1637 = (reg_ac_data*)progi->data->data[ ARG( c ) ]; 1638 reg_trie_data *trie 1639 = (reg_trie_data*)progi->data->data[ aho->trie ]; 1640 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); 1641 1642 const char *last_start = strend - trie->minlen; 1643#ifdef DEBUGGING 1644 const char *real_start = s; 1645#endif 1646 STRLEN maxlen = trie->maxlen; 1647 SV *sv_points; 1648 U8 **points; /* map of where we were in the input string 1649 when reading a given char. For ASCII this 1650 is unnecessary overhead as the relationship 1651 is always 1:1, but for Unicode, especially 1652 case folded Unicode this is not true. */ 1653 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 1654 U8 *bitmap=NULL; 1655 1656 1657 GET_RE_DEBUG_FLAGS_DECL; 1658 1659 /* We can't just allocate points here. We need to wrap it in 1660 * an SV so it gets freed properly if there is a croak while 1661 * running the match */ 1662 ENTER; 1663 SAVETMPS; 1664 sv_points=newSV(maxlen * sizeof(U8 *)); 1665 SvCUR_set(sv_points, 1666 maxlen * sizeof(U8 *)); 1667 SvPOK_on(sv_points); 1668 sv_2mortal(sv_points); 1669 points=(U8**)SvPV_nolen(sv_points ); 1670 if ( trie_type != trie_utf8_fold 1671 && (trie->bitmap || OP(c)==AHOCORASICKC) ) 1672 { 1673 if (trie->bitmap) 1674 bitmap=(U8*)trie->bitmap; 1675 else 1676 bitmap=(U8*)ANYOF_BITMAP(c); 1677 } 1678 /* this is the Aho-Corasick algorithm modified a touch 1679 to include special handling for long "unknown char" 1680 sequences. The basic idea being that we use AC as long 1681 as we are dealing with a possible matching char, when 1682 we encounter an unknown char (and we have not encountered 1683 an accepting state) we scan forward until we find a legal 1684 starting char. 1685 AC matching is basically that of trie matching, except 1686 that when we encounter a failing transition, we fall back 1687 to the current states "fail state", and try the current char 1688 again, a process we repeat until we reach the root state, 1689 state 1, or a legal transition. If we fail on the root state 1690 then we can either terminate if we have reached an accepting 1691 state previously, or restart the entire process from the beginning 1692 if we have not. 1693 1694 */ 1695 while (s <= last_start) { 1696 const U32 uniflags = UTF8_ALLOW_DEFAULT; 1697 U8 *uc = (U8*)s; 1698 U16 charid = 0; 1699 U32 base = 1; 1700 U32 state = 1; 1701 UV uvc = 0; 1702 STRLEN len = 0; 1703 STRLEN foldlen = 0; 1704 U8 *uscan = (U8*)NULL; 1705 U8 *leftmost = NULL; 1706#ifdef DEBUGGING 1707 U32 accepted_word= 0; 1708#endif 1709 U32 pointpos = 0; 1710 1711 while ( state && uc <= (U8*)strend ) { 1712 int failed=0; 1713 U32 word = aho->states[ state ].wordnum; 1714 1715 if( state==1 ) { 1716 if ( bitmap ) { 1717 DEBUG_TRIE_EXECUTE_r( 1718 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 1719 dump_exec_pos( (char *)uc, c, strend, real_start, 1720 (char *)uc, do_utf8 ); 1721 PerlIO_printf( Perl_debug_log, 1722 " Scanning for legal start char...\n"); 1723 } 1724 ); 1725 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { 1726 uc++; 1727 } 1728 s= (char *)uc; 1729 } 1730 if (uc >(U8*)last_start) break; 1731 } 1732 1733 if ( word ) { 1734 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ]; 1735 if (!leftmost || lpos < leftmost) { 1736 DEBUG_r(accepted_word=word); 1737 leftmost= lpos; 1738 } 1739 if (base==0) break; 1740 1741 } 1742 points[pointpos++ % maxlen]= uc; 1743 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 1744 uscan, len, uvc, charid, foldlen, 1745 foldbuf, uniflags); 1746 DEBUG_TRIE_EXECUTE_r({ 1747 dump_exec_pos( (char *)uc, c, strend, real_start, 1748 s, do_utf8 ); 1749 PerlIO_printf(Perl_debug_log, 1750 " Charid:%3u CP:%4"UVxf" ", 1751 charid, uvc); 1752 }); 1753 1754 do { 1755#ifdef DEBUGGING 1756 word = aho->states[ state ].wordnum; 1757#endif 1758 base = aho->states[ state ].trans.base; 1759 1760 DEBUG_TRIE_EXECUTE_r({ 1761 if (failed) 1762 dump_exec_pos( (char *)uc, c, strend, real_start, 1763 s, do_utf8 ); 1764 PerlIO_printf( Perl_debug_log, 1765 "%sState: %4"UVxf", word=%"UVxf, 1766 failed ? " Fail transition to " : "", 1767 (UV)state, (UV)word); 1768 }); 1769 if ( base ) { 1770 U32 tmp; 1771 if (charid && 1772 (base + charid > trie->uniquecharcount ) 1773 && (base + charid - 1 - trie->uniquecharcount 1774 < trie->lasttrans) 1775 && trie->trans[base + charid - 1 - 1776 trie->uniquecharcount].check == state 1777 && (tmp=trie->trans[base + charid - 1 - 1778 trie->uniquecharcount ].next)) 1779 { 1780 DEBUG_TRIE_EXECUTE_r( 1781 PerlIO_printf( Perl_debug_log," - legal\n")); 1782 state = tmp; 1783 break; 1784 } 1785 else { 1786 DEBUG_TRIE_EXECUTE_r( 1787 PerlIO_printf( Perl_debug_log," - fail\n")); 1788 failed = 1; 1789 state = aho->fail[state]; 1790 } 1791 } 1792 else { 1793 /* we must be accepting here */ 1794 DEBUG_TRIE_EXECUTE_r( 1795 PerlIO_printf( Perl_debug_log," - accepting\n")); 1796 failed = 1; 1797 break; 1798 } 1799 } while(state); 1800 uc += len; 1801 if (failed) { 1802 if (leftmost) 1803 break; 1804 if (!state) state = 1; 1805 } 1806 } 1807 if ( aho->states[ state ].wordnum ) { 1808 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ]; 1809 if (!leftmost || lpos < leftmost) { 1810 DEBUG_r(accepted_word=aho->states[ state ].wordnum); 1811 leftmost = lpos; 1812 } 1813 } 1814 if (leftmost) { 1815 s = (char*)leftmost; 1816 DEBUG_TRIE_EXECUTE_r({ 1817 PerlIO_printf( 1818 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", 1819 (UV)accepted_word, (IV)(s - real_start) 1820 ); 1821 }); 1822 if (!reginfo || regtry(reginfo, &s)) { 1823 FREETMPS; 1824 LEAVE; 1825 goto got_it; 1826 } 1827 s = HOPc(s,1); 1828 DEBUG_TRIE_EXECUTE_r({ 1829 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); 1830 }); 1831 } else { 1832 DEBUG_TRIE_EXECUTE_r( 1833 PerlIO_printf( Perl_debug_log,"No match.\n")); 1834 break; 1835 } 1836 } 1837 FREETMPS; 1838 LEAVE; 1839 } 1840 break; 1841 default: 1842 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 1843 break; 1844 } 1845 return 0; 1846 got_it: 1847 return s; 1848} 1849 1850 1851/* 1852 - regexec_flags - match a regexp against a string 1853 */ 1854I32 1855Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend, 1856 char *strbeg, I32 minend, SV *sv, void *data, U32 flags) 1857/* strend: pointer to null at end of string */ 1858/* strbeg: real beginning of string */ 1859/* minend: end of match must be >=minend after stringarg. */ 1860/* data: May be used for some additional optimizations. 1861 Currently its only used, with a U32 cast, for transmitting 1862 the ganch offset when doing a /g match. This will change */ 1863/* nosave: For optimizations. */ 1864{ 1865 dVAR; 1866 struct regexp *const prog = (struct regexp *)SvANY(rx); 1867 /*register*/ char *s; 1868 register regnode *c; 1869 /*register*/ char *startpos = stringarg; 1870 I32 minlen; /* must match at least this many chars */ 1871 I32 dontbother = 0; /* how many characters not to try at end */ 1872 I32 end_shift = 0; /* Same for the end. */ /* CC */ 1873 I32 scream_pos = -1; /* Internal iterator of scream. */ 1874 char *scream_olds = NULL; 1875 const bool do_utf8 = (bool)DO_UTF8(sv); 1876 I32 multiline; 1877 RXi_GET_DECL(prog,progi); 1878 regmatch_info reginfo; /* create some info to pass to regtry etc */ 1879 regexp_paren_pair *swap = NULL; 1880 GET_RE_DEBUG_FLAGS_DECL; 1881 1882 PERL_ARGS_ASSERT_REGEXEC_FLAGS; 1883 PERL_UNUSED_ARG(data); 1884 1885 /* Be paranoid... */ 1886 if (prog == NULL || startpos == NULL) { 1887 Perl_croak(aTHX_ "NULL regexp parameter"); 1888 return 0; 1889 } 1890 1891 multiline = prog->extflags & RXf_PMf_MULTILINE; 1892 reginfo.prog = rx; /* Yes, sorry that this is confusing. */ 1893 1894 RX_MATCH_UTF8_set(rx, do_utf8); 1895 DEBUG_EXECUTE_r( 1896 debug_start_match(rx, do_utf8, startpos, strend, 1897 "Matching"); 1898 ); 1899 1900 minlen = prog->minlen; 1901 1902 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { 1903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 1904 "String too short [regexec_flags]...\n")); 1905 goto phooey; 1906 } 1907 1908 1909 /* Check validity of program. */ 1910 if (UCHARAT(progi->program) != REG_MAGIC) { 1911 Perl_croak(aTHX_ "corrupted regexp program"); 1912 } 1913 1914 PL_reg_flags = 0; 1915 PL_reg_eval_set = 0; 1916 PL_reg_maxiter = 0; 1917 1918 if (RX_UTF8(rx)) 1919 PL_reg_flags |= RF_utf8; 1920 1921 /* Mark beginning of line for ^ and lookbehind. */ 1922 reginfo.bol = startpos; /* XXX not used ??? */ 1923 PL_bostr = strbeg; 1924 reginfo.sv = sv; 1925 1926 /* Mark end of line for $ (and such) */ 1927 PL_regeol = strend; 1928 1929 /* see how far we have to get to not match where we matched before */ 1930 reginfo.till = startpos+minend; 1931 1932 /* If there is a "must appear" string, look for it. */ 1933 s = startpos; 1934 1935 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ 1936 MAGIC *mg; 1937 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */ 1938 reginfo.ganch = startpos + prog->gofs; 1939 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, 1940 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs)); 1941 } else if (sv && SvTYPE(sv) >= SVt_PVMG 1942 && SvMAGIC(sv) 1943 && (mg = mg_find(sv, PERL_MAGIC_regex_global)) 1944 && mg->mg_len >= 0) { 1945 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */ 1946 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, 1947 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len)); 1948 1949 if (prog->extflags & RXf_ANCH_GPOS) { 1950 if (s > reginfo.ganch) 1951 goto phooey; 1952 s = reginfo.ganch - prog->gofs; 1953 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, 1954 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs)); 1955 if (s < strbeg) 1956 goto phooey; 1957 } 1958 } 1959 else if (data) { 1960 reginfo.ganch = strbeg + PTR2UV(data); 1961 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, 1962 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data))); 1963 1964 } else { /* pos() not defined */ 1965 reginfo.ganch = strbeg; 1966 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, 1967 "GPOS: reginfo.ganch = strbeg\n")); 1968 } 1969 } 1970 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { 1971 /* We have to be careful. If the previous successful match 1972 was from this regex we don't want a subsequent partially 1973 successful match to clobber the old results. 1974 So when we detect this possibility we add a swap buffer 1975 to the re, and switch the buffer each match. If we fail 1976 we switch it back, otherwise we leave it swapped. 1977 */ 1978 swap = prog->offs; 1979 /* do we need a save destructor here for eval dies? */ 1980 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); 1981 } 1982 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { 1983 re_scream_pos_data d; 1984 1985 d.scream_olds = &scream_olds; 1986 d.scream_pos = &scream_pos; 1987 s = re_intuit_start(rx, sv, s, strend, flags, &d); 1988 if (!s) { 1989 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); 1990 goto phooey; /* not present */ 1991 } 1992 } 1993 1994 1995 1996 /* Simplest case: anchored match need be tried only once. */ 1997 /* [unless only anchor is BOL and multiline is set] */ 1998 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { 1999 if (s == startpos && regtry(®info, &startpos)) 2000 goto got_it; 2001 else if (multiline || (prog->intflags & PREGf_IMPLICIT) 2002 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ 2003 { 2004 char *end; 2005 2006 if (minlen) 2007 dontbother = minlen - 1; 2008 end = HOP3c(strend, -dontbother, strbeg) - 1; 2009 /* for multiline we only have to try after newlines */ 2010 if (prog->check_substr || prog->check_utf8) { 2011 /* because of the goto we can not easily reuse the macros for bifurcating the 2012 unicode/non-unicode match modes here like we do elsewhere - demerphq */ 2013 if (do_utf8) { 2014 if (s == startpos) 2015 goto after_try_utf8; 2016 while (1) { 2017 if (regtry(®info, &s)) { 2018 goto got_it; 2019 } 2020 after_try_utf8: 2021 if (s > end) { 2022 goto phooey; 2023 } 2024 if (prog->extflags & RXf_USE_INTUIT) { 2025 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL); 2026 if (!s) { 2027 goto phooey; 2028 } 2029 } 2030 else { 2031 s += UTF8SKIP(s); 2032 } 2033 } 2034 } /* end search for check string in unicode */ 2035 else { 2036 if (s == startpos) { 2037 goto after_try_latin; 2038 } 2039 while (1) { 2040 if (regtry(®info, &s)) { 2041 goto got_it; 2042 } 2043 after_try_latin: 2044 if (s > end) { 2045 goto phooey; 2046 } 2047 if (prog->extflags & RXf_USE_INTUIT) { 2048 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL); 2049 if (!s) { 2050 goto phooey; 2051 } 2052 } 2053 else { 2054 s++; 2055 } 2056 } 2057 } /* end search for check string in latin*/ 2058 } /* end search for check string */ 2059 else { /* search for newline */ 2060 if (s > startpos) { 2061 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ 2062 s--; 2063 } 2064 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ 2065 while (s < end) { 2066 if (*s++ == '\n') { /* don't need PL_utf8skip here */ 2067 if (regtry(®info, &s)) 2068 goto got_it; 2069 } 2070 } 2071 } /* end search for newline */ 2072 } /* end anchored/multiline check string search */ 2073 goto phooey; 2074 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) 2075 { 2076 /* the warning about reginfo.ganch being used without intialization 2077 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN 2078 and we only enter this block when the same bit is set. */ 2079 char *tmp_s = reginfo.ganch - prog->gofs; 2080 2081 if (tmp_s >= strbeg && regtry(®info, &tmp_s)) 2082 goto got_it; 2083 goto phooey; 2084 } 2085 2086 /* Messy cases: unanchored match. */ 2087 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { 2088 /* we have /x+whatever/ */ 2089 /* it must be a one character string (XXXX Except UTF?) */ 2090 char ch; 2091#ifdef DEBUGGING 2092 int did_match = 0; 2093#endif 2094 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 2095 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 2096 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0]; 2097 2098 if (do_utf8) { 2099 REXEC_FBC_SCAN( 2100 if (*s == ch) { 2101 DEBUG_EXECUTE_r( did_match = 1 ); 2102 if (regtry(®info, &s)) goto got_it; 2103 s += UTF8SKIP(s); 2104 while (s < strend && *s == ch) 2105 s += UTF8SKIP(s); 2106 } 2107 ); 2108 } 2109 else { 2110 REXEC_FBC_SCAN( 2111 if (*s == ch) { 2112 DEBUG_EXECUTE_r( did_match = 1 ); 2113 if (regtry(®info, &s)) goto got_it; 2114 s++; 2115 while (s < strend && *s == ch) 2116 s++; 2117 } 2118 ); 2119 } 2120 DEBUG_EXECUTE_r(if (!did_match) 2121 PerlIO_printf(Perl_debug_log, 2122 "Did not find anchored character...\n") 2123 ); 2124 } 2125 else if (prog->anchored_substr != NULL 2126 || prog->anchored_utf8 != NULL 2127 || ((prog->float_substr != NULL || prog->float_utf8 != NULL) 2128 && prog->float_max_offset < strend - s)) { 2129 SV *must; 2130 I32 back_max; 2131 I32 back_min; 2132 char *last; 2133 char *last1; /* Last position checked before */ 2134#ifdef DEBUGGING 2135 int did_match = 0; 2136#endif 2137 if (prog->anchored_substr || prog->anchored_utf8) { 2138 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) 2139 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 2140 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; 2141 back_max = back_min = prog->anchored_offset; 2142 } else { 2143 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 2144 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 2145 must = do_utf8 ? prog->float_utf8 : prog->float_substr; 2146 back_max = prog->float_max_offset; 2147 back_min = prog->float_min_offset; 2148 } 2149 2150 2151 if (must == &PL_sv_undef) 2152 /* could not downgrade utf8 check substring, so must fail */ 2153 goto phooey; 2154 2155 if (back_min<0) { 2156 last = strend; 2157 } else { 2158 last = HOP3c(strend, /* Cannot start after this */ 2159 -(I32)(CHR_SVLEN(must) 2160 - (SvTAIL(must) != 0) + back_min), strbeg); 2161 } 2162 if (s > PL_bostr) 2163 last1 = HOPc(s, -1); 2164 else 2165 last1 = s - 1; /* bogus */ 2166 2167 /* XXXX check_substr already used to find "s", can optimize if 2168 check_substr==must. */ 2169 scream_pos = -1; 2170 dontbother = end_shift; 2171 strend = HOPc(strend, -dontbother); 2172 while ( (s <= last) && 2173 ((flags & REXEC_SCREAM) 2174 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg, 2175 end_shift, &scream_pos, 0)) 2176 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), 2177 (unsigned char*)strend, must, 2178 multiline ? FBMrf_MULTILINE : 0))) ) { 2179 /* we may be pointing at the wrong string */ 2180 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog)) 2181 s = strbeg + (s - SvPVX_const(sv)); 2182 DEBUG_EXECUTE_r( did_match = 1 ); 2183 if (HOPc(s, -back_max) > last1) { 2184 last1 = HOPc(s, -back_min); 2185 s = HOPc(s, -back_max); 2186 } 2187 else { 2188 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1; 2189 2190 last1 = HOPc(s, -back_min); 2191 s = t; 2192 } 2193 if (do_utf8) { 2194 while (s <= last1) { 2195 if (regtry(®info, &s)) 2196 goto got_it; 2197 s += UTF8SKIP(s); 2198 } 2199 } 2200 else { 2201 while (s <= last1) { 2202 if (regtry(®info, &s)) 2203 goto got_it; 2204 s++; 2205 } 2206 } 2207 } 2208 DEBUG_EXECUTE_r(if (!did_match) { 2209 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), 2210 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 2211 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", 2212 ((must == prog->anchored_substr || must == prog->anchored_utf8) 2213 ? "anchored" : "floating"), 2214 quoted, RE_SV_TAIL(must)); 2215 }); 2216 goto phooey; 2217 } 2218 else if ( (c = progi->regstclass) ) { 2219 if (minlen) { 2220 const OPCODE op = OP(progi->regstclass); 2221 /* don't bother with what can't match */ 2222 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) 2223 strend = HOPc(strend, -(minlen - 1)); 2224 } 2225 DEBUG_EXECUTE_r({ 2226 SV * const prop = sv_newmortal(); 2227 regprop(prog, prop, c); 2228 { 2229 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1), 2230 s,strend-s,60); 2231 PerlIO_printf(Perl_debug_log, 2232 "Matching stclass %.*s against %s (%d chars)\n", 2233 (int)SvCUR(prop), SvPVX_const(prop), 2234 quoted, (int)(strend - s)); 2235 } 2236 }); 2237 if (find_byclass(prog, c, s, strend, ®info)) 2238 goto got_it; 2239 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); 2240 } 2241 else { 2242 dontbother = 0; 2243 if (prog->float_substr != NULL || prog->float_utf8 != NULL) { 2244 /* Trim the end. */ 2245 char *last; 2246 SV* float_real; 2247 2248 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) 2249 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); 2250 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr; 2251 2252 if (flags & REXEC_SCREAM) { 2253 last = screaminstr(sv, float_real, s - strbeg, 2254 end_shift, &scream_pos, 1); /* last one */ 2255 if (!last) 2256 last = scream_olds; /* Only one occurrence. */ 2257 /* we may be pointing at the wrong string */ 2258 else if (RXp_MATCH_COPIED(prog)) 2259 s = strbeg + (s - SvPVX_const(sv)); 2260 } 2261 else { 2262 STRLEN len; 2263 const char * const little = SvPV_const(float_real, len); 2264 2265 if (SvTAIL(float_real)) { 2266 if (memEQ(strend - len + 1, little, len - 1)) 2267 last = strend - len + 1; 2268 else if (!multiline) 2269 last = memEQ(strend - len, little, len) 2270 ? strend - len : NULL; 2271 else 2272 goto find_last; 2273 } else { 2274 find_last: 2275 if (len) 2276 last = rninstr(s, strend, little, little + len); 2277 else 2278 last = strend; /* matching "$" */ 2279 } 2280 } 2281 if (last == NULL) { 2282 DEBUG_EXECUTE_r( 2283 PerlIO_printf(Perl_debug_log, 2284 "%sCan't trim the tail, match fails (should not happen)%s\n", 2285 PL_colors[4], PL_colors[5])); 2286 goto phooey; /* Should not happen! */ 2287 } 2288 dontbother = strend - last + prog->float_min_offset; 2289 } 2290 if (minlen && (dontbother < minlen)) 2291 dontbother = minlen - 1; 2292 strend -= dontbother; /* this one's always in bytes! */ 2293 /* We don't know much -- general case. */ 2294 if (do_utf8) { 2295 for (;;) { 2296 if (regtry(®info, &s)) 2297 goto got_it; 2298 if (s >= strend) 2299 break; 2300 s += UTF8SKIP(s); 2301 }; 2302 } 2303 else { 2304 do { 2305 if (regtry(®info, &s)) 2306 goto got_it; 2307 } while (s++ < strend); 2308 } 2309 } 2310 2311 /* Failure. */ 2312 goto phooey; 2313 2314got_it: 2315 Safefree(swap); 2316 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted); 2317 2318 if (PL_reg_eval_set) 2319 restore_pos(aTHX_ prog); 2320 if (RXp_PAREN_NAMES(prog)) 2321 (void)hv_iterinit(RXp_PAREN_NAMES(prog)); 2322 2323 /* make sure $`, $&, $', and $digit will work later */ 2324 if ( !(flags & REXEC_NOT_FIRST) ) { 2325 RX_MATCH_COPY_FREE(rx); 2326 if (flags & REXEC_COPY_STR) { 2327 const I32 i = PL_regeol - startpos + (stringarg - strbeg); 2328#ifdef PERL_OLD_COPY_ON_WRITE 2329 if ((SvIsCOW(sv) 2330 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) { 2331 if (DEBUG_C_TEST) { 2332 PerlIO_printf(Perl_debug_log, 2333 "Copy on write: regexp capture, type %d\n", 2334 (int) SvTYPE(sv)); 2335 } 2336 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); 2337 prog->subbeg = (char *)SvPVX_const(prog->saved_copy); 2338 assert (SvPOKp(prog->saved_copy)); 2339 } else 2340#endif 2341 { 2342 RX_MATCH_COPIED_on(rx); 2343 s = savepvn(strbeg, i); 2344 prog->subbeg = s; 2345 } 2346 prog->sublen = i; 2347 } 2348 else { 2349 prog->subbeg = strbeg; 2350 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ 2351 } 2352 } 2353 2354 return 1; 2355 2356phooey: 2357 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", 2358 PL_colors[4], PL_colors[5])); 2359 if (PL_reg_eval_set) 2360 restore_pos(aTHX_ prog); 2361 if (swap) { 2362 /* we failed :-( roll it back */ 2363 Safefree(prog->offs); 2364 prog->offs = swap; 2365 } 2366 2367 return 0; 2368} 2369 2370 2371/* 2372 - regtry - try match at specific point 2373 */ 2374STATIC I32 /* 0 failure, 1 success */ 2375S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) 2376{ 2377 dVAR; 2378 CHECKPOINT lastcp; 2379 REGEXP *const rx = reginfo->prog; 2380 regexp *const prog = (struct regexp *)SvANY(rx); 2381 RXi_GET_DECL(prog,progi); 2382 GET_RE_DEBUG_FLAGS_DECL; 2383 2384 PERL_ARGS_ASSERT_REGTRY; 2385 2386 reginfo->cutpoint=NULL; 2387 2388 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) { 2389 MAGIC *mg; 2390 2391 PL_reg_eval_set = RS_init; 2392 DEBUG_EXECUTE_r(DEBUG_s( 2393 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n", 2394 (IV)(PL_stack_sp - PL_stack_base)); 2395 )); 2396 SAVESTACK_CXPOS(); 2397 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; 2398 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ 2399 SAVETMPS; 2400 /* Apparently this is not needed, judging by wantarray. */ 2401 /* SAVEI8(cxstack[cxstack_ix].blk_gimme); 2402 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ 2403 2404 if (reginfo->sv) { 2405 /* Make $_ available to executed code. */ 2406 if (reginfo->sv != DEFSV) { 2407 SAVE_DEFSV; 2408 DEFSV_set(reginfo->sv); 2409 } 2410 2411 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv) 2412 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) { 2413 /* prepare for quick setting of pos */ 2414#ifdef PERL_OLD_COPY_ON_WRITE 2415 if (SvIsCOW(reginfo->sv)) 2416 sv_force_normal_flags(reginfo->sv, 0); 2417#endif 2418 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global, 2419 &PL_vtbl_mglob, NULL, 0); 2420 mg->mg_len = -1; 2421 } 2422 PL_reg_magic = mg; 2423 PL_reg_oldpos = mg->mg_len; 2424 SAVEDESTRUCTOR_X(restore_pos, prog); 2425 } 2426 if (!PL_reg_curpm) { 2427 Newxz(PL_reg_curpm, 1, PMOP); 2428#ifdef USE_ITHREADS 2429 { 2430 SV* const repointer = &PL_sv_undef; 2431 /* this regexp is also owned by the new PL_reg_curpm, which 2432 will try to free it. */ 2433 av_push(PL_regex_padav, repointer); 2434 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); 2435 PL_regex_pad = AvARRAY(PL_regex_padav); 2436 } 2437#endif 2438 } 2439#ifdef USE_ITHREADS 2440 /* It seems that non-ithreads works both with and without this code. 2441 So for efficiency reasons it seems best not to have the code 2442 compiled when it is not needed. */ 2443 /* This is safe against NULLs: */ 2444 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); 2445 /* PM_reg_curpm owns a reference to this regexp. */ 2446 ReREFCNT_inc(rx); 2447#endif 2448 PM_SETRE(PL_reg_curpm, rx); 2449 PL_reg_oldcurpm = PL_curpm; 2450 PL_curpm = PL_reg_curpm; 2451 if (RXp_MATCH_COPIED(prog)) { 2452 /* Here is a serious problem: we cannot rewrite subbeg, 2453 since it may be needed if this match fails. Thus 2454 $` inside (?{}) could fail... */ 2455 PL_reg_oldsaved = prog->subbeg; 2456 PL_reg_oldsavedlen = prog->sublen; 2457#ifdef PERL_OLD_COPY_ON_WRITE 2458 PL_nrs = prog->saved_copy; 2459#endif 2460 RXp_MATCH_COPIED_off(prog); 2461 } 2462 else 2463 PL_reg_oldsaved = NULL; 2464 prog->subbeg = PL_bostr; 2465 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ 2466 } 2467 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos); 2468 prog->offs[0].start = *startpos - PL_bostr; 2469 PL_reginput = *startpos; 2470 PL_reglastparen = &prog->lastparen; 2471 PL_reglastcloseparen = &prog->lastcloseparen; 2472 prog->lastparen = 0; 2473 prog->lastcloseparen = 0; 2474 PL_regsize = 0; 2475 PL_regoffs = prog->offs; 2476 if (PL_reg_start_tmpl <= prog->nparens) { 2477 PL_reg_start_tmpl = prog->nparens*3/2 + 3; 2478 if(PL_reg_start_tmp) 2479 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2480 else 2481 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 2482 } 2483 2484 /* XXXX What this code is doing here?!!! There should be no need 2485 to do this again and again, PL_reglastparen should take care of 2486 this! --ilya*/ 2487 2488 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 2489 * Actually, the code in regcppop() (which Ilya may be meaning by 2490 * PL_reglastparen), is not needed at all by the test suite 2491 * (op/regexp, op/pat, op/split), but that code is needed otherwise 2492 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 2493 * Meanwhile, this code *is* needed for the 2494 * above-mentioned test suite tests to succeed. The common theme 2495 * on those tests seems to be returning null fields from matches. 2496 * --jhi updated by dapm */ 2497#if 1 2498 if (prog->nparens) { 2499 regexp_paren_pair *pp = PL_regoffs; 2500 register I32 i; 2501 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { 2502 ++pp; 2503 pp->start = -1; 2504 pp->end = -1; 2505 } 2506 } 2507#endif 2508 REGCP_SET(lastcp); 2509 if (regmatch(reginfo, progi->program + 1)) { 2510 PL_regoffs[0].end = PL_reginput - PL_bostr; 2511 return 1; 2512 } 2513 if (reginfo->cutpoint) 2514 *startpos= reginfo->cutpoint; 2515 REGCP_UNWIND(lastcp); 2516 return 0; 2517} 2518 2519 2520#define sayYES goto yes 2521#define sayNO goto no 2522#define sayNO_SILENT goto no_silent 2523 2524/* we dont use STMT_START/END here because it leads to 2525 "unreachable code" warnings, which are bogus, but distracting. */ 2526#define CACHEsayNO \ 2527 if (ST.cache_mask) \ 2528 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \ 2529 sayNO 2530 2531/* this is used to determine how far from the left messages like 2532 'failed...' are printed. It should be set such that messages 2533 are inline with the regop output that created them. 2534*/ 2535#define REPORT_CODE_OFF 32 2536 2537 2538/* Make sure there is a test for this +1 options in re_tests */ 2539#define TRIE_INITAL_ACCEPT_BUFFLEN 4; 2540 2541#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ 2542#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ 2543 2544#define SLAB_FIRST(s) (&(s)->states[0]) 2545#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) 2546 2547/* grab a new slab and return the first slot in it */ 2548 2549STATIC regmatch_state * 2550S_push_slab(pTHX) 2551{ 2552#if PERL_VERSION < 9 && !defined(PERL_CORE) 2553 dMY_CXT; 2554#endif 2555 regmatch_slab *s = PL_regmatch_slab->next; 2556 if (!s) { 2557 Newx(s, 1, regmatch_slab); 2558 s->prev = PL_regmatch_slab; 2559 s->next = NULL; 2560 PL_regmatch_slab->next = s; 2561 } 2562 PL_regmatch_slab = s; 2563 return SLAB_FIRST(s); 2564} 2565 2566 2567/* push a new state then goto it */ 2568 2569#define PUSH_STATE_GOTO(state, node) \ 2570 scan = node; \ 2571 st->resume_state = state; \ 2572 goto push_state; 2573 2574/* push a new state with success backtracking, then goto it */ 2575 2576#define PUSH_YES_STATE_GOTO(state, node) \ 2577 scan = node; \ 2578 st->resume_state = state; \ 2579 goto push_yes_state; 2580 2581 2582 2583/* 2584 2585regmatch() - main matching routine 2586 2587This is basically one big switch statement in a loop. We execute an op, 2588set 'next' to point the next op, and continue. If we come to a point which 2589we may need to backtrack to on failure such as (A|B|C), we push a 2590backtrack state onto the backtrack stack. On failure, we pop the top 2591state, and re-enter the loop at the state indicated. If there are no more 2592states to pop, we return failure. 2593 2594Sometimes we also need to backtrack on success; for example /A+/, where 2595after successfully matching one A, we need to go back and try to 2596match another one; similarly for lookahead assertions: if the assertion 2597completes successfully, we backtrack to the state just before the assertion 2598and then carry on. In these cases, the pushed state is marked as 2599'backtrack on success too'. This marking is in fact done by a chain of 2600pointers, each pointing to the previous 'yes' state. On success, we pop to 2601the nearest yes state, discarding any intermediate failure-only states. 2602Sometimes a yes state is pushed just to force some cleanup code to be 2603called at the end of a successful match or submatch; e.g. (??{$re}) uses 2604it to free the inner regex. 2605 2606Note that failure backtracking rewinds the cursor position, while 2607success backtracking leaves it alone. 2608 2609A pattern is complete when the END op is executed, while a subpattern 2610such as (?=foo) is complete when the SUCCESS op is executed. Both of these 2611ops trigger the "pop to last yes state if any, otherwise return true" 2612behaviour. 2613 2614A common convention in this function is to use A and B to refer to the two 2615subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is 2616the subpattern to be matched possibly multiple times, while B is the entire 2617rest of the pattern. Variable and state names reflect this convention. 2618 2619The states in the main switch are the union of ops and failure/success of 2620substates associated with with that op. For example, IFMATCH is the op 2621that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 2622'execute IFMATCH'; while IFMATCH_A is a state saying that we have just 2623successfully matched A and IFMATCH_A_fail is a state saying that we have 2624just failed to match A. Resume states always come in pairs. The backtrack 2625state we push is marked as 'IFMATCH_A', but when that is popped, we resume 2626at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking 2627on success or failure. 2628 2629The struct that holds a backtracking state is actually a big union, with 2630one variant for each major type of op. The variable st points to the 2631top-most backtrack struct. To make the code clearer, within each 2632block of code we #define ST to alias the relevant union. 2633 2634Here's a concrete example of a (vastly oversimplified) IFMATCH 2635implementation: 2636 2637 switch (state) { 2638 .... 2639 2640#define ST st->u.ifmatch 2641 2642 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 2643 ST.foo = ...; // some state we wish to save 2644 ... 2645 // push a yes backtrack state with a resume value of 2646 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the 2647 // first node of A: 2648 PUSH_YES_STATE_GOTO(IFMATCH_A, A); 2649 // NOTREACHED 2650 2651 case IFMATCH_A: // we have successfully executed A; now continue with B 2652 next = B; 2653 bar = ST.foo; // do something with the preserved value 2654 break; 2655 2656 case IFMATCH_A_fail: // A failed, so the assertion failed 2657 ...; // do some housekeeping, then ... 2658 sayNO; // propagate the failure 2659 2660#undef ST 2661 2662 ... 2663 } 2664 2665For any old-timers reading this who are familiar with the old recursive 2666approach, the code above is equivalent to: 2667 2668 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 2669 { 2670 int foo = ... 2671 ... 2672 if (regmatch(A)) { 2673 next = B; 2674 bar = foo; 2675 break; 2676 } 2677 ...; // do some housekeeping, then ... 2678 sayNO; // propagate the failure 2679 } 2680 2681The topmost backtrack state, pointed to by st, is usually free. If you 2682want to claim it, populate any ST.foo fields in it with values you wish to 2683save, then do one of 2684 2685 PUSH_STATE_GOTO(resume_state, node); 2686 PUSH_YES_STATE_GOTO(resume_state, node); 2687 2688which sets that backtrack state's resume value to 'resume_state', pushes a 2689new free entry to the top of the backtrack stack, then goes to 'node'. 2690On backtracking, the free slot is popped, and the saved state becomes the 2691new free state. An ST.foo field in this new top state can be temporarily 2692accessed to retrieve values, but once the main loop is re-entered, it 2693becomes available for reuse. 2694 2695Note that the depth of the backtrack stack constantly increases during the 2696left-to-right execution of the pattern, rather than going up and down with 2697the pattern nesting. For example the stack is at its maximum at Z at the 2698end of the pattern, rather than at X in the following: 2699 2700 /(((X)+)+)+....(Y)+....Z/ 2701 2702The only exceptions to this are lookahead/behind assertions and the cut, 2703(?>A), which pop all the backtrack states associated with A before 2704continuing. 2705 2706Bascktrack state structs are allocated in slabs of about 4K in size. 2707PL_regmatch_state and st always point to the currently active state, 2708and PL_regmatch_slab points to the slab currently containing 2709PL_regmatch_state. The first time regmatch() is called, the first slab is 2710allocated, and is never freed until interpreter destruction. When the slab 2711is full, a new one is allocated and chained to the end. At exit from 2712regmatch(), slabs allocated since entry are freed. 2713 2714*/ 2715 2716 2717#define DEBUG_STATE_pp(pp) \ 2718 DEBUG_STATE_r({ \ 2719 DUMP_EXEC_POS(locinput, scan, do_utf8); \ 2720 PerlIO_printf(Perl_debug_log, \ 2721 " %*s"pp" %s%s%s%s%s\n", \ 2722 depth*2, "", \ 2723 PL_reg_name[st->resume_state], \ 2724 ((st==yes_state||st==mark_state) ? "[" : ""), \ 2725 ((st==yes_state) ? "Y" : ""), \ 2726 ((st==mark_state) ? "M" : ""), \ 2727 ((st==yes_state||st==mark_state) ? "]" : "") \ 2728 ); \ 2729 }); 2730 2731 2732#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) 2733 2734#ifdef DEBUGGING 2735 2736STATIC void 2737S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, 2738 const char *start, const char *end, const char *blurb) 2739{ 2740 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; 2741 2742 PERL_ARGS_ASSERT_DEBUG_START_MATCH; 2743 2744 if (!PL_colorset) 2745 reginitcolors(); 2746 { 2747 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 2748 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); 2749 2750 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1), 2751 start, end - start, 60); 2752 2753 PerlIO_printf(Perl_debug_log, 2754 "%s%s REx%s %s against %s\n", 2755 PL_colors[4], blurb, PL_colors[5], s0, s1); 2756 2757 if (do_utf8||utf8_pat) 2758 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", 2759 utf8_pat ? "pattern" : "", 2760 utf8_pat && do_utf8 ? " and " : "", 2761 do_utf8 ? "string" : "" 2762 ); 2763 } 2764} 2765 2766STATIC void 2767S_dump_exec_pos(pTHX_ const char *locinput, 2768 const regnode *scan, 2769 const char *loc_regeol, 2770 const char *loc_bostr, 2771 const char *loc_reg_starttry, 2772 const bool do_utf8) 2773{ 2774 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; 2775 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 2776 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); 2777 /* The part of the string before starttry has one color 2778 (pref0_len chars), between starttry and current 2779 position another one (pref_len - pref0_len chars), 2780 after the current position the third one. 2781 We assume that pref0_len <= pref_len, otherwise we 2782 decrease pref0_len. */ 2783 int pref_len = (locinput - loc_bostr) > (5 + taill) - l 2784 ? (5 + taill) - l : locinput - loc_bostr; 2785 int pref0_len; 2786 2787 PERL_ARGS_ASSERT_DUMP_EXEC_POS; 2788 2789 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) 2790 pref_len++; 2791 pref0_len = pref_len - (locinput - loc_reg_starttry); 2792 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) 2793 l = ( loc_regeol - locinput > (5 + taill) - pref_len 2794 ? (5 + taill) - pref_len : loc_regeol - locinput); 2795 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) 2796 l--; 2797 if (pref0_len < 0) 2798 pref0_len = 0; 2799 if (pref0_len > pref_len) 2800 pref0_len = pref_len; 2801 { 2802 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0; 2803 2804 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), 2805 (locinput - pref_len),pref0_len, 60, 4, 5); 2806 2807 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), 2808 (locinput - pref_len + pref0_len), 2809 pref_len - pref0_len, 60, 2, 3); 2810 2811 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), 2812 locinput, loc_regeol - locinput, 10, 0, 1); 2813 2814 const STRLEN tlen=len0+len1+len2; 2815 PerlIO_printf(Perl_debug_log, 2816 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", 2817 (IV)(locinput - loc_bostr), 2818 len0, s0, 2819 len1, s1, 2820 (docolor ? "" : "> <"), 2821 len2, s2, 2822 (int)(tlen > 19 ? 0 : 19 - tlen), 2823 ""); 2824 } 2825} 2826 2827#endif 2828 2829/* reg_check_named_buff_matched() 2830 * Checks to see if a named buffer has matched. The data array of 2831 * buffer numbers corresponding to the buffer is expected to reside 2832 * in the regexp->data->data array in the slot stored in the ARG() of 2833 * node involved. Note that this routine doesn't actually care about the 2834 * name, that information is not preserved from compilation to execution. 2835 * Returns the index of the leftmost defined buffer with the given name 2836 * or 0 if non of the buffers matched. 2837 */ 2838STATIC I32 2839S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) 2840{ 2841 I32 n; 2842 RXi_GET_DECL(rex,rexi); 2843 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 2844 I32 *nums=(I32*)SvPVX(sv_dat); 2845 2846 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; 2847 2848 for ( n=0; n<SvIVX(sv_dat); n++ ) { 2849 if ((I32)*PL_reglastparen >= nums[n] && 2850 PL_regoffs[nums[n]].end != -1) 2851 { 2852 return nums[n]; 2853 } 2854 } 2855 return 0; 2856} 2857 2858 2859/* free all slabs above current one - called during LEAVE_SCOPE */ 2860 2861STATIC void 2862S_clear_backtrack_stack(pTHX_ void *p) 2863{ 2864 regmatch_slab *s = PL_regmatch_slab->next; 2865 PERL_UNUSED_ARG(p); 2866 2867 if (!s) 2868 return; 2869 PL_regmatch_slab->next = NULL; 2870 while (s) { 2871 regmatch_slab * const osl = s; 2872 s = s->next; 2873 Safefree(osl); 2874 } 2875} 2876 2877 2878#define SETREX(Re1,Re2) \ 2879 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \ 2880 Re1 = (Re2) 2881 2882STATIC I32 /* 0 failure, 1 success */ 2883S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) 2884{ 2885#if PERL_VERSION < 9 && !defined(PERL_CORE) 2886 dMY_CXT; 2887#endif 2888 dVAR; 2889 register const bool do_utf8 = PL_reg_match_utf8; 2890 const U32 uniflags = UTF8_ALLOW_DEFAULT; 2891 REGEXP *rex_sv = reginfo->prog; 2892 regexp *rex = (struct regexp *)SvANY(rex_sv); 2893 RXi_GET_DECL(rex,rexi); 2894 I32 oldsave; 2895 /* the current state. This is a cached copy of PL_regmatch_state */ 2896 register regmatch_state *st; 2897 /* cache heavy used fields of st in registers */ 2898 register regnode *scan; 2899 register regnode *next; 2900 register U32 n = 0; /* general value; init to avoid compiler warning */ 2901 register I32 ln = 0; /* len or last; init to avoid compiler warning */ 2902 register char *locinput = PL_reginput; 2903 register I32 nextchr; /* is always set to UCHARAT(locinput) */ 2904 2905 bool result = 0; /* return value of S_regmatch */ 2906 int depth = 0; /* depth of backtrack stack */ 2907 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ 2908 const U32 max_nochange_depth = 2909 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? 2910 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; 2911 regmatch_state *yes_state = NULL; /* state to pop to on success of 2912 subpattern */ 2913 /* mark_state piggy backs on the yes_state logic so that when we unwind 2914 the stack on success we can update the mark_state as we go */ 2915 regmatch_state *mark_state = NULL; /* last mark state we have seen */ 2916 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ 2917 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ 2918 U32 state_num; 2919 bool no_final = 0; /* prevent failure from backtracking? */ 2920 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ 2921 char *startpoint = PL_reginput; 2922 SV *popmark = NULL; /* are we looking for a mark? */ 2923 SV *sv_commit = NULL; /* last mark name seen in failure */ 2924 SV *sv_yes_mark = NULL; /* last mark name we have seen 2925 during a successfull match */ 2926 U32 lastopen = 0; /* last open we saw */ 2927 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; 2928 SV* const oreplsv = GvSV(PL_replgv); 2929 /* these three flags are set by various ops to signal information to 2930 * the very next op. They have a useful lifetime of exactly one loop 2931 * iteration, and are not preserved or restored by state pushes/pops 2932 */ 2933 bool sw = 0; /* the condition value in (?(cond)a|b) */ 2934 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ 2935 int logical = 0; /* the following EVAL is: 2936 0: (?{...}) 2937 1: (?(?{...})X|Y) 2938 2: (??{...}) 2939 or the following IFMATCH/UNLESSM is: 2940 false: plain (?=foo) 2941 true: used as a condition: (?(?=foo)) 2942 */ 2943#ifdef DEBUGGING 2944 GET_RE_DEBUG_FLAGS_DECL; 2945#endif 2946 2947 PERL_ARGS_ASSERT_REGMATCH; 2948 2949 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ 2950 PerlIO_printf(Perl_debug_log,"regmatch start\n"); 2951 })); 2952 /* on first ever call to regmatch, allocate first slab */ 2953 if (!PL_regmatch_slab) { 2954 Newx(PL_regmatch_slab, 1, regmatch_slab); 2955 PL_regmatch_slab->prev = NULL; 2956 PL_regmatch_slab->next = NULL; 2957 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); 2958 } 2959 2960 oldsave = PL_savestack_ix; 2961 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL); 2962 SAVEVPTR(PL_regmatch_slab); 2963 SAVEVPTR(PL_regmatch_state); 2964 2965 /* grab next free state slot */ 2966 st = ++PL_regmatch_state; 2967 if (st > SLAB_LAST(PL_regmatch_slab)) 2968 st = PL_regmatch_state = S_push_slab(aTHX); 2969 2970 /* Note that nextchr is a byte even in UTF */ 2971 nextchr = UCHARAT(locinput); 2972 scan = prog; 2973 while (scan != NULL) { 2974 2975 DEBUG_EXECUTE_r( { 2976 SV * const prop = sv_newmortal(); 2977 regnode *rnext=regnext(scan); 2978 DUMP_EXEC_POS( locinput, scan, do_utf8 ); 2979 regprop(rex, prop, scan); 2980 2981 PerlIO_printf(Perl_debug_log, 2982 "%3"IVdf":%*s%s(%"IVdf")\n", 2983 (IV)(scan - rexi->program), depth*2, "", 2984 SvPVX_const(prop), 2985 (PL_regkind[OP(scan)] == END || !rnext) ? 2986 0 : (IV)(rnext - rexi->program)); 2987 }); 2988 2989 next = scan + NEXT_OFF(scan); 2990 if (next == scan) 2991 next = NULL; 2992 state_num = OP(scan); 2993 2994 reenter_switch: 2995 2996 assert(PL_reglastparen == &rex->lastparen); 2997 assert(PL_reglastcloseparen == &rex->lastcloseparen); 2998 assert(PL_regoffs == rex->offs); 2999 3000 switch (state_num) { 3001 case BOL: 3002 if (locinput == PL_bostr) 3003 { 3004 /* reginfo->till = reginfo->bol; */ 3005 break; 3006 } 3007 sayNO; 3008 case MBOL: 3009 if (locinput == PL_bostr || 3010 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n')) 3011 { 3012 break; 3013 } 3014 sayNO; 3015 case SBOL: 3016 if (locinput == PL_bostr) 3017 break; 3018 sayNO; 3019 case GPOS: 3020 if (locinput == reginfo->ganch) 3021 break; 3022 sayNO; 3023 3024 case KEEPS: 3025 /* update the startpoint */ 3026 st->u.keeper.val = PL_regoffs[0].start; 3027 PL_reginput = locinput; 3028 PL_regoffs[0].start = locinput - PL_bostr; 3029 PUSH_STATE_GOTO(KEEPS_next, next); 3030 /*NOT-REACHED*/ 3031 case KEEPS_next_fail: 3032 /* rollback the start point change */ 3033 PL_regoffs[0].start = st->u.keeper.val; 3034 sayNO_SILENT; 3035 /*NOT-REACHED*/ 3036 case EOL: 3037 goto seol; 3038 case MEOL: 3039 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 3040 sayNO; 3041 break; 3042 case SEOL: 3043 seol: 3044 if ((nextchr || locinput < PL_regeol) && nextchr != '\n') 3045 sayNO; 3046 if (PL_regeol - locinput > 1) 3047 sayNO; 3048 break; 3049 case EOS: 3050 if (PL_regeol != locinput) 3051 sayNO; 3052 break; 3053 case SANY: 3054 if (!nextchr && locinput >= PL_regeol) 3055 sayNO; 3056 if (do_utf8) { 3057 locinput += PL_utf8skip[nextchr]; 3058 if (locinput > PL_regeol) 3059 sayNO; 3060 nextchr = UCHARAT(locinput); 3061 } 3062 else 3063 nextchr = UCHARAT(++locinput); 3064 break; 3065 case CANY: 3066 if (!nextchr && locinput >= PL_regeol) 3067 sayNO; 3068 nextchr = UCHARAT(++locinput); 3069 break; 3070 case REG_ANY: 3071 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') 3072 sayNO; 3073 if (do_utf8) { 3074 locinput += PL_utf8skip[nextchr]; 3075 if (locinput > PL_regeol) 3076 sayNO; 3077 nextchr = UCHARAT(locinput); 3078 } 3079 else 3080 nextchr = UCHARAT(++locinput); 3081 break; 3082 3083#undef ST 3084#define ST st->u.trie 3085 case TRIEC: 3086 /* In this case the charclass data is available inline so 3087 we can fail fast without a lot of extra overhead. 3088 */ 3089 if (scan->flags == EXACT || !do_utf8) { 3090 if(!ANYOF_BITMAP_TEST(scan, *locinput)) { 3091 DEBUG_EXECUTE_r( 3092 PerlIO_printf(Perl_debug_log, 3093 "%*s %sfailed to match trie start class...%s\n", 3094 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 3095 ); 3096 sayNO_SILENT; 3097 /* NOTREACHED */ 3098 } 3099 } 3100 /* FALL THROUGH */ 3101 case TRIE: 3102 { 3103 /* what type of TRIE am I? (utf8 makes this contextual) */ 3104 DECL_TRIE_TYPE(scan); 3105 3106 /* what trie are we using right now */ 3107 reg_trie_data * const trie 3108 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; 3109 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); 3110 U32 state = trie->startstate; 3111 3112 if (trie->bitmap && trie_type != trie_utf8_fold && 3113 !TRIE_BITMAP_TEST(trie,*locinput) 3114 ) { 3115 if (trie->states[ state ].wordnum) { 3116 DEBUG_EXECUTE_r( 3117 PerlIO_printf(Perl_debug_log, 3118 "%*s %smatched empty string...%s\n", 3119 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 3120 ); 3121 break; 3122 } else { 3123 DEBUG_EXECUTE_r( 3124 PerlIO_printf(Perl_debug_log, 3125 "%*s %sfailed to match trie start class...%s\n", 3126 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) 3127 ); 3128 sayNO_SILENT; 3129 } 3130 } 3131 3132 { 3133 U8 *uc = ( U8* )locinput; 3134 3135 STRLEN len = 0; 3136 STRLEN foldlen = 0; 3137 U8 *uscan = (U8*)NULL; 3138 STRLEN bufflen=0; 3139 SV *sv_accept_buff = NULL; 3140 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 3141 3142 ST.accepted = 0; /* how many accepting states we have seen */ 3143 ST.B = next; 3144 ST.jump = trie->jump; 3145 ST.me = scan; 3146 /* 3147 traverse the TRIE keeping track of all accepting states 3148 we transition through until we get to a failing node. 3149 */ 3150 3151 while ( state && uc <= (U8*)PL_regeol ) { 3152 U32 base = trie->states[ state ].trans.base; 3153 UV uvc = 0; 3154 U16 charid; 3155 /* We use charid to hold the wordnum as we don't use it 3156 for charid until after we have done the wordnum logic. 3157 We define an alias just so that the wordnum logic reads 3158 more naturally. */ 3159 3160#define got_wordnum charid 3161 got_wordnum = trie->states[ state ].wordnum; 3162 3163 if ( got_wordnum ) { 3164 if ( ! ST.accepted ) { 3165 ENTER; 3166 SAVETMPS; /* XXX is this necessary? dmq */ 3167 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN; 3168 sv_accept_buff=newSV(bufflen * 3169 sizeof(reg_trie_accepted) - 1); 3170 SvCUR_set(sv_accept_buff, 0); 3171 SvPOK_on(sv_accept_buff); 3172 sv_2mortal(sv_accept_buff); 3173 SAVETMPS; 3174 ST.accept_buff = 3175 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff ); 3176 } 3177 do { 3178 if (ST.accepted >= bufflen) { 3179 bufflen *= 2; 3180 ST.accept_buff =(reg_trie_accepted*) 3181 SvGROW(sv_accept_buff, 3182 bufflen * sizeof(reg_trie_accepted)); 3183 } 3184 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff) 3185 + sizeof(reg_trie_accepted)); 3186 3187 3188 ST.accept_buff[ST.accepted].wordnum = got_wordnum; 3189 ST.accept_buff[ST.accepted].endpos = uc; 3190 ++ST.accepted; 3191 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum])); 3192 } 3193#undef got_wordnum 3194 3195 DEBUG_TRIE_EXECUTE_r({ 3196 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 ); 3197 PerlIO_printf( Perl_debug_log, 3198 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ", 3199 2+depth * 2, "", PL_colors[4], 3200 (UV)state, (UV)ST.accepted ); 3201 }); 3202 3203 if ( base ) { 3204 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 3205 uscan, len, uvc, charid, foldlen, 3206 foldbuf, uniflags); 3207 3208 if (charid && 3209 (base + charid > trie->uniquecharcount ) 3210 && (base + charid - 1 - trie->uniquecharcount 3211 < trie->lasttrans) 3212 && trie->trans[base + charid - 1 - 3213 trie->uniquecharcount].check == state) 3214 { 3215 state = trie->trans[base + charid - 1 - 3216 trie->uniquecharcount ].next; 3217 } 3218 else { 3219 state = 0; 3220 } 3221 uc += len; 3222 3223 } 3224 else { 3225 state = 0; 3226 } 3227 DEBUG_TRIE_EXECUTE_r( 3228 PerlIO_printf( Perl_debug_log, 3229 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", 3230 charid, uvc, (UV)state, PL_colors[5] ); 3231 ); 3232 } 3233 if (!ST.accepted ) 3234 sayNO; 3235 3236 DEBUG_EXECUTE_r( 3237 PerlIO_printf( Perl_debug_log, 3238 "%*s %sgot %"IVdf" possible matches%s\n", 3239 REPORT_CODE_OFF + depth * 2, "", 3240 PL_colors[4], (IV)ST.accepted, PL_colors[5] ); 3241 ); 3242 }} 3243 goto trie_first_try; /* jump into the fail handler */ 3244 /* NOTREACHED */ 3245 case TRIE_next_fail: /* we failed - try next alterative */ 3246 if ( ST.jump) { 3247 REGCP_UNWIND(ST.cp); 3248 for (n = *PL_reglastparen; n > ST.lastparen; n--) 3249 PL_regoffs[n].end = -1; 3250 *PL_reglastparen = n; 3251 } 3252 trie_first_try: 3253 if (do_cutgroup) { 3254 do_cutgroup = 0; 3255 no_final = 0; 3256 } 3257 3258 if ( ST.jump) { 3259 ST.lastparen = *PL_reglastparen; 3260 REGCP_SET(ST.cp); 3261 } 3262 if ( ST.accepted == 1 ) { 3263 /* only one choice left - just continue */ 3264 DEBUG_EXECUTE_r({ 3265 AV *const trie_words 3266 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); 3267 SV ** const tmp = av_fetch( trie_words, 3268 ST.accept_buff[ 0 ].wordnum-1, 0 ); 3269 SV *sv= tmp ? sv_newmortal() : NULL; 3270 3271 PerlIO_printf( Perl_debug_log, 3272 "%*s %sonly one match left: #%d <%s>%s\n", 3273 REPORT_CODE_OFF+depth*2, "", PL_colors[4], 3274 ST.accept_buff[ 0 ].wordnum, 3275 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 3276 PL_colors[0], PL_colors[1], 3277 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 3278 ) 3279 : "not compiled under -Dr", 3280 PL_colors[5] ); 3281 }); 3282 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos; 3283 /* in this case we free tmps/leave before we call regmatch 3284 as we wont be using accept_buff again. */ 3285 3286 locinput = PL_reginput; 3287 nextchr = UCHARAT(locinput); 3288 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum]) 3289 scan = ST.B; 3290 else 3291 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum]; 3292 if (!has_cutgroup) { 3293 FREETMPS; 3294 LEAVE; 3295 } else { 3296 ST.accepted--; 3297 PUSH_YES_STATE_GOTO(TRIE_next, scan); 3298 } 3299 3300 continue; /* execute rest of RE */ 3301 } 3302 3303 if ( !ST.accepted-- ) { 3304 DEBUG_EXECUTE_r({ 3305 PerlIO_printf( Perl_debug_log, 3306 "%*s %sTRIE failed...%s\n", 3307 REPORT_CODE_OFF+depth*2, "", 3308 PL_colors[4], 3309 PL_colors[5] ); 3310 }); 3311 FREETMPS; 3312 LEAVE; 3313 sayNO_SILENT; 3314 /*NOTREACHED*/ 3315 } 3316 3317 /* 3318 There are at least two accepting states left. Presumably 3319 the number of accepting states is going to be low, 3320 typically two. So we simply scan through to find the one 3321 with lowest wordnum. Once we find it, we swap the last 3322 state into its place and decrement the size. We then try to 3323 match the rest of the pattern at the point where the word 3324 ends. If we succeed, control just continues along the 3325 regex; if we fail we return here to try the next accepting 3326 state 3327 */ 3328 3329 { 3330 U32 best = 0; 3331 U32 cur; 3332 for( cur = 1 ; cur <= ST.accepted ; cur++ ) { 3333 DEBUG_TRIE_EXECUTE_r( 3334 PerlIO_printf( Perl_debug_log, 3335 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n", 3336 REPORT_CODE_OFF + depth * 2, "", PL_colors[4], 3337 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur, 3338 ST.accept_buff[ cur ].wordnum, PL_colors[5] ); 3339 ); 3340 3341 if (ST.accept_buff[cur].wordnum < 3342 ST.accept_buff[best].wordnum) 3343 best = cur; 3344 } 3345 3346 DEBUG_EXECUTE_r({ 3347 AV *const trie_words 3348 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); 3349 SV ** const tmp = av_fetch( trie_words, 3350 ST.accept_buff[ best ].wordnum - 1, 0 ); 3351 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ? 3352 ST.B : 3353 ST.me + ST.jump[ST.accept_buff[best].wordnum]; 3354 SV *sv= tmp ? sv_newmortal() : NULL; 3355 3356 PerlIO_printf( Perl_debug_log, 3357 "%*s %strying alternation #%d <%s> at node #%d %s\n", 3358 REPORT_CODE_OFF+depth*2, "", PL_colors[4], 3359 ST.accept_buff[best].wordnum, 3360 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 3361 PL_colors[0], PL_colors[1], 3362 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 3363 ) : "not compiled under -Dr", 3364 REG_NODE_NUM(nextop), 3365 PL_colors[5] ); 3366 }); 3367 3368 if ( best<ST.accepted ) { 3369 reg_trie_accepted tmp = ST.accept_buff[ best ]; 3370 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ]; 3371 ST.accept_buff[ ST.accepted ] = tmp; 3372 best = ST.accepted; 3373 } 3374 PL_reginput = (char *)ST.accept_buff[ best ].endpos; 3375 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) { 3376 scan = ST.B; 3377 } else { 3378 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum]; 3379 } 3380 PUSH_YES_STATE_GOTO(TRIE_next, scan); 3381 /* NOTREACHED */ 3382 } 3383 /* NOTREACHED */ 3384 case TRIE_next: 3385 /* we dont want to throw this away, see bug 57042*/ 3386 if (oreplsv != GvSV(PL_replgv)) 3387 sv_setsv(oreplsv, GvSV(PL_replgv)); 3388 FREETMPS; 3389 LEAVE; 3390 sayYES; 3391#undef ST 3392 3393 case EXACT: { 3394 char *s = STRING(scan); 3395 ln = STR_LEN(scan); 3396 if (do_utf8 != UTF) { 3397 /* The target and the pattern have differing utf8ness. */ 3398 char *l = locinput; 3399 const char * const e = s + ln; 3400 3401 if (do_utf8) { 3402 /* The target is utf8, the pattern is not utf8. */ 3403 while (s < e) { 3404 STRLEN ulen; 3405 if (l >= PL_regeol) 3406 sayNO; 3407 if (NATIVE_TO_UNI(*(U8*)s) != 3408 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen, 3409 uniflags)) 3410 sayNO; 3411 l += ulen; 3412 s ++; 3413 } 3414 } 3415 else { 3416 /* The target is not utf8, the pattern is utf8. */ 3417 while (s < e) { 3418 STRLEN ulen; 3419 if (l >= PL_regeol) 3420 sayNO; 3421 if (NATIVE_TO_UNI(*((U8*)l)) != 3422 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen, 3423 uniflags)) 3424 sayNO; 3425 s += ulen; 3426 l ++; 3427 } 3428 } 3429 locinput = l; 3430 nextchr = UCHARAT(locinput); 3431 break; 3432 } 3433 /* The target and the pattern have the same utf8ness. */ 3434 /* Inline the first character, for speed. */ 3435 if (UCHARAT(s) != nextchr) 3436 sayNO; 3437 if (PL_regeol - locinput < ln) 3438 sayNO; 3439 if (ln > 1 && memNE(s, locinput, ln)) 3440 sayNO; 3441 locinput += ln; 3442 nextchr = UCHARAT(locinput); 3443 break; 3444 } 3445 case EXACTFL: 3446 PL_reg_flags |= RF_tainted; 3447 /* FALL THROUGH */ 3448 case EXACTF: { 3449 char * const s = STRING(scan); 3450 ln = STR_LEN(scan); 3451 3452 if (do_utf8 || UTF) { 3453 /* Either target or the pattern are utf8. */ 3454 const char * const l = locinput; 3455 char *e = PL_regeol; 3456 3457 if (ibcmp_utf8(s, 0, ln, (bool)UTF, 3458 l, &e, 0, do_utf8)) { 3459 /* One more case for the sharp s: 3460 * pack("U0U*", 0xDF) =~ /ss/i, 3461 * the 0xC3 0x9F are the UTF-8 3462 * byte sequence for the U+00DF. */ 3463 3464 if (!(do_utf8 && 3465 toLOWER(s[0]) == 's' && 3466 ln >= 2 && 3467 toLOWER(s[1]) == 's' && 3468 (U8)l[0] == 0xC3 && 3469 e - l >= 2 && 3470 (U8)l[1] == 0x9F)) 3471 sayNO; 3472 } 3473 locinput = e; 3474 nextchr = UCHARAT(locinput); 3475 break; 3476 } 3477 3478 /* Neither the target and the pattern are utf8. */ 3479 3480 /* Inline the first character, for speed. */ 3481 if (UCHARAT(s) != nextchr && 3482 UCHARAT(s) != ((OP(scan) == EXACTF) 3483 ? PL_fold : PL_fold_locale)[nextchr]) 3484 sayNO; 3485 if (PL_regeol - locinput < ln) 3486 sayNO; 3487 if (ln > 1 && (OP(scan) == EXACTF 3488 ? ibcmp(s, locinput, ln) 3489 : ibcmp_locale(s, locinput, ln))) 3490 sayNO; 3491 locinput += ln; 3492 nextchr = UCHARAT(locinput); 3493 break; 3494 } 3495 case BOUNDL: 3496 case NBOUNDL: 3497 PL_reg_flags |= RF_tainted; 3498 /* FALL THROUGH */ 3499 case BOUND: 3500 case NBOUND: 3501 /* was last char in word? */ 3502 if (do_utf8) { 3503 if (locinput == PL_bostr) 3504 ln = '\n'; 3505 else { 3506 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr); 3507 3508 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); 3509 } 3510 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 3511 ln = isALNUM_uni(ln); 3512 LOAD_UTF8_CHARCLASS_ALNUM(); 3513 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8); 3514 } 3515 else { 3516 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); 3517 n = isALNUM_LC_utf8((U8*)locinput); 3518 } 3519 } 3520 else { 3521 ln = (locinput != PL_bostr) ? 3522 UCHARAT(locinput - 1) : '\n'; 3523 if (OP(scan) == BOUND || OP(scan) == NBOUND) { 3524 ln = isALNUM(ln); 3525 n = isALNUM(nextchr); 3526 } 3527 else { 3528 ln = isALNUM_LC(ln); 3529 n = isALNUM_LC(nextchr); 3530 } 3531 } 3532 if (((!ln) == (!n)) == (OP(scan) == BOUND || 3533 OP(scan) == BOUNDL)) 3534 sayNO; 3535 break; 3536 case ANYOF: 3537 if (do_utf8) { 3538 STRLEN inclasslen = PL_regeol - locinput; 3539 3540 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8)) 3541 goto anyof_fail; 3542 if (locinput >= PL_regeol) 3543 sayNO; 3544 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput); 3545 nextchr = UCHARAT(locinput); 3546 break; 3547 } 3548 else { 3549 if (nextchr < 0) 3550 nextchr = UCHARAT(locinput); 3551 if (!REGINCLASS(rex, scan, (U8*)locinput)) 3552 goto anyof_fail; 3553 if (!nextchr && locinput >= PL_regeol) 3554 sayNO; 3555 nextchr = UCHARAT(++locinput); 3556 break; 3557 } 3558 anyof_fail: 3559 /* If we might have the case of the German sharp s 3560 * in a casefolding Unicode character class. */ 3561 3562 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) { 3563 locinput += SHARP_S_SKIP; 3564 nextchr = UCHARAT(locinput); 3565 } 3566 else 3567 sayNO; 3568 break; 3569 /* Special char classes - The defines start on line 129 or so */ 3570 CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); 3571 CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); 3572 3573 CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); 3574 CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); 3575 3576 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); 3577 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); 3578 3579 case CLUMP: /* Match \X: logical Unicode character. This is defined as 3580 a Unicode extended Grapheme Cluster */ 3581 /* From http://www.unicode.org/reports/tr29 (5.2 version). An 3582 extended Grapheme Cluster is: 3583 3584 CR LF 3585 | Prepend* Begin Extend* 3586 | . 3587 3588 Begin is (Hangul-syllable | ! Control) 3589 Extend is (Grapheme_Extend | Spacing_Mark) 3590 Control is [ GCB_Control CR LF ] 3591 3592 The discussion below shows how the code for CLUMP is derived 3593 from this regex. Note that most of these concepts are from 3594 property values of the Grapheme Cluster Boundary (GCB) property. 3595 No code point can have multiple property values for a given 3596 property. Thus a code point in Prepend can't be in Control, but 3597 it must be in !Control. This is why Control above includes 3598 GCB_Control plus CR plus LF. The latter two are used in the GCB 3599 property separately, and so can't be in GCB_Control, even though 3600 they logically are controls. Control is not the same as gc=cc, 3601 but includes format and other characters as well. 3602 3603 The Unicode definition of Hangul-syllable is: 3604 L+ 3605 | (L* ( ( V | LV ) V* | LVT ) T*) 3606 | T+ 3607 ) 3608 Each of these is a value for the GCB property, and hence must be 3609 disjoint, so the order they are tested is immaterial, so the 3610 above can safely be changed to 3611 T+ 3612 | L+ 3613 | (L* ( LVT | ( V | LV ) V*) T*) 3614 3615 The last two terms can be combined like this: 3616 L* ( L 3617 | (( LVT | ( V | LV ) V*) T*)) 3618 3619 And refactored into this: 3620 L* (L | LVT T* | V V* T* | LV V* T*) 3621 3622 That means that if we have seen any L's at all we can quit 3623 there, but if the next character is a LVT, a V or and LV we 3624 should keep going. 3625 3626 There is a subtlety with Prepend* which showed up in testing. 3627 Note that the Begin, and only the Begin is required in: 3628 | Prepend* Begin Extend* 3629 Also, Begin contains '! Control'. A Prepend must be a '! 3630 Control', which means it must be a Begin. What it comes down to 3631 is that if we match Prepend* and then find no suitable Begin 3632 afterwards, that if we backtrack the last Prepend, that one will 3633 be a suitable Begin. 3634 */ 3635 3636 if (locinput >= PL_regeol) 3637 sayNO; 3638 if (! do_utf8) { 3639 3640 /* Match either CR LF or '.', as all the other possibilities 3641 * require utf8 */ 3642 locinput++; /* Match the . or CR */ 3643 if (nextchr == '\r' 3644 && locinput < PL_regeol 3645 && UCHARAT(locinput) == '\n') locinput++; 3646 } 3647 else { 3648 3649 /* Utf8: See if is ( CR LF ); already know that locinput < 3650 * PL_regeol, so locinput+1 is in bounds */ 3651 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') { 3652 locinput += 2; 3653 } 3654 else { 3655 /* In case have to backtrack to beginning, then match '.' */ 3656 char *starting = locinput; 3657 3658 /* In case have to backtrack the last prepend */ 3659 char *previous_prepend = 0; 3660 3661 LOAD_UTF8_CHARCLASS_GCB(); 3662 3663 /* Match (prepend)* */ 3664 while (locinput < PL_regeol 3665 && swash_fetch(PL_utf8_X_prepend, 3666 (U8*)locinput, do_utf8)) 3667 { 3668 previous_prepend = locinput; 3669 locinput += UTF8SKIP(locinput); 3670 } 3671 3672 /* As noted above, if we matched a prepend character, but 3673 * the next thing won't match, back off the last prepend we 3674 * matched, as it is guaranteed to match the begin */ 3675 if (previous_prepend 3676 && (locinput >= PL_regeol 3677 || ! swash_fetch(PL_utf8_X_begin, 3678 (U8*)locinput, do_utf8))) 3679 { 3680 locinput = previous_prepend; 3681 } 3682 3683 /* Note that here we know PL_regeol > locinput, as we 3684 * tested that upon input to this switch case, and if we 3685 * moved locinput forward, we tested the result just above 3686 * and it either passed, or we backed off so that it will 3687 * now pass */ 3688 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) { 3689 3690 /* Here did not match the required 'Begin' in the 3691 * second term. So just match the very first 3692 * character, the '.' of the final term of the regex */ 3693 locinput = starting + UTF8SKIP(starting); 3694 } else { 3695 3696 /* Here is the beginning of a character that can have 3697 * an extender. It is either a hangul syllable, or a 3698 * non-control */ 3699 if (swash_fetch(PL_utf8_X_non_hangul, 3700 (U8*)locinput, do_utf8)) 3701 { 3702 3703 /* Here not a Hangul syllable, must be a 3704 * ('! * Control') */ 3705 locinput += UTF8SKIP(locinput); 3706 } else { 3707 3708 /* Here is a Hangul syllable. It can be composed 3709 * of several individual characters. One 3710 * possibility is T+ */ 3711 if (swash_fetch(PL_utf8_X_T, 3712 (U8*)locinput, do_utf8)) 3713 { 3714 while (locinput < PL_regeol 3715 && swash_fetch(PL_utf8_X_T, 3716 (U8*)locinput, do_utf8)) 3717 { 3718 locinput += UTF8SKIP(locinput); 3719 } 3720 } else { 3721 3722 /* Here, not T+, but is a Hangul. That means 3723 * it is one of the others: L, LV, LVT or V, 3724 * and matches: 3725 * L* (L | LVT T* | V V* T* | LV V* T*) */ 3726 3727 /* Match L* */ 3728 while (locinput < PL_regeol 3729 && swash_fetch(PL_utf8_X_L, 3730 (U8*)locinput, do_utf8)) 3731 { 3732 locinput += UTF8SKIP(locinput); 3733 } 3734 3735 /* Here, have exhausted L*. If the next 3736 * character is not an LV, LVT nor V, it means 3737 * we had to have at least one L, so matches L+ 3738 * in the original equation, we have a complete 3739 * hangul syllable. Are done. */ 3740 3741 if (locinput < PL_regeol 3742 && swash_fetch(PL_utf8_X_LV_LVT_V, 3743 (U8*)locinput, do_utf8)) 3744 { 3745 3746 /* Otherwise keep going. Must be LV, LVT 3747 * or V. See if LVT */ 3748 if (swash_fetch(PL_utf8_X_LVT, 3749 (U8*)locinput, do_utf8)) 3750 { 3751 locinput += UTF8SKIP(locinput); 3752 } else { 3753 3754 /* Must be V or LV. Take it, then 3755 * match V* */ 3756 locinput += UTF8SKIP(locinput); 3757 while (locinput < PL_regeol 3758 && swash_fetch(PL_utf8_X_V, 3759 (U8*)locinput, do_utf8)) 3760 { 3761 locinput += UTF8SKIP(locinput); 3762 } 3763 } 3764 3765 /* And any of LV, LVT, or V can be followed 3766 * by T* */ 3767 while (locinput < PL_regeol 3768 && swash_fetch(PL_utf8_X_T, 3769 (U8*)locinput, 3770 do_utf8)) 3771 { 3772 locinput += UTF8SKIP(locinput); 3773 } 3774 } 3775 } 3776 } 3777 3778 /* Match any extender */ 3779 while (locinput < PL_regeol 3780 && swash_fetch(PL_utf8_X_extend, 3781 (U8*)locinput, do_utf8)) 3782 { 3783 locinput += UTF8SKIP(locinput); 3784 } 3785 } 3786 } 3787 if (locinput > PL_regeol) sayNO; 3788 } 3789 nextchr = UCHARAT(locinput); 3790 break; 3791 3792 case NREFFL: 3793 { 3794 char *s; 3795 char type; 3796 PL_reg_flags |= RF_tainted; 3797 /* FALL THROUGH */ 3798 case NREF: 3799 case NREFF: 3800 type = OP(scan); 3801 n = reg_check_named_buff_matched(rex,scan); 3802 3803 if ( n ) { 3804 type = REF + ( type - NREF ); 3805 goto do_ref; 3806 } else { 3807 sayNO; 3808 } 3809 /* unreached */ 3810 case REFFL: 3811 PL_reg_flags |= RF_tainted; 3812 /* FALL THROUGH */ 3813 case REF: 3814 case REFF: 3815 n = ARG(scan); /* which paren pair */ 3816 type = OP(scan); 3817 do_ref: 3818 ln = PL_regoffs[n].start; 3819 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 3820 if (*PL_reglastparen < n || ln == -1) 3821 sayNO; /* Do not match unless seen CLOSEn. */ 3822 if (ln == PL_regoffs[n].end) 3823 break; 3824 3825 s = PL_bostr + ln; 3826 if (do_utf8 && type != REF) { /* REF can do byte comparison */ 3827 char *l = locinput; 3828 const char *e = PL_bostr + PL_regoffs[n].end; 3829 /* 3830 * Note that we can't do the "other character" lookup trick as 3831 * in the 8-bit case (no pun intended) because in Unicode we 3832 * have to map both upper and title case to lower case. 3833 */ 3834 if (type == REFF) { 3835 while (s < e) { 3836 STRLEN ulen1, ulen2; 3837 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; 3838 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; 3839 3840 if (l >= PL_regeol) 3841 sayNO; 3842 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1); 3843 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2); 3844 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) 3845 sayNO; 3846 s += ulen1; 3847 l += ulen2; 3848 } 3849 } 3850 locinput = l; 3851 nextchr = UCHARAT(locinput); 3852 break; 3853 } 3854 3855 /* Inline the first character, for speed. */ 3856 if (UCHARAT(s) != nextchr && 3857 (type == REF || 3858 (UCHARAT(s) != (type == REFF 3859 ? PL_fold : PL_fold_locale)[nextchr]))) 3860 sayNO; 3861 ln = PL_regoffs[n].end - ln; 3862 if (locinput + ln > PL_regeol) 3863 sayNO; 3864 if (ln > 1 && (type == REF 3865 ? memNE(s, locinput, ln) 3866 : (type == REFF 3867 ? ibcmp(s, locinput, ln) 3868 : ibcmp_locale(s, locinput, ln)))) 3869 sayNO; 3870 locinput += ln; 3871 nextchr = UCHARAT(locinput); 3872 break; 3873 } 3874 case NOTHING: 3875 case TAIL: 3876 break; 3877 case BACK: 3878 break; 3879 3880#undef ST 3881#define ST st->u.eval 3882 { 3883 SV *ret; 3884 REGEXP *re_sv; 3885 regexp *re; 3886 regexp_internal *rei; 3887 regnode *startpoint; 3888 3889 case GOSTART: 3890 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ 3891 if (cur_eval && cur_eval->locinput==locinput) { 3892 if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) 3893 Perl_croak(aTHX_ "Infinite recursion in regex"); 3894 if ( ++nochange_depth > max_nochange_depth ) 3895 Perl_croak(aTHX_ 3896 "Pattern subroutine nesting without pos change" 3897 " exceeded limit in regex"); 3898 } else { 3899 nochange_depth = 0; 3900 } 3901 re_sv = rex_sv; 3902 re = rex; 3903 rei = rexi; 3904 (void)ReREFCNT_inc(rex_sv); 3905 if (OP(scan)==GOSUB) { 3906 startpoint = scan + ARG2L(scan); 3907 ST.close_paren = ARG(scan); 3908 } else { 3909 startpoint = rei->program+1; 3910 ST.close_paren = 0; 3911 } 3912 goto eval_recurse_doit; 3913 /* NOTREACHED */ 3914 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ 3915 if (cur_eval && cur_eval->locinput==locinput) { 3916 if ( ++nochange_depth > max_nochange_depth ) 3917 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); 3918 } else { 3919 nochange_depth = 0; 3920 } 3921 { 3922 /* execute the code in the {...} */ 3923 dSP; 3924 SV ** const before = SP; 3925 OP_4tree * const oop = PL_op; 3926 COP * const ocurcop = PL_curcop; 3927 PAD *old_comppad; 3928 char *saved_regeol = PL_regeol; 3929 3930 n = ARG(scan); 3931 PL_op = (OP_4tree*)rexi->data->data[n]; 3932 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, 3933 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); 3934 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]); 3935 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; 3936 3937 if (sv_yes_mark) { 3938 SV *sv_mrk = get_sv("REGMARK", 1); 3939 sv_setsv(sv_mrk, sv_yes_mark); 3940 } 3941 3942 CALLRUNOPS(aTHX); /* Scalar context. */ 3943 SPAGAIN; 3944 if (SP == before) 3945 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 3946 else { 3947 ret = POPs; 3948 PUTBACK; 3949 } 3950 3951 PL_op = oop; 3952 PAD_RESTORE_LOCAL(old_comppad); 3953 PL_curcop = ocurcop; 3954 PL_regeol = saved_regeol; 3955 if (!logical) { 3956 /* /(?{...})/ */ 3957 sv_setsv(save_scalar(PL_replgv), ret); 3958 break; 3959 } 3960 } 3961 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */ 3962 logical = 0; 3963 { 3964 /* extract RE object from returned value; compiling if 3965 * necessary */ 3966 MAGIC *mg = NULL; 3967 REGEXP *rx = NULL; 3968 3969 if (SvROK(ret)) { 3970 SV *const sv = SvRV(ret); 3971 3972 if (SvTYPE(sv) == SVt_REGEXP) { 3973 rx = (REGEXP*) sv; 3974 } else if (SvSMAGICAL(sv)) { 3975 mg = mg_find(sv, PERL_MAGIC_qr); 3976 assert(mg); 3977 } 3978 } else if (SvTYPE(ret) == SVt_REGEXP) { 3979 rx = (REGEXP*) ret; 3980 } else if (SvSMAGICAL(ret)) { 3981 if (SvGMAGICAL(ret)) { 3982 /* I don't believe that there is ever qr magic 3983 here. */ 3984 assert(!mg_find(ret, PERL_MAGIC_qr)); 3985 sv_unmagic(ret, PERL_MAGIC_qr); 3986 } 3987 else { 3988 mg = mg_find(ret, PERL_MAGIC_qr); 3989 /* testing suggests mg only ends up non-NULL for 3990 scalars who were upgraded and compiled in the 3991 else block below. In turn, this is only 3992 triggered in the "postponed utf8 string" tests 3993 in t/op/pat.t */ 3994 } 3995 } 3996 3997 if (mg) { 3998 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/ 3999 assert(rx); 4000 } 4001 if (rx) { 4002 rx = reg_temp_copy(NULL, rx); 4003 } 4004 else { 4005 U32 pm_flags = 0; 4006 const I32 osize = PL_regsize; 4007 4008 if (DO_UTF8(ret)) { 4009 assert (SvUTF8(ret)); 4010 } else if (SvUTF8(ret)) { 4011 /* Not doing UTF-8, despite what the SV says. Is 4012 this only if we're trapped in use 'bytes'? */ 4013 /* Make a copy of the octet sequence, but without 4014 the flag on, as the compiler now honours the 4015 SvUTF8 flag on ret. */ 4016 STRLEN len; 4017 const char *const p = SvPV(ret, len); 4018 ret = newSVpvn_flags(p, len, SVs_TEMP); 4019 } 4020 rx = CALLREGCOMP(ret, pm_flags); 4021 if (!(SvFLAGS(ret) 4022 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY 4023 | SVs_GMG))) { 4024 /* This isn't a first class regexp. Instead, it's 4025 caching a regexp onto an existing, Perl visible 4026 scalar. */ 4027 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0); 4028 } 4029 PL_regsize = osize; 4030 } 4031 re_sv = rx; 4032 re = (struct regexp *)SvANY(rx); 4033 } 4034 RXp_MATCH_COPIED_off(re); 4035 re->subbeg = rex->subbeg; 4036 re->sublen = rex->sublen; 4037 rei = RXi_GET(re); 4038 DEBUG_EXECUTE_r( 4039 debug_start_match(re_sv, do_utf8, locinput, PL_regeol, 4040 "Matching embedded"); 4041 ); 4042 startpoint = rei->program + 1; 4043 ST.close_paren = 0; /* only used for GOSUB */ 4044 /* borrowed from regtry */ 4045 if (PL_reg_start_tmpl <= re->nparens) { 4046 PL_reg_start_tmpl = re->nparens*3/2 + 3; 4047 if(PL_reg_start_tmp) 4048 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 4049 else 4050 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); 4051 } 4052 4053 eval_recurse_doit: /* Share code with GOSUB below this line */ 4054 /* run the pattern returned from (??{...}) */ 4055 ST.cp = regcppush(0); /* Save *all* the positions. */ 4056 REGCP_SET(ST.lastcp); 4057 4058 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */ 4059 4060 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */ 4061 PL_reglastparen = &re->lastparen; 4062 PL_reglastcloseparen = &re->lastcloseparen; 4063 re->lastparen = 0; 4064 re->lastcloseparen = 0; 4065 4066 PL_reginput = locinput; 4067 PL_regsize = 0; 4068 4069 /* XXXX This is too dramatic a measure... */ 4070 PL_reg_maxiter = 0; 4071 4072 ST.toggle_reg_flags = PL_reg_flags; 4073 if (RX_UTF8(re_sv)) 4074 PL_reg_flags |= RF_utf8; 4075 else 4076 PL_reg_flags &= ~RF_utf8; 4077 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */ 4078 4079 ST.prev_rex = rex_sv; 4080 ST.prev_curlyx = cur_curlyx; 4081 SETREX(rex_sv,re_sv); 4082 rex = re; 4083 rexi = rei; 4084 cur_curlyx = NULL; 4085 ST.B = next; 4086 ST.prev_eval = cur_eval; 4087 cur_eval = st; 4088 /* now continue from first node in postoned RE */ 4089 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint); 4090 /* NOTREACHED */ 4091 } 4092 /* logical is 1, /(?(?{...})X|Y)/ */ 4093 sw = (bool)SvTRUE(ret); 4094 logical = 0; 4095 break; 4096 } 4097 4098 case EVAL_AB: /* cleanup after a successful (??{A})B */ 4099 /* note: this is called twice; first after popping B, then A */ 4100 PL_reg_flags ^= ST.toggle_reg_flags; 4101 ReREFCNT_dec(rex_sv); 4102 SETREX(rex_sv,ST.prev_rex); 4103 rex = (struct regexp *)SvANY(rex_sv); 4104 rexi = RXi_GET(rex); 4105 regcpblow(ST.cp); 4106 cur_eval = ST.prev_eval; 4107 cur_curlyx = ST.prev_curlyx; 4108 4109 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ 4110 PL_reglastparen = &rex->lastparen; 4111 PL_reglastcloseparen = &rex->lastcloseparen; 4112 /* also update PL_regoffs */ 4113 PL_regoffs = rex->offs; 4114 4115 /* XXXX This is too dramatic a measure... */ 4116 PL_reg_maxiter = 0; 4117 if ( nochange_depth ) 4118 nochange_depth--; 4119 sayYES; 4120 4121 4122 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ 4123 /* note: this is called twice; first after popping B, then A */ 4124 PL_reg_flags ^= ST.toggle_reg_flags; 4125 ReREFCNT_dec(rex_sv); 4126 SETREX(rex_sv,ST.prev_rex); 4127 rex = (struct regexp *)SvANY(rex_sv); 4128 rexi = RXi_GET(rex); 4129 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ 4130 PL_reglastparen = &rex->lastparen; 4131 PL_reglastcloseparen = &rex->lastcloseparen; 4132 4133 PL_reginput = locinput; 4134 REGCP_UNWIND(ST.lastcp); 4135 regcppop(rex); 4136 cur_eval = ST.prev_eval; 4137 cur_curlyx = ST.prev_curlyx; 4138 /* XXXX This is too dramatic a measure... */ 4139 PL_reg_maxiter = 0; 4140 if ( nochange_depth ) 4141 nochange_depth--; 4142 sayNO_SILENT; 4143#undef ST 4144 4145 case OPEN: 4146 n = ARG(scan); /* which paren pair */ 4147 PL_reg_start_tmp[n] = locinput; 4148 if (n > PL_regsize) 4149 PL_regsize = n; 4150 lastopen = n; 4151 break; 4152 case CLOSE: 4153 n = ARG(scan); /* which paren pair */ 4154 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr; 4155 PL_regoffs[n].end = locinput - PL_bostr; 4156 /*if (n > PL_regsize) 4157 PL_regsize = n;*/ 4158 if (n > *PL_reglastparen) 4159 *PL_reglastparen = n; 4160 *PL_reglastcloseparen = n; 4161 if (cur_eval && cur_eval->u.eval.close_paren == n) { 4162 goto fake_end; 4163 } 4164 break; 4165 case ACCEPT: 4166 if (ARG(scan)){ 4167 regnode *cursor; 4168 for (cursor=scan; 4169 cursor && OP(cursor)!=END; 4170 cursor=regnext(cursor)) 4171 { 4172 if ( OP(cursor)==CLOSE ){ 4173 n = ARG(cursor); 4174 if ( n <= lastopen ) { 4175 PL_regoffs[n].start 4176 = PL_reg_start_tmp[n] - PL_bostr; 4177 PL_regoffs[n].end = locinput - PL_bostr; 4178 /*if (n > PL_regsize) 4179 PL_regsize = n;*/ 4180 if (n > *PL_reglastparen) 4181 *PL_reglastparen = n; 4182 *PL_reglastcloseparen = n; 4183 if ( n == ARG(scan) || (cur_eval && 4184 cur_eval->u.eval.close_paren == n)) 4185 break; 4186 } 4187 } 4188 } 4189 } 4190 goto fake_end; 4191 /*NOTREACHED*/ 4192 case GROUPP: 4193 n = ARG(scan); /* which paren pair */ 4194 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1); 4195 break; 4196 case NGROUPP: 4197 /* reg_check_named_buff_matched returns 0 for no match */ 4198 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan)); 4199 break; 4200 case INSUBP: 4201 n = ARG(scan); 4202 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); 4203 break; 4204 case DEFINEP: 4205 sw = 0; 4206 break; 4207 case IFTHEN: 4208 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ 4209 if (sw) 4210 next = NEXTOPER(NEXTOPER(scan)); 4211 else { 4212 next = scan + ARG(scan); 4213 if (OP(next) == IFTHEN) /* Fake one. */ 4214 next = NEXTOPER(NEXTOPER(next)); 4215 } 4216 break; 4217 case LOGICAL: 4218 logical = scan->flags; 4219 break; 4220 4221/******************************************************************* 4222 4223The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ 4224pattern, where A and B are subpatterns. (For simple A, CURLYM or 4225STAR/PLUS/CURLY/CURLYN are used instead.) 4226 4227A*B is compiled as <CURLYX><A><WHILEM><B> 4228 4229On entry to the subpattern, CURLYX is called. This pushes a CURLYX 4230state, which contains the current count, initialised to -1. It also sets 4231cur_curlyx to point to this state, with any previous value saved in the 4232state block. 4233 4234CURLYX then jumps straight to the WHILEM op, rather than executing A, 4235since the pattern may possibly match zero times (i.e. it's a while {} loop 4236rather than a do {} while loop). 4237 4238Each entry to WHILEM represents a successful match of A. The count in the 4239CURLYX block is incremented, another WHILEM state is pushed, and execution 4240passes to A or B depending on greediness and the current count. 4241 4242For example, if matching against the string a1a2a3b (where the aN are 4243substrings that match /A/), then the match progresses as follows: (the 4244pushed states are interspersed with the bits of strings matched so far): 4245 4246 <CURLYX cnt=-1> 4247 <CURLYX cnt=0><WHILEM> 4248 <CURLYX cnt=1><WHILEM> a1 <WHILEM> 4249 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM> 4250 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> 4251 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b 4252 4253(Contrast this with something like CURLYM, which maintains only a single 4254backtrack state: 4255 4256 <CURLYM cnt=0> a1 4257 a1 <CURLYM cnt=1> a2 4258 a1 a2 <CURLYM cnt=2> a3 4259 a1 a2 a3 <CURLYM cnt=3> b 4260) 4261 4262Each WHILEM state block marks a point to backtrack to upon partial failure 4263of A or B, and also contains some minor state data related to that 4264iteration. The CURLYX block, pointed to by cur_curlyx, contains the 4265overall state, such as the count, and pointers to the A and B ops. 4266 4267This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx 4268must always point to the *current* CURLYX block, the rules are: 4269 4270When executing CURLYX, save the old cur_curlyx in the CURLYX state block, 4271and set cur_curlyx to point the new block. 4272 4273When popping the CURLYX block after a successful or unsuccessful match, 4274restore the previous cur_curlyx. 4275 4276When WHILEM is about to execute B, save the current cur_curlyx, and set it 4277to the outer one saved in the CURLYX block. 4278 4279When popping the WHILEM block after a successful or unsuccessful B match, 4280restore the previous cur_curlyx. 4281 4282Here's an example for the pattern (AI* BI)*BO 4283I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: 4284 4285cur_ 4286curlyx backtrack stack 4287------ --------------- 4288NULL 4289CO <CO prev=NULL> <WO> 4290CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 4291CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 4292NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo 4293 4294At this point the pattern succeeds, and we work back down the stack to 4295clean up, restoring as we go: 4296 4297CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 4298CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 4299CO <CO prev=NULL> <WO> 4300NULL 4301 4302*******************************************************************/ 4303 4304#define ST st->u.curlyx 4305 4306 case CURLYX: /* start of /A*B/ (for complex A) */ 4307 { 4308 /* No need to save/restore up to this paren */ 4309 I32 parenfloor = scan->flags; 4310 4311 assert(next); /* keep Coverity happy */ 4312 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ 4313 next += ARG(next); 4314 4315 /* XXXX Probably it is better to teach regpush to support 4316 parenfloor > PL_regsize... */ 4317 if (parenfloor > (I32)*PL_reglastparen) 4318 parenfloor = *PL_reglastparen; /* Pessimization... */ 4319 4320 ST.prev_curlyx= cur_curlyx; 4321 cur_curlyx = st; 4322 ST.cp = PL_savestack_ix; 4323 4324 /* these fields contain the state of the current curly. 4325 * they are accessed by subsequent WHILEMs */ 4326 ST.parenfloor = parenfloor; 4327 ST.min = ARG1(scan); 4328 ST.max = ARG2(scan); 4329 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS; 4330 ST.B = next; 4331 ST.minmod = minmod; 4332 minmod = 0; 4333 ST.count = -1; /* this will be updated by WHILEM */ 4334 ST.lastloc = NULL; /* this will be updated by WHILEM */ 4335 4336 PL_reginput = locinput; 4337 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next)); 4338 /* NOTREACHED */ 4339 } 4340 4341 case CURLYX_end: /* just finished matching all of A*B */ 4342 cur_curlyx = ST.prev_curlyx; 4343 sayYES; 4344 /* NOTREACHED */ 4345 4346 case CURLYX_end_fail: /* just failed to match all of A*B */ 4347 regcpblow(ST.cp); 4348 cur_curlyx = ST.prev_curlyx; 4349 sayNO; 4350 /* NOTREACHED */ 4351 4352 4353#undef ST 4354#define ST st->u.whilem 4355 4356 case WHILEM: /* just matched an A in /A*B/ (for complex A) */ 4357 { 4358 /* see the discussion above about CURLYX/WHILEM */ 4359 I32 n; 4360 assert(cur_curlyx); /* keep Coverity happy */ 4361 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ 4362 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; 4363 ST.cache_offset = 0; 4364 ST.cache_mask = 0; 4365 4366 PL_reginput = locinput; 4367 4368 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4369 "%*s whilem: matched %ld out of %ld..%ld\n", 4370 REPORT_CODE_OFF+depth*2, "", (long)n, 4371 (long)cur_curlyx->u.curlyx.min, 4372 (long)cur_curlyx->u.curlyx.max) 4373 ); 4374 4375 /* First just match a string of min A's. */ 4376 4377 if (n < cur_curlyx->u.curlyx.min) { 4378 cur_curlyx->u.curlyx.lastloc = locinput; 4379 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A); 4380 /* NOTREACHED */ 4381 } 4382 4383 /* If degenerate A matches "", assume A done. */ 4384 4385 if (locinput == cur_curlyx->u.curlyx.lastloc) { 4386 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4387 "%*s whilem: empty match detected, trying continuation...\n", 4388 REPORT_CODE_OFF+depth*2, "") 4389 ); 4390 goto do_whilem_B_max; 4391 } 4392 4393 /* super-linear cache processing */ 4394 4395 if (scan->flags) { 4396 4397 if (!PL_reg_maxiter) { 4398 /* start the countdown: Postpone detection until we 4399 * know the match is not *that* much linear. */ 4400 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4); 4401 /* possible overflow for long strings and many CURLYX's */ 4402 if (PL_reg_maxiter < 0) 4403 PL_reg_maxiter = I32_MAX; 4404 PL_reg_leftiter = PL_reg_maxiter; 4405 } 4406 4407 if (PL_reg_leftiter-- == 0) { 4408 /* initialise cache */ 4409 const I32 size = (PL_reg_maxiter + 7)/8; 4410 if (PL_reg_poscache) { 4411 if ((I32)PL_reg_poscache_size < size) { 4412 Renew(PL_reg_poscache, size, char); 4413 PL_reg_poscache_size = size; 4414 } 4415 Zero(PL_reg_poscache, size, char); 4416 } 4417 else { 4418 PL_reg_poscache_size = size; 4419 Newxz(PL_reg_poscache, size, char); 4420 } 4421 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4422 "%swhilem: Detected a super-linear match, switching on caching%s...\n", 4423 PL_colors[4], PL_colors[5]) 4424 ); 4425 } 4426 4427 if (PL_reg_leftiter < 0) { 4428 /* have we already failed at this position? */ 4429 I32 offset, mask; 4430 offset = (scan->flags & 0xf) - 1 4431 + (locinput - PL_bostr) * (scan->flags>>4); 4432 mask = 1 << (offset % 8); 4433 offset /= 8; 4434 if (PL_reg_poscache[offset] & mask) { 4435 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, 4436 "%*s whilem: (cache) already tried at this position...\n", 4437 REPORT_CODE_OFF+depth*2, "") 4438 ); 4439 sayNO; /* cache records failure */ 4440 } 4441 ST.cache_offset = offset; 4442 ST.cache_mask = mask; 4443 } 4444 } 4445 4446 /* Prefer B over A for minimal matching. */ 4447 4448 if (cur_curlyx->u.curlyx.minmod) { 4449 ST.save_curlyx = cur_curlyx; 4450 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 4451 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor); 4452 REGCP_SET(ST.lastcp); 4453 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B); 4454 /* NOTREACHED */ 4455 } 4456 4457 /* Prefer A over B for maximal matching. */ 4458 4459 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */ 4460 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); 4461 cur_curlyx->u.curlyx.lastloc = locinput; 4462 REGCP_SET(ST.lastcp); 4463 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A); 4464 /* NOTREACHED */ 4465 } 4466 goto do_whilem_B_max; 4467 } 4468 /* NOTREACHED */ 4469 4470 case WHILEM_B_min: /* just matched B in a minimal match */ 4471 case WHILEM_B_max: /* just matched B in a maximal match */ 4472 cur_curlyx = ST.save_curlyx; 4473 sayYES; 4474 /* NOTREACHED */ 4475 4476 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ 4477 cur_curlyx = ST.save_curlyx; 4478 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 4479 cur_curlyx->u.curlyx.count--; 4480 CACHEsayNO; 4481 /* NOTREACHED */ 4482 4483 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ 4484 REGCP_UNWIND(ST.lastcp); 4485 regcppop(rex); 4486 /* FALL THROUGH */ 4487 case WHILEM_A_pre_fail: /* just failed to match even minimal A */ 4488 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 4489 cur_curlyx->u.curlyx.count--; 4490 CACHEsayNO; 4491 /* NOTREACHED */ 4492 4493 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ 4494 REGCP_UNWIND(ST.lastcp); 4495 regcppop(rex); /* Restore some previous $<digit>s? */ 4496 PL_reginput = locinput; 4497 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 4498 "%*s whilem: failed, trying continuation...\n", 4499 REPORT_CODE_OFF+depth*2, "") 4500 ); 4501 do_whilem_B_max: 4502 if (cur_curlyx->u.curlyx.count >= REG_INFTY 4503 && ckWARN(WARN_REGEXP) 4504 && !(PL_reg_flags & RF_warned)) 4505 { 4506 PL_reg_flags |= RF_warned; 4507 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", 4508 "Complex regular subexpression recursion", 4509 REG_INFTY - 1); 4510 } 4511 4512 /* now try B */ 4513 ST.save_curlyx = cur_curlyx; 4514 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 4515 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B); 4516 /* NOTREACHED */ 4517 4518 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ 4519 cur_curlyx = ST.save_curlyx; 4520 REGCP_UNWIND(ST.lastcp); 4521 regcppop(rex); 4522 4523 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) { 4524 /* Maximum greed exceeded */ 4525 if (cur_curlyx->u.curlyx.count >= REG_INFTY 4526 && ckWARN(WARN_REGEXP) 4527 && !(PL_reg_flags & RF_warned)) 4528 { 4529 PL_reg_flags |= RF_warned; 4530 Perl_warner(aTHX_ packWARN(WARN_REGEXP), 4531 "%s limit (%d) exceeded", 4532 "Complex regular subexpression recursion", 4533 REG_INFTY - 1); 4534 } 4535 cur_curlyx->u.curlyx.count--; 4536 CACHEsayNO; 4537 } 4538 4539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 4540 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") 4541 ); 4542 /* Try grabbing another A and see if it helps. */ 4543 PL_reginput = locinput; 4544 cur_curlyx->u.curlyx.lastloc = locinput; 4545 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); 4546 REGCP_SET(ST.lastcp); 4547 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A); 4548 /* NOTREACHED */ 4549 4550#undef ST 4551#define ST st->u.branch 4552 4553 case BRANCHJ: /* /(...|A|...)/ with long next pointer */ 4554 next = scan + ARG(scan); 4555 if (next == scan) 4556 next = NULL; 4557 scan = NEXTOPER(scan); 4558 /* FALL THROUGH */ 4559 4560 case BRANCH: /* /(...|A|...)/ */ 4561 scan = NEXTOPER(scan); /* scan now points to inner node */ 4562 ST.lastparen = *PL_reglastparen; 4563 ST.next_branch = next; 4564 REGCP_SET(ST.cp); 4565 PL_reginput = locinput; 4566 4567 /* Now go into the branch */ 4568 if (has_cutgroup) { 4569 PUSH_YES_STATE_GOTO(BRANCH_next, scan); 4570 } else { 4571 PUSH_STATE_GOTO(BRANCH_next, scan); 4572 } 4573 /* NOTREACHED */ 4574 case CUTGROUP: 4575 PL_reginput = locinput; 4576 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : 4577 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 4578 PUSH_STATE_GOTO(CUTGROUP_next,next); 4579 /* NOTREACHED */ 4580 case CUTGROUP_next_fail: 4581 do_cutgroup = 1; 4582 no_final = 1; 4583 if (st->u.mark.mark_name) 4584 sv_commit = st->u.mark.mark_name; 4585 sayNO; 4586 /* NOTREACHED */ 4587 case BRANCH_next: 4588 sayYES; 4589 /* NOTREACHED */ 4590 case BRANCH_next_fail: /* that branch failed; try the next, if any */ 4591 if (do_cutgroup) { 4592 do_cutgroup = 0; 4593 no_final = 0; 4594 } 4595 REGCP_UNWIND(ST.cp); 4596 for (n = *PL_reglastparen; n > ST.lastparen; n--) 4597 PL_regoffs[n].end = -1; 4598 *PL_reglastparen = n; 4599 /*dmq: *PL_reglastcloseparen = n; */ 4600 scan = ST.next_branch; 4601 /* no more branches? */ 4602 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { 4603 DEBUG_EXECUTE_r({ 4604 PerlIO_printf( Perl_debug_log, 4605 "%*s %sBRANCH failed...%s\n", 4606 REPORT_CODE_OFF+depth*2, "", 4607 PL_colors[4], 4608 PL_colors[5] ); 4609 }); 4610 sayNO_SILENT; 4611 } 4612 continue; /* execute next BRANCH[J] op */ 4613 /* NOTREACHED */ 4614 4615 case MINMOD: 4616 minmod = 1; 4617 break; 4618 4619#undef ST 4620#define ST st->u.curlym 4621 4622 case CURLYM: /* /A{m,n}B/ where A is fixed-length */ 4623 4624 /* This is an optimisation of CURLYX that enables us to push 4625 * only a single backtracking state, no matter how many matches 4626 * there are in {m,n}. It relies on the pattern being constant 4627 * length, with no parens to influence future backrefs 4628 */ 4629 4630 ST.me = scan; 4631 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 4632 4633 /* if paren positive, emulate an OPEN/CLOSE around A */ 4634 if (ST.me->flags) { 4635 U32 paren = ST.me->flags; 4636 if (paren > PL_regsize) 4637 PL_regsize = paren; 4638 if (paren > *PL_reglastparen) 4639 *PL_reglastparen = paren; 4640 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 4641 } 4642 ST.A = scan; 4643 ST.B = next; 4644 ST.alen = 0; 4645 ST.count = 0; 4646 ST.minmod = minmod; 4647 minmod = 0; 4648 ST.c1 = CHRTEST_UNINIT; 4649 REGCP_SET(ST.cp); 4650 4651 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ 4652 goto curlym_do_B; 4653 4654 curlym_do_A: /* execute the A in /A{m,n}B/ */ 4655 PL_reginput = locinput; 4656 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */ 4657 /* NOTREACHED */ 4658 4659 case CURLYM_A: /* we've just matched an A */ 4660 locinput = st->locinput; 4661 nextchr = UCHARAT(locinput); 4662 4663 ST.count++; 4664 /* after first match, determine A's length: u.curlym.alen */ 4665 if (ST.count == 1) { 4666 if (PL_reg_match_utf8) { 4667 char *s = locinput; 4668 while (s < PL_reginput) { 4669 ST.alen++; 4670 s += UTF8SKIP(s); 4671 } 4672 } 4673 else { 4674 ST.alen = PL_reginput - locinput; 4675 } 4676 if (ST.alen == 0) 4677 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); 4678 } 4679 DEBUG_EXECUTE_r( 4680 PerlIO_printf(Perl_debug_log, 4681 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", 4682 (int)(REPORT_CODE_OFF+(depth*2)), "", 4683 (IV) ST.count, (IV)ST.alen) 4684 ); 4685 4686 locinput = PL_reginput; 4687 4688 if (cur_eval && cur_eval->u.eval.close_paren && 4689 cur_eval->u.eval.close_paren == (U32)ST.me->flags) 4690 goto fake_end; 4691 4692 { 4693 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); 4694 if ( max == REG_INFTY || ST.count < max ) 4695 goto curlym_do_A; /* try to match another A */ 4696 } 4697 goto curlym_do_B; /* try to match B */ 4698 4699 case CURLYM_A_fail: /* just failed to match an A */ 4700 REGCP_UNWIND(ST.cp); 4701 4702 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ 4703 || (cur_eval && cur_eval->u.eval.close_paren && 4704 cur_eval->u.eval.close_paren == (U32)ST.me->flags)) 4705 sayNO; 4706 4707 curlym_do_B: /* execute the B in /A{m,n}B/ */ 4708 PL_reginput = locinput; 4709 if (ST.c1 == CHRTEST_UNINIT) { 4710 /* calculate c1 and c2 for possible match of 1st char 4711 * following curly */ 4712 ST.c1 = ST.c2 = CHRTEST_VOID; 4713 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { 4714 regnode *text_node = ST.B; 4715 if (! HAS_TEXT(text_node)) 4716 FIND_NEXT_IMPT(text_node); 4717 /* this used to be 4718 4719 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) 4720 4721 But the former is redundant in light of the latter. 4722 4723 if this changes back then the macro for 4724 IS_TEXT and friends need to change. 4725 */ 4726 if (PL_regkind[OP(text_node)] == EXACT) 4727 { 4728 4729 ST.c1 = (U8)*STRING(text_node); 4730 ST.c2 = 4731 (IS_TEXTF(text_node)) 4732 ? PL_fold[ST.c1] 4733 : (IS_TEXTFL(text_node)) 4734 ? PL_fold_locale[ST.c1] 4735 : ST.c1; 4736 } 4737 } 4738 } 4739 4740 DEBUG_EXECUTE_r( 4741 PerlIO_printf(Perl_debug_log, 4742 "%*s CURLYM trying tail with matches=%"IVdf"...\n", 4743 (int)(REPORT_CODE_OFF+(depth*2)), 4744 "", (IV)ST.count) 4745 ); 4746 if (ST.c1 != CHRTEST_VOID 4747 && UCHARAT(PL_reginput) != ST.c1 4748 && UCHARAT(PL_reginput) != ST.c2) 4749 { 4750 /* simulate B failing */ 4751 DEBUG_OPTIMISE_r( 4752 PerlIO_printf(Perl_debug_log, 4753 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n", 4754 (int)(REPORT_CODE_OFF+(depth*2)),"", 4755 (IV)ST.c1,(IV)ST.c2 4756 )); 4757 state_num = CURLYM_B_fail; 4758 goto reenter_switch; 4759 } 4760 4761 if (ST.me->flags) { 4762 /* mark current A as captured */ 4763 I32 paren = ST.me->flags; 4764 if (ST.count) { 4765 PL_regoffs[paren].start 4766 = HOPc(PL_reginput, -ST.alen) - PL_bostr; 4767 PL_regoffs[paren].end = PL_reginput - PL_bostr; 4768 /*dmq: *PL_reglastcloseparen = paren; */ 4769 } 4770 else 4771 PL_regoffs[paren].end = -1; 4772 if (cur_eval && cur_eval->u.eval.close_paren && 4773 cur_eval->u.eval.close_paren == (U32)ST.me->flags) 4774 { 4775 if (ST.count) 4776 goto fake_end; 4777 else 4778 sayNO; 4779 } 4780 } 4781 4782 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */ 4783 /* NOTREACHED */ 4784 4785 case CURLYM_B_fail: /* just failed to match a B */ 4786 REGCP_UNWIND(ST.cp); 4787 if (ST.minmod) { 4788 I32 max = ARG2(ST.me); 4789 if (max != REG_INFTY && ST.count == max) 4790 sayNO; 4791 goto curlym_do_A; /* try to match a further A */ 4792 } 4793 /* backtrack one A */ 4794 if (ST.count == ARG1(ST.me) /* min */) 4795 sayNO; 4796 ST.count--; 4797 locinput = HOPc(locinput, -ST.alen); 4798 goto curlym_do_B; /* try to match B */ 4799 4800#undef ST 4801#define ST st->u.curly 4802 4803#define CURLY_SETPAREN(paren, success) \ 4804 if (paren) { \ 4805 if (success) { \ 4806 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \ 4807 PL_regoffs[paren].end = locinput - PL_bostr; \ 4808 *PL_reglastcloseparen = paren; \ 4809 } \ 4810 else \ 4811 PL_regoffs[paren].end = -1; \ 4812 } 4813 4814 case STAR: /* /A*B/ where A is width 1 */ 4815 ST.paren = 0; 4816 ST.min = 0; 4817 ST.max = REG_INFTY; 4818 scan = NEXTOPER(scan); 4819 goto repeat; 4820 case PLUS: /* /A+B/ where A is width 1 */ 4821 ST.paren = 0; 4822 ST.min = 1; 4823 ST.max = REG_INFTY; 4824 scan = NEXTOPER(scan); 4825 goto repeat; 4826 case CURLYN: /* /(A){m,n}B/ where A is width 1 */ 4827 ST.paren = scan->flags; /* Which paren to set */ 4828 if (ST.paren > PL_regsize) 4829 PL_regsize = ST.paren; 4830 if (ST.paren > *PL_reglastparen) 4831 *PL_reglastparen = ST.paren; 4832 ST.min = ARG1(scan); /* min to match */ 4833 ST.max = ARG2(scan); /* max to match */ 4834 if (cur_eval && cur_eval->u.eval.close_paren && 4835 cur_eval->u.eval.close_paren == (U32)ST.paren) { 4836 ST.min=1; 4837 ST.max=1; 4838 } 4839 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); 4840 goto repeat; 4841 case CURLY: /* /A{m,n}B/ where A is width 1 */ 4842 ST.paren = 0; 4843 ST.min = ARG1(scan); /* min to match */ 4844 ST.max = ARG2(scan); /* max to match */ 4845 scan = NEXTOPER(scan) + NODE_STEP_REGNODE; 4846 repeat: 4847 /* 4848 * Lookahead to avoid useless match attempts 4849 * when we know what character comes next. 4850 * 4851 * Used to only do .*x and .*?x, but now it allows 4852 * for )'s, ('s and (?{ ... })'s to be in the way 4853 * of the quantifier and the EXACT-like node. -- japhy 4854 */ 4855 4856 if (ST.min > ST.max) /* XXX make this a compile-time check? */ 4857 sayNO; 4858 if (HAS_TEXT(next) || JUMPABLE(next)) { 4859 U8 *s; 4860 regnode *text_node = next; 4861 4862 if (! HAS_TEXT(text_node)) 4863 FIND_NEXT_IMPT(text_node); 4864 4865 if (! HAS_TEXT(text_node)) 4866 ST.c1 = ST.c2 = CHRTEST_VOID; 4867 else { 4868 if ( PL_regkind[OP(text_node)] != EXACT ) { 4869 ST.c1 = ST.c2 = CHRTEST_VOID; 4870 goto assume_ok_easy; 4871 } 4872 else 4873 s = (U8*)STRING(text_node); 4874 4875 /* Currently we only get here when 4876 4877 PL_rekind[OP(text_node)] == EXACT 4878 4879 if this changes back then the macro for IS_TEXT and 4880 friends need to change. */ 4881 if (!UTF) { 4882 ST.c2 = ST.c1 = *s; 4883 if (IS_TEXTF(text_node)) 4884 ST.c2 = PL_fold[ST.c1]; 4885 else if (IS_TEXTFL(text_node)) 4886 ST.c2 = PL_fold_locale[ST.c1]; 4887 } 4888 else { /* UTF */ 4889 if (IS_TEXTF(text_node)) { 4890 STRLEN ulen1, ulen2; 4891 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; 4892 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; 4893 4894 to_utf8_lower((U8*)s, tmpbuf1, &ulen1); 4895 to_utf8_upper((U8*)s, tmpbuf2, &ulen2); 4896#ifdef EBCDIC 4897 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0, 4898 ckWARN(WARN_UTF8) ? 4899 0 : UTF8_ALLOW_ANY); 4900 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0, 4901 ckWARN(WARN_UTF8) ? 4902 0 : UTF8_ALLOW_ANY); 4903#else 4904 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0, 4905 uniflags); 4906 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0, 4907 uniflags); 4908#endif 4909 } 4910 else { 4911 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, 4912 uniflags); 4913 } 4914 } 4915 } 4916 } 4917 else 4918 ST.c1 = ST.c2 = CHRTEST_VOID; 4919 assume_ok_easy: 4920 4921 ST.A = scan; 4922 ST.B = next; 4923 PL_reginput = locinput; 4924 if (minmod) { 4925 minmod = 0; 4926 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min) 4927 sayNO; 4928 ST.count = ST.min; 4929 locinput = PL_reginput; 4930 REGCP_SET(ST.cp); 4931 if (ST.c1 == CHRTEST_VOID) 4932 goto curly_try_B_min; 4933 4934 ST.oldloc = locinput; 4935 4936 /* set ST.maxpos to the furthest point along the 4937 * string that could possibly match */ 4938 if (ST.max == REG_INFTY) { 4939 ST.maxpos = PL_regeol - 1; 4940 if (do_utf8) 4941 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) 4942 ST.maxpos--; 4943 } 4944 else if (do_utf8) { 4945 int m = ST.max - ST.min; 4946 for (ST.maxpos = locinput; 4947 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--) 4948 ST.maxpos += UTF8SKIP(ST.maxpos); 4949 } 4950 else { 4951 ST.maxpos = locinput + ST.max - ST.min; 4952 if (ST.maxpos >= PL_regeol) 4953 ST.maxpos = PL_regeol - 1; 4954 } 4955 goto curly_try_B_min_known; 4956 4957 } 4958 else { 4959 ST.count = regrepeat(rex, ST.A, ST.max, depth); 4960 locinput = PL_reginput; 4961 if (ST.count < ST.min) 4962 sayNO; 4963 if ((ST.count > ST.min) 4964 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) 4965 { 4966 /* A{m,n} must come at the end of the string, there's 4967 * no point in backing off ... */ 4968 ST.min = ST.count; 4969 /* ...except that $ and \Z can match before *and* after 4970 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 4971 We may back off by one in this case. */ 4972 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS) 4973 ST.min--; 4974 } 4975 REGCP_SET(ST.cp); 4976 goto curly_try_B_max; 4977 } 4978 /* NOTREACHED */ 4979 4980 4981 case CURLY_B_min_known_fail: 4982 /* failed to find B in a non-greedy match where c1,c2 valid */ 4983 if (ST.paren && ST.count) 4984 PL_regoffs[ST.paren].end = -1; 4985 4986 PL_reginput = locinput; /* Could be reset... */ 4987 REGCP_UNWIND(ST.cp); 4988 /* Couldn't or didn't -- move forward. */ 4989 ST.oldloc = locinput; 4990 if (do_utf8) 4991 locinput += UTF8SKIP(locinput); 4992 else 4993 locinput++; 4994 ST.count++; 4995 curly_try_B_min_known: 4996 /* find the next place where 'B' could work, then call B */ 4997 { 4998 int n; 4999 if (do_utf8) { 5000 n = (ST.oldloc == locinput) ? 0 : 1; 5001 if (ST.c1 == ST.c2) { 5002 STRLEN len; 5003 /* set n to utf8_distance(oldloc, locinput) */ 5004 while (locinput <= ST.maxpos && 5005 utf8n_to_uvchr((U8*)locinput, 5006 UTF8_MAXBYTES, &len, 5007 uniflags) != (UV)ST.c1) { 5008 locinput += len; 5009 n++; 5010 } 5011 } 5012 else { 5013 /* set n to utf8_distance(oldloc, locinput) */ 5014 while (locinput <= ST.maxpos) { 5015 STRLEN len; 5016 const UV c = utf8n_to_uvchr((U8*)locinput, 5017 UTF8_MAXBYTES, &len, 5018 uniflags); 5019 if (c == (UV)ST.c1 || c == (UV)ST.c2) 5020 break; 5021 locinput += len; 5022 n++; 5023 } 5024 } 5025 } 5026 else { 5027 if (ST.c1 == ST.c2) { 5028 while (locinput <= ST.maxpos && 5029 UCHARAT(locinput) != ST.c1) 5030 locinput++; 5031 } 5032 else { 5033 while (locinput <= ST.maxpos 5034 && UCHARAT(locinput) != ST.c1 5035 && UCHARAT(locinput) != ST.c2) 5036 locinput++; 5037 } 5038 n = locinput - ST.oldloc; 5039 } 5040 if (locinput > ST.maxpos) 5041 sayNO; 5042 /* PL_reginput == oldloc now */ 5043 if (n) { 5044 ST.count += n; 5045 if (regrepeat(rex, ST.A, n, depth) < n) 5046 sayNO; 5047 } 5048 PL_reginput = locinput; 5049 CURLY_SETPAREN(ST.paren, ST.count); 5050 if (cur_eval && cur_eval->u.eval.close_paren && 5051 cur_eval->u.eval.close_paren == (U32)ST.paren) { 5052 goto fake_end; 5053 } 5054 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B); 5055 } 5056 /* NOTREACHED */ 5057 5058 5059 case CURLY_B_min_fail: 5060 /* failed to find B in a non-greedy match where c1,c2 invalid */ 5061 if (ST.paren && ST.count) 5062 PL_regoffs[ST.paren].end = -1; 5063 5064 REGCP_UNWIND(ST.cp); 5065 /* failed -- move forward one */ 5066 PL_reginput = locinput; 5067 if (regrepeat(rex, ST.A, 1, depth)) { 5068 ST.count++; 5069 locinput = PL_reginput; 5070 if (ST.count <= ST.max || (ST.max == REG_INFTY && 5071 ST.count > 0)) /* count overflow ? */ 5072 { 5073 curly_try_B_min: 5074 CURLY_SETPAREN(ST.paren, ST.count); 5075 if (cur_eval && cur_eval->u.eval.close_paren && 5076 cur_eval->u.eval.close_paren == (U32)ST.paren) { 5077 goto fake_end; 5078 } 5079 PUSH_STATE_GOTO(CURLY_B_min, ST.B); 5080 } 5081 } 5082 sayNO; 5083 /* NOTREACHED */ 5084 5085 5086 curly_try_B_max: 5087 /* a successful greedy match: now try to match B */ 5088 if (cur_eval && cur_eval->u.eval.close_paren && 5089 cur_eval->u.eval.close_paren == (U32)ST.paren) { 5090 goto fake_end; 5091 } 5092 { 5093 UV c = 0; 5094 if (ST.c1 != CHRTEST_VOID) 5095 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput, 5096 UTF8_MAXBYTES, 0, uniflags) 5097 : (UV) UCHARAT(PL_reginput); 5098 /* If it could work, try it. */ 5099 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) { 5100 CURLY_SETPAREN(ST.paren, ST.count); 5101 PUSH_STATE_GOTO(CURLY_B_max, ST.B); 5102 /* NOTREACHED */ 5103 } 5104 } 5105 /* FALL THROUGH */ 5106 case CURLY_B_max_fail: 5107 /* failed to find B in a greedy match */ 5108 if (ST.paren && ST.count) 5109 PL_regoffs[ST.paren].end = -1; 5110 5111 REGCP_UNWIND(ST.cp); 5112 /* back up. */ 5113 if (--ST.count < ST.min) 5114 sayNO; 5115 PL_reginput = locinput = HOPc(locinput, -1); 5116 goto curly_try_B_max; 5117 5118#undef ST 5119 5120 case END: 5121 fake_end: 5122 if (cur_eval) { 5123 /* we've just finished A in /(??{A})B/; now continue with B */ 5124 I32 tmpix; 5125 st->u.eval.toggle_reg_flags 5126 = cur_eval->u.eval.toggle_reg_flags; 5127 PL_reg_flags ^= st->u.eval.toggle_reg_flags; 5128 5129 st->u.eval.prev_rex = rex_sv; /* inner */ 5130 SETREX(rex_sv,cur_eval->u.eval.prev_rex); 5131 rex = (struct regexp *)SvANY(rex_sv); 5132 rexi = RXi_GET(rex); 5133 cur_curlyx = cur_eval->u.eval.prev_curlyx; 5134 ReREFCNT_inc(rex_sv); 5135 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */ 5136 5137 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ 5138 PL_reglastparen = &rex->lastparen; 5139 PL_reglastcloseparen = &rex->lastcloseparen; 5140 5141 REGCP_SET(st->u.eval.lastcp); 5142 PL_reginput = locinput; 5143 5144 /* Restore parens of the outer rex without popping the 5145 * savestack */ 5146 tmpix = PL_savestack_ix; 5147 PL_savestack_ix = cur_eval->u.eval.lastcp; 5148 regcppop(rex); 5149 PL_savestack_ix = tmpix; 5150 5151 st->u.eval.prev_eval = cur_eval; 5152 cur_eval = cur_eval->u.eval.prev_eval; 5153 DEBUG_EXECUTE_r( 5154 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", 5155 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); 5156 if ( nochange_depth ) 5157 nochange_depth--; 5158 5159 PUSH_YES_STATE_GOTO(EVAL_AB, 5160 st->u.eval.prev_eval->u.eval.B); /* match B */ 5161 } 5162 5163 if (locinput < reginfo->till) { 5164 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, 5165 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 5166 PL_colors[4], 5167 (long)(locinput - PL_reg_starttry), 5168 (long)(reginfo->till - PL_reg_starttry), 5169 PL_colors[5])); 5170 5171 sayNO_SILENT; /* Cannot match: too short. */ 5172 } 5173 PL_reginput = locinput; /* put where regtry can find it */ 5174 sayYES; /* Success! */ 5175 5176 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ 5177 DEBUG_EXECUTE_r( 5178 PerlIO_printf(Perl_debug_log, 5179 "%*s %ssubpattern success...%s\n", 5180 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); 5181 PL_reginput = locinput; /* put where regtry can find it */ 5182 sayYES; /* Success! */ 5183 5184#undef ST 5185#define ST st->u.ifmatch 5186 5187 case SUSPEND: /* (?>A) */ 5188 ST.wanted = 1; 5189 PL_reginput = locinput; 5190 goto do_ifmatch; 5191 5192 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */ 5193 ST.wanted = 0; 5194 goto ifmatch_trivial_fail_test; 5195 5196 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */ 5197 ST.wanted = 1; 5198 ifmatch_trivial_fail_test: 5199 if (scan->flags) { 5200 char * const s = HOPBACKc(locinput, scan->flags); 5201 if (!s) { 5202 /* trivial fail */ 5203 if (logical) { 5204 logical = 0; 5205 sw = 1 - (bool)ST.wanted; 5206 } 5207 else if (ST.wanted) 5208 sayNO; 5209 next = scan + ARG(scan); 5210 if (next == scan) 5211 next = NULL; 5212 break; 5213 } 5214 PL_reginput = s; 5215 } 5216 else 5217 PL_reginput = locinput; 5218 5219 do_ifmatch: 5220 ST.me = scan; 5221 ST.logical = logical; 5222 logical = 0; /* XXX: reset state of logical once it has been saved into ST */ 5223 5224 /* execute body of (?...A) */ 5225 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan))); 5226 /* NOTREACHED */ 5227 5228 case IFMATCH_A_fail: /* body of (?...A) failed */ 5229 ST.wanted = !ST.wanted; 5230 /* FALL THROUGH */ 5231 5232 case IFMATCH_A: /* body of (?...A) succeeded */ 5233 if (ST.logical) { 5234 sw = (bool)ST.wanted; 5235 } 5236 else if (!ST.wanted) 5237 sayNO; 5238 5239 if (OP(ST.me) == SUSPEND) 5240 locinput = PL_reginput; 5241 else { 5242 locinput = PL_reginput = st->locinput; 5243 nextchr = UCHARAT(locinput); 5244 } 5245 scan = ST.me + ARG(ST.me); 5246 if (scan == ST.me) 5247 scan = NULL; 5248 continue; /* execute B */ 5249 5250#undef ST 5251 5252 case LONGJMP: 5253 next = scan + ARG(scan); 5254 if (next == scan) 5255 next = NULL; 5256 break; 5257 case COMMIT: 5258 reginfo->cutpoint = PL_regeol; 5259 /* FALLTHROUGH */ 5260 case PRUNE: 5261 PL_reginput = locinput; 5262 if (!scan->flags) 5263 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 5264 PUSH_STATE_GOTO(COMMIT_next,next); 5265 /* NOTREACHED */ 5266 case COMMIT_next_fail: 5267 no_final = 1; 5268 /* FALLTHROUGH */ 5269 case OPFAIL: 5270 sayNO; 5271 /* NOTREACHED */ 5272 5273#define ST st->u.mark 5274 case MARKPOINT: 5275 ST.prev_mark = mark_state; 5276 ST.mark_name = sv_commit = sv_yes_mark 5277 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 5278 mark_state = st; 5279 ST.mark_loc = PL_reginput = locinput; 5280 PUSH_YES_STATE_GOTO(MARKPOINT_next,next); 5281 /* NOTREACHED */ 5282 case MARKPOINT_next: 5283 mark_state = ST.prev_mark; 5284 sayYES; 5285 /* NOTREACHED */ 5286 case MARKPOINT_next_fail: 5287 if (popmark && sv_eq(ST.mark_name,popmark)) 5288 { 5289 if (ST.mark_loc > startpoint) 5290 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 5291 popmark = NULL; /* we found our mark */ 5292 sv_commit = ST.mark_name; 5293 5294 DEBUG_EXECUTE_r({ 5295 PerlIO_printf(Perl_debug_log, 5296 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", 5297 REPORT_CODE_OFF+depth*2, "", 5298 PL_colors[4], SVfARG(sv_commit), PL_colors[5]); 5299 }); 5300 } 5301 mark_state = ST.prev_mark; 5302 sv_yes_mark = mark_state ? 5303 mark_state->u.mark.mark_name : NULL; 5304 sayNO; 5305 /* NOTREACHED */ 5306 case SKIP: 5307 PL_reginput = locinput; 5308 if (scan->flags) { 5309 /* (*SKIP) : if we fail we cut here*/ 5310 ST.mark_name = NULL; 5311 ST.mark_loc = locinput; 5312 PUSH_STATE_GOTO(SKIP_next,next); 5313 } else { 5314 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 5315 otherwise do nothing. Meaning we need to scan 5316 */ 5317 regmatch_state *cur = mark_state; 5318 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); 5319 5320 while (cur) { 5321 if ( sv_eq( cur->u.mark.mark_name, 5322 find ) ) 5323 { 5324 ST.mark_name = find; 5325 PUSH_STATE_GOTO( SKIP_next, next ); 5326 } 5327 cur = cur->u.mark.prev_mark; 5328 } 5329 } 5330 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ 5331 break; 5332 case SKIP_next_fail: 5333 if (ST.mark_name) { 5334 /* (*CUT:NAME) - Set up to search for the name as we 5335 collapse the stack*/ 5336 popmark = ST.mark_name; 5337 } else { 5338 /* (*CUT) - No name, we cut here.*/ 5339 if (ST.mark_loc > startpoint) 5340 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 5341 /* but we set sv_commit to latest mark_name if there 5342 is one so they can test to see how things lead to this 5343 cut */ 5344 if (mark_state) 5345 sv_commit=mark_state->u.mark.mark_name; 5346 } 5347 no_final = 1; 5348 sayNO; 5349 /* NOTREACHED */ 5350#undef ST 5351 case FOLDCHAR: 5352 n = ARG(scan); 5353 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) { 5354 locinput += ln; 5355 } else if ( 0xDF == n && !do_utf8 && !UTF ) { 5356 sayNO; 5357 } else { 5358 U8 folded[UTF8_MAXBYTES_CASE+1]; 5359 STRLEN foldlen; 5360 const char * const l = locinput; 5361 char *e = PL_regeol; 5362 to_uni_fold(n, folded, &foldlen); 5363 5364 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1, 5365 l, &e, 0, do_utf8)) { 5366 sayNO; 5367 } 5368 locinput = e; 5369 } 5370 nextchr = UCHARAT(locinput); 5371 break; 5372 case LNBREAK: 5373 if ((n=is_LNBREAK(locinput,do_utf8))) { 5374 locinput += n; 5375 nextchr = UCHARAT(locinput); 5376 } else 5377 sayNO; 5378 break; 5379 5380#define CASE_CLASS(nAmE) \ 5381 case nAmE: \ 5382 if ((n=is_##nAmE(locinput,do_utf8))) { \ 5383 locinput += n; \ 5384 nextchr = UCHARAT(locinput); \ 5385 } else \ 5386 sayNO; \ 5387 break; \ 5388 case N##nAmE: \ 5389 if ((n=is_##nAmE(locinput,do_utf8))) { \ 5390 sayNO; \ 5391 } else { \ 5392 locinput += UTF8SKIP(locinput); \ 5393 nextchr = UCHARAT(locinput); \ 5394 } \ 5395 break 5396 5397 CASE_CLASS(VERTWS); 5398 CASE_CLASS(HORIZWS); 5399#undef CASE_CLASS 5400 5401 default: 5402 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", 5403 PTR2UV(scan), OP(scan)); 5404 Perl_croak(aTHX_ "regexp memory corruption"); 5405 5406 } /* end switch */ 5407 5408 /* switch break jumps here */ 5409 scan = next; /* prepare to execute the next op and ... */ 5410 continue; /* ... jump back to the top, reusing st */ 5411 /* NOTREACHED */ 5412 5413 push_yes_state: 5414 /* push a state that backtracks on success */ 5415 st->u.yes.prev_yes_state = yes_state; 5416 yes_state = st; 5417 /* FALL THROUGH */ 5418 push_state: 5419 /* push a new regex state, then continue at scan */ 5420 { 5421 regmatch_state *newst; 5422 5423 DEBUG_STACK_r({ 5424 regmatch_state *cur = st; 5425 regmatch_state *curyes = yes_state; 5426 int curd = depth; 5427 regmatch_slab *slab = PL_regmatch_slab; 5428 for (;curd > -1;cur--,curd--) { 5429 if (cur < SLAB_FIRST(slab)) { 5430 slab = slab->prev; 5431 cur = SLAB_LAST(slab); 5432 } 5433 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", 5434 REPORT_CODE_OFF + 2 + depth * 2,"", 5435 curd, PL_reg_name[cur->resume_state], 5436 (curyes == cur) ? "yes" : "" 5437 ); 5438 if (curyes == cur) 5439 curyes = cur->u.yes.prev_yes_state; 5440 } 5441 } else 5442 DEBUG_STATE_pp("push") 5443 ); 5444 depth++; 5445 st->locinput = locinput; 5446 newst = st+1; 5447 if (newst > SLAB_LAST(PL_regmatch_slab)) 5448 newst = S_push_slab(aTHX); 5449 PL_regmatch_state = newst; 5450 5451 locinput = PL_reginput; 5452 nextchr = UCHARAT(locinput); 5453 st = newst; 5454 continue; 5455 /* NOTREACHED */ 5456 } 5457 } 5458 5459 /* 5460 * We get here only if there's trouble -- normally "case END" is 5461 * the terminating point. 5462 */ 5463 Perl_croak(aTHX_ "corrupted regexp pointers"); 5464 /*NOTREACHED*/ 5465 sayNO; 5466 5467yes: 5468 if (yes_state) { 5469 /* we have successfully completed a subexpression, but we must now 5470 * pop to the state marked by yes_state and continue from there */ 5471 assert(st != yes_state); 5472#ifdef DEBUGGING 5473 while (st != yes_state) { 5474 st--; 5475 if (st < SLAB_FIRST(PL_regmatch_slab)) { 5476 PL_regmatch_slab = PL_regmatch_slab->prev; 5477 st = SLAB_LAST(PL_regmatch_slab); 5478 } 5479 DEBUG_STATE_r({ 5480 if (no_final) { 5481 DEBUG_STATE_pp("pop (no final)"); 5482 } else { 5483 DEBUG_STATE_pp("pop (yes)"); 5484 } 5485 }); 5486 depth--; 5487 } 5488#else 5489 while (yes_state < SLAB_FIRST(PL_regmatch_slab) 5490 || yes_state > SLAB_LAST(PL_regmatch_slab)) 5491 { 5492 /* not in this slab, pop slab */ 5493 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); 5494 PL_regmatch_slab = PL_regmatch_slab->prev; 5495 st = SLAB_LAST(PL_regmatch_slab); 5496 } 5497 depth -= (st - yes_state); 5498#endif 5499 st = yes_state; 5500 yes_state = st->u.yes.prev_yes_state; 5501 PL_regmatch_state = st; 5502 5503 if (no_final) { 5504 locinput= st->locinput; 5505 nextchr = UCHARAT(locinput); 5506 } 5507 state_num = st->resume_state + no_final; 5508 goto reenter_switch; 5509 } 5510 5511 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", 5512 PL_colors[4], PL_colors[5])); 5513 5514 if (PL_reg_eval_set) { 5515 /* each successfully executed (?{...}) block does the equivalent of 5516 * local $^R = do {...} 5517 * When popping the save stack, all these locals would be undone; 5518 * bypass this by setting the outermost saved $^R to the latest 5519 * value */ 5520 if (oreplsv != GvSV(PL_replgv)) 5521 sv_setsv(oreplsv, GvSV(PL_replgv)); 5522 } 5523 result = 1; 5524 goto final_exit; 5525 5526no: 5527 DEBUG_EXECUTE_r( 5528 PerlIO_printf(Perl_debug_log, 5529 "%*s %sfailed...%s\n", 5530 REPORT_CODE_OFF+depth*2, "", 5531 PL_colors[4], PL_colors[5]) 5532 ); 5533 5534no_silent: 5535 if (no_final) { 5536 if (yes_state) { 5537 goto yes; 5538 } else { 5539 goto final_exit; 5540 } 5541 } 5542 if (depth) { 5543 /* there's a previous state to backtrack to */ 5544 st--; 5545 if (st < SLAB_FIRST(PL_regmatch_slab)) { 5546 PL_regmatch_slab = PL_regmatch_slab->prev; 5547 st = SLAB_LAST(PL_regmatch_slab); 5548 } 5549 PL_regmatch_state = st; 5550 locinput= st->locinput; 5551 nextchr = UCHARAT(locinput); 5552 5553 DEBUG_STATE_pp("pop"); 5554 depth--; 5555 if (yes_state == st) 5556 yes_state = st->u.yes.prev_yes_state; 5557 5558 state_num = st->resume_state + 1; /* failure = success + 1 */ 5559 goto reenter_switch; 5560 } 5561 result = 0; 5562 5563 final_exit: 5564 if (rex->intflags & PREGf_VERBARG_SEEN) { 5565 SV *sv_err = get_sv("REGERROR", 1); 5566 SV *sv_mrk = get_sv("REGMARK", 1); 5567 if (result) { 5568 sv_commit = &PL_sv_no; 5569 if (!sv_yes_mark) 5570 sv_yes_mark = &PL_sv_yes; 5571 } else { 5572 if (!sv_commit) 5573 sv_commit = &PL_sv_yes; 5574 sv_yes_mark = &PL_sv_no; 5575 } 5576 sv_setsv(sv_err, sv_commit); 5577 sv_setsv(sv_mrk, sv_yes_mark); 5578 } 5579 5580 /* clean up; in particular, free all slabs above current one */ 5581 LEAVE_SCOPE(oldsave); 5582 5583 return result; 5584} 5585 5586/* 5587 - regrepeat - repeatedly match something simple, report how many 5588 */ 5589/* 5590 * [This routine now assumes that it will only match on things of length 1. 5591 * That was true before, but now we assume scan - reginput is the count, 5592 * rather than incrementing count on every character. [Er, except utf8.]] 5593 */ 5594STATIC I32 5595S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) 5596{ 5597 dVAR; 5598 register char *scan; 5599 register I32 c; 5600 register char *loceol = PL_regeol; 5601 register I32 hardcount = 0; 5602 register bool do_utf8 = PL_reg_match_utf8; 5603#ifndef DEBUGGING 5604 PERL_UNUSED_ARG(depth); 5605#endif 5606 5607 PERL_ARGS_ASSERT_REGREPEAT; 5608 5609 scan = PL_reginput; 5610 if (max == REG_INFTY) 5611 max = I32_MAX; 5612 else if (max < loceol - scan) 5613 loceol = scan + max; 5614 switch (OP(p)) { 5615 case REG_ANY: 5616 if (do_utf8) { 5617 loceol = PL_regeol; 5618 while (scan < loceol && hardcount < max && *scan != '\n') { 5619 scan += UTF8SKIP(scan); 5620 hardcount++; 5621 } 5622 } else { 5623 while (scan < loceol && *scan != '\n') 5624 scan++; 5625 } 5626 break; 5627 case SANY: 5628 if (do_utf8) { 5629 loceol = PL_regeol; 5630 while (scan < loceol && hardcount < max) { 5631 scan += UTF8SKIP(scan); 5632 hardcount++; 5633 } 5634 } 5635 else 5636 scan = loceol; 5637 break; 5638 case CANY: 5639 scan = loceol; 5640 break; 5641 case EXACT: /* length of string is 1 */ 5642 c = (U8)*STRING(p); 5643 while (scan < loceol && UCHARAT(scan) == c) 5644 scan++; 5645 break; 5646 case EXACTF: /* length of string is 1 */ 5647 c = (U8)*STRING(p); 5648 while (scan < loceol && 5649 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) 5650 scan++; 5651 break; 5652 case EXACTFL: /* length of string is 1 */ 5653 PL_reg_flags |= RF_tainted; 5654 c = (U8)*STRING(p); 5655 while (scan < loceol && 5656 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) 5657 scan++; 5658 break; 5659 case ANYOF: 5660 if (do_utf8) { 5661 loceol = PL_regeol; 5662 while (hardcount < max && scan < loceol && 5663 reginclass(prog, p, (U8*)scan, 0, do_utf8)) { 5664 scan += UTF8SKIP(scan); 5665 hardcount++; 5666 } 5667 } else { 5668 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) 5669 scan++; 5670 } 5671 break; 5672 case ALNUM: 5673 if (do_utf8) { 5674 loceol = PL_regeol; 5675 LOAD_UTF8_CHARCLASS_ALNUM(); 5676 while (hardcount < max && scan < loceol && 5677 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 5678 scan += UTF8SKIP(scan); 5679 hardcount++; 5680 } 5681 } else { 5682 while (scan < loceol && isALNUM(*scan)) 5683 scan++; 5684 } 5685 break; 5686 case ALNUML: 5687 PL_reg_flags |= RF_tainted; 5688 if (do_utf8) { 5689 loceol = PL_regeol; 5690 while (hardcount < max && scan < loceol && 5691 isALNUM_LC_utf8((U8*)scan)) { 5692 scan += UTF8SKIP(scan); 5693 hardcount++; 5694 } 5695 } else { 5696 while (scan < loceol && isALNUM_LC(*scan)) 5697 scan++; 5698 } 5699 break; 5700 case NALNUM: 5701 if (do_utf8) { 5702 loceol = PL_regeol; 5703 LOAD_UTF8_CHARCLASS_ALNUM(); 5704 while (hardcount < max && scan < loceol && 5705 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { 5706 scan += UTF8SKIP(scan); 5707 hardcount++; 5708 } 5709 } else { 5710 while (scan < loceol && !isALNUM(*scan)) 5711 scan++; 5712 } 5713 break; 5714 case NALNUML: 5715 PL_reg_flags |= RF_tainted; 5716 if (do_utf8) { 5717 loceol = PL_regeol; 5718 while (hardcount < max && scan < loceol && 5719 !isALNUM_LC_utf8((U8*)scan)) { 5720 scan += UTF8SKIP(scan); 5721 hardcount++; 5722 } 5723 } else { 5724 while (scan < loceol && !isALNUM_LC(*scan)) 5725 scan++; 5726 } 5727 break; 5728 case SPACE: 5729 if (do_utf8) { 5730 loceol = PL_regeol; 5731 LOAD_UTF8_CHARCLASS_SPACE(); 5732 while (hardcount < max && scan < loceol && 5733 (*scan == ' ' || 5734 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 5735 scan += UTF8SKIP(scan); 5736 hardcount++; 5737 } 5738 } else { 5739 while (scan < loceol && isSPACE(*scan)) 5740 scan++; 5741 } 5742 break; 5743 case SPACEL: 5744 PL_reg_flags |= RF_tainted; 5745 if (do_utf8) { 5746 loceol = PL_regeol; 5747 while (hardcount < max && scan < loceol && 5748 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 5749 scan += UTF8SKIP(scan); 5750 hardcount++; 5751 } 5752 } else { 5753 while (scan < loceol && isSPACE_LC(*scan)) 5754 scan++; 5755 } 5756 break; 5757 case NSPACE: 5758 if (do_utf8) { 5759 loceol = PL_regeol; 5760 LOAD_UTF8_CHARCLASS_SPACE(); 5761 while (hardcount < max && scan < loceol && 5762 !(*scan == ' ' || 5763 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { 5764 scan += UTF8SKIP(scan); 5765 hardcount++; 5766 } 5767 } else { 5768 while (scan < loceol && !isSPACE(*scan)) 5769 scan++; 5770 } 5771 break; 5772 case NSPACEL: 5773 PL_reg_flags |= RF_tainted; 5774 if (do_utf8) { 5775 loceol = PL_regeol; 5776 while (hardcount < max && scan < loceol && 5777 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { 5778 scan += UTF8SKIP(scan); 5779 hardcount++; 5780 } 5781 } else { 5782 while (scan < loceol && !isSPACE_LC(*scan)) 5783 scan++; 5784 } 5785 break; 5786 case DIGIT: 5787 if (do_utf8) { 5788 loceol = PL_regeol; 5789 LOAD_UTF8_CHARCLASS_DIGIT(); 5790 while (hardcount < max && scan < loceol && 5791 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 5792 scan += UTF8SKIP(scan); 5793 hardcount++; 5794 } 5795 } else { 5796 while (scan < loceol && isDIGIT(*scan)) 5797 scan++; 5798 } 5799 break; 5800 case NDIGIT: 5801 if (do_utf8) { 5802 loceol = PL_regeol; 5803 LOAD_UTF8_CHARCLASS_DIGIT(); 5804 while (hardcount < max && scan < loceol && 5805 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { 5806 scan += UTF8SKIP(scan); 5807 hardcount++; 5808 } 5809 } else { 5810 while (scan < loceol && !isDIGIT(*scan)) 5811 scan++; 5812 } 5813 case LNBREAK: 5814 if (do_utf8) { 5815 loceol = PL_regeol; 5816 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) { 5817 scan += c; 5818 hardcount++; 5819 } 5820 } else { 5821 /* 5822 LNBREAK can match two latin chars, which is ok, 5823 because we have a null terminated string, but we 5824 have to use hardcount in this situation 5825 */ 5826 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) { 5827 scan+=c; 5828 hardcount++; 5829 } 5830 } 5831 break; 5832 case HORIZWS: 5833 if (do_utf8) { 5834 loceol = PL_regeol; 5835 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) { 5836 scan += c; 5837 hardcount++; 5838 } 5839 } else { 5840 while (scan < loceol && is_HORIZWS_latin1(scan)) 5841 scan++; 5842 } 5843 break; 5844 case NHORIZWS: 5845 if (do_utf8) { 5846 loceol = PL_regeol; 5847 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) { 5848 scan += UTF8SKIP(scan); 5849 hardcount++; 5850 } 5851 } else { 5852 while (scan < loceol && !is_HORIZWS_latin1(scan)) 5853 scan++; 5854 5855 } 5856 break; 5857 case VERTWS: 5858 if (do_utf8) { 5859 loceol = PL_regeol; 5860 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) { 5861 scan += c; 5862 hardcount++; 5863 } 5864 } else { 5865 while (scan < loceol && is_VERTWS_latin1(scan)) 5866 scan++; 5867 5868 } 5869 break; 5870 case NVERTWS: 5871 if (do_utf8) { 5872 loceol = PL_regeol; 5873 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) { 5874 scan += UTF8SKIP(scan); 5875 hardcount++; 5876 } 5877 } else { 5878 while (scan < loceol && !is_VERTWS_latin1(scan)) 5879 scan++; 5880 5881 } 5882 break; 5883 5884 default: /* Called on something of 0 width. */ 5885 break; /* So match right here or not at all. */ 5886 } 5887 5888 if (hardcount) 5889 c = hardcount; 5890 else 5891 c = scan - PL_reginput; 5892 PL_reginput = scan; 5893 5894 DEBUG_r({ 5895 GET_RE_DEBUG_FLAGS_DECL; 5896 DEBUG_EXECUTE_r({ 5897 SV * const prop = sv_newmortal(); 5898 regprop(prog, prop, p); 5899 PerlIO_printf(Perl_debug_log, 5900 "%*s %s can match %"IVdf" times out of %"IVdf"...\n", 5901 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); 5902 }); 5903 }); 5904 5905 return(c); 5906} 5907 5908 5909#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 5910/* 5911- regclass_swash - prepare the utf8 swash 5912*/ 5913 5914SV * 5915Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) 5916{ 5917 dVAR; 5918 SV *sw = NULL; 5919 SV *si = NULL; 5920 SV *alt = NULL; 5921 RXi_GET_DECL(prog,progi); 5922 const struct reg_data * const data = prog ? progi->data : NULL; 5923 5924 PERL_ARGS_ASSERT_REGCLASS_SWASH; 5925 5926 if (data && data->count) { 5927 const U32 n = ARG(node); 5928 5929 if (data->what[n] == 's') { 5930 SV * const rv = MUTABLE_SV(data->data[n]); 5931 AV * const av = MUTABLE_AV(SvRV(rv)); 5932 SV **const ary = AvARRAY(av); 5933 SV **a, **b; 5934 5935 /* See the end of regcomp.c:S_regclass() for 5936 * documentation of these array elements. */ 5937 5938 si = *ary; 5939 a = SvROK(ary[1]) ? &ary[1] : NULL; 5940 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL; 5941 5942 if (a) 5943 sw = *a; 5944 else if (si && doinit) { 5945 sw = swash_init("utf8", "", si, 1, 0); 5946 (void)av_store(av, 1, sw); 5947 } 5948 if (b) 5949 alt = *b; 5950 } 5951 } 5952 5953 if (listsvp) 5954 *listsvp = si; 5955 if (altsvp) 5956 *altsvp = alt; 5957 5958 return sw; 5959} 5960#endif 5961 5962/* 5963 - reginclass - determine if a character falls into a character class 5964 5965 The n is the ANYOF regnode, the p is the target string, lenp 5966 is pointer to the maximum length of how far to go in the p 5967 (if the lenp is zero, UTF8SKIP(p) is used), 5968 do_utf8 tells whether the target string is in UTF-8. 5969 5970 */ 5971 5972STATIC bool 5973S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8) 5974{ 5975 dVAR; 5976 const char flags = ANYOF_FLAGS(n); 5977 bool match = FALSE; 5978 UV c = *p; 5979 STRLEN len = 0; 5980 STRLEN plen; 5981 5982 PERL_ARGS_ASSERT_REGINCLASS; 5983 5984 if (do_utf8 && !UTF8_IS_INVARIANT(c)) { 5985 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len, 5986 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) 5987 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); 5988 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for 5989 * UTF8_ALLOW_FFFF */ 5990 if (len == (STRLEN)-1) 5991 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 5992 } 5993 5994 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); 5995 if (do_utf8 || (flags & ANYOF_UNICODE)) { 5996 if (lenp) 5997 *lenp = 0; 5998 if (do_utf8 && !ANYOF_RUNTIME(n)) { 5999 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) 6000 match = TRUE; 6001 } 6002 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) 6003 match = TRUE; 6004 if (!match) { 6005 AV *av; 6006 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); 6007 6008 if (sw) { 6009 U8 * utf8_p; 6010 if (do_utf8) { 6011 utf8_p = (U8 *) p; 6012 } else { 6013 STRLEN len = 1; 6014 utf8_p = bytes_to_utf8(p, &len); 6015 } 6016 if (swash_fetch(sw, utf8_p, 1)) 6017 match = TRUE; 6018 else if (flags & ANYOF_FOLD) { 6019 if (!match && lenp && av) { 6020 I32 i; 6021 for (i = 0; i <= av_len(av); i++) { 6022 SV* const sv = *av_fetch(av, i, FALSE); 6023 STRLEN len; 6024 const char * const s = SvPV_const(sv, len); 6025 if (len <= plen && memEQ(s, (char*)utf8_p, len)) { 6026 *lenp = len; 6027 match = TRUE; 6028 break; 6029 } 6030 } 6031 } 6032 if (!match) { 6033 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 6034 6035 STRLEN tmplen; 6036 to_utf8_fold(utf8_p, tmpbuf, &tmplen); 6037 if (swash_fetch(sw, tmpbuf, 1)) 6038 match = TRUE; 6039 } 6040 } 6041 6042 /* If we allocated a string above, free it */ 6043 if (! do_utf8) Safefree(utf8_p); 6044 } 6045 } 6046 if (match && lenp && *lenp == 0) 6047 *lenp = UNISKIP(NATIVE_TO_UNI(c)); 6048 } 6049 if (!match && c < 256) { 6050 if (ANYOF_BITMAP_TEST(n, c)) 6051 match = TRUE; 6052 else if (flags & ANYOF_FOLD) { 6053 U8 f; 6054 6055 if (flags & ANYOF_LOCALE) { 6056 PL_reg_flags |= RF_tainted; 6057 f = PL_fold_locale[c]; 6058 } 6059 else 6060 f = PL_fold[c]; 6061 if (f != c && ANYOF_BITMAP_TEST(n, f)) 6062 match = TRUE; 6063 } 6064 6065 if (!match && (flags & ANYOF_CLASS)) { 6066 PL_reg_flags |= RF_tainted; 6067 if ( 6068 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || 6069 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || 6070 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || 6071 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || 6072 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || 6073 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || 6074 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || 6075 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || 6076 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || 6077 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || 6078 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || 6079 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || 6080 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || 6081 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || 6082 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || 6083 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || 6084 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || 6085 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || 6086 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || 6087 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || 6088 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || 6089 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || 6090 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || 6091 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || 6092 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || 6093 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || 6094 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || 6095 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || 6096 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || 6097 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) 6098 ) /* How's that for a conditional? */ 6099 { 6100 match = TRUE; 6101 } 6102 } 6103 } 6104 6105 return (flags & ANYOF_INVERT) ? !match : match; 6106} 6107 6108STATIC U8 * 6109S_reghop3(U8 *s, I32 off, const U8* lim) 6110{ 6111 dVAR; 6112 6113 PERL_ARGS_ASSERT_REGHOP3; 6114 6115 if (off >= 0) { 6116 while (off-- && s < lim) { 6117 /* XXX could check well-formedness here */ 6118 s += UTF8SKIP(s); 6119 } 6120 } 6121 else { 6122 while (off++ && s > lim) { 6123 s--; 6124 if (UTF8_IS_CONTINUED(*s)) { 6125 while (s > lim && UTF8_IS_CONTINUATION(*s)) 6126 s--; 6127 } 6128 /* XXX could check well-formedness here */ 6129 } 6130 } 6131 return s; 6132} 6133 6134#ifdef XXX_dmq 6135/* there are a bunch of places where we use two reghop3's that should 6136 be replaced with this routine. but since thats not done yet 6137 we ifdef it out - dmq 6138*/ 6139STATIC U8 * 6140S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) 6141{ 6142 dVAR; 6143 6144 PERL_ARGS_ASSERT_REGHOP4; 6145 6146 if (off >= 0) { 6147 while (off-- && s < rlim) { 6148 /* XXX could check well-formedness here */ 6149 s += UTF8SKIP(s); 6150 } 6151 } 6152 else { 6153 while (off++ && s > llim) { 6154 s--; 6155 if (UTF8_IS_CONTINUED(*s)) { 6156 while (s > llim && UTF8_IS_CONTINUATION(*s)) 6157 s--; 6158 } 6159 /* XXX could check well-formedness here */ 6160 } 6161 } 6162 return s; 6163} 6164#endif 6165 6166STATIC U8 * 6167S_reghopmaybe3(U8* s, I32 off, const U8* lim) 6168{ 6169 dVAR; 6170 6171 PERL_ARGS_ASSERT_REGHOPMAYBE3; 6172 6173 if (off >= 0) { 6174 while (off-- && s < lim) { 6175 /* XXX could check well-formedness here */ 6176 s += UTF8SKIP(s); 6177 } 6178 if (off >= 0) 6179 return NULL; 6180 } 6181 else { 6182 while (off++ && s > lim) { 6183 s--; 6184 if (UTF8_IS_CONTINUED(*s)) { 6185 while (s > lim && UTF8_IS_CONTINUATION(*s)) 6186 s--; 6187 } 6188 /* XXX could check well-formedness here */ 6189 } 6190 if (off <= 0) 6191 return NULL; 6192 } 6193 return s; 6194} 6195 6196static void 6197restore_pos(pTHX_ void *arg) 6198{ 6199 dVAR; 6200 regexp * const rex = (regexp *)arg; 6201 if (PL_reg_eval_set) { 6202 if (PL_reg_oldsaved) { 6203 rex->subbeg = PL_reg_oldsaved; 6204 rex->sublen = PL_reg_oldsavedlen; 6205#ifdef PERL_OLD_COPY_ON_WRITE 6206 rex->saved_copy = PL_nrs; 6207#endif 6208 RXp_MATCH_COPIED_on(rex); 6209 } 6210 PL_reg_magic->mg_len = PL_reg_oldpos; 6211 PL_reg_eval_set = 0; 6212 PL_curpm = PL_reg_oldcurpm; 6213 } 6214} 6215 6216STATIC void 6217S_to_utf8_substr(pTHX_ register regexp *prog) 6218{ 6219 int i = 1; 6220 6221 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; 6222 6223 do { 6224 if (prog->substrs->data[i].substr 6225 && !prog->substrs->data[i].utf8_substr) { 6226 SV* const sv = newSVsv(prog->substrs->data[i].substr); 6227 prog->substrs->data[i].utf8_substr = sv; 6228 sv_utf8_upgrade(sv); 6229 if (SvVALID(prog->substrs->data[i].substr)) { 6230 const U8 flags = BmFLAGS(prog->substrs->data[i].substr); 6231 if (flags & FBMcf_TAIL) { 6232 /* Trim the trailing \n that fbm_compile added last 6233 time. */ 6234 SvCUR_set(sv, SvCUR(sv) - 1); 6235 /* Whilst this makes the SV technically "invalid" (as its 6236 buffer is no longer followed by "\0") when fbm_compile() 6237 adds the "\n" back, a "\0" is restored. */ 6238 } 6239 fbm_compile(sv, flags); 6240 } 6241 if (prog->substrs->data[i].substr == prog->check_substr) 6242 prog->check_utf8 = sv; 6243 } 6244 } while (i--); 6245} 6246 6247STATIC void 6248S_to_byte_substr(pTHX_ register regexp *prog) 6249{ 6250 dVAR; 6251 int i = 1; 6252 6253 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; 6254 6255 do { 6256 if (prog->substrs->data[i].utf8_substr 6257 && !prog->substrs->data[i].substr) { 6258 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); 6259 if (sv_utf8_downgrade(sv, TRUE)) { 6260 if (SvVALID(prog->substrs->data[i].utf8_substr)) { 6261 const U8 flags 6262 = BmFLAGS(prog->substrs->data[i].utf8_substr); 6263 if (flags & FBMcf_TAIL) { 6264 /* Trim the trailing \n that fbm_compile added last 6265 time. */ 6266 SvCUR_set(sv, SvCUR(sv) - 1); 6267 } 6268 fbm_compile(sv, flags); 6269 } 6270 } else { 6271 SvREFCNT_dec(sv); 6272 sv = &PL_sv_undef; 6273 } 6274 prog->substrs->data[i].substr = sv; 6275 if (prog->substrs->data[i].utf8_substr == prog->check_utf8) 6276 prog->check_substr = sv; 6277 } 6278 } while (i--); 6279} 6280 6281/* 6282 * Local variables: 6283 * c-indentation-style: bsd 6284 * c-basic-offset: 4 6285 * indent-tabs-mode: t 6286 * End: 6287 * 6288 * ex: set ts=8 sts=4 sw=4 noet: 6289 */ 6290