1/* regexec.c 2 */ 3 4/* 5 * One Ring to rule them all, One Ring to find them 6 * 7 * [p.v of _The Lord of the Rings_, opening poem] 8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] 9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] 10 */ 11 12/* This file contains functions for executing a regular expression. See 13 * also regcomp.c which funnily enough, contains functions for compiling 14 * a regular expression. 15 * 16 * This file is also copied at build time to ext/re/re_exec.c, where 17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 18 * This causes the main functions to be compiled under new names and with 19 * debugging support added, which makes "use re 'debug'" work. 20 */ 21 22/* NOTE: this is derived from Henry Spencer's regexp code, and should not 23 * confused with the original package (see point 3 below). Thanks, Henry! 24 */ 25 26/* Additional note: this code is very heavily munged from Henry's version 27 * in places. In some spots I've traded clarity for efficiency, so don't 28 * blame Henry for some of the lack of readability. 29 */ 30 31/* The names of the functions have been changed from regcomp and 32 * regexec to pregcomp and pregexec in order to avoid conflicts 33 * with the POSIX routines of the same names. 34*/ 35 36#ifdef PERL_EXT_RE_BUILD 37#include "re_top.h" 38#endif 39 40/* 41 * pregcomp and pregexec -- regsub and regerror are not used in perl 42 * 43 * Copyright (c) 1986 by University of Toronto. 44 * Written by Henry Spencer. Not derived from licensed software. 45 * 46 * Permission is granted to anyone to use this software for any 47 * purpose on any computer system, and to redistribute it freely, 48 * subject to the following restrictions: 49 * 50 * 1. The author is not responsible for the consequences of use of 51 * this software, no matter how awful, even if they arise 52 * from defects in it. 53 * 54 * 2. The origin of this software must not be misrepresented, either 55 * by explicit claim or by omission. 56 * 57 * 3. Altered versions must be plainly marked as such, and must not 58 * be misrepresented as being the original software. 59 * 60 **** Alterations to Henry's code are... 61 **** 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 64 **** by Larry Wall and others 65 **** 66 **** You may distribute under the terms of either the GNU General Public 67 **** License or the Artistic License, as specified in the README file. 68 * 69 * Beware that some of this code is subtly aware of the way operator 70 * precedence is structured in regular expressions. Serious changes in 71 * regular-expression syntax might require a total rethink. 72 */ 73#include "EXTERN.h" 74#define PERL_IN_REGEX_ENGINE 75#define PERL_IN_REGEXEC_C 76#include "perl.h" 77 78#ifdef PERL_IN_XSUB_RE 79# include "re_comp.h" 80#else 81# include "regcomp.h" 82#endif 83 84#include "invlist_inline.h" 85#include "unicode_constants.h" 86 87static const char b_utf8_locale_required[] = 88 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong." 89 " Assuming a UTF-8 locale"; 90 91#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \ 92 STMT_START { \ 93 if (! IN_UTF8_CTYPE_LOCALE) { \ 94 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ 95 b_utf8_locale_required); \ 96 } \ 97 } STMT_END 98 99static const char sets_utf8_locale_required[] = 100 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; 101 102#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \ 103 STMT_START { \ 104 if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\ 105 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ 106 sets_utf8_locale_required); \ 107 } \ 108 } STMT_END 109 110#ifdef DEBUGGING 111/* At least one required character in the target string is expressible only in 112 * UTF-8. */ 113static const char non_utf8_target_but_utf8_required[] 114 = "Can't match, because target string needs to be in UTF-8\n"; 115#endif 116 117#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ 118 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\ 119 goto target; \ 120} STMT_END 121 122#ifndef STATIC 123#define STATIC static 124#endif 125 126/* 127 * Forwards. 128 */ 129 130#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) 131 132#define HOPc(pos,off) \ 133 (char *)(reginfo->is_utf8_target \ 134 ? reghop3((U8*)pos, off, \ 135 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ 136 : (U8*)(pos + off)) 137 138/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */ 139#define HOPBACK3(pos, off, lim) \ 140 (reginfo->is_utf8_target \ 141 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \ 142 : (pos - off >= lim) \ 143 ? (U8*)pos - off \ 144 : NULL) 145 146#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg)) 147 148#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) 149#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) 150 151/* lim must be +ve. Returns NULL on overshoot */ 152#define HOPMAYBE3(pos,off,lim) \ 153 (reginfo->is_utf8_target \ 154 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ 155 : ((U8*)pos + off <= lim) \ 156 ? (U8*)pos + off \ 157 : NULL) 158 159/* like HOP3, but limits the result to <= lim even for the non-utf8 case. 160 * off must be >=0; args should be vars rather than expressions */ 161#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ 162 ? reghop3((U8*)(pos), off, (U8*)(lim)) \ 163 : (U8*)((pos + off) > lim ? lim : (pos + off))) 164#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim)) 165 166#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ 167 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ 168 : (U8*)(pos + off)) 169#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) 170 171#define PLACEHOLDER /* Something for the preprocessor to grab onto */ 172/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ 173 174/* for use after a quantifier and before an EXACT-like node -- japhy */ 175/* it would be nice to rework regcomp.sym to generate this stuff. sigh 176 * 177 * NOTE that *nothing* that affects backtracking should be in here, specifically 178 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a 179 * node that is in between two EXACT like nodes when ascertaining what the required 180 * "follow" character is. This should probably be moved to regex compile time 181 * although it may be done at run time because of the REF possibility - more 182 * investigation required. -- demerphq 183*/ 184#define JUMPABLE(rn) ( \ 185 OP(rn) == OPEN || \ 186 (OP(rn) == CLOSE && \ 187 !EVAL_CLOSE_PAREN_IS(cur_eval,PARNO(rn)) ) || \ 188 OP(rn) == EVAL || \ 189 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ 190 OP(rn) == PLUS || OP(rn) == MINMOD || \ 191 OP(rn) == KEEPS || \ 192 (REGNODE_TYPE(OP(rn)) == CURLY && ARG1i(rn) > 0) \ 193) 194#define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT) 195 196#define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF ) 197 198/* 199 Search for mandatory following text node; for lookahead, the text must 200 follow but for lookbehind (FLAGS(rn) != 0) we skip to the next step. 201*/ 202#define FIND_NEXT_IMPT(rn) STMT_START { \ 203 while (JUMPABLE(rn)) { \ 204 const OPCODE type = OP(rn); \ 205 if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \ 206 rn = REGNODE_AFTER_opcode(rn,type); \ 207 else if (type == PLUS) \ 208 rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \ 209 else if (type == IFMATCH) \ 210 rn = (FLAGS(rn) == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG1u(rn); \ 211 else rn += NEXT_OFF(rn); \ 212 } \ 213} STMT_END 214 215#define SLAB_FIRST(s) (&(s)->states[0]) 216#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) 217 218static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); 219static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); 220static regmatch_state * S_push_slab(pTHX); 221 222#define REGCP_OTHER_ELEMS 3 223#define REGCP_FRAME_ELEMS 1 224/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and 225 * are needed for the regexp context stack bookkeeping. */ 226 227STATIC CHECKPOINT 228S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen comma_pDEPTH) 229{ 230 const int retval = PL_savestack_ix; 231 /* Number of bytes about to be stored in the stack */ 232 const SSize_t paren_bytes_to_push = sizeof(*RXp_OFFSp(rex)) * (maxopenparen - parenfloor); 233 /* Number of savestack[] entries to be filled by the paren data */ 234 /* Rounding is performed in case we are few elements short */ 235 const int paren_elems_to_push = (paren_bytes_to_push + sizeof(*PL_savestack) - 1) / sizeof(*PL_savestack); 236 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; 237 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; 238 239 DECLARE_AND_GET_RE_DEBUG_FLAGS; 240 241 PERL_ARGS_ASSERT_REGCPPUSH; 242 243 if (paren_elems_to_push < 0) 244 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i", 245 (int)paren_elems_to_push, (int)maxopenparen, 246 (int)parenfloor); 247 248 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) 249 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf 250 " out of range (%lu-%ld)", 251 total_elems, 252 (unsigned long)maxopenparen, 253 (long)parenfloor); 254 255 DEBUG_BUFFERS_r( 256 if ((int)maxopenparen > (int)parenfloor) 257 Perl_re_exec_indentf( aTHX_ 258 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n", 259 depth, 260 PTR2UV(rex), 261 PTR2UV(RXp_OFFSp(rex)) 262 ); 263 ); 264 265 SSGROW(total_elems + REGCP_FRAME_ELEMS); 266 assert((IV)PL_savestack_max > (IV)(total_elems + REGCP_FRAME_ELEMS)); 267 268 /* memcpy the offs inside the stack - it's faster than for loop */ 269 memcpy(&PL_savestack[PL_savestack_ix], RXp_OFFSp(rex) + parenfloor + 1, paren_bytes_to_push); 270 PL_savestack_ix += paren_elems_to_push; 271 272 DEBUG_BUFFERS_r({ 273 I32 p; 274 for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) { 275 Perl_re_exec_indentf(aTHX_ 276 " \\%" UVuf " %" IVdf " (%" IVdf ") .. %" IVdf " (regcppush)\n", 277 depth, 278 (UV)p, 279 (IV)RXp_OFFSp(rex)[p].start, 280 (IV)RXp_OFFSp(rex)[p].start_tmp, 281 (IV)RXp_OFFSp(rex)[p].end 282 ); 283 } 284 }); 285 286/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ 287 SSPUSHINT(maxopenparen); 288 SSPUSHINT(RXp_LASTPAREN(rex)); 289 SSPUSHINT(RXp_LASTCLOSEPAREN(rex)); 290 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ 291 292 293 DEBUG_BUFFERS_r({ 294 Perl_re_exec_indentf(aTHX_ 295 "finished regcppush returning %" IVdf " cur: %" IVdf "\n", 296 depth, retval, PL_savestack_ix); 297 }); 298 299 return retval; 300} 301 302/* These are needed since we do not localize EVAL nodes: */ 303#define REGCP_SET(cp) \ 304 DEBUG_STATE_r( \ 305 Perl_re_exec_indentf( aTHX_ \ 306 "Setting an EVAL scope, savestack=%" IVdf ",\n", \ 307 depth, (IV)PL_savestack_ix \ 308 ) \ 309 ); \ 310 cp = PL_savestack_ix 311 312#define REGCP_UNWIND(cp) \ 313 DEBUG_STATE_r( \ 314 if (cp != PL_savestack_ix) \ 315 Perl_re_exec_indentf( aTHX_ \ 316 "Clearing an EVAL scope, savestack=%" \ 317 IVdf "..%" IVdf "\n", \ 318 depth, (IV)(cp), (IV)PL_savestack_ix \ 319 ) \ 320 ); \ 321 regcpblow(cp) 322 323/* set the start and end positions of capture ix */ 324#define CLOSE_ANY_CAPTURE(rex, ix, s, e) \ 325 RXp_OFFSp(rex)[(ix)].start = (s); \ 326 RXp_OFFSp(rex)[(ix)].end = (e) 327 328#define CLOSE_CAPTURE(rex, ix, s, e) \ 329 CLOSE_ANY_CAPTURE(rex, ix, s, e); \ 330 if (ix > RXp_LASTPAREN(rex)) \ 331 RXp_LASTPAREN(rex) = (ix); \ 332 RXp_LASTCLOSEPAREN(rex) = (ix); \ 333 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \ 334 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " .. %" IVdf " max: %" UVuf "\n", \ 335 depth, \ 336 PTR2UV(rex), \ 337 PTR2UV(RXp_OFFSp(rex)), \ 338 (UV)(ix), \ 339 (IV)RXp_OFFSp(rex)[ix].start, \ 340 (IV)RXp_OFFSp(rex)[ix].end, \ 341 (UV)RXp_LASTPAREN(rex) \ 342 )) 343 344/* the lp and lcp args match the relevant members of the 345 * regexp structure, but in practice they should all be U16 346 * instead as we have a hard limit of U16_MAX parens. See 347 * line 4003 or so of regcomp.c where we parse OPEN parens 348 * of various types. */ 349PERL_STATIC_INLINE void 350S_unwind_paren(pTHX_ regexp *rex, U32 lp, U32 lcp comma_pDEPTH) { 351 PERL_ARGS_ASSERT_UNWIND_PAREN; 352 U32 n; 353 DECLARE_AND_GET_RE_DEBUG_FLAGS; 354 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 355 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf 356 ": invalidate (%" UVuf " .. %" UVuf ") set lcp: %" UVuf "\n", 357 depth, 358 PTR2UV(rex), 359 PTR2UV(RXp_OFFSp(rex)), 360 (UV)(lp), 361 (UV)(RXp_LASTPAREN(rex)), 362 (UV)(lcp) 363 )); 364 for (n = RXp_LASTPAREN(rex); n > lp; n--) { 365 RXp_OFFSp(rex)[n].end = -1; 366 } 367 RXp_LASTPAREN(rex) = n; 368 RXp_LASTCLOSEPAREN(rex) = lcp; 369} 370#define UNWIND_PAREN(lp,lcp) unwind_paren(rex,lp,lcp) 371 372PERL_STATIC_INLINE void 373S_capture_clear(pTHX_ regexp *rex, U16 from_ix, U16 to_ix, const char *str comma_pDEPTH) { 374 PERL_ARGS_ASSERT_CAPTURE_CLEAR; 375 PERL_UNUSED_ARG(str); /* only used for debugging */ 376 U16 my_ix; 377 DECLARE_AND_GET_RE_DEBUG_FLAGS; 378 for ( my_ix = from_ix; my_ix <= to_ix; my_ix++ ) { 379 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 380 "CAPTURE_CLEAR %s \\%" IVdf ": " 381 "%" IVdf "(%" IVdf ") .. %" IVdf 382 " => " 383 "%" IVdf "(%" IVdf ") .. %" IVdf 384 "\n", 385 depth, str, (IV)my_ix, 386 (IV)RXp_OFFSp(rex)[my_ix].start, 387 (IV)RXp_OFFSp(rex)[my_ix].start_tmp, 388 (IV)RXp_OFFSp(rex)[my_ix].end, 389 (IV)-1, (IV)-1, (IV)-1)); 390 RXp_OFFSp(rex)[my_ix].start = -1; 391 RXp_OFFSp(rex)[my_ix].start_tmp = -1; 392 RXp_OFFSp(rex)[my_ix].end = -1; 393 } 394} 395 396#define CAPTURE_CLEAR(from_ix, to_ix, str) \ 397 if (from_ix) capture_clear(rex,from_ix, to_ix, str) 398 399STATIC void 400S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p comma_pDEPTH) 401{ 402 UV i; 403 U32 paren; 404 DECLARE_AND_GET_RE_DEBUG_FLAGS; 405 406 PERL_ARGS_ASSERT_REGCPPOP; 407 408 409 DEBUG_BUFFERS_r({ 410 Perl_re_exec_indentf(aTHX_ 411 "starting regcppop at %" IVdf "\n", 412 depth, PL_savestack_ix); 413 }); 414 415 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ 416 i = SSPOPUV; 417 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ 418 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ 419 RXp_LASTCLOSEPAREN(rex) = SSPOPINT; 420 RXp_LASTPAREN(rex) = SSPOPINT; 421 *maxopenparen_p = SSPOPINT; 422 423 i -= REGCP_OTHER_ELEMS; 424 /* Now restore the parentheses context. */ 425 DEBUG_BUFFERS_r( 426 if (i || RXp_LASTPAREN(rex) + 1 <= rex->nparens) 427 Perl_re_exec_indentf( aTHX_ 428 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n", 429 depth, 430 PTR2UV(rex), 431 PTR2UV(RXp_OFFSp(rex)) 432 ); 433 ); 434 /* substract remaining elements from the stack */ 435 PL_savestack_ix -= i; 436 437 /* static assert that offs struc size is not less than stack elem size */ 438 STATIC_ASSERT_STMT(sizeof(*RXp_OFFSp(rex)) >= sizeof(*PL_savestack)); 439 440 /* calculate actual number of offs/capture groups stored */ 441 /* by doing integer division (leaving potential alignment aside) */ 442 i = (i * sizeof(*PL_savestack)) / sizeof(*RXp_OFFSp(rex)); 443 444 /* calculate paren starting point */ 445 /* i is our number of entries which we are subtracting from *maxopenparen_p */ 446 /* and we are storing + 1 this to get the beginning */ 447 paren = *maxopenparen_p - i + 1; 448 449 /* restore them */ 450 memcpy(RXp_OFFSp(rex) + paren, &PL_savestack[PL_savestack_ix], i * sizeof(*RXp_OFFSp(rex))); 451 452 DEBUG_BUFFERS_r( 453 for (; paren <= *maxopenparen_p; ++paren) { 454 Perl_re_exec_indentf(aTHX_ 455 " \\%" UVuf " %" IVdf "(%" IVdf ") .. %" IVdf " %s (regcppop)\n", 456 depth, 457 (UV)paren, 458 (IV)RXp_OFFSp(rex)[paren].start, 459 (IV)RXp_OFFSp(rex)[paren].start_tmp, 460 (IV)RXp_OFFSp(rex)[paren].end, 461 (paren > RXp_LASTPAREN(rex) ? "(skipped)" : "")); 462 } 463 ); 464#if 1 465 /* It would seem that the similar code in regtry() 466 * already takes care of this, and in fact it is in 467 * a better location to since this code can #if 0-ed out 468 * but the code in regtry() is needed or otherwise tests 469 * requiring null fields (pat.t#187 and split.t#{13,14} 470 * (as of patchlevel 7877) will fail. Then again, 471 * this code seems to be necessary or otherwise 472 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 473 * --jhi updated by dapm */ 474 for (i = RXp_LASTPAREN(rex) + 1; i <= rex->nparens; i++) { 475 if (i > *maxopenparen_p) { 476 RXp_OFFSp(rex)[i].start = -1; 477 } 478 RXp_OFFSp(rex)[i].end = -1; 479 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_ 480 " \\%" UVuf ": %s ..-1 undeffing (regcppop)\n", 481 depth, 482 (UV)i, 483 (i > *maxopenparen_p) ? "-1" : " " 484 )); 485 } 486#endif 487 DEBUG_BUFFERS_r({ 488 Perl_re_exec_indentf(aTHX_ 489 "finished regcppop at %" IVdf "\n", 490 depth, PL_savestack_ix); 491 }); 492} 493 494/* restore the parens and associated vars at savestack position ix, 495 * but without popping the stack */ 496 497STATIC void 498S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p comma_pDEPTH) 499{ 500 I32 tmpix = PL_savestack_ix; 501 PERL_ARGS_ASSERT_REGCP_RESTORE; 502 503 PL_savestack_ix = ix; 504 regcppop(rex, maxopenparen_p); 505 PL_savestack_ix = tmpix; 506} 507 508#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ 509 510STATIC bool 511S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) 512{ 513 /* Returns a boolean as to whether or not 'character' is a member of the 514 * Posix character class given by 'classnum' that should be equivalent to a 515 * value in the typedef 'char_class_number_'. 516 * 517 * Ideally this could be replaced by a just an array of function pointers 518 * to the C library functions that implement the macros this calls. 519 * However, to compile, the precise function signatures are required, and 520 * these may vary from platform to platform. To avoid having to figure 521 * out what those all are on each platform, I (khw) am using this method, 522 * which adds an extra layer of function call overhead (unless the C 523 * optimizer strips it away). But we don't particularly care about 524 * performance with locales anyway. */ 525 526 if (IN_UTF8_CTYPE_LOCALE) { 527 return cBOOL(generic_isCC_(character, classnum)); 528 } 529 530 switch ((char_class_number_) classnum) { 531 case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character); 532 case CC_ENUM_ALPHA_: return isU8_ALPHA_LC(character); 533 case CC_ENUM_ASCII_: return isU8_ASCII_LC(character); 534 case CC_ENUM_BLANK_: return isU8_BLANK_LC(character); 535 case CC_ENUM_CASED_: return isU8_CASED_LC(character); 536 case CC_ENUM_CNTRL_: return isU8_CNTRL_LC(character); 537 case CC_ENUM_DIGIT_: return isU8_DIGIT_LC(character); 538 case CC_ENUM_GRAPH_: return isU8_GRAPH_LC(character); 539 case CC_ENUM_LOWER_: return isU8_LOWER_LC(character); 540 case CC_ENUM_PRINT_: return isU8_PRINT_LC(character); 541 case CC_ENUM_PUNCT_: return isU8_PUNCT_LC(character); 542 case CC_ENUM_SPACE_: return isU8_SPACE_LC(character); 543 case CC_ENUM_UPPER_: return isU8_UPPER_LC(character); 544 case CC_ENUM_WORDCHAR_: return isU8_WORDCHAR_LC(character); 545 case CC_ENUM_XDIGIT_: return isU8_XDIGIT_LC(character); 546 default: /* VERTSPACE should never occur in locales */ 547 break; 548 } 549 550 Perl_croak(aTHX_ 551 "panic: isFOO_lc() has an unexpected character class '%d'", 552 classnum); 553 554 NOT_REACHED; /* NOTREACHED */ 555 return FALSE; 556} 557 558PERL_STATIC_INLINE I32 559S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len) 560{ 561 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be 562 * folded. Works on all folds representable without UTF-8, except for 563 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it 564 * check that the strings each have at least 'len' characters. 565 * 566 * There is almost an identical API function where s2 need not be folded: 567 * Perl_foldEQ_latin1() */ 568 569 const U8 *a = (const U8 *)s1; 570 const U8 *b = (const U8 *)s2; 571 572 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED; 573 574 assert(len >= 0); 575 576 while (len--) { 577 assert(! isUPPER_L1(*b)); 578 if (toLOWER_L1(*a) != *b) { 579 return 0; 580 } 581 a++, b++; 582 } 583 return 1; 584} 585 586STATIC bool 587S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e) 588{ 589 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded 590 * 'character' is a member of the Posix character class given by 'classnum' 591 * that should be equivalent to a value in the typedef 592 * 'char_class_number_'. 593 * 594 * This just calls isFOO_lc on the code point for the character if it is in 595 * the range 0-255. Outside that range, all characters use Unicode 596 * rules, ignoring any locale. So use the Unicode function if this class 597 * requires an inversion list, and use the Unicode macro otherwise. */ 598 599 600 PERL_ARGS_ASSERT_ISFOO_UTF8_LC; 601 602 if (UTF8_IS_INVARIANT(*character)) { 603 return isFOO_lc(classnum, *character); 604 } 605 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { 606 return isFOO_lc(classnum, 607 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1))); 608 } 609 610 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e); 611 612 switch ((char_class_number_) classnum) { 613 case CC_ENUM_SPACE_: return is_XPERLSPACE_high(character); 614 case CC_ENUM_BLANK_: return is_HORIZWS_high(character); 615 case CC_ENUM_XDIGIT_: return is_XDIGIT_high(character); 616 case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character); 617 default: 618 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], 619 utf8_to_uvchr_buf(character, e, NULL)); 620 } 621 NOT_REACHED; /* NOTREACHED */ 622} 623 624STATIC U8 * 625S_find_span_end(U8 * s, const U8 * send, const U8 span_byte) 626{ 627 /* Returns the position of the first byte in the sequence between 's' and 628 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found. 629 * */ 630 631 PERL_ARGS_ASSERT_FIND_SPAN_END; 632 633 assert(send >= s); 634 635 if ((STRLEN) (send - s) >= PERL_WORDSIZE 636 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 637 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 638 { 639 PERL_UINTMAX_T span_word; 640 641 /* Process per-byte until reach word boundary. XXX This loop could be 642 * eliminated if we knew that this platform had fast unaligned reads */ 643 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 644 if (*s != span_byte) { 645 return s; 646 } 647 s++; 648 } 649 650 /* Create a word filled with the bytes we are spanning */ 651 span_word = PERL_COUNT_MULTIPLIER * span_byte; 652 653 /* Process per-word as long as we have at least a full word left */ 654 do { 655 656 /* Keep going if the whole word is composed of 'span_byte's */ 657 if ((* (PERL_UINTMAX_T *) s) == span_word) { 658 s += PERL_WORDSIZE; 659 continue; 660 } 661 662 /* Here, at least one byte in the word isn't 'span_byte'. */ 663 664#ifdef EBCDIC 665 666 break; 667 668#else 669 670 /* This xor leaves 1 bits only in those non-matching bytes */ 671 span_word ^= * (PERL_UINTMAX_T *) s; 672 673 /* Make sure the upper bit of each non-matching byte is set. This 674 * makes each such byte look like an ASCII platform variant byte */ 675 span_word |= span_word << 1; 676 span_word |= span_word << 2; 677 span_word |= span_word << 4; 678 679 /* That reduces the problem to what this function solves */ 680 return s + variant_byte_number(span_word); 681 682#endif 683 684 } while (s + PERL_WORDSIZE <= send); 685 } 686 687 /* Process the straggler bytes beyond the final word boundary */ 688 while (s < send) { 689 if (*s != span_byte) { 690 return s; 691 } 692 s++; 693 } 694 695 return s; 696} 697 698STATIC U8 * 699S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask) 700{ 701 /* Returns the position of the first byte in the sequence between 's' 702 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte'; 703 * returns 'send' if none found. It uses word-level operations instead of 704 * byte to speed up the process */ 705 706 PERL_ARGS_ASSERT_FIND_NEXT_MASKED; 707 708 assert(send >= s); 709 assert((byte & mask) == byte); 710 711#ifndef EBCDIC 712 713 if ((STRLEN) (send - s) >= PERL_WORDSIZE 714 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 715 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 716 { 717 PERL_UINTMAX_T word, mask_word; 718 719 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 720 if (((*s) & mask) == byte) { 721 return s; 722 } 723 s++; 724 } 725 726 word = PERL_COUNT_MULTIPLIER * byte; 727 mask_word = PERL_COUNT_MULTIPLIER * mask; 728 729 do { 730 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 731 732 /* If 'masked' contains bytes with the bit pattern of 'byte' within 733 * it, xoring with 'word' will leave each of the 8 bits in such 734 * bytes be 0, and no byte containing any other bit pattern will be 735 * 0. */ 736 masked ^= word; 737 738 /* This causes the most significant bit to be set to 1 for any 739 * bytes in the word that aren't completely 0 */ 740 masked |= masked << 1; 741 masked |= masked << 2; 742 masked |= masked << 4; 743 744 /* The msbits are the same as what marks a byte as variant, so we 745 * can use this mask. If all msbits are 1, the word doesn't 746 * contain 'byte' */ 747 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) { 748 s += PERL_WORDSIZE; 749 continue; 750 } 751 752 /* Here, the msbit of bytes in the word that aren't 'byte' are 1, 753 * and any that are, are 0. Complement and re-AND to swap that */ 754 masked = ~ masked; 755 masked &= PERL_VARIANTS_WORD_MASK; 756 757 /* This reduces the problem to that solved by this function */ 758 s += variant_byte_number(masked); 759 return s; 760 761 } while (s + PERL_WORDSIZE <= send); 762 } 763 764#endif 765 766 while (s < send) { 767 if (((*s) & mask) == byte) { 768 return s; 769 } 770 s++; 771 } 772 773 return s; 774} 775 776STATIC U8 * 777S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) 778{ 779 /* Returns the position of the first byte in the sequence between 's' and 780 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'. 781 * 'span_byte' should have been ANDed with 'mask' in the call of this 782 * function. Returns 'send' if none found. Works like find_span_end(), 783 * except for the AND */ 784 785 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK; 786 787 assert(send >= s); 788 assert((span_byte & mask) == span_byte); 789 790 if ((STRLEN) (send - s) >= PERL_WORDSIZE 791 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 792 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) 793 { 794 PERL_UINTMAX_T span_word, mask_word; 795 796 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { 797 if (((*s) & mask) != span_byte) { 798 return s; 799 } 800 s++; 801 } 802 803 span_word = PERL_COUNT_MULTIPLIER * span_byte; 804 mask_word = PERL_COUNT_MULTIPLIER * mask; 805 806 do { 807 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; 808 809 if (masked == span_word) { 810 s += PERL_WORDSIZE; 811 continue; 812 } 813 814#ifdef EBCDIC 815 816 break; 817 818#else 819 820 masked ^= span_word; 821 masked |= masked << 1; 822 masked |= masked << 2; 823 masked |= masked << 4; 824 return s + variant_byte_number(masked); 825 826#endif 827 828 } while (s + PERL_WORDSIZE <= send); 829 } 830 831 while (s < send) { 832 if (((*s) & mask) != span_byte) { 833 return s; 834 } 835 s++; 836 } 837 838 return s; 839} 840 841/* 842 * pregexec and friends 843 */ 844 845#ifndef PERL_IN_XSUB_RE 846/* 847 - pregexec - match a regexp against a string 848 */ 849I32 850Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, 851 char *strbeg, SSize_t minend, SV *screamer, U32 nosave) 852/* stringarg: the point in the string at which to begin matching */ 853/* strend: pointer to null at end of string */ 854/* strbeg: real beginning of string */ 855/* minend: end of match must be >= minend bytes after stringarg. */ 856/* screamer: SV being matched: only used for utf8 flag, pos() etc; string 857 * itself is accessed via the pointers above */ 858/* nosave: For optimizations. */ 859{ 860 PERL_ARGS_ASSERT_PREGEXEC; 861 862 return 863 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 864 nosave ? 0 : REXEC_COPY_STR); 865} 866#endif 867 868 869 870/* re_intuit_start(): 871 * 872 * Based on some optimiser hints, try to find the earliest position in the 873 * string where the regex could match. 874 * 875 * rx: the regex to match against 876 * sv: the SV being matched: only used for utf8 flag; the string 877 * itself is accessed via the pointers below. Note that on 878 * something like an overloaded SV, SvPOK(sv) may be false 879 * and the string pointers may point to something unrelated to 880 * the SV itself. 881 * strbeg: real beginning of string 882 * strpos: the point in the string at which to begin matching 883 * strend: pointer to the byte following the last char of the string 884 * flags currently unused; set to 0 885 * data: currently unused; set to NULL 886 * 887 * The basic idea of re_intuit_start() is to use some known information 888 * about the pattern, namely: 889 * 890 * a) the longest known anchored substring (i.e. one that's at a 891 * constant offset from the beginning of the pattern; but not 892 * necessarily at a fixed offset from the beginning of the 893 * string); 894 * b) the longest floating substring (i.e. one that's not at a constant 895 * offset from the beginning of the pattern); 896 * c) Whether the pattern is anchored to the string; either 897 * an absolute anchor: /^../, or anchored to \n: /^.../m, 898 * or anchored to pos(): /\G/; 899 * d) A start class: a real or synthetic character class which 900 * represents which characters are legal at the start of the pattern; 901 * 902 * to either quickly reject the match, or to find the earliest position 903 * within the string at which the pattern might match, thus avoiding 904 * running the full NFA engine at those earlier locations, only to 905 * eventually fail and retry further along. 906 * 907 * Returns NULL if the pattern can't match, or returns the address within 908 * the string which is the earliest place the match could occur. 909 * 910 * The longest of the anchored and floating substrings is called 'check' 911 * and is checked first. The other is called 'other' and is checked 912 * second. The 'other' substring may not be present. For example, 913 * 914 * /(abc|xyz)ABC\d{0,3}DEFG/ 915 * 916 * will have 917 * 918 * check substr (float) = "DEFG", offset 6..9 chars 919 * other substr (anchored) = "ABC", offset 3..3 chars 920 * stclass = [ax] 921 * 922 * Be aware that during the course of this function, sometimes 'anchored' 923 * refers to a substring being anchored relative to the start of the 924 * pattern, and sometimes to the pattern itself being anchored relative to 925 * the string. For example: 926 * 927 * /\dabc/: "abc" is anchored to the pattern; 928 * /^\dabc/: "abc" is anchored to the pattern and the string; 929 * /\d+abc/: "abc" is anchored to neither the pattern nor the string; 930 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, 931 * but the pattern is anchored to the string. 932 */ 933 934char * 935Perl_re_intuit_start(pTHX_ 936 REGEXP * const rx, 937 SV *sv, 938 const char * const strbeg, 939 char *strpos, 940 char *strend, 941 const U32 flags, 942 re_scream_pos_data *data) 943{ 944 struct regexp *const prog = ReANY(rx); 945 SSize_t start_shift = prog->check_offset_min; 946 /* Should be nonnegative! */ 947 SSize_t end_shift = 0; 948 /* current lowest pos in string where the regex can start matching */ 949 char *rx_origin = strpos; 950 SV *check; 951 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ 952 U8 other_ix = 1 - prog->substrs->check_ix; 953 bool ml_anch = 0; 954 char *other_last = strpos;/* latest pos 'other' substr already checked to */ 955 char *check_at = NULL; /* check substr found at this pos */ 956 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; 957 RXi_GET_DECL(prog,progi); 958 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ 959 regmatch_info *const reginfo = ®info_buf; 960 DECLARE_AND_GET_RE_DEBUG_FLAGS; 961 962 PERL_ARGS_ASSERT_RE_INTUIT_START; 963 PERL_UNUSED_ARG(flags); 964 PERL_UNUSED_ARG(data); 965 966 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 967 "Intuit: trying to determine minimum start position...\n")); 968 969 /* for now, assume that all substr offsets are positive. If at some point 970 * in the future someone wants to do clever things with lookbehind and 971 * -ve offsets, they'll need to fix up any code in this function 972 * which uses these offsets. See the thread beginning 973 * <20140113145929.GF27210@iabyn.com> 974 */ 975 assert(prog->substrs->data[0].min_offset >= 0); 976 assert(prog->substrs->data[0].max_offset >= 0); 977 assert(prog->substrs->data[1].min_offset >= 0); 978 assert(prog->substrs->data[1].max_offset >= 0); 979 assert(prog->substrs->data[2].min_offset >= 0); 980 assert(prog->substrs->data[2].max_offset >= 0); 981 982 /* for now, assume that if both present, that the floating substring 983 * doesn't start before the anchored substring. 984 * If you break this assumption (e.g. doing better optimisations 985 * with lookahead/behind), then you'll need to audit the code in this 986 * function carefully first 987 */ 988 assert( 989 ! ( (prog->anchored_utf8 || prog->anchored_substr) 990 && (prog->float_utf8 || prog->float_substr)) 991 || (prog->float_min_offset >= prog->anchored_offset)); 992 993 /* byte rather than char calculation for efficiency. It fails 994 * to quickly reject some cases that can't match, but will reject 995 * them later after doing full char arithmetic */ 996 if (prog->minlen > strend - strpos) { 997 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 998 " String too short...\n")); 999 goto fail; 1000 } 1001 1002 RXp_MATCH_UTF8_set(prog, utf8_target); 1003 reginfo->is_utf8_target = cBOOL(utf8_target); 1004 reginfo->info_aux = NULL; 1005 reginfo->strbeg = strbeg; 1006 reginfo->strend = strend; 1007 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 1008 reginfo->intuit = 1; 1009 /* not actually used within intuit, but zero for safety anyway */ 1010 reginfo->poscache_maxiter = 0; 1011 1012 if (utf8_target) { 1013 if ((!prog->anchored_utf8 && prog->anchored_substr) 1014 || (!prog->float_utf8 && prog->float_substr)) 1015 to_utf8_substr(prog); 1016 check = prog->check_utf8; 1017 } else { 1018 if (!prog->check_substr && prog->check_utf8) { 1019 if (! to_byte_substr(prog)) { 1020 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 1021 } 1022 } 1023 check = prog->check_substr; 1024 } 1025 1026 /* dump the various substring data */ 1027 DEBUG_OPTIMISE_MORE_r({ 1028 int i; 1029 for (i=0; i<=2; i++) { 1030 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr 1031 : prog->substrs->data[i].substr); 1032 if (!sv) 1033 continue; 1034 1035 Perl_re_printf( aTHX_ 1036 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf 1037 " useful=%" IVdf " utf8=%d [%s]\n", 1038 i, 1039 (IV)prog->substrs->data[i].min_offset, 1040 (IV)prog->substrs->data[i].max_offset, 1041 (IV)prog->substrs->data[i].end_shift, 1042 BmUSEFUL(sv), 1043 utf8_target ? 1 : 0, 1044 SvPEEK(sv)); 1045 } 1046 }); 1047 1048 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ 1049 1050 /* ml_anch: check after \n? 1051 * 1052 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning 1053 * with /.*.../, these flags will have been added by the 1054 * compiler: 1055 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL 1056 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL 1057 */ 1058 ml_anch = (prog->intflags & PREGf_ANCH_MBOL) 1059 && !(prog->intflags & PREGf_IMPLICIT); 1060 1061 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { 1062 /* we are only allowed to match at BOS or \G */ 1063 1064 /* trivially reject if there's a BOS anchor and we're not at BOS. 1065 * 1066 * Note that we don't try to do a similar quick reject for 1067 * \G, since generally the caller will have calculated strpos 1068 * based on pos() and gofs, so the string is already correctly 1069 * anchored by definition; and handling the exceptions would 1070 * be too fiddly (e.g. REXEC_IGNOREPOS). 1071 */ 1072 if ( strpos != strbeg 1073 && (prog->intflags & PREGf_ANCH_SBOL)) 1074 { 1075 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1076 " Not at start...\n")); 1077 goto fail; 1078 } 1079 1080 /* in the presence of an anchor, the anchored (relative to the 1081 * start of the regex) substr must also be anchored relative 1082 * to strpos. So quickly reject if substr isn't found there. 1083 * This works for \G too, because the caller will already have 1084 * subtracted gofs from pos, and gofs is the offset from the 1085 * \G to the start of the regex. For example, in /.abc\Gdef/, 1086 * where substr="abcdef", pos()=3, gofs=4, offset_min=1: 1087 * caller will have set strpos=pos()-4; we look for the substr 1088 * at position pos()-4+1, which lines up with the "a" */ 1089 1090 if (prog->check_offset_min == prog->check_offset_max) { 1091 /* Substring at constant offset from beg-of-str... */ 1092 SSize_t slen = SvCUR(check); 1093 char *s = HOP3c(strpos, prog->check_offset_min, strend); 1094 1095 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1096 " Looking for check substr at fixed offset %" IVdf "...\n", 1097 (IV)prog->check_offset_min)); 1098 1099 if (SvTAIL(check)) { 1100 /* In this case, the regex is anchored at the end too. 1101 * Unless it's a multiline match, the lengths must match 1102 * exactly, give or take a \n. NB: slen >= 1 since 1103 * the last char of check is \n */ 1104 if (!multiline 1105 && ( strend - s > slen 1106 || strend - s < slen - 1 1107 || (strend - s == slen && strend[-1] != '\n'))) 1108 { 1109 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1110 " String too long...\n")); 1111 goto fail_finish; 1112 } 1113 /* Now should match s[0..slen-2] */ 1114 slen--; 1115 } 1116 if (slen && (strend - s < slen 1117 || *SvPVX_const(check) != *s 1118 || (slen > 1 && (memNE(SvPVX_const(check), s, slen))))) 1119 { 1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1121 " String not equal...\n")); 1122 goto fail_finish; 1123 } 1124 1125 check_at = s; 1126 goto success_at_start; 1127 } 1128 } 1129 } 1130 1131 end_shift = prog->check_end_shift; 1132 1133#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ 1134 if (end_shift < 0) 1135 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ", 1136 (IV)end_shift, RX_PRECOMP(rx)); 1137#endif 1138 1139 restart: 1140 1141 /* This is the (re)entry point of the main loop in this function. 1142 * The goal of this loop is to: 1143 * 1) find the "check" substring in the region rx_origin..strend 1144 * (adjusted by start_shift / end_shift). If not found, reject 1145 * immediately. 1146 * 2) If it exists, look for the "other" substr too if defined; for 1147 * example, if the check substr maps to the anchored substr, then 1148 * check the floating substr, and vice-versa. If not found, go 1149 * back to (1) with rx_origin suitably incremented. 1150 * 3) If we find an rx_origin position that doesn't contradict 1151 * either of the substrings, then check the possible additional 1152 * constraints on rx_origin of /^.../m or a known start class. 1153 * If these fail, then depending on which constraints fail, jump 1154 * back to here, or to various other re-entry points further along 1155 * that skip some of the first steps. 1156 * 4) If we pass all those tests, update the BmUSEFUL() count on the 1157 * substring. If the start position was determined to be at the 1158 * beginning of the string - so, not rejected, but not optimised, 1159 * since we have to run regmatch from position 0 - decrement the 1160 * BmUSEFUL() count. Otherwise increment it. 1161 */ 1162 1163 1164 /* first, look for the 'check' substring */ 1165 1166 { 1167 U8* start_point; 1168 U8* end_point; 1169 1170 DEBUG_OPTIMISE_MORE_r({ 1171 Perl_re_printf( aTHX_ 1172 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf 1173 " Start shift: %" IVdf " End shift %" IVdf 1174 " Real end Shift: %" IVdf "\n", 1175 (IV)(rx_origin - strbeg), 1176 (IV)prog->check_offset_min, 1177 (IV)start_shift, 1178 (IV)end_shift, 1179 (IV)prog->check_end_shift); 1180 }); 1181 1182 end_point = HOPBACK3(strend, end_shift, rx_origin); 1183 if (!end_point) 1184 goto fail_finish; 1185 start_point = HOPMAYBE3(rx_origin, start_shift, end_point); 1186 if (!start_point) 1187 goto fail_finish; 1188 1189 1190 /* If the regex is absolutely anchored to either the start of the 1191 * string (SBOL) or to pos() (ANCH_GPOS), then 1192 * check_offset_max represents an upper bound on the string where 1193 * the substr could start. For the ANCH_GPOS case, we assume that 1194 * the caller of intuit will have already set strpos to 1195 * pos()-gofs, so in this case strpos + offset_max will still be 1196 * an upper bound on the substr. 1197 */ 1198 if (!ml_anch 1199 && prog->intflags & PREGf_ANCH 1200 && prog->check_offset_max != SSize_t_MAX) 1201 { 1202 SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check)); 1203 const char * const anchor = 1204 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); 1205 SSize_t targ_len = (char*)end_point - anchor; 1206 1207 if (check_len > targ_len) { 1208 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1209 "Target string too short to match required substring...\n")); 1210 goto fail_finish; 1211 } 1212 1213 /* do a bytes rather than chars comparison. It's conservative; 1214 * so it skips doing the HOP if the result can't possibly end 1215 * up earlier than the old value of end_point. 1216 */ 1217 assert(anchor + check_len <= (char *)end_point); 1218 if (prog->check_offset_max + check_len < targ_len) { 1219 end_point = HOP3lim((U8*)anchor, 1220 prog->check_offset_max, 1221 end_point - check_len 1222 ) 1223 + check_len; 1224 if (end_point < start_point) 1225 goto fail_finish; 1226 } 1227 } 1228 1229 check_at = fbm_instr( start_point, end_point, 1230 check, multiline ? FBMrf_MULTILINE : 0); 1231 1232 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1233 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1234 (IV)((char*)start_point - strbeg), 1235 (IV)((char*)end_point - strbeg), 1236 (IV)(check_at ? check_at - strbeg : -1) 1237 )); 1238 1239 /* Update the count-of-usability, remove useless subpatterns, 1240 unshift s. */ 1241 1242 DEBUG_EXECUTE_r({ 1243 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1244 SvPVX_const(check), RE_SV_DUMPLEN(check), 30); 1245 Perl_re_printf( aTHX_ " %s %s substr %s%s%s", 1246 (check_at ? "Found" : "Did not find"), 1247 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) 1248 ? "anchored" : "floating"), 1249 quoted, 1250 RE_SV_TAIL(check), 1251 (check_at ? " at offset " : "...\n") ); 1252 }); 1253 1254 if (!check_at) 1255 goto fail_finish; 1256 /* set rx_origin to the minimum position where the regex could start 1257 * matching, given the constraint of the just-matched check substring. 1258 * But don't set it lower than previously. 1259 */ 1260 1261 if (check_at - rx_origin > prog->check_offset_max) 1262 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); 1263 /* Finish the diagnostic message */ 1264 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1265 "%ld (rx_origin now %" IVdf ")...\n", 1266 (long)(check_at - strbeg), 1267 (IV)(rx_origin - strbeg) 1268 )); 1269 } 1270 1271 1272 /* now look for the 'other' substring if defined */ 1273 1274 if (prog->substrs->data[other_ix].utf8_substr 1275 || prog->substrs->data[other_ix].substr) 1276 { 1277 /* Take into account the "other" substring. */ 1278 char *last, *last1; 1279 char *s; 1280 SV* must; 1281 struct reg_substr_datum *other; 1282 1283 do_other_substr: 1284 other = &prog->substrs->data[other_ix]; 1285 if (!utf8_target && !other->substr) { 1286 if (!to_byte_substr(prog)) { 1287 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); 1288 } 1289 } 1290 1291 /* if "other" is anchored: 1292 * we've previously found a floating substr starting at check_at. 1293 * This means that the regex origin must lie somewhere 1294 * between min (rx_origin): HOP3(check_at, -check_offset_max) 1295 * and max: HOP3(check_at, -check_offset_min) 1296 * (except that min will be >= strpos) 1297 * So the fixed substr must lie somewhere between 1298 * HOP3(min, anchored_offset) 1299 * HOP3(max, anchored_offset) + SvCUR(substr) 1300 */ 1301 1302 /* if "other" is floating 1303 * Calculate last1, the absolute latest point where the 1304 * floating substr could start in the string, ignoring any 1305 * constraints from the earlier fixed match. It is calculated 1306 * as follows: 1307 * 1308 * strend - prog->minlen (in chars) is the absolute latest 1309 * position within the string where the origin of the regex 1310 * could appear. The latest start point for the floating 1311 * substr is float_min_offset(*) on from the start of the 1312 * regex. last1 simply combines thee two offsets. 1313 * 1314 * (*) You might think the latest start point should be 1315 * float_max_offset from the regex origin, and technically 1316 * you'd be correct. However, consider 1317 * /a\d{2,4}bcd\w/ 1318 * Here, float min, max are 3,5 and minlen is 7. 1319 * This can match either 1320 * /a\d\dbcd\w/ 1321 * /a\d\d\dbcd\w/ 1322 * /a\d\d\d\dbcd\w/ 1323 * In the first case, the regex matches minlen chars; in the 1324 * second, minlen+1, in the third, minlen+2. 1325 * In the first case, the floating offset is 3 (which equals 1326 * float_min), in the second, 4, and in the third, 5 (which 1327 * equals float_max). In all cases, the floating string bcd 1328 * can never start more than 4 chars from the end of the 1329 * string, which equals minlen - float_min. As the substring 1330 * starts to match more than float_min from the start of the 1331 * regex, it makes the regex match more than minlen chars, 1332 * and the two cancel each other out. So we can always use 1333 * float_min - minlen, rather than float_max - minlen for the 1334 * latest position in the string. 1335 * 1336 * Note that -minlen + float_min_offset is equivalent (AFAIKT) 1337 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift 1338 */ 1339 1340 assert(prog->minlen >= other->min_offset); 1341 last1 = HOP3c(strend, 1342 other->min_offset - prog->minlen, strbeg); 1343 1344 if (other_ix) {/* i.e. if (other-is-float) */ 1345 /* last is the latest point where the floating substr could 1346 * start, *given* any constraints from the earlier fixed 1347 * match. This constraint is that the floating string starts 1348 * <= float_max_offset chars from the regex origin (rx_origin). 1349 * If this value is less than last1, use it instead. 1350 */ 1351 assert(rx_origin <= last1); 1352 last = 1353 /* this condition handles the offset==infinity case, and 1354 * is a short-cut otherwise. Although it's comparing a 1355 * byte offset to a char length, it does so in a safe way, 1356 * since 1 char always occupies 1 or more bytes, 1357 * so if a string range is (last1 - rx_origin) bytes, 1358 * it will be less than or equal to (last1 - rx_origin) 1359 * chars; meaning it errs towards doing the accurate HOP3 1360 * rather than just using last1 as a short-cut */ 1361 (last1 - rx_origin) < other->max_offset 1362 ? last1 1363 : (char*)HOP3lim(rx_origin, other->max_offset, last1); 1364 } 1365 else { 1366 assert(strpos + start_shift <= check_at); 1367 last = HOP4c(check_at, other->min_offset - start_shift, 1368 strbeg, strend); 1369 } 1370 1371 s = HOP3c(rx_origin, other->min_offset, strend); 1372 if (s < other_last) /* These positions already checked */ 1373 s = other_last; 1374 1375 must = utf8_target ? other->utf8_substr : other->substr; 1376 assert(SvPOK(must)); 1377 { 1378 char *from = s; 1379 char *to = last + SvCUR(must) - (SvTAIL(must)!=0); 1380 1381 if (to > strend) 1382 to = strend; 1383 if (from > to) { 1384 s = NULL; 1385 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1386 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n", 1387 (IV)(from - strbeg), 1388 (IV)(to - strbeg) 1389 )); 1390 } 1391 else { 1392 s = fbm_instr( 1393 (unsigned char*)from, 1394 (unsigned char*)to, 1395 must, 1396 multiline ? FBMrf_MULTILINE : 0 1397 ); 1398 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1399 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n", 1400 (IV)(from - strbeg), 1401 (IV)(to - strbeg), 1402 (IV)(s ? s - strbeg : -1) 1403 )); 1404 } 1405 } 1406 1407 DEBUG_EXECUTE_r({ 1408 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 1409 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 1410 Perl_re_printf( aTHX_ " %s %s substr %s%s", 1411 s ? "Found" : "Contradicts", 1412 other_ix ? "floating" : "anchored", 1413 quoted, RE_SV_TAIL(must)); 1414 }); 1415 1416 1417 if (!s) { 1418 /* last1 is latest possible substr location. If we didn't 1419 * find it before there, we never will */ 1420 if (last >= last1) { 1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1422 "; giving up...\n")); 1423 goto fail_finish; 1424 } 1425 1426 /* try to find the check substr again at a later 1427 * position. Maybe next time we'll find the "other" substr 1428 * in range too */ 1429 other_last = HOP3c(last, 1, strend) /* highest failure */; 1430 rx_origin = 1431 other_ix /* i.e. if other-is-float */ 1432 ? HOP3c(rx_origin, 1, strend) 1433 : HOP4c(last, 1 - other->min_offset, strbeg, strend); 1434 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1435 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n", 1436 (other_ix ? "floating" : "anchored"), 1437 (long)(HOP3c(check_at, 1, strend) - strbeg), 1438 (IV)(rx_origin - strbeg) 1439 )); 1440 goto restart; 1441 } 1442 else { 1443 if (other_ix) { /* if (other-is-float) */ 1444 /* other_last is set to s, not s+1, since its possible for 1445 * a floating substr to fail first time, then succeed 1446 * second time at the same floating position; e.g.: 1447 * "-AB--AABZ" =~ /\wAB\d*Z/ 1448 * The first time round, anchored and float match at 1449 * "-(AB)--AAB(Z)" then fail on the initial \w character 1450 * class. Second time round, they match at "-AB--A(AB)(Z)". 1451 */ 1452 other_last = s; 1453 } 1454 else { 1455 rx_origin = HOP3c(s, -other->min_offset, strbeg); 1456 other_last = HOP3c(s, 1, strend); 1457 } 1458 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1459 " at offset %ld (rx_origin now %" IVdf ")...\n", 1460 (long)(s - strbeg), 1461 (IV)(rx_origin - strbeg) 1462 )); 1463 1464 } 1465 } 1466 else { 1467 DEBUG_OPTIMISE_MORE_r( 1468 Perl_re_printf( aTHX_ 1469 " Check-only match: offset min:%" IVdf " max:%" IVdf 1470 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf 1471 " strend:%" IVdf "\n", 1472 (IV)prog->check_offset_min, 1473 (IV)prog->check_offset_max, 1474 (IV)(check_at-strbeg), 1475 (IV)(rx_origin-strbeg), 1476 (IV)(rx_origin-check_at), 1477 (IV)(strend-strbeg) 1478 ) 1479 ); 1480 } 1481 1482 postprocess_substr_matches: 1483 1484 /* handle the extra constraint of /^.../m if present */ 1485 1486 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { 1487 char *s; 1488 1489 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1490 " looking for /^/m anchor")); 1491 1492 /* we have failed the constraint of a \n before rx_origin. 1493 * Find the next \n, if any, even if it's beyond the current 1494 * anchored and/or floating substrings. Whether we should be 1495 * scanning ahead for the next \n or the next substr is debatable. 1496 * On the one hand you'd expect rare substrings to appear less 1497 * often than \n's. On the other hand, searching for \n means 1498 * we're effectively flipping between check_substr and "\n" on each 1499 * iteration as the current "rarest" candidate string, which 1500 * means for example that we'll quickly reject the whole string if 1501 * hasn't got a \n, rather than trying every substr position 1502 * first 1503 */ 1504 1505 s = HOP3c(strend, - prog->minlen, strpos); 1506 if (s <= rx_origin || 1507 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) 1508 { 1509 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1510 " Did not find /%s^%s/m...\n", 1511 PL_colors[0], PL_colors[1])); 1512 goto fail_finish; 1513 } 1514 1515 /* earliest possible origin is 1 char after the \n. 1516 * (since *rx_origin == '\n', it's safe to ++ here rather than 1517 * HOP(rx_origin, 1)) */ 1518 rx_origin++; 1519 1520 if (prog->substrs->check_ix == 0 /* check is anchored */ 1521 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) 1522 { 1523 /* Position contradicts check-string; either because 1524 * check was anchored (and thus has no wiggle room), 1525 * or check was float and rx_origin is above the float range */ 1526 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1527 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n", 1528 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1529 goto restart; 1530 } 1531 1532 /* if we get here, the check substr must have been float, 1533 * is in range, and we may or may not have had an anchored 1534 * "other" substr which still contradicts */ 1535 assert(prog->substrs->check_ix); /* check is float */ 1536 1537 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { 1538 /* whoops, the anchored "other" substr exists, so we still 1539 * contradict. On the other hand, the float "check" substr 1540 * didn't contradict, so just retry the anchored "other" 1541 * substr */ 1542 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1543 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n", 1544 PL_colors[0], PL_colors[1], 1545 (IV)(rx_origin - strbeg + prog->anchored_offset), 1546 (IV)(rx_origin - strbeg) 1547 )); 1548 goto do_other_substr; 1549 } 1550 1551 /* success: we don't contradict the found floating substring 1552 * (and there's no anchored substr). */ 1553 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1554 " Found /%s^%s/m with rx_origin %ld...\n", 1555 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg))); 1556 } 1557 else { 1558 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1559 " (multiline anchor test skipped)\n")); 1560 } 1561 1562 success_at_start: 1563 1564 1565 /* if we have a starting character class, then test that extra constraint. 1566 * (trie stclasses are too expensive to use here, we are better off to 1567 * leave it to regmatch itself) */ 1568 1569 if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) { 1570 const U8* const str = (U8*)STRING(progi->regstclass); 1571 1572 /* XXX this value could be pre-computed */ 1573 const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT 1574 ? (reginfo->is_utf8_pat 1575 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str) 1576 : (SSize_t)STR_LEN(progi->regstclass)) 1577 : 1); 1578 char * endpos; 1579 char *s; 1580 /* latest pos that a matching float substr constrains rx start to */ 1581 char *rx_max_float = NULL; 1582 1583 /* if the current rx_origin is anchored, either by satisfying an 1584 * anchored substring constraint, or a /^.../m constraint, then we 1585 * can reject the current origin if the start class isn't found 1586 * at the current position. If we have a float-only match, then 1587 * rx_origin is constrained to a range; so look for the start class 1588 * in that range. if neither, then look for the start class in the 1589 * whole rest of the string */ 1590 1591 /* XXX DAPM it's not clear what the minlen test is for, and why 1592 * it's not used in the floating case. Nothing in the test suite 1593 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. 1594 * Here are some old comments, which may or may not be correct: 1595 * 1596 * minlen == 0 is possible if regstclass is \b or \B, 1597 * and the fixed substr is ''$. 1598 * Since minlen is already taken into account, rx_origin+1 is 1599 * before strend; accidentally, minlen >= 1 guaranties no false 1600 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : 1601 * 0) below assumes that regstclass does not come from lookahead... 1602 * If regstclass takes bytelength more than 1: If charlength==1, OK. 1603 * This leaves EXACTF-ish only, which are dealt with in 1604 * find_byclass(). 1605 */ 1606 1607 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) 1608 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend); 1609 else if (prog->float_substr || prog->float_utf8) { 1610 rx_max_float = HOP3c(check_at, -start_shift, strbeg); 1611 endpos = HOP3clim(rx_max_float, cl_l, strend); 1612 } 1613 else 1614 endpos= strend; 1615 1616 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1617 " looking for class: start_shift: %" IVdf " check_at: %" IVdf 1618 " rx_origin: %" IVdf " endpos: %" IVdf "\n", 1619 (IV)start_shift, (IV)(check_at - strbeg), 1620 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); 1621 1622 s = find_byclass(prog, progi->regstclass, rx_origin, endpos, 1623 reginfo); 1624 if (!s) { 1625 if (endpos == strend) { 1626 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1627 " Could not match STCLASS...\n") ); 1628 goto fail; 1629 } 1630 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1631 " This position contradicts STCLASS...\n") ); 1632 if ((prog->intflags & PREGf_ANCH) && !ml_anch 1633 && !(prog->intflags & PREGf_IMPLICIT)) 1634 goto fail; 1635 1636 /* Contradict one of substrings */ 1637 if (prog->anchored_substr || prog->anchored_utf8) { 1638 if (prog->substrs->check_ix == 1) { /* check is float */ 1639 /* Have both, check_string is floating */ 1640 assert(rx_origin + start_shift <= check_at); 1641 if (rx_origin + start_shift != check_at) { 1642 /* not at latest position float substr could match: 1643 * Recheck anchored substring, but not floating. 1644 * The condition above is in bytes rather than 1645 * chars for efficiency. It's conservative, in 1646 * that it errs on the side of doing 'goto 1647 * do_other_substr'. In this case, at worst, 1648 * an extra anchored search may get done, but in 1649 * practice the extra fbm_instr() is likely to 1650 * get skipped anyway. */ 1651 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1652 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n", 1653 (long)(other_last - strbeg), 1654 (IV)(rx_origin - strbeg) 1655 )); 1656 goto do_other_substr; 1657 } 1658 } 1659 } 1660 else { 1661 /* float-only */ 1662 1663 if (ml_anch) { 1664 /* In the presence of ml_anch, we might be able to 1665 * find another \n without breaking the current float 1666 * constraint. */ 1667 1668 /* strictly speaking this should be HOP3c(..., 1, ...), 1669 * but since we goto a block of code that's going to 1670 * search for the next \n if any, its safe here */ 1671 rx_origin++; 1672 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1673 " about to look for /%s^%s/m starting at rx_origin %ld...\n", 1674 PL_colors[0], PL_colors[1], 1675 (long)(rx_origin - strbeg)) ); 1676 goto postprocess_substr_matches; 1677 } 1678 1679 /* strictly speaking this can never be true; but might 1680 * be if we ever allow intuit without substrings */ 1681 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) 1682 goto fail; 1683 1684 rx_origin = rx_max_float; 1685 } 1686 1687 /* at this point, any matching substrings have been 1688 * contradicted. Start again... */ 1689 1690 rx_origin = HOP3c(rx_origin, 1, strend); 1691 1692 /* uses bytes rather than char calculations for efficiency. 1693 * It's conservative: it errs on the side of doing 'goto restart', 1694 * where there is code that does a proper char-based test */ 1695 if (rx_origin + start_shift + end_shift > strend) { 1696 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1697 " Could not match STCLASS...\n") ); 1698 goto fail; 1699 } 1700 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 1701 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n", 1702 (prog->substrs->check_ix ? "floating" : "anchored"), 1703 (long)(rx_origin + start_shift - strbeg), 1704 (IV)(rx_origin - strbeg) 1705 )); 1706 goto restart; 1707 } 1708 1709 /* Success !!! */ 1710 1711 if (rx_origin != s) { 1712 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1713 " By STCLASS: moving %ld --> %ld\n", 1714 (long)(rx_origin - strbeg), (long)(s - strbeg)) 1715 ); 1716 } 1717 else { 1718 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1719 " Does not contradict STCLASS...\n"); 1720 ); 1721 } 1722 } 1723 1724 /* Decide whether using the substrings helped */ 1725 1726 if (rx_origin != strpos) { 1727 /* Fixed substring is found far enough so that the match 1728 cannot start at strpos. */ 1729 1730 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n")); 1731 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ 1732 } 1733 else { 1734 /* The found rx_origin position does not prohibit matching at 1735 * strpos, so calling intuit didn't gain us anything. Decrement 1736 * the BmUSEFUL() count on the check substring, and if we reach 1737 * zero, free it. */ 1738 if (!(prog->intflags & PREGf_NAUGHTY) 1739 && (utf8_target ? ( 1740 prog->check_utf8 /* Could be deleted already */ 1741 && --BmUSEFUL(prog->check_utf8) < 0 1742 && (prog->check_utf8 == prog->float_utf8) 1743 ) : ( 1744 prog->check_substr /* Could be deleted already */ 1745 && --BmUSEFUL(prog->check_substr) < 0 1746 && (prog->check_substr == prog->float_substr) 1747 ))) 1748 { 1749 /* If flags & SOMETHING - do not do it many times on the same match */ 1750 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n")); 1751 /* XXX Does the destruction order has to change with utf8_target? */ 1752 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); 1753 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); 1754 prog->check_substr = prog->check_utf8 = NULL; /* disable */ 1755 prog->float_substr = prog->float_utf8 = NULL; /* clear */ 1756 check = NULL; /* abort */ 1757 /* XXXX This is a remnant of the old implementation. It 1758 looks wasteful, since now INTUIT can use many 1759 other heuristics. */ 1760 prog->extflags &= ~RXf_USE_INTUIT; 1761 } 1762 } 1763 1764 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 1765 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", 1766 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) ); 1767 1768 return rx_origin; 1769 1770 fail_finish: /* Substring not found */ 1771 if (prog->check_substr || prog->check_utf8) /* could be removed already */ 1772 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ 1773 fail: 1774 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n", 1775 PL_colors[4], PL_colors[5])); 1776 return NULL; 1777} 1778 1779 1780#define DECL_TRIE_TYPE(scan) \ 1781 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ 1782 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \ 1783 trie_utf8l, trie_flu8, trie_flu8_latin } \ 1784 trie_type = ((FLAGS(scan) == EXACT) \ 1785 ? (utf8_target ? trie_utf8 : trie_plain) \ 1786 : (FLAGS(scan) == EXACTL) \ 1787 ? (utf8_target ? trie_utf8l : trie_plain) \ 1788 : (FLAGS(scan) == EXACTFAA) \ 1789 ? (utf8_target \ 1790 ? trie_utf8_exactfa_fold \ 1791 : trie_latin_utf8_exactfa_fold) \ 1792 : (FLAGS(scan) == EXACTFLU8 \ 1793 ? (utf8_target \ 1794 ? trie_flu8 \ 1795 : trie_flu8_latin) \ 1796 : (utf8_target \ 1797 ? trie_utf8_fold \ 1798 : trie_latin_utf8_fold))) 1799 1800/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is 1801 * 'foldbuf+sizeof(foldbuf)' */ 1802#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ 1803STMT_START { \ 1804 STRLEN skiplen; \ 1805 U8 flags = FOLD_FLAGS_FULL; \ 1806 switch (trie_type) { \ 1807 case trie_flu8: \ 1808 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 1809 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1810 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1811 } \ 1812 goto do_trie_utf8_fold; \ 1813 case trie_utf8_exactfa_fold: \ 1814 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1815 /* FALLTHROUGH */ \ 1816 case trie_utf8_fold: \ 1817 do_trie_utf8_fold: \ 1818 if ( foldlen>0 ) { \ 1819 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1820 foldlen -= len; \ 1821 uscan += len; \ 1822 len=0; \ 1823 } else { \ 1824 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \ 1825 flags); \ 1826 len = UTF8_SAFE_SKIP(uc, uc_end); \ 1827 skiplen = UVCHR_SKIP( uvc ); \ 1828 foldlen -= skiplen; \ 1829 uscan = foldbuf + skiplen; \ 1830 } \ 1831 break; \ 1832 case trie_flu8_latin: \ 1833 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 1834 goto do_trie_latin_utf8_fold; \ 1835 case trie_latin_utf8_exactfa_fold: \ 1836 flags |= FOLD_FLAGS_NOMIX_ASCII; \ 1837 /* FALLTHROUGH */ \ 1838 case trie_latin_utf8_fold: \ 1839 do_trie_latin_utf8_fold: \ 1840 if ( foldlen>0 ) { \ 1841 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \ 1842 foldlen -= len; \ 1843 uscan += len; \ 1844 len=0; \ 1845 } else { \ 1846 len = 1; \ 1847 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ 1848 skiplen = UVCHR_SKIP( uvc ); \ 1849 foldlen -= skiplen; \ 1850 uscan = foldbuf + skiplen; \ 1851 } \ 1852 break; \ 1853 case trie_utf8l: \ 1854 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 1855 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ 1856 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \ 1857 } \ 1858 /* FALLTHROUGH */ \ 1859 case trie_utf8: \ 1860 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \ 1861 break; \ 1862 case trie_plain: \ 1863 uvc = (UV)*uc; \ 1864 len = 1; \ 1865 } \ 1866 if (uvc < 256) { \ 1867 charid = trie->charmap[ uvc ]; \ 1868 } \ 1869 else { \ 1870 charid = 0; \ 1871 if (widecharmap) { \ 1872 SV** const svpp = hv_fetch(widecharmap, \ 1873 (char*)&uvc, sizeof(UV), 0); \ 1874 if (svpp) \ 1875 charid = (U16)SvIV(*svpp); \ 1876 } \ 1877 } \ 1878} STMT_END 1879 1880#define DUMP_EXEC_POS(li,s,doutf8,depth) \ 1881 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ 1882 startpos, doutf8, depth) 1883 1884#define GET_ANYOFH_INVLIST(prog, n) \ 1885 GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL) 1886 1887#define REXEC_FBC_UTF8_SCAN(CODE) \ 1888 STMT_START { \ 1889 while (s < strend) { \ 1890 CODE \ 1891 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1892 } \ 1893 } STMT_END 1894 1895#define REXEC_FBC_NON_UTF8_SCAN(CODE) \ 1896 STMT_START { \ 1897 while (s < strend) { \ 1898 CODE \ 1899 s++; \ 1900 } \ 1901 } STMT_END 1902 1903#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ 1904 STMT_START { \ 1905 while (s < strend) { \ 1906 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ 1907 } \ 1908 } STMT_END 1909 1910#define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \ 1911 STMT_START { \ 1912 while (s < strend) { \ 1913 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ 1914 } \ 1915 } STMT_END 1916 1917#define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ 1918 if (COND) { \ 1919 FBC_CHECK_AND_TRY \ 1920 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1921 previous_occurrence_end = s; \ 1922 } \ 1923 else { \ 1924 s += UTF8SKIP(s); \ 1925 } 1926 1927#define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ 1928 if (COND) { \ 1929 FBC_CHECK_AND_TRY \ 1930 s++; \ 1931 previous_occurrence_end = s; \ 1932 } \ 1933 else { \ 1934 s++; \ 1935 } 1936 1937/* We keep track of where the next character should start after an occurrence 1938 * of the one we're looking for. Knowing that, we can see right away if the 1939 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we 1940 * don't accept the 2nd and succeeding adjacent occurrences */ 1941#define FBC_CHECK_AND_TRY \ 1942 if ( ( doevery \ 1943 || s != previous_occurrence_end) \ 1944 && ( reginfo->intuit \ 1945 || (s <= reginfo->strend && regtry(reginfo, &s)))) \ 1946 { \ 1947 goto got_it; \ 1948 } 1949 1950 1951/* These differ from the above macros in that they call a function which 1952 * returns the next occurrence of the thing being looked for in 's'; and 1953 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c) 1954 * */ 1955#define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \ 1956 while (s < strend) { \ 1957 s = (char *) (f); \ 1958 if (s >= strend) { \ 1959 break; \ 1960 } \ 1961 \ 1962 FBC_CHECK_AND_TRY \ 1963 s += UTF8SKIP(s); \ 1964 previous_occurrence_end = s; \ 1965 } 1966 1967#define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \ 1968 while (s < strend) { \ 1969 s = (char *) (f); \ 1970 if (s >= strend) { \ 1971 break; \ 1972 } \ 1973 \ 1974 FBC_CHECK_AND_TRY \ 1975 s++; \ 1976 previous_occurrence_end = s; \ 1977 } 1978 1979/* This is like the above macro except the function returns NULL if there is no 1980 * occurrence, and there is a further condition that must be matched besides 1981 * the function */ 1982#define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \ 1983 while (s < strend) { \ 1984 s = (char *) (f); \ 1985 if (s == NULL) { \ 1986 s = (char *) strend; \ 1987 break; \ 1988 } \ 1989 \ 1990 if (COND) { \ 1991 FBC_CHECK_AND_TRY \ 1992 s += UTF8_SAFE_SKIP(s, reginfo->strend); \ 1993 previous_occurrence_end = s; \ 1994 } \ 1995 else { \ 1996 s += UTF8SKIP(s); \ 1997 } \ 1998 } 1999 2000/* This differs from the above macros in that it is passed a single byte that 2001 * is known to begin the next occurrence of the thing being looked for in 's'. 2002 * It does a memchr to find the next occurrence of 'byte', before trying 'COND' 2003 * at that position. */ 2004#define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \ 2005 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \ 2006 COND) 2007 2008/* This is like the function above, but takes an entire string to look for 2009 * instead of a single byte */ 2010#define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \ 2011 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \ 2012 ninstr(s, strend, substr, substr_end), \ 2013 COND) 2014 2015/* The four macros below are slightly different versions of the same logic. 2016 * 2017 * The first is for /a and /aa when the target string is UTF-8. This can only 2018 * match ascii, but it must advance based on UTF-8. The other three handle 2019 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are 2020 * looking for the boundary (or non-boundary) between a word and non-word 2021 * character. The utf8 and non-utf8 cases have the same logic, but the details 2022 * must be different. Find the "wordness" of the character just prior to this 2023 * one, and compare it with the wordness of this one. If they differ, we have 2024 * a boundary. At the beginning of the string, pretend that the previous 2025 * character was a new-line. 2026 * 2027 * All these macros uncleanly have side-effects with each other and outside 2028 * variables. So far it's been too much trouble to clean-up 2029 * 2030 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is 2031 * a word character or not. 2032 * IF_SUCCESS is code to do if it finds that we are at a boundary between 2033 * word/non-word 2034 * IF_FAIL is code to do if we aren't at a boundary between word/non-word 2035 * 2036 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we 2037 * are looking for a boundary or for a non-boundary. If we are looking for a 2038 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and 2039 * see if this tentative match actually works, and if so, to quit the loop 2040 * here. And vice-versa if we are looking for a non-boundary. 2041 * 2042 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and 2043 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of 2044 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be 2045 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal 2046 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that 2047 * complement. But in that branch we complement tmp, meaning that at the 2048 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), 2049 * which means at the top of the loop in the next iteration, it is 2050 * TEST_NON_UTF8(s-1) */ 2051#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2052 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 2053 tmp = TEST_NON_UTF8(tmp); \ 2054 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ 2055 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ 2056 tmp = !tmp; \ 2057 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ 2058 } \ 2059 else { \ 2060 IF_FAIL; \ 2061 } \ 2062 ); \ 2063 2064/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and 2065 * TEST_UTF8 is a macro that for the same input code points returns identically 2066 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an 2067 * end pointer as well) */ 2068#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ 2069 if (s == reginfo->strbeg) { \ 2070 tmp = '\n'; \ 2071 } \ 2072 else { /* Back-up to the start of the previous character */ \ 2073 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ 2074 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ 2075 0, UTF8_ALLOW_DEFAULT); \ 2076 } \ 2077 tmp = TEST_UV(tmp); \ 2078 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \ 2079 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \ 2080 tmp = !tmp; \ 2081 IF_SUCCESS; \ 2082 } \ 2083 else { \ 2084 IF_FAIL; \ 2085 } \ 2086 ); 2087 2088/* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the 2089 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases, 2090 * set-up by the FBC_BOUND, etc macros below */ 2091#define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2092 UTF8_CODE; \ 2093 /* Here, things have been set up by the previous code so that tmp is the \ 2094 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \ 2095 * against the EOS, which we treat as a \n */ \ 2096 if (tmp == ! TEST_NON_UTF8('\n')) { \ 2097 IF_SUCCESS; \ 2098 } \ 2099 else { \ 2100 IF_FAIL; \ 2101 } 2102 2103/* Same as the macro above, but the target isn't UTF-8 */ 2104#define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ 2105 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ 2106 tmp = TEST_NON_UTF8(tmp); \ 2107 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \ 2108 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \ 2109 IF_SUCCESS; \ 2110 tmp = !tmp; \ 2111 } \ 2112 else { \ 2113 IF_FAIL; \ 2114 } \ 2115 ); \ 2116 /* Here, things have been set up by the previous code so that tmp is \ 2117 * the return of TEST_NON_UTF8(s-1). We also have to check if this \ 2118 * matches against the EOS, which we treat as a \n */ \ 2119 if (tmp == ! TEST_NON_UTF8('\n')) { \ 2120 IF_SUCCESS; \ 2121 } \ 2122 else { \ 2123 IF_FAIL; \ 2124 } 2125 2126/* This is the macro to use when we want to see if something that looks like it 2127 * could match, actually does, and if so exits the loop. It needs to be used 2128 * only for bounds checking macros, as it allows for matching beyond the end of 2129 * string (which should be zero length without having to look at the string 2130 * contents) */ 2131#define REXEC_FBC_TRYIT \ 2132 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ 2133 goto got_it 2134 2135/* The only difference between the BOUND and NBOUND cases is that 2136 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in 2137 * NBOUND. This is accomplished by passing it as either the if or else clause, 2138 * with the other one being empty (PLACEHOLDER is defined as empty). 2139 * 2140 * The TEST_FOO parameters are for operating on different forms of input, but 2141 * all should be ones that return identically for the same underlying code 2142 * points */ 2143 2144#define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 2145 FBC_BOUND_COMMON_UTF8( \ 2146 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ 2147 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2148 2149#define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \ 2150 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2151 2152#define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \ 2153 FBC_BOUND_COMMON_UTF8( \ 2154 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\ 2155 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2156 2157#define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \ 2158 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) 2159 2160#define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ 2161 FBC_BOUND_COMMON_UTF8( \ 2162 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2163 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2164 2165#define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \ 2166 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2167 2168#define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \ 2169 FBC_BOUND_COMMON_UTF8( \ 2170 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ 2171 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2172 2173#define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \ 2174 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) 2175 2176#ifdef DEBUGGING 2177static IV 2178S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) { 2179 IV cp_out = _invlist_search(invlist, cp_in); 2180 assert(cp_out >= 0); 2181 return cp_out; 2182} 2183# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2184 invmap[S_get_break_val_cp_checked(invlist, cp)] 2185#else 2186# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ 2187 invmap[_invlist_search(invlist, cp)] 2188#endif 2189 2190/* Takes a pointer to an inversion list, a pointer to its corresponding 2191 * inversion map, and a code point, and returns the code point's value 2192 * according to the two arrays. It assumes that all code points have a value. 2193 * This is used as the base macro for macros for particular properties */ 2194#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \ 2195 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) 2196 2197/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead 2198 * of a code point, returning the value for the first code point in the string. 2199 * And it takes the particular macro name that finds the desired value given a 2200 * code point. Merely convert the UTF-8 to code point and call the cp macro */ 2201#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \ 2202 (__ASSERT_(pos < strend) \ 2203 /* Note assumes is valid UTF-8 */ \ 2204 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL)))) 2205 2206/* Returns the GCB value for the input code point */ 2207#define getGCB_VAL_CP(cp) \ 2208 _generic_GET_BREAK_VAL_CP( \ 2209 PL_GCB_invlist, \ 2210 _Perl_GCB_invmap, \ 2211 (cp)) 2212 2213/* Returns the GCB value for the first code point in the UTF-8 encoded string 2214 * bounded by pos and strend */ 2215#define getGCB_VAL_UTF8(pos, strend) \ 2216 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend) 2217 2218/* Returns the LB value for the input code point */ 2219#define getLB_VAL_CP(cp) \ 2220 _generic_GET_BREAK_VAL_CP( \ 2221 PL_LB_invlist, \ 2222 _Perl_LB_invmap, \ 2223 (cp)) 2224 2225/* Returns the LB value for the first code point in the UTF-8 encoded string 2226 * bounded by pos and strend */ 2227#define getLB_VAL_UTF8(pos, strend) \ 2228 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend) 2229 2230 2231/* Returns the SB value for the input code point */ 2232#define getSB_VAL_CP(cp) \ 2233 _generic_GET_BREAK_VAL_CP( \ 2234 PL_SB_invlist, \ 2235 _Perl_SB_invmap, \ 2236 (cp)) 2237 2238/* Returns the SB value for the first code point in the UTF-8 encoded string 2239 * bounded by pos and strend */ 2240#define getSB_VAL_UTF8(pos, strend) \ 2241 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend) 2242 2243/* Returns the WB value for the input code point */ 2244#define getWB_VAL_CP(cp) \ 2245 _generic_GET_BREAK_VAL_CP( \ 2246 PL_WB_invlist, \ 2247 _Perl_WB_invmap, \ 2248 (cp)) 2249 2250/* Returns the WB value for the first code point in the UTF-8 encoded string 2251 * bounded by pos and strend */ 2252#define getWB_VAL_UTF8(pos, strend) \ 2253 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend) 2254 2255/* We know what class REx starts with. Try to find this position... */ 2256/* if reginfo->intuit, its a dryrun */ 2257/* annoyingly all the vars in this routine have different names from their counterparts 2258 in regmatch. /grrr */ 2259STATIC char * 2260S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 2261 const char *strend, regmatch_info *reginfo) 2262{ 2263 2264 /* TRUE if x+ need not match at just the 1st pos of run of x's */ 2265 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; 2266 2267 char *pat_string; /* The pattern's exactish string */ 2268 char *pat_end; /* ptr to end char of pat_string */ 2269 re_fold_t folder; /* Function for computing non-utf8 folds */ 2270 const U8 *fold_array; /* array for folding ords < 256 */ 2271 STRLEN ln; 2272 STRLEN lnc; 2273 U8 c1; 2274 U8 c2; 2275 char *e = NULL; 2276 2277 /* In some cases we accept only the first occurrence of 'x' in a sequence of 2278 * them. This variable points to just beyond the end of the previous 2279 * occurrence of 'x', hence we can tell if we are in a sequence. (Having 2280 * it point to beyond the 'x' allows us to work for UTF-8 without having to 2281 * hop back.) */ 2282 char * previous_occurrence_end = 0; 2283 2284 I32 tmp; /* Scratch variable */ 2285 const bool utf8_target = reginfo->is_utf8_target; 2286 UV utf8_fold_flags = 0; 2287 const bool is_utf8_pat = reginfo->is_utf8_pat; 2288 bool to_complement = FALSE; /* Invert the result? Taking the xor of this 2289 with a result inverts that result, as 0^1 = 2290 1 and 1^1 = 0 */ 2291 char_class_number_ classnum; 2292 2293 RXi_GET_DECL(prog,progi); 2294 2295 PERL_ARGS_ASSERT_FIND_BYCLASS; 2296 2297 /* We know what class it must start with. The case statements below have 2298 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb' 2299 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern 2300 * ('p8' and 'pb'. */ 2301 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) { 2302 SV * anyofh_list; 2303 2304 case ANYOFPOSIXL_t8_pb: 2305 case ANYOFPOSIXL_t8_p8: 2306 case ANYOFL_t8_pb: 2307 case ANYOFL_t8_p8: 2308 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2309 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); 2310 2311 /* FALLTHROUGH */ 2312 2313 case ANYOFD_t8_pb: 2314 case ANYOFD_t8_p8: 2315 case ANYOF_t8_pb: 2316 case ANYOF_t8_p8: 2317 REXEC_FBC_UTF8_CLASS_SCAN( 2318 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)); 2319 break; 2320 2321 case ANYOFPOSIXL_tb_pb: 2322 case ANYOFPOSIXL_tb_p8: 2323 case ANYOFL_tb_pb: 2324 case ANYOFL_tb_p8: 2325 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2326 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); 2327 2328 /* FALLTHROUGH */ 2329 2330 case ANYOFD_tb_pb: 2331 case ANYOFD_tb_p8: 2332 case ANYOF_tb_pb: 2333 case ANYOF_tb_p8: 2334 if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) { 2335 /* We know that s is in the bitmap range since the target isn't 2336 * UTF-8, so what happens for out-of-range values is not relevant, 2337 * so exclude that from the flags */ 2338 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s))); 2339 } 2340 else { 2341 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 2342 0)); 2343 } 2344 break; 2345 2346 case ANYOFM_tb_pb: /* ARG1u() is the base byte; FLAGS() the mask byte */ 2347 case ANYOFM_tb_p8: 2348 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( 2349 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c))); 2350 break; 2351 2352 case ANYOFM_t8_pb: 2353 case ANYOFM_t8_p8: 2354 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But 2355 * we do anyway for performance reasons, as otherwise we would have to 2356 * examine all the continuation characters */ 2357 REXEC_FBC_UTF8_FIND_NEXT_SCAN( 2358 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c))); 2359 break; 2360 2361 case NANYOFM_tb_pb: 2362 case NANYOFM_tb_p8: 2363 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( 2364 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG1u(c), FLAGS(c))); 2365 break; 2366 2367 case NANYOFM_t8_pb: 2368 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8 2369 variants. */ 2370 REXEC_FBC_UTF8_FIND_NEXT_SCAN( 2371 (char *) find_span_end_mask((U8 *) s, (U8 *) strend, 2372 (U8) ARG1u(c), FLAGS(c))); 2373 break; 2374 2375 /* These nodes all require at least one code point to be in UTF-8 to 2376 * match */ 2377 case ANYOFH_tb_pb: 2378 case ANYOFH_tb_p8: 2379 case ANYOFHb_tb_pb: 2380 case ANYOFHb_tb_p8: 2381 case ANYOFHbbm_tb_pb: 2382 case ANYOFHbbm_tb_p8: 2383 case ANYOFHr_tb_pb: 2384 case ANYOFHr_tb_p8: 2385 case ANYOFHs_tb_pb: 2386 case ANYOFHs_tb_p8: 2387 case EXACTFLU8_tb_pb: 2388 case EXACTFLU8_tb_p8: 2389 case EXACTFU_REQ8_tb_pb: 2390 case EXACTFU_REQ8_tb_p8: 2391 break; 2392 2393 case ANYOFH_t8_pb: 2394 case ANYOFH_t8_p8: 2395 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2396 REXEC_FBC_UTF8_CLASS_SCAN( 2397 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2398 && _invlist_contains_cp(anyofh_list, 2399 utf8_to_uvchr_buf((U8 *) s, 2400 (U8 *) strend, 2401 NULL)))); 2402 break; 2403 2404 case ANYOFHb_t8_pb: 2405 case ANYOFHb_t8_p8: 2406 { 2407 /* We know what the first byte of any matched string should be. */ 2408 U8 first_byte = FLAGS(c); 2409 2410 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2411 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2412 _invlist_contains_cp(anyofh_list, 2413 utf8_to_uvchr_buf((U8 *) s, 2414 (U8 *) strend, 2415 NULL))); 2416 } 2417 break; 2418 2419 case ANYOFHbbm_t8_pb: 2420 case ANYOFHbbm_t8_p8: 2421 { 2422 /* We know what the first byte of any matched string should be. */ 2423 U8 first_byte = FLAGS(c); 2424 2425 /* And a bitmap defines all the legal 2nd byte matches */ 2426 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2427 ( s < strend 2428 && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap, 2429 (U8) s[1] & UTF_CONTINUATION_MASK))); 2430 } 2431 break; 2432 2433 case ANYOFHr_t8_pb: 2434 case ANYOFHr_t8_p8: 2435 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2436 REXEC_FBC_UTF8_CLASS_SCAN( 2437 ( inRANGE(NATIVE_UTF8_TO_I8(*s), 2438 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), 2439 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) 2440 && _invlist_contains_cp(anyofh_list, 2441 utf8_to_uvchr_buf((U8 *) s, 2442 (U8 *) strend, 2443 NULL)))); 2444 break; 2445 2446 case ANYOFHs_t8_pb: 2447 case ANYOFHs_t8_p8: 2448 anyofh_list = GET_ANYOFH_INVLIST(prog, c); 2449 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN( 2450 ((struct regnode_anyofhs *) c)->string, 2451 /* Note FLAGS is the string length in this regnode */ 2452 ((struct regnode_anyofhs *) c)->string + FLAGS(c), 2453 _invlist_contains_cp(anyofh_list, 2454 utf8_to_uvchr_buf((U8 *) s, 2455 (U8 *) strend, 2456 NULL))); 2457 break; 2458 2459 case ANYOFR_tb_pb: 2460 case ANYOFR_tb_p8: 2461 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, 2462 ANYOFRbase(c), ANYOFRdelta(c))); 2463 break; 2464 2465 case ANYOFR_t8_pb: 2466 case ANYOFR_t8_p8: 2467 REXEC_FBC_UTF8_CLASS_SCAN( 2468 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) 2469 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2470 (U8 *) strend, 2471 NULL), 2472 ANYOFRbase(c), ANYOFRdelta(c)))); 2473 break; 2474 2475 case ANYOFRb_tb_pb: 2476 case ANYOFRb_tb_p8: 2477 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, 2478 ANYOFRbase(c), ANYOFRdelta(c))); 2479 break; 2480 2481 case ANYOFRb_t8_pb: 2482 case ANYOFRb_t8_p8: 2483 { /* We know what the first byte of any matched string should be */ 2484 U8 first_byte = FLAGS(c); 2485 2486 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, 2487 withinCOUNT(utf8_to_uvchr_buf((U8 *) s, 2488 (U8 *) strend, 2489 NULL), 2490 ANYOFRbase(c), ANYOFRdelta(c))); 2491 } 2492 break; 2493 2494 case EXACTFAA_tb_pb: 2495 2496 /* Latin1 folds are not affected by /a, except it excludes the sharp s, 2497 * which these functions don't handle anyway */ 2498 fold_array = PL_fold_latin1; 2499 folder = S_foldEQ_latin1_s2_folded; 2500 goto do_exactf_non_utf8; 2501 2502 case EXACTF_tb_pb: 2503 fold_array = PL_fold; 2504 folder = Perl_foldEQ; 2505 goto do_exactf_non_utf8; 2506 2507 case EXACTFL_tb_pb: 2508 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2509 2510 if (IN_UTF8_CTYPE_LOCALE) { 2511 utf8_fold_flags = FOLDEQ_LOCALE; 2512 goto do_exactf_utf8; 2513 } 2514 2515 fold_array = PL_fold_locale; 2516 folder = Perl_foldEQ_locale; 2517 goto do_exactf_non_utf8; 2518 2519 case EXACTFU_tb_pb: 2520 /* Any 'ss' in the pattern should have been replaced by regcomp, so we 2521 * don't have to worry here about this single special case in the 2522 * Latin1 range */ 2523 fold_array = PL_fold_latin1; 2524 folder = S_foldEQ_latin1_s2_folded; 2525 2526 /* FALLTHROUGH */ 2527 2528 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there 2529 are no glitches with fold-length differences 2530 between the target string and pattern */ 2531 2532 /* The idea in the non-utf8 EXACTF* cases is to first find the first 2533 * character of the EXACTF* node and then, if necessary, 2534 * case-insensitively compare the full text of the node. c1 is the 2535 * first character. c2 is its fold. This logic will not work for 2536 * Unicode semantics and the german sharp ss, which hence should not be 2537 * compiled into a node that gets here. */ 2538 pat_string = STRINGs(c); 2539 ln = STR_LENs(c); /* length to match in octets/bytes */ 2540 2541 /* We know that we have to match at least 'ln' bytes (which is the same 2542 * as characters, since not utf8). If we have to match 3 characters, 2543 * and there are only 2 available, we know without trying that it will 2544 * fail; so don't start a match past the required minimum number from 2545 * the far end */ 2546 e = HOP3c(strend, -((SSize_t)ln), s); 2547 if (e < s) 2548 break; 2549 2550 c1 = *pat_string; 2551 c2 = fold_array[c1]; 2552 if (c1 == c2) { /* If char and fold are the same */ 2553 while (s <= e) { 2554 s = (char *) memchr(s, c1, e + 1 - s); 2555 if (s == NULL) { 2556 break; 2557 } 2558 2559 /* Check that the rest of the node matches */ 2560 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1)) 2561 && (reginfo->intuit || regtry(reginfo, &s)) ) 2562 { 2563 goto got_it; 2564 } 2565 s++; 2566 } 2567 } 2568 else { 2569 U8 bits_differing = c1 ^ c2; 2570 2571 /* If the folds differ in one bit position only, we can mask to 2572 * match either of them, and can use this faster find method. Both 2573 * ASCII and EBCDIC tend to have their case folds differ in only 2574 * one position, so this is very likely */ 2575 if (LIKELY(PL_bitcount[bits_differing] == 1)) { 2576 bits_differing = ~ bits_differing; 2577 while (s <= e) { 2578 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1, 2579 (c1 & bits_differing), bits_differing); 2580 if (s > e) { 2581 break; 2582 } 2583 2584 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1)) 2585 && (reginfo->intuit || regtry(reginfo, &s)) ) 2586 { 2587 goto got_it; 2588 } 2589 s++; 2590 } 2591 } 2592 else { /* Otherwise, stuck with looking byte-at-a-time. This 2593 should actually happen only in EXACTFL nodes */ 2594 while (s <= e) { 2595 if ( (*(U8*)s == c1 || *(U8*)s == c2) 2596 && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1)) 2597 && (reginfo->intuit || regtry(reginfo, &s)) ) 2598 { 2599 goto got_it; 2600 } 2601 s++; 2602 } 2603 } 2604 } 2605 break; 2606 2607 case EXACTFAA_tb_p8: 2608 case EXACTFAA_t8_p8: 2609 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII 2610 |FOLDEQ_S2_ALREADY_FOLDED 2611 |FOLDEQ_S2_FOLDS_SANE; 2612 goto do_exactf_utf8; 2613 2614 case EXACTFAA_NO_TRIE_tb_pb: 2615 case EXACTFAA_NO_TRIE_t8_pb: 2616 case EXACTFAA_t8_pb: 2617 2618 /* Here, and elsewhere in this file, the reason we can't consider a 2619 * non-UTF-8 pattern already folded in the presence of a UTF-8 target 2620 * is because any MICRO SIGN in the pattern won't be folded. Since the 2621 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider 2622 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */ 2623 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 2624 goto do_exactf_utf8; 2625 2626 case EXACTFL_tb_p8: 2627 case EXACTFL_t8_pb: 2628 case EXACTFL_t8_p8: 2629 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2630 utf8_fold_flags = FOLDEQ_LOCALE; 2631 goto do_exactf_utf8; 2632 2633 case EXACTFLU8_t8_pb: 2634 case EXACTFLU8_t8_p8: 2635 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 2636 | FOLDEQ_S2_FOLDS_SANE; 2637 goto do_exactf_utf8; 2638 2639 case EXACTFU_REQ8_t8_p8: 2640 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2641 goto do_exactf_utf8; 2642 2643 case EXACTFU_tb_p8: 2644 case EXACTFU_t8_pb: 2645 case EXACTFU_t8_p8: 2646 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; 2647 goto do_exactf_utf8; 2648 2649 /* The following are problematic even though pattern isn't UTF-8. Use 2650 * full functionality normally not done except for UTF-8. */ 2651 case EXACTF_t8_pb: 2652 case EXACTFUP_tb_pb: 2653 case EXACTFUP_t8_pb: 2654 2655 do_exactf_utf8: 2656 { 2657 unsigned expansion; 2658 2659 /* If one of the operands is in utf8, we can't use the simpler 2660 * folding above, due to the fact that many different characters 2661 * can have the same fold, or portion of a fold, or different- 2662 * length fold */ 2663 pat_string = STRINGs(c); 2664 ln = STR_LENs(c); /* length to match in octets/bytes */ 2665 pat_end = pat_string + ln; 2666 lnc = is_utf8_pat /* length to match in characters */ 2667 ? utf8_length((U8 *) pat_string, (U8 *) pat_end) 2668 : ln; 2669 2670 /* We have 'lnc' characters to match in the pattern, but because of 2671 * multi-character folding, each character in the target can match 2672 * up to 3 characters (Unicode guarantees it will never exceed 2673 * this) if it is utf8-encoded; and up to 2 if not (based on the 2674 * fact that the Latin 1 folds are already determined, and the only 2675 * multi-char fold in that range is the sharp-s folding to 'ss'. 2676 * Thus, a pattern character can match as little as 1/3 of a string 2677 * character. Adjust lnc accordingly, rounding up, so that if we 2678 * need to match at least 4+1/3 chars, that really is 5. */ 2679 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; 2680 lnc = (lnc + expansion - 1) / expansion; 2681 2682 /* As in the non-UTF8 case, if we have to match 3 characters, and 2683 * only 2 are left, it's guaranteed to fail, so don't start a match 2684 * that would require us to go beyond the end of the string */ 2685 e = HOP3c(strend, -((SSize_t)lnc), s); 2686 2687 /* XXX Note that we could recalculate e to stop the loop earlier, 2688 * as the worst case expansion above will rarely be met, and as we 2689 * go along we would usually find that e moves further to the left. 2690 * This would happen only after we reached the point in the loop 2691 * where if there were no expansion we should fail. Unclear if 2692 * worth the expense */ 2693 2694 while (s <= e) { 2695 char *my_strend= (char *)strend; 2696 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, 2697 pat_string, NULL, ln, is_utf8_pat, 2698 utf8_fold_flags) 2699 && (reginfo->intuit || regtry(reginfo, &s)) ) 2700 { 2701 goto got_it; 2702 } 2703 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; 2704 } 2705 } 2706 break; 2707 2708 case BOUNDA_tb_pb: 2709 case BOUNDA_tb_p8: 2710 case BOUND_tb_pb: /* /d without utf8 target is /a */ 2711 case BOUND_tb_p8: 2712 /* regcomp.c makes sure that these only have the traditional \b 2713 * meaning. */ 2714 assert(FLAGS(c) == TRADITIONAL_BOUND); 2715 2716 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A); 2717 break; 2718 2719 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ 2720 case BOUNDA_t8_p8: 2721 /* regcomp.c makes sure that these only have the traditional \b 2722 * meaning. */ 2723 assert(FLAGS(c) == TRADITIONAL_BOUND); 2724 2725 FBC_BOUND_A_UTF8(isWORDCHAR_A); 2726 break; 2727 2728 case NBOUNDA_tb_pb: 2729 case NBOUNDA_tb_p8: 2730 case NBOUND_tb_pb: /* /d without utf8 target is /a */ 2731 case NBOUND_tb_p8: 2732 /* regcomp.c makes sure that these only have the traditional \b 2733 * meaning. */ 2734 assert(FLAGS(c) == TRADITIONAL_BOUND); 2735 2736 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A); 2737 break; 2738 2739 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ 2740 case NBOUNDA_t8_p8: 2741 /* regcomp.c makes sure that these only have the traditional \b 2742 * meaning. */ 2743 assert(FLAGS(c) == TRADITIONAL_BOUND); 2744 2745 FBC_NBOUND_A_UTF8(isWORDCHAR_A); 2746 break; 2747 2748 case NBOUNDU_tb_pb: 2749 case NBOUNDU_tb_p8: 2750 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2751 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1); 2752 break; 2753 } 2754 2755 to_complement = 1; 2756 goto do_boundu_non_utf8; 2757 2758 case NBOUNDL_tb_pb: 2759 case NBOUNDL_tb_p8: 2760 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2761 if (FLAGS(c) == TRADITIONAL_BOUND) { 2762 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC); 2763 break; 2764 } 2765 2766 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2767 2768 to_complement = 1; 2769 goto do_boundu_non_utf8; 2770 2771 case BOUNDL_tb_pb: 2772 case BOUNDL_tb_p8: 2773 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2774 if (FLAGS(c) == TRADITIONAL_BOUND) { 2775 FBC_BOUND_NON_UTF8(isWORDCHAR_LC); 2776 break; 2777 } 2778 2779 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2780 2781 goto do_boundu_non_utf8; 2782 2783 case BOUNDU_tb_pb: 2784 case BOUNDU_tb_p8: 2785 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2786 FBC_BOUND_NON_UTF8(isWORDCHAR_L1); 2787 break; 2788 } 2789 2790 do_boundu_non_utf8: 2791 if (s == reginfo->strbeg) { 2792 if (reginfo->intuit || regtry(reginfo, &s)) 2793 { 2794 goto got_it; 2795 } 2796 2797 /* Didn't match. Try at the next position (if there is one) */ 2798 s++; 2799 if (UNLIKELY(s >= reginfo->strend)) { 2800 break; 2801 } 2802 } 2803 2804 switch((bound_type) FLAGS(c)) { 2805 case TRADITIONAL_BOUND: /* Should have already been handled */ 2806 assert(0); 2807 break; 2808 2809 case GCB_BOUND: 2810 /* Not utf8. Everything is a GCB except between CR and LF */ 2811 while (s < strend) { 2812 if ((to_complement ^ ( UCHARAT(s - 1) != '\r' 2813 || UCHARAT(s) != '\n')) 2814 && (reginfo->intuit || regtry(reginfo, &s))) 2815 { 2816 goto got_it; 2817 } 2818 s++; 2819 } 2820 2821 break; 2822 2823 case LB_BOUND: 2824 { 2825 LB_enum before = getLB_VAL_CP((U8) *(s -1)); 2826 while (s < strend) { 2827 LB_enum after = getLB_VAL_CP((U8) *s); 2828 if (to_complement ^ isLB(before, 2829 after, 2830 (U8*) reginfo->strbeg, 2831 (U8*) s, 2832 (U8*) reginfo->strend, 2833 0 /* target not utf8 */ ) 2834 && (reginfo->intuit || regtry(reginfo, &s))) 2835 { 2836 goto got_it; 2837 } 2838 before = after; 2839 s++; 2840 } 2841 } 2842 2843 break; 2844 2845 case SB_BOUND: 2846 { 2847 SB_enum before = getSB_VAL_CP((U8) *(s -1)); 2848 while (s < strend) { 2849 SB_enum after = getSB_VAL_CP((U8) *s); 2850 if ((to_complement ^ isSB(before, 2851 after, 2852 (U8*) reginfo->strbeg, 2853 (U8*) s, 2854 (U8*) reginfo->strend, 2855 0 /* target not utf8 */ )) 2856 && (reginfo->intuit || regtry(reginfo, &s))) 2857 { 2858 goto got_it; 2859 } 2860 before = after; 2861 s++; 2862 } 2863 } 2864 2865 break; 2866 2867 case WB_BOUND: 2868 { 2869 WB_enum previous = WB_UNKNOWN; 2870 WB_enum before = getWB_VAL_CP((U8) *(s -1)); 2871 while (s < strend) { 2872 WB_enum after = getWB_VAL_CP((U8) *s); 2873 if ((to_complement ^ isWB(previous, 2874 before, 2875 after, 2876 (U8*) reginfo->strbeg, 2877 (U8*) s, 2878 (U8*) reginfo->strend, 2879 0 /* target not utf8 */ )) 2880 && (reginfo->intuit || regtry(reginfo, &s))) 2881 { 2882 goto got_it; 2883 } 2884 previous = before; 2885 before = after; 2886 s++; 2887 } 2888 } 2889 } 2890 2891 /* Here are at the final position in the target string, which is a 2892 * boundary by definition, so matches, depending on other constraints. 2893 * */ 2894 if ( reginfo->intuit 2895 || (s <= reginfo->strend && regtry(reginfo, &s))) 2896 { 2897 goto got_it; 2898 } 2899 2900 break; 2901 2902 case BOUNDL_t8_pb: 2903 case BOUNDL_t8_p8: 2904 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2905 if (FLAGS(c) == TRADITIONAL_BOUND) { 2906 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, 2907 isWORDCHAR_LC_utf8_safe); 2908 break; 2909 } 2910 2911 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2912 2913 to_complement = 1; 2914 goto do_boundu_utf8; 2915 2916 case NBOUNDL_t8_pb: 2917 case NBOUNDL_t8_p8: 2918 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 2919 if (FLAGS(c) == TRADITIONAL_BOUND) { 2920 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, 2921 isWORDCHAR_LC_utf8_safe); 2922 break; 2923 } 2924 2925 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 2926 2927 to_complement = 1; 2928 goto do_boundu_utf8; 2929 2930 case NBOUND_t8_pb: 2931 case NBOUND_t8_p8: 2932 /* regcomp.c makes sure that these only have the traditional \b 2933 * meaning. */ 2934 assert(FLAGS(c) == TRADITIONAL_BOUND); 2935 2936 /* FALLTHROUGH */ 2937 2938 case NBOUNDU_t8_pb: 2939 case NBOUNDU_t8_p8: 2940 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2941 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, 2942 isWORDCHAR_utf8_safe); 2943 break; 2944 } 2945 2946 to_complement = 1; 2947 goto do_boundu_utf8; 2948 2949 case BOUND_t8_pb: 2950 case BOUND_t8_p8: 2951 /* regcomp.c makes sure that these only have the traditional \b 2952 * meaning. */ 2953 assert(FLAGS(c) == TRADITIONAL_BOUND); 2954 2955 /* FALLTHROUGH */ 2956 2957 case BOUNDU_t8_pb: 2958 case BOUNDU_t8_p8: 2959 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { 2960 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); 2961 break; 2962 } 2963 2964 do_boundu_utf8: 2965 if (s == reginfo->strbeg) { 2966 if (reginfo->intuit || regtry(reginfo, &s)) 2967 { 2968 goto got_it; 2969 } 2970 2971 /* Didn't match. Try at the next position (if there is one) */ 2972 s += UTF8_SAFE_SKIP(s, reginfo->strend); 2973 if (UNLIKELY(s >= reginfo->strend)) { 2974 break; 2975 } 2976 } 2977 2978 switch((bound_type) FLAGS(c)) { 2979 case TRADITIONAL_BOUND: /* Should have already been handled */ 2980 assert(0); 2981 break; 2982 2983 case GCB_BOUND: 2984 { 2985 GCB_enum before = getGCB_VAL_UTF8( 2986 reghop3((U8*)s, -1, 2987 (U8*)(reginfo->strbeg)), 2988 (U8*) reginfo->strend); 2989 while (s < strend) { 2990 GCB_enum after = getGCB_VAL_UTF8((U8*) s, 2991 (U8*) reginfo->strend); 2992 if ( (to_complement ^ isGCB(before, 2993 after, 2994 (U8*) reginfo->strbeg, 2995 (U8*) s, 2996 1 /* target is utf8 */ )) 2997 && (reginfo->intuit || regtry(reginfo, &s))) 2998 { 2999 goto got_it; 3000 } 3001 before = after; 3002 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3003 } 3004 } 3005 break; 3006 3007 case LB_BOUND: 3008 { 3009 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, 3010 -1, 3011 (U8*)(reginfo->strbeg)), 3012 (U8*) reginfo->strend); 3013 while (s < strend) { 3014 LB_enum after = getLB_VAL_UTF8((U8*) s, 3015 (U8*) reginfo->strend); 3016 if (to_complement ^ isLB(before, 3017 after, 3018 (U8*) reginfo->strbeg, 3019 (U8*) s, 3020 (U8*) reginfo->strend, 3021 1 /* target is utf8 */ ) 3022 && (reginfo->intuit || regtry(reginfo, &s))) 3023 { 3024 goto got_it; 3025 } 3026 before = after; 3027 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3028 } 3029 } 3030 3031 break; 3032 3033 case SB_BOUND: 3034 { 3035 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s, 3036 -1, 3037 (U8*)(reginfo->strbeg)), 3038 (U8*) reginfo->strend); 3039 while (s < strend) { 3040 SB_enum after = getSB_VAL_UTF8((U8*) s, 3041 (U8*) reginfo->strend); 3042 if ((to_complement ^ isSB(before, 3043 after, 3044 (U8*) reginfo->strbeg, 3045 (U8*) s, 3046 (U8*) reginfo->strend, 3047 1 /* target is utf8 */ )) 3048 && (reginfo->intuit || regtry(reginfo, &s))) 3049 { 3050 goto got_it; 3051 } 3052 before = after; 3053 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3054 } 3055 } 3056 3057 break; 3058 3059 case WB_BOUND: 3060 { 3061 /* We are at a boundary between char_sub_0 and char_sub_1. 3062 * We also keep track of the value for char_sub_-1 as we 3063 * loop through the line. Context may be needed to make a 3064 * determination, and if so, this can save having to 3065 * recalculate it */ 3066 WB_enum previous = WB_UNKNOWN; 3067 WB_enum before = getWB_VAL_UTF8( 3068 reghop3((U8*)s, 3069 -1, 3070 (U8*)(reginfo->strbeg)), 3071 (U8*) reginfo->strend); 3072 while (s < strend) { 3073 WB_enum after = getWB_VAL_UTF8((U8*) s, 3074 (U8*) reginfo->strend); 3075 if ((to_complement ^ isWB(previous, 3076 before, 3077 after, 3078 (U8*) reginfo->strbeg, 3079 (U8*) s, 3080 (U8*) reginfo->strend, 3081 1 /* target is utf8 */ )) 3082 && (reginfo->intuit || regtry(reginfo, &s))) 3083 { 3084 goto got_it; 3085 } 3086 previous = before; 3087 before = after; 3088 s += UTF8_SAFE_SKIP(s, reginfo->strend); 3089 } 3090 } 3091 } 3092 3093 /* Here are at the final position in the target string, which is a 3094 * boundary by definition, so matches, depending on other constraints. 3095 * */ 3096 3097 if ( reginfo->intuit 3098 || (s <= reginfo->strend && regtry(reginfo, &s))) 3099 { 3100 goto got_it; 3101 } 3102 break; 3103 3104 case LNBREAK_t8_pb: 3105 case LNBREAK_t8_p8: 3106 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend)); 3107 break; 3108 3109 case LNBREAK_tb_pb: 3110 case LNBREAK_tb_p8: 3111 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend)); 3112 break; 3113 3114 /* The argument to all the POSIX node types is the class number to pass 3115 * to generic_isCC_() to build a mask for searching in PL_charclass[] */ 3116 3117 case NPOSIXL_t8_pb: 3118 case NPOSIXL_t8_p8: 3119 to_complement = 1; 3120 /* FALLTHROUGH */ 3121 3122 case POSIXL_t8_pb: 3123 case POSIXL_t8_p8: 3124 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 3125 REXEC_FBC_UTF8_CLASS_SCAN( 3126 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, 3127 (U8 *) strend))); 3128 break; 3129 3130 case NPOSIXL_tb_pb: 3131 case NPOSIXL_tb_p8: 3132 to_complement = 1; 3133 /* FALLTHROUGH */ 3134 3135 case POSIXL_tb_pb: 3136 case POSIXL_tb_p8: 3137 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 3138 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3139 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); 3140 break; 3141 3142 case NPOSIXA_t8_pb: 3143 case NPOSIXA_t8_p8: 3144 /* The complement of something that matches only ASCII matches all 3145 * non-ASCII, plus everything in ASCII that isn't in the class. */ 3146 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend) 3147 || ! generic_isCC_A_(*s, FLAGS(c))); 3148 break; 3149 3150 case POSIXA_t8_pb: 3151 case POSIXA_t8_p8: 3152 /* Don't need to worry about utf8, as it can match only a single 3153 * byte invariant character. But we do anyway for performance reasons, 3154 * as otherwise we would have to examine all the continuation 3155 * characters */ 3156 REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c))); 3157 break; 3158 3159 case NPOSIXD_tb_pb: 3160 case NPOSIXD_tb_p8: 3161 case NPOSIXA_tb_pb: 3162 case NPOSIXA_tb_p8: 3163 to_complement = 1; 3164 /* FALLTHROUGH */ 3165 3166 case POSIXD_tb_pb: 3167 case POSIXD_tb_p8: 3168 case POSIXA_tb_pb: 3169 case POSIXA_tb_p8: 3170 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3171 to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c)))); 3172 break; 3173 3174 case NPOSIXU_tb_pb: 3175 case NPOSIXU_tb_p8: 3176 to_complement = 1; 3177 /* FALLTHROUGH */ 3178 3179 case POSIXU_tb_pb: 3180 case POSIXU_tb_p8: 3181 REXEC_FBC_NON_UTF8_CLASS_SCAN( 3182 to_complement ^ cBOOL(generic_isCC_(*s, 3183 FLAGS(c)))); 3184 break; 3185 3186 case NPOSIXD_t8_pb: 3187 case NPOSIXD_t8_p8: 3188 case NPOSIXU_t8_pb: 3189 case NPOSIXU_t8_p8: 3190 to_complement = 1; 3191 /* FALLTHROUGH */ 3192 3193 case POSIXD_t8_pb: 3194 case POSIXD_t8_p8: 3195 case POSIXU_t8_pb: 3196 case POSIXU_t8_p8: 3197 classnum = (char_class_number_) FLAGS(c); 3198 switch (classnum) { 3199 default: 3200 REXEC_FBC_UTF8_CLASS_SCAN( 3201 to_complement ^ cBOOL(_invlist_contains_cp( 3202 PL_XPosix_ptrs[classnum], 3203 utf8_to_uvchr_buf((U8 *) s, 3204 (U8 *) strend, 3205 NULL)))); 3206 break; 3207 3208 case CC_ENUM_SPACE_: 3209 REXEC_FBC_UTF8_CLASS_SCAN( 3210 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend))); 3211 break; 3212 3213 case CC_ENUM_BLANK_: 3214 REXEC_FBC_UTF8_CLASS_SCAN( 3215 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend))); 3216 break; 3217 3218 case CC_ENUM_XDIGIT_: 3219 REXEC_FBC_UTF8_CLASS_SCAN( 3220 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend))); 3221 break; 3222 3223 case CC_ENUM_VERTSPACE_: 3224 REXEC_FBC_UTF8_CLASS_SCAN( 3225 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend))); 3226 break; 3227 3228 case CC_ENUM_CNTRL_: 3229 REXEC_FBC_UTF8_CLASS_SCAN( 3230 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend))); 3231 break; 3232 } 3233 break; 3234 3235 case AHOCORASICKC_tb_pb: 3236 case AHOCORASICKC_tb_p8: 3237 case AHOCORASICKC_t8_pb: 3238 case AHOCORASICKC_t8_p8: 3239 case AHOCORASICK_tb_pb: 3240 case AHOCORASICK_tb_p8: 3241 case AHOCORASICK_t8_pb: 3242 case AHOCORASICK_t8_p8: 3243 { 3244 DECL_TRIE_TYPE(c); 3245 /* what trie are we using right now */ 3246 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG1u( c ) ]; 3247 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie]; 3248 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); 3249 3250 const char *last_start = strend - trie->minlen; 3251#ifdef DEBUGGING 3252 const char *real_start = s; 3253#endif 3254 STRLEN maxlen = trie->maxlen; 3255 SV *sv_points; 3256 U8 **points; /* map of where we were in the input string 3257 when reading a given char. For ASCII this 3258 is unnecessary overhead as the relationship 3259 is always 1:1, but for Unicode, especially 3260 case folded Unicode this is not true. */ 3261 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 3262 U8 *bitmap=NULL; 3263 3264 3265 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3266 3267 /* We can't just allocate points here. We need to wrap it in 3268 * an SV so it gets freed properly if there is a croak while 3269 * running the match */ 3270 ENTER; 3271 SAVETMPS; 3272 sv_points=newSV(maxlen * sizeof(U8 *)); 3273 SvCUR_set(sv_points, 3274 maxlen * sizeof(U8 *)); 3275 SvPOK_on(sv_points); 3276 sv_2mortal(sv_points); 3277 points=(U8**)SvPV_nolen(sv_points ); 3278 if ( trie_type != trie_utf8_fold 3279 && (trie->bitmap || OP(c)==AHOCORASICKC) ) 3280 { 3281 if (trie->bitmap) 3282 bitmap=(U8*)trie->bitmap; 3283 else 3284 bitmap=(U8*)ANYOF_BITMAP(c); 3285 } 3286 /* this is the Aho-Corasick algorithm modified a touch 3287 to include special handling for long "unknown char" sequences. 3288 The basic idea being that we use AC as long as we are dealing 3289 with a possible matching char, when we encounter an unknown char 3290 (and we have not encountered an accepting state) we scan forward 3291 until we find a legal starting char. 3292 AC matching is basically that of trie matching, except that when 3293 we encounter a failing transition, we fall back to the current 3294 states "fail state", and try the current char again, a process 3295 we repeat until we reach the root state, state 1, or a legal 3296 transition. If we fail on the root state then we can either 3297 terminate if we have reached an accepting state previously, or 3298 restart the entire process from the beginning if we have not. 3299 3300 */ 3301 while (s <= last_start) { 3302 const U32 uniflags = UTF8_ALLOW_DEFAULT; 3303 U8 *uc = (U8*)s; 3304 U16 charid = 0; 3305 U32 base = 1; 3306 U32 state = 1; 3307 UV uvc = 0; 3308 STRLEN len = 0; 3309 STRLEN foldlen = 0; 3310 U8 *uscan = (U8*)NULL; 3311 U8 *leftmost = NULL; 3312#ifdef DEBUGGING 3313 U32 accepted_word= 0; 3314#endif 3315 U32 pointpos = 0; 3316 3317 while ( state && uc <= (U8*)strend ) { 3318 int failed=0; 3319 U32 word = aho->states[ state ].wordnum; 3320 3321 if( state==1 ) { 3322 if ( bitmap ) { 3323 DEBUG_TRIE_EXECUTE_r( 3324 if ( uc <= (U8*)last_start 3325 && !BITMAP_TEST(bitmap,*uc) ) 3326 { 3327 dump_exec_pos( (char *)uc, c, strend, 3328 real_start, 3329 (char *)uc, utf8_target, 0 ); 3330 Perl_re_printf( aTHX_ 3331 " Scanning for legal start char...\n"); 3332 } 3333 ); 3334 if (utf8_target) { 3335 while ( uc <= (U8*)last_start 3336 && !BITMAP_TEST(bitmap,*uc) ) 3337 { 3338 uc += UTF8SKIP(uc); 3339 } 3340 } else { 3341 while ( uc <= (U8*)last_start 3342 && ! BITMAP_TEST(bitmap,*uc) ) 3343 { 3344 uc++; 3345 } 3346 } 3347 s= (char *)uc; 3348 } 3349 if (uc >(U8*)last_start) break; 3350 } 3351 3352 if ( word ) { 3353 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) 3354 % maxlen ]; 3355 if (!leftmost || lpos < leftmost) { 3356 DEBUG_r(accepted_word=word); 3357 leftmost= lpos; 3358 } 3359 if (base==0) break; 3360 3361 } 3362 points[pointpos++ % maxlen]= uc; 3363 if (foldlen || uc < (U8*)strend) { 3364 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 3365 (U8 *) strend, uscan, len, uvc, 3366 charid, foldlen, foldbuf, 3367 uniflags); 3368 DEBUG_TRIE_EXECUTE_r({ 3369 dump_exec_pos( (char *)uc, c, strend, 3370 real_start, s, utf8_target, 0); 3371 Perl_re_printf( aTHX_ 3372 " Charid:%3u CP:%4" UVxf " ", 3373 charid, uvc); 3374 }); 3375 } 3376 else { 3377 len = 0; 3378 charid = 0; 3379 } 3380 3381 3382 do { 3383#ifdef DEBUGGING 3384 word = aho->states[ state ].wordnum; 3385#endif 3386 base = aho->states[ state ].trans.base; 3387 3388 DEBUG_TRIE_EXECUTE_r({ 3389 if (failed) 3390 dump_exec_pos((char *)uc, c, strend, real_start, 3391 s, utf8_target, 0 ); 3392 Perl_re_printf( aTHX_ 3393 "%sState: %4" UVxf ", word=%" UVxf, 3394 failed ? " Fail transition to " : "", 3395 (UV)state, (UV)word); 3396 }); 3397 if ( base ) { 3398 U32 tmp; 3399 I32 offset; 3400 if (charid && 3401 ( ((offset = base + charid 3402 - 1 - trie->uniquecharcount)) >= 0) 3403 && ((U32)offset < trie->lasttrans) 3404 && trie->trans[offset].check == state 3405 && (tmp=trie->trans[offset].next)) 3406 { 3407 DEBUG_TRIE_EXECUTE_r( 3408 Perl_re_printf( aTHX_ " - legal\n")); 3409 state = tmp; 3410 break; 3411 } 3412 else { 3413 DEBUG_TRIE_EXECUTE_r( 3414 Perl_re_printf( aTHX_ " - fail\n")); 3415 failed = 1; 3416 state = aho->fail[state]; 3417 } 3418 } 3419 else { 3420 /* we must be accepting here */ 3421 DEBUG_TRIE_EXECUTE_r( 3422 Perl_re_printf( aTHX_ " - accepting\n")); 3423 failed = 1; 3424 break; 3425 } 3426 } while(state); 3427 uc += len; 3428 if (failed) { 3429 if (leftmost) 3430 break; 3431 if (!state) state = 1; 3432 } 3433 } 3434 if ( aho->states[ state ].wordnum ) { 3435 U8 *lpos = points[ (pointpos 3436 - trie->wordinfo[aho->states[ state ] 3437 .wordnum].len) % maxlen ]; 3438 if (!leftmost || lpos < leftmost) { 3439 DEBUG_r(accepted_word=aho->states[ state ].wordnum); 3440 leftmost = lpos; 3441 } 3442 } 3443 if (leftmost) { 3444 s = (char*)leftmost; 3445 DEBUG_TRIE_EXECUTE_r({ 3446 Perl_re_printf( aTHX_ "Matches word #%" UVxf 3447 " at position %" IVdf ". Trying full" 3448 " pattern...\n", 3449 (UV)accepted_word, (IV)(s - real_start) 3450 ); 3451 }); 3452 if (reginfo->intuit || regtry(reginfo, &s)) { 3453 FREETMPS; 3454 LEAVE; 3455 goto got_it; 3456 } 3457 if (s < reginfo->strend) { 3458 s = HOPc(s,1); 3459 } 3460 DEBUG_TRIE_EXECUTE_r({ 3461 Perl_re_printf( aTHX_ 3462 "Pattern failed. Looking for new start" 3463 " point...\n"); 3464 }); 3465 } else { 3466 DEBUG_TRIE_EXECUTE_r( 3467 Perl_re_printf( aTHX_ "No match.\n")); 3468 break; 3469 } 3470 } 3471 FREETMPS; 3472 LEAVE; 3473 } 3474 break; 3475 3476 case EXACTFU_REQ8_t8_pb: 3477 case EXACTFUP_tb_p8: 3478 case EXACTFUP_t8_p8: 3479 case EXACTF_tb_p8: 3480 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */ 3481 case EXACTFAA_NO_TRIE_tb_p8: 3482 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8 3483 patterns */ 3484 assert(0); 3485 3486 default: 3487 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); 3488 } /* End of switch on node type */ 3489 3490 return 0; 3491 3492 got_it: 3493 return s; 3494} 3495 3496/* set RX_SAVED_COPY, RX_SUBBEG etc. 3497 * flags have same meanings as with regexec_flags() */ 3498 3499static void 3500S_reg_set_capture_string(pTHX_ REGEXP * const rx, 3501 char *strbeg, 3502 char *strend, 3503 SV *sv, 3504 U32 flags, 3505 bool utf8_target) 3506{ 3507 struct regexp *const prog = ReANY(rx); 3508 3509 if (flags & REXEC_COPY_STR) { 3510#ifdef PERL_ANY_COW 3511 if (SvCANCOW(sv)) { 3512 DEBUG_C(Perl_re_printf( aTHX_ 3513 "Copy on write: regexp capture, type %d\n", 3514 (int) SvTYPE(sv))); 3515 /* Create a new COW SV to share the match string and store 3516 * in saved_copy, unless the current COW SV in saved_copy 3517 * is valid and suitable for our purpose */ 3518 if (( RXp_SAVED_COPY(prog) 3519 && SvIsCOW(RXp_SAVED_COPY(prog)) 3520 && SvPOKp(RXp_SAVED_COPY(prog)) 3521 && SvIsCOW(sv) 3522 && SvPOKp(sv) 3523 && SvPVX(sv) == SvPVX(RXp_SAVED_COPY(prog)))) 3524 { 3525 /* just reuse saved_copy SV */ 3526 if (RXp_MATCH_COPIED(prog)) { 3527 Safefree(RXp_SUBBEG(prog)); 3528 RXp_MATCH_COPIED_off(prog); 3529 } 3530 } 3531 else { 3532 /* create new COW SV to share string */ 3533 RXp_MATCH_COPY_FREE(prog); 3534 RXp_SAVED_COPY(prog) = sv_setsv_cow(RXp_SAVED_COPY(prog), sv); 3535 } 3536 RXp_SUBBEG(prog) = (char *)SvPVX_const(RXp_SAVED_COPY(prog)); 3537 assert (SvPOKp(RXp_SAVED_COPY(prog))); 3538 RXp_SUBLEN(prog) = strend - strbeg; 3539 RXp_SUBOFFSET(prog) = 0; 3540 RXp_SUBCOFFSET(prog) = 0; 3541 } else 3542#endif 3543 { 3544 SSize_t min = 0; 3545 SSize_t max = strend - strbeg; 3546 SSize_t sublen; 3547 3548 if ( (flags & REXEC_COPY_SKIP_POST) 3549 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3550 && !(PL_sawampersand & SAWAMPERSAND_RIGHT) 3551 ) { /* don't copy $' part of string */ 3552 SSize_t offs_end; 3553 U32 n = 0; 3554 max = -1; 3555 /* calculate the right-most part of the string covered 3556 * by a capture. Due to lookahead, this may be to 3557 * the right of $&, so we have to scan all captures */ 3558 while (n <= RXp_LASTPAREN(prog)) { 3559 if ((offs_end = RXp_OFFS_END(prog,n)) > max) 3560 max = offs_end; 3561 n++; 3562 } 3563 if (max == -1) 3564 max = (PL_sawampersand & SAWAMPERSAND_LEFT) 3565 ? RXp_OFFS_START(prog,0) 3566 : 0; 3567 assert(max >= 0 && max <= strend - strbeg); 3568 } 3569 3570 if ( (flags & REXEC_COPY_SKIP_PRE) 3571 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ 3572 && !(PL_sawampersand & SAWAMPERSAND_LEFT) 3573 ) { /* don't copy $` part of string */ 3574 U32 n = 0; 3575 min = max; 3576 /* calculate the left-most part of the string covered 3577 * by a capture. Due to lookbehind, this may be to 3578 * the left of $&, so we have to scan all captures */ 3579 while (min && n <= RXp_LASTPAREN(prog)) { 3580 I32 start = RXp_OFFS_START(prog,n); 3581 if ( start != -1 3582 && start < min) 3583 { 3584 min = start; 3585 } 3586 n++; 3587 } 3588 if ((PL_sawampersand & SAWAMPERSAND_RIGHT) 3589 && min > RXp_OFFS_END(prog,0) 3590 ) 3591 min = RXp_OFFS_END(prog,0); 3592 3593 } 3594 3595 assert(min >= 0 && min <= max && min <= strend - strbeg); 3596 sublen = max - min; 3597 3598 if (RXp_MATCH_COPIED(prog)) { 3599 if (sublen > RXp_SUBLEN(prog)) 3600 RXp_SUBBEG(prog) = 3601 (char*)saferealloc(RXp_SUBBEG(prog), sublen+1); 3602 } 3603 else 3604 RXp_SUBBEG(prog) = (char*)safemalloc(sublen+1); 3605 Copy(strbeg + min, RXp_SUBBEG(prog), sublen, char); 3606 RXp_SUBBEG(prog)[sublen] = '\0'; 3607 RXp_SUBOFFSET(prog) = min; 3608 RXp_SUBLEN(prog) = sublen; 3609 RXp_MATCH_COPIED_on(prog); 3610 } 3611 RXp_SUBCOFFSET(prog) = RXp_SUBOFFSET(prog); 3612 if (RXp_SUBOFFSET(prog) && utf8_target) { 3613 /* Convert byte offset to chars. 3614 * XXX ideally should only compute this if @-/@+ 3615 * has been seen, a la PL_sawampersand ??? */ 3616 3617 /* If there's a direct correspondence between the 3618 * string which we're matching and the original SV, 3619 * then we can use the utf8 len cache associated with 3620 * the SV. In particular, it means that under //g, 3621 * sv_pos_b2u() will use the previously cached 3622 * position to speed up working out the new length of 3623 * subcoffset, rather than counting from the start of 3624 * the string each time. This stops 3625 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; 3626 * from going quadratic */ 3627 if (SvPOKp(sv) && SvPVX(sv) == strbeg) 3628 RXp_SUBCOFFSET(prog) = sv_pos_b2u_flags(sv, RXp_SUBCOFFSET(prog), 3629 SV_GMAGIC|SV_CONST_RETURN); 3630 else 3631 RXp_SUBCOFFSET(prog) = utf8_length((U8*)strbeg, 3632 (U8*)(strbeg+RXp_SUBOFFSET(prog))); 3633 } 3634 } 3635 else { 3636 RXp_MATCH_COPY_FREE(prog); 3637 RXp_SUBBEG(prog) = strbeg; 3638 RXp_SUBOFFSET(prog) = 0; 3639 RXp_SUBCOFFSET(prog) = 0; 3640 RXp_SUBLEN(prog) = strend - strbeg; 3641 } 3642} 3643 3644 3645 3646 3647/* 3648 - regexec_flags - match a regexp against a string 3649 */ 3650I32 3651Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, 3652 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) 3653/* stringarg: the point in the string at which to begin matching */ 3654/* strend: pointer to null at end of string */ 3655/* strbeg: real beginning of string */ 3656/* minend: end of match must be >= minend bytes after stringarg. */ 3657/* sv: SV being matched: only used for utf8 flag, pos() etc; string 3658 * itself is accessed via the pointers above */ 3659/* data: May be used for some additional optimizations. 3660 Currently unused. */ 3661/* flags: For optimizations. See REXEC_* in regexp.h */ 3662 3663{ 3664 struct regexp *const prog = ReANY(rx); 3665 char *s; 3666 regnode *c; 3667 char *startpos; 3668 SSize_t minlen; /* must match at least this many chars */ 3669 SSize_t dontbother = 0; /* how many characters not to try at end */ 3670 const bool utf8_target = cBOOL(DO_UTF8(sv)); 3671 I32 multiline; 3672 RXi_GET_DECL(prog,progi); 3673 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ 3674 regmatch_info *const reginfo = ®info_buf; 3675 regexp_paren_pair *swap = NULL; 3676 I32 oldsave; 3677 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3678 3679 PERL_ARGS_ASSERT_REGEXEC_FLAGS; 3680 PERL_UNUSED_ARG(data); 3681 3682 /* Be paranoid... */ 3683 if (prog == NULL) { 3684 Perl_croak(aTHX_ "NULL regexp parameter"); 3685 } 3686 3687 DEBUG_EXECUTE_r( 3688 debug_start_match(rx, utf8_target, stringarg, strend, 3689 "Matching"); 3690 ); 3691 3692 startpos = stringarg; 3693 3694 /* set these early as they may be used by the HOP macros below */ 3695 reginfo->strbeg = strbeg; 3696 reginfo->strend = strend; 3697 reginfo->is_utf8_target = cBOOL(utf8_target); 3698 3699 if (prog->intflags & PREGf_GPOS_SEEN) { 3700 MAGIC *mg; 3701 3702 /* set reginfo->ganch, the position where \G can match */ 3703 3704 reginfo->ganch = 3705 (flags & REXEC_IGNOREPOS) 3706 ? stringarg /* use start pos rather than pos() */ 3707 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0) 3708 /* Defined pos(): */ 3709 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) 3710 : strbeg; /* pos() not defined; use start of string */ 3711 3712 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3713 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg))); 3714 3715 /* in the presence of \G, we may need to start looking earlier in 3716 * the string than the suggested start point of stringarg: 3717 * if prog->gofs is set, then that's a known, fixed minimum 3718 * offset, such as 3719 * /..\G/: gofs = 2 3720 * /ab|c\G/: gofs = 1 3721 * or if the minimum offset isn't known, then we have to go back 3722 * to the start of the string, e.g. /w+\G/ 3723 */ 3724 3725 if (prog->intflags & PREGf_ANCH_GPOS) { 3726 if (prog->gofs) { 3727 startpos = HOPBACKc(reginfo->ganch, prog->gofs); 3728 if (!startpos || 3729 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) 3730 { 3731 DEBUG_GPOS_r(Perl_re_printf( aTHX_ 3732 "fail: ganch-gofs before earliest possible start\n")); 3733 return 0; 3734 } 3735 } 3736 else 3737 startpos = reginfo->ganch; 3738 } 3739 else if (prog->gofs) { 3740 startpos = HOPBACKc(startpos, prog->gofs); 3741 if (!startpos) 3742 startpos = strbeg; 3743 } 3744 else if (prog->intflags & PREGf_GPOS_FLOAT) 3745 startpos = strbeg; 3746 } 3747 3748 minlen = prog->minlen; 3749 if ((startpos + minlen) > strend || startpos < strbeg) { 3750 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3751 "Regex match can't succeed, so not even tried\n")); 3752 return 0; 3753 } 3754 3755 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), 3756 * which will call destuctors to reset PL_regmatch_state, free higher 3757 * PL_regmatch_slabs, and clean up regmatch_info_aux and 3758 * regmatch_info_aux_eval */ 3759 3760 oldsave = PL_savestack_ix; 3761 3762 s = startpos; 3763 3764 if ((prog->extflags & RXf_USE_INTUIT) 3765 && !(flags & REXEC_CHECKED)) 3766 { 3767 s = re_intuit_start(rx, sv, strbeg, startpos, strend, 3768 flags, NULL); 3769 if (!s) 3770 return 0; 3771 3772 if (prog->extflags & RXf_CHECK_ALL) { 3773 /* we can match based purely on the result of INTUIT. 3774 * Set up captures etc just for $& and $-[0] 3775 * (an intuit-only match wont have $1,$2,..) */ 3776 assert(!prog->nparens); 3777 3778 /* s/// doesn't like it if $& is earlier than where we asked it to 3779 * start searching (which can happen on something like /.\G/) */ 3780 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 3781 && (s < stringarg)) 3782 { 3783 /* this should only be possible under \G */ 3784 assert(prog->intflags & PREGf_GPOS_SEEN); 3785 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3786 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 3787 goto phooey; 3788 } 3789 3790 /* match via INTUIT shouldn't have any captures. 3791 * Let @-, @+, $^N know */ 3792 RXp_LASTPAREN(prog) = RXp_LASTCLOSEPAREN(prog) = 0; 3793 RXp_MATCH_UTF8_set(prog, utf8_target); 3794 SSize_t match_start = s - strbeg; 3795 SSize_t match_end = utf8_target 3796 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg 3797 : s - strbeg + prog->minlenret; 3798 CLOSE_ANY_CAPTURE(prog, 0, match_start, match_end); 3799 if ( !(flags & REXEC_NOT_FIRST) ) 3800 S_reg_set_capture_string(aTHX_ rx, 3801 strbeg, strend, 3802 sv, flags, utf8_target); 3803 3804 return 1; 3805 } 3806 } 3807 3808 multiline = prog->extflags & RXf_PMf_MULTILINE; 3809 3810 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { 3811 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 3812 "String too short [regexec_flags]...\n")); 3813 goto phooey; 3814 } 3815 3816 /* Check validity of program. */ 3817 if (UCHARAT(progi->program) != REG_MAGIC) { 3818 Perl_croak(aTHX_ "corrupted regexp program"); 3819 } 3820 3821 RXp_MATCH_TAINTED_off(prog); 3822 RXp_MATCH_UTF8_set(prog, utf8_target); 3823 3824 reginfo->prog = rx; /* Yes, sorry that this is confusing. */ 3825 reginfo->intuit = 0; 3826 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); 3827 reginfo->warned = FALSE; 3828 reginfo->sv = sv; 3829 reginfo->poscache_maxiter = 0; /* not yet started a countdown */ 3830 /* see how far we have to get to not match where we matched before */ 3831 reginfo->till = stringarg + minend; 3832 3833 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { 3834 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after 3835 S_cleanup_regmatch_info_aux has executed (registered by 3836 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies 3837 magic belonging to this SV. 3838 Not newSVsv, either, as it does not COW. 3839 */ 3840 reginfo->sv = newSV_type(SVt_NULL); 3841 SvSetSV_nosteal(reginfo->sv, sv); 3842 SAVEFREESV(reginfo->sv); 3843 } 3844 3845 /* reserve next 2 or 3 slots in PL_regmatch_state: 3846 * slot N+0: may currently be in use: skip it 3847 * slot N+1: use for regmatch_info_aux struct 3848 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s 3849 * slot N+3: ready for use by regmatch() 3850 */ 3851 3852 { 3853 regmatch_state *old_regmatch_state; 3854 regmatch_slab *old_regmatch_slab; 3855 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; 3856 3857 /* on first ever match, allocate first slab */ 3858 if (!PL_regmatch_slab) { 3859 Newx(PL_regmatch_slab, 1, regmatch_slab); 3860 PL_regmatch_slab->prev = NULL; 3861 PL_regmatch_slab->next = NULL; 3862 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); 3863 } 3864 3865 old_regmatch_state = PL_regmatch_state; 3866 old_regmatch_slab = PL_regmatch_slab; 3867 3868 for (i=0; i <= max; i++) { 3869 if (i == 1) 3870 reginfo->info_aux = &(PL_regmatch_state->u.info_aux); 3871 else if (i ==2) 3872 reginfo->info_aux_eval = 3873 reginfo->info_aux->info_aux_eval = 3874 &(PL_regmatch_state->u.info_aux_eval); 3875 3876 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) 3877 PL_regmatch_state = S_push_slab(aTHX); 3878 } 3879 3880 /* note initial PL_regmatch_state position; at end of match we'll 3881 * pop back to there and free any higher slabs */ 3882 3883 reginfo->info_aux->old_regmatch_state = old_regmatch_state; 3884 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; 3885 reginfo->info_aux->poscache = NULL; 3886 3887 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); 3888 3889 if ((prog->extflags & RXf_EVAL_SEEN)) 3890 S_setup_eval_state(aTHX_ reginfo); 3891 else 3892 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; 3893 } 3894 3895 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { 3896 /* We have to be careful. If the previous successful match 3897 was from this regex we don't want a subsequent partially 3898 successful match to clobber the old results. 3899 So when we detect this possibility we add a swap buffer 3900 to the re, and switch the buffer each match. If we fail, 3901 we switch it back; otherwise we leave it swapped. 3902 */ 3903 swap = RXp_OFFSp(prog); 3904 /* avoid leak if we die, or clean up anyway if match completes */ 3905 SAVEFREEPV(swap); 3906 Newxz(RXp_OFFSp(prog), (prog->nparens + 1), regexp_paren_pair); 3907 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 3908 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n", 3909 0, 3910 PTR2UV(prog), 3911 PTR2UV(swap), 3912 PTR2UV(RXp_OFFSp(prog)) 3913 )); 3914 } 3915 3916 if (prog->recurse_locinput) 3917 Zero(prog->recurse_locinput,prog->nparens + 1, char *); 3918 3919 /* Simplest case: anchored match (but not \G) need be tried only once, 3920 * or with MBOL, only at the beginning of each line. 3921 * 3922 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets 3923 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't 3924 * match at the start of the string then it won't match anywhere else 3925 * either; while with /.*.../, if it doesn't match at the beginning, 3926 * the earliest it could match is at the start of the next line */ 3927 3928 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { 3929 char *end; 3930 3931 if (regtry(reginfo, &s)) 3932 goto got_it; 3933 3934 if (!(prog->intflags & PREGf_ANCH_MBOL)) 3935 goto phooey; 3936 3937 /* didn't match at start, try at other newline positions */ 3938 3939 if (minlen) 3940 dontbother = minlen - 1; 3941 end = HOP3c(strend, -dontbother, strbeg) - 1; 3942 3943 /* skip to next newline */ 3944 3945 while (s <= end) { /* note it could be possible to match at the end of the string */ 3946 /* NB: newlines are the same in unicode as they are in latin */ 3947 if (*s++ != '\n') 3948 continue; 3949 if (prog->check_substr || prog->check_utf8) { 3950 /* note that with PREGf_IMPLICIT, intuit can only fail 3951 * or return the start position, so it's of limited utility. 3952 * Nevertheless, I made the decision that the potential for 3953 * quick fail was still worth it - DAPM */ 3954 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL); 3955 if (!s) 3956 goto phooey; 3957 } 3958 if (regtry(reginfo, &s)) 3959 goto got_it; 3960 } 3961 goto phooey; 3962 } /* end anchored search */ 3963 3964 /* anchored \G match */ 3965 if (prog->intflags & PREGf_ANCH_GPOS) 3966 { 3967 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ 3968 assert(prog->intflags & PREGf_GPOS_SEEN); 3969 /* For anchored \G, the only position it can match from is 3970 * (ganch-gofs); we already set startpos to this above; if intuit 3971 * moved us on from there, we can't possibly succeed */ 3972 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs)); 3973 if (s == startpos && regtry(reginfo, &s)) 3974 goto got_it; 3975 goto phooey; 3976 } 3977 3978 /* Messy cases: unanchored match. */ 3979 3980 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { 3981 /* we have /x+whatever/ */ 3982 /* it must be a one character string (XXXX Except is_utf8_pat?) */ 3983 char ch; 3984#ifdef DEBUGGING 3985 int did_match = 0; 3986#endif 3987 if (utf8_target) { 3988 if (! prog->anchored_utf8) { 3989 to_utf8_substr(prog); 3990 } 3991 ch = SvPVX_const(prog->anchored_utf8)[0]; 3992 REXEC_FBC_UTF8_SCAN( 3993 if (*s == ch) { 3994 DEBUG_EXECUTE_r( did_match = 1 ); 3995 if (regtry(reginfo, &s)) goto got_it; 3996 s += UTF8_SAFE_SKIP(s, strend); 3997 while (s < strend && *s == ch) 3998 s += UTF8SKIP(s); 3999 } 4000 ); 4001 4002 } 4003 else { 4004 if (! prog->anchored_substr) { 4005 if (! to_byte_substr(prog)) { 4006 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4007 } 4008 } 4009 ch = SvPVX_const(prog->anchored_substr)[0]; 4010 REXEC_FBC_NON_UTF8_SCAN( 4011 if (*s == ch) { 4012 DEBUG_EXECUTE_r( did_match = 1 ); 4013 if (regtry(reginfo, &s)) goto got_it; 4014 s++; 4015 while (s < strend && *s == ch) 4016 s++; 4017 } 4018 ); 4019 } 4020 DEBUG_EXECUTE_r(if (!did_match) 4021 Perl_re_printf( aTHX_ 4022 "Did not find anchored character...\n") 4023 ); 4024 } 4025 else if (prog->anchored_substr != NULL 4026 || prog->anchored_utf8 != NULL 4027 || ((prog->float_substr != NULL || prog->float_utf8 != NULL) 4028 && prog->float_max_offset < strend - s)) { 4029 SV *must; 4030 SSize_t back_max; 4031 SSize_t back_min; 4032 char *last; 4033 char *last1; /* Last position checked before */ 4034#ifdef DEBUGGING 4035 int did_match = 0; 4036#endif 4037 if (prog->anchored_substr || prog->anchored_utf8) { 4038 if (utf8_target) { 4039 if (! prog->anchored_utf8) { 4040 to_utf8_substr(prog); 4041 } 4042 must = prog->anchored_utf8; 4043 } 4044 else { 4045 if (! prog->anchored_substr) { 4046 if (! to_byte_substr(prog)) { 4047 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4048 } 4049 } 4050 must = prog->anchored_substr; 4051 } 4052 back_max = back_min = prog->anchored_offset; 4053 } else { 4054 if (utf8_target) { 4055 if (! prog->float_utf8) { 4056 to_utf8_substr(prog); 4057 } 4058 must = prog->float_utf8; 4059 } 4060 else { 4061 if (! prog->float_substr) { 4062 if (! to_byte_substr(prog)) { 4063 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4064 } 4065 } 4066 must = prog->float_substr; 4067 } 4068 back_max = prog->float_max_offset; 4069 back_min = prog->float_min_offset; 4070 } 4071 4072 if (back_min<0) { 4073 last = strend; 4074 } else { 4075 last = HOP3c(strend, /* Cannot start after this */ 4076 -(SSize_t)(CHR_SVLEN(must) 4077 - (SvTAIL(must) != 0) + back_min), strbeg); 4078 } 4079 if (s > reginfo->strbeg) 4080 last1 = HOPc(s, -1); 4081 else 4082 last1 = s - 1; /* bogus */ 4083 4084 /* XXXX check_substr already used to find "s", can optimize if 4085 check_substr==must. */ 4086 dontbother = 0; 4087 strend = HOPc(strend, -dontbother); 4088 while ( (s <= last) && 4089 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), 4090 (unsigned char*)strend, must, 4091 multiline ? FBMrf_MULTILINE : 0)) ) { 4092 DEBUG_EXECUTE_r( did_match = 1 ); 4093 if (HOPc(s, -back_max) > last1) { 4094 last1 = HOPc(s, -back_min); 4095 s = HOPc(s, -back_max); 4096 } 4097 else { 4098 char * const t = (last1 >= reginfo->strbeg) 4099 ? HOPc(last1, 1) : last1 + 1; 4100 4101 last1 = HOPc(s, -back_min); 4102 s = t; 4103 } 4104 if (utf8_target) { 4105 while (s <= last1) { 4106 if (regtry(reginfo, &s)) 4107 goto got_it; 4108 if (s >= last1) { 4109 s++; /* to break out of outer loop */ 4110 break; 4111 } 4112 s += UTF8SKIP(s); 4113 } 4114 } 4115 else { 4116 while (s <= last1) { 4117 if (regtry(reginfo, &s)) 4118 goto got_it; 4119 s++; 4120 } 4121 } 4122 } 4123 DEBUG_EXECUTE_r(if (!did_match) { 4124 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), 4125 SvPVX_const(must), RE_SV_DUMPLEN(must), 30); 4126 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n", 4127 ((must == prog->anchored_substr || must == prog->anchored_utf8) 4128 ? "anchored" : "floating"), 4129 quoted, RE_SV_TAIL(must)); 4130 }); 4131 goto phooey; 4132 } 4133 else if ( (c = progi->regstclass) ) { 4134 if (minlen) { 4135 const OPCODE op = OP(progi->regstclass); 4136 /* don't bother with what can't match */ 4137 if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE) 4138 strend = HOPc(strend, -(minlen - 1)); 4139 } 4140 DEBUG_EXECUTE_r({ 4141 SV * const prop = sv_newmortal(); 4142 regprop(prog, prop, c, reginfo, NULL); 4143 { 4144 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), 4145 s,strend-s,PL_dump_re_max_len); 4146 Perl_re_printf( aTHX_ 4147 "Matching stclass %.*s against %s (%d bytes)\n", 4148 (int)SvCUR(prop), SvPVX_const(prop), 4149 quoted, (int)(strend - s)); 4150 } 4151 }); 4152 if (find_byclass(prog, c, s, strend, reginfo)) 4153 goto got_it; 4154 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n")); 4155 } 4156 else { 4157 dontbother = 0; 4158 if (prog->float_substr != NULL || prog->float_utf8 != NULL) { 4159 /* Trim the end. */ 4160 char *last= NULL; 4161 SV* float_real; 4162 STRLEN len; 4163 const char *little; 4164 4165 if (utf8_target) { 4166 if (! prog->float_utf8) { 4167 to_utf8_substr(prog); 4168 } 4169 float_real = prog->float_utf8; 4170 } 4171 else { 4172 if (! prog->float_substr) { 4173 if (! to_byte_substr(prog)) { 4174 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); 4175 } 4176 } 4177 float_real = prog->float_substr; 4178 } 4179 4180 little = SvPV_const(float_real, len); 4181 if (SvTAIL(float_real)) { 4182 /* This means that float_real contains an artificial \n on 4183 * the end due to the presence of something like this: 4184 * /foo$/ where we can match both "foo" and "foo\n" at the 4185 * end of the string. So we have to compare the end of the 4186 * string first against the float_real without the \n and 4187 * then against the full float_real with the string. We 4188 * have to watch out for cases where the string might be 4189 * smaller than the float_real or the float_real without 4190 * the \n. */ 4191 char *checkpos= strend - len; 4192 DEBUG_OPTIMISE_r( 4193 Perl_re_printf( aTHX_ 4194 "%sChecking for float_real.%s\n", 4195 PL_colors[4], PL_colors[5])); 4196 if (checkpos + 1 < strbeg) { 4197 /* can't match, even if we remove the trailing \n 4198 * string is too short to match */ 4199 DEBUG_EXECUTE_r( 4200 Perl_re_printf( aTHX_ 4201 "%sString shorter than required trailing substring, cannot match.%s\n", 4202 PL_colors[4], PL_colors[5])); 4203 goto phooey; 4204 } else if (memEQ(checkpos + 1, little, len - 1)) { 4205 /* can match, the end of the string matches without the 4206 * "\n" */ 4207 last = checkpos + 1; 4208 } else if (checkpos < strbeg) { 4209 /* cant match, string is too short when the "\n" is 4210 * included */ 4211 DEBUG_EXECUTE_r( 4212 Perl_re_printf( aTHX_ 4213 "%sString does not contain required trailing substring, cannot match.%s\n", 4214 PL_colors[4], PL_colors[5])); 4215 goto phooey; 4216 } else if (!multiline) { 4217 /* non multiline match, so compare with the "\n" at the 4218 * end of the string */ 4219 if (memEQ(checkpos, little, len)) { 4220 last= checkpos; 4221 } else { 4222 DEBUG_EXECUTE_r( 4223 Perl_re_printf( aTHX_ 4224 "%sString does not contain required trailing substring, cannot match.%s\n", 4225 PL_colors[4], PL_colors[5])); 4226 goto phooey; 4227 } 4228 } else { 4229 /* multiline match, so we have to search for a place 4230 * where the full string is located */ 4231 goto find_last; 4232 } 4233 } else { 4234 find_last: 4235 if (len) 4236 last = rninstr(s, strend, little, little + len); 4237 else 4238 last = strend; /* matching "$" */ 4239 } 4240 if (!last) { 4241 /* at one point this block contained a comment which was 4242 * probably incorrect, which said that this was a "should not 4243 * happen" case. Even if it was true when it was written I am 4244 * pretty sure it is not anymore, so I have removed the comment 4245 * and replaced it with this one. Yves */ 4246 DEBUG_EXECUTE_r( 4247 Perl_re_printf( aTHX_ 4248 "%sString does not contain required substring, cannot match.%s\n", 4249 PL_colors[4], PL_colors[5] 4250 )); 4251 goto phooey; 4252 } 4253 dontbother = strend - last + prog->float_min_offset; 4254 } 4255 if (minlen && (dontbother < minlen)) 4256 dontbother = minlen - 1; 4257 strend -= dontbother; /* this one's always in bytes! */ 4258 /* We don't know much -- general case. */ 4259 if (utf8_target) { 4260 for (;;) { 4261 if (regtry(reginfo, &s)) 4262 goto got_it; 4263 if (s >= strend) 4264 break; 4265 s += UTF8SKIP(s); 4266 }; 4267 } 4268 else { 4269 do { 4270 if (regtry(reginfo, &s)) 4271 goto got_it; 4272 } while (s++ < strend); 4273 } 4274 } 4275 4276 /* Failure. */ 4277 goto phooey; 4278 4279 got_it: 4280 /* s/// doesn't like it if $& is earlier than where we asked it to 4281 * start searching (which can happen on something like /.\G/) */ 4282 if ( (flags & REXEC_FAIL_ON_UNDERFLOW) 4283 && (RXp_OFFS_START(prog,0) < stringarg - strbeg)) 4284 { 4285 /* this should only be possible under \G */ 4286 assert(prog->intflags & PREGf_GPOS_SEEN); 4287 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 4288 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); 4289 goto phooey; 4290 } 4291 4292 /* clean up; this will trigger destructors that will free all slabs 4293 * above the current one, and cleanup the regmatch_info_aux 4294 * and regmatch_info_aux_eval sructs */ 4295 4296 LEAVE_SCOPE(oldsave); 4297 4298 if (RXp_PAREN_NAMES(prog)) 4299 (void)hv_iterinit(RXp_PAREN_NAMES(prog)); 4300 4301 /* make sure $`, $&, $', and $digit will work later */ 4302 if ( !(flags & REXEC_NOT_FIRST) ) 4303 S_reg_set_capture_string(aTHX_ rx, 4304 strbeg, reginfo->strend, 4305 sv, flags, utf8_target); 4306 4307 return 1; 4308 4309 phooey: 4310 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n", 4311 PL_colors[4], PL_colors[5])); 4312 4313 if (swap) { 4314 /* we failed :-( roll it back. 4315 * Since the swap buffer will be freed on scope exit which follows 4316 * shortly, restore the old captures by copying 'swap's original 4317 * data to the new offs buffer 4318 */ 4319 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 4320 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n", 4321 0, 4322 PTR2UV(prog), 4323 PTR2UV(RXp_OFFSp(prog)), 4324 PTR2UV(swap) 4325 )); 4326 4327 Copy(swap, RXp_OFFSp(prog), prog->nparens + 1, regexp_paren_pair); 4328 } 4329 4330 /* clean up; this will trigger destructors that will free all slabs 4331 * above the current one, and cleanup the regmatch_info_aux 4332 * and regmatch_info_aux_eval sructs */ 4333 4334 LEAVE_SCOPE(oldsave); 4335 4336 return 0; 4337} 4338 4339 4340/* Set which rex is pointed to by PL_reg_curpm, handling ref counting. 4341 * Do inc before dec, in case old and new rex are the same */ 4342#define SET_reg_curpm(Re2) \ 4343 if (reginfo->info_aux_eval) { \ 4344 (void)ReREFCNT_inc(Re2); \ 4345 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ 4346 PM_SETRE((PL_reg_curpm), (Re2)); \ 4347 } 4348 4349 4350/* 4351 - regtry - try match at specific point 4352 */ 4353STATIC bool /* 0 failure, 1 success */ 4354S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) 4355{ 4356 CHECKPOINT lastcp; 4357 REGEXP *const rx = reginfo->prog; 4358 regexp *const prog = ReANY(rx); 4359 SSize_t result; 4360#ifdef DEBUGGING 4361 U32 depth = 0; /* used by REGCP_SET */ 4362#endif 4363 RXi_GET_DECL(prog,progi); 4364 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4365 4366 PERL_ARGS_ASSERT_REGTRY; 4367 4368 reginfo->cutpoint=NULL; 4369 4370 RXp_OFFSp(prog)[0].start = *startposp - reginfo->strbeg; 4371 RXp_LASTPAREN(prog) = 0; 4372 RXp_LASTCLOSEPAREN(prog) = 0; 4373 4374 /* XXXX What this code is doing here?!!! There should be no need 4375 to do this again and again, RXp_LASTPAREN(prog) should take care of 4376 this! --ilya*/ 4377 4378 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. 4379 * Actually, the code in regcppop() (which Ilya may be meaning by 4380 * RXp_LASTPAREN(prog)), is not needed at all by the test suite 4381 * (op/regexp, op/pat, op/split), but that code is needed otherwise 4382 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ 4383 * Meanwhile, this code *is* needed for the 4384 * above-mentioned test suite tests to succeed. The common theme 4385 * on those tests seems to be returning null fields from matches. 4386 * --jhi updated by dapm */ 4387 4388 /* After encountering a variant of the issue mentioned above I think 4389 * the point Ilya was making is that if we properly unwind whenever 4390 * we set lastparen to a smaller value then we should not need to do 4391 * this every time, only when needed. So if we have tests that fail if 4392 * we remove this, then it suggests somewhere else we are improperly 4393 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and 4394 * places it is called, and related regcp() routines. - Yves */ 4395#if 1 4396 if (prog->nparens) { 4397 regexp_paren_pair *pp = RXp_OFFSp(prog); 4398 I32 i; 4399 for (i = prog->nparens; i > (I32)RXp_LASTPAREN(prog); i--) { 4400 ++pp; 4401 pp->start = -1; 4402 pp->end = -1; 4403 } 4404 } 4405#endif 4406 REGCP_SET(lastcp); 4407 result = regmatch(reginfo, *startposp, progi->program + 1); 4408 if (result != -1) { 4409 RXp_OFFSp(prog)[0].end = result; 4410 return 1; 4411 } 4412 if (reginfo->cutpoint) 4413 *startposp= reginfo->cutpoint; 4414 REGCP_UNWIND(lastcp); 4415 return 0; 4416} 4417 4418/* this is used to determine how far from the left messages like 4419 'failed...' are printed in regexec.c. It should be set such that 4420 messages are inline with the regop output that created them. 4421*/ 4422#define REPORT_CODE_OFF 29 4423#define INDENT_CHARS(depth) ((int)(depth) % 20) 4424#ifdef DEBUGGING 4425int 4426Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...) 4427{ 4428 va_list ap; 4429 int result; 4430 PerlIO *f= Perl_debug_log; 4431 PERL_ARGS_ASSERT_RE_EXEC_INDENTF; 4432 va_start(ap, depth); 4433 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" ); 4434 result = PerlIO_vprintf(f, fmt, ap); 4435 va_end(ap); 4436 return result; 4437} 4438#endif /* DEBUGGING */ 4439 4440/* grab a new slab and return the first slot in it */ 4441 4442STATIC regmatch_state * 4443S_push_slab(pTHX) 4444{ 4445 regmatch_slab *s = PL_regmatch_slab->next; 4446 if (!s) { 4447 Newx(s, 1, regmatch_slab); 4448 s->prev = PL_regmatch_slab; 4449 s->next = NULL; 4450 PL_regmatch_slab->next = s; 4451 } 4452 PL_regmatch_slab = s; 4453 return SLAB_FIRST(s); 4454} 4455 4456#ifdef DEBUGGING 4457 4458STATIC void 4459S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, 4460 const char *start, const char *end, const char *blurb) 4461{ 4462 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; 4463 4464 PERL_ARGS_ASSERT_DEBUG_START_MATCH; 4465 4466 if (!PL_colorset) 4467 reginitcolors(); 4468 { 4469 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), 4470 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len); 4471 4472 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), 4473 start, end - start, PL_dump_re_max_len); 4474 4475 Perl_re_printf( aTHX_ 4476 "%s%s REx%s %s against %s\n", 4477 PL_colors[4], blurb, PL_colors[5], s0, s1); 4478 4479 if (utf8_target||utf8_pat) 4480 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n", 4481 utf8_pat ? "pattern" : "", 4482 utf8_pat && utf8_target ? " and " : "", 4483 utf8_target ? "string" : "" 4484 ); 4485 } 4486} 4487 4488STATIC void 4489S_dump_exec_pos(pTHX_ const char *locinput, 4490 const regnode *scan, 4491 const char *loc_regeol, 4492 const char *loc_bostr, 4493 const char *loc_reg_starttry, 4494 const bool utf8_target, 4495 const U32 depth 4496 ) 4497{ 4498 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; 4499 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ 4500 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); 4501 /* The part of the string before starttry has one color 4502 (pref0_len chars), between starttry and current 4503 position another one (pref_len - pref0_len chars), 4504 after the current position the third one. 4505 We assume that pref0_len <= pref_len, otherwise we 4506 decrease pref0_len. */ 4507 int pref_len = (locinput - loc_bostr) > (5 + taill) - l 4508 ? (5 + taill) - l : locinput - loc_bostr; 4509 int pref0_len; 4510 4511 PERL_ARGS_ASSERT_DUMP_EXEC_POS; 4512 4513 if (utf8_target) { 4514 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) { 4515 pref_len++; 4516 } 4517 } 4518 pref0_len = pref_len - (locinput - loc_reg_starttry); 4519 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) 4520 l = ( loc_regeol - locinput > (5 + taill) - pref_len 4521 ? (5 + taill) - pref_len : loc_regeol - locinput); 4522 if (utf8_target) { 4523 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) { 4524 l--; 4525 } 4526 } 4527 if (pref0_len < 0) 4528 pref0_len = 0; 4529 if (pref0_len > pref_len) 4530 pref0_len = pref_len; 4531 { 4532 const int is_uni = utf8_target ? 1 : 0; 4533 4534 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), 4535 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5); 4536 4537 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), 4538 (locinput - pref_len + pref0_len), 4539 pref_len - pref0_len, PL_dump_re_max_len, 2, 3); 4540 4541 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), 4542 locinput, loc_regeol - locinput, 10, 0, 1); 4543 4544 const STRLEN tlen=len0+len1+len2; 4545 Perl_re_printf( aTHX_ 4546 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ", 4547 (IV)(locinput - loc_bostr), 4548 len0, s0, 4549 len1, s1, 4550 (docolor ? "" : "> <"), 4551 len2, s2, 4552 (int)(tlen > 19 ? 0 : 19 - tlen), 4553 "", 4554 (UV)depth); 4555 } 4556} 4557 4558#endif 4559 4560/* reg_check_named_buff_matched() 4561 * Checks to see if a named buffer has matched. The data array of 4562 * buffer numbers corresponding to the buffer is expected to reside 4563 * in the regexp->data->data array in the slot stored in the ARG1u() of 4564 * node involved. Note that this routine doesn't actually care about the 4565 * name, that information is not preserved from compilation to execution. 4566 * Returns the index of the leftmost defined buffer with the given name 4567 * or 0 if non of the buffers matched. 4568 */ 4569STATIC I32 4570S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) 4571{ 4572 I32 n; 4573 RXi_GET_DECL(rex,rexi); 4574 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 4575 I32 *nums=(I32*)SvPVX(sv_dat); 4576 4577 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; 4578 4579 for ( n=0; n<SvIVX(sv_dat); n++ ) { 4580 if ((I32)RXp_LASTPAREN(rex) >= nums[n] && 4581 RXp_OFFS_END(rex,nums[n]) != -1) 4582 { 4583 return nums[n]; 4584 } 4585 } 4586 return 0; 4587} 4588 4589static bool 4590S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, 4591 struct next_matchable_info * m, 4592 regmatch_info *reginfo) 4593{ 4594 /* This function determines various characteristics about every possible 4595 * initial match of the passed-in EXACTish <text_node>, and stores them in 4596 * <*m>. 4597 * 4598 * That includes a match string and a parallel mask, such that if you AND 4599 * the target string with the mask and compare with the match string, 4600 * you'll have a pretty good idea, perhaps even perfect, if that portion of 4601 * the target matches or not. 4602 * 4603 * The motivation behind this function is to allow the caller to set up 4604 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where 4605 * B is an arbitrary EXACTish node. To find the end of .*, we look for the 4606 * beginning oF B, which is the passed in <text_node> That's where this 4607 * function comes in. The values it returns can quickly be used to rule 4608 * out many, or all, cases of possible matches not actually being the 4609 * beginning of B, <text_node>. It is also used in regrepeat() where we 4610 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently 4611 * determine where the span of 'A's stop. 4612 * 4613 * If <text_node> is of type EXACT, there is only one possible character 4614 * that can match its first character, and so the situation is quite 4615 * simple. But things can get much more complicated if folding is 4616 * involved. It may be that the first character of an EXACTFish node 4617 * doesn't participate in any possible fold, e.g., punctuation, so it can 4618 * be matched only by itself. The vast majority of characters that are in 4619 * folds match just two things, their lower and upper-case equivalents. 4620 * But not all are like that; some have multiple possible matches, or match 4621 * sequences of more than one character. This function sorts all that out. 4622 * 4623 * It returns information about all possibilities of what the first 4624 * character(s) of <text_node> could look like. Again, if <text_node> is a 4625 * plain EXACT node, that's just the actual first bytes of the first 4626 * character; but otherwise it is the bytes, that when masked, match all 4627 * possible combinations of all the initial bytes of all the characters 4628 * that could match, folded. (Actually, this is a slight over promise. It 4629 * handles only up to the initial 5 bytes, which is enough for all Unicode 4630 * characters, but not for all non-Unicode ones.) 4631 * 4632 * Here's an example to clarify. Suppose the first character of 4633 * <text_node> is the letter 'C', and we are under /i matching. That means 4634 * 'c' also matches. The representations of these two characters differ in 4635 * just one bit, so the mask would be a zero in that position and ones in 4636 * the other 7. And the returned string would be the AND of these two 4637 * characters, and would be one byte long, since these characters are each 4638 * a single byte. ANDing the target <text_node> with this mask will yield 4639 * the returned string if and only if <text_node> begins with one of these 4640 * two characters. So, the function would also return that the definitive 4641 * length matched is 1 byte. 4642 * 4643 * Now, suppose instead of the letter 'C', <text_node> begins with the 4644 * letter 'F'. The situation is much more complicated because there are 4645 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also 4646 * begins with 'f', and hence could match. We add these into the returned 4647 * string and mask, but the result isn't definitive; the caller has to 4648 * check further if its AND and compare pass. But the failure of that 4649 * compare will quickly rule out most possible inputs. 4650 * 4651 * Much of this could be done in regcomp.c at compile time, except for 4652 * locale-dependent, and UTF-8 target dependent data. Extra data fields 4653 * could be used for one or the other eventualities. 4654 * 4655 * If this function determines that no possible character in the target 4656 * string can match, it returns FALSE; otherwise TRUE. (The FALSE 4657 * situation occurs if the first character in <text_node> requires UTF-8 to 4658 * represent, and the target string isn't in UTF-8.) 4659 * 4660 * Some analysis is in GH #18414, located at the time of this writing at: 4661 * https://github.com/Perl/perl5/issues/18414 4662 */ 4663 4664 const bool utf8_target = reginfo->is_utf8_target; 4665 bool utf8_pat = reginfo->is_utf8_pat; 4666 4667 PERL_UINT_FAST8_T i; 4668 4669 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e 4670 */ 4671 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } }; 4672 U8 lengths[MAX_MATCHES] = { 0 }; 4673 4674 U8 index_of_longest = 0; 4675 4676 U8 *pat = (U8*)STRING(text_node); 4677 Size_t pat_len = STR_LEN(text_node); 4678 U8 op = OP(text_node); 4679 4680 U8 byte_mask[5] = {0}; 4681 U8 byte_anded[5] = {0}; 4682 4683 /* There are some folds in Unicode to multiple characters. This will hold 4684 * such characters that could fold to the beginning of 'text_node' */ 4685 UV multi_fold_from = 0; 4686 4687 /* We may have to create a modified copy of the pattern */ 4688 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' }; 4689 4690 m->max_length = 0; 4691 m->min_length = 255; 4692 m->count = 0; 4693 4694 /* Even if the first character in the node can match something in Latin1, 4695 * if there is anything in the node that can't, the match must fail */ 4696 if (! utf8_target && isEXACT_REQ8(op)) { 4697 return FALSE; 4698 } 4699 4700/* Define a temporary op for use in this function, using an existing one that 4701 * should never be a real op during execution */ 4702#define TURKISH PSEUDO 4703 4704 /* What to do about these two nodes had to be deferred to runtime (which is 4705 * now). If the extra information we now have so indicates, turn them into 4706 * EXACTFU nodes */ 4707 if ( (op == EXACTF && utf8_target) 4708 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 4709 { 4710 if (op == EXACTFL && IN_UTF8_TURKIC_LOCALE) { 4711 op = TURKISH; 4712 } 4713 else { 4714 op = EXACTFU; 4715 } 4716 4717 /* And certain situations are better handled if we create a modified 4718 * version of the pattern */ 4719 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the 4720 specific problematic characters */ 4721 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) { 4722 4723 /* The node could start with characters that are the first ones 4724 * of a multi-character fold. */ 4725 multi_fold_from 4726 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); 4727 if (multi_fold_from) { 4728 4729 /* Here, they do form a sequence that matches the fold of a 4730 * single character. That single character then is a 4731 * possible match. Below we will look again at this, but 4732 * the code below is expecting every character in the 4733 * pattern to be folded, which the input isn't required to 4734 * be in this case. So, just fold the single character, 4735 * and the result will be in the expected form. */ 4736 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, 4737 FOLD_FLAGS_FULL); 4738 pat = mod_pat; 4739 } 4740 /* Turkish has a couple extra possibilities. */ 4741 else if ( UNLIKELY(op == TURKISH) 4742 && pat_len >= 3 4743 && isALPHA_FOLD_EQ(pat[0], 'f') 4744 && ( memBEGINs(pat + 1, pat_len - 1, 4745 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) 4746 || ( pat_len >= 4 4747 && isALPHA_FOLD_EQ(pat[1], 'f') 4748 && memBEGINs(pat + 2, pat_len - 2, 4749 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) 4750 ))) { 4751 /* The macros for finding a multi-char fold don't include 4752 * the Turkish possibilities, in which U+130 folds to 'i'. 4753 * Hard-code these. It's very unlikely that Unicode will 4754 * ever add any others. */ 4755 if (pat[1] == 'f') { 4756 pat_len = 3; 4757 Copy("ffi", mod_pat, pat_len, U8); 4758 } 4759 else { 4760 pat_len = 2; 4761 Copy("fi", mod_pat, pat_len, U8); 4762 } 4763 pat = mod_pat; 4764 } 4765 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat) 4766 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8)) 4767 && LIKELY(memNEs(pat, pat_len, 4768 LATIN_SMALL_LETTER_SHARP_S_UTF8)) 4769 && (LIKELY(op != TURKISH || *pat != 'I'))) 4770 { 4771 /* For all cases of things between 0-255, except the ones 4772 * in the conditional above, the fold is just the lower 4773 * case, which is faster than the more general case. */ 4774 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0], 4775 pat[1])); 4776 pat_len = 1; 4777 pat = mod_pat; 4778 utf8_pat = FALSE; 4779 } 4780 else { /* Code point above 255, or needs special handling */ 4781 _to_utf8_fold_flags(pat, pat + pat_len, 4782 mod_pat, &pat_len, 4783 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE); 4784 pat = mod_pat; 4785 } 4786 } 4787 } 4788 else if /* Below is not a UTF-8 pattern; there's a somewhat different 4789 set of problematic characters */ 4790 ((multi_fold_from 4791 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len))) 4792 { 4793 /* We may have to canonicalize a multi-char fold, as in the UTF-8 4794 * case */ 4795 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, 4796 FOLD_FLAGS_FULL); 4797 pat = mod_pat; 4798 } 4799 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) { 4800 mod_pat[0] = mod_pat[1] = 's'; 4801 pat_len = 2; 4802 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant 4803 chars, and speeds copying */ 4804 pat = mod_pat; 4805 } 4806 else if (LIKELY(op != TURKISH || *pat != 'I')) { 4807 mod_pat[0] = toLOWER_L1(*pat); 4808 pat_len = 1; 4809 pat = mod_pat; 4810 } 4811 } 4812 else if /* Below isn't a node that we convert to UTF-8 */ 4813 ( utf8_target 4814 && ! utf8_pat 4815 && op == EXACTFAA_NO_TRIE 4816 && *pat == LATIN_SMALL_LETTER_SHARP_S) 4817 { 4818 /* A very special case. Folding U+DF goes to U+17F under /iaa. We 4819 * did this at compile time when the pattern was UTF-8 , but otherwise 4820 * we couldn't do it earlier, because it requires a UTF-8 target for 4821 * this match to be legal. */ 4822 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1); 4823 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 4824 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8); 4825 pat = mod_pat; 4826 utf8_pat = TRUE; 4827 } 4828 4829 /* Here, we have taken care of the initial work for a few very problematic 4830 * situations, possibly creating a modified pattern. 4831 * 4832 * Now ready for the general case. We build up all the possible things 4833 * that could match the first character of the pattern into the elements of 4834 * 'matches[]' 4835 * 4836 * Everything generally matches at least itself. But if there is a 4837 * UTF8ness mismatch, we have to convert to that of the target string. */ 4838 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */ 4839 matches[0][0] = pat[0]; 4840 lengths[0] = 1; 4841 m->count++; 4842 } 4843 else if (utf8_target) { 4844 if (utf8_pat) { 4845 lengths[0] = UTF8SKIP(pat); 4846 Copy(pat, matches[0], lengths[0], U8); 4847 m->count++; 4848 } 4849 else { /* target is UTF-8, pattern isn't */ 4850 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]); 4851 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]); 4852 lengths[0] = 2; 4853 m->count++; 4854 } 4855 } 4856 else if (! utf8_pat) { /* Neither is UTF-8 */ 4857 matches[0][0] = pat[0]; 4858 lengths[0] = 1; 4859 m->count++; 4860 } 4861 else /* target isn't UTF-8; pattern is. No match possible unless the 4862 pattern's first character can fit in a byte */ 4863 if (UTF8_IS_DOWNGRADEABLE_START(*pat)) 4864 { 4865 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]); 4866 lengths[0] = 1; 4867 m->count++; 4868 } 4869 4870 /* Here we have taken care of any necessary node-type changes */ 4871 4872 if (m->count) { 4873 m->max_length = lengths[0]; 4874 m->min_length = lengths[0]; 4875 } 4876 4877 /* For non-folding nodes, there are no other possible candidate matches, 4878 * but for foldable ones, we have to look further. */ 4879 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */ 4880 UV folded; /* The first character in the pattern, folded */ 4881 U32 first_fold_from; /* A character that folds to it */ 4882 const U32 * remaining_fold_froms; /* The remaining characters that 4883 fold to it, if any */ 4884 Size_t folds_to_count; /* The total number of characters that fold to 4885 'folded' */ 4886 4887 /* If the node begins with a sequence of more than one character that 4888 * together form the fold of a single character, it is called a 4889 * 'multi-character fold', and the normal functions don't handle this 4890 * case. We set 'multi_fold_from' to the single folded-from character, 4891 * which is handled in an extra iteration below */ 4892 if (utf8_pat) { 4893 folded = valid_utf8_to_uvchr(pat, NULL); 4894 multi_fold_from 4895 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); 4896 } 4897 else { 4898 folded = *pat; 4899 4900 /* This may generate illegal combinations for things like EXACTF, 4901 * but rather than repeat the logic and exclude them here, all such 4902 * illegalities are checked for and skipped below in the loop */ 4903 multi_fold_from 4904 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len); 4905 } 4906 4907 /* Everything matches at least itself; initialize to that because the 4908 * only the branches below that set it are the ones where the number 4909 * isn't 1. */ 4910 folds_to_count = 1; 4911 4912 /* There are a few special cases for locale-dependent nodes, where the 4913 * run-time context was needed before we could know what matched */ 4914 if (UNLIKELY(op == EXACTFL) && folded < 256) { 4915 first_fold_from = PL_fold_locale[folded]; 4916 } 4917 else if ( op == EXACTFL && utf8_target && utf8_pat 4918 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8 4919 LATIN_SMALL_LETTER_LONG_S_UTF8)) 4920 { 4921 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S; 4922 } 4923 else if (UNLIKELY( op == TURKISH 4924 && ( isALPHA_FOLD_EQ(folded, 'i') 4925 || inRANGE(folded, 4926 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE, 4927 LATIN_SMALL_LETTER_DOTLESS_I)))) 4928 { /* Turkish folding requires special handling */ 4929 if (folded == 'i') 4930 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; 4931 else if (folded == 'I') 4932 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I; 4933 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) 4934 first_fold_from = 'i'; 4935 else first_fold_from = 'I'; 4936 } 4937 else { 4938 /* Here, isn't a special case: use the generic function to 4939 * calculate what folds to this */ 4940 redo_multi: 4941 /* Look up what code points (besides itself) fold to 'folded'; 4942 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */ 4943 folds_to_count = _inverse_folds(folded, &first_fold_from, 4944 &remaining_fold_froms); 4945 } 4946 4947 /* Add each character that folds to 'folded' to the list of them, 4948 * subject to limitations based on the node type and target UTF8ness. 4949 * If there was a character that folded to multiple characters, do an 4950 * extra iteration for it. (Note the extra iteration if there is a 4951 * multi-character fold) */ 4952 for (i = 0; i < folds_to_count 4953 + UNLIKELY(multi_fold_from != 0); i++) 4954 { 4955 UV fold_from = 0; 4956 4957 if (i >= folds_to_count) { /* Final iteration: handle the 4958 multi-char */ 4959 fold_from = multi_fold_from; 4960 } 4961 else if (i == 0) { 4962 fold_from = first_fold_from; 4963 } 4964 else if (i < folds_to_count) { 4965 fold_from = remaining_fold_froms[i-1]; 4966 } 4967 4968 if (folded == fold_from) { /* We already added the character 4969 itself */ 4970 continue; 4971 } 4972 4973 /* EXACTF doesn't have any non-ascii folds */ 4974 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) { 4975 continue; 4976 } 4977 4978 /* In /iaa nodes, neither or both must be ASCII to be a legal fold 4979 * */ 4980 if ( isASCII(folded) != isASCII(fold_from) 4981 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE)) 4982 4983 { 4984 continue; 4985 } 4986 4987 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8 4988 * locale, but those have been converted to EXACTFU above) */ 4989 if ( op == EXACTFL 4990 && (folded < 256) != (fold_from < 256)) 4991 { 4992 continue; 4993 } 4994 4995 /* If this triggers, it likely is because of the unlikely case 4996 * where a new Unicode standard has changed what MAX_MATCHES should 4997 * be set to */ 4998 assert(m->count < MAX_MATCHES); 4999 5000 /* Add this character to the list of possible matches */ 5001 if (utf8_target) { 5002 uvchr_to_utf8(matches[(U8) m->count], fold_from); 5003 lengths[m->count] = UVCHR_SKIP(fold_from); 5004 m->count++; 5005 } 5006 else { /* Non-UTF8 target: no code point above 255 can appear in it 5007 */ 5008 if (fold_from > 255) { 5009 continue; 5010 } 5011 5012 matches[m->count][0] = fold_from; 5013 lengths[m->count] = 1; 5014 m->count++; 5015 } 5016 5017 /* Update min and mlengths */ 5018 if (m->min_length > lengths[m->count-1]) { 5019 m->min_length = lengths[m->count-1]; 5020 } 5021 5022 if (m->max_length < lengths[m->count-1]) { 5023 index_of_longest = m->count - 1; 5024 m->max_length = lengths[index_of_longest]; 5025 } 5026 } /* looped through each potential fold */ 5027 5028 /* If there is something that folded to an initial multi-character 5029 * fold, repeat, using it. This catches some edge cases. An example 5030 * of one is /ss/i when UTF-8 encoded. The function 5031 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF 5032 * (LATIN SMALL SHARP S). If it returned a list of characters, this 5033 * code wouldn't be needed. But since it doesn't, we have to look what 5034 * folds to the U+DF. In this case, U+1E9E does, and has to be added. 5035 * */ 5036 if (multi_fold_from) { 5037 folded = multi_fold_from; 5038 multi_fold_from = 0; 5039 goto redo_multi; 5040 } 5041 } /* End of finding things that participate in this fold */ 5042 5043 if (m->count == 0) { /* If nothing found, can't match */ 5044 m->min_length = 0; 5045 return FALSE; 5046 } 5047 5048 /* Have calculated all possible matches. Now calculate the mask and AND 5049 * values */ 5050 m->initial_exact = 0; 5051 m->initial_definitive = 0; 5052 5053 { 5054 unsigned int mask_ones = 0; 5055 unsigned int possible_ones = 0; 5056 U8 j; 5057 5058 /* For each byte that is in all possible matches ... */ 5059 for (j = 0; j < MIN(m->min_length, 5); j++) { 5060 5061 /* Initialize the accumulator for this byte */ 5062 byte_mask[j] = 0xFF; 5063 byte_anded[j] = matches[0][j]; 5064 5065 /* Then the rest of the rows (folds). The mask is based on, like, 5066 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0 5067 * where they differ. */ 5068 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) { 5069 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]); 5070 byte_anded[j] &= matches[i][j]; 5071 } 5072 5073 /* Keep track of the number of initial mask bytes that are all one 5074 * bits. The code calling this can use this number to know that 5075 * a string that matches this number of bytes in the pattern is an 5076 * exact match of that pattern for this number of bytes. But also 5077 * counted are the number of initial bytes that in total have a 5078 * single zero bit. If a string matches those, masked, it must be 5079 * one of two possibilites, both of which this function has 5080 * determined are legal. (But if that single 0 is one of the 5081 * initial bits for masking a UTF-8 start byte, that could 5082 * incorrectly lead to different length strings appearing to be 5083 * equivalent, so only do this optimization when the matchables are 5084 * all the same length. This was uncovered by testing 5085 * /\x{029E}/i.) */ 5086 if (m->min_length == m->max_length) { 5087 mask_ones += PL_bitcount[byte_mask[j]]; 5088 possible_ones += 8; 5089 if (mask_ones + 1 >= possible_ones) { 5090 m->initial_definitive++; 5091 if (mask_ones >= possible_ones) { 5092 m->initial_exact++; 5093 } 5094 } 5095 } 5096 } 5097 } 5098 5099 /* The first byte is separate for speed */ 5100 m->first_byte_mask = byte_mask[0]; 5101 m->first_byte_anded = byte_anded[0]; 5102 5103 /* Then pack up to the next 4 bytes into a word */ 5104 m->mask32 = m->anded32 = 0; 5105 for (i = 1; i < MIN(m->min_length, 5); i++) { 5106 U8 which = i; 5107 U8 shift = (which - 1) * 8; 5108 m->mask32 |= (U32) byte_mask[i] << shift; 5109 m->anded32 |= (U32) byte_anded[i] << shift; 5110 } 5111 5112 /* Finally, take the match strings and place them sequentially into a 5113 * one-dimensional array. (This is done to save significant space in the 5114 * structure.) Sort so the longest (presumably the least likely) is last. 5115 * XXX When this gets moved to regcomp, may want to fully sort shortest 5116 * first, but above we generally used the folded code point first, and 5117 * those tend to be no longer than their upper case values, so this is 5118 * already pretty well sorted by size. 5119 * 5120 * If the asserts fail, it's most likely because a new version of the 5121 * Unicode standard requires more space; simply increase the declaration 5122 * size. */ 5123 { 5124 U8 cur_pos = 0; 5125 U8 output_index = 0; 5126 5127 if (m->count > 1) { /* No need to sort a single entry */ 5128 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) { 5129 5130 /* Keep the same order for all but the longest. (If the 5131 * asserts fail, it could be because m->matches is declared too 5132 * short, either because of a new Unicode release, or an 5133 * overlooked test case, or it could be a bug.) */ 5134 if (i != index_of_longest) { 5135 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches)); 5136 Copy(matches[i], m->matches + cur_pos, lengths[i], U8); 5137 cur_pos += lengths[i]; 5138 m->lengths[output_index++] = lengths[i]; 5139 } 5140 } 5141 } 5142 5143 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches)); 5144 Copy(matches[index_of_longest], m->matches + cur_pos, 5145 lengths[index_of_longest], U8); 5146 5147 /* Place the longest match last */ 5148 m->lengths[output_index] = lengths[index_of_longest]; 5149 } 5150 5151 5152 return TRUE; 5153} 5154 5155PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */ 5156bool 5157S_test_EXACTISH_ST(const char * loc, 5158 struct next_matchable_info info) 5159{ 5160 /* This function uses the data set up in setup_EXACTISH_ST() to see if the 5161 * bytes starting at 'loc' can match based on 'next_matchable_info' */ 5162 5163 U32 input32 = 0; 5164 5165 /* Check the first byte */ 5166 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded) 5167 return FALSE; 5168 5169 /* Pack the next up-to-4 bytes into a 32 bit word */ 5170 switch (info.min_length) { 5171 default: 5172 input32 |= (U32) ((U8) loc[4]) << 3 * 8; 5173 /* FALLTHROUGH */ 5174 case 4: 5175 input32 |= (U8) loc[3] << 2 * 8; 5176 /* FALLTHROUGH */ 5177 case 3: 5178 input32 |= (U8) loc[2] << 1 * 8; 5179 /* FALLTHROUGH */ 5180 case 2: 5181 input32 |= (U8) loc[1]; 5182 break; 5183 case 1: 5184 return TRUE; /* We already tested and passed the 0th byte */ 5185 case 0: 5186 ASSUME(0); 5187 } 5188 5189 /* And AND that with the mask and compare that with the assembled ANDED 5190 * values */ 5191 return (input32 & info.mask32) == info.anded32; 5192} 5193 5194STATIC bool 5195S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target) 5196{ 5197 /* returns a boolean indicating if there is a Grapheme Cluster Boundary 5198 * between the inputs. See https://www.unicode.org/reports/tr29/. */ 5199 5200 PERL_ARGS_ASSERT_ISGCB; 5201 5202 switch (GCB_table[before][after]) { 5203 case GCB_BREAKABLE: 5204 return TRUE; 5205 5206 case GCB_NOBREAK: 5207 return FALSE; 5208 5209 case GCB_RI_then_RI: 5210 { 5211 int RI_count = 1; 5212 U8 * temp_pos = (U8 *) curpos; 5213 5214 /* Do not break within emoji flag sequences. That is, do not 5215 * break between regional indicator (RI) symbols if there is an 5216 * odd number of RI characters before the break point. 5217 * GB12 sot (RI RI)* RI �� RI 5218 * GB13 [^RI] (RI RI)* RI �� RI */ 5219 5220 while (backup_one_GCB(strbeg, 5221 &temp_pos, 5222 utf8_target) == GCB_Regional_Indicator) 5223 { 5224 RI_count++; 5225 } 5226 5227 return RI_count % 2 != 1; 5228 } 5229 5230 case GCB_EX_then_EM: 5231 5232 /* GB10 ( E_Base | E_Base_GAZ ) Extend* �� E_Modifier */ 5233 { 5234 U8 * temp_pos = (U8 *) curpos; 5235 GCB_enum prev; 5236 5237 do { 5238 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 5239 } 5240 while (prev == GCB_Extend); 5241 5242 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ; 5243 } 5244 5245 case GCB_Maybe_Emoji_NonBreak: 5246 5247 { 5248 5249 /* Do not break within emoji modifier sequences or emoji zwj sequences. 5250 GB11 \p{Extended_Pictographic} Extend* ZWJ �� \p{Extended_Pictographic} 5251 */ 5252 U8 * temp_pos = (U8 *) curpos; 5253 GCB_enum prev; 5254 5255 do { 5256 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target); 5257 } 5258 while (prev == GCB_Extend); 5259 5260 return prev != GCB_ExtPict_XX; 5261 } 5262 5263 default: 5264 break; 5265 } 5266 5267#ifdef DEBUGGING 5268 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n", 5269 before, after, GCB_table[before][after]); 5270 assert(0); 5271#endif 5272 return TRUE; 5273} 5274 5275STATIC GCB_enum 5276S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5277{ 5278 GCB_enum gcb; 5279 5280 PERL_ARGS_ASSERT_BACKUP_ONE_GCB; 5281 5282 if (*curpos < strbeg) { 5283 return GCB_EDGE; 5284 } 5285 5286 if (utf8_target) { 5287 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5288 U8 * prev_prev_char_pos; 5289 5290 if (! prev_char_pos) { 5291 return GCB_EDGE; 5292 } 5293 5294 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 5295 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5296 *curpos = prev_char_pos; 5297 prev_char_pos = prev_prev_char_pos; 5298 } 5299 else { 5300 *curpos = (U8 *) strbeg; 5301 return GCB_EDGE; 5302 } 5303 } 5304 else { 5305 if (*curpos - 2 < strbeg) { 5306 *curpos = (U8 *) strbeg; 5307 return GCB_EDGE; 5308 } 5309 (*curpos)--; 5310 gcb = getGCB_VAL_CP(*(*curpos - 1)); 5311 } 5312 5313 return gcb; 5314} 5315 5316/* Combining marks attach to most classes that precede them, but this defines 5317 * the exceptions (from TR14) */ 5318#define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \ 5319 || prev == LB_Mandatory_Break \ 5320 || prev == LB_Carriage_Return \ 5321 || prev == LB_Line_Feed \ 5322 || prev == LB_Next_Line \ 5323 || prev == LB_Space \ 5324 || prev == LB_ZWSpace)) 5325 5326STATIC bool 5327S_isLB(pTHX_ LB_enum before, 5328 LB_enum after, 5329 const U8 * const strbeg, 5330 const U8 * const curpos, 5331 const U8 * const strend, 5332 const bool utf8_target) 5333{ 5334 U8 * temp_pos = (U8 *) curpos; 5335 LB_enum prev = before; 5336 5337 /* Is the boundary between 'before' and 'after' line-breakable? 5338 * Most of this is just a table lookup of a generated table from Unicode 5339 * rules. But some rules require context to decide, and so have to be 5340 * implemented in code */ 5341 5342 PERL_ARGS_ASSERT_ISLB; 5343 5344 /* Rule numbers in the comments below are as of Unicode 9.0 */ 5345 5346 redo: 5347 before = prev; 5348 switch (LB_table[before][after]) { 5349 case LB_BREAKABLE: 5350 return TRUE; 5351 5352 case LB_NOBREAK: 5353 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 5354 return FALSE; 5355 5356 case LB_SP_foo + LB_BREAKABLE: 5357 case LB_SP_foo + LB_NOBREAK: 5358 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN: 5359 5360 /* When we have something following a SP, we have to look at the 5361 * context in order to know what to do. 5362 * 5363 * SP SP should not reach here because LB7: Do not break before 5364 * spaces. (For two spaces in a row there is nothing that 5365 * overrides that) */ 5366 assert(after != LB_Space); 5367 5368 /* Here we have a space followed by a non-space. Mostly this is a 5369 * case of LB18: "Break after spaces". But there are complications 5370 * as the handling of spaces is somewhat tricky. They are in a 5371 * number of rules, which have to be applied in priority order, but 5372 * something earlier in the string can cause a rule to be skipped 5373 * and a lower priority rule invoked. A prime example is LB7 which 5374 * says don't break before a space. But rule LB8 (lower priority) 5375 * says that the first break opportunity after a ZW is after any 5376 * span of spaces immediately after it. If a ZW comes before a SP 5377 * in the input, rule LB8 applies, and not LB7. Other such rules 5378 * involve combining marks which are rules 9 and 10, but they may 5379 * override higher priority rules if they come earlier in the 5380 * string. Since we're doing random access into the middle of the 5381 * string, we have to look for rules that should get applied based 5382 * on both string position and priority. Combining marks do not 5383 * attach to either ZW nor SP, so we don't have to consider them 5384 * until later. 5385 * 5386 * To check for LB8, we have to find the first non-space character 5387 * before this span of spaces */ 5388 do { 5389 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5390 } 5391 while (prev == LB_Space); 5392 5393 /* LB8 Break before any character following a zero-width space, 5394 * even if one or more spaces intervene. 5395 * ZW SP* �� 5396 * So if we have a ZW just before this span, and to get here this 5397 * is the final space in the span. */ 5398 if (prev == LB_ZWSpace) { 5399 return TRUE; 5400 } 5401 5402 /* Here, not ZW SP+. There are several rules that have higher 5403 * priority than LB18 and can be resolved now, as they don't depend 5404 * on anything earlier in the string (except ZW, which we have 5405 * already handled). One of these rules is LB11 Do not break 5406 * before Word joiner, but we have specially encoded that in the 5407 * lookup table so it is caught by the single test below which 5408 * catches the other ones. */ 5409 if (LB_table[LB_Space][after] - LB_SP_foo 5410 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN) 5411 { 5412 return FALSE; 5413 } 5414 5415 /* If we get here, we have to XXX consider combining marks. */ 5416 if (prev == LB_Combining_Mark) { 5417 5418 /* What happens with these depends on the character they 5419 * follow. */ 5420 do { 5421 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5422 } 5423 while (prev == LB_Combining_Mark); 5424 5425 /* Most times these attach to and inherit the characteristics 5426 * of that character, but not always, and when not, they are to 5427 * be treated as AL by rule LB10. */ 5428 if (! LB_CM_ATTACHES_TO(prev)) { 5429 prev = LB_Alphabetic; 5430 } 5431 } 5432 5433 /* Here, we have the character preceding the span of spaces all set 5434 * up. We follow LB18: "Break after spaces" unless the table shows 5435 * that is overridden */ 5436 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN; 5437 5438 case LB_CM_ZWJ_foo: 5439 5440 /* We don't know how to treat the CM except by looking at the first 5441 * non-CM character preceding it. ZWJ is treated as CM */ 5442 do { 5443 prev = backup_one_LB(strbeg, &temp_pos, utf8_target); 5444 } 5445 while (prev == LB_Combining_Mark || prev == LB_ZWJ); 5446 5447 /* Here, 'prev' is that first earlier non-CM character. If the CM 5448 * attaches to it, then it inherits the behavior of 'prev'. If it 5449 * doesn't attach, it is to be treated as an AL */ 5450 if (! LB_CM_ATTACHES_TO(prev)) { 5451 prev = LB_Alphabetic; 5452 } 5453 5454 goto redo; 5455 5456 case LB_HY_or_BA_then_foo + LB_BREAKABLE: 5457 case LB_HY_or_BA_then_foo + LB_NOBREAK: 5458 5459 /* LB21a Don't break after Hebrew + Hyphen. 5460 * HL (HY | BA) �� */ 5461 5462 if (backup_one_LB(strbeg, &temp_pos, utf8_target) 5463 == LB_Hebrew_Letter) 5464 { 5465 return FALSE; 5466 } 5467 5468 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE; 5469 5470 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE: 5471 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK: 5472 5473 /* LB25a (PR | PO) �� ( OP | HY )? NU */ 5474 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) { 5475 return FALSE; 5476 } 5477 5478 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY 5479 == LB_BREAKABLE; 5480 5481 case LB_SY_or_IS_then_various + LB_BREAKABLE: 5482 case LB_SY_or_IS_then_various + LB_NOBREAK: 5483 { 5484 /* LB25d NU (SY | IS)* �� (NU | SY | IS | CL | CP ) */ 5485 5486 LB_enum temp = prev; 5487 do { 5488 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5489 } 5490 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric); 5491 if (temp == LB_Numeric) { 5492 return FALSE; 5493 } 5494 5495 return LB_table[prev][after] - LB_SY_or_IS_then_various 5496 == LB_BREAKABLE; 5497 } 5498 5499 case LB_various_then_PO_or_PR + LB_BREAKABLE: 5500 case LB_various_then_PO_or_PR + LB_NOBREAK: 5501 { 5502 /* LB25e NU (SY | IS)* (CL | CP)? �� (PO | PR) */ 5503 5504 LB_enum temp = prev; 5505 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis) 5506 { 5507 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5508 } 5509 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) { 5510 temp = backup_one_LB(strbeg, &temp_pos, utf8_target); 5511 } 5512 if (temp == LB_Numeric) { 5513 return FALSE; 5514 } 5515 return LB_various_then_PO_or_PR; 5516 } 5517 5518 case LB_RI_then_RI + LB_NOBREAK: 5519 case LB_RI_then_RI + LB_BREAKABLE: 5520 { 5521 int RI_count = 1; 5522 5523 /* LB30a Break between two regional indicator symbols if and 5524 * only if there are an even number of regional indicators 5525 * preceding the position of the break. 5526 * 5527 * sot (RI RI)* RI �� RI 5528 * [^RI] (RI RI)* RI �� RI */ 5529 5530 while (backup_one_LB(strbeg, 5531 &temp_pos, 5532 utf8_target) == LB_Regional_Indicator) 5533 { 5534 RI_count++; 5535 } 5536 5537 return RI_count % 2 == 0; 5538 } 5539 5540 default: 5541 break; 5542 } 5543 5544#ifdef DEBUGGING 5545 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n", 5546 before, after, LB_table[before][after]); 5547 assert(0); 5548#endif 5549 return TRUE; 5550} 5551 5552STATIC LB_enum 5553S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 5554{ 5555 5556 LB_enum lb; 5557 5558 PERL_ARGS_ASSERT_ADVANCE_ONE_LB; 5559 5560 if (*curpos >= strend) { 5561 return LB_EDGE; 5562 } 5563 5564 if (utf8_target) { 5565 *curpos += UTF8SKIP(*curpos); 5566 if (*curpos >= strend) { 5567 return LB_EDGE; 5568 } 5569 lb = getLB_VAL_UTF8(*curpos, strend); 5570 } 5571 else { 5572 (*curpos)++; 5573 if (*curpos >= strend) { 5574 return LB_EDGE; 5575 } 5576 lb = getLB_VAL_CP(**curpos); 5577 } 5578 5579 return lb; 5580} 5581 5582STATIC LB_enum 5583S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5584{ 5585 LB_enum lb; 5586 5587 PERL_ARGS_ASSERT_BACKUP_ONE_LB; 5588 5589 if (*curpos < strbeg) { 5590 return LB_EDGE; 5591 } 5592 5593 if (utf8_target) { 5594 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5595 U8 * prev_prev_char_pos; 5596 5597 if (! prev_char_pos) { 5598 return LB_EDGE; 5599 } 5600 5601 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { 5602 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5603 *curpos = prev_char_pos; 5604 prev_char_pos = prev_prev_char_pos; 5605 } 5606 else { 5607 *curpos = (U8 *) strbeg; 5608 return LB_EDGE; 5609 } 5610 } 5611 else { 5612 if (*curpos - 2 < strbeg) { 5613 *curpos = (U8 *) strbeg; 5614 return LB_EDGE; 5615 } 5616 (*curpos)--; 5617 lb = getLB_VAL_CP(*(*curpos - 1)); 5618 } 5619 5620 return lb; 5621} 5622 5623STATIC bool 5624S_isSB(pTHX_ SB_enum before, 5625 SB_enum after, 5626 const U8 * const strbeg, 5627 const U8 * const curpos, 5628 const U8 * const strend, 5629 const bool utf8_target) 5630{ 5631 /* returns a boolean indicating if there is a Sentence Boundary Break 5632 * between the inputs. See https://www.unicode.org/reports/tr29/ */ 5633 5634 U8 * lpos = (U8 *) curpos; 5635 bool has_para_sep = FALSE; 5636 bool has_sp = FALSE; 5637 5638 PERL_ARGS_ASSERT_ISSB; 5639 5640 /* Break at the start and end of text. 5641 SB1. sot �� 5642 SB2. �� eot 5643 But unstated in Unicode is don't break if the text is empty */ 5644 if (before == SB_EDGE || after == SB_EDGE) { 5645 return before != after; 5646 } 5647 5648 /* SB 3: Do not break within CRLF. */ 5649 if (before == SB_CR && after == SB_LF) { 5650 return FALSE; 5651 } 5652 5653 /* Break after paragraph separators. CR and LF are considered 5654 * so because Unicode views text as like word processing text where there 5655 * are no newlines except between paragraphs, and the word processor takes 5656 * care of wrapping without there being hard line-breaks in the text *./ 5657 SB4. Sep | CR | LF �� */ 5658 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 5659 return TRUE; 5660 } 5661 5662 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF. 5663 * (See Section 6.2, Replacing Ignore Rules.) 5664 SB5. X (Extend | Format)* ��� X */ 5665 if (after == SB_Extend || after == SB_Format) { 5666 5667 /* Implied is that the these characters attach to everything 5668 * immediately prior to them except for those separator-type 5669 * characters. And the rules earlier have already handled the case 5670 * when one of those immediately precedes the extend char */ 5671 return FALSE; 5672 } 5673 5674 if (before == SB_Extend || before == SB_Format) { 5675 U8 * temp_pos = lpos; 5676 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5677 if ( backup != SB_EDGE 5678 && backup != SB_Sep 5679 && backup != SB_CR 5680 && backup != SB_LF) 5681 { 5682 before = backup; 5683 lpos = temp_pos; 5684 } 5685 5686 /* Here, both 'before' and 'backup' are these types; implied is that we 5687 * don't break between them */ 5688 if (backup == SB_Extend || backup == SB_Format) { 5689 return FALSE; 5690 } 5691 } 5692 5693 /* Do not break after ambiguous terminators like period, if they are 5694 * immediately followed by a number or lowercase letter, if they are 5695 * between uppercase letters, if the first following letter (optionally 5696 * after certain punctuation) is lowercase, or if they are followed by 5697 * "continuation" punctuation such as comma, colon, or semicolon. For 5698 * example, a period may be an abbreviation or numeric period, and thus may 5699 * not mark the end of a sentence. 5700 5701 * SB6. ATerm �� Numeric */ 5702 if (before == SB_ATerm && after == SB_Numeric) { 5703 return FALSE; 5704 } 5705 5706 /* SB7. (Upper | Lower) ATerm �� Upper */ 5707 if (before == SB_ATerm && after == SB_Upper) { 5708 U8 * temp_pos = lpos; 5709 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); 5710 if (backup == SB_Upper || backup == SB_Lower) { 5711 return FALSE; 5712 } 5713 } 5714 5715 /* The remaining rules that aren't the final one, all require an STerm or 5716 * an ATerm after having backed up over some Close* Sp*, and in one case an 5717 * optional Paragraph separator, although one rule doesn't have any Sp's in it. 5718 * So do that backup now, setting flags if either Sp or a paragraph 5719 * separator are found */ 5720 5721 if (before == SB_Sep || before == SB_CR || before == SB_LF) { 5722 has_para_sep = TRUE; 5723 before = backup_one_SB(strbeg, &lpos, utf8_target); 5724 } 5725 5726 if (before == SB_Sp) { 5727 has_sp = TRUE; 5728 do { 5729 before = backup_one_SB(strbeg, &lpos, utf8_target); 5730 } 5731 while (before == SB_Sp); 5732 } 5733 5734 while (before == SB_Close) { 5735 before = backup_one_SB(strbeg, &lpos, utf8_target); 5736 } 5737 5738 /* The next few rules apply only when the backed-up-to is an ATerm, and in 5739 * most cases an STerm */ 5740 if (before == SB_STerm || before == SB_ATerm) { 5741 5742 /* So, here the lhs matches 5743 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 5744 * and we have set flags if we found an Sp, or the optional Sep,CR,LF. 5745 * The rules that apply here are: 5746 * 5747 * SB8 ATerm Close* Sp* �� ( ��(OLetter | Upper | Lower | Sep | CR 5748 | LF | STerm | ATerm) )* Lower 5749 SB8a (STerm | ATerm) Close* Sp* �� (SContinue | STerm | ATerm) 5750 SB9 (STerm | ATerm) Close* �� (Close | Sp | Sep | CR | LF) 5751 SB10 (STerm | ATerm) Close* Sp* �� (Sp | Sep | CR | LF) 5752 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? �� 5753 */ 5754 5755 /* And all but SB11 forbid having seen a paragraph separator */ 5756 if (! has_para_sep) { 5757 if (before == SB_ATerm) { /* SB8 */ 5758 U8 * rpos = (U8 *) curpos; 5759 SB_enum later = after; 5760 5761 while ( later != SB_OLetter 5762 && later != SB_Upper 5763 && later != SB_Lower 5764 && later != SB_Sep 5765 && later != SB_CR 5766 && later != SB_LF 5767 && later != SB_STerm 5768 && later != SB_ATerm 5769 && later != SB_EDGE) 5770 { 5771 later = advance_one_SB(&rpos, strend, utf8_target); 5772 } 5773 if (later == SB_Lower) { 5774 return FALSE; 5775 } 5776 } 5777 5778 if ( after == SB_SContinue /* SB8a */ 5779 || after == SB_STerm 5780 || after == SB_ATerm) 5781 { 5782 return FALSE; 5783 } 5784 5785 if (! has_sp) { /* SB9 applies only if there was no Sp* */ 5786 if ( after == SB_Close 5787 || after == SB_Sp 5788 || after == SB_Sep 5789 || after == SB_CR 5790 || after == SB_LF) 5791 { 5792 return FALSE; 5793 } 5794 } 5795 5796 /* SB10. This and SB9 could probably be combined some way, but khw 5797 * has decided to follow the Unicode rule book precisely for 5798 * simplified maintenance */ 5799 if ( after == SB_Sp 5800 || after == SB_Sep 5801 || after == SB_CR 5802 || after == SB_LF) 5803 { 5804 return FALSE; 5805 } 5806 } 5807 5808 /* SB11. */ 5809 return TRUE; 5810 } 5811 5812 /* Otherwise, do not break. 5813 SB12. Any �� Any */ 5814 5815 return FALSE; 5816} 5817 5818STATIC SB_enum 5819S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) 5820{ 5821 SB_enum sb; 5822 5823 PERL_ARGS_ASSERT_ADVANCE_ONE_SB; 5824 5825 if (*curpos >= strend) { 5826 return SB_EDGE; 5827 } 5828 5829 if (utf8_target) { 5830 do { 5831 *curpos += UTF8SKIP(*curpos); 5832 if (*curpos >= strend) { 5833 return SB_EDGE; 5834 } 5835 sb = getSB_VAL_UTF8(*curpos, strend); 5836 } while (sb == SB_Extend || sb == SB_Format); 5837 } 5838 else { 5839 do { 5840 (*curpos)++; 5841 if (*curpos >= strend) { 5842 return SB_EDGE; 5843 } 5844 sb = getSB_VAL_CP(**curpos); 5845 } while (sb == SB_Extend || sb == SB_Format); 5846 } 5847 5848 return sb; 5849} 5850 5851STATIC SB_enum 5852S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 5853{ 5854 SB_enum sb; 5855 5856 PERL_ARGS_ASSERT_BACKUP_ONE_SB; 5857 5858 if (*curpos < strbeg) { 5859 return SB_EDGE; 5860 } 5861 5862 if (utf8_target) { 5863 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 5864 if (! prev_char_pos) { 5865 return SB_EDGE; 5866 } 5867 5868 /* Back up over Extend and Format. curpos is always just to the right 5869 * of the character whose value we are getting */ 5870 do { 5871 U8 * prev_prev_char_pos; 5872 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, 5873 strbeg))) 5874 { 5875 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 5876 *curpos = prev_char_pos; 5877 prev_char_pos = prev_prev_char_pos; 5878 } 5879 else { 5880 *curpos = (U8 *) strbeg; 5881 return SB_EDGE; 5882 } 5883 } while (sb == SB_Extend || sb == SB_Format); 5884 } 5885 else { 5886 do { 5887 if (*curpos - 2 < strbeg) { 5888 *curpos = (U8 *) strbeg; 5889 return SB_EDGE; 5890 } 5891 (*curpos)--; 5892 sb = getSB_VAL_CP(*(*curpos - 1)); 5893 } while (sb == SB_Extend || sb == SB_Format); 5894 } 5895 5896 return sb; 5897} 5898 5899STATIC bool 5900S_isWB(pTHX_ WB_enum previous, 5901 WB_enum before, 5902 WB_enum after, 5903 const U8 * const strbeg, 5904 const U8 * const curpos, 5905 const U8 * const strend, 5906 const bool utf8_target) 5907{ 5908 /* Return a boolean as to if the boundary between 'before' and 'after' is 5909 * a Unicode word break, using their published algorithm, but tailored for 5910 * Perl by treating spans of white space as one unit. Context may be 5911 * needed to make this determination. If the value for the character 5912 * before 'before' is known, it is passed as 'previous'; otherwise that 5913 * should be set to WB_UNKNOWN. The other input parameters give the 5914 * boundaries and current position in the matching of the string. That 5915 * is, 'curpos' marks the position where the character whose wb value is 5916 * 'after' begins. See http://www.unicode.org/reports/tr29/ */ 5917 5918 U8 * before_pos = (U8 *) curpos; 5919 U8 * after_pos = (U8 *) curpos; 5920 WB_enum prev = before; 5921 WB_enum next; 5922 5923 PERL_ARGS_ASSERT_ISWB; 5924 5925 /* Rule numbers in the comments below are as of Unicode 9.0 */ 5926 5927 redo: 5928 before = prev; 5929 switch (WB_table[before][after]) { 5930 case WB_BREAKABLE: 5931 return TRUE; 5932 5933 case WB_NOBREAK: 5934 return FALSE; 5935 5936 case WB_hs_then_hs: /* 2 horizontal spaces in a row */ 5937 next = advance_one_WB(&after_pos, strend, utf8_target, 5938 FALSE /* Don't skip Extend nor Format */ ); 5939 /* A space immediately preceding an Extend or Format is attached 5940 * to by them, and hence gets separated from previous spaces. 5941 * Otherwise don't break between horizontal white space */ 5942 return next == WB_Extend || next == WB_Format; 5943 5944 /* WB4 Ignore Format and Extend characters, except when they appear at 5945 * the beginning of a region of text. This code currently isn't 5946 * general purpose, but it works as the rules are currently and likely 5947 * to be laid out. The reason it works is that when 'they appear at 5948 * the beginning of a region of text', the rule is to break before 5949 * them, just like any other character. Therefore, the default rule 5950 * applies and we don't have to look in more depth. Should this ever 5951 * change, we would have to have 2 'case' statements, like in the rules 5952 * below, and backup a single character (not spacing over the extend 5953 * ones) and then see if that is one of the region-end characters and 5954 * go from there */ 5955 case WB_Ex_or_FO_or_ZWJ_then_foo: 5956 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 5957 goto redo; 5958 5959 case WB_DQ_then_HL + WB_BREAKABLE: 5960 case WB_DQ_then_HL + WB_NOBREAK: 5961 5962 /* WB7c Hebrew_Letter Double_Quote �� Hebrew_Letter */ 5963 5964 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 5965 == WB_Hebrew_Letter) 5966 { 5967 return FALSE; 5968 } 5969 5970 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE; 5971 5972 case WB_HL_then_DQ + WB_BREAKABLE: 5973 case WB_HL_then_DQ + WB_NOBREAK: 5974 5975 /* WB7b Hebrew_Letter �� Double_Quote Hebrew_Letter */ 5976 5977 if (advance_one_WB(&after_pos, strend, utf8_target, 5978 TRUE /* Do skip Extend and Format */ ) 5979 == WB_Hebrew_Letter) 5980 { 5981 return FALSE; 5982 } 5983 5984 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE; 5985 5986 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK: 5987 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE: 5988 5989 /* WB6 (ALetter | Hebrew_Letter) �� (MidLetter | MidNumLet 5990 * | Single_Quote) (ALetter | Hebrew_Letter) */ 5991 5992 next = advance_one_WB(&after_pos, strend, utf8_target, 5993 TRUE /* Do skip Extend and Format */ ); 5994 5995 if (next == WB_ALetter || next == WB_Hebrew_Letter) 5996 { 5997 return FALSE; 5998 } 5999 6000 return WB_table[before][after] 6001 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE; 6002 6003 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK: 6004 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE: 6005 6006 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet 6007 * | Single_Quote) �� (ALetter | Hebrew_Letter) */ 6008 6009 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); 6010 if (prev == WB_ALetter || prev == WB_Hebrew_Letter) 6011 { 6012 return FALSE; 6013 } 6014 6015 return WB_table[before][after] 6016 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE; 6017 6018 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK: 6019 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE: 6020 6021 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) �� Numeric 6022 * */ 6023 6024 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) 6025 == WB_Numeric) 6026 { 6027 return FALSE; 6028 } 6029 6030 return WB_table[before][after] 6031 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE; 6032 6033 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK: 6034 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE: 6035 6036 /* WB12 Numeric �� (MidNum | MidNumLet | Single_Quote) Numeric */ 6037 6038 if (advance_one_WB(&after_pos, strend, utf8_target, 6039 TRUE /* Do skip Extend and Format */ ) 6040 == WB_Numeric) 6041 { 6042 return FALSE; 6043 } 6044 6045 return WB_table[before][after] 6046 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE; 6047 6048 case WB_RI_then_RI + WB_NOBREAK: 6049 case WB_RI_then_RI + WB_BREAKABLE: 6050 { 6051 int RI_count = 1; 6052 6053 /* Do not break within emoji flag sequences. That is, do not 6054 * break between regional indicator (RI) symbols if there is an 6055 * odd number of RI characters before the potential break 6056 * point. 6057 * 6058 * WB15 sot (RI RI)* RI �� RI 6059 * WB16 [^RI] (RI RI)* RI �� RI */ 6060 6061 while (backup_one_WB(&previous, 6062 strbeg, 6063 &before_pos, 6064 utf8_target) == WB_Regional_Indicator) 6065 { 6066 RI_count++; 6067 } 6068 6069 return RI_count % 2 != 1; 6070 } 6071 6072 default: 6073 break; 6074 } 6075 6076#ifdef DEBUGGING 6077 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n", 6078 before, after, WB_table[before][after]); 6079 assert(0); 6080#endif 6081 return TRUE; 6082} 6083 6084STATIC WB_enum 6085S_advance_one_WB(pTHX_ U8 ** curpos, 6086 const U8 * const strend, 6087 const bool utf8_target, 6088 const bool skip_Extend_Format) 6089{ 6090 WB_enum wb; 6091 6092 PERL_ARGS_ASSERT_ADVANCE_ONE_WB; 6093 6094 if (*curpos >= strend) { 6095 return WB_EDGE; 6096 } 6097 6098 if (utf8_target) { 6099 6100 /* Advance over Extend and Format */ 6101 do { 6102 *curpos += UTF8SKIP(*curpos); 6103 if (*curpos >= strend) { 6104 return WB_EDGE; 6105 } 6106 wb = getWB_VAL_UTF8(*curpos, strend); 6107 } while ( skip_Extend_Format 6108 && (wb == WB_Extend || wb == WB_Format)); 6109 } 6110 else { 6111 do { 6112 (*curpos)++; 6113 if (*curpos >= strend) { 6114 return WB_EDGE; 6115 } 6116 wb = getWB_VAL_CP(**curpos); 6117 } while ( skip_Extend_Format 6118 && (wb == WB_Extend || wb == WB_Format)); 6119 } 6120 6121 return wb; 6122} 6123 6124STATIC WB_enum 6125S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target) 6126{ 6127 WB_enum wb; 6128 6129 PERL_ARGS_ASSERT_BACKUP_ONE_WB; 6130 6131 /* If we know what the previous character's break value is, don't have 6132 * to look it up */ 6133 if (*previous != WB_UNKNOWN) { 6134 wb = *previous; 6135 6136 /* But we need to move backwards by one */ 6137 if (utf8_target) { 6138 *curpos = reghopmaybe3(*curpos, -1, strbeg); 6139 if (! *curpos) { 6140 *previous = WB_EDGE; 6141 *curpos = (U8 *) strbeg; 6142 } 6143 else { 6144 *previous = WB_UNKNOWN; 6145 } 6146 } 6147 else { 6148 (*curpos)--; 6149 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN; 6150 } 6151 6152 /* And we always back up over these three types */ 6153 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) { 6154 return wb; 6155 } 6156 } 6157 6158 if (*curpos < strbeg) { 6159 return WB_EDGE; 6160 } 6161 6162 if (utf8_target) { 6163 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); 6164 if (! prev_char_pos) { 6165 return WB_EDGE; 6166 } 6167 6168 /* Back up over Extend and Format. curpos is always just to the right 6169 * of the character whose value we are getting */ 6170 do { 6171 U8 * prev_prev_char_pos; 6172 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, 6173 -1, 6174 strbeg))) 6175 { 6176 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); 6177 *curpos = prev_char_pos; 6178 prev_char_pos = prev_prev_char_pos; 6179 } 6180 else { 6181 *curpos = (U8 *) strbeg; 6182 return WB_EDGE; 6183 } 6184 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ); 6185 } 6186 else { 6187 do { 6188 if (*curpos - 2 < strbeg) { 6189 *curpos = (U8 *) strbeg; 6190 return WB_EDGE; 6191 } 6192 (*curpos)--; 6193 wb = getWB_VAL_CP(*(*curpos - 1)); 6194 } while (wb == WB_Extend || wb == WB_Format); 6195 } 6196 6197 return wb; 6198} 6199 6200/* Macros for regmatch(), using its internal variables */ 6201#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ 6202#define NEXTCHR_IS_EOS (nextbyte < 0) 6203 6204#define SET_nextchr \ 6205 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) 6206 6207#define SET_locinput(p) \ 6208 locinput = (p); \ 6209 SET_nextchr 6210 6211#define sayYES goto yes 6212#define sayNO goto no 6213#define sayNO_SILENT goto no_silent 6214 6215/* we don't use STMT_START/END here because it leads to 6216 "unreachable code" warnings, which are bogus, but distracting. */ 6217#define CACHEsayNO \ 6218 if (ST.cache_mask) \ 6219 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ 6220 sayNO 6221 6222#define EVAL_CLOSE_PAREN_IS(st,expr) \ 6223( \ 6224 ( ( st ) ) && \ 6225 ( ( st )->u.eval.close_paren ) && \ 6226 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 6227) 6228 6229#define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \ 6230( \ 6231 ( ( st ) ) && \ 6232 ( ( st )->u.eval.close_paren ) && \ 6233 ( ( expr ) ) && \ 6234 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \ 6235) 6236 6237 6238#define EVAL_CLOSE_PAREN_SET(st,expr) \ 6239 (st)->u.eval.close_paren = ( (expr) + 1 ) 6240 6241#define EVAL_CLOSE_PAREN_CLEAR(st) \ 6242 (st)->u.eval.close_paren = 0 6243 6244/* push a new state then goto it */ 6245 6246#define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ 6247 pushinput = input; \ 6248 pusheol = eol; \ 6249 pushsr0 = sr0; \ 6250 scan = node; \ 6251 st->resume_state = state; \ 6252 goto push_state; 6253 6254/* push a new state with success backtracking, then goto it */ 6255 6256#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ 6257 pushinput = input; \ 6258 pusheol = eol; \ 6259 pushsr0 = sr0; \ 6260 scan = node; \ 6261 st->resume_state = state; \ 6262 goto push_yes_state; 6263 6264#define DEBUG_STATE_pp(pp) \ 6265 DEBUG_STATE_r({ \ 6266 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \ 6267 Perl_re_printf( aTHX_ \ 6268 "%*s" pp " %s%s%s%s%s\n", \ 6269 INDENT_CHARS(depth), "", \ 6270 REGNODE_NAME(st->resume_state), \ 6271 ((st==yes_state||st==mark_state) ? "[" : ""), \ 6272 ((st==yes_state) ? "Y" : ""), \ 6273 ((st==mark_state) ? "M" : ""), \ 6274 ((st==yes_state||st==mark_state) ? "]" : "") \ 6275 ); \ 6276 }); 6277 6278/* 6279 6280regmatch() - main matching routine 6281 6282This is basically one big switch statement in a loop. We execute an op, 6283set 'next' to point the next op, and continue. If we come to a point which 6284we may need to backtrack to on failure such as (A|B|C), we push a 6285backtrack state onto the backtrack stack. On failure, we pop the top 6286state, and re-enter the loop at the state indicated. If there are no more 6287states to pop, we return failure. 6288 6289Sometimes we also need to backtrack on success; for example /A+/, where 6290after successfully matching one A, we need to go back and try to 6291match another one; similarly for lookahead assertions: if the assertion 6292completes successfully, we backtrack to the state just before the assertion 6293and then carry on. In these cases, the pushed state is marked as 6294'backtrack on success too'. This marking is in fact done by a chain of 6295pointers, each pointing to the previous 'yes' state. On success, we pop to 6296the nearest yes state, discarding any intermediate failure-only states. 6297Sometimes a yes state is pushed just to force some cleanup code to be 6298called at the end of a successful match or submatch; e.g. (??{$re}) uses 6299it to free the inner regex. 6300 6301Note that failure backtracking rewinds the cursor position, while 6302success backtracking leaves it alone. 6303 6304A pattern is complete when the END op is executed, while a subpattern 6305such as (?=foo) is complete when the SUCCESS op is executed. Both of these 6306ops trigger the "pop to last yes state if any, otherwise return true" 6307behaviour. 6308 6309A common convention in this function is to use A and B to refer to the two 6310subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is 6311the subpattern to be matched possibly multiple times, while B is the entire 6312rest of the pattern. Variable and state names reflect this convention. 6313 6314The states in the main switch are the union of ops and failure/success of 6315substates associated with that op. For example, IFMATCH is the op 6316that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 6317'execute IFMATCH'; while IFMATCH_A is a state saying that we have just 6318successfully matched A and IFMATCH_A_fail is a state saying that we have 6319just failed to match A. Resume states always come in pairs. The backtrack 6320state we push is marked as 'IFMATCH_A', but when that is popped, we resume 6321at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking 6322on success or failure. 6323 6324The struct that holds a backtracking state is actually a big union, with 6325one variant for each major type of op. The variable st points to the 6326top-most backtrack struct. To make the code clearer, within each 6327block of code we #define ST to alias the relevant union. 6328 6329Here's a concrete example of a (vastly oversimplified) IFMATCH 6330implementation: 6331 6332 switch (state) { 6333 .... 6334 6335#define ST st->u.ifmatch 6336 6337 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 6338 ST.foo = ...; // some state we wish to save 6339 ... 6340 // push a yes backtrack state with a resume value of 6341 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the 6342 // first node of A: 6343 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); 6344 // NOTREACHED 6345 6346 case IFMATCH_A: // we have successfully executed A; now continue with B 6347 next = B; 6348 bar = ST.foo; // do something with the preserved value 6349 break; 6350 6351 case IFMATCH_A_fail: // A failed, so the assertion failed 6352 ...; // do some housekeeping, then ... 6353 sayNO; // propagate the failure 6354 6355#undef ST 6356 6357 ... 6358 } 6359 6360For any old-timers reading this who are familiar with the old recursive 6361approach, the code above is equivalent to: 6362 6363 case IFMATCH: // we are executing the IFMATCH op, (?=A)B 6364 { 6365 int foo = ... 6366 ... 6367 if (regmatch(A)) { 6368 next = B; 6369 bar = foo; 6370 break; 6371 } 6372 ...; // do some housekeeping, then ... 6373 sayNO; // propagate the failure 6374 } 6375 6376The topmost backtrack state, pointed to by st, is usually free. If you 6377want to claim it, populate any ST.foo fields in it with values you wish to 6378save, then do one of 6379 6380 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol); 6381 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol); 6382 6383which sets that backtrack state's resume value to 'resume_state', pushes a 6384new free entry to the top of the backtrack stack, then goes to 'node'. 6385On backtracking, the free slot is popped, and the saved state becomes the 6386new free state. An ST.foo field in this new top state can be temporarily 6387accessed to retrieve values, but once the main loop is re-entered, it 6388becomes available for reuse. 6389 6390Note that the depth of the backtrack stack constantly increases during the 6391left-to-right execution of the pattern, rather than going up and down with 6392the pattern nesting. For example the stack is at its maximum at Z at the 6393end of the pattern, rather than at X in the following: 6394 6395 /(((X)+)+)+....(Y)+....Z/ 6396 6397The only exceptions to this are lookahead/behind assertions and the cut, 6398(?>A), which pop all the backtrack states associated with A before 6399continuing. 6400 6401Backtrack state structs are allocated in slabs of about 4K in size. 6402PL_regmatch_state and st always point to the currently active state, 6403and PL_regmatch_slab points to the slab currently containing 6404PL_regmatch_state. The first time regmatch() is called, the first slab is 6405allocated, and is never freed until interpreter destruction. When the slab 6406is full, a new one is allocated and chained to the end. At exit from 6407regmatch(), slabs allocated since entry are freed. 6408 6409In order to work with variable length lookbehinds, an upper limit is placed on 6410lookbehinds which is set to where the match position is at the end of where the 6411lookbehind would get to. Nothing in the lookbehind should match above that, 6412except we should be able to look beyond if for things like \b, which need the 6413next character in the string to be able to determine if this is a boundary or 6414not. We also can't match the end of string/line unless we are also at the end 6415of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs 6416that match a width, we have to add a condition that they are within the legal 6417bounds of our window into the string. 6418 6419*/ 6420 6421/* returns -1 on failure, $+[0] on success */ 6422STATIC SSize_t 6423S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) 6424{ 6425 const bool utf8_target = reginfo->is_utf8_target; 6426 const U32 uniflags = UTF8_ALLOW_DEFAULT; 6427 REGEXP *rex_sv = reginfo->prog; 6428 regexp *rex = ReANY(rex_sv); 6429 RXi_GET_DECL(rex,rexi); 6430 /* the current state. This is a cached copy of PL_regmatch_state */ 6431 regmatch_state *st; 6432 /* cache heavy used fields of st in registers */ 6433 regnode *scan; 6434 regnode *next; 6435 U32 n = 0; /* general value; init to avoid compiler warning */ 6436 U32 utmp = 0; /* tmp variable - valid for at most one opcode */ 6437 SSize_t ln = 0; /* len or last; init to avoid compiler warning */ 6438 SSize_t endref = 0; /* offset of end of backref when ln is start */ 6439 char *locinput = startpos; 6440 char *loceol = reginfo->strend; 6441 char *pushinput; /* where to continue after a PUSH */ 6442 char *pusheol; /* where to stop matching (loceol) after a PUSH */ 6443 U8 *pushsr0; /* save starting pos of script run */ 6444 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1 6445 at EOS */ 6446 6447 bool result = 0; /* return value of S_regmatch */ 6448 U32 depth = 0; /* depth of backtrack stack */ 6449 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ 6450 const U32 max_nochange_depth = 6451 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? 6452 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; 6453 regmatch_state *yes_state = NULL; /* state to pop to on success of 6454 subpattern */ 6455 /* mark_state piggy backs on the yes_state logic so that when we unwind 6456 the stack on success we can update the mark_state as we go */ 6457 regmatch_state *mark_state = NULL; /* last mark state we have seen */ 6458 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ 6459 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ 6460 U32 state_num; 6461 bool no_final = 0; /* prevent failure from backtracking? */ 6462 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ 6463 char *startpoint = locinput; 6464 SV *popmark = NULL; /* are we looking for a mark? */ 6465 SV *sv_commit = NULL; /* last mark name seen in failure */ 6466 SV *sv_yes_mark = NULL; /* last mark name we have seen 6467 during a successful match */ 6468 U32 lastopen = 0; /* last open we saw */ 6469 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0; 6470 SV* const oreplsv = GvSVn(PL_replgv); 6471 /* these three flags are set by various ops to signal information to 6472 * the very next op. They have a useful lifetime of exactly one loop 6473 * iteration, and are not preserved or restored by state pushes/pops 6474 */ 6475 bool sw = 0; /* the condition value in (?(cond)a|b) */ 6476 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ 6477 int logical = 0; /* the following EVAL is: 6478 0: (?{...}) 6479 1: (?(?{...})X|Y) 6480 2: (??{...}) 6481 or the following IFMATCH/UNLESSM is: 6482 false: plain (?=foo) 6483 true: used as a condition: (?(?=foo)) 6484 */ 6485 PAD* last_pad = NULL; 6486 dMULTICALL; 6487 U8 gimme = G_SCALAR; 6488 CV *caller_cv = NULL; /* who called us */ 6489 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ 6490 U32 maxopenparen = 0; /* max '(' index seen so far */ 6491 int to_complement; /* Invert the result? */ 6492 char_class_number_ classnum; 6493 bool is_utf8_pat = reginfo->is_utf8_pat; 6494 bool match = FALSE; 6495 I32 orig_savestack_ix = PL_savestack_ix; 6496 U8 * script_run_begin = NULL; 6497 char *match_end= NULL; /* where a match MUST end to be considered successful */ 6498 bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */ 6499 re_fold_t folder = NULL; /* used by various EXACTish regops */ 6500 const U8 * fold_array = NULL; /* used by various EXACTish regops */ 6501 6502/* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */ 6503#if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL)) 6504# define SOLARIS_BAD_OPTIMIZER 6505 const U32 *pl_charclass_dup = PL_charclass; 6506# define PL_charclass pl_charclass_dup 6507#endif 6508 6509#ifdef DEBUGGING 6510 DECLARE_AND_GET_RE_DEBUG_FLAGS; 6511#endif 6512 6513 /* protect against undef(*^R) */ 6514 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); 6515 6516 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ 6517 multicall_oldcatch = 0; 6518 PERL_UNUSED_VAR(multicall_cop); 6519 6520 PERL_ARGS_ASSERT_REGMATCH; 6521 6522 st = PL_regmatch_state; 6523 6524 /* Note that nextbyte is a byte even in UTF */ 6525 SET_nextchr; 6526 scan = prog; 6527 6528 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ 6529 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 6530 Perl_re_printf( aTHX_ "regmatch start\n" ); 6531 })); 6532 6533 while (scan != NULL) { 6534 next = scan + NEXT_OFF(scan); 6535 if (next == scan) 6536 next = NULL; 6537 state_num = OP(scan); 6538 6539 reenter_switch: 6540 DEBUG_EXECUTE_r( 6541 if (state_num <= REGNODE_MAX) { 6542 SV * const prop = sv_newmortal(); 6543 regnode *rnext = regnext(scan); 6544 6545 DUMP_EXEC_POS( locinput, scan, utf8_target, depth ); 6546 regprop(rex, prop, scan, reginfo, NULL); 6547 Perl_re_printf( aTHX_ 6548 "%*s%" IVdf ":%s(%" IVdf ")\n", 6549 INDENT_CHARS(depth), "", 6550 (IV)(scan - rexi->program), 6551 SvPVX_const(prop), 6552 (REGNODE_TYPE(OP(scan)) == END || !rnext) ? 6553 0 : (IV)(rnext - rexi->program)); 6554 } 6555 ); 6556 6557 to_complement = 0; 6558 6559 SET_nextchr; 6560 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS)); 6561 6562 switch (state_num) { 6563 SV * anyofh_list; 6564 6565 case SBOL: /* /^../ and /\A../ */ 6566 if (locinput == reginfo->strbeg) 6567 break; 6568 sayNO; 6569 6570 case MBOL: /* /^../m */ 6571 if (locinput == reginfo->strbeg || 6572 (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) 6573 { 6574 break; 6575 } 6576 sayNO; 6577 6578 case GPOS: /* \G */ 6579 if (locinput == reginfo->ganch) 6580 break; 6581 sayNO; 6582 6583 case KEEPS: /* \K */ 6584 /* update the startpoint */ 6585 st->u.keeper.val = RXp_OFFS_START(rex,0); 6586 RXp_OFFSp(rex)[0].start = locinput - reginfo->strbeg; 6587 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, 6588 script_run_begin); 6589 NOT_REACHED; /* NOTREACHED */ 6590 6591 case KEEPS_next_fail: 6592 /* rollback the start point change */ 6593 RXp_OFFSp(rex)[0].start = st->u.keeper.val; 6594 sayNO_SILENT; 6595 NOT_REACHED; /* NOTREACHED */ 6596 6597 case MEOL: /* /..$/m */ 6598 if (!NEXTCHR_IS_EOS && nextbyte != '\n') 6599 sayNO; 6600 break; 6601 6602 case SEOL: /* /..$/ */ 6603 if (!NEXTCHR_IS_EOS && nextbyte != '\n') 6604 sayNO; 6605 if (reginfo->strend - locinput > 1) 6606 sayNO; 6607 break; 6608 6609 case EOS: /* \z */ 6610 if (!NEXTCHR_IS_EOS) 6611 sayNO; 6612 break; 6613 6614 case SANY: /* /./s */ 6615 if (NEXTCHR_IS_EOS || locinput >= loceol) 6616 sayNO; 6617 goto increment_locinput; 6618 6619 case REG_ANY: /* /./ */ 6620 if ( NEXTCHR_IS_EOS 6621 || locinput >= loceol 6622 || nextbyte == '\n') 6623 { 6624 sayNO; 6625 } 6626 goto increment_locinput; 6627 6628 6629#undef ST 6630#define ST st->u.trie 6631 case TRIEC: /* (ab|cd) with known charclass */ 6632 /* In this case the charclass data is available inline so 6633 we can fail fast without a lot of extra overhead. 6634 */ 6635 if ( ! NEXTCHR_IS_EOS 6636 && locinput < loceol 6637 && ! ANYOF_BITMAP_TEST(scan, nextbyte)) 6638 { 6639 DEBUG_EXECUTE_r( 6640 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 6641 depth, PL_colors[4], PL_colors[5]) 6642 ); 6643 sayNO_SILENT; 6644 NOT_REACHED; /* NOTREACHED */ 6645 } 6646 /* FALLTHROUGH */ 6647 case TRIE: /* (ab|cd) */ 6648 /* the basic plan of execution of the trie is: 6649 * At the beginning, run though all the states, and 6650 * find the longest-matching word. Also remember the position 6651 * of the shortest matching word. For example, this pattern: 6652 * 1 2 3 4 5 6653 * ab|a|x|abcd|abc 6654 * when matched against the string "abcde", will generate 6655 * accept states for all words except 3, with the longest 6656 * matching word being 4, and the shortest being 2 (with 6657 * the position being after char 1 of the string). 6658 * 6659 * Then for each matching word, in word order (i.e. 1,2,4,5), 6660 * we run the remainder of the pattern; on each try setting 6661 * the current position to the character following the word, 6662 * returning to try the next word on failure. 6663 * 6664 * We avoid having to build a list of words at runtime by 6665 * using a compile-time structure, wordinfo[].prev, which 6666 * gives, for each word, the previous accepting word (if any). 6667 * In the case above it would contain the mappings 1->2, 2->0, 6668 * 3->0, 4->5, 5->1. We can use this table to generate, from 6669 * the longest word (4 above), a list of all words, by 6670 * following the list of prev pointers; this gives us the 6671 * unordered list 4,5,1,2. Then given the current word we have 6672 * just tried, we can go through the list and find the 6673 * next-biggest word to try (so if we just failed on word 2, 6674 * the next in the list is 4). 6675 * 6676 * Since at runtime we don't record the matching position in 6677 * the string for each word, we have to work that out for 6678 * each word we're about to process. The wordinfo table holds 6679 * the character length of each word; given that we recorded 6680 * at the start: the position of the shortest word and its 6681 * length in chars, we just need to move the pointer the 6682 * difference between the two char lengths. Depending on 6683 * Unicode status and folding, that's cheap or expensive. 6684 * 6685 * This algorithm is optimised for the case where are only a 6686 * small number of accept states, i.e. 0,1, or maybe 2. 6687 * With lots of accepts states, and having to try all of them, 6688 * it becomes quadratic on number of accept states to find all 6689 * the next words. 6690 */ 6691 6692 { 6693 /* what type of TRIE am I? (utf8 makes this contextual) */ 6694 DECL_TRIE_TYPE(scan); 6695 6696 /* what trie are we using right now */ 6697 reg_trie_data * const trie 6698 = (reg_trie_data*)rexi->data->data[ ARG1u( scan ) ]; 6699 ST.before_paren = trie->before_paren; 6700 ST.after_paren = trie->after_paren; 6701 assert(ST.before_paren<=rex->nparens); 6702 assert(ST.after_paren<=rex->nparens); 6703 6704 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG1u( scan ) + 1 ]); 6705 U32 state = trie->startstate; 6706 6707 if (FLAGS(scan) == EXACTL || FLAGS(scan) == EXACTFLU8) { 6708 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 6709 if (utf8_target 6710 && ! NEXTCHR_IS_EOS 6711 && UTF8_IS_ABOVE_LATIN1(nextbyte) 6712 && FLAGS(scan) == EXACTL) 6713 { 6714 /* We only output for EXACTL, as we let the folder 6715 * output this message for EXACTFLU8 to avoid 6716 * duplication */ 6717 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 6718 reginfo->strend); 6719 } 6720 } 6721 if ( trie->bitmap 6722 && ( NEXTCHR_IS_EOS 6723 || locinput >= loceol 6724 || ! TRIE_BITMAP_TEST(trie, nextbyte))) 6725 { 6726 if (trie->states[ state ].wordnum) { 6727 DEBUG_EXECUTE_r( 6728 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n", 6729 depth, PL_colors[4], PL_colors[5]) 6730 ); 6731 if (!trie->jump) 6732 break; 6733 } else { 6734 DEBUG_EXECUTE_r( 6735 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", 6736 depth, PL_colors[4], PL_colors[5]) 6737 ); 6738 sayNO_SILENT; 6739 } 6740 } 6741 6742 { 6743 U8 *uc = ( U8* )locinput; 6744 6745 STRLEN len = 0; 6746 STRLEN foldlen = 0; 6747 U8 *uscan = (U8*)NULL; 6748 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; 6749 U32 charcount = 0; /* how many input chars we have matched */ 6750 U32 accepted = 0; /* have we seen any accepting states? */ 6751 6752 ST.jump = trie->jump; 6753 ST.j_before_paren = trie->j_before_paren; 6754 ST.j_after_paren= trie->j_after_paren; 6755 ST.me = scan; 6756 ST.firstpos = NULL; 6757 ST.longfold = FALSE; /* char longer if folded => it's harder */ 6758 ST.nextword = 0; 6759 6760 /* fully traverse the TRIE; note the position of the 6761 shortest accept state and the wordnum of the longest 6762 accept state */ 6763 6764 while ( state && uc <= (U8*)(loceol) ) { 6765 U32 base = trie->states[ state ].trans.base; 6766 UV uvc = 0; 6767 U16 charid = 0; 6768 U16 wordnum; 6769 wordnum = trie->states[ state ].wordnum; 6770 6771 if (wordnum) { /* it's an accept state */ 6772 if (!accepted) { 6773 accepted = 1; 6774 /* record first match position */ 6775 if (ST.longfold) { 6776 ST.firstpos = (U8*)locinput; 6777 ST.firstchars = 0; 6778 } 6779 else { 6780 ST.firstpos = uc; 6781 ST.firstchars = charcount; 6782 } 6783 } 6784 if (!ST.nextword || wordnum < ST.nextword) 6785 ST.nextword = wordnum; 6786 ST.topword = wordnum; 6787 } 6788 6789 DEBUG_TRIE_EXECUTE_r({ 6790 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth ); 6791 /* HERE */ 6792 PerlIO_printf( Perl_debug_log, 6793 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ", 6794 INDENT_CHARS(depth), "", PL_colors[4], 6795 (UV)state, (accepted ? 'Y' : 'N')); 6796 }); 6797 6798 /* read a char and goto next state */ 6799 if ( base && (foldlen || uc < (U8*)(loceol))) { 6800 I32 offset; 6801 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, 6802 (U8 *) loceol, uscan, 6803 len, uvc, charid, foldlen, 6804 foldbuf, uniflags); 6805 charcount++; 6806 if (foldlen>0) 6807 ST.longfold = TRUE; 6808 if (charid && 6809 ( ((offset = 6810 base + charid - 1 - trie->uniquecharcount)) >= 0) 6811 6812 && ((U32)offset < trie->lasttrans) 6813 && trie->trans[offset].check == state) 6814 { 6815 state = trie->trans[offset].next; 6816 } 6817 else { 6818 state = 0; 6819 } 6820 uc += len; 6821 6822 } 6823 else { 6824 state = 0; 6825 } 6826 DEBUG_TRIE_EXECUTE_r( 6827 Perl_re_printf( aTHX_ 6828 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n", 6829 charid, uvc, (UV)state, PL_colors[5] ); 6830 ); 6831 } 6832 if (!accepted) 6833 sayNO; 6834 6835 /* calculate total number of accept states */ 6836 { 6837 U16 w = ST.topword; 6838 accepted = 0; 6839 while (w) { 6840 w = trie->wordinfo[w].prev; 6841 accepted++; 6842 } 6843 ST.accepted = accepted; 6844 } 6845 6846 DEBUG_EXECUTE_r( 6847 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n", 6848 depth, 6849 PL_colors[4], (IV)ST.accepted, PL_colors[5] ); 6850 ); 6851 goto trie_first_try; /* jump into the fail handler */ 6852 }} 6853 NOT_REACHED; /* NOTREACHED */ 6854 6855 case TRIE_next_fail: /* we failed - try next alternative */ 6856 { 6857 U8 *uc; 6858 if (RE_PESSIMISTIC_PARENS) { 6859 REGCP_UNWIND(ST.lastcp); 6860 regcppop(rex,&maxopenparen); 6861 } 6862 if ( ST.jump ) { 6863 /* undo any captures done in the tail part of a branch, 6864 * e.g. 6865 * /(?:X(.)(.)|Y(.)).../ 6866 * where the trie just matches X then calls out to do the 6867 * rest of the branch */ 6868 REGCP_UNWIND(ST.cp); 6869 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 6870 if (ST.after_paren) { 6871 assert(ST.before_paren<=rex->nparens && ST.after_paren<=rex->nparens); 6872 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "TRIE_next_fail"); 6873 } 6874 } 6875 if (!--ST.accepted) { 6876 DEBUG_EXECUTE_r({ 6877 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n", 6878 depth, 6879 PL_colors[4], 6880 PL_colors[5] ); 6881 }); 6882 sayNO_SILENT; 6883 } 6884 { 6885 /* Find next-highest word to process. Note that this code 6886 * is O(N^2) per trie run (O(N) per branch), so keep tight */ 6887 U16 min = 0; 6888 U16 word; 6889 U16 const nextword = ST.nextword; 6890 reg_trie_wordinfo * const wordinfo 6891 = ((reg_trie_data*)rexi->data->data[ARG1u(ST.me)])->wordinfo; 6892 for (word=ST.topword; word; word=wordinfo[word].prev) { 6893 if (word > nextword && (!min || word < min)) 6894 min = word; 6895 } 6896 ST.nextword = min; 6897 } 6898 6899 trie_first_try: 6900 if (do_cutgroup) { 6901 do_cutgroup = 0; 6902 no_final = 0; 6903 } 6904 6905 if ( ST.jump ) { 6906 ST.lastparen = RXp_LASTPAREN(rex); 6907 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 6908 REGCP_SET(ST.cp); 6909 } 6910 6911 /* find start char of end of current word */ 6912 { 6913 U32 chars; /* how many chars to skip */ 6914 reg_trie_data * const trie 6915 = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)]; 6916 6917 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) 6918 >= ST.firstchars); 6919 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) 6920 - ST.firstchars; 6921 uc = ST.firstpos; 6922 6923 if (ST.longfold) { 6924 /* the hard option - fold each char in turn and find 6925 * its folded length (which may be different */ 6926 U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; 6927 STRLEN foldlen; 6928 STRLEN len; 6929 UV uvc; 6930 U8 *uscan; 6931 6932 while (chars) { 6933 if (utf8_target) { 6934 /* XXX This assumes the length is well-formed, as 6935 * does the UTF8SKIP below */ 6936 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, 6937 uniflags); 6938 uc += len; 6939 } 6940 else { 6941 uvc = *uc; 6942 uc++; 6943 } 6944 uvc = to_uni_fold(uvc, foldbuf, &foldlen); 6945 uscan = foldbuf; 6946 while (foldlen) { 6947 if (!--chars) 6948 break; 6949 uvc = utf8n_to_uvchr(uscan, foldlen, &len, 6950 uniflags); 6951 uscan += len; 6952 foldlen -= len; 6953 } 6954 } 6955 } 6956 else { 6957 if (utf8_target) 6958 uc = utf8_hop(uc, chars); 6959 else 6960 uc += chars; 6961 } 6962 } 6963 if (ST.jump && ST.jump[ST.nextword]) { 6964 scan = ST.me + ST.jump[ST.nextword]; 6965 ST.before_paren = ST.j_before_paren[ST.nextword]; 6966 assert(ST.before_paren <= rex->nparens); 6967 ST.after_paren = ST.j_after_paren[ST.nextword]; 6968 assert(ST.after_paren <= rex->nparens); 6969 } else { 6970 scan = ST.me + NEXT_OFF(ST.me); 6971 } 6972 6973 6974 DEBUG_EXECUTE_r({ 6975 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n", 6976 depth, 6977 PL_colors[4], 6978 ST.nextword, 6979 PL_colors[5] 6980 ); 6981 }); 6982 6983 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { 6984 if (RE_PESSIMISTIC_PARENS) { 6985 (void)regcppush(rex, 0, maxopenparen); 6986 REGCP_SET(ST.lastcp); 6987 } 6988 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, 6989 script_run_begin); 6990 NOT_REACHED; /* NOTREACHED */ 6991 } 6992 /* only one choice left - just continue */ 6993 DEBUG_EXECUTE_r({ 6994 AV *const trie_words 6995 = MUTABLE_AV(rexi->data->data[ARG1u(ST.me)+TRIE_WORDS_OFFSET]); 6996 SV ** const tmp = trie_words 6997 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL; 6998 SV *sv= tmp ? sv_newmortal() : NULL; 6999 7000 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n", 7001 depth, PL_colors[4], 7002 ST.nextword, 7003 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, 7004 PL_colors[0], PL_colors[1], 7005 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII 7006 ) 7007 : "not compiled under -Dr", 7008 PL_colors[5] ); 7009 }); 7010 7011 locinput = (char*)uc; 7012 continue; /* execute rest of RE */ 7013 /* NOTREACHED */ 7014 } 7015#undef ST 7016 7017 case LEXACT_REQ8: 7018 if (! utf8_target) { 7019 sayNO; 7020 } 7021 /* FALLTHROUGH */ 7022 7023 case LEXACT: 7024 { 7025 char *s; 7026 7027 s = STRINGl(scan); 7028 ln = STR_LENl(scan); 7029 goto join_short_long_exact; 7030 7031 case EXACTL: /* /abc/l */ 7032 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7033 7034 /* Complete checking would involve going through every character 7035 * matched by the string to see if any is above latin1. But the 7036 * comparison otherwise might very well be a fast assembly 7037 * language routine, and I (khw) don't think slowing things down 7038 * just to check for this warning is worth it. So this just checks 7039 * the first character */ 7040 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) { 7041 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); 7042 } 7043 goto do_exact; 7044 case EXACT_REQ8: 7045 if (! utf8_target) { 7046 sayNO; 7047 } 7048 /* FALLTHROUGH */ 7049 7050 case EXACT: /* /abc/ */ 7051 do_exact: 7052 s = STRINGs(scan); 7053 ln = STR_LENs(scan); 7054 7055 join_short_long_exact: 7056 if (utf8_target != is_utf8_pat) { 7057 /* The target and the pattern have differing utf8ness. */ 7058 char *l = locinput; 7059 const char * const e = s + ln; 7060 7061 if (utf8_target) { 7062 /* The target is utf8, the pattern is not utf8. 7063 * Above-Latin1 code points can't match the pattern; 7064 * invariants match exactly, and the other Latin1 ones need 7065 * to be downgraded to a single byte in order to do the 7066 * comparison. (If we could be confident that the target 7067 * is not malformed, this could be refactored to have fewer 7068 * tests by just assuming that if the first bytes match, it 7069 * is an invariant, but there are tests in the test suite 7070 * dealing with (??{...}) which violate this) */ 7071 while (s < e) { 7072 if ( l >= loceol 7073 || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) 7074 { 7075 sayNO; 7076 } 7077 if (UTF8_IS_INVARIANT(*(U8*)l)) { 7078 if (*l != *s) { 7079 sayNO; 7080 } 7081 l++; 7082 } 7083 else { 7084 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) 7085 { 7086 sayNO; 7087 } 7088 l += 2; 7089 } 7090 s++; 7091 } 7092 } 7093 else { 7094 /* The target is not utf8, the pattern is utf8. */ 7095 while (s < e) { 7096 if ( l >= loceol 7097 || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) 7098 { 7099 sayNO; 7100 } 7101 if (UTF8_IS_INVARIANT(*(U8*)s)) { 7102 if (*s != *l) { 7103 sayNO; 7104 } 7105 s++; 7106 } 7107 else { 7108 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) 7109 { 7110 sayNO; 7111 } 7112 s += 2; 7113 } 7114 l++; 7115 } 7116 } 7117 locinput = l; 7118 } 7119 else { 7120 /* The target and the pattern have the same utf8ness. */ 7121 /* Inline the first character, for speed. */ 7122 if ( loceol - locinput < ln 7123 || UCHARAT(s) != nextbyte 7124 || (ln > 1 && memNE(s, locinput, ln))) 7125 { 7126 sayNO; 7127 } 7128 locinput += ln; 7129 } 7130 break; 7131 } 7132 7133 case EXACTFL: /* /abc/il */ 7134 { 7135 const char * s; 7136 U32 fold_utf8_flags; 7137 7138 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7139 folder = Perl_foldEQ_locale; 7140 fold_array = PL_fold_locale; 7141 fold_utf8_flags = FOLDEQ_LOCALE; 7142 goto do_exactf; 7143 7144 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so 7145 is effectively /u; hence to match, target 7146 must be UTF-8. */ 7147 if (! utf8_target) { 7148 sayNO; 7149 } 7150 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED 7151 | FOLDEQ_S2_FOLDS_SANE; 7152 folder = S_foldEQ_latin1_s2_folded; 7153 fold_array = PL_fold_latin1; 7154 goto do_exactf; 7155 7156 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */ 7157 if (! utf8_target) { 7158 sayNO; 7159 } 7160 assert(is_utf8_pat); 7161 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 7162#ifdef DEBUGGING 7163 /* this is only used in an assert check, so we restrict it to DEBUGGING mode. 7164 * In theory neither of these variables should be used in this mode. */ 7165 folder = NULL; 7166 fold_array = NULL; 7167#endif 7168 goto do_exactf; 7169 7170 case EXACTFUP: /* /foo/iu, and something is problematic in 7171 'foo' so can't take shortcuts. */ 7172 assert(! is_utf8_pat); 7173 folder = Perl_foldEQ_latin1; 7174 fold_array = PL_fold_latin1; 7175 fold_utf8_flags = 0; 7176 goto do_exactf; 7177 7178 case EXACTFU: /* /abc/iu */ 7179 folder = S_foldEQ_latin1_s2_folded; 7180 fold_array = PL_fold_latin1; 7181 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; 7182 goto do_exactf; 7183 7184 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 7185 patterns */ 7186 assert(! is_utf8_pat); 7187 /* FALLTHROUGH */ 7188 case EXACTFAA: /* /abc/iaa */ 7189 folder = S_foldEQ_latin1_s2_folded; 7190 fold_array = PL_fold_latin1; 7191 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7192 if (is_utf8_pat || ! utf8_target) { 7193 7194 /* The possible presence of a MICRO SIGN in the pattern forbids 7195 * us to view a non-UTF-8 pattern as folded when there is a 7196 * UTF-8 target */ 7197 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED 7198 |FOLDEQ_S2_FOLDS_SANE; 7199 } 7200 goto do_exactf; 7201 7202 7203 case EXACTF: /* /abc/i This node only generated for 7204 non-utf8 patterns */ 7205 assert(! is_utf8_pat); 7206 folder = Perl_foldEQ; 7207 fold_array = PL_fold; 7208 fold_utf8_flags = 0; 7209 7210 do_exactf: 7211 s = STRINGs(scan); 7212 ln = STR_LENs(scan); 7213 7214 if ( utf8_target 7215 || is_utf8_pat 7216 || state_num == EXACTFUP 7217 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) 7218 { 7219 /* Either target or the pattern are utf8, or has the issue where 7220 * the fold lengths may differ. */ 7221 const char * const l = locinput; 7222 char *e = loceol; 7223 7224 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target, 7225 s, 0, ln, is_utf8_pat,fold_utf8_flags)) 7226 { 7227 sayNO; 7228 } 7229 locinput = e; 7230 break; 7231 } 7232 7233 /* Neither the target nor the pattern are utf8 */ 7234 assert(fold_array); 7235 if (UCHARAT(s) != nextbyte 7236 && !NEXTCHR_IS_EOS 7237 && UCHARAT(s) != fold_array[nextbyte]) 7238 { 7239 sayNO; 7240 } 7241 if (loceol - locinput < ln) 7242 sayNO; 7243 assert(folder); 7244 if (ln > 1 && ! folder(aTHX_ locinput, s, ln)) 7245 sayNO; 7246 locinput += ln; 7247 break; 7248 } 7249 7250 case NBOUNDL: /* /\B/l */ 7251 to_complement = 1; 7252 /* FALLTHROUGH */ 7253 7254 case BOUNDL: /* /\b/l */ 7255 { 7256 bool b1, b2; 7257 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7258 7259 if (FLAGS(scan) != TRADITIONAL_BOUND) { 7260 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; 7261 goto boundu; 7262 } 7263 7264 if (utf8_target) { 7265 if (locinput == reginfo->strbeg) 7266 b1 = isWORDCHAR_LC('\n'); 7267 else { 7268 U8 *p = reghop3((U8*)locinput, -1, 7269 (U8*)(reginfo->strbeg)); 7270 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend)); 7271 } 7272 b2 = (NEXTCHR_IS_EOS) 7273 ? isWORDCHAR_LC('\n') 7274 : isWORDCHAR_LC_utf8_safe((U8*) locinput, 7275 (U8*) reginfo->strend); 7276 } 7277 else { /* Here the string isn't utf8 */ 7278 b1 = (locinput == reginfo->strbeg) 7279 ? isWORDCHAR_LC('\n') 7280 : isWORDCHAR_LC(UCHARAT(locinput - 1)); 7281 b2 = (NEXTCHR_IS_EOS) 7282 ? isWORDCHAR_LC('\n') 7283 : isWORDCHAR_LC(nextbyte); 7284 } 7285 if (to_complement ^ (b1 == b2)) { 7286 sayNO; 7287 } 7288 break; 7289 } 7290 7291 case NBOUND: /* /\B/ */ 7292 to_complement = 1; 7293 /* FALLTHROUGH */ 7294 7295 case BOUND: /* /\b/ */ 7296 if (utf8_target) { 7297 goto bound_utf8; 7298 } 7299 goto bound_ascii_match_only; 7300 7301 case NBOUNDA: /* /\B/a */ 7302 to_complement = 1; 7303 /* FALLTHROUGH */ 7304 7305 case BOUNDA: /* /\b/a */ 7306 { 7307 bool b1, b2; 7308 7309 bound_ascii_match_only: 7310 /* Here the string isn't utf8, or is utf8 and only ascii characters 7311 * are to match \w. In the latter case looking at the byte just 7312 * prior to the current one may be just the final byte of a 7313 * multi-byte character. This is ok. There are two cases: 7314 * 1) it is a single byte character, and then the test is doing 7315 * just what it's supposed to. 7316 * 2) it is a multi-byte character, in which case the final byte is 7317 * never mistakable for ASCII, and so the test will say it is 7318 * not a word character, which is the correct answer. */ 7319 b1 = (locinput == reginfo->strbeg) 7320 ? isWORDCHAR_A('\n') 7321 : isWORDCHAR_A(UCHARAT(locinput - 1)); 7322 b2 = (NEXTCHR_IS_EOS) 7323 ? isWORDCHAR_A('\n') 7324 : isWORDCHAR_A(nextbyte); 7325 if (to_complement ^ (b1 == b2)) { 7326 sayNO; 7327 } 7328 break; 7329 } 7330 7331 case NBOUNDU: /* /\B/u */ 7332 to_complement = 1; 7333 /* FALLTHROUGH */ 7334 7335 case BOUNDU: /* /\b/u */ 7336 7337 boundu: 7338 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) { 7339 match = FALSE; 7340 } 7341 else if (utf8_target) { 7342 bound_utf8: 7343 switch((bound_type) FLAGS(scan)) { 7344 case TRADITIONAL_BOUND: 7345 { 7346 bool b1, b2; 7347 if (locinput == reginfo->strbeg) { 7348 b1 = 0 /* isWORDCHAR_L1('\n') */; 7349 } 7350 else { 7351 U8 *p = reghop3((U8*)locinput, -1, 7352 (U8*)(reginfo->strbeg)); 7353 7354 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend); 7355 } 7356 b2 = (NEXTCHR_IS_EOS) 7357 ? 0 /* isWORDCHAR_L1('\n') */ 7358 : isWORDCHAR_utf8_safe((U8*)locinput, 7359 (U8*) reginfo->strend); 7360 match = cBOOL(b1 != b2); 7361 break; 7362 } 7363 case GCB_BOUND: 7364 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7365 match = TRUE; /* GCB always matches at begin and 7366 end */ 7367 } 7368 else { 7369 /* Find the gcb values of previous and current 7370 * chars, then see if is a break point */ 7371 match = isGCB(getGCB_VAL_UTF8( 7372 reghop3((U8*)locinput, 7373 -1, 7374 (U8*)(reginfo->strbeg)), 7375 (U8*) reginfo->strend), 7376 getGCB_VAL_UTF8((U8*) locinput, 7377 (U8*) reginfo->strend), 7378 (U8*) reginfo->strbeg, 7379 (U8*) locinput, 7380 utf8_target); 7381 } 7382 break; 7383 7384 case LB_BOUND: 7385 if (locinput == reginfo->strbeg) { 7386 match = FALSE; 7387 } 7388 else if (NEXTCHR_IS_EOS) { 7389 match = TRUE; 7390 } 7391 else { 7392 match = isLB(getLB_VAL_UTF8( 7393 reghop3((U8*)locinput, 7394 -1, 7395 (U8*)(reginfo->strbeg)), 7396 (U8*) reginfo->strend), 7397 getLB_VAL_UTF8((U8*) locinput, 7398 (U8*) reginfo->strend), 7399 (U8*) reginfo->strbeg, 7400 (U8*) locinput, 7401 (U8*) reginfo->strend, 7402 utf8_target); 7403 } 7404 break; 7405 7406 case SB_BOUND: /* Always matches at begin and end */ 7407 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7408 match = TRUE; 7409 } 7410 else { 7411 match = isSB(getSB_VAL_UTF8( 7412 reghop3((U8*)locinput, 7413 -1, 7414 (U8*)(reginfo->strbeg)), 7415 (U8*) reginfo->strend), 7416 getSB_VAL_UTF8((U8*) locinput, 7417 (U8*) reginfo->strend), 7418 (U8*) reginfo->strbeg, 7419 (U8*) locinput, 7420 (U8*) reginfo->strend, 7421 utf8_target); 7422 } 7423 break; 7424 7425 case WB_BOUND: 7426 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7427 match = TRUE; 7428 } 7429 else { 7430 match = isWB(WB_UNKNOWN, 7431 getWB_VAL_UTF8( 7432 reghop3((U8*)locinput, 7433 -1, 7434 (U8*)(reginfo->strbeg)), 7435 (U8*) reginfo->strend), 7436 getWB_VAL_UTF8((U8*) locinput, 7437 (U8*) reginfo->strend), 7438 (U8*) reginfo->strbeg, 7439 (U8*) locinput, 7440 (U8*) reginfo->strend, 7441 utf8_target); 7442 } 7443 break; 7444 } 7445 } 7446 else { /* Not utf8 target */ 7447 switch((bound_type) FLAGS(scan)) { 7448 case TRADITIONAL_BOUND: 7449 { 7450 bool b1, b2; 7451 b1 = (locinput == reginfo->strbeg) 7452 ? 0 /* isWORDCHAR_L1('\n') */ 7453 : isWORDCHAR_L1(UCHARAT(locinput - 1)); 7454 b2 = (NEXTCHR_IS_EOS) 7455 ? 0 /* isWORDCHAR_L1('\n') */ 7456 : isWORDCHAR_L1(nextbyte); 7457 match = cBOOL(b1 != b2); 7458 break; 7459 } 7460 7461 case GCB_BOUND: 7462 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7463 match = TRUE; /* GCB always matches at begin and 7464 end */ 7465 } 7466 else { /* Only CR-LF combo isn't a GCB in 0-255 7467 range */ 7468 match = UCHARAT(locinput - 1) != '\r' 7469 || UCHARAT(locinput) != '\n'; 7470 } 7471 break; 7472 7473 case LB_BOUND: 7474 if (locinput == reginfo->strbeg) { 7475 match = FALSE; 7476 } 7477 else if (NEXTCHR_IS_EOS) { 7478 match = TRUE; 7479 } 7480 else { 7481 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)), 7482 getLB_VAL_CP(UCHARAT(locinput)), 7483 (U8*) reginfo->strbeg, 7484 (U8*) locinput, 7485 (U8*) reginfo->strend, 7486 utf8_target); 7487 } 7488 break; 7489 7490 case SB_BOUND: /* Always matches at begin and end */ 7491 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7492 match = TRUE; 7493 } 7494 else { 7495 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)), 7496 getSB_VAL_CP(UCHARAT(locinput)), 7497 (U8*) reginfo->strbeg, 7498 (U8*) locinput, 7499 (U8*) reginfo->strend, 7500 utf8_target); 7501 } 7502 break; 7503 7504 case WB_BOUND: 7505 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { 7506 match = TRUE; 7507 } 7508 else { 7509 match = isWB(WB_UNKNOWN, 7510 getWB_VAL_CP(UCHARAT(locinput -1)), 7511 getWB_VAL_CP(UCHARAT(locinput)), 7512 (U8*) reginfo->strbeg, 7513 (U8*) locinput, 7514 (U8*) reginfo->strend, 7515 utf8_target); 7516 } 7517 break; 7518 } 7519 } 7520 7521 if (to_complement ^ ! match) { 7522 sayNO; 7523 } 7524 break; 7525 7526 case ANYOFPOSIXL: 7527 case ANYOFL: /* /[abc]/l */ 7528 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7529 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan); 7530 7531 /* FALLTHROUGH */ 7532 case ANYOFD: /* /[abc]/d */ 7533 case ANYOF: /* /[abc]/ */ 7534 if (NEXTCHR_IS_EOS || locinput >= loceol) 7535 sayNO; 7536 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput)) 7537 && ! ANYOF_FLAGS(scan) 7538 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan)) 7539 { 7540 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) { 7541 sayNO; 7542 } 7543 locinput++; 7544 } 7545 else { 7546 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol, 7547 utf8_target)) 7548 { 7549 sayNO; 7550 } 7551 goto increment_locinput; 7552 } 7553 break; 7554 7555 case ANYOFM: 7556 if ( NEXTCHR_IS_EOS 7557 || (UCHARAT(locinput) & FLAGS(scan)) != ARG1u(scan) 7558 || locinput >= loceol) 7559 { 7560 sayNO; 7561 } 7562 locinput++; /* ANYOFM is always single byte */ 7563 break; 7564 7565 case NANYOFM: 7566 if ( NEXTCHR_IS_EOS 7567 || (UCHARAT(locinput) & FLAGS(scan)) == ARG1u(scan) 7568 || locinput >= loceol) 7569 { 7570 sayNO; 7571 } 7572 goto increment_locinput; 7573 break; 7574 7575 case ANYOFH: 7576 if ( ! utf8_target 7577 || NEXTCHR_IS_EOS 7578 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 7579 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7580 || ! _invlist_contains_cp(anyofh_list, 7581 utf8_to_uvchr_buf((U8 *) locinput, 7582 (U8 *) loceol, 7583 NULL))) 7584 { 7585 sayNO; 7586 } 7587 goto increment_locinput; 7588 break; 7589 7590 case ANYOFHb: 7591 if ( ! utf8_target 7592 || NEXTCHR_IS_EOS 7593 || ANYOF_FLAGS(scan) != (U8) *locinput 7594 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7595 || ! _invlist_contains_cp(anyofh_list, 7596 utf8_to_uvchr_buf((U8 *) locinput, 7597 (U8 *) loceol, 7598 NULL))) 7599 { 7600 sayNO; 7601 } 7602 goto increment_locinput; 7603 break; 7604 7605 case ANYOFHbbm: 7606 if ( ! utf8_target 7607 || NEXTCHR_IS_EOS 7608 || ANYOF_FLAGS(scan) != (U8) locinput[0] 7609 || locinput >= reginfo->strend 7610 || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap, 7611 (U8) locinput[1] & UTF_CONTINUATION_MASK)) 7612 { 7613 sayNO; 7614 } 7615 goto increment_locinput; 7616 break; 7617 7618 case ANYOFHr: 7619 if ( ! utf8_target 7620 || NEXTCHR_IS_EOS 7621 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), 7622 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), 7623 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) 7624 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7625 || ! _invlist_contains_cp(anyofh_list, 7626 utf8_to_uvchr_buf((U8 *) locinput, 7627 (U8 *) loceol, 7628 NULL))) 7629 { 7630 sayNO; 7631 } 7632 goto increment_locinput; 7633 break; 7634 7635 case ANYOFHs: 7636 if ( ! utf8_target 7637 || NEXTCHR_IS_EOS 7638 || loceol - locinput < FLAGS(scan) 7639 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan)) 7640 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan)) 7641 || ! _invlist_contains_cp(anyofh_list, 7642 utf8_to_uvchr_buf((U8 *) locinput, 7643 (U8 *) loceol, 7644 NULL))) 7645 { 7646 sayNO; 7647 } 7648 goto increment_locinput; 7649 break; 7650 7651 case ANYOFR: 7652 if (NEXTCHR_IS_EOS) { 7653 sayNO; 7654 } 7655 7656 if (utf8_target) { 7657 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) 7658 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 7659 (U8 *) reginfo->strend, 7660 NULL), 7661 ANYOFRbase(scan), ANYOFRdelta(scan))) 7662 { 7663 sayNO; 7664 } 7665 } 7666 else { 7667 if (! withinCOUNT((U8) *locinput, 7668 ANYOFRbase(scan), ANYOFRdelta(scan))) 7669 { 7670 sayNO; 7671 } 7672 } 7673 goto increment_locinput; 7674 break; 7675 7676 case ANYOFRb: 7677 if (NEXTCHR_IS_EOS) { 7678 sayNO; 7679 } 7680 7681 if (utf8_target) { 7682 if ( ANYOF_FLAGS(scan) != (U8) *locinput 7683 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, 7684 (U8 *) reginfo->strend, 7685 NULL), 7686 ANYOFRbase(scan), ANYOFRdelta(scan))) 7687 { 7688 sayNO; 7689 } 7690 } 7691 else { 7692 if (! withinCOUNT((U8) *locinput, 7693 ANYOFRbase(scan), ANYOFRdelta(scan))) 7694 { 7695 sayNO; 7696 } 7697 } 7698 goto increment_locinput; 7699 break; 7700 7701 /* The argument (FLAGS) to all the POSIX node types is the class number 7702 * */ 7703 7704 case NPOSIXL: /* \W or [:^punct:] etc. under /l */ 7705 to_complement = 1; 7706 /* FALLTHROUGH */ 7707 7708 case POSIXL: /* \w or [:punct:] etc. under /l */ 7709 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7710 if (NEXTCHR_IS_EOS || locinput >= loceol) 7711 sayNO; 7712 7713 /* Use isFOO_lc() for characters within Latin1. (Note that 7714 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 7715 * wouldn't be invariant) */ 7716 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { 7717 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) { 7718 sayNO; 7719 } 7720 7721 locinput++; 7722 break; 7723 } 7724 7725 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7726 /* An above Latin-1 code point, or malformed */ 7727 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, 7728 reginfo->strend); 7729 goto utf8_posix_above_latin1; 7730 } 7731 7732 /* Here is a UTF-8 variant code point below 256 and the target is 7733 * UTF-8 */ 7734 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), 7735 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, 7736 *(locinput + 1)))))) 7737 { 7738 sayNO; 7739 } 7740 7741 goto increment_locinput; 7742 7743 case NPOSIXD: /* \W or [:^punct:] etc. under /d */ 7744 to_complement = 1; 7745 /* FALLTHROUGH */ 7746 7747 case POSIXD: /* \w or [:punct:] etc. under /d */ 7748 if (utf8_target) { 7749 goto utf8_posix; 7750 } 7751 goto posixa; 7752 7753 case NPOSIXA: /* \W or [:^punct:] etc. under /a */ 7754 7755 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7756 sayNO; 7757 } 7758 7759 /* All UTF-8 variants match */ 7760 if (! UTF8_IS_INVARIANT(nextbyte)) { 7761 goto increment_locinput; 7762 } 7763 7764 to_complement = 1; 7765 goto join_nposixa; 7766 7767 case POSIXA: /* \w or [:punct:] etc. under /a */ 7768 7769 posixa: 7770 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in 7771 * UTF-8, and also from NPOSIXA even in UTF-8 when the current 7772 * character is a single byte */ 7773 7774 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7775 sayNO; 7776 } 7777 7778 join_nposixa: 7779 7780 if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte, 7781 FLAGS(scan))))) 7782 { 7783 sayNO; 7784 } 7785 7786 /* Here we are either not in utf8, or we matched a utf8-invariant, 7787 * so the next char is the next byte */ 7788 locinput++; 7789 break; 7790 7791 case NPOSIXU: /* \W or [:^punct:] etc. under /u */ 7792 to_complement = 1; 7793 /* FALLTHROUGH */ 7794 7795 case POSIXU: /* \w or [:punct:] etc. under /u */ 7796 utf8_posix: 7797 if (NEXTCHR_IS_EOS || locinput >= loceol) { 7798 sayNO; 7799 } 7800 7801 /* Use generic_isCC_() for characters within Latin1. (Note that 7802 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else 7803 * wouldn't be invariant) */ 7804 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { 7805 if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte, 7806 FLAGS(scan))))) 7807 { 7808 sayNO; 7809 } 7810 locinput++; 7811 } 7812 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { 7813 if (! (to_complement 7814 ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, 7815 *(locinput + 1)), 7816 FLAGS(scan))))) 7817 { 7818 sayNO; 7819 } 7820 locinput += 2; 7821 } 7822 else { /* Handle above Latin-1 code points */ 7823 utf8_posix_above_latin1: 7824 classnum = (char_class_number_) FLAGS(scan); 7825 switch (classnum) { 7826 default: 7827 if (! (to_complement 7828 ^ cBOOL(_invlist_contains_cp( 7829 PL_XPosix_ptrs[classnum], 7830 utf8_to_uvchr_buf((U8 *) locinput, 7831 (U8 *) reginfo->strend, 7832 NULL))))) 7833 { 7834 sayNO; 7835 } 7836 break; 7837 case CC_ENUM_SPACE_: 7838 if (! (to_complement 7839 ^ cBOOL(is_XPERLSPACE_high(locinput)))) 7840 { 7841 sayNO; 7842 } 7843 break; 7844 case CC_ENUM_BLANK_: 7845 if (! (to_complement 7846 ^ cBOOL(is_HORIZWS_high(locinput)))) 7847 { 7848 sayNO; 7849 } 7850 break; 7851 case CC_ENUM_XDIGIT_: 7852 if (! (to_complement 7853 ^ cBOOL(is_XDIGIT_high(locinput)))) 7854 { 7855 sayNO; 7856 } 7857 break; 7858 case CC_ENUM_VERTSPACE_: 7859 if (! (to_complement 7860 ^ cBOOL(is_VERTWS_high(locinput)))) 7861 { 7862 sayNO; 7863 } 7864 break; 7865 case CC_ENUM_CNTRL_: /* These can't match above Latin1 */ 7866 case CC_ENUM_ASCII_: 7867 if (! to_complement) { 7868 sayNO; 7869 } 7870 break; 7871 } 7872 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); 7873 } 7874 break; 7875 7876 case CLUMP: /* Match \X: logical Unicode character. This is defined as 7877 a Unicode extended Grapheme Cluster */ 7878 if (NEXTCHR_IS_EOS || locinput >= loceol) 7879 sayNO; 7880 if (! utf8_target) { 7881 7882 /* Match either CR LF or '.', as all the other possibilities 7883 * require utf8 */ 7884 locinput++; /* Match the . or CR */ 7885 if (nextbyte == '\r' /* And if it was CR, and the next is LF, 7886 match the LF */ 7887 && locinput < loceol 7888 && UCHARAT(locinput) == '\n') 7889 { 7890 locinput++; 7891 } 7892 } 7893 else { 7894 7895 /* Get the gcb type for the current character */ 7896 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput, 7897 (U8*) reginfo->strend); 7898 7899 /* Then scan through the input until we get to the first 7900 * character whose type is supposed to be a gcb with the 7901 * current character. (There is always a break at the 7902 * end-of-input) */ 7903 locinput += UTF8SKIP(locinput); 7904 while (locinput < loceol) { 7905 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput, 7906 (U8*) reginfo->strend); 7907 if (isGCB(prev_gcb, cur_gcb, 7908 (U8*) reginfo->strbeg, (U8*) locinput, 7909 utf8_target)) 7910 { 7911 break; 7912 } 7913 7914 prev_gcb = cur_gcb; 7915 locinput += UTF8SKIP(locinput); 7916 } 7917 7918 7919 } 7920 break; 7921 7922 case REFFLN: /* /\g{name}/il */ 7923 { /* The capture buffer cases. The ones beginning with N for the 7924 named buffers just convert to the equivalent numbered and 7925 pretend they were called as the corresponding numbered buffer 7926 op. */ 7927 /* don't initialize these in the declaration, it makes C++ 7928 unhappy */ 7929 const char *s; 7930 char type; 7931 re_fold_t folder; 7932 const U8 *fold_array; 7933 UV utf8_fold_flags; 7934 7935 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7936 folder = Perl_foldEQ_locale; 7937 fold_array = PL_fold_locale; 7938 type = REFFL; 7939 utf8_fold_flags = FOLDEQ_LOCALE; 7940 goto do_nref; 7941 7942 case REFFAN: /* /\g{name}/iaa */ 7943 folder = Perl_foldEQ_latin1; 7944 fold_array = PL_fold_latin1; 7945 type = REFFA; 7946 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7947 goto do_nref; 7948 7949 case REFFUN: /* /\g{name}/iu */ 7950 folder = Perl_foldEQ_latin1; 7951 fold_array = PL_fold_latin1; 7952 type = REFFU; 7953 utf8_fold_flags = 0; 7954 goto do_nref; 7955 7956 case REFFN: /* /\g{name}/i */ 7957 folder = Perl_foldEQ; 7958 fold_array = PL_fold; 7959 type = REFF; 7960 utf8_fold_flags = 0; 7961 goto do_nref; 7962 7963 case REFN: /* /\g{name}/ */ 7964 type = REF; 7965 folder = NULL; 7966 fold_array = NULL; 7967 utf8_fold_flags = 0; 7968 do_nref: 7969 7970 /* For the named back references, find the corresponding buffer 7971 * number */ 7972 n = reg_check_named_buff_matched(rex,scan); 7973 7974 if ( ! n ) { 7975 sayNO; 7976 } 7977 goto do_nref_ref_common; 7978 7979 case REFFL: /* /\1/il */ 7980 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 7981 folder = Perl_foldEQ_locale; 7982 fold_array = PL_fold_locale; 7983 utf8_fold_flags = FOLDEQ_LOCALE; 7984 goto do_ref; 7985 7986 case REFFA: /* /\1/iaa */ 7987 folder = Perl_foldEQ_latin1; 7988 fold_array = PL_fold_latin1; 7989 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; 7990 goto do_ref; 7991 7992 case REFFU: /* /\1/iu */ 7993 folder = Perl_foldEQ_latin1; 7994 fold_array = PL_fold_latin1; 7995 utf8_fold_flags = 0; 7996 goto do_ref; 7997 7998 case REFF: /* /\1/i */ 7999 folder = Perl_foldEQ; 8000 fold_array = PL_fold; 8001 utf8_fold_flags = 0; 8002 goto do_ref; 8003 8004#undef ST 8005#define ST st->u.backref 8006 case REF: /* /\1/ */ 8007 folder = NULL; 8008 fold_array = NULL; 8009 utf8_fold_flags = 0; 8010 8011 do_ref: 8012 type = OP(scan); 8013 n = ARG1u(scan); /* which paren pair */ 8014 if (rex->logical_to_parno) { 8015 n = rex->logical_to_parno[n]; 8016 do { 8017 if ( RXp_LASTPAREN(rex) < n || 8018 RXp_OFFS_START(rex,n) == -1 || 8019 RXp_OFFS_END(rex,n) == -1 8020 ) { 8021 n = rex->parno_to_logical_next[n]; 8022 } 8023 else { 8024 break; 8025 } 8026 } while(n); 8027 8028 if (!n) /* this means there is nothing that matched */ 8029 sayNO; 8030 } 8031 8032 do_nref_ref_common: 8033 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 8034 if (RXp_LASTPAREN(rex) < n) 8035 sayNO; 8036 8037 ln = RXp_OFFSp(rex)[n].start; 8038 endref = RXp_OFFSp(rex)[n].end; 8039 if (ln == -1 || endref == -1) 8040 sayNO; /* Do not match unless seen CLOSEn. */ 8041 8042 if (ln == endref) 8043 goto ref_yes; 8044 8045 s = reginfo->strbeg + ln; 8046 if (type != REF /* REF can do byte comparison */ 8047 && (utf8_target || type == REFFU || type == REFFL)) 8048 { 8049 char * limit = loceol; 8050 8051 /* This call case insensitively compares the entire buffer 8052 * at s, with the current input starting at locinput, but 8053 * not going off the end given by loceol, and 8054 * returns in <limit> upon success, how much of the 8055 * current input was matched */ 8056 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target, 8057 locinput, &limit, 0, utf8_target, utf8_fold_flags)) 8058 { 8059 sayNO; 8060 } 8061 locinput = limit; 8062 goto ref_yes; 8063 } 8064 8065 /* Not utf8: Inline the first character, for speed. */ 8066 if ( ! NEXTCHR_IS_EOS 8067 && locinput < loceol 8068 && UCHARAT(s) != nextbyte 8069 && ( type == REF 8070 || UCHARAT(s) != fold_array[nextbyte])) 8071 { 8072 sayNO; 8073 } 8074 ln = endref - ln; 8075 if (locinput + ln > loceol) 8076 sayNO; 8077 if (ln > 1 && (type == REF 8078 ? memNE(s, locinput, ln) 8079 : ! folder(aTHX_ locinput, s, ln))) 8080 sayNO; 8081 locinput += ln; 8082 } 8083 ref_yes: 8084 if (FLAGS(scan)) { /* == VOLATILE_REF but only other value is 0 */ 8085 ST.cp = regcppush(rex, ARG2u(scan) - 1, maxopenparen); 8086 REGCP_SET(ST.lastcp); 8087 PUSH_STATE_GOTO(REF_next, next, locinput, loceol, 8088 script_run_begin); 8089 } 8090 break; 8091 NOT_REACHED; /* NOTREACHED */ 8092 8093 case REF_next: 8094 sayYES; 8095 break; 8096 8097 case REF_next_fail: 8098 REGCP_UNWIND(ST.lastcp); 8099 regcppop(rex, &maxopenparen); 8100 sayNO; 8101 break; 8102 8103 case NOTHING: /* null op; e.g. the 'nothing' following 8104 * the '*' in m{(a+|b)*}' */ 8105 break; 8106 case TAIL: /* placeholder while compiling (A|B|C) */ 8107 break; 8108 8109#undef ST 8110#define ST st->u.eval 8111#define CUR_EVAL cur_eval->u.eval 8112 8113 { 8114 SV *ret; 8115 REGEXP *re_sv; 8116 regexp *re; 8117 regexp_internal *rei; 8118 regnode *startpoint; 8119 U32 arg; 8120 8121 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ 8122 arg = ARG1u(scan); 8123 if (cur_eval && cur_eval->locinput == locinput) { 8124 if ( ++nochange_depth > max_nochange_depth ) 8125 Perl_croak(aTHX_ 8126 "Pattern subroutine nesting without pos change" 8127 " exceeded limit in regex"); 8128 } else { 8129 nochange_depth = 0; 8130 } 8131 re_sv = rex_sv; 8132 re = rex; 8133 rei = rexi; 8134 startpoint = scan + ARG2i(scan); 8135 EVAL_CLOSE_PAREN_SET( st, arg ); 8136 /* Detect infinite recursion 8137 * 8138 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/ 8139 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever. 8140 * So we track the position in the string we are at each time 8141 * we recurse and if we try to enter the same routine twice from 8142 * the same position we throw an error. 8143 */ 8144 if ( rex->recurse_locinput[arg] == locinput ) { 8145 /* FIXME: we should show the regop that is failing as part 8146 * of the error message. */ 8147 Perl_croak(aTHX_ "Infinite recursion in regex"); 8148 } else { 8149 ST.prev_recurse_locinput= rex->recurse_locinput[arg]; 8150 rex->recurse_locinput[arg]= locinput; 8151 8152 DEBUG_r({ 8153 DECLARE_AND_GET_RE_DEBUG_FLAGS; 8154 DEBUG_STACK_r({ 8155 Perl_re_exec_indentf( aTHX_ 8156 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n", 8157 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg] 8158 ); 8159 }); 8160 }); 8161 } 8162 8163 /* Save all the positions seen so far. */ 8164 ST.cp = regcppush(rex, 0, maxopenparen); 8165 REGCP_SET(ST.lastcp); 8166 8167 /* and then jump to the code we share with EVAL */ 8168 goto eval_recurse_doit; 8169 /* NOTREACHED */ 8170 8171 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ 8172 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) { 8173 if ( ++nochange_depth > max_nochange_depth ) 8174 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); 8175 } else { 8176 nochange_depth = 0; 8177 } 8178 { 8179 /* execute the code in the {...} */ 8180 8181 dSP; 8182 IV before; 8183 OP * const oop = PL_op; 8184 COP * const ocurcop = PL_curcop; 8185 OP *nop; 8186 CV *newcv; 8187 8188 /* save *all* paren positions */ 8189 ST.cp = regcppush(rex, 0, maxopenparen); 8190 REGCP_SET(ST.lastcp); 8191 8192 if (!caller_cv) 8193 caller_cv = find_runcv(NULL); 8194 8195 n = ARG1u(scan); 8196 8197 if (rexi->data->what[n] == 'r') { /* code from an external qr */ 8198 newcv = (ReANY( 8199 (REGEXP*)(rexi->data->data[n]) 8200 ))->qr_anoncv; 8201 nop = (OP*)rexi->data->data[n+1]; 8202 } 8203 else if (rexi->data->what[n] == 'l') { /* literal code */ 8204 newcv = caller_cv; 8205 nop = (OP*)rexi->data->data[n]; 8206 assert(CvDEPTH(newcv)); 8207 } 8208 else { 8209 /* literal with own CV */ 8210 assert(rexi->data->what[n] == 'L'); 8211 newcv = rex->qr_anoncv; 8212 nop = (OP*)rexi->data->data[n]; 8213 } 8214 8215 /* Some notes about MULTICALL and the context and save stacks. 8216 * 8217 * In something like 8218 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../ 8219 * since codeblocks don't introduce a new scope (so that 8220 * local() etc accumulate), at the end of a successful 8221 * match there will be a SAVEt_CLEARSV on the savestack 8222 * for each of $x, $y, $z. If the three code blocks above 8223 * happen to have come from different CVs (e.g. via 8224 * embedded qr//s), then we must ensure that during any 8225 * savestack unwinding, PL_comppad always points to the 8226 * right pad at each moment. We achieve this by 8227 * interleaving SAVEt_COMPPAD's on the savestack whenever 8228 * there is a change of pad. 8229 * In theory whenever we call a code block, we should 8230 * push a CXt_SUB context, then pop it on return from 8231 * that code block. This causes a bit of an issue in that 8232 * normally popping a context also clears the savestack 8233 * back to cx->blk_oldsaveix, but here we specifically 8234 * don't want to clear the save stack on exit from the 8235 * code block. 8236 * Also for efficiency we don't want to keep pushing and 8237 * popping the single SUB context as we backtrack etc. 8238 * So instead, we push a single context the first time 8239 * we need, it, then hang onto it until the end of this 8240 * function. Whenever we encounter a new code block, we 8241 * update the CV etc if that's changed. During the times 8242 * in this function where we're not executing a code 8243 * block, having the SUB context still there is a bit 8244 * naughty - but we hope that no-one notices. 8245 * When the SUB context is initially pushed, we fake up 8246 * cx->blk_oldsaveix to be as if we'd pushed this context 8247 * on first entry to S_regmatch rather than at some random 8248 * point during the regexe execution. That way if we 8249 * croak, popping the context stack will ensure that 8250 * *everything* SAVEd by this function is undone and then 8251 * the context popped, rather than e.g., popping the 8252 * context (and restoring the original PL_comppad) then 8253 * popping more of the savestack and restoring a bad 8254 * PL_comppad. 8255 */ 8256 8257 /* If this is the first EVAL, push a MULTICALL. On 8258 * subsequent calls, if we're executing a different CV, or 8259 * if PL_comppad has got messed up from backtracking 8260 * through SAVECOMPPADs, then refresh the context. 8261 */ 8262 if (newcv != last_pushed_cv || PL_comppad != last_pad) 8263 { 8264 U8 flags = (CXp_SUB_RE | 8265 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); 8266 SAVECOMPPAD(); 8267 if (last_pushed_cv) { 8268 CHANGE_MULTICALL_FLAGS(newcv, flags); 8269 } 8270 else { 8271 PUSH_MULTICALL_FLAGS(newcv, flags); 8272 } 8273 /* see notes above */ 8274 CX_CUR()->blk_oldsaveix = orig_savestack_ix; 8275 8276 last_pushed_cv = newcv; 8277 } 8278 else { 8279 /* these assignments are just to silence compiler 8280 * warnings */ 8281 multicall_cop = NULL; 8282 } 8283 last_pad = PL_comppad; 8284 8285 /* the initial nextstate you would normally execute 8286 * at the start of an eval (which would cause error 8287 * messages to come from the eval), may be optimised 8288 * away from the execution path in the regex code blocks; 8289 * so manually set PL_curcop to it initially */ 8290 { 8291 OP *o = cUNOPx(nop)->op_first; 8292 assert(o->op_type == OP_NULL); 8293 if (o->op_targ == OP_SCOPE) { 8294 o = cUNOPo->op_first; 8295 } 8296 else { 8297 assert(o->op_targ == OP_LEAVE); 8298 o = cUNOPo->op_first; 8299 assert(o->op_type == OP_ENTER); 8300 o = OpSIBLING(o); 8301 } 8302 8303 if (o->op_type != OP_STUB) { 8304 assert( o->op_type == OP_NEXTSTATE 8305 || o->op_type == OP_DBSTATE 8306 || (o->op_type == OP_NULL 8307 && ( o->op_targ == OP_NEXTSTATE 8308 || o->op_targ == OP_DBSTATE 8309 ) 8310 ) 8311 ); 8312 PL_curcop = (COP*)o; 8313 } 8314 } 8315 nop = nop->op_next; 8316 8317 DEBUG_STATE_r( Perl_re_printf( aTHX_ 8318 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) ); 8319 8320 RXp_OFFSp(rex)[0].end = locinput - reginfo->strbeg; 8321 if (reginfo->info_aux_eval->pos_magic) 8322 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, 8323 reginfo->sv, reginfo->strbeg, 8324 locinput - reginfo->strbeg); 8325 8326 if (sv_yes_mark) { 8327 SV *sv_mrk = get_sv("REGMARK", 1); 8328 sv_setsv(sv_mrk, sv_yes_mark); 8329 } 8330 8331 /* we don't use MULTICALL here as we want to call the 8332 * first op of the block of interest, rather than the 8333 * first op of the sub. Also, we don't want to free 8334 * the savestack frame */ 8335 before = (IV)(SP-PL_stack_base); 8336 PL_op = nop; 8337 CALLRUNOPS(aTHX); /* Scalar context. */ 8338 SPAGAIN; 8339 if ((IV)(SP-PL_stack_base) == before) 8340 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ 8341 else { 8342 ret = POPs; 8343 PUTBACK; 8344 } 8345 8346 /* before restoring everything, evaluate the returned 8347 * value, so that 'uninit' warnings don't use the wrong 8348 * PL_op or pad. Also need to process any magic vars 8349 * (e.g. $1) *before* parentheses are restored */ 8350 8351 PL_op = NULL; 8352 8353 re_sv = NULL; 8354 if (logical == 0) { /* /(?{ ... })/ and /(*{ ... })/ */ 8355 SV *replsv = save_scalar(PL_replgv); 8356 sv_setsv(replsv, ret); /* $^R */ 8357 SvSETMAGIC(replsv); 8358 } 8359 else if (logical == 1) { /* /(?(?{...})X|Y)/ */ 8360 sw = cBOOL(SvTRUE_NN(ret)); 8361 logical = 0; 8362 } 8363 else { /* /(??{ ... }) */ 8364 /* if its overloaded, let the regex compiler handle 8365 * it; otherwise extract regex, or stringify */ 8366 if (SvGMAGICAL(ret)) 8367 ret = sv_mortalcopy(ret); 8368 if (!SvAMAGIC(ret)) { 8369 SV *sv = ret; 8370 if (SvROK(sv)) 8371 sv = SvRV(sv); 8372 if (SvTYPE(sv) == SVt_REGEXP) 8373 re_sv = (REGEXP*) sv; 8374 else if (SvSMAGICAL(ret)) { 8375 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); 8376 if (mg) 8377 re_sv = (REGEXP *) mg->mg_obj; 8378 } 8379 8380 /* force any undef warnings here */ 8381 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { 8382 ret = sv_mortalcopy(ret); 8383 (void) SvPV_force_nolen(ret); 8384 } 8385 } 8386 8387 } 8388 8389 /* *** Note that at this point we don't restore 8390 * PL_comppad, (or pop the CxSUB) on the assumption it may 8391 * be used again soon. This is safe as long as nothing 8392 * in the regexp code uses the pad ! */ 8393 PL_op = oop; 8394 PL_curcop = ocurcop; 8395 regcp_restore(rex, ST.lastcp, &maxopenparen); 8396 PL_curpm_under = PL_curpm; 8397 PL_curpm = PL_reg_curpm; 8398 8399 if (logical != 2) { 8400 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, 8401 script_run_begin); 8402 /* NOTREACHED */ 8403 } 8404 } 8405 8406 /* only /(??{ ... })/ from now on */ 8407 logical = 0; 8408 { 8409 /* extract RE object from returned value; compiling if 8410 * necessary */ 8411 8412 if (re_sv) { 8413 re_sv = reg_temp_copy(NULL, re_sv); 8414 } 8415 else { 8416 U32 pm_flags = 0; 8417 8418 if (SvUTF8(ret) && IN_BYTES) { 8419 /* In use 'bytes': make a copy of the octet 8420 * sequence, but without the flag on */ 8421 STRLEN len; 8422 const char *const p = SvPV(ret, len); 8423 ret = newSVpvn_flags(p, len, SVs_TEMP); 8424 } 8425 if (rex->intflags & PREGf_USE_RE_EVAL) 8426 pm_flags |= PMf_USE_RE_EVAL; 8427 8428 /* if we got here, it should be an engine which 8429 * supports compiling code blocks and stuff */ 8430 assert(rex->engine && rex->engine->op_comp); 8431 assert(!(FLAGS(scan) & ~RXf_PMf_COMPILETIME)); 8432 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, 8433 rex->engine, NULL, NULL, 8434 /* copy /msixn etc to inner pattern */ 8435 ARG2i(scan), 8436 pm_flags); 8437 8438 if (!(SvFLAGS(ret) 8439 & (SVs_TEMP | SVs_GMG | SVf_ROK)) 8440 && (!SvPADTMP(ret) || SvREADONLY(ret))) { 8441 /* This isn't a first class regexp. Instead, it's 8442 caching a regexp onto an existing, Perl visible 8443 scalar. */ 8444 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); 8445 } 8446 } 8447 SAVEFREESV(re_sv); 8448 re = ReANY(re_sv); 8449 } 8450 RXp_MATCH_COPIED_off(re); 8451 RXp_SUBBEG(re) = RXp_SUBBEG(rex); 8452 RXp_SUBLEN(re) = RXp_SUBLEN(rex); 8453 RXp_SUBOFFSET(re) = RXp_SUBOFFSET(rex); 8454 RXp_SUBCOFFSET(re) = RXp_SUBCOFFSET(rex); 8455 RXp_LASTPAREN(re) = 0; 8456 RXp_LASTCLOSEPAREN(re) = 0; 8457 rei = RXi_GET(re); 8458 DEBUG_EXECUTE_r( 8459 debug_start_match(re_sv, utf8_target, locinput, 8460 reginfo->strend, "EVAL/GOSUB: Matching embedded"); 8461 ); 8462 startpoint = rei->program + 1; 8463 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0; 8464 * close_paren only for GOSUB */ 8465 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */ 8466 8467 /* note we saved the paren state earlier: 8468 ST.cp = regcppush(rex, 0, maxopenparen); 8469 REGCP_SET(ST.lastcp); 8470 */ 8471 /* and set maxopenparen to 0, since we are starting a "fresh" match */ 8472 maxopenparen = 0; 8473 /* run the pattern returned from (??{...}) */ 8474 8475 eval_recurse_doit: /* Share code with GOSUB below this line 8476 * At this point we expect the stack context to be 8477 * set up correctly */ 8478 8479 /* invalidate the S-L poscache. We're now executing a 8480 * different set of WHILEM ops (and their associated 8481 * indexes) against the same string, so the bits in the 8482 * cache are meaningless. Setting maxiter to zero forces 8483 * the cache to be invalidated and zeroed before reuse. 8484 * XXX This is too dramatic a measure. Ideally we should 8485 * save the old cache and restore when running the outer 8486 * pattern again */ 8487 reginfo->poscache_maxiter = 0; 8488 8489 /* the new regexp might have a different is_utf8_pat than we do */ 8490 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); 8491 8492 ST.prev_rex = rex_sv; 8493 ST.prev_curlyx = cur_curlyx; 8494 rex_sv = re_sv; 8495 SET_reg_curpm(rex_sv); 8496 rex = re; 8497 rexi = rei; 8498 cur_curlyx = NULL; 8499 ST.B = next; 8500 ST.prev_eval = cur_eval; 8501 cur_eval = st; 8502 /* now continue from first node in postoned RE */ 8503 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, 8504 loceol, script_run_begin); 8505 NOT_REACHED; /* NOTREACHED */ 8506 } 8507 8508 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */ 8509 /* note: this is called twice; first after popping B, then A */ 8510 DEBUG_STACK_r({ 8511 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n", 8512 depth, cur_eval, ST.prev_eval); 8513 }); 8514 8515#define SET_RECURSE_LOCINPUT(STR,VAL)\ 8516 if ( cur_eval && CUR_EVAL.close_paren ) {\ 8517 DEBUG_STACK_r({ \ 8518 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\ 8519 depth, \ 8520 CUR_EVAL.close_paren - 1,\ 8521 cur_eval, \ 8522 VAL); \ 8523 }); \ 8524 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\ 8525 } 8526 8527 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput); 8528 8529 rex_sv = ST.prev_rex; 8530 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 8531 SET_reg_curpm(rex_sv); 8532 rex = ReANY(rex_sv); 8533 rexi = RXi_GET(rex); 8534 { 8535 /* preserve $^R across LEAVE's. See Bug 121070. */ 8536 SV *save_sv= GvSV(PL_replgv); 8537 SV *replsv; 8538 SvREFCNT_inc(save_sv); 8539 regcpblow(ST.cp); /* LEAVE in disguise */ 8540 /* don't move this initialization up */ 8541 replsv = GvSV(PL_replgv); 8542 sv_setsv(replsv, save_sv); 8543 SvSETMAGIC(replsv); 8544 SvREFCNT_dec(save_sv); 8545 } 8546 cur_eval = ST.prev_eval; 8547 cur_curlyx = ST.prev_curlyx; 8548 8549 /* Invalidate cache. See "invalidate" comment above. */ 8550 reginfo->poscache_maxiter = 0; 8551 if ( nochange_depth ) 8552 nochange_depth--; 8553 8554 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput); 8555 sayYES; 8556 8557 8558 case EVAL_B_fail: /* unsuccessful B in (?{...})B */ 8559 REGCP_UNWIND(ST.lastcp); 8560 regcppop(rex, &maxopenparen); 8561 sayNO; 8562 8563 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ 8564 /* note: this is called twice; first after popping B, then A */ 8565 DEBUG_STACK_r({ 8566 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n", 8567 depth, cur_eval, ST.prev_eval); 8568 }); 8569 8570 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput); 8571 8572 rex_sv = ST.prev_rex; 8573 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 8574 SET_reg_curpm(rex_sv); 8575 rex = ReANY(rex_sv); 8576 rexi = RXi_GET(rex); 8577 8578 REGCP_UNWIND(ST.lastcp); 8579 regcppop(rex, &maxopenparen); 8580 cur_eval = ST.prev_eval; 8581 cur_curlyx = ST.prev_curlyx; 8582 8583 /* Invalidate cache. See "invalidate" comment above. */ 8584 reginfo->poscache_maxiter = 0; 8585 if ( nochange_depth ) 8586 nochange_depth--; 8587 8588 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput); 8589 sayNO_SILENT; 8590#undef ST 8591 8592 case OPEN: /* ( */ 8593 n = PARNO(scan); /* which paren pair */ 8594 RXp_OFFSp(rex)[n].start_tmp = locinput - reginfo->strbeg; 8595 if (n > maxopenparen) 8596 maxopenparen = n; 8597 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ 8598 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n", 8599 depth, 8600 PTR2UV(rex), 8601 PTR2UV(RXp_OFFSp(rex)), 8602 (UV)n, 8603 (IV)RXp_OFFSp(rex)[n].start_tmp, 8604 (UV)maxopenparen 8605 )); 8606 lastopen = n; 8607 break; 8608 8609 case SROPEN: /* (*SCRIPT_RUN: */ 8610 script_run_begin = (U8 *) locinput; 8611 break; 8612 8613 8614 case CLOSE: /* ) */ 8615 n = PARNO(scan); /* which paren pair */ 8616 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp, 8617 locinput - reginfo->strbeg); 8618 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) ) 8619 goto fake_end; 8620 8621 break; 8622 8623 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */ 8624 8625 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target)) 8626 { 8627 sayNO; 8628 } 8629 8630 break; 8631 8632 8633 case ACCEPT: /* (*ACCEPT) */ 8634 is_accepted = true; 8635 if (FLAGS(scan)) 8636 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 8637 utmp = ARG2u(scan); 8638 8639 if ( utmp ) { 8640 regnode *cursor; 8641 for ( 8642 cursor = scan; 8643 cursor && ( OP(cursor) != END ); 8644 cursor = ( 8645 REGNODE_TYPE( OP(cursor) ) == END 8646 || REGNODE_TYPE( OP(cursor) ) == WHILEM 8647 ) 8648 ? REGNODE_AFTER(cursor) 8649 : regnext(cursor) 8650 ){ 8651 if ( OP(cursor) != CLOSE ) 8652 continue; 8653 8654 n = PARNO(cursor); 8655 8656 if ( n > lastopen ) /* might be OPEN/CLOSE in the way */ 8657 continue; /* so skip this one */ 8658 8659 CLOSE_CAPTURE(rex, n, RXp_OFFSp(rex)[n].start_tmp, 8660 locinput - reginfo->strbeg); 8661 8662 if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) ) 8663 break; 8664 } 8665 } 8666 goto fake_end; 8667 /* NOTREACHED */ 8668 8669 case GROUPP: /* (?(1)) */ 8670 n = ARG1u(scan); /* which paren pair */ 8671 sw = cBOOL(RXp_LASTPAREN(rex) >= n && RXp_OFFS_END(rex,n) != -1); 8672 break; 8673 8674 case GROUPPN: /* (?(<name>)) */ 8675 /* reg_check_named_buff_matched returns 0 for no match */ 8676 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); 8677 break; 8678 8679 case INSUBP: /* (?(R)) */ 8680 n = ARG1u(scan); 8681 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg 8682 * of SCAN is already set up as matches a eval.close_paren */ 8683 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n); 8684 break; 8685 8686 case DEFINEP: /* (?(DEFINE)) */ 8687 sw = 0; 8688 break; 8689 8690 case IFTHEN: /* (?(cond)A|B) */ 8691 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ 8692 if (sw) 8693 next = REGNODE_AFTER_type(scan,tregnode_IFTHEN); 8694 else { 8695 next = scan + ARG1u(scan); 8696 if (OP(next) == IFTHEN) /* Fake one. */ 8697 next = REGNODE_AFTER_type(next,tregnode_IFTHEN); 8698 } 8699 break; 8700 8701 case LOGICAL: /* modifier for EVAL and IFMATCH */ 8702 logical = FLAGS(scan) & EVAL_FLAGS_MASK; /* reserve a bit for optimistic eval */ 8703 break; 8704 8705/******************************************************************* 8706 8707The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ 8708pattern, where A and B are subpatterns. (For simple A, CURLYM or 8709STAR/PLUS/CURLY/CURLYN are used instead.) 8710 8711A*B is compiled as <CURLYX><A><WHILEM><B> 8712 8713On entry to the subpattern, CURLYX is called. This pushes a CURLYX 8714state, which contains the current count, initialised to -1. It also sets 8715cur_curlyx to point to this state, with any previous value saved in the 8716state block. 8717 8718CURLYX then jumps straight to the WHILEM op, rather than executing A, 8719since the pattern may possibly match zero times (i.e. it's a while {} loop 8720rather than a do {} while loop). 8721 8722Each entry to WHILEM represents a successful match of A. The count in the 8723CURLYX block is incremented, another WHILEM state is pushed, and execution 8724passes to A or B depending on greediness and the current count. 8725 8726For example, if matching against the string a1a2a3b (where the aN are 8727substrings that match /A/), then the match progresses as follows: (the 8728pushed states are interspersed with the bits of strings matched so far): 8729 8730 <CURLYX cnt=-1> 8731 <CURLYX cnt=0><WHILEM> 8732 <CURLYX cnt=1><WHILEM> a1 <WHILEM> 8733 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM> 8734 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> 8735 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b 8736 8737(Contrast this with something like CURLYM, which maintains only a single 8738backtrack state: 8739 8740 <CURLYM cnt=0> a1 8741 a1 <CURLYM cnt=1> a2 8742 a1 a2 <CURLYM cnt=2> a3 8743 a1 a2 a3 <CURLYM cnt=3> b 8744) 8745 8746Each WHILEM state block marks a point to backtrack to upon partial failure 8747of A or B, and also contains some minor state data related to that 8748iteration. The CURLYX block, pointed to by cur_curlyx, contains the 8749overall state, such as the count, and pointers to the A and B ops. 8750 8751This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx 8752must always point to the *current* CURLYX block, the rules are: 8753 8754When executing CURLYX, save the old cur_curlyx in the CURLYX state block, 8755and set cur_curlyx to point the new block. 8756 8757When popping the CURLYX block after a successful or unsuccessful match, 8758restore the previous cur_curlyx. 8759 8760When WHILEM is about to execute B, save the current cur_curlyx, and set it 8761to the outer one saved in the CURLYX block. 8762 8763When popping the WHILEM block after a successful or unsuccessful B match, 8764restore the previous cur_curlyx. 8765 8766Here's an example for the pattern (AI* BI)*BO 8767I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: 8768 8769cur_ 8770curlyx backtrack stack 8771------ --------------- 8772NULL 8773CO <CO prev=NULL> <WO> 8774CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 8775CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8776NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo 8777 8778At this point the pattern succeeds, and we work back down the stack to 8779clean up, restoring as we go: 8780 8781CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi 8782CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai 8783CO <CO prev=NULL> <WO> 8784NULL 8785 8786*******************************************************************/ 8787 8788#define ST st->u.curlyx 8789 8790 case CURLYX: /* start of /A*B/ (for complex A) */ 8791 { 8792 /* No need to save/restore up to this paren */ 8793 I32 parenfloor = FLAGS(scan); 8794 8795 assert(next); /* keep Coverity happy */ 8796 if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */ 8797 next += ARG1u(next); 8798 8799 /* XXXX Probably it is better to teach regpush to support 8800 parenfloor > maxopenparen ... */ 8801 if (parenfloor > (I32)RXp_LASTPAREN(rex)) 8802 parenfloor = RXp_LASTPAREN(rex); /* Pessimization... */ 8803 8804 ST.prev_curlyx= cur_curlyx; 8805 cur_curlyx = st; 8806 ST.cp = PL_savestack_ix; 8807 8808 /* these fields contain the state of the current curly. 8809 * they are accessed by subsequent WHILEMs */ 8810 ST.parenfloor = parenfloor; 8811 ST.me = scan; 8812 ST.B = next; 8813 ST.minmod = minmod; 8814 minmod = 0; 8815 ST.count = -1; /* this will be updated by WHILEM */ 8816 ST.lastloc = NULL; /* this will be updated by WHILEM */ 8817 8818 PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol, 8819 script_run_begin); 8820 NOT_REACHED; /* NOTREACHED */ 8821 } 8822 8823 case CURLYX_end: /* just finished matching all of A*B */ 8824 cur_curlyx = ST.prev_curlyx; 8825 sayYES; 8826 NOT_REACHED; /* NOTREACHED */ 8827 8828 case CURLYX_end_fail: /* just failed to match all of A*B */ 8829 regcpblow(ST.cp); 8830 cur_curlyx = ST.prev_curlyx; 8831 sayNO; 8832 NOT_REACHED; /* NOTREACHED */ 8833 8834 8835#undef ST 8836#define ST st->u.whilem 8837 8838 case WHILEM: /* just matched an A in /A*B/ (for complex A) */ 8839 { 8840 /* see the discussion above about CURLYX/WHILEM */ 8841 I32 n; 8842 int min, max; 8843 /* U16 first_paren, last_paren; */ 8844 regnode *A; 8845 8846 assert(cur_curlyx); /* keep Coverity happy */ 8847 8848 min = ARG1i(cur_curlyx->u.curlyx.me); 8849 max = ARG2i(cur_curlyx->u.curlyx.me); 8850 /* first_paren = ARG3a(cur_curlyx->u.curlyx.me); */ 8851 /* last_paren = ARG3b(cur_curlyx->u.curlyx.me); */ 8852 A = REGNODE_AFTER(cur_curlyx->u.curlyx.me); 8853 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ 8854 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; 8855 ST.cache_offset = 0; 8856 ST.cache_mask = 0; 8857 8858 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n", 8859 depth, (long)n, min, max) 8860 ); 8861 8862 /* First just match a string of min A's. */ 8863 if (n < min) { 8864 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen); 8865 cur_curlyx->u.curlyx.lastloc = locinput; 8866 REGCP_SET(ST.lastcp); 8867 8868 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, 8869 script_run_begin); 8870 NOT_REACHED; /* NOTREACHED */ 8871 } 8872 8873 /* If degenerate A matches "", assume A done. */ 8874 8875 if (locinput == cur_curlyx->u.curlyx.lastloc) { 8876 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n", 8877 depth) 8878 ); 8879 goto do_whilem_B_max; 8880 } 8881 8882 /* super-linear cache processing. 8883 * 8884 * The idea here is that for certain types of CURLYX/WHILEM - 8885 * principally those whose upper bound is infinity (and 8886 * excluding regexes that have things like \1 and other very 8887 * non-regular expressiony things), then if a pattern like 8888 * /....A*.../ fails and we backtrack to the WHILEM, then we 8889 * make a note that this particular WHILEM op was at string 8890 * position 47 (say) when the rest of pattern failed. Then, if 8891 * we ever find ourselves back at that WHILEM, and at string 8892 * position 47 again, we can just fail immediately rather than 8893 * running the rest of the pattern again. 8894 * 8895 * This is very handy when patterns start to go 8896 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up 8897 * with a combinatorial explosion of backtracking. 8898 * 8899 * The cache is implemented as a bit array, with one bit per 8900 * string byte position per WHILEM op (up to 16) - so its 8901 * between 0.25 and 2x the string size. 8902 * 8903 * To avoid allocating a poscache buffer every time, we do an 8904 * initially countdown; only after we have executed a WHILEM 8905 * op (string-length x #WHILEMs) times do we allocate the 8906 * cache. 8907 * 8908 * The top 4 bits of FLAGS(scan) byte say how many different 8909 * relevant CURLLYX/WHILEM op pairs there are, while the 8910 * bottom 4-bits is the identifying index number of this 8911 * WHILEM. 8912 */ 8913 8914 if (FLAGS(scan)) { 8915 8916 if (!reginfo->poscache_maxiter) { 8917 /* start the countdown: Postpone detection until we 8918 * know the match is not *that* much linear. */ 8919 reginfo->poscache_maxiter 8920 = (reginfo->strend - reginfo->strbeg + 1) 8921 * (FLAGS(scan)>>4); 8922 /* possible overflow for long strings and many CURLYX's */ 8923 if (reginfo->poscache_maxiter < 0) 8924 reginfo->poscache_maxiter = I32_MAX; 8925 reginfo->poscache_iter = reginfo->poscache_maxiter; 8926 } 8927 8928 if (reginfo->poscache_iter-- == 0) { 8929 /* initialise cache */ 8930 const SSize_t size = (reginfo->poscache_maxiter + 7)/8; 8931 regmatch_info_aux *const aux = reginfo->info_aux; 8932 if (aux->poscache) { 8933 if ((SSize_t)reginfo->poscache_size < size) { 8934 Renew(aux->poscache, size, char); 8935 reginfo->poscache_size = size; 8936 } 8937 Zero(aux->poscache, size, char); 8938 } 8939 else { 8940 reginfo->poscache_size = size; 8941 Newxz(aux->poscache, size, char); 8942 } 8943 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_ 8944 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n", 8945 PL_colors[4], PL_colors[5]) 8946 ); 8947 } 8948 8949 if (reginfo->poscache_iter < 0) { 8950 /* have we already failed at this position? */ 8951 SSize_t offset, mask; 8952 8953 reginfo->poscache_iter = -1; /* stop eventual underflow */ 8954 offset = (FLAGS(scan) & 0xf) - 1 8955 + (locinput - reginfo->strbeg) 8956 * (FLAGS(scan)>>4); 8957 mask = 1 << (offset % 8); 8958 offset /= 8; 8959 if (reginfo->info_aux->poscache[offset] & mask) { 8960 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n", 8961 depth) 8962 ); 8963 cur_curlyx->u.curlyx.count--; 8964 sayNO; /* cache records failure */ 8965 } 8966 ST.cache_offset = offset; 8967 ST.cache_mask = mask; 8968 } 8969 } 8970 8971 /* Prefer B over A for minimal matching. */ 8972 8973 if (cur_curlyx->u.curlyx.minmod) { 8974 ST.save_curlyx = cur_curlyx; 8975 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 8976 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, 8977 locinput, loceol, script_run_begin); 8978 NOT_REACHED; /* NOTREACHED */ 8979 } 8980 8981 /* Prefer A over B for maximal matching. */ 8982 8983 if (n < max) { /* More greed allowed? */ 8984 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, 8985 maxopenparen); 8986 cur_curlyx->u.curlyx.lastloc = locinput; 8987 REGCP_SET(ST.lastcp); 8988 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, 8989 script_run_begin); 8990 NOT_REACHED; /* NOTREACHED */ 8991 } 8992 goto do_whilem_B_max; 8993 } 8994 NOT_REACHED; /* NOTREACHED */ 8995 8996 case WHILEM_B_min: /* just matched B in a minimal match */ 8997 case WHILEM_B_max: /* just matched B in a maximal match */ 8998 cur_curlyx = ST.save_curlyx; 8999 sayYES; 9000 NOT_REACHED; /* NOTREACHED */ 9001 9002 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ 9003 cur_curlyx = ST.save_curlyx; 9004 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 9005 cur_curlyx->u.curlyx.count--; 9006 CACHEsayNO; 9007 NOT_REACHED; /* NOTREACHED */ 9008 9009 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ 9010 /* FALLTHROUGH */ 9011 case WHILEM_A_pre_fail: /* just failed to match even minimal A */ 9012 REGCP_UNWIND(ST.lastcp); 9013 regcppop(rex, &maxopenparen); 9014 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; 9015 cur_curlyx->u.curlyx.count--; 9016 CACHEsayNO; 9017 NOT_REACHED; /* NOTREACHED */ 9018 9019 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ 9020 REGCP_UNWIND(ST.lastcp); 9021 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 9022 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n", 9023 depth) 9024 ); 9025 9026 do_whilem_B_max: 9027 /* now try B */ 9028 ST.save_curlyx = cur_curlyx; 9029 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; 9030 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, 9031 locinput, loceol, script_run_begin); 9032 NOT_REACHED; /* NOTREACHED */ 9033 9034 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ 9035 cur_curlyx = ST.save_curlyx; 9036 9037 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2i(cur_curlyx->u.curlyx.me)) { 9038 /* Maximum greed exceeded */ 9039 cur_curlyx->u.curlyx.count--; 9040 CACHEsayNO; 9041 } 9042 9043 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth) 9044 ); 9045 /* Try grabbing another A and see if it helps. */ 9046 cur_curlyx->u.curlyx.lastloc = locinput; 9047 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen); 9048 REGCP_SET(ST.lastcp); 9049 PUSH_STATE_GOTO(WHILEM_A_min, 9050 /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me), 9051 locinput, loceol, script_run_begin); 9052 NOT_REACHED; /* NOTREACHED */ 9053 9054#undef ST 9055#define ST st->u.branch 9056 9057 case BRANCHJ: /* /(...|A|...)/ with long next pointer */ 9058 next = scan + ARG1u(scan); 9059 if (next == scan) 9060 next = NULL; 9061 ST.before_paren = ARG2a(scan); 9062 ST.after_paren = ARG2b(scan); 9063 goto branch_logic; 9064 NOT_REACHED; /* NOTREACHED */ 9065 9066 case BRANCH: /* /(...|A|...)/ */ 9067 ST.before_paren = ARG1a(scan); 9068 ST.after_paren = ARG1b(scan); 9069 branch_logic: 9070 scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */ 9071 assert(scan); 9072 ST.lastparen = RXp_LASTPAREN(rex); 9073 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 9074 ST.next_branch = next; 9075 REGCP_SET(ST.cp); 9076 if (RE_PESSIMISTIC_PARENS) { 9077 regcppush(rex, 0, maxopenparen); 9078 REGCP_SET(ST.lastcp); 9079 } 9080 9081 /* Now go into the branch */ 9082 if (has_cutgroup) { 9083 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 9084 script_run_begin); 9085 } else { 9086 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, 9087 script_run_begin); 9088 } 9089 NOT_REACHED; /* NOTREACHED */ 9090 9091 case CUTGROUP: /* /(*THEN)/ */ 9092 sv_yes_mark = st->u.mark.mark_name = FLAGS(scan) 9093 ? MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]) 9094 : NULL; 9095 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, 9096 script_run_begin); 9097 NOT_REACHED; /* NOTREACHED */ 9098 9099 case CUTGROUP_next_fail: 9100 do_cutgroup = 1; 9101 no_final = 1; 9102 if (st->u.mark.mark_name) 9103 sv_commit = st->u.mark.mark_name; 9104 sayNO; 9105 NOT_REACHED; /* NOTREACHED */ 9106 9107 case BRANCH_next: 9108 sayYES; 9109 NOT_REACHED; /* NOTREACHED */ 9110 9111 case BRANCH_next_fail: /* that branch failed; try the next, if any */ 9112 if (do_cutgroup) { 9113 do_cutgroup = 0; 9114 no_final = 0; 9115 } 9116 if (RE_PESSIMISTIC_PARENS) { 9117 REGCP_UNWIND(ST.lastcp); 9118 regcppop(rex,&maxopenparen); 9119 } 9120 REGCP_UNWIND(ST.cp); 9121 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9122 CAPTURE_CLEAR(ST.before_paren+1, ST.after_paren, "BRANCH_next_fail"); 9123 scan = ST.next_branch; 9124 /* no more branches? */ 9125 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { 9126 DEBUG_EXECUTE_r({ 9127 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n", 9128 depth, 9129 PL_colors[4], 9130 PL_colors[5] ); 9131 }); 9132 sayNO_SILENT; 9133 } 9134 continue; /* execute next BRANCH[J] op */ 9135 /* NOTREACHED */ 9136 9137 case MINMOD: /* next op will be non-greedy, e.g. A*? */ 9138 minmod = 1; 9139 break; 9140 9141#undef ST 9142#define ST st->u.curlym 9143 9144 case CURLYM: /* /A{m,n}B/ where A is fixed-length */ 9145 9146 /* This is an optimisation of CURLYX that enables us to push 9147 * only a single backtracking state, no matter how many matches 9148 * there are in {m,n}. It relies on the pattern being constant 9149 * length, with no parens to influence future backrefs 9150 */ 9151 9152 ST.me = scan; 9153 scan = REGNODE_AFTER_type(scan, tregnode_CURLYM); 9154 9155 ST.lastparen = RXp_LASTPAREN(rex); 9156 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 9157 9158 /* if paren positive, emulate an OPEN/CLOSE around A */ 9159 if (FLAGS(ST.me)) { 9160 U32 paren = FLAGS(ST.me); 9161 lastopen = paren; 9162 if (paren > maxopenparen) 9163 maxopenparen = paren; 9164 scan += NEXT_OFF(scan); /* Skip former OPEN. */ 9165 } 9166 ST.A = scan; 9167 ST.B = next; 9168 ST.alen = 0; 9169 ST.count = 0; 9170 ST.minmod = minmod; 9171 minmod = 0; 9172 ST.Binfo.count = -1; 9173 REGCP_SET(ST.cp); 9174 9175 if (!(ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me))) /* min/max */ 9176 goto curlym_do_B; 9177 9178 curlym_do_A: /* execute the A in /A{m,n}B/ */ 9179 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ 9180 script_run_begin); 9181 NOT_REACHED; /* NOTREACHED */ 9182 9183 case CURLYM_A: /* we've just matched an A */ 9184 ST.count++; 9185 /* after first match, determine A's length: u.curlym.alen */ 9186 if (ST.count == 1) { 9187 if (reginfo->is_utf8_target) { 9188 char *s = st->locinput; 9189 while (s < locinput) { 9190 ST.alen++; 9191 s += UTF8SKIP(s); 9192 } 9193 } 9194 else { 9195 ST.alen = locinput - st->locinput; 9196 } 9197 if (ST.alen == 0) 9198 ST.count = ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me); 9199 } 9200 DEBUG_EXECUTE_r( 9201 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n", 9202 depth, (IV) ST.count, (IV)ST.alen) 9203 ); 9204 9205 if (FLAGS(ST.me)) { 9206 /* emulate CLOSE: mark current A as captured */ 9207 U32 paren = (U32)FLAGS(ST.me); 9208 CLOSE_CAPTURE(rex, paren, 9209 HOPc(locinput, -ST.alen) - reginfo->strbeg, 9210 locinput - reginfo->strbeg); 9211 } 9212 9213 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me))) 9214 goto fake_end; 9215 9216 9217 if (!is_accepted) { 9218 I32 max = (ST.minmod ? ARG1i(ST.me) : ARG2i(ST.me)); 9219 if ( max == REG_INFTY || ST.count < max ) 9220 goto curlym_do_A; /* try to match another A */ 9221 } 9222 goto curlym_do_B; /* try to match B */ 9223 9224 case CURLYM_A_fail: /* just failed to match an A */ 9225 REGCP_UNWIND(ST.cp); 9226 9227 9228 if (ST.minmod || ST.count < ARG1i(ST.me) /* min*/ 9229 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me))) 9230 sayNO; 9231 9232 curlym_do_B: /* execute the B in /A{m,n}B/ */ 9233 if (is_accepted) 9234 goto curlym_close_B; 9235 9236 if (ST.Binfo.count < 0) { 9237 /* calculate possible match of 1st char following curly */ 9238 assert(ST.B); 9239 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { 9240 regnode *text_node = ST.B; 9241 if (! HAS_TEXT(text_node)) 9242 FIND_NEXT_IMPT(text_node); 9243 if (REGNODE_TYPE(OP(text_node)) == EXACT) { 9244 if (! S_setup_EXACTISH_ST(aTHX_ text_node, 9245 &ST.Binfo, reginfo)) 9246 { 9247 sayNO; 9248 } 9249 } 9250 } 9251 } 9252 9253 DEBUG_EXECUTE_r( 9254 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n", 9255 depth, (IV)ST.count) 9256 ); 9257 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) { 9258 assert(ST.Binfo.count > 0); 9259 9260 /* Do a quick test to hopefully rule out most non-matches */ 9261 if ( locinput + ST.Binfo.min_length > loceol 9262 || ! S_test_EXACTISH_ST(locinput, ST.Binfo)) 9263 { 9264 DEBUG_OPTIMISE_r( 9265 Perl_re_exec_indentf( aTHX_ 9266 "CURLYM Fast bail next target=0x%X anded==0x%X" 9267 " mask=0x%X\n", 9268 depth, 9269 (int) nextbyte, ST.Binfo.first_byte_anded, 9270 ST.Binfo.first_byte_mask) 9271 ); 9272 state_num = CURLYM_B_fail; 9273 goto reenter_switch; 9274 } 9275 } 9276 9277 curlym_close_B: 9278 if (FLAGS(ST.me)) { 9279 /* emulate CLOSE: mark current A as captured */ 9280 U32 paren = (U32)FLAGS(ST.me); 9281 if (ST.count || is_accepted) { 9282 CLOSE_CAPTURE(rex, paren, 9283 HOPc(locinput, -ST.alen) - reginfo->strbeg, 9284 locinput - reginfo->strbeg); 9285 } 9286 else 9287 RXp_OFFSp(rex)[paren].end = -1; 9288 9289 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)FLAGS(ST.me))) 9290 { 9291 if (ST.count || is_accepted) 9292 goto fake_end; 9293 else 9294 sayNO; 9295 } 9296 } 9297 9298 if (is_accepted) 9299 goto fake_end; 9300 9301 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ 9302 script_run_begin); 9303 NOT_REACHED; /* NOTREACHED */ 9304 9305 case CURLYM_B_fail: /* just failed to match a B */ 9306 REGCP_UNWIND(ST.cp); 9307 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9308 if (ST.minmod) { 9309 I32 max = ARG2i(ST.me); 9310 if (max != REG_INFTY && ST.count == max) 9311 sayNO; 9312 goto curlym_do_A; /* try to match a further A */ 9313 } 9314 /* backtrack one A */ 9315 if (ST.count == ARG1i(ST.me) /* min */) 9316 sayNO; 9317 ST.count--; 9318 SET_locinput(HOPc(locinput, -ST.alen)); 9319 goto curlym_do_B; /* try to match B */ 9320 9321#undef ST 9322#define ST st->u.curly 9323 9324#define CURLY_SETPAREN(paren, success) \ 9325 if (paren) { \ 9326 if (success) { \ 9327 CLOSE_CAPTURE(rex, paren, HOPc(locinput, -1) - reginfo->strbeg, \ 9328 locinput - reginfo->strbeg); \ 9329 } \ 9330 else { \ 9331 RXp_OFFSp(rex)[paren].end = -1; \ 9332 RXp_LASTPAREN(rex) = ST.lastparen; \ 9333 RXp_LASTCLOSEPAREN(rex) = ST.lastcloseparen; \ 9334 } \ 9335 } 9336 9337 case STAR: /* /A*B/ where A is width 1 char */ 9338 ST.paren = 0; 9339 ST.min = 0; 9340 ST.max = REG_INFTY; 9341 scan = REGNODE_AFTER_type(scan,tregnode_STAR); 9342 goto repeat; 9343 9344 case PLUS: /* /A+B/ where A is width 1 char */ 9345 ST.paren = 0; 9346 ST.min = 1; 9347 ST.max = REG_INFTY; 9348 scan = REGNODE_AFTER_type(scan,tregnode_PLUS); 9349 goto repeat; 9350 9351 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ 9352 ST.paren = FLAGS(scan); /* Which paren to set */ 9353 ST.lastparen = RXp_LASTPAREN(rex); 9354 ST.lastcloseparen = RXp_LASTCLOSEPAREN(rex); 9355 if (ST.paren > maxopenparen) 9356 maxopenparen = ST.paren; 9357 ST.min = ARG1i(scan); /* min to match */ 9358 ST.max = ARG2i(scan); /* max to match */ 9359 scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN)); 9360 9361 /* handle the single-char capture called as a GOSUB etc */ 9362 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren)) 9363 { 9364 char *li = locinput; 9365 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1)) 9366 sayNO; 9367 SET_locinput(li); 9368 goto fake_end; 9369 } 9370 9371 goto repeat; 9372 9373 case CURLY: /* /A{m,n}B/ where A is width 1 char */ 9374 ST.paren = 0; 9375 ST.min = ARG1i(scan); /* min to match */ 9376 ST.max = ARG2i(scan); /* max to match */ 9377 scan = REGNODE_AFTER_type(scan, tregnode_CURLY); 9378 repeat: 9379 /* 9380 * Lookahead to avoid useless match attempts 9381 * when we know what character comes next. 9382 * 9383 * Used to only do .*x and .*?x, but now it allows 9384 * for )'s, ('s and (?{ ... })'s to be in the way 9385 * of the quantifier and the EXACT-like node. -- japhy 9386 */ 9387 9388 assert(ST.min <= ST.max); 9389 if (! HAS_TEXT(next) && ! JUMPABLE(next)) { 9390 ST.Binfo.count = 0; 9391 } 9392 else { 9393 regnode *text_node = next; 9394 9395 if (! HAS_TEXT(text_node)) 9396 FIND_NEXT_IMPT(text_node); 9397 9398 if (! HAS_TEXT(text_node)) 9399 ST.Binfo.count = 0; 9400 else { 9401 if ( REGNODE_TYPE(OP(text_node)) != EXACT ) { 9402 ST.Binfo.count = 0; 9403 } 9404 else { 9405 if (! S_setup_EXACTISH_ST(aTHX_ text_node, 9406 &ST.Binfo, reginfo)) 9407 { 9408 sayNO; 9409 } 9410 } 9411 } 9412 } 9413 9414 ST.A = scan; 9415 ST.B = next; 9416 if (minmod) { 9417 char *li = locinput; 9418 minmod = 0; 9419 if (ST.min && 9420 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min) 9421 < ST.min) 9422 sayNO; 9423 SET_locinput(li); 9424 ST.count = ST.min; 9425 REGCP_SET(ST.cp); 9426 9427 if (ST.Binfo.count <= 0) 9428 goto curly_try_B_min; 9429 9430 ST.oldloc = locinput; 9431 9432 /* set ST.maxpos to the furthest point along the 9433 * string that could possibly match, i.e., that a match could 9434 * start at. */ 9435 if (ST.max == REG_INFTY) { 9436 ST.maxpos = loceol - 1; 9437 if (utf8_target) 9438 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) 9439 ST.maxpos--; 9440 } 9441 else if (utf8_target) { 9442 int m = ST.max - ST.min; 9443 for (ST.maxpos = locinput; 9444 m >0 && ST.maxpos < loceol; m--) 9445 ST.maxpos += UTF8SKIP(ST.maxpos); 9446 } 9447 else { 9448 ST.maxpos = locinput + ST.max - ST.min; 9449 if (ST.maxpos >= loceol) 9450 ST.maxpos = loceol - 1; 9451 } 9452 goto curly_try_B_min_known; 9453 9454 } 9455 else { 9456 /* avoid taking address of locinput, so it can remain 9457 * a register var */ 9458 char *li = locinput; 9459 if (ST.max) 9460 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max); 9461 else 9462 ST.count = 0; 9463 if (ST.count < ST.min) 9464 sayNO; 9465 SET_locinput(li); 9466 if ((ST.count > ST.min) 9467 && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL)) 9468 { 9469 /* A{m,n} must come at the end of the string, there's 9470 * no point in backing off ... */ 9471 ST.min = ST.count; 9472 /* ...except that $ and \Z can match before *and* after 9473 newline at the end. Consider "\n\n" =~ /\n+\Z\n/. 9474 We may back off by one in this case. */ 9475 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) 9476 ST.min--; 9477 } 9478 REGCP_SET(ST.cp); 9479 goto curly_try_B_max; 9480 } 9481 NOT_REACHED; /* NOTREACHED */ 9482 9483 case CURLY_B_min_fail: 9484 /* failed to find B in a non-greedy match. */ 9485 if (RE_PESSIMISTIC_PARENS) { 9486 REGCP_UNWIND(ST.lastcp); 9487 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 9488 } 9489 REGCP_UNWIND(ST.cp); 9490 if (ST.paren) { 9491 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9492 } 9493 9494 if (ST.Binfo.count == 0) { 9495 /* failed -- move forward one */ 9496 char *li = locinput; 9497 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) { 9498 sayNO; 9499 } 9500 locinput = li; 9501 ST.count++; 9502 if (!( ST.count <= ST.max 9503 /* count overflow ? */ 9504 || (ST.max == REG_INFTY && ST.count > 0)) 9505 ) 9506 sayNO; 9507 } 9508 else { 9509 int n; 9510 /* Couldn't or didn't -- move forward. */ 9511 ST.oldloc = locinput; 9512 if (utf8_target) 9513 locinput += UTF8SKIP(locinput); 9514 else 9515 locinput++; 9516 ST.count++; 9517 9518 curly_try_B_min_known: 9519 /* find the next place where 'B' could work, then call B */ 9520 if (locinput + ST.Binfo.initial_exact < loceol) { 9521 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) { 9522 9523 /* Here, the mask is all 1's for the entire length of 9524 * any possible match. (That actually means that there 9525 * is only one possible match.) Look for the next 9526 * occurrence */ 9527 locinput = ninstr(locinput, loceol, 9528 (char *) ST.Binfo.matches, 9529 (char *) ST.Binfo.matches 9530 + ST.Binfo.initial_exact); 9531 if (locinput == NULL) { 9532 sayNO; 9533 } 9534 } 9535 else do { 9536 /* If the first byte(s) of the mask are all ones, it 9537 * means those bytes must match identically, so can use 9538 * ninstr() to find the next possible matchpoint */ 9539 if (ST.Binfo.initial_exact > 0) { 9540 locinput = ninstr(locinput, loceol, 9541 (char *) ST.Binfo.matches, 9542 (char *) ST.Binfo.matches 9543 + ST.Binfo.initial_exact); 9544 } 9545 else { /* Otherwise find the next byte that matches, 9546 masked */ 9547 locinput = (char *) find_next_masked( 9548 (U8 *) locinput, (U8 *) loceol, 9549 ST.Binfo.first_byte_anded, 9550 ST.Binfo.first_byte_mask); 9551 /* Advance to the end of a multi-byte character */ 9552 if (utf8_target) { 9553 while ( locinput < loceol 9554 && UTF8_IS_CONTINUATION(*locinput)) 9555 { 9556 locinput++; 9557 } 9558 } 9559 } 9560 if ( locinput == NULL 9561 || locinput + ST.Binfo.min_length > loceol) 9562 { 9563 sayNO; 9564 } 9565 9566 /* Here, we have found a possible match point; if can't 9567 * rule it out, quit the loop so can check fully */ 9568 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) { 9569 break; 9570 } 9571 9572 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1; 9573 9574 } while (locinput <= ST.maxpos); 9575 } 9576 9577 if (locinput > ST.maxpos) 9578 sayNO; 9579 9580 n = (utf8_target) 9581 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput) 9582 : (STRLEN) (locinput - ST.oldloc); 9583 9584 9585 /* Here is at the beginning of a character that meets the mask 9586 * criteria. Need to make sure that some real possibility */ 9587 9588 if (n) { 9589 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is 9590 * at what may be the beginning of b; check that everything 9591 * between oldloc and locinput matches */ 9592 char *li = ST.oldloc; 9593 ST.count += n; 9594 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n) 9595 sayNO; 9596 assert(n == REG_INFTY || locinput == li); 9597 } 9598 } 9599 9600 curly_try_B_min: 9601 if (RE_PESSIMISTIC_PARENS) { 9602 (void)regcppush(rex, 0, maxopenparen); 9603 REGCP_SET(ST.lastcp); 9604 } 9605 CURLY_SETPAREN(ST.paren, ST.count); 9606 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, 9607 script_run_begin); 9608 NOT_REACHED; /* NOTREACHED */ 9609 9610 9611 curly_try_B_max: 9612 /* a successful greedy match: now try to match B */ 9613 if ( ST.Binfo.count <= 0 9614 || ( ST.Binfo.count > 0 9615 && locinput + ST.Binfo.min_length <= loceol 9616 && S_test_EXACTISH_ST(locinput, ST.Binfo))) 9617 { 9618 if (RE_PESSIMISTIC_PARENS) { 9619 (void)regcppush(rex, 0, maxopenparen); 9620 REGCP_SET(ST.lastcp); 9621 } 9622 CURLY_SETPAREN(ST.paren, ST.count); 9623 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, 9624 script_run_begin); 9625 NOT_REACHED; /* NOTREACHED */ 9626 } 9627 goto CURLY_B_all_failed; 9628 NOT_REACHED; /* NOTREACHED */ 9629 9630 case CURLY_B_max_fail: 9631 /* failed to find B in a greedy match */ 9632 9633 if (RE_PESSIMISTIC_PARENS) { 9634 REGCP_UNWIND(ST.lastcp); 9635 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */ 9636 } 9637 CURLY_B_all_failed: 9638 REGCP_UNWIND(ST.cp); 9639 if (ST.paren) { 9640 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); 9641 } 9642 /* back up. */ 9643 if (--ST.count < ST.min) 9644 sayNO; 9645 locinput = HOPc(locinput, -1); 9646 goto curly_try_B_max; 9647 9648#undef ST 9649 9650 case END: /* last op of main pattern */ 9651 fake_end: 9652 if (cur_eval) { 9653 /* we've just finished A in /(??{A})B/; now continue with B */ 9654 is_accepted= false; 9655 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput); 9656 st->u.eval.prev_rex = rex_sv; /* inner */ 9657 9658 /* Save *all* the positions. */ 9659 st->u.eval.cp = regcppush(rex, 0, maxopenparen); 9660 rex_sv = CUR_EVAL.prev_rex; 9661 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); 9662 SET_reg_curpm(rex_sv); 9663 rex = ReANY(rex_sv); 9664 rexi = RXi_GET(rex); 9665 9666 st->u.eval.prev_curlyx = cur_curlyx; 9667 cur_curlyx = CUR_EVAL.prev_curlyx; 9668 9669 REGCP_SET(st->u.eval.lastcp); 9670 9671 /* Restore parens of the outer rex without popping the 9672 * savestack */ 9673 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen); 9674 9675 st->u.eval.prev_eval = cur_eval; 9676 cur_eval = CUR_EVAL.prev_eval; 9677 DEBUG_EXECUTE_r( 9678 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n", 9679 depth, cur_eval);); 9680 if ( nochange_depth ) 9681 nochange_depth--; 9682 9683 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); 9684 9685 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ 9686 st->u.eval.prev_eval->u.eval.B, 9687 locinput, loceol, script_run_begin); 9688 } 9689 9690 if (locinput < reginfo->till) { 9691 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ 9692 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n", 9693 PL_colors[4], 9694 (long)(locinput - startpos), 9695 (long)(reginfo->till - startpos), 9696 PL_colors[5])); 9697 9698 sayNO_SILENT; /* Cannot match: too short. */ 9699 } 9700 sayYES; /* Success! */ 9701 9702 case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH 9703 matches end at the right spot, required for 9704 variable length matches. */ 9705 if (match_end && locinput != match_end) 9706 { 9707 DEBUG_EXECUTE_r( 9708 Perl_re_exec_indentf( aTHX_ 9709 "%sLOOKBEHIND_END: subpattern failed...%s\n", 9710 depth, PL_colors[4], PL_colors[5])); 9711 sayNO; /* Variable length match didn't line up */ 9712 } 9713 /* FALLTHROUGH */ 9714 9715 case SUCCEED: /* successful SUSPEND/CURLYM and 9716 *lookahead* IFMATCH/UNLESSM*/ 9717 DEBUG_EXECUTE_r( 9718 Perl_re_exec_indentf( aTHX_ 9719 "%sSUCCEED: subpattern success...%s\n", 9720 depth, PL_colors[4], PL_colors[5])); 9721 sayYES; /* Success! */ 9722 9723#undef ST 9724#define ST st->u.ifmatch 9725 9726 case SUSPEND: /* (?>A) */ 9727 ST.wanted = 1; 9728 ST.start = locinput; 9729 ST.end = loceol; 9730 ST.count = 1; 9731 goto do_ifmatch; 9732 9733 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */ 9734 ST.wanted = 0; 9735 goto ifmatch_trivial_fail_test; 9736 9737 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */ 9738 ST.wanted = 1; 9739 ifmatch_trivial_fail_test: 9740 ST.prev_match_end= match_end; 9741 ST.count = NEXT_OFF(scan) + 1; /* next_off repurposed to be 9742 lookbehind count, requires 9743 non-zero flags */ 9744 if (! FLAGS(scan)) { /* 'flags' zero means lookahed */ 9745 9746 /* Lookahead starts here and ends at the normal place */ 9747 ST.start = locinput; 9748 ST.end = loceol; 9749 match_end = NULL; 9750 } 9751 else { 9752 PERL_UINT_FAST8_T back_count = FLAGS(scan); 9753 char * s; 9754 match_end = locinput; 9755 9756 /* Lookbehind can look beyond the current position */ 9757 ST.end = loceol; 9758 9759 /* ... and starts at the first place in the input that is in 9760 * the range of the possible start positions */ 9761 for (; ST.count > 0; ST.count--, back_count--) { 9762 s = HOPBACKc(locinput, back_count); 9763 if (s) { 9764 ST.start = s; 9765 goto do_ifmatch; 9766 } 9767 } 9768 9769 /* If the lookbehind doesn't start in the actual string, is a 9770 * trivial match failure */ 9771 match_end = ST.prev_match_end; 9772 if (logical) { 9773 logical = 0; 9774 sw = 1 - cBOOL(ST.wanted); 9775 } 9776 else if (ST.wanted) 9777 sayNO; 9778 9779 /* Here, we didn't want it to match, so is actually success */ 9780 next = scan + ARG1u(scan); 9781 if (next == scan) 9782 next = NULL; 9783 break; 9784 } 9785 9786 do_ifmatch: 9787 ST.me = scan; 9788 ST.logical = logical; 9789 logical = 0; /* XXX: reset state of logical once it has been saved into ST */ 9790 9791 /* execute body of (?...A) */ 9792 PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start, 9793 ST.end, script_run_begin); 9794 NOT_REACHED; /* NOTREACHED */ 9795 9796 { 9797 bool matched; 9798 9799 case IFMATCH_A_fail: /* body of (?...A) failed */ 9800 if (! ST.logical && ST.count > 1) { 9801 9802 /* It isn't a real failure until we've tried all starting 9803 * positions. Move to the next starting position and retry */ 9804 ST.count--; 9805 ST.start = HOPc(ST.start, 1); 9806 scan = ST.me; 9807 logical = ST.logical; 9808 goto do_ifmatch; 9809 } 9810 9811 /* Here, all starting positions have been tried. */ 9812 matched = FALSE; 9813 goto ifmatch_done; 9814 9815 case IFMATCH_A: /* body of (?...A) succeeded */ 9816 matched = TRUE; 9817 ifmatch_done: 9818 sw = matched == ST.wanted; 9819 match_end = ST.prev_match_end; 9820 if (! ST.logical && !sw) { 9821 sayNO; 9822 } 9823 9824 if (OP(ST.me) != SUSPEND) { 9825 /* restore old position except for (?>...) */ 9826 locinput = st->locinput; 9827 loceol = st->loceol; 9828 script_run_begin = st->sr0; 9829 } 9830 scan = ST.me + ARG1u(ST.me); 9831 if (scan == ST.me) 9832 scan = NULL; 9833 continue; /* execute B */ 9834 } 9835 9836#undef ST 9837 9838 case LONGJMP: /* alternative with many branches compiles to 9839 * (BRANCHJ; EXACT ...; LONGJMP ) x N */ 9840 next = scan + ARG1u(scan); 9841 if (next == scan) 9842 next = NULL; 9843 break; 9844 9845 case COMMIT: /* (*COMMIT) */ 9846 reginfo->cutpoint = loceol; 9847 /* FALLTHROUGH */ 9848 9849 case PRUNE: /* (*PRUNE) */ 9850 if (FLAGS(scan)) 9851 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9852 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, 9853 script_run_begin); 9854 NOT_REACHED; /* NOTREACHED */ 9855 9856 case COMMIT_next_fail: 9857 no_final = 1; 9858 /* FALLTHROUGH */ 9859 sayNO; 9860 NOT_REACHED; /* NOTREACHED */ 9861 9862 case OPFAIL: /* (*FAIL) */ 9863 if (FLAGS(scan)) 9864 sv_commit = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9865 if (logical) { 9866 /* deal with (?(?!)X|Y) properly, 9867 * make sure we trigger the no branch 9868 * of the trailing IFTHEN structure*/ 9869 sw= 0; 9870 break; 9871 } else { 9872 sayNO; 9873 } 9874 NOT_REACHED; /* NOTREACHED */ 9875 9876#define ST st->u.mark 9877 case MARKPOINT: /* (*MARK:foo) */ 9878 ST.prev_mark = mark_state; 9879 ST.mark_name = sv_commit = sv_yes_mark 9880 = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9881 mark_state = st; 9882 ST.mark_loc = locinput; 9883 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, 9884 script_run_begin); 9885 NOT_REACHED; /* NOTREACHED */ 9886 9887 case MARKPOINT_next: 9888 mark_state = ST.prev_mark; 9889 sayYES; 9890 NOT_REACHED; /* NOTREACHED */ 9891 9892 case MARKPOINT_next_fail: 9893 if (popmark && sv_eq(ST.mark_name,popmark)) 9894 { 9895 if (ST.mark_loc > startpoint) 9896 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9897 popmark = NULL; /* we found our mark */ 9898 sv_commit = ST.mark_name; 9899 9900 DEBUG_EXECUTE_r({ 9901 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n", 9902 depth, 9903 PL_colors[4], SVfARG(sv_commit), PL_colors[5]); 9904 }); 9905 } 9906 mark_state = ST.prev_mark; 9907 sv_yes_mark = mark_state ? 9908 mark_state->u.mark.mark_name : NULL; 9909 sayNO; 9910 NOT_REACHED; /* NOTREACHED */ 9911 9912 case SKIP: /* (*SKIP) */ 9913 if (!FLAGS(scan)) { 9914 /* (*SKIP) : if we fail we cut here*/ 9915 ST.mark_name = NULL; 9916 ST.mark_loc = locinput; 9917 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, 9918 script_run_begin); 9919 } else { 9920 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, 9921 otherwise do nothing. Meaning we need to scan 9922 */ 9923 regmatch_state *cur = mark_state; 9924 SV *find = MUTABLE_SV(rexi->data->data[ ARG1u( scan ) ]); 9925 9926 while (cur) { 9927 if ( sv_eq( cur->u.mark.mark_name, 9928 find ) ) 9929 { 9930 ST.mark_name = find; 9931 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, 9932 script_run_begin); 9933 } 9934 cur = cur->u.mark.prev_mark; 9935 } 9936 } 9937 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ 9938 break; 9939 9940 case SKIP_next_fail: 9941 if (ST.mark_name) { 9942 /* (*CUT:NAME) - Set up to search for the name as we 9943 collapse the stack*/ 9944 popmark = ST.mark_name; 9945 } else { 9946 /* (*CUT) - No name, we cut here.*/ 9947 if (ST.mark_loc > startpoint) 9948 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); 9949 /* but we set sv_commit to latest mark_name if there 9950 is one so they can test to see how things lead to this 9951 cut */ 9952 if (mark_state) 9953 sv_commit=mark_state->u.mark.mark_name; 9954 } 9955 no_final = 1; 9956 sayNO; 9957 NOT_REACHED; /* NOTREACHED */ 9958#undef ST 9959 9960 case LNBREAK: /* \R */ 9961 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) { 9962 locinput += n; 9963 } else 9964 sayNO; 9965 break; 9966 9967 default: 9968 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n", 9969 PTR2UV(scan), OP(scan)); 9970 Perl_croak(aTHX_ "regexp memory corruption"); 9971 9972 /* this is a point to jump to in order to increment 9973 * locinput by one character */ 9974 increment_locinput: 9975 assert(!NEXTCHR_IS_EOS); 9976 if (utf8_target) { 9977 locinput += PL_utf8skip[nextbyte]; 9978 /* locinput is allowed to go 1 char off the end (signifying 9979 * EOS), but not 2+ */ 9980 if (locinput > loceol) 9981 sayNO; 9982 } 9983 else 9984 locinput++; 9985 break; 9986 9987 } /* end switch */ 9988 9989 /* switch break jumps here */ 9990 scan = next; /* prepare to execute the next op and ... */ 9991 continue; /* ... jump back to the top, reusing st */ 9992 /* NOTREACHED */ 9993 9994 push_yes_state: 9995 /* push a state that backtracks on success */ 9996 st->u.yes.prev_yes_state = yes_state; 9997 yes_state = st; 9998 /* FALLTHROUGH */ 9999 push_state: 10000 /* push a new regex state, then continue at scan */ 10001 { 10002 regmatch_state *newst; 10003 DECLARE_AND_GET_RE_DEBUG_FLAGS; 10004 10005 DEBUG_r( /* DEBUG_STACK_r */ 10006 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) { 10007 regmatch_state *cur = st; 10008 regmatch_state *curyes = yes_state; 10009 U32 i; 10010 regmatch_slab *slab = PL_regmatch_slab; 10011 for (i = 0; i < 3 && i <= depth; cur--,i++) { 10012 if (cur < SLAB_FIRST(slab)) { 10013 slab = slab->prev; 10014 cur = SLAB_LAST(slab); 10015 } 10016 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n", 10017 depth, 10018 i ? " " : "push", 10019 depth - i, REGNODE_NAME(cur->resume_state), 10020 (curyes == cur) ? "yes" : "" 10021 ); 10022 if (curyes == cur) 10023 curyes = cur->u.yes.prev_yes_state; 10024 } 10025 } else { 10026 DEBUG_STATE_pp("push") 10027 }); 10028 depth++; 10029 st->locinput = locinput; 10030 st->loceol = loceol; 10031 st->sr0 = script_run_begin; 10032 newst = st+1; 10033 if (newst > SLAB_LAST(PL_regmatch_slab)) 10034 newst = S_push_slab(aTHX); 10035 PL_regmatch_state = newst; 10036 10037 locinput = pushinput; 10038 loceol = pusheol; 10039 script_run_begin = pushsr0; 10040 st = newst; 10041 continue; 10042 /* NOTREACHED */ 10043 } 10044 } 10045#ifdef SOLARIS_BAD_OPTIMIZER 10046# undef PL_charclass 10047#endif 10048 10049 /* 10050 * We get here only if there's trouble -- normally "case END" is 10051 * the terminating point. 10052 */ 10053 Perl_croak(aTHX_ "corrupted regexp pointers"); 10054 NOT_REACHED; /* NOTREACHED */ 10055 10056 yes: 10057 if (yes_state) { 10058 /* we have successfully completed a subexpression, but we must now 10059 * pop to the state marked by yes_state and continue from there */ 10060 assert(st != yes_state); 10061#ifdef DEBUGGING 10062 while (st != yes_state) { 10063 st--; 10064 if (st < SLAB_FIRST(PL_regmatch_slab)) { 10065 PL_regmatch_slab = PL_regmatch_slab->prev; 10066 st = SLAB_LAST(PL_regmatch_slab); 10067 } 10068 DEBUG_STATE_r({ 10069 if (no_final) { 10070 DEBUG_STATE_pp("pop (no final)"); 10071 } else { 10072 DEBUG_STATE_pp("pop (yes)"); 10073 } 10074 }); 10075 depth--; 10076 } 10077#else 10078 while (yes_state < SLAB_FIRST(PL_regmatch_slab) 10079 || yes_state > SLAB_LAST(PL_regmatch_slab)) 10080 { 10081 /* not in this slab, pop slab */ 10082 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); 10083 PL_regmatch_slab = PL_regmatch_slab->prev; 10084 st = SLAB_LAST(PL_regmatch_slab); 10085 } 10086 depth -= (st - yes_state); 10087#endif 10088 st = yes_state; 10089 yes_state = st->u.yes.prev_yes_state; 10090 PL_regmatch_state = st; 10091 10092 if (no_final) { 10093 locinput= st->locinput; 10094 loceol= st->loceol; 10095 script_run_begin = st->sr0; 10096 } 10097 state_num = st->resume_state + no_final; 10098 goto reenter_switch; 10099 } 10100 10101 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n", 10102 PL_colors[4], PL_colors[5])); 10103 10104 if (reginfo->info_aux_eval) { 10105 /* each successfully executed (?{...}) block does the equivalent of 10106 * local $^R = do {...} 10107 * When popping the save stack, all these locals would be undone; 10108 * bypass this by setting the outermost saved $^R to the latest 10109 * value */ 10110 /* I don't know if this is needed or works properly now. 10111 * see code related to PL_replgv elsewhere in this file. 10112 * Yves 10113 */ 10114 if (oreplsv != GvSV(PL_replgv)) { 10115 sv_setsv(oreplsv, GvSV(PL_replgv)); 10116 SvSETMAGIC(oreplsv); 10117 } 10118 } 10119 result = 1; 10120 goto final_exit; 10121 10122 no: 10123 DEBUG_EXECUTE_r( 10124 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n", 10125 depth, 10126 PL_colors[4], PL_colors[5]) 10127 ); 10128 10129 no_silent: 10130 if (no_final) { 10131 if (yes_state) { 10132 goto yes; 10133 } else { 10134 goto final_exit; 10135 } 10136 } 10137 if (depth) { 10138 /* there's a previous state to backtrack to */ 10139 st--; 10140 if (st < SLAB_FIRST(PL_regmatch_slab)) { 10141 PL_regmatch_slab = PL_regmatch_slab->prev; 10142 st = SLAB_LAST(PL_regmatch_slab); 10143 } 10144 PL_regmatch_state = st; 10145 locinput= st->locinput; 10146 loceol= st->loceol; 10147 script_run_begin = st->sr0; 10148 10149 DEBUG_STATE_pp("pop"); 10150 depth--; 10151 if (yes_state == st) 10152 yes_state = st->u.yes.prev_yes_state; 10153 10154 state_num = st->resume_state + 1; /* failure = success + 1 */ 10155 PERL_ASYNC_CHECK(); 10156 goto reenter_switch; 10157 } 10158 result = 0; 10159 10160 final_exit: 10161 if (rex->intflags & PREGf_VERBARG_SEEN) { 10162 SV *sv_err = get_sv("REGERROR", 1); 10163 SV *sv_mrk = get_sv("REGMARK", 1); 10164 if (result) { 10165 sv_commit = &PL_sv_no; 10166 if (!sv_yes_mark) 10167 sv_yes_mark = &PL_sv_yes; 10168 } else { 10169 if (!sv_commit) 10170 sv_commit = &PL_sv_yes; 10171 sv_yes_mark = &PL_sv_no; 10172 } 10173 assert(sv_err); 10174 assert(sv_mrk); 10175 sv_setsv(sv_err, sv_commit); 10176 sv_setsv(sv_mrk, sv_yes_mark); 10177 } 10178 10179 10180 if (last_pushed_cv) { 10181 dSP; 10182 /* see "Some notes about MULTICALL" above */ 10183 POP_MULTICALL; 10184 PERL_UNUSED_VAR(SP); 10185 } 10186 else 10187 LEAVE_SCOPE(orig_savestack_ix); 10188 10189 assert(!result || locinput - reginfo->strbeg >= 0); 10190 return result ? locinput - reginfo->strbeg : -1; 10191} 10192 10193/* 10194 - regrepeat - repeatedly match something simple, report how many 10195 * 10196 * What 'simple' means is a node which can be the operand of a quantifier like 10197 * '+', or {1,3} 10198 * 10199 * startposp - pointer to a pointer to the start position. This is updated 10200 * to point to the byte following the highest successful 10201 * match. 10202 * p - the regnode to be repeatedly matched against. 10203 * loceol - pointer to the end position beyond which we aren't supposed to 10204 * look. 10205 * reginfo - struct holding match state, such as utf8_target 10206 * max - maximum number of things to match. 10207 * depth - (for debugging) backtracking depth. 10208 */ 10209STATIC I32 10210S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, 10211 char * loceol, regmatch_info *const reginfo, I32 max comma_pDEPTH) 10212{ 10213 char *scan; /* Pointer to current position in target string */ 10214 I32 c; 10215 char *this_eol = loceol; /* potentially adjusted version. */ 10216 I32 hardcount = 0; /* How many matches so far */ 10217 bool utf8_target = reginfo->is_utf8_target; 10218 unsigned int to_complement = 0; /* Invert the result? */ 10219 char_class_number_ classnum; 10220 10221 PERL_ARGS_ASSERT_REGREPEAT; 10222 10223 /* This routine is structured so that we switch on the input OP. Each OP 10224 * case: statement contains a loop to repeatedly apply the OP, advancing 10225 * the input until it fails, or reaches the end of the input, or until it 10226 * reaches the upper limit of matches. */ 10227 10228 scan = *startposp; 10229 if (max == REG_INFTY) /* This is a special marker to go to the platform's 10230 max */ 10231 max = I32_MAX; 10232 else if (! utf8_target && this_eol - scan > max) 10233 this_eol = scan + max; 10234 10235 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> 10236 * down to the maximum of how far we should go in it (but leaving it set to 10237 * the real end if the maximum permissible would take us beyond that). 10238 * This allows us to make the loop exit condition that we haven't gone past 10239 * <this_eol> to also mean that we haven't exceeded the max permissible 10240 * count, saving a test each time through the loop. But it assumes that 10241 * the OP matches a single byte, which is true for most of the OPs below 10242 * when applied to a non-UTF-8 target. Those relatively few OPs that don't 10243 * have this characteristic have to compensate. 10244 * 10245 * There is no such adjustment for UTF-8 targets, since the number of bytes 10246 * per character can vary. OPs will have to test both that the count is 10247 * less than the max permissible (using <hardcount> to keep track), and 10248 * that we are still within the bounds of the string (using <this_eol>. A 10249 * few OPs match a single byte no matter what the encoding. They can omit 10250 * the max test if, for the UTF-8 case, they do the adjustment that was 10251 * skipped above. 10252 * 10253 * Thus, the code above sets things up for the common case; and exceptional 10254 * cases need extra work; the common case is to make sure <scan> doesn't go 10255 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the 10256 * count doesn't exceed the maximum permissible */ 10257 10258 switch (with_t_UTF8ness(OP(p), utf8_target)) { 10259 SV * anyofh_list; 10260 10261 case REG_ANY_t8: 10262 while (scan < this_eol && hardcount < max && *scan != '\n') { 10263 scan += UTF8SKIP(scan); 10264 hardcount++; 10265 } 10266 break; 10267 10268 case REG_ANY_tb: 10269 scan = (char *) memchr(scan, '\n', this_eol - scan); 10270 if (! scan) { 10271 scan = this_eol; 10272 } 10273 break; 10274 10275 case SANY_t8: 10276 while (scan < this_eol && hardcount < max) { 10277 scan += UTF8SKIP(scan); 10278 hardcount++; 10279 } 10280 break; 10281 10282 case SANY_tb: 10283 scan = this_eol; 10284 break; 10285 10286 case EXACT_REQ8_tb: 10287 case LEXACT_REQ8_tb: 10288 case EXACTFU_REQ8_tb: 10289 break; 10290 10291 case EXACTL_t8: 10292 if (UTF8_IS_ABOVE_LATIN1(*scan)) { 10293 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol); 10294 } 10295 /* FALLTHROUGH */ 10296 10297 case EXACTL_tb: 10298 case EXACTFL_t8: 10299 case EXACTFL_tb: 10300 case EXACTFLU8_t8: 10301 case EXACTFLU8_tb: 10302 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10303 /* FALLTHROUGH */ 10304 10305 case EXACT_REQ8_t8: 10306 case LEXACT_REQ8_t8: 10307 case EXACTFU_REQ8_t8: 10308 case LEXACT_t8: 10309 case LEXACT_tb: 10310 case EXACT_t8: 10311 case EXACT_tb: 10312 case EXACTF_t8: 10313 case EXACTF_tb: 10314 case EXACTFAA_NO_TRIE_t8: 10315 case EXACTFAA_NO_TRIE_tb: 10316 case EXACTFAA_t8: 10317 case EXACTFAA_tb: 10318 case EXACTFU_t8: 10319 case EXACTFU_tb: 10320 case EXACTFUP_t8: 10321 case EXACTFUP_tb: 10322 10323 { 10324 struct next_matchable_info Binfo; 10325 PERL_UINT_FAST8_T definitive_len; 10326 10327 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); 10328 10329 /* Set up termination info, and quit if we can rule out that we've 10330 * gotten a match of the termination criteria */ 10331 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo) 10332 || scan + Binfo.min_length > this_eol 10333 || ! S_test_EXACTISH_ST(scan, Binfo)) 10334 { 10335 break; 10336 } 10337 10338 definitive_len = Binfo.initial_definitive; 10339 10340 /* Here there are potential matches, and the first byte(s) matched our 10341 * filter 10342 * 10343 * If we got a definitive match of some initial bytes, there is no 10344 * possibility of false positives as far as it got */ 10345 if (definitive_len > 0) { 10346 10347 /* If as far as it got is the maximum possible, there were no false 10348 * positives at all. Since we have everything set up, see how many 10349 * repeats there are. */ 10350 if (definitive_len >= Binfo.max_length) { 10351 10352 /* We've already found one match */ 10353 scan += definitive_len; 10354 hardcount++; 10355 10356 /* If want more than the one match, and there is room for more, 10357 * see if there are any */ 10358 if (hardcount < max && scan + definitive_len <= this_eol) { 10359 10360 /* If the character is only a single byte long, just span 10361 * all such bytes. */ 10362 if (definitive_len == 1) { 10363 const char * orig_scan = scan; 10364 10365 if (this_eol - (scan - hardcount) > max) { 10366 this_eol = scan - hardcount + max; 10367 } 10368 10369 /* Use different routines depending on whether it's an 10370 * exact match or matches with a mask */ 10371 if (Binfo.initial_exact == 1) { 10372 scan = (char *) find_span_end((U8 *) scan, 10373 (U8 *) this_eol, 10374 Binfo.matches[0]); 10375 } 10376 else { 10377 scan = (char *) find_span_end_mask( 10378 (U8 *) scan, 10379 (U8 *) this_eol, 10380 Binfo.first_byte_anded, 10381 Binfo.first_byte_mask); 10382 } 10383 10384 hardcount += scan - orig_scan; 10385 } 10386 else { /* Here, the full character definitive match is more 10387 than one byte */ 10388 while ( hardcount < max 10389 && scan + definitive_len <= this_eol 10390 && S_test_EXACTISH_ST(scan, Binfo)) 10391 { 10392 scan += definitive_len; 10393 hardcount++; 10394 } 10395 } 10396 } 10397 10398 break; 10399 } /* End of a full character is definitively matched */ 10400 10401 /* Here, an initial portion of the character matched definitively, 10402 * and the rest matched as well, but could have false positives */ 10403 10404 do { 10405 int i; 10406 U8 * matches = Binfo.matches; 10407 10408 /* The first bytes were definitive. Look at the remaining */ 10409 for (i = 0; i < Binfo.count; i++) { 10410 if (memEQ(scan + definitive_len, 10411 matches + definitive_len, 10412 Binfo.lengths[i] - definitive_len)) 10413 { 10414 goto found_a_completion; 10415 } 10416 10417 matches += Binfo.lengths[i]; 10418 } 10419 10420 /* Didn't find anything to complete our initial match. Stop 10421 * here */ 10422 break; 10423 10424 found_a_completion: 10425 10426 /* Here, matched a full character, Include it in the result, 10427 * and then look to see if the next char matches */ 10428 hardcount++; 10429 scan += Binfo.lengths[i]; 10430 10431 } while ( hardcount < max 10432 && scan + definitive_len < this_eol 10433 && S_test_EXACTISH_ST(scan, Binfo)); 10434 10435 /* Here, have advanced as far as possible */ 10436 break; 10437 } /* End of found some initial bytes that definitively matched */ 10438 10439 /* Here, we can't rule out that we have found the beginning of 'B', but 10440 * there were no initial bytes that could rule out anything 10441 * definitively. Use brute force to examine all the possibilities */ 10442 while (scan < this_eol && hardcount < max) { 10443 int i; 10444 U8 * matches = Binfo.matches; 10445 10446 for (i = 0; i < Binfo.count; i++) { 10447 if (memEQ(scan, matches, Binfo.lengths[i])) { 10448 goto found1; 10449 } 10450 10451 matches += Binfo.lengths[i]; 10452 } 10453 10454 break; 10455 10456 found1: 10457 hardcount++; 10458 scan += Binfo.lengths[i]; 10459 } 10460 10461 break; 10462 } 10463 10464 case ANYOFPOSIXL_t8: 10465 case ANYOFL_t8: 10466 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10467 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p); 10468 10469 /* FALLTHROUGH */ 10470 case ANYOFD_t8: 10471 case ANYOF_t8: 10472 while ( hardcount < max 10473 && scan < this_eol 10474 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) 10475 { 10476 scan += UTF8SKIP(scan); 10477 hardcount++; 10478 } 10479 break; 10480 10481 case ANYOFPOSIXL_tb: 10482 case ANYOFL_tb: 10483 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10484 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p); 10485 /* FALLTHROUGH */ 10486 10487 case ANYOFD_tb: 10488 case ANYOF_tb: 10489 if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) { 10490 while ( scan < this_eol 10491 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0)) 10492 scan++; 10493 } 10494 else { 10495 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan))) 10496 scan++; 10497 } 10498 break; 10499 10500 case ANYOFM_t8: 10501 if (this_eol - scan > max) { 10502 10503 /* We didn't adjust <this_eol> at the beginning of this routine 10504 * because is UTF-8, but it is actually ok to do so, since here, to 10505 * match, 1 char == 1 byte. */ 10506 this_eol = scan + max; 10507 } 10508 /* FALLTHROUGH */ 10509 10510 case ANYOFM_tb: 10511 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, 10512 (U8) ARG1u(p), FLAGS(p)); 10513 break; 10514 10515 case NANYOFM_t8: 10516 while ( hardcount < max 10517 && scan < this_eol 10518 && (*scan & FLAGS(p)) != ARG1u(p)) 10519 { 10520 scan += UTF8SKIP(scan); 10521 hardcount++; 10522 } 10523 break; 10524 10525 case NANYOFM_tb: 10526 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, 10527 (U8) ARG1u(p), FLAGS(p)); 10528 break; 10529 10530 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */ 10531 case ANYOFHb_tb: 10532 case ANYOFHbbm_tb: 10533 case ANYOFHr_tb: 10534 case ANYOFHs_tb: 10535 break; 10536 10537 case ANYOFH_t8: 10538 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10539 while ( hardcount < max 10540 && scan < this_eol 10541 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10542 && _invlist_contains_cp(anyofh_list, 10543 utf8_to_uvchr_buf((U8 *) scan, 10544 (U8 *) this_eol, 10545 NULL))) 10546 { 10547 scan += UTF8SKIP(scan); 10548 hardcount++; 10549 } 10550 break; 10551 10552 case ANYOFHb_t8: 10553 /* we know the first byte must be the FLAGS field */ 10554 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10555 while ( hardcount < max 10556 && scan < this_eol 10557 && (U8) *scan == ANYOF_FLAGS(p) 10558 && _invlist_contains_cp(anyofh_list, 10559 utf8_to_uvchr_buf((U8 *) scan, 10560 (U8 *) this_eol, 10561 NULL))) 10562 { 10563 scan += UTF8SKIP(scan); 10564 hardcount++; 10565 } 10566 break; 10567 10568 case ANYOFHbbm_t8: 10569 while ( hardcount < max 10570 && scan + 1 < this_eol 10571 && (U8) *scan == ANYOF_FLAGS(p) 10572 && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap, 10573 (U8) scan[1] & UTF_CONTINUATION_MASK)) 10574 { 10575 scan += 2; /* This node only matces 2-byte UTF-8 */ 10576 hardcount++; 10577 } 10578 break; 10579 10580 case ANYOFHr_t8: 10581 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10582 while ( hardcount < max 10583 && scan < this_eol 10584 && inRANGE(NATIVE_UTF8_TO_I8(*scan), 10585 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), 10586 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) 10587 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10588 && _invlist_contains_cp(anyofh_list, 10589 utf8_to_uvchr_buf((U8 *) scan, 10590 (U8 *) this_eol, 10591 NULL))) 10592 { 10593 scan += UTF8SKIP(scan); 10594 hardcount++; 10595 } 10596 break; 10597 10598 case ANYOFHs_t8: 10599 anyofh_list = GET_ANYOFH_INVLIST(prog, p); 10600 while ( hardcount < max 10601 && scan + FLAGS(p) < this_eol 10602 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p)) 10603 && _invlist_contains_cp(anyofh_list, 10604 utf8_to_uvchr_buf((U8 *) scan, 10605 (U8 *) this_eol, 10606 NULL))) 10607 { 10608 scan += UTF8SKIP(scan); 10609 hardcount++; 10610 } 10611 break; 10612 10613 case ANYOFR_t8: 10614 while ( hardcount < max 10615 && scan < this_eol 10616 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) 10617 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 10618 (U8 *) this_eol, 10619 NULL), 10620 ANYOFRbase(p), ANYOFRdelta(p))) 10621 { 10622 scan += UTF8SKIP(scan); 10623 hardcount++; 10624 } 10625 break; 10626 10627 case ANYOFR_tb: 10628 while ( hardcount < max 10629 && scan < this_eol 10630 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 10631 { 10632 scan++; 10633 hardcount++; 10634 } 10635 break; 10636 10637 case ANYOFRb_t8: 10638 while ( hardcount < max 10639 && scan < this_eol 10640 && (U8) *scan == ANYOF_FLAGS(p) 10641 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, 10642 (U8 *) this_eol, 10643 NULL), 10644 ANYOFRbase(p), ANYOFRdelta(p))) 10645 { 10646 scan += UTF8SKIP(scan); 10647 hardcount++; 10648 } 10649 break; 10650 10651 case ANYOFRb_tb: 10652 while ( hardcount < max 10653 && scan < this_eol 10654 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) 10655 { 10656 scan++; 10657 hardcount++; 10658 } 10659 break; 10660 10661 /* The argument (FLAGS) to all the POSIX node types is the class number */ 10662 10663 case NPOSIXL_tb: 10664 to_complement = 1; 10665 /* FALLTHROUGH */ 10666 10667 case POSIXL_tb: 10668 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 10669 while ( scan < this_eol 10670 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) 10671 { 10672 scan++; 10673 } 10674 break; 10675 10676 case NPOSIXL_t8: 10677 to_complement = 1; 10678 /* FALLTHROUGH */ 10679 10680 case POSIXL_t8: 10681 while ( hardcount < max && scan < this_eol 10682 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), 10683 (U8 *) scan, 10684 (U8 *) this_eol))) 10685 { 10686 scan += UTF8SKIP(scan); 10687 hardcount++; 10688 } 10689 break; 10690 10691 case POSIXD_tb: 10692 /* FALLTHROUGH */ 10693 10694 case POSIXA_t8: 10695 if (this_eol - scan > max) { 10696 10697 /* We didn't adjust <this_eol> at the beginning of this routine 10698 * because is UTF-8, but it is actually ok to do so, since here, to 10699 * match, 1 char == 1 byte. */ 10700 this_eol = scan + max; 10701 } 10702 /* FALLTHROUGH */ 10703 10704 case POSIXA_tb: 10705 while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) { 10706 scan++; 10707 } 10708 break; 10709 10710 case NPOSIXD_tb: 10711 /* FALLTHROUGH */ 10712 10713 case NPOSIXA_tb: 10714 while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) { 10715 scan++; 10716 } 10717 break; 10718 10719 case NPOSIXA_t8: 10720 10721 /* The complement of something that matches only ASCII matches all 10722 * non-ASCII, plus everything in ASCII that isn't in the class. */ 10723 while ( hardcount < max && scan < this_eol 10724 && ( ! isASCII_utf8_safe(scan, loceol) 10725 || ! generic_isCC_A_((U8) *scan, FLAGS(p)))) 10726 { 10727 scan += UTF8SKIP(scan); 10728 hardcount++; 10729 } 10730 break; 10731 10732 case NPOSIXU_tb: 10733 to_complement = 1; 10734 /* FALLTHROUGH */ 10735 10736 case POSIXU_tb: 10737 while ( scan < this_eol 10738 && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p)))) 10739 { 10740 scan++; 10741 } 10742 break; 10743 10744 case NPOSIXU_t8: 10745 case NPOSIXD_t8: 10746 to_complement = 1; 10747 /* FALLTHROUGH */ 10748 10749 case POSIXD_t8: 10750 case POSIXU_t8: 10751 classnum = (char_class_number_) FLAGS(p); 10752 switch (classnum) { 10753 default: 10754 while ( hardcount < max && scan < this_eol 10755 && to_complement 10756 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum], 10757 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL)))) 10758 { 10759 scan += UTF8SKIP(scan); 10760 hardcount++; 10761 } 10762 break; 10763 10764 /* For the classes below, the knowledge of how to handle every code 10765 * point is compiled into Perl via a macro. This code is written 10766 * for making the loops as tight as possible. It could be 10767 * refactored to save space instead. */ 10768 10769 case CC_ENUM_SPACE_: 10770 while ( hardcount < max 10771 && scan < this_eol 10772 && (to_complement 10773 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol)))) 10774 { 10775 scan += UTF8SKIP(scan); 10776 hardcount++; 10777 } 10778 break; 10779 case CC_ENUM_BLANK_: 10780 while ( hardcount < max 10781 && scan < this_eol 10782 && (to_complement 10783 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol)))) 10784 { 10785 scan += UTF8SKIP(scan); 10786 hardcount++; 10787 } 10788 break; 10789 case CC_ENUM_XDIGIT_: 10790 while ( hardcount < max 10791 && scan < this_eol 10792 && (to_complement 10793 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol)))) 10794 { 10795 scan += UTF8SKIP(scan); 10796 hardcount++; 10797 } 10798 break; 10799 case CC_ENUM_VERTSPACE_: 10800 while ( hardcount < max 10801 && scan < this_eol 10802 && (to_complement 10803 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol)))) 10804 { 10805 scan += UTF8SKIP(scan); 10806 hardcount++; 10807 } 10808 break; 10809 case CC_ENUM_CNTRL_: 10810 while ( hardcount < max 10811 && scan < this_eol 10812 && (to_complement 10813 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol)))) 10814 { 10815 scan += UTF8SKIP(scan); 10816 hardcount++; 10817 } 10818 break; 10819 } 10820 break; 10821 10822 case LNBREAK_t8: 10823 while ( hardcount < max && scan < this_eol 10824 && (c=is_LNBREAK_utf8_safe(scan, this_eol))) 10825 { 10826 scan += c; 10827 hardcount++; 10828 } 10829 break; 10830 10831 case LNBREAK_tb: 10832 /* LNBREAK can match one or two latin chars, which is ok, but we have 10833 * to use hardcount in this situation, and throw away the adjustment to 10834 * <this_eol> done before the switch statement */ 10835 while ( 10836 hardcount < max && scan < loceol 10837 && (c = is_LNBREAK_latin1_safe(scan, loceol)) 10838 ) { 10839 scan += c; 10840 hardcount++; 10841 } 10842 break; 10843 10844 default: 10845 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized" 10846 " node type %d='%s'", OP(p), REGNODE_NAME(OP(p))); 10847 NOT_REACHED; /* NOTREACHED */ 10848 10849 } 10850 10851 if (hardcount) 10852 c = hardcount; 10853 else 10854 c = scan - *startposp; 10855 *startposp = scan; 10856 10857 DEBUG_r({ 10858 DECLARE_AND_GET_RE_DEBUG_FLAGS; 10859 DEBUG_EXECUTE_r({ 10860 SV * const prop = sv_newmortal(); 10861 regprop(prog, prop, p, reginfo, NULL); 10862 Perl_re_exec_indentf( aTHX_ 10863 "%s can match %" IVdf " times out of %" IVdf "...\n", 10864 depth, SvPVX_const(prop),(IV)c,(IV)max); 10865 }); 10866 }); 10867 10868 return(c); 10869} 10870 10871/* 10872 - reginclass - determine if a character falls into a character class 10873 10874 n is the ANYOF-type regnode 10875 p is the target string 10876 p_end points to one byte beyond the end of the target string 10877 utf8_target tells whether p is in UTF-8. 10878 10879 Returns true if matched; false otherwise. 10880 10881 Note that this can be a synthetic start class, a combination of various 10882 nodes, so things you think might be mutually exclusive, such as locale, 10883 aren't. It can match both locale and non-locale 10884 10885 */ 10886 10887STATIC bool 10888S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) 10889{ 10890 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs)) 10891 ? 0 10892 : ANYOF_FLAGS(n); 10893 bool match = FALSE; 10894 UV c = *p; 10895 10896 PERL_ARGS_ASSERT_REGINCLASS; 10897 10898 /* If c is not already the code point, get it. Note that 10899 * UTF8_IS_INVARIANT() works even if not in UTF-8 */ 10900 if (! UTF8_IS_INVARIANT(c) && utf8_target) { 10901 STRLEN c_len = 0; 10902 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT; 10903 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY); 10904 if (c_len == (STRLEN)-1) { 10905 _force_out_malformed_utf8_message(p, p_end, 10906 utf8n_flags, 10907 1 /* 1 means die */ ); 10908 NOT_REACHED; /* NOTREACHED */ 10909 } 10910 if ( c > 255 10911 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL) 10912 && ! (flags & ANYOFL_UTF8_LOCALE_REQD)) 10913 { 10914 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); 10915 } 10916 } 10917 10918 /* If this character is potentially in the bitmap, check it */ 10919 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) { 10920 if (ANYOF_BITMAP_TEST(n, c)) 10921 match = TRUE; 10922 else if ( (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) 10923 && OP(n) == ANYOFD 10924 && ! utf8_target 10925 && ! isASCII(c)) 10926 { 10927 match = TRUE; 10928 } 10929 else if (flags & ANYOF_LOCALE_FLAGS) { 10930 if ( (flags & ANYOFL_FOLD) 10931 && c < 256 10932 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) 10933 { 10934 match = TRUE; 10935 } 10936 else if ( ANYOF_POSIXL_TEST_ANY_SET(n) 10937 && c <= U8_MAX /* param to isFOO_lc() */ 10938 ) { 10939 /* The data structure is arranged so bits 0, 2, 4, ... are set 10940 * if the class includes the Posix character class given by 10941 * bit/2; and 1, 3, 5, ... are set if the class includes the 10942 * complemented Posix class given by int(bit/2), so the 10943 * remainder modulo 2 tells us if to complement or not. 10944 * 10945 * Note that this code assumes that all the classes are closed 10946 * under folding. For example, if a character matches \w, then 10947 * its fold does too; and vice versa. This should be true for 10948 * any well-behaved locale for all the currently defined Posix 10949 * classes, except for :lower: and :upper:, which are handled 10950 * by the pseudo-class :cased: which matches if either of the 10951 * other two does. To get rid of this assumption, an outer 10952 * loop could be used below to iterate over both the source 10953 * character, and its fold (if different) */ 10954 10955 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n); 10956 10957 do { 10958 /* Find the next set bit indicating a class to try matching 10959 * against */ 10960 U8 bit_pos = lsbit_pos32(posixl_bits); 10961 10962 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) { 10963 match = TRUE; 10964 break; 10965 } 10966 10967 /* Remove this class from consideration; repeat */ 10968 POSIXL_CLEAR(posixl_bits, bit_pos); 10969 } while(posixl_bits != 0); 10970 } 10971 } 10972 } 10973 10974 /* If the bitmap didn't (or couldn't) match, and something outside the 10975 * bitmap could match, try that. */ 10976 if (!match) { 10977 if ( c >= NUM_ANYOF_CODE_POINTS 10978 && ANYOF_ONLY_HAS_BITMAP(n) 10979 && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)) 10980 { 10981 /* In this case, the ARG is set up so that the final bit indicates 10982 * whether it matches or not */ 10983 match = ARG1u(n) & 1; 10984 } 10985 else 10986 /* Here, the main way it could match is if the code point is 10987 * outside the bitmap and an inversion list exists to handle such 10988 * things. The other ways all occur when a flag is set to indicate 10989 * we need special handling. That handling doesn't come in to 10990 * effect for ANYOFD nodes unless the target string is UTF-8 and 10991 * that matters to code point being matched. */ 10992 if ( c >= NUM_ANYOF_CODE_POINTS 10993 || ( (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) 10994 && ( UNLIKELY(OP(n) != ANYOFD) 10995 || (utf8_target && ! isASCII_uvchr(c) 10996# if NUM_ANYOF_CODE_POINTS > 256 10997 && c < 256 10998# endif 10999 )))) 11000 { 11001 /* Here, we have an inversion list for outside-the-bitmap code 11002 * points and/or the flag is set indicating special handling is 11003 * needed. The flag is set when there is auxiliary data beyond the 11004 * normal inversion list, or if there is the possibility of a 11005 * special Turkic locale match, even without auxiliary data. 11006 * 11007 * First check if there is an inversion list and/or auxiliary data. 11008 * */ 11009 if (ANYOF_HAS_AUX(n)) { 11010 SV* only_utf8_locale = NULL; 11011 11012 /* This call will return in 'definition' the union of the base 11013 * non-bitmap inversion list, if any, plus the deferred 11014 * definitions of user-defined properties, if any. It croaks 11015 * if there is such a property but which still has no definition 11016 * available */ 11017 SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, 11018 &only_utf8_locale, NULL); 11019 if (definition) { 11020 /* Most likely is the outside-the-bitmap inversion list. */ 11021 if (_invlist_contains_cp(definition, c)) { 11022 match = TRUE; 11023 } 11024 else /* Failing that, hardcode the two tests for a Turkic 11025 locale */ 11026 if ( UNLIKELY(IN_UTF8_TURKIC_LOCALE) 11027 && isALPHA_FOLD_EQ(*p, 'i')) 11028 { 11029 /* Turkish locales have these hard-coded rules 11030 * overriding normal ones */ 11031 if (*p == 'i') { 11032 if (_invlist_contains_cp(definition, 11033 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 11034 { 11035 match = TRUE; 11036 } 11037 } 11038 else if (_invlist_contains_cp(definition, 11039 LATIN_SMALL_LETTER_DOTLESS_I)) 11040 { 11041 match = TRUE; 11042 } 11043 } 11044 } 11045 11046 if ( UNLIKELY(only_utf8_locale) 11047 && UNLIKELY(IN_UTF8_CTYPE_LOCALE) 11048 && ! match) 11049 { 11050 match = _invlist_contains_cp(only_utf8_locale, c); 11051 } 11052 } 11053 11054 /* In a Turkic locale under folding, hard-code the I i case pair 11055 * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES 11056 * flag set unless [Ii] were match possibilities */ 11057 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE) && ! match) { 11058 if (utf8_target) { 11059 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 11060 if (ANYOF_BITMAP_TEST(n, 'i')) { 11061 match = TRUE; 11062 } 11063 } 11064 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) { 11065 if (ANYOF_BITMAP_TEST(n, 'I')) { 11066 match = TRUE; 11067 } 11068 } 11069 } 11070 11071#if NUM_ANYOF_CODE_POINTS > 256 11072 /* Larger bitmap means these special cases aren't handled 11073 * outside the bitmap above. */ 11074 if (*p == 'i') { 11075 if (ANYOF_BITMAP_TEST(n, 11076 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) 11077 { 11078 match = TRUE; 11079 } 11080 } 11081 else if (*p == 'I') { 11082 if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) { 11083 match = TRUE; 11084 } 11085 } 11086#endif 11087 } 11088 } 11089 11090 if ( UNICODE_IS_SUPER(c) 11091 && (flags & ANYOF_WARN_SUPER__shared) 11092 && OP(n) != ANYOFD 11093 && ckWARN_d(WARN_NON_UNICODE)) 11094 { 11095 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 11096 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c); 11097 } 11098 } 11099 11100#if ANYOF_INVERT != 1 11101 /* Depending on compiler optimization cBOOL takes time, so if don't have to 11102 * use it, don't */ 11103# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, 11104#endif 11105 11106 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ 11107 return (flags & ANYOF_INVERT) ^ match; 11108} 11109 11110STATIC U8 * 11111S_reghop3(U8 *s, SSize_t off, const U8* lim) 11112{ 11113 /* return the position 'off' UTF-8 characters away from 's', forward if 11114 * 'off' >= 0, backwards if negative. But don't go outside of position 11115 * 'lim', which better be < s if off < 0 */ 11116 11117 PERL_ARGS_ASSERT_REGHOP3; 11118 11119 if (off >= 0) { 11120 while (off-- && s < lim) { 11121 /* XXX could check well-formedness here */ 11122 U8 *new_s = s + UTF8SKIP(s); 11123 if (new_s > lim) /* lim may be in the middle of a long character */ 11124 return s; 11125 s = new_s; 11126 } 11127 } 11128 else { 11129 while (off++ && s > lim) { 11130 s--; 11131 if (UTF8_IS_CONTINUED(*s)) { 11132 while (s > lim && UTF8_IS_CONTINUATION(*s)) 11133 s--; 11134 if (! UTF8_IS_START(*s)) { 11135 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 11136 } 11137 } 11138 /* XXX could check well-formedness here */ 11139 } 11140 } 11141 return s; 11142} 11143 11144STATIC U8 * 11145S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) 11146{ 11147 PERL_ARGS_ASSERT_REGHOP4; 11148 11149 if (off >= 0) { 11150 while (off-- && s < rlim) { 11151 /* XXX could check well-formedness here */ 11152 s += UTF8SKIP(s); 11153 } 11154 } 11155 else { 11156 while (off++ && s > llim) { 11157 s--; 11158 if (UTF8_IS_CONTINUED(*s)) { 11159 while (s > llim && UTF8_IS_CONTINUATION(*s)) 11160 s--; 11161 if (! UTF8_IS_START(*s)) { 11162 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 11163 } 11164 } 11165 /* XXX could check well-formedness here */ 11166 } 11167 } 11168 return s; 11169} 11170 11171/* like reghop3, but returns NULL on overrun, rather than returning last 11172 * char pos */ 11173 11174STATIC U8 * 11175S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim) 11176{ 11177 PERL_ARGS_ASSERT_REGHOPMAYBE3; 11178 11179 if (off >= 0) { 11180 while (off-- && s < lim) { 11181 /* XXX could check well-formedness here */ 11182 s += UTF8SKIP(s); 11183 } 11184 if (off >= 0) 11185 return NULL; 11186 } 11187 else { 11188 while (off++ && s > lim) { 11189 s--; 11190 if (UTF8_IS_CONTINUED(*s)) { 11191 while (s > lim && UTF8_IS_CONTINUATION(*s)) 11192 s--; 11193 if (! UTF8_IS_START(*s)) { 11194 Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); 11195 } 11196 } 11197 /* XXX could check well-formedness here */ 11198 } 11199 if (off <= 0) 11200 return NULL; 11201 } 11202 return s; 11203} 11204 11205 11206/* when executing a regex that may have (?{}), extra stuff needs setting 11207 up that will be visible to the called code, even before the current 11208 match has finished. In particular: 11209 11210 * $_ is localised to the SV currently being matched; 11211 * pos($_) is created if necessary, ready to be updated on each call-out 11212 to code; 11213 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm 11214 isn't set until the current pattern is successfully finished), so that 11215 $1 etc of the match-so-far can be seen; 11216 * save the old values of subbeg etc of the current regex, and set then 11217 to the current string (again, this is normally only done at the end 11218 of execution) 11219*/ 11220 11221static void 11222S_setup_eval_state(pTHX_ regmatch_info *const reginfo) 11223{ 11224 MAGIC *mg; 11225 regexp *const rex = ReANY(reginfo->prog); 11226 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; 11227 11228 eval_state->rex = rex; 11229 eval_state->sv = reginfo->sv; 11230 11231 if (reginfo->sv) { 11232 /* Make $_ available to executed code. */ 11233 if (reginfo->sv != DEFSV) { 11234 SAVE_DEFSV; 11235 DEFSV_set(reginfo->sv); 11236 } 11237 /* will be dec'd by S_cleanup_regmatch_info_aux */ 11238 SvREFCNT_inc_NN(reginfo->sv); 11239 11240 if (!(mg = mg_find_mglob(reginfo->sv))) { 11241 /* prepare for quick setting of pos */ 11242 mg = sv_magicext_mglob(reginfo->sv); 11243 mg->mg_len = -1; 11244 } 11245 eval_state->pos_magic = mg; 11246 eval_state->pos = mg->mg_len; 11247 eval_state->pos_flags = mg->mg_flags; 11248 } 11249 else 11250 eval_state->pos_magic = NULL; 11251 11252 if (!PL_reg_curpm) { 11253 /* PL_reg_curpm is a fake PMOP that we can attach the current 11254 * regex to and point PL_curpm at, so that $1 et al are visible 11255 * within a /(?{})/. It's just allocated once per interpreter the 11256 * first time its needed */ 11257 Newxz(PL_reg_curpm, 1, PMOP); 11258#ifdef USE_ITHREADS 11259 { 11260 SV* const repointer = &PL_sv_undef; 11261 /* this regexp is also owned by the new PL_reg_curpm, which 11262 will try to free it. */ 11263 av_push(PL_regex_padav, repointer); 11264 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav); 11265 PL_regex_pad = AvARRAY(PL_regex_padav); 11266 } 11267#endif 11268 } 11269 SET_reg_curpm(reginfo->prog); 11270 eval_state->curpm = PL_curpm; 11271 PL_curpm_under = PL_curpm; 11272 PL_curpm = PL_reg_curpm; 11273 if (RXp_MATCH_COPIED(rex)) { 11274 /* Here is a serious problem: we cannot rewrite subbeg, 11275 since it may be needed if this match fails. Thus 11276 $` inside (?{}) could fail... */ 11277 eval_state->subbeg = RXp_SUBBEG(rex); 11278 eval_state->sublen = RXp_SUBLEN(rex); 11279 eval_state->suboffset = RXp_SUBOFFSET(rex); 11280 eval_state->subcoffset = RXp_SUBCOFFSET(rex); 11281#ifdef PERL_ANY_COW 11282 eval_state->saved_copy = RXp_SAVED_COPY(rex); 11283#endif 11284 RXp_MATCH_COPIED_off(rex); 11285 } 11286 else 11287 eval_state->subbeg = NULL; 11288 RXp_SUBBEG(rex) = (char *)reginfo->strbeg; 11289 RXp_SUBOFFSET(rex) = 0; 11290 RXp_SUBCOFFSET(rex) = 0; 11291 RXp_SUBLEN(rex) = reginfo->strend - reginfo->strbeg; 11292} 11293 11294 11295/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ 11296 11297static void 11298S_cleanup_regmatch_info_aux(pTHX_ void *arg) 11299{ 11300 regmatch_info_aux *aux = (regmatch_info_aux *) arg; 11301 regmatch_info_aux_eval *eval_state = aux->info_aux_eval; 11302 regmatch_slab *s; 11303 11304 Safefree(aux->poscache); 11305 11306 if (eval_state) { 11307 11308 /* undo the effects of S_setup_eval_state() */ 11309 11310 if (eval_state->subbeg) { 11311 regexp * const rex = eval_state->rex; 11312 RXp_SUBBEG(rex) = eval_state->subbeg; 11313 RXp_SUBLEN(rex) = eval_state->sublen; 11314 RXp_SUBOFFSET(rex) = eval_state->suboffset; 11315 RXp_SUBCOFFSET(rex) = eval_state->subcoffset; 11316#ifdef PERL_ANY_COW 11317 RXp_SAVED_COPY(rex) = eval_state->saved_copy; 11318#endif 11319 RXp_MATCH_COPIED_on(rex); 11320 } 11321 if (eval_state->pos_magic) 11322 { 11323 eval_state->pos_magic->mg_len = eval_state->pos; 11324 eval_state->pos_magic->mg_flags = 11325 (eval_state->pos_magic->mg_flags & ~MGf_BYTES) 11326 | (eval_state->pos_flags & MGf_BYTES); 11327 } 11328 11329 PL_curpm = eval_state->curpm; 11330 SvREFCNT_dec(eval_state->sv); 11331 } 11332 11333 PL_regmatch_state = aux->old_regmatch_state; 11334 PL_regmatch_slab = aux->old_regmatch_slab; 11335 11336 /* free all slabs above current one - this must be the last action 11337 * of this function, as aux and eval_state are allocated within 11338 * slabs and may be freed here */ 11339 11340 s = PL_regmatch_slab->next; 11341 if (s) { 11342 PL_regmatch_slab->next = NULL; 11343 while (s) { 11344 regmatch_slab * const osl = s; 11345 s = s->next; 11346 Safefree(osl); 11347 } 11348 } 11349} 11350 11351 11352STATIC void 11353S_to_utf8_substr(pTHX_ regexp *prog) 11354{ 11355 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile 11356 * on the converted value */ 11357 11358 int i = 1; 11359 11360 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; 11361 11362 do { 11363 if (prog->substrs->data[i].substr 11364 && !prog->substrs->data[i].utf8_substr) { 11365 SV* const sv = newSVsv(prog->substrs->data[i].substr); 11366 prog->substrs->data[i].utf8_substr = sv; 11367 sv_utf8_upgrade(sv); 11368 if (SvVALID(prog->substrs->data[i].substr)) { 11369 if (SvTAIL(prog->substrs->data[i].substr)) { 11370 /* Trim the trailing \n that fbm_compile added last 11371 time. */ 11372 SvCUR_set(sv, SvCUR(sv) - 1); 11373 /* Whilst this makes the SV technically "invalid" (as its 11374 buffer is no longer followed by "\0") when fbm_compile() 11375 adds the "\n" back, a "\0" is restored. */ 11376 fbm_compile(sv, FBMcf_TAIL); 11377 } else 11378 fbm_compile(sv, 0); 11379 } 11380 if (prog->substrs->data[i].substr == prog->check_substr) 11381 prog->check_utf8 = sv; 11382 } 11383 } while (i--); 11384} 11385 11386STATIC bool 11387S_to_byte_substr(pTHX_ regexp *prog) 11388{ 11389 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile 11390 * on the converted value; returns FALSE if can't be converted. */ 11391 11392 int i = 1; 11393 11394 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; 11395 11396 do { 11397 if (prog->substrs->data[i].utf8_substr 11398 && !prog->substrs->data[i].substr) { 11399 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); 11400 if (! sv_utf8_downgrade(sv, TRUE)) { 11401 SvREFCNT_dec_NN(sv); 11402 return FALSE; 11403 } 11404 if (SvVALID(prog->substrs->data[i].utf8_substr)) { 11405 if (SvTAIL(prog->substrs->data[i].utf8_substr)) { 11406 /* Trim the trailing \n that fbm_compile added last 11407 time. */ 11408 SvCUR_set(sv, SvCUR(sv) - 1); 11409 fbm_compile(sv, FBMcf_TAIL); 11410 } else 11411 fbm_compile(sv, 0); 11412 } 11413 prog->substrs->data[i].substr = sv; 11414 if (prog->substrs->data[i].utf8_substr == prog->check_utf8) 11415 prog->check_substr = sv; 11416 } 11417 } while (i--); 11418 11419 return TRUE; 11420} 11421 11422#ifndef PERL_IN_XSUB_RE 11423 11424bool 11425Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) 11426{ 11427 /* Temporary helper function for toke.c. Verify that the code point 'cp' 11428 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in 11429 * the larger string bounded by 'strbeg' and 'strend'. 11430 * 11431 * 'cp' needs to be assigned (if not, a future version of the Unicode 11432 * Standard could make it something that combines with adjacent characters, 11433 * so code using it would then break), and there has to be a GCB break 11434 * before and after the character. */ 11435 11436 11437 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; 11438 const U8 * prev_cp_start; 11439 11440 PERL_ARGS_ASSERT_IS_GRAPHEME; 11441 11442 if ( UNLIKELY(UNICODE_IS_SUPER(cp)) 11443 || UNLIKELY(UNICODE_IS_NONCHAR(cp))) 11444 { 11445 /* These are considered graphemes */ 11446 return TRUE; 11447 } 11448 11449 /* Otherwise, unassigned code points are forbidden */ 11450 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST( 11451 _invlist_search(PL_Assigned_invlist, cp)))) 11452 { 11453 return FALSE; 11454 } 11455 11456 cp_gcb_val = getGCB_VAL_CP(cp); 11457 11458 /* Find the GCB value of the previous code point in the input */ 11459 prev_cp_start = utf8_hop_back(s, -1, strbeg); 11460 if (UNLIKELY(prev_cp_start == s)) { 11461 prev_cp_gcb_val = GCB_EDGE; 11462 } 11463 else { 11464 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend); 11465 } 11466 11467 /* And check that is a grapheme boundary */ 11468 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s, 11469 TRUE /* is UTF-8 encoded */ )) 11470 { 11471 return FALSE; 11472 } 11473 11474 /* Similarly verify there is a break between the current character and the 11475 * following one */ 11476 s += UTF8SKIP(s); 11477 if (s >= strend) { 11478 next_cp_gcb_val = GCB_EDGE; 11479 } 11480 else { 11481 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend); 11482 } 11483 11484 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE); 11485} 11486 11487/* 11488=for apidoc_section $unicode 11489 11490=for apidoc isSCRIPT_RUN 11491 11492Returns a bool as to whether or not the sequence of bytes from C<s> up to but 11493not including C<send> form a "script run". C<utf8_target> is TRUE iff the 11494sequence starting at C<s> is to be treated as UTF-8. To be precise, except for 11495two degenerate cases given below, this function returns TRUE iff all code 11496points in it come from any combination of three "scripts" given by the Unicode 11497"Script Extensions" property: Common, Inherited, and possibly one other. 11498Additionally all decimal digits must come from the same consecutive sequence of 1149910. 11500 11501For example, if all the characters in the sequence are Greek, or Common, or 11502Inherited, this function will return TRUE, provided any decimal digits in it 11503are from the same block of digits in Common. (These are the ASCII digits 11504"0".."9" and additionally a block for full width forms of these, and several 11505others used in mathematical notation.) For scripts (unlike Greek) that have 11506their own digits defined this will accept either digits from that set or from 11507one of the Common digit sets, but not a combination of the two. Some scripts, 11508such as Arabic, have more than one set of digits. All digits must come from 11509the same set for this function to return TRUE. 11510 11511C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE 11512contain the script found, using the C<SCX_enum> typedef. Its value will be 11513C<SCX_INVALID> if the function returns FALSE. 11514 11515If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for) 11516will be C<SCX_INVALID>. 11517 11518If the sequence contains a single code point which is unassigned to a character 11519in the version of Unicode being used, the function will return TRUE, and the 11520script will be C<SCX_Unknown>. Any other combination of unassigned code points 11521in the input sequence will result in the function treating the input as not 11522being a script run. 11523 11524The returned script will be C<SCX_Inherited> iff all the code points in it are 11525from the Inherited script. 11526 11527Otherwise, the returned script will be C<SCX_Common> iff all the code points in 11528it are from the Inherited or Common scripts. 11529 11530=cut 11531 11532*/ 11533 11534bool 11535Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) 11536{ 11537 /* Basically, it looks at each character in the sequence to see if the 11538 * above conditions are met; if not it fails. It uses an inversion map to 11539 * find the enum corresponding to the script of each character. But this 11540 * is complicated by the fact that a few code points can be in any of 11541 * several scripts. The data has been constructed so that there are 11542 * additional enum values (all negative) for these situations. The 11543 * absolute value of those is an index into another table which contains 11544 * pointers to auxiliary tables for each such situation. Each aux array 11545 * lists all the scripts for the given situation. There is another, 11546 * parallel, table that gives the number of entries in each aux table. 11547 * These are all defined in charclass_invlists.h */ 11548 11549 /* XXX Here are the additional things UTS 39 says could be done: 11550 * 11551 * Forbid sequences of the same nonspacing mark 11552 * 11553 * Check to see that all the characters are in the sets of exemplar 11554 * characters for at least one language in the Unicode Common Locale Data 11555 * Repository [CLDR]. */ 11556 11557 11558 /* Things that match /\d/u */ 11559 SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_]; 11560 UV * decimals_array = invlist_array(decimals_invlist); 11561 11562 /* What code point is the digit '0' of the script run? (0 meaning FALSE if 11563 * not currently known) */ 11564 UV zero_of_run = 0; 11565 11566 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */ 11567 SCX_enum script_of_char = SCX_INVALID; 11568 11569 /* If the script remains not fully determined from iteration to iteration, 11570 * this is the current intersection of the possiblities. */ 11571 SCX_enum * intersection = NULL; 11572 PERL_UINT_FAST8_T intersection_len = 0; 11573 11574 bool retval = TRUE; 11575 SCX_enum * ret_script = NULL; 11576 11577 assert(send >= s); 11578 11579 PERL_ARGS_ASSERT_ISSCRIPT_RUN; 11580 11581 /* All code points in 0..255 are either Common or Latin, so must be a 11582 * script run. We can return immediately unless we need to know which 11583 * script it is. */ 11584 if (! utf8_target && LIKELY(send > s)) { 11585 if (ret_script == NULL) { 11586 return TRUE; 11587 } 11588 11589 /* If any character is Latin, the run is Latin */ 11590 while (s < send) { 11591 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) { 11592 *ret_script = SCX_Latin; 11593 return TRUE; 11594 } 11595 } 11596 11597 /* Here, all are Common */ 11598 *ret_script = SCX_Common; 11599 return TRUE; 11600 } 11601 11602 /* Look at each character in the sequence */ 11603 while (s < send) { 11604 /* If the current character being examined is a digit, this is the code 11605 * point of the zero for its sequence of 10 */ 11606 UV zero_of_char; 11607 11608 UV cp; 11609 11610 /* The code allows all scripts to use the ASCII digits. This is 11611 * because they are in the Common script. Hence any ASCII ones found 11612 * are ok, unless and until a digit from another set has already been 11613 * encountered. digit ranges in Common are not similarly blessed) */ 11614 if (UNLIKELY(isDIGIT(*s))) { 11615 if (UNLIKELY(script_of_run == SCX_Unknown)) { 11616 retval = FALSE; 11617 break; 11618 } 11619 if (zero_of_run) { 11620 if (zero_of_run != '0') { 11621 retval = FALSE; 11622 break; 11623 } 11624 } 11625 else { 11626 zero_of_run = '0'; 11627 } 11628 s++; 11629 continue; 11630 } 11631 11632 /* Here, isn't an ASCII digit. Find the code point of the character */ 11633 if (! UTF8_IS_INVARIANT(*s)) { 11634 Size_t len; 11635 cp = valid_utf8_to_uvchr((U8 *) s, &len); 11636 s += len; 11637 } 11638 else { 11639 cp = *(s++); 11640 } 11641 11642 /* If is within the range [+0 .. +9] of the script's zero, it also is a 11643 * digit in that script. We can skip the rest of this code for this 11644 * character. */ 11645 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) { 11646 continue; 11647 } 11648 11649 /* Find the character's script. The correct values are hard-coded here 11650 * for small-enough code points. */ 11651 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely 11652 unlikely to change */ 11653 if ( cp > 255 11654 || ( isALPHA_L1(cp) 11655 && LIKELY(cp != MICRO_SIGN_NATIVE))) 11656 { 11657 script_of_char = SCX_Latin; 11658 } 11659 else { 11660 script_of_char = SCX_Common; 11661 } 11662 } 11663 else { 11664 script_of_char = _Perl_SCX_invmap[ 11665 _invlist_search(PL_SCX_invlist, cp)]; 11666 } 11667 11668 /* We arbitrarily accept a single unassigned character, but not in 11669 * combination with anything else, and not a run of them. */ 11670 if ( UNLIKELY(script_of_run == SCX_Unknown) 11671 || UNLIKELY( script_of_run != SCX_INVALID 11672 && script_of_char == SCX_Unknown)) 11673 { 11674 retval = FALSE; 11675 break; 11676 } 11677 11678 /* For the first character, or the run is inherited, the run's script 11679 * is set to the char's */ 11680 if ( UNLIKELY(script_of_run == SCX_INVALID) 11681 || UNLIKELY(script_of_run == SCX_Inherited)) 11682 { 11683 script_of_run = script_of_char; 11684 } 11685 11686 /* For the character's script to be Unknown, it must be the first 11687 * character in the sequence (for otherwise a test above would have 11688 * prevented us from reaching here), and we have set the run's script 11689 * to it. Nothing further to be done for this character */ 11690 if (UNLIKELY(script_of_char == SCX_Unknown)) { 11691 continue; 11692 } 11693 11694 /* We accept 'inherited' script characters currently even at the 11695 * beginning. (We know that no characters in Inherited are digits, or 11696 * we'd have to check for that) */ 11697 if (UNLIKELY(script_of_char == SCX_Inherited)) { 11698 continue; 11699 } 11700 11701 /* If the run so far is Common, and the new character isn't, change the 11702 * run's script to that of this character */ 11703 if (script_of_run == SCX_Common && script_of_char != SCX_Common) { 11704 script_of_run = script_of_char; 11705 } 11706 11707 /* Now we can see if the script of the new character is the same as 11708 * that of the run */ 11709 if (LIKELY(script_of_char == script_of_run)) { 11710 /* By far the most common case */ 11711 goto scripts_match; 11712 } 11713 11714 /* Here, the script of the run isn't Common. But characters in Common 11715 * match any script */ 11716 if (script_of_char == SCX_Common) { 11717 goto scripts_match; 11718 } 11719 11720#ifndef HAS_SCX_AUX_TABLES 11721 11722 /* Too early a Unicode version to have a code point belonging to more 11723 * than one script, so, if the scripts don't exactly match, fail */ 11724 PERL_UNUSED_VAR(intersection_len); 11725 retval = FALSE; 11726 break; 11727 11728#else 11729 11730 /* Here there is no exact match between the character's script and the 11731 * run's. And we've handled the special cases of scripts Unknown, 11732 * Inherited, and Common. 11733 * 11734 * Negative script numbers signify that the value may be any of several 11735 * scripts, and we need to look at auxiliary information to make our 11736 * determination. But if both are non-negative, we can fail now */ 11737 if (LIKELY(script_of_char >= 0)) { 11738 const SCX_enum * search_in; 11739 PERL_UINT_FAST8_T search_in_len; 11740 PERL_UINT_FAST8_T i; 11741 11742 if (LIKELY(script_of_run >= 0)) { 11743 retval = FALSE; 11744 break; 11745 } 11746 11747 /* Use the previously constructed set of possible scripts, if any. 11748 * */ 11749 if (intersection) { 11750 search_in = intersection; 11751 search_in_len = intersection_len; 11752 } 11753 else { 11754 search_in = SCX_AUX_TABLE_ptrs[-script_of_run]; 11755 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run]; 11756 } 11757 11758 for (i = 0; i < search_in_len; i++) { 11759 if (search_in[i] == script_of_char) { 11760 script_of_run = script_of_char; 11761 goto scripts_match; 11762 } 11763 } 11764 11765 retval = FALSE; 11766 break; 11767 } 11768 else if (LIKELY(script_of_run >= 0)) { 11769 /* script of character could be one of several, but run is a single 11770 * script */ 11771 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char]; 11772 const PERL_UINT_FAST8_T search_in_len 11773 = SCX_AUX_TABLE_lengths[-script_of_char]; 11774 PERL_UINT_FAST8_T i; 11775 11776 for (i = 0; i < search_in_len; i++) { 11777 if (search_in[i] == script_of_run) { 11778 script_of_char = script_of_run; 11779 goto scripts_match; 11780 } 11781 } 11782 11783 retval = FALSE; 11784 break; 11785 } 11786 else { 11787 /* Both run and char could be in one of several scripts. If the 11788 * intersection is empty, then this character isn't in this script 11789 * run. Otherwise, we need to calculate the intersection to use 11790 * for future iterations of the loop, unless we are already at the 11791 * final character */ 11792 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char]; 11793 const PERL_UINT_FAST8_T char_len 11794 = SCX_AUX_TABLE_lengths[-script_of_char]; 11795 const SCX_enum * search_run; 11796 PERL_UINT_FAST8_T run_len; 11797 11798 SCX_enum * new_overlap = NULL; 11799 PERL_UINT_FAST8_T i, j; 11800 11801 if (intersection) { 11802 search_run = intersection; 11803 run_len = intersection_len; 11804 } 11805 else { 11806 search_run = SCX_AUX_TABLE_ptrs[-script_of_run]; 11807 run_len = SCX_AUX_TABLE_lengths[-script_of_run]; 11808 } 11809 11810 intersection_len = 0; 11811 11812 for (i = 0; i < run_len; i++) { 11813 for (j = 0; j < char_len; j++) { 11814 if (search_run[i] == search_char[j]) { 11815 11816 /* Here, the script at i,j matches. That means this 11817 * character is in the run. But continue on to find 11818 * the complete intersection, for the next loop 11819 * iteration, and for the digit check after it. 11820 * 11821 * On the first found common script, we malloc space 11822 * for the intersection list for the worst case of the 11823 * intersection, which is the minimum of the number of 11824 * scripts remaining in each set. */ 11825 if (intersection_len == 0) { 11826 Newx(new_overlap, 11827 MIN(run_len - i, char_len - j), 11828 SCX_enum); 11829 } 11830 new_overlap[intersection_len++] = search_run[i]; 11831 } 11832 } 11833 } 11834 11835 /* Here we've looked through everything. If they have no scripts 11836 * in common, not a run */ 11837 if (intersection_len == 0) { 11838 retval = FALSE; 11839 break; 11840 } 11841 11842 /* If there is only a single script in common, set to that. 11843 * Otherwise, use the intersection going forward */ 11844 Safefree(intersection); 11845 intersection = NULL; 11846 if (intersection_len == 1) { 11847 script_of_run = script_of_char = new_overlap[0]; 11848 Safefree(new_overlap); 11849 new_overlap = NULL; 11850 } 11851 else { 11852 intersection = new_overlap; 11853 } 11854 } 11855 11856#endif 11857 11858 scripts_match: 11859 11860 /* Here, the script of the character is compatible with that of the 11861 * run. That means that in most cases, it continues the script run. 11862 * Either it and the run match exactly, or one or both can be in any of 11863 * several scripts, and the intersection is not empty. However, if the 11864 * character is a decimal digit, it could still mean failure if it is 11865 * from the wrong sequence of 10. So, we need to look at if it's a 11866 * digit. We've already handled the 10 digits [0-9], and the next 11867 * lowest one is this one: */ 11868 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) { 11869 continue; /* Not a digit; this character is part of the run */ 11870 } 11871 11872 /* If we have a definitive '0' for the script of this character, we 11873 * know that for this to be a digit, it must be in the range of +0..+9 11874 * of that zero. */ 11875 if ( script_of_char >= 0 11876 && (zero_of_char = script_zeros[script_of_char])) 11877 { 11878 if (! withinCOUNT(cp, zero_of_char, 9)) { 11879 continue; /* Not a digit; this character is part of the run 11880 */ 11881 } 11882 11883 } 11884 else { /* Need to look up if this character is a digit or not */ 11885 SSize_t index_of_zero_of_char; 11886 index_of_zero_of_char = _invlist_search(decimals_invlist, cp); 11887 if ( UNLIKELY(index_of_zero_of_char < 0) 11888 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char)) 11889 { 11890 continue; /* Not a digit; this character is part of the run. 11891 */ 11892 } 11893 11894 zero_of_char = decimals_array[index_of_zero_of_char]; 11895 } 11896 11897 /* Here, the character is a decimal digit, and the zero of its sequence 11898 * of 10 is in 'zero_of_char'. If we already have a zero for this run, 11899 * they better be the same. */ 11900 if (zero_of_run) { 11901 if (zero_of_run != zero_of_char) { 11902 retval = FALSE; 11903 break; 11904 } 11905 } 11906 else { /* Otherwise we now have a zero for this run */ 11907 zero_of_run = zero_of_char; 11908 } 11909 } /* end of looping through CLOSESR text */ 11910 11911 Safefree(intersection); 11912 11913 if (ret_script != NULL) { 11914 if (retval) { 11915 *ret_script = script_of_run; 11916 } 11917 else { 11918 *ret_script = SCX_INVALID; 11919 } 11920 } 11921 11922 return retval; 11923} 11924#endif /* ifndef PERL_IN_XSUB_RE */ 11925 11926/* Buffer logic. */ 11927SV* 11928Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, 11929 const U32 flags) 11930{ 11931 PERL_ARGS_ASSERT_REG_NAMED_BUFF; 11932 11933 PERL_UNUSED_ARG(value); 11934 11935 if (flags & RXapif_FETCH) { 11936 return reg_named_buff_fetch(rx, key, flags); 11937 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { 11938 Perl_croak_no_modify(); 11939 return NULL; 11940 } else if (flags & RXapif_EXISTS) { 11941 return reg_named_buff_exists(rx, key, flags) 11942 ? &PL_sv_yes 11943 : &PL_sv_no; 11944 } else if (flags & RXapif_REGNAMES) { 11945 return reg_named_buff_all(rx, flags); 11946 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { 11947 return reg_named_buff_scalar(rx, flags); 11948 } else { 11949 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); 11950 return NULL; 11951 } 11952} 11953 11954SV* 11955Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, 11956 const U32 flags) 11957{ 11958 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; 11959 PERL_UNUSED_ARG(lastkey); 11960 11961 if (flags & RXapif_FIRSTKEY) 11962 return reg_named_buff_firstkey(rx, flags); 11963 else if (flags & RXapif_NEXTKEY) 11964 return reg_named_buff_nextkey(rx, flags); 11965 else { 11966 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", 11967 (int)flags); 11968 return NULL; 11969 } 11970} 11971 11972SV* 11973Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, 11974 const U32 flags) 11975{ 11976 SV *ret; 11977 struct regexp *const rx = ReANY(r); 11978 11979 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; 11980 11981 if (rx && RXp_PAREN_NAMES(rx)) { 11982 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); 11983 if (he_str) { 11984 IV i; 11985 SV* sv_dat=HeVAL(he_str); 11986 I32 *nums=(I32*)SvPVX(sv_dat); 11987 AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL; 11988 for ( i=0; i<SvIVX(sv_dat); i++ ) { 11989 if ((I32)(rx->nparens) >= nums[i] 11990 && RXp_OFFS_VALID(rx,nums[i])) 11991 { 11992 ret = newSVpvs(""); 11993 Perl_reg_numbered_buff_fetch_flags(aTHX_ r, nums[i], ret, REG_FETCH_ABSOLUTE); 11994 if (!retarray) 11995 return ret; 11996 } else { 11997 if (retarray) 11998 ret = newSV_type(SVt_NULL); 11999 } 12000 if (retarray) 12001 av_push_simple(retarray, ret); 12002 } 12003 if (retarray) 12004 return newRV_noinc(MUTABLE_SV(retarray)); 12005 } 12006 } 12007 return NULL; 12008} 12009 12010bool 12011Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, 12012 const U32 flags) 12013{ 12014 struct regexp *const rx = ReANY(r); 12015 12016 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; 12017 12018 if (rx && RXp_PAREN_NAMES(rx)) { 12019 if (flags & RXapif_ALL) { 12020 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); 12021 } else { 12022 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); 12023 if (sv) { 12024 SvREFCNT_dec_NN(sv); 12025 return TRUE; 12026 } else { 12027 return FALSE; 12028 } 12029 } 12030 } else { 12031 return FALSE; 12032 } 12033} 12034 12035SV* 12036Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) 12037{ 12038 struct regexp *const rx = ReANY(r); 12039 12040 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; 12041 12042 if ( rx && RXp_PAREN_NAMES(rx) ) { 12043 (void)hv_iterinit(RXp_PAREN_NAMES(rx)); 12044 12045 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); 12046 } else { 12047 return FALSE; 12048 } 12049} 12050 12051SV* 12052Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) 12053{ 12054 struct regexp *const rx = ReANY(r); 12055 DECLARE_AND_GET_RE_DEBUG_FLAGS; 12056 12057 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; 12058 12059 if (rx && RXp_PAREN_NAMES(rx)) { 12060 HV *hv = RXp_PAREN_NAMES(rx); 12061 HE *temphe; 12062 while ( (temphe = hv_iternext_flags(hv, 0)) ) { 12063 IV i; 12064 IV parno = 0; 12065 SV* sv_dat = HeVAL(temphe); 12066 I32 *nums = (I32*)SvPVX(sv_dat); 12067 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 12068 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] && 12069 RXp_OFFS_VALID(rx,nums[i])) 12070 { 12071 parno = nums[i]; 12072 break; 12073 } 12074 } 12075 if (parno || flags & RXapif_ALL) { 12076 return newSVhek(HeKEY_hek(temphe)); 12077 } 12078 } 12079 } 12080 return NULL; 12081} 12082 12083SV* 12084Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) 12085{ 12086 SV *ret; 12087 AV *av; 12088 SSize_t length; 12089 struct regexp *const rx = ReANY(r); 12090 12091 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; 12092 12093 if (rx && RXp_PAREN_NAMES(rx)) { 12094 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { 12095 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); 12096 } else if (flags & RXapif_ONE) { 12097 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); 12098 av = MUTABLE_AV(SvRV(ret)); 12099 length = av_count(av); 12100 SvREFCNT_dec_NN(ret); 12101 return newSViv(length); 12102 } else { 12103 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", 12104 (int)flags); 12105 return NULL; 12106 } 12107 } 12108 return &PL_sv_undef; 12109} 12110 12111SV* 12112Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) 12113{ 12114 struct regexp *const rx = ReANY(r); 12115 AV *av = newAV(); 12116 12117 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; 12118 12119 if (rx && RXp_PAREN_NAMES(rx)) { 12120 HV *hv= RXp_PAREN_NAMES(rx); 12121 HE *temphe; 12122 (void)hv_iterinit(hv); 12123 while ( (temphe = hv_iternext_flags(hv, 0)) ) { 12124 IV i; 12125 IV parno = 0; 12126 SV* sv_dat = HeVAL(temphe); 12127 I32 *nums = (I32*)SvPVX(sv_dat); 12128 for ( i = 0; i < SvIVX(sv_dat); i++ ) { 12129 if ((I32)(RXp_LASTPAREN(rx)) >= nums[i] && 12130 RXp_OFFS_VALID(rx,nums[i])) 12131 { 12132 parno = nums[i]; 12133 break; 12134 } 12135 } 12136 if (parno || flags & RXapif_ALL) { 12137 av_push_simple(av, newSVhek(HeKEY_hek(temphe))); 12138 } 12139 } 12140 } 12141 12142 return newRV_noinc(MUTABLE_SV(av)); 12143} 12144 12145void 12146Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const re, const I32 paren, 12147 SV * const sv) 12148{ 12149 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; 12150 Perl_reg_numbered_buff_fetch_flags(aTHX_ re, paren, sv, 0); 12151} 12152 12153#ifndef PERL_IN_XSUB_RE 12154 12155void 12156Perl_reg_numbered_buff_fetch_flags(pTHX_ REGEXP * const re, const I32 paren, 12157 SV * const sv, U32 flags) 12158{ 12159 struct regexp *const rx = ReANY(re); 12160 char *s = NULL; 12161 SSize_t i,t = 0; 12162 SSize_t s1, t1; 12163 I32 n = paren; 12164 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens; 12165 12166 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH_FLAGS; 12167 12168 if ( n == RX_BUFF_IDX_CARET_PREMATCH 12169 || n == RX_BUFF_IDX_CARET_FULLMATCH 12170 || n == RX_BUFF_IDX_CARET_POSTMATCH 12171 ) 12172 { 12173 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 12174 if (!keepcopy) { 12175 /* on something like 12176 * $r = qr/.../; 12177 * /$qr/p; 12178 * the KEEPCOPY is set on the PMOP rather than the regex */ 12179 if (PL_curpm && re == PM_GETRE(PL_curpm)) 12180 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 12181 } 12182 if (!keepcopy) 12183 goto ret_undef; 12184 } 12185 12186 if (!RXp_SUBBEG(rx)) 12187 goto ret_undef; 12188 12189 if (n == RX_BUFF_IDX_CARET_FULLMATCH) 12190 /* no need to distinguish between them any more */ 12191 n = RX_BUFF_IDX_FULLMATCH; 12192 12193 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) 12194 && (i = RXp_OFFS_START(rx,0)) != -1) 12195 { 12196 /* $`, ${^PREMATCH} */ 12197 s = RXp_SUBBEG(rx); 12198 } 12199 else 12200 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) 12201 && (t = RXp_OFFS_END(rx,0)) != -1) 12202 { 12203 /* $', ${^POSTMATCH} */ 12204 s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + t; 12205 i = RXp_SUBLEN(rx) + RXp_SUBOFFSET(rx) - t; 12206 } 12207 else /* when flags is true we do an absolute lookup, and compare against rx->nparens */ 12208 if (inRANGE(n, 0, flags ? (I32)rx->nparens : logical_nparens)) { 12209 I32 *map = (!flags && n) ? rx->logical_to_parno : NULL; 12210 I32 true_parno = map ? map[n] : n; 12211 do { 12212 if (((s1 = RXp_OFFS_START(rx,true_parno)) != -1) && 12213 ((t1 = RXp_OFFS_END(rx,true_parno)) != -1)) 12214 { 12215 /* $&, ${^MATCH}, $1 ... */ 12216 i = t1 - s1; 12217 s = RXp_SUBBEG(rx) + s1 - RXp_SUBOFFSET(rx); 12218 goto found_it; 12219 } 12220 else if (map) { 12221 true_parno = rx->parno_to_logical_next[true_parno]; 12222 } 12223 else { 12224 break; 12225 } 12226 } while (true_parno); 12227 goto ret_undef; 12228 } else { 12229 goto ret_undef; 12230 } 12231 12232 found_it: 12233 assert(s >= RXp_SUBBEG(rx)); 12234 assert((STRLEN)RXp_SUBLEN(rx) >= (STRLEN)((s - RXp_SUBBEG(rx)) + i) ); 12235 if (i >= 0) { 12236#ifdef NO_TAINT_SUPPORT 12237 sv_setpvn(sv, s, i); 12238#else 12239 const int oldtainted = TAINT_get; 12240 TAINT_NOT; 12241 sv_setpvn(sv, s, i); 12242 TAINT_set(oldtainted); 12243#endif 12244 if (RXp_MATCH_UTF8(rx)) 12245 SvUTF8_on(sv); 12246 else 12247 SvUTF8_off(sv); 12248 if (TAINTING_get) { 12249 if (RXp_MATCH_TAINTED(rx)) { 12250 if (SvTYPE(sv) >= SVt_PVMG) { 12251 MAGIC* const mg = SvMAGIC(sv); 12252 MAGIC* mgt; 12253 TAINT; 12254 SvMAGIC_set(sv, mg->mg_moremagic); 12255 SvTAINT(sv); 12256 if ((mgt = SvMAGIC(sv))) { 12257 mg->mg_moremagic = mgt; 12258 SvMAGIC_set(sv, mg); 12259 } 12260 } else { 12261 TAINT; 12262 SvTAINT(sv); 12263 } 12264 } else 12265 SvTAINTED_off(sv); 12266 } 12267 } else { 12268 ret_undef: 12269 sv_set_undef(sv); 12270 return; 12271 } 12272} 12273 12274#endif 12275 12276void 12277Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, 12278 SV const * const value) 12279{ 12280 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; 12281 12282 PERL_UNUSED_ARG(rx); 12283 PERL_UNUSED_ARG(paren); 12284 PERL_UNUSED_ARG(value); 12285 12286 if (!PL_localizing) 12287 Perl_croak_no_modify(); 12288} 12289 12290I32 12291Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, 12292 const I32 paren) 12293{ 12294 struct regexp *const rx = ReANY(r); 12295 I32 i,j; 12296 I32 s1, t1; 12297 I32 logical_nparens = rx->logical_nparens ? rx->logical_nparens : rx->nparens; 12298 12299 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; 12300 12301 if ( paren == RX_BUFF_IDX_CARET_PREMATCH 12302 || paren == RX_BUFF_IDX_CARET_FULLMATCH 12303 || paren == RX_BUFF_IDX_CARET_POSTMATCH 12304 ) 12305 { 12306 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); 12307 if (!keepcopy) { 12308 /* on something like 12309 * $r = qr/.../; 12310 * /$qr/p; 12311 * the KEEPCOPY is set on the PMOP rather than the regex */ 12312 if (PL_curpm && r == PM_GETRE(PL_curpm)) 12313 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); 12314 } 12315 if (!keepcopy) 12316 goto warn_undef; 12317 } 12318 12319 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ 12320 switch (paren) { 12321 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ 12322 case RX_BUFF_IDX_PREMATCH: /* $` */ 12323 if ( (i = RXp_OFFS_START(rx,0)) != -1) { 12324 if (i > 0) { 12325 s1 = 0; 12326 t1 = i; 12327 goto getlen; 12328 } 12329 } 12330 return 0; 12331 12332 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ 12333 case RX_BUFF_IDX_POSTMATCH: /* $' */ 12334 if ( (j = RXp_OFFS_END(rx,0)) != -1 ) { 12335 i = RXp_SUBLEN(rx) - j; 12336 if (i > 0) { 12337 s1 = j; 12338 t1 = RXp_SUBLEN(rx); 12339 goto getlen; 12340 } 12341 } 12342 return 0; 12343 12344 default: /* $& / ${^MATCH}, $1, $2, ... */ 12345 if (paren <= logical_nparens) { 12346 I32 true_paren = rx->logical_to_parno 12347 ? rx->logical_to_parno[paren] 12348 : paren; 12349 do { 12350 if (((s1 = RXp_OFFS_START(rx,true_paren)) != -1) && 12351 ((t1 = RXp_OFFS_END(rx,true_paren)) != -1)) 12352 { 12353 i = t1 - s1; 12354 goto getlen; 12355 } else if (rx->parno_to_logical_next) { 12356 true_paren = rx->parno_to_logical_next[true_paren]; 12357 } else { 12358 break; 12359 } 12360 } while(true_paren); 12361 } 12362 warn_undef: 12363 if (ckWARN(WARN_UNINITIALIZED)) 12364 report_uninit((const SV *)sv); 12365 return 0; 12366 } 12367 getlen: 12368 if (i > 0 && RXp_MATCH_UTF8(rx)) { 12369 const char * const s = RXp_SUBBEG(rx) - RXp_SUBOFFSET(rx) + s1; 12370 const U8 *ep; 12371 STRLEN el; 12372 12373 i = t1 - s1; 12374 if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) 12375 i = el; 12376 } 12377 return i; 12378} 12379 12380/* 12381 * ex: set ts=8 sts=4 sw=4 et: 12382 */ 12383