1/* regcomp.c 2 */ 3 4/* 5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee 6 * 7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] 8 */ 9 10/* This file contains functions for compiling a regular expression. See 11 * also regexec.c which funnily enough, contains functions for executing 12 * a regular expression. 13 * 14 * This file is also copied at build time to ext/re/re_comp.c, where 15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 16 * This causes the main functions to be compiled under new names and with 17 * debugging support added, which makes "use re 'debug'" work. 18 */ 19 20/* NOTE: this is derived from Henry Spencer's regexp code, and should not 21 * confused with the original package (see point 3 below). Thanks, Henry! 22 */ 23 24/* Additional note: this code is very heavily munged from Henry's version 25 * in places. In some spots I've traded clarity for efficiency, so don't 26 * blame Henry for some of the lack of readability. 27 */ 28 29/* The names of the functions have been changed from regcomp and 30 * regexec to pregcomp and pregexec in order to avoid conflicts 31 * with the POSIX routines of the same names. 32*/ 33 34/* 35 * pregcomp and pregexec -- regsub and regerror are not used in perl 36 * 37 * Copyright (c) 1986 by University of Toronto. 38 * Written by Henry Spencer. Not derived from licensed software. 39 * 40 * Permission is granted to anyone to use this software for any 41 * purpose on any computer system, and to redistribute it freely, 42 * subject to the following restrictions: 43 * 44 * 1. The author is not responsible for the consequences of use of 45 * this software, no matter how awful, even if they arise 46 * from defects in it. 47 * 48 * 2. The origin of this software must not be misrepresented, either 49 * by explicit claim or by omission. 50 * 51 * 3. Altered versions must be plainly marked as such, and must not 52 * be misrepresented as being the original software. 53 * 54 * 55 **** Alterations to Henry's code are... 56 **** 57 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 58 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 59 **** by Larry Wall and others 60 **** 61 **** You may distribute under the terms of either the GNU General Public 62 **** License or the Artistic License, as specified in the README file. 63 64 * 65 * Beware that some of this code is subtly aware of the way operator 66 * precedence is structured in regular expressions. Serious changes in 67 * regular-expression syntax might require a total rethink. 68 */ 69 70/* Note on debug output: 71 * 72 * This is set up so that -Dr turns on debugging like all other flags that are 73 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to 74 * all regular expressions encountered in a program, and gives a huge amount of 75 * output for all but the shortest programs. 76 * 77 * The ability to output pattern debugging information lexically, and with much 78 * finer grained control was added, with 'use re qw(Debug ....);' available even 79 * in non-DEBUGGING builds. This is accomplished by copying the contents of 80 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c. 81 * Those files are compiled and linked into the perl executable, and they are 82 * compiled essentially as if DEBUGGING were enabled, and controlled by calls 83 * to re.pm. 84 * 85 * That would normally mean linking errors when two functions of the same name 86 * are attempted to be placed into the same executable. That is solved in one 87 * of four ways: 88 * 1) Static functions aren't known outside the file they are in, so for the 89 * many functions of that type in this file, it just isn't a problem. 90 * 2) Most externally known functions are enclosed in 91 * #ifndef PERL_IN_XSUB_RE 92 * ... 93 * #endif 94 * blocks, so there is only one definition for them in the whole 95 * executable, the one in regcomp.c (or regexec.c). The implication of 96 * that is any debugging info that comes from them is controlled only by 97 * -Dr. Further, any static function they call will also be the version 98 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr. 99 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to 100 * have different names, so that what gets loaded in the executable is 101 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function 102 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging 103 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo' 104 * versions and their callees are under control of re.pm. The catch is 105 * that references to all these go through the regexp_engine structure, 106 * which is initialized in regcomp.h to the Perl_foo versions, and 107 * substituted out in lexical scopes where 'use re' is in effect to the 108 * 'my_foo' ones. That structure is public API, so it would be a hard 109 * sell to add any additional members. 110 * 4) For functions in regcomp.c and re_comp.c that are called only from, 111 * respectively, regexec.c and re_exec.c, they can have two different 112 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and 113 * embed.fnc. 114 * 115 * The bottom line is that if you add code to one of the public functions 116 * listed in ext/re/re_top.h, debugging automagically works. But if you write 117 * a new function that needs to do debugging or there is a chain of calls from 118 * it that need to do debugging, all functions in the chain should use options 119 * 2) or 4) above. 120 * 121 * A function may have to be split so that debugging stuff is static, but it 122 * calls out to some other function that only gets compiled in regcomp.c to 123 * access data that we don't want to duplicate. 124 */ 125 126#ifdef PERL_EXT_RE_BUILD 127#include "re_top.h" 128#endif 129 130#include "EXTERN.h" 131#define PERL_IN_REGEX_ENGINE 132#define PERL_IN_REGCOMP_ANY 133#define PERL_IN_REGCOMP_C 134#include "perl.h" 135 136#ifdef PERL_IN_XSUB_RE 137# include "re_comp.h" 138EXTERN_C const struct regexp_engine my_reg_engine; 139EXTERN_C const struct regexp_engine wild_reg_engine; 140#else 141# include "regcomp.h" 142#endif 143 144#include "invlist_inline.h" 145#include "unicode_constants.h" 146#include "regcomp_internal.h" 147 148/* ========================================================= 149 * BEGIN edit_distance stuff. 150 * 151 * This calculates how many single character changes of any type are needed to 152 * transform a string into another one. It is taken from version 3.1 of 153 * 154 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS 155 */ 156 157/* Our unsorted dictionary linked list. */ 158/* Note we use UVs, not chars. */ 159 160struct dictionary{ 161 UV key; 162 UV value; 163 struct dictionary* next; 164}; 165typedef struct dictionary item; 166 167 168PERL_STATIC_INLINE item* 169push(UV key, item* curr) 170{ 171 item* head; 172 Newx(head, 1, item); 173 head->key = key; 174 head->value = 0; 175 head->next = curr; 176 return head; 177} 178 179 180PERL_STATIC_INLINE item* 181find(item* head, UV key) 182{ 183 item* iterator = head; 184 while (iterator){ 185 if (iterator->key == key){ 186 return iterator; 187 } 188 iterator = iterator->next; 189 } 190 191 return NULL; 192} 193 194PERL_STATIC_INLINE item* 195uniquePush(item* head, UV key) 196{ 197 item* iterator = head; 198 199 while (iterator){ 200 if (iterator->key == key) { 201 return head; 202 } 203 iterator = iterator->next; 204 } 205 206 return push(key, head); 207} 208 209PERL_STATIC_INLINE void 210dict_free(item* head) 211{ 212 item* iterator = head; 213 214 while (iterator) { 215 item* temp = iterator; 216 iterator = iterator->next; 217 Safefree(temp); 218 } 219 220 head = NULL; 221} 222 223/* End of Dictionary Stuff */ 224 225/* All calculations/work are done here */ 226STATIC int 227S_edit_distance(const UV* src, 228 const UV* tgt, 229 const STRLEN x, /* length of src[] */ 230 const STRLEN y, /* length of tgt[] */ 231 const SSize_t maxDistance 232) 233{ 234 item *head = NULL; 235 UV swapCount, swapScore, targetCharCount, i, j; 236 UV *scores; 237 UV score_ceil = x + y; 238 239 PERL_ARGS_ASSERT_EDIT_DISTANCE; 240 241 /* initialize matrix start values */ 242 Newx(scores, ( (x + 2) * (y + 2)), UV); 243 scores[0] = score_ceil; 244 scores[1 * (y + 2) + 0] = score_ceil; 245 scores[0 * (y + 2) + 1] = score_ceil; 246 scores[1 * (y + 2) + 1] = 0; 247 head = uniquePush(uniquePush(head, src[0]), tgt[0]); 248 249 /* work loops */ 250 /* i = src index */ 251 /* j = tgt index */ 252 for (i=1;i<=x;i++) { 253 if (i < x) 254 head = uniquePush(head, src[i]); 255 scores[(i+1) * (y + 2) + 1] = i; 256 scores[(i+1) * (y + 2) + 0] = score_ceil; 257 swapCount = 0; 258 259 for (j=1;j<=y;j++) { 260 if (i == 1) { 261 if(j < y) 262 head = uniquePush(head, tgt[j]); 263 scores[1 * (y + 2) + (j + 1)] = j; 264 scores[0 * (y + 2) + (j + 1)] = score_ceil; 265 } 266 267 targetCharCount = find(head, tgt[j-1])->value; 268 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount; 269 270 if (src[i-1] != tgt[j-1]){ 271 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1)); 272 } 273 else { 274 swapCount = j; 275 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); 276 } 277 } 278 279 find(head, src[i-1])->value = i; 280 } 281 282 { 283 IV score = scores[(x+1) * (y + 2) + (y + 1)]; 284 dict_free(head); 285 Safefree(scores); 286 return (maxDistance != 0 && maxDistance < score)?(-1):score; 287 } 288} 289 290/* END of edit_distance() stuff 291 * ========================================================= */ 292 293/* add a data member to the struct reg_data attached to this regex, it should 294 * always return a non-zero return. the 's' argument is the type of the items 295 * being added and the n is the number of items. The length of 's' should match 296 * the number of items. */ 297U32 298Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) 299{ 300 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1; 301 302 PERL_ARGS_ASSERT_REG_ADD_DATA; 303 304 /* in the below expression we have (count + n - 1), the minus one is there 305 * because the struct that we allocate already contains a slot for 1 data 306 * item, so we do not need to allocate it the first time. IOW, the 307 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need 308 * to allocate. See struct reg_data in regcomp.h 309 */ 310 Renewc(RExC_rxi->data, 311 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)), 312 char, struct reg_data); 313 /* however in the data->what expression we use (count + n) and do not 314 * subtract one from the result because the data structure contains a 315 * pointer to an array, and does not allocate the first element as part of 316 * the data struct. */ 317 if (count > 1) 318 Renew(RExC_rxi->data->what, (count + n), U8); 319 else { 320 /* when count == 1 it means we have not initialized anything. 321 * we always fill the 0 slot of the data array with a '%' entry, which 322 * means "zero" (all the other types are letters) which exists purely 323 * so the return from reg_add_data is ALWAYS true, so we can tell it apart 324 * from a "no value" idx=0 in places where we would return an index 325 * into reg_add_data. This is particularly important with the new "single 326 * pass, usually, but not always" strategy that we use, where the code 327 * will use a 0 to represent "not able to compute this yet". 328 */ 329 Newx(RExC_rxi->data->what, n+1, U8); 330 /* fill in the placeholder slot of 0 with a what of '%', we use 331 * this because it sorta looks like a zero (0/0) and it is not a letter 332 * like any of the other "whats", this type should never be created 333 * any other way but here. '%' happens to also not appear in this 334 * file for any other reason (at the time of writing this comment)*/ 335 RExC_rxi->data->what[0]= '%'; 336 RExC_rxi->data->data[0]= NULL; 337 } 338 RExC_rxi->data->count = count + n; 339 Copy(s, RExC_rxi->data->what + count, n, U8); 340 assert(count>0); 341 return count; 342} 343 344/*XXX: todo make this not included in a non debugging perl, but appears to be 345 * used anyway there, in 'use re' */ 346#ifndef PERL_IN_XSUB_RE 347void 348Perl_reginitcolors(pTHX) 349{ 350 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 351 if (s) { 352 char *t = savepv(s); 353 int i = 0; 354 PL_colors[0] = t; 355 while (++i < 6) { 356 t = strchr(t, '\t'); 357 if (t) { 358 *t = '\0'; 359 PL_colors[i] = ++t; 360 } 361 else 362 PL_colors[i] = t = (char *)""; 363 } 364 } else { 365 int i = 0; 366 while (i < 6) 367 PL_colors[i++] = (char *)""; 368 } 369 PL_colorset = 1; 370} 371#endif 372 373 374#ifdef TRIE_STUDY_OPT 375/* search for "restudy" in this file for a detailed explanation */ 376#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ 377 STMT_START { \ 378 if ( \ 379 (data.flags & SCF_TRIE_RESTUDY) \ 380 && ! restudied++ \ 381 ) { \ 382 dOsomething; \ 383 goto reStudy; \ 384 } \ 385 } STMT_END 386#else 387#define CHECK_RESTUDY_GOTO_butfirst 388#endif 389 390/* 391 * pregcomp - compile a regular expression into internal code 392 * 393 * Decides which engine's compiler to call based on the hint currently in 394 * scope 395 */ 396 397#ifndef PERL_IN_XSUB_RE 398 399/* return the currently in-scope regex engine (or the default if none) */ 400 401regexp_engine const * 402Perl_current_re_engine(pTHX) 403{ 404 if (IN_PERL_COMPILETIME) { 405 HV * const table = GvHV(PL_hintgv); 406 SV **ptr; 407 408 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) 409 return &PL_core_reg_engine; 410 ptr = hv_fetchs(table, "regcomp", FALSE); 411 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) 412 return &PL_core_reg_engine; 413 return INT2PTR(regexp_engine*, SvIV(*ptr)); 414 } 415 else { 416 SV *ptr; 417 if (!PL_curcop->cop_hints_hash) 418 return &PL_core_reg_engine; 419 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); 420 if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) 421 return &PL_core_reg_engine; 422 return INT2PTR(regexp_engine*, SvIV(ptr)); 423 } 424} 425 426 427REGEXP * 428Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) 429{ 430 regexp_engine const *eng = current_re_engine(); 431 DECLARE_AND_GET_RE_DEBUG_FLAGS; 432 433 PERL_ARGS_ASSERT_PREGCOMP; 434 435 /* Dispatch a request to compile a regexp to correct regexp engine. */ 436 DEBUG_COMPILE_r({ 437 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n", 438 PTR2UV(eng)); 439 }); 440 return CALLREGCOMP_ENG(eng, pattern, flags); 441} 442#endif 443 444/* 445=for apidoc re_compile 446 447Compile the regular expression pattern C<pattern>, returning a pointer to the 448compiled object for later matching with the internal regex engine. 449 450This function is typically used by a custom regexp engine C<.comp()> function 451to hand off to the core regexp engine those patterns it doesn't want to handle 452itself (typically passing through the same flags it was called with). In 453almost all other cases, a regexp should be compiled by calling L</C<pregcomp>> 454to compile using the currently active regexp engine. 455 456If C<pattern> is already a C<REGEXP>, this function does nothing but return a 457pointer to the input. Otherwise the PV is extracted and treated like a string 458representing a pattern. See L<perlre>. 459 460The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names 461all begin with C<RXf_>. 462 463=cut 464 465 * public entry point for the perl core's own regex compiling code. 466 * It's actually a wrapper for Perl_re_op_compile that only takes an SV 467 * pattern rather than a list of OPs, and uses the internal engine rather 468 * than the current one */ 469 470REGEXP * 471Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) 472{ 473 SV *pat = pattern; /* defeat constness! */ 474 475 PERL_ARGS_ASSERT_RE_COMPILE; 476 477 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, 478#ifdef PERL_IN_XSUB_RE 479 &my_reg_engine, 480#else 481 &PL_core_reg_engine, 482#endif 483 NULL, NULL, rx_flags, 0); 484} 485 486static void 487S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs) 488{ 489 int n; 490 491 if (--cbs->refcnt > 0) 492 return; 493 for (n = 0; n < cbs->count; n++) { 494 REGEXP *rx = cbs->cb[n].src_regex; 495 if (rx) { 496 cbs->cb[n].src_regex = NULL; 497 SvREFCNT_dec_NN(rx); 498 } 499 } 500 Safefree(cbs->cb); 501 Safefree(cbs); 502} 503 504 505static struct reg_code_blocks * 506S_alloc_code_blocks(pTHX_ int ncode) 507{ 508 struct reg_code_blocks *cbs; 509 Newx(cbs, 1, struct reg_code_blocks); 510 cbs->count = ncode; 511 cbs->refcnt = 1; 512 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs); 513 if (ncode) 514 Newx(cbs->cb, ncode, struct reg_code_block); 515 else 516 cbs->cb = NULL; 517 return cbs; 518} 519 520 521/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code 522 * blocks, recalculate the indices. Update pat_p and plen_p in-place to 523 * point to the realloced string and length. 524 * 525 * This is essentially a copy of Perl_bytes_to_utf8() with the code index 526 * stuff added */ 527 528static void 529S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, 530 char **pat_p, STRLEN *plen_p, int num_code_blocks) 531{ 532 U8 *const src = (U8*)*pat_p; 533 U8 *dst, *d; 534 int n=0; 535 STRLEN s = 0; 536 bool do_end = 0; 537 DECLARE_AND_GET_RE_DEBUG_FLAGS; 538 539 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 540 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 541 542 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ 543 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8); 544 d = dst; 545 546 while (s < *plen_p) { 547 append_utf8_from_native_byte(src[s], &d); 548 549 if (n < num_code_blocks) { 550 assert(pRExC_state->code_blocks); 551 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) { 552 pRExC_state->code_blocks->cb[n].start = d - dst - 1; 553 assert(*(d - 1) == '('); 554 do_end = 1; 555 } 556 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) { 557 pRExC_state->code_blocks->cb[n].end = d - dst - 1; 558 assert(*(d - 1) == ')'); 559 do_end = 0; 560 n++; 561 } 562 } 563 s++; 564 } 565 *d = '\0'; 566 *plen_p = d - dst; 567 *pat_p = (char*) dst; 568 SAVEFREEPV(*pat_p); 569 RExC_orig_utf8 = RExC_utf8 = 1; 570} 571 572 573 574/* S_concat_pat(): concatenate a list of args to the pattern string pat, 575 * while recording any code block indices, and handling overloading, 576 * nested qr// objects etc. If pat is null, it will allocate a new 577 * string, or just return the first arg, if there's only one. 578 * 579 * Returns the malloced/updated pat. 580 * patternp and pat_count is the array of SVs to be concatted; 581 * oplist is the optional list of ops that generated the SVs; 582 * recompile_p is a pointer to a boolean that will be set if 583 * the regex will need to be recompiled. 584 * delim, if non-null is an SV that will be inserted between each element 585 */ 586 587static SV* 588S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, 589 SV *pat, SV ** const patternp, int pat_count, 590 OP *oplist, bool *recompile_p, SV *delim) 591{ 592 SV **svp; 593 int n = 0; 594 bool use_delim = FALSE; 595 bool alloced = FALSE; 596 597 /* if we know we have at least two args, create an empty string, 598 * then concatenate args to that. For no args, return an empty string */ 599 if (!pat && pat_count != 1) { 600 pat = newSVpvs(""); 601 SAVEFREESV(pat); 602 alloced = TRUE; 603 } 604 605 for (svp = patternp; svp < patternp + pat_count; svp++) { 606 SV *sv; 607 SV *rx = NULL; 608 STRLEN orig_patlen = 0; 609 bool code = 0; 610 SV *msv = use_delim ? delim : *svp; 611 if (!msv) msv = &PL_sv_undef; 612 613 /* if we've got a delimiter, we go round the loop twice for each 614 * svp slot (except the last), using the delimiter the second 615 * time round */ 616 if (use_delim) { 617 svp--; 618 use_delim = FALSE; 619 } 620 else if (delim) 621 use_delim = TRUE; 622 623 if (SvTYPE(msv) == SVt_PVAV) { 624 /* we've encountered an interpolated array within 625 * the pattern, e.g. /...@a..../. Expand the list of elements, 626 * then recursively append elements. 627 * The code in this block is based on S_pushav() */ 628 629 AV *const av = (AV*)msv; 630 const SSize_t maxarg = AvFILL(av) + 1; 631 SV **array; 632 633 if (oplist) { 634 assert(oplist->op_type == OP_PADAV 635 || oplist->op_type == OP_RV2AV); 636 oplist = OpSIBLING(oplist); 637 } 638 639 if (SvRMAGICAL(av)) { 640 SSize_t i; 641 642 Newx(array, maxarg, SV*); 643 SAVEFREEPV(array); 644 for (i=0; i < maxarg; i++) { 645 SV ** const svp = av_fetch(av, i, FALSE); 646 array[i] = svp ? *svp : &PL_sv_undef; 647 } 648 } 649 else 650 array = AvARRAY(av); 651 652 if (maxarg > 0) { 653 pat = S_concat_pat(aTHX_ pRExC_state, pat, 654 array, maxarg, NULL, recompile_p, 655 /* $" */ 656 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); 657 } 658 else if (!pat) { 659 pat = newSVpvs_flags("", SVs_TEMP); 660 } 661 662 continue; 663 } 664 665 666 /* we make the assumption here that each op in the list of 667 * op_siblings maps to one SV pushed onto the stack, 668 * except for code blocks, with have both an OP_NULL and 669 * an OP_CONST. 670 * This allows us to match up the list of SVs against the 671 * list of OPs to find the next code block. 672 * 673 * Note that PUSHMARK PADSV PADSV .. 674 * is optimised to 675 * PADRANGE PADSV PADSV .. 676 * so the alignment still works. */ 677 678 if (oplist) { 679 if (oplist->op_type == OP_NULL 680 && (oplist->op_flags & OPf_SPECIAL)) 681 { 682 assert(n < pRExC_state->code_blocks->count); 683 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0; 684 pRExC_state->code_blocks->cb[n].block = oplist; 685 pRExC_state->code_blocks->cb[n].src_regex = NULL; 686 n++; 687 code = 1; 688 oplist = OpSIBLING(oplist); /* skip CONST */ 689 assert(oplist); 690 } 691 oplist = OpSIBLING(oplist);; 692 } 693 694 /* apply magic and QR overloading to arg */ 695 696 SvGETMAGIC(msv); 697 if (SvROK(msv) && SvAMAGIC(msv)) { 698 SV *sv = AMG_CALLunary(msv, regexp_amg); 699 if (sv) { 700 if (SvROK(sv)) 701 sv = SvRV(sv); 702 if (SvTYPE(sv) != SVt_REGEXP) 703 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); 704 msv = sv; 705 } 706 } 707 708 /* try concatenation overload ... */ 709 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && 710 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) 711 { 712 sv_setsv(pat, sv); 713 /* overloading involved: all bets are off over literal 714 * code. Pretend we haven't seen it */ 715 if (n) 716 pRExC_state->code_blocks->count -= n; 717 n = 0; 718 } 719 else { 720 /* ... or failing that, try "" overload */ 721 while (SvAMAGIC(msv) 722 && (sv = AMG_CALLunary(msv, string_amg)) 723 && sv != msv 724 && !( SvROK(msv) 725 && SvROK(sv) 726 && SvRV(msv) == SvRV(sv)) 727 ) { 728 msv = sv; 729 SvGETMAGIC(msv); 730 } 731 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) 732 msv = SvRV(msv); 733 734 if (pat) { 735 /* this is a partially unrolled 736 * sv_catsv_nomg(pat, msv); 737 * that allows us to adjust code block indices if 738 * needed */ 739 STRLEN dlen; 740 char *dst = SvPV_force_nomg(pat, dlen); 741 orig_patlen = dlen; 742 if (SvUTF8(msv) && !SvUTF8(pat)) { 743 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); 744 sv_setpvn(pat, dst, dlen); 745 SvUTF8_on(pat); 746 } 747 sv_catsv_nomg(pat, msv); 748 rx = msv; 749 } 750 else { 751 /* We have only one SV to process, but we need to verify 752 * it is properly null terminated or we will fail asserts 753 * later. In theory we probably shouldn't get such SV's, 754 * but if we do we should handle it gracefully. */ 755 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) { 756 /* not a string, or a string with a trailing null */ 757 pat = msv; 758 } else { 759 /* a string with no trailing null, we need to copy it 760 * so it has a trailing null */ 761 pat = sv_2mortal(newSVsv(msv)); 762 } 763 } 764 765 if (code) 766 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1; 767 } 768 769 /* extract any code blocks within any embedded qr//'s */ 770 if (rx && SvTYPE(rx) == SVt_REGEXP 771 && RX_ENGINE((REGEXP*)rx)->op_comp) 772 { 773 774 RXi_GET_DECL(ReANY((REGEXP *)rx), ri); 775 if (ri->code_blocks && ri->code_blocks->count) { 776 int i; 777 /* the presence of an embedded qr// with code means 778 * we should always recompile: the text of the 779 * qr// may not have changed, but it may be a 780 * different closure than last time */ 781 *recompile_p = 1; 782 if (pRExC_state->code_blocks) { 783 int new_count = pRExC_state->code_blocks->count 784 + ri->code_blocks->count; 785 Renew(pRExC_state->code_blocks->cb, 786 new_count, struct reg_code_block); 787 pRExC_state->code_blocks->count = new_count; 788 } 789 else 790 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ 791 ri->code_blocks->count); 792 793 for (i=0; i < ri->code_blocks->count; i++) { 794 struct reg_code_block *src, *dst; 795 STRLEN offset = orig_patlen 796 + ReANY((REGEXP *)rx)->pre_prefix; 797 assert(n < pRExC_state->code_blocks->count); 798 src = &ri->code_blocks->cb[i]; 799 dst = &pRExC_state->code_blocks->cb[n]; 800 dst->start = src->start + offset; 801 dst->end = src->end + offset; 802 dst->block = src->block; 803 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) 804 src->src_regex 805 ? src->src_regex 806 : (REGEXP*)rx); 807 n++; 808 } 809 } 810 } 811 } 812 /* avoid calling magic multiple times on a single element e.g. =~ $qr */ 813 if (alloced) 814 SvSETMAGIC(pat); 815 816 return pat; 817} 818 819 820 821/* see if there are any run-time code blocks in the pattern. 822 * False positives are allowed */ 823 824static bool 825S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 826 char *pat, STRLEN plen) 827{ 828 int n = 0; 829 STRLEN s; 830 831 PERL_UNUSED_CONTEXT; 832 833 for (s = 0; s < plen; s++) { 834 if ( pRExC_state->code_blocks 835 && n < pRExC_state->code_blocks->count 836 && s == pRExC_state->code_blocks->cb[n].start) 837 { 838 s = pRExC_state->code_blocks->cb[n].end; 839 n++; 840 continue; 841 } 842 /* TODO ideally should handle [..], (#..), /#.../x to reduce false 843 * positives here */ 844 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && 845 (pat[s+2] == '{' 846 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) 847 ) 848 return 1; 849 } 850 return 0; 851} 852 853/* Handle run-time code blocks. We will already have compiled any direct 854 * or indirect literal code blocks. Now, take the pattern 'pat' and make a 855 * copy of it, but with any literal code blocks blanked out and 856 * appropriate chars escaped; then feed it into 857 * 858 * eval "qr'modified_pattern'" 859 * 860 * For example, 861 * 862 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno 863 * 864 * becomes 865 * 866 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' 867 * 868 * After eval_sv()-ing that, grab any new code blocks from the returned qr 869 * and merge them with any code blocks of the original regexp. 870 * 871 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; 872 * instead, just save the qr and return FALSE; this tells our caller that 873 * the original pattern needs upgrading to utf8. 874 */ 875 876static bool 877S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 878 char *pat, STRLEN plen) 879{ 880 SV *qr; 881 882 DECLARE_AND_GET_RE_DEBUG_FLAGS; 883 884 if (pRExC_state->runtime_code_qr) { 885 /* this is the second time we've been called; this should 886 * only happen if the main pattern got upgraded to utf8 887 * during compilation; re-use the qr we compiled first time 888 * round (which should be utf8 too) 889 */ 890 qr = pRExC_state->runtime_code_qr; 891 pRExC_state->runtime_code_qr = NULL; 892 assert(RExC_utf8 && SvUTF8(qr)); 893 } 894 else { 895 int n = 0; 896 STRLEN s; 897 char *p, *newpat; 898 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */ 899 SV *sv, *qr_ref; 900 dSP; 901 902 /* determine how many extra chars we need for ' and \ escaping */ 903 for (s = 0; s < plen; s++) { 904 if (pat[s] == '\'' || pat[s] == '\\') 905 newlen++; 906 } 907 908 Newx(newpat, newlen, char); 909 p = newpat; 910 *p++ = 'q'; *p++ = 'r'; *p++ = '\''; 911 912 for (s = 0; s < plen; s++) { 913 if ( pRExC_state->code_blocks 914 && n < pRExC_state->code_blocks->count 915 && s == pRExC_state->code_blocks->cb[n].start) 916 { 917 /* blank out literal code block so that they aren't 918 * recompiled: eg change from/to: 919 * /(?{xyz})/ 920 * /(?=====)/ 921 * and 922 * /(??{xyz})/ 923 * /(?======)/ 924 * and 925 * /(?(?{xyz}))/ 926 * /(?(?=====))/ 927 */ 928 assert(pat[s] == '('); 929 assert(pat[s+1] == '?'); 930 *p++ = '('; 931 *p++ = '?'; 932 s += 2; 933 while (s < pRExC_state->code_blocks->cb[n].end) { 934 *p++ = '='; 935 s++; 936 } 937 *p++ = ')'; 938 n++; 939 continue; 940 } 941 if (pat[s] == '\'' || pat[s] == '\\') 942 *p++ = '\\'; 943 *p++ = pat[s]; 944 } 945 *p++ = '\''; 946 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) { 947 *p++ = 'x'; 948 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) { 949 *p++ = 'x'; 950 } 951 } 952 *p++ = '\0'; 953 DEBUG_COMPILE_r({ 954 Perl_re_printf( aTHX_ 955 "%sre-parsing pattern for runtime code:%s %s\n", 956 PL_colors[4], PL_colors[5], newpat); 957 }); 958 959 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); 960 Safefree(newpat); 961 962 ENTER; 963 SAVETMPS; 964 save_re_context(); 965 PUSHSTACKi(PERLSI_REQUIRE); 966 /* G_RE_REPARSING causes the toker to collapse \\ into \ when 967 * parsing qr''; normally only q'' does this. It also alters 968 * hints handling */ 969 eval_sv(sv, G_SCALAR|G_RE_REPARSING); 970 SvREFCNT_dec_NN(sv); 971 SPAGAIN; 972 qr_ref = POPs; 973 PUTBACK; 974 { 975 SV * const errsv = ERRSV; 976 if (SvTRUE_NN(errsv)) 977 /* use croak_sv ? */ 978 Perl_croak_nocontext("%" SVf, SVfARG(errsv)); 979 } 980 assert(SvROK(qr_ref)); 981 qr = SvRV(qr_ref); 982 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); 983 /* the leaving below frees the tmp qr_ref. 984 * Give qr a life of its own */ 985 SvREFCNT_inc(qr); 986 POPSTACK; 987 FREETMPS; 988 LEAVE; 989 990 } 991 992 if (!RExC_utf8 && SvUTF8(qr)) { 993 /* first time through; the pattern got upgraded; save the 994 * qr for the next time through */ 995 assert(!pRExC_state->runtime_code_qr); 996 pRExC_state->runtime_code_qr = qr; 997 return 0; 998 } 999 1000 1001 /* extract any code blocks within the returned qr// */ 1002 1003 1004 /* merge the main (r1) and run-time (r2) code blocks into one */ 1005 { 1006 RXi_GET_DECL(ReANY((REGEXP *)qr), r2); 1007 struct reg_code_block *new_block, *dst; 1008 RExC_state_t * const r1 = pRExC_state; /* convenient alias */ 1009 int i1 = 0, i2 = 0; 1010 int r1c, r2c; 1011 1012 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */ 1013 { 1014 SvREFCNT_dec_NN(qr); 1015 return 1; 1016 } 1017 1018 if (!r1->code_blocks) 1019 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0); 1020 1021 r1c = r1->code_blocks->count; 1022 r2c = r2->code_blocks->count; 1023 1024 Newx(new_block, r1c + r2c, struct reg_code_block); 1025 1026 dst = new_block; 1027 1028 while (i1 < r1c || i2 < r2c) { 1029 struct reg_code_block *src; 1030 bool is_qr = 0; 1031 1032 if (i1 == r1c) { 1033 src = &r2->code_blocks->cb[i2++]; 1034 is_qr = 1; 1035 } 1036 else if (i2 == r2c) 1037 src = &r1->code_blocks->cb[i1++]; 1038 else if ( r1->code_blocks->cb[i1].start 1039 < r2->code_blocks->cb[i2].start) 1040 { 1041 src = &r1->code_blocks->cb[i1++]; 1042 assert(src->end < r2->code_blocks->cb[i2].start); 1043 } 1044 else { 1045 assert( r1->code_blocks->cb[i1].start 1046 > r2->code_blocks->cb[i2].start); 1047 src = &r2->code_blocks->cb[i2++]; 1048 is_qr = 1; 1049 assert(src->end < r1->code_blocks->cb[i1].start); 1050 } 1051 1052 assert(pat[src->start] == '('); 1053 assert(pat[src->end] == ')'); 1054 dst->start = src->start; 1055 dst->end = src->end; 1056 dst->block = src->block; 1057 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) 1058 : src->src_regex; 1059 dst++; 1060 } 1061 r1->code_blocks->count += r2c; 1062 Safefree(r1->code_blocks->cb); 1063 r1->code_blocks->cb = new_block; 1064 } 1065 1066 SvREFCNT_dec_NN(qr); 1067 return 1; 1068} 1069 1070 1071STATIC bool 1072S_setup_longest(pTHX_ RExC_state_t *pRExC_state, 1073 struct reg_substr_datum *rsd, 1074 struct scan_data_substrs *sub, 1075 STRLEN longest_length) 1076{ 1077 /* This is the common code for setting up the floating and fixed length 1078 * string data extracted from Perl_re_op_compile() below. Returns a boolean 1079 * as to whether succeeded or not */ 1080 1081 I32 t; 1082 SSize_t ml; 1083 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL); 1084 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL); 1085 1086 if (! (longest_length 1087 || (eol /* Can't have SEOL and MULTI */ 1088 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) 1089 ) 1090 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ 1091 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) 1092 { 1093 return FALSE; 1094 } 1095 1096 /* copy the information about the longest from the reg_scan_data 1097 over to the program. */ 1098 if (SvUTF8(sub->str)) { 1099 rsd->substr = NULL; 1100 rsd->utf8_substr = sub->str; 1101 } else { 1102 rsd->substr = sub->str; 1103 rsd->utf8_substr = NULL; 1104 } 1105 /* end_shift is how many chars that must be matched that 1106 follow this item. We calculate it ahead of time as once the 1107 lookbehind offset is added in we lose the ability to correctly 1108 calculate it.*/ 1109 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length; 1110 rsd->end_shift = ml - sub->min_offset 1111 - longest_length 1112 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL 1113 * intead? - DAPM 1114 + (SvTAIL(sub->str) != 0) 1115 */ 1116 + sub->lookbehind; 1117 1118 t = (eol/* Can't have SEOL and MULTI */ 1119 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); 1120 fbm_compile(sub->str, t ? FBMcf_TAIL : 0); 1121 1122 return TRUE; 1123} 1124 1125STATIC void 1126S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx) 1127{ 1128 /* Calculates and sets in the compiled pattern 'Rx' the string to compile, 1129 * properly wrapped with the right modifiers */ 1130 1131 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 1132 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags) 1133 != REGEX_DEPENDS_CHARSET); 1134 1135 /* The caret is output if there are any defaults: if not all the STD 1136 * flags are set, or if no character set specifier is needed */ 1137 bool has_default = 1138 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) 1139 || ! has_charset); 1140 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) 1141 == REG_RUN_ON_COMMENT_SEEN); 1142 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD) 1143 >> RXf_PMf_STD_PMMOD_SHIFT); 1144 const char *fptr = STD_PAT_MODS; /*"msixxn"*/ 1145 char *p; 1146 STRLEN pat_len = RExC_precomp_end - RExC_precomp; 1147 1148 /* We output all the necessary flags; we never output a minus, as all 1149 * those are defaults, so are 1150 * covered by the caret */ 1151 const STRLEN wraplen = pat_len + has_p + has_runon 1152 + has_default /* If needs a caret */ 1153 + PL_bitcount[reganch] /* 1 char for each set standard flag */ 1154 1155 /* If needs a character set specifier */ 1156 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) 1157 + (sizeof("(?:)") - 1); 1158 1159 PERL_ARGS_ASSERT_SET_REGEX_PV; 1160 1161 /* make sure PL_bitcount bounds not exceeded */ 1162 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8); 1163 1164 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */ 1165 SvPOK_on(Rx); 1166 if (RExC_utf8) 1167 SvFLAGS(Rx) |= SVf_UTF8; 1168 *p++='('; *p++='?'; 1169 1170 /* If a default, cover it using the caret */ 1171 if (has_default) { 1172 *p++= DEFAULT_PAT_MOD; 1173 } 1174 if (has_charset) { 1175 STRLEN len; 1176 const char* name; 1177 1178 name = get_regex_charset_name(RExC_rx->extflags, &len); 1179 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */ 1180 assert(RExC_utf8); 1181 name = UNICODE_PAT_MODS; 1182 len = sizeof(UNICODE_PAT_MODS) - 1; 1183 } 1184 Copy(name, p, len, char); 1185 p += len; 1186 } 1187 if (has_p) 1188 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 1189 { 1190 char ch; 1191 while((ch = *fptr++)) { 1192 if(reganch & 1) 1193 *p++ = ch; 1194 reganch >>= 1; 1195 } 1196 } 1197 1198 *p++ = ':'; 1199 Copy(RExC_precomp, p, pat_len, char); 1200 assert ((RX_WRAPPED(Rx) - p) < 16); 1201 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx); 1202 p += pat_len; 1203 1204 /* Adding a trailing \n causes this to compile properly: 1205 my $R = qr / A B C # D E/x; /($R)/ 1206 Otherwise the parens are considered part of the comment */ 1207 if (has_runon) 1208 *p++ = '\n'; 1209 *p++ = ')'; 1210 *p = 0; 1211 SvCUR_set(Rx, p - RX_WRAPPED(Rx)); 1212} 1213 1214/* 1215 * Perl_re_op_compile - the perl internal RE engine's function to compile a 1216 * regular expression into internal code. 1217 * The pattern may be passed either as: 1218 * a list of SVs (patternp plus pat_count) 1219 * a list of OPs (expr) 1220 * If both are passed, the SV list is used, but the OP list indicates 1221 * which SVs are actually pre-compiled code blocks 1222 * 1223 * The SVs in the list have magic and qr overloading applied to them (and 1224 * the list may be modified in-place with replacement SVs in the latter 1225 * case). 1226 * 1227 * If the pattern hasn't changed from old_re, then old_re will be 1228 * returned. 1229 * 1230 * eng is the current engine. If that engine has an op_comp method, then 1231 * handle directly (i.e. we assume that op_comp was us); otherwise, just 1232 * do the initial concatenation of arguments and pass on to the external 1233 * engine. 1234 * 1235 * If is_bare_re is not null, set it to a boolean indicating whether the 1236 * arg list reduced (after overloading) to a single bare regex which has 1237 * been returned (i.e. /$qr/). 1238 * 1239 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. 1240 * 1241 * pm_flags contains the PMf_* flags, typically based on those from the 1242 * pm_flags field of the related PMOP. Currently we're only interested in 1243 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD. 1244 * 1245 * For many years this code had an initial sizing pass that calculated 1246 * (sometimes incorrectly, leading to security holes) the size needed for the 1247 * compiled pattern. That was changed by commit 1248 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a 1249 * node at a time, as parsing goes along. Patches welcome to fix any obsolete 1250 * references to this sizing pass. 1251 * 1252 * Now, an initial crude guess as to the size needed is made, based on the 1253 * length of the pattern. Patches welcome to improve that guess. That amount 1254 * of space is malloc'd and then immediately freed, and then clawed back node 1255 * by node. This design is to minimize, to the extent possible, memory churn 1256 * when doing the reallocs. 1257 * 1258 * A separate parentheses counting pass may be needed in some cases. 1259 * (Previously the sizing pass did this.) Patches welcome to reduce the number 1260 * of these cases. 1261 * 1262 * The existence of a sizing pass necessitated design decisions that are no 1263 * longer needed. There are potential areas of simplification. 1264 * 1265 * Beware that the optimization-preparation code in here knows about some 1266 * of the structure of the compiled regexp. [I'll say.] 1267 */ 1268 1269REGEXP * 1270Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, 1271 OP *expr, const regexp_engine* eng, REGEXP *old_re, 1272 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags) 1273{ 1274 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */ 1275 STRLEN plen; 1276 char *exp; 1277 regnode *scan; 1278 I32 flags; 1279 SSize_t minlen = 0; 1280 U32 rx_flags; 1281 SV *pat; 1282 SV** new_patternp = patternp; 1283 1284 /* these are all flags - maybe they should be turned 1285 * into a single int with different bit masks */ 1286 I32 sawlookahead = 0; 1287 I32 sawplus = 0; 1288 I32 sawopen = 0; 1289 I32 sawminmod = 0; 1290 1291 regex_charset initial_charset = get_regex_charset(orig_rx_flags); 1292 bool recompile = 0; 1293 bool runtime_code = 0; 1294 scan_data_t data; 1295 RExC_state_t RExC_state; 1296 RExC_state_t * const pRExC_state = &RExC_state; 1297#ifdef TRIE_STUDY_OPT 1298 /* search for "restudy" in this file for a detailed explanation */ 1299 int restudied = 0; 1300 RExC_state_t copyRExC_state; 1301#endif 1302 DECLARE_AND_GET_RE_DEBUG_FLAGS; 1303 1304 PERL_ARGS_ASSERT_RE_OP_COMPILE; 1305 1306 DEBUG_r(if (!PL_colorset) reginitcolors()); 1307 1308 1309 pRExC_state->warn_text = NULL; 1310 pRExC_state->unlexed_names = NULL; 1311 pRExC_state->code_blocks = NULL; 1312 1313 if (is_bare_re) 1314 *is_bare_re = FALSE; 1315 1316 if (expr && (expr->op_type == OP_LIST || 1317 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { 1318 /* allocate code_blocks if needed */ 1319 OP *o; 1320 int ncode = 0; 1321 1322 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) 1323 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) 1324 ncode++; /* count of DO blocks */ 1325 1326 if (ncode) 1327 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode); 1328 } 1329 1330 if (!pat_count) { 1331 /* compile-time pattern with just OP_CONSTs and DO blocks */ 1332 1333 int n; 1334 OP *o; 1335 1336 /* find how many CONSTs there are */ 1337 assert(expr); 1338 n = 0; 1339 if (expr->op_type == OP_CONST) 1340 n = 1; 1341 else 1342 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 1343 if (o->op_type == OP_CONST) 1344 n++; 1345 } 1346 1347 /* fake up an SV array */ 1348 1349 assert(!new_patternp); 1350 Newx(new_patternp, n, SV*); 1351 SAVEFREEPV(new_patternp); 1352 pat_count = n; 1353 1354 n = 0; 1355 if (expr->op_type == OP_CONST) 1356 new_patternp[n] = cSVOPx_sv(expr); 1357 else 1358 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 1359 if (o->op_type == OP_CONST) 1360 new_patternp[n++] = cSVOPo_sv; 1361 } 1362 1363 } 1364 1365 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 1366 "Assembling pattern from %d elements%s\n", pat_count, 1367 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 1368 1369 /* set expr to the first arg op */ 1370 1371 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count 1372 && expr->op_type != OP_CONST) 1373 { 1374 expr = cLISTOPx(expr)->op_first; 1375 assert( expr->op_type == OP_PUSHMARK 1376 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) 1377 || expr->op_type == OP_PADRANGE); 1378 expr = OpSIBLING(expr); 1379 } 1380 1381 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, 1382 expr, &recompile, NULL); 1383 1384 /* handle bare (possibly after overloading) regex: foo =~ $re */ 1385 { 1386 SV *re = pat; 1387 if (SvROK(re)) 1388 re = SvRV(re); 1389 if (SvTYPE(re) == SVt_REGEXP) { 1390 if (is_bare_re) 1391 *is_bare_re = TRUE; 1392 SvREFCNT_inc(re); 1393 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 1394 "Precompiled pattern%s\n", 1395 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 1396 1397 return (REGEXP*)re; 1398 } 1399 } 1400 1401 exp = SvPV_nomg(pat, plen); 1402 1403 if (!eng->op_comp) { 1404 if ((SvUTF8(pat) && IN_BYTES) 1405 || SvGMAGICAL(pat) || SvAMAGIC(pat)) 1406 { 1407 /* make a temporary copy; either to convert to bytes, 1408 * or to avoid repeating get-magic / overloaded stringify */ 1409 pat = newSVpvn_flags(exp, plen, SVs_TEMP | 1410 (IN_BYTES ? 0 : SvUTF8(pat))); 1411 } 1412 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); 1413 } 1414 1415 /* ignore the utf8ness if the pattern is 0 length */ 1416 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); 1417 RExC_uni_semantics = 0; 1418 RExC_contains_locale = 0; 1419 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); 1420 RExC_in_script_run = 0; 1421 RExC_study_started = 0; 1422 pRExC_state->runtime_code_qr = NULL; 1423 RExC_frame_head= NULL; 1424 RExC_frame_last= NULL; 1425 RExC_frame_count= 0; 1426 RExC_latest_warn_offset = 0; 1427 RExC_use_BRANCHJ = 0; 1428 RExC_warned_WARN_EXPERIMENTAL__VLB = 0; 1429 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0; 1430 RExC_logical_total_parens = 0; 1431 RExC_total_parens = 0; 1432 RExC_logical_to_parno = NULL; 1433 RExC_parno_to_logical = NULL; 1434 RExC_open_parens = NULL; 1435 RExC_close_parens = NULL; 1436 RExC_paren_names = NULL; 1437 RExC_size = 0; 1438 RExC_seen_d_op = FALSE; 1439#ifdef DEBUGGING 1440 RExC_paren_name_list = NULL; 1441#endif 1442 1443 DEBUG_r({ 1444 RExC_mysv1= sv_newmortal(); 1445 RExC_mysv2= sv_newmortal(); 1446 }); 1447 1448 DEBUG_COMPILE_r({ 1449 SV *dsv= sv_newmortal(); 1450 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len); 1451 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n", 1452 PL_colors[4], PL_colors[5], s); 1453 }); 1454 1455 /* we jump here if we have to recompile, e.g., from upgrading the pattern 1456 * to utf8 */ 1457 1458 if ((pm_flags & PMf_USE_RE_EVAL) 1459 /* this second condition covers the non-regex literal case, 1460 * i.e. $foo =~ '(?{})'. */ 1461 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) 1462 ) 1463 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); 1464 1465 redo_parse: 1466 /* return old regex if pattern hasn't changed */ 1467 /* XXX: note in the below we have to check the flags as well as the 1468 * pattern. 1469 * 1470 * Things get a touch tricky as we have to compare the utf8 flag 1471 * independently from the compile flags. */ 1472 1473 if ( old_re 1474 && !recompile 1475 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8) 1476 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) 1477 && RX_PRELEN(old_re) == plen 1478 && memEQ(RX_PRECOMP(old_re), exp, plen) 1479 && !runtime_code /* with runtime code, always recompile */ ) 1480 { 1481 DEBUG_COMPILE_r({ 1482 SV *dsv= sv_newmortal(); 1483 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len); 1484 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n", 1485 PL_colors[4], PL_colors[5], s); 1486 }); 1487 return old_re; 1488 } 1489 1490 /* Allocate the pattern's SV */ 1491 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP); 1492 RExC_rx = ReANY(Rx); 1493 if ( RExC_rx == NULL ) 1494 FAIL("Regexp out of space"); 1495 1496 rx_flags = orig_rx_flags; 1497 1498 if ( toUSE_UNI_CHARSET_NOT_DEPENDS 1499 && initial_charset == REGEX_DEPENDS_CHARSET) 1500 { 1501 1502 /* Set to use unicode semantics if the pattern is in utf8 and has the 1503 * 'depends' charset specified, as it means unicode when utf8 */ 1504 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); 1505 RExC_uni_semantics = 1; 1506 } 1507 1508 RExC_pm_flags = pm_flags; 1509 1510 if (runtime_code) { 1511 assert(TAINTING_get || !TAINT_get); 1512 if (TAINT_get) 1513 Perl_croak(aTHX_ "Eval-group in insecure regular expression"); 1514 1515 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { 1516 /* whoops, we have a non-utf8 pattern, whilst run-time code 1517 * got compiled as utf8. Try again with a utf8 pattern */ 1518 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 1519 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); 1520 goto redo_parse; 1521 } 1522 } 1523 assert(!pRExC_state->runtime_code_qr); 1524 1525 RExC_sawback = 0; 1526 1527 RExC_seen = 0; 1528 RExC_maxlen = 0; 1529 RExC_in_lookaround = 0; 1530 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 1531 RExC_recode_x_to_native = 0; 1532 RExC_in_multi_char_class = 0; 1533 1534 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; 1535 RExC_precomp_end = RExC_end = exp + plen; 1536 RExC_nestroot = 0; 1537 RExC_whilem_seen = 0; 1538 RExC_end_op = NULL; 1539 RExC_recurse = NULL; 1540 RExC_study_chunk_recursed = NULL; 1541 RExC_study_chunk_recursed_bytes= 0; 1542 RExC_recurse_count = 0; 1543 RExC_sets_depth = 0; 1544 pRExC_state->code_index = 0; 1545 1546 /* Initialize the string in the compiled pattern. This is so that there is 1547 * something to output if necessary */ 1548 set_regex_pv(pRExC_state, Rx); 1549 1550 DEBUG_PARSE_r({ 1551 Perl_re_printf( aTHX_ 1552 "Starting parse and generation\n"); 1553 RExC_lastnum=0; 1554 RExC_lastparse=NULL; 1555 }); 1556 1557 /* Allocate space and zero-initialize. Note, the two step process 1558 of zeroing when in debug mode, thus anything assigned has to 1559 happen after that */ 1560 if (! RExC_size) { 1561 1562 /* On the first pass of the parse, we guess how big this will be. Then 1563 * we grow in one operation to that amount and then give it back. As 1564 * we go along, we re-allocate what we need. 1565 * 1566 * XXX Currently the guess is essentially that the pattern will be an 1567 * EXACT node with one byte input, one byte output. This is crude, and 1568 * better heuristics are welcome. 1569 * 1570 * On any subsequent passes, we guess what we actually computed in the 1571 * latest earlier pass. Such a pass probably didn't complete so is 1572 * missing stuff. We could improve those guesses by knowing where the 1573 * parse stopped, and use the length so far plus apply the above 1574 * assumption to what's left. */ 1575 RExC_size = STR_SZ(RExC_end - RExC_start); 1576 } 1577 1578 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal); 1579 if ( RExC_rxi == NULL ) 1580 FAIL("Regexp out of space"); 1581 1582 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char); 1583 RXi_SET( RExC_rx, RExC_rxi ); 1584 1585 /* We start from 0 (over from 0 in the case this is a reparse. The first 1586 * node parsed will give back any excess memory we have allocated so far). 1587 * */ 1588 RExC_size = 0; 1589 1590 /* non-zero initialization begins here */ 1591 RExC_rx->engine= eng; 1592 RExC_rx->extflags = rx_flags; 1593 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; 1594 1595 if (pm_flags & PMf_IS_QR) { 1596 RExC_rxi->code_blocks = pRExC_state->code_blocks; 1597 if (RExC_rxi->code_blocks) { 1598 RExC_rxi->code_blocks->refcnt++; 1599 } 1600 } 1601 1602 RExC_rx->intflags = 0; 1603 1604 RExC_flags = rx_flags; /* don't let top level (?i) bleed */ 1605 RExC_parse_set(exp); 1606 1607 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv 1608 * code makes sure the final byte is an uncounted NUL. But should this 1609 * ever not be the case, lots of things could read beyond the end of the 1610 * buffer: loops like 1611 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1); 1612 * strchr(RExC_parse, "foo"); 1613 * etc. So it is worth noting. */ 1614 assert(*RExC_end == '\0'); 1615 1616 RExC_naughty = 0; 1617 RExC_npar = 1; 1618 RExC_logical_npar = 1; 1619 RExC_parens_buf_size = 0; 1620 RExC_emit_start = RExC_rxi->program; 1621 pRExC_state->code_index = 0; 1622 1623 *((char*) RExC_emit_start) = (char) REG_MAGIC; 1624 RExC_emit = NODE_STEP_REGNODE; 1625 1626 /* Do the parse */ 1627 if (reg(pRExC_state, 0, &flags, 1)) { 1628 1629 /* Success!, But we may need to redo the parse knowing how many parens 1630 * there actually are */ 1631 if (IN_PARENS_PASS) { 1632 flags |= RESTART_PARSE; 1633 } 1634 1635 /* We have that number in RExC_npar */ 1636 RExC_total_parens = RExC_npar; 1637 RExC_logical_total_parens = RExC_logical_npar; 1638 } 1639 else if (! MUST_RESTART(flags)) { 1640 ReREFCNT_dec(Rx); 1641 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags); 1642 } 1643 1644 /* Here, we either have success, or we have to redo the parse for some reason */ 1645 if (MUST_RESTART(flags)) { 1646 1647 /* It's possible to write a regexp in ascii that represents Unicode 1648 codepoints outside of the byte range, such as via \x{100}. If we 1649 detect such a sequence we have to convert the entire pattern to utf8 1650 and then recompile, as our sizing calculation will have been based 1651 on 1 byte == 1 character, but we will need to use utf8 to encode 1652 at least some part of the pattern, and therefore must convert the whole 1653 thing. 1654 -- dmq */ 1655 if (flags & NEED_UTF8) { 1656 1657 /* We have stored the offset of the final warning output so far. 1658 * That must be adjusted. Any variant characters between the start 1659 * of the pattern and this warning count for 2 bytes in the final, 1660 * so just add them again */ 1661 if (UNLIKELY(RExC_latest_warn_offset > 0)) { 1662 RExC_latest_warn_offset += 1663 variant_under_utf8_count((U8 *) exp, (U8 *) exp 1664 + RExC_latest_warn_offset); 1665 } 1666 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 1667 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); 1668 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n")); 1669 } 1670 else { 1671 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n")); 1672 } 1673 1674 if (ALL_PARENS_COUNTED) { 1675 /* Make enough room for all the known parens, and zero it */ 1676 Renew(RExC_open_parens, RExC_total_parens, regnode_offset); 1677 Zero(RExC_open_parens, RExC_total_parens, regnode_offset); 1678 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ 1679 1680 Renew(RExC_close_parens, RExC_total_parens, regnode_offset); 1681 Zero(RExC_close_parens, RExC_total_parens, regnode_offset); 1682 /* we do NOT reinitialize RExC_logical_to_parno and 1683 * RExC_parno_to_logical here. We need their data on the second 1684 * pass */ 1685 } 1686 else { /* Parse did not complete. Reinitialize the parentheses 1687 structures */ 1688 RExC_total_parens = 0; 1689 if (RExC_open_parens) { 1690 Safefree(RExC_open_parens); 1691 RExC_open_parens = NULL; 1692 } 1693 if (RExC_close_parens) { 1694 Safefree(RExC_close_parens); 1695 RExC_close_parens = NULL; 1696 } 1697 if (RExC_logical_to_parno) { 1698 Safefree(RExC_logical_to_parno); 1699 RExC_logical_to_parno = NULL; 1700 } 1701 if (RExC_parno_to_logical) { 1702 Safefree(RExC_parno_to_logical); 1703 RExC_parno_to_logical = NULL; 1704 } 1705 } 1706 1707 /* Clean up what we did in this parse */ 1708 SvREFCNT_dec_NN(RExC_rx_sv); 1709 1710 goto redo_parse; 1711 } 1712 1713 /* Here, we have successfully parsed and generated the pattern's program 1714 * for the regex engine. We are ready to finish things up and look for 1715 * optimizations. */ 1716 1717 /* Update the string to compile, with correct modifiers, etc */ 1718 set_regex_pv(pRExC_state, Rx); 1719 1720 RExC_rx->nparens = RExC_total_parens - 1; 1721 RExC_rx->logical_nparens = RExC_logical_total_parens - 1; 1722 1723 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */ 1724 if (RExC_whilem_seen > 15) 1725 RExC_whilem_seen = 15; 1726 1727 DEBUG_PARSE_r({ 1728 Perl_re_printf( aTHX_ 1729 "Required size %" IVdf " nodes\n", (IV)RExC_size); 1730 RExC_lastnum=0; 1731 RExC_lastparse=NULL; 1732 }); 1733 1734 SetProgLen(RExC_rxi,RExC_size); 1735 1736 DEBUG_DUMP_PRE_OPTIMIZE_r({ 1737 SV * const sv = sv_newmortal(); 1738 RXi_GET_DECL(RExC_rx, ri); 1739 DEBUG_RExC_seen(); 1740 Perl_re_printf( aTHX_ "Program before optimization:\n"); 1741 1742 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL, 1743 sv, 0, 0); 1744 }); 1745 1746 DEBUG_OPTIMISE_r( 1747 Perl_re_printf( aTHX_ "Starting post parse optimization\n"); 1748 ); 1749 1750 /* XXXX To minimize changes to RE engine we always allocate 1751 3-units-long substrs field. */ 1752 Newx(RExC_rx->substrs, 1, struct reg_substr_data); 1753 if (RExC_recurse_count) { 1754 Newx(RExC_recurse, RExC_recurse_count, regnode *); 1755 SAVEFREEPV(RExC_recurse); 1756 } 1757 1758 if (RExC_seen & REG_RECURSE_SEEN) { 1759 /* Note, RExC_total_parens is 1 + the number of parens in a pattern. 1760 * So its 1 if there are no parens. */ 1761 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) + 1762 ((RExC_total_parens & 0x07) != 0); 1763 Newx(RExC_study_chunk_recursed, 1764 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8); 1765 SAVEFREEPV(RExC_study_chunk_recursed); 1766 } 1767 1768 reStudy: 1769 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; 1770 DEBUG_r( 1771 RExC_study_chunk_recursed_count= 0; 1772 ); 1773 Zero(RExC_rx->substrs, 1, struct reg_substr_data); 1774 if (RExC_study_chunk_recursed) { 1775 Zero(RExC_study_chunk_recursed, 1776 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8); 1777 } 1778 1779 1780#ifdef TRIE_STUDY_OPT 1781 /* search for "restudy" in this file for a detailed explanation */ 1782 if (!restudied) { 1783 StructCopy(&zero_scan_data, &data, scan_data_t); 1784 copyRExC_state = RExC_state; 1785 } else { 1786 U32 seen=RExC_seen; 1787 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n")); 1788 1789 RExC_state = copyRExC_state; 1790 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) 1791 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 1792 else 1793 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; 1794 StructCopy(&zero_scan_data, &data, scan_data_t); 1795 } 1796#else 1797 StructCopy(&zero_scan_data, &data, scan_data_t); 1798#endif 1799 1800 /* Dig out information for optimizations. */ 1801 RExC_rx->extflags = RExC_flags; /* was pm_op */ 1802 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 1803 1804 if (UTF) 1805 SvUTF8_on(Rx); /* Unicode in it? */ 1806 RExC_rxi->regstclass = NULL; 1807 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ 1808 RExC_rx->intflags |= PREGf_NAUGHTY; 1809 scan = RExC_rxi->program + 1; /* First BRANCH. */ 1810 1811 /* testing for BRANCH here tells us whether there is "must appear" 1812 data in the pattern. If there is then we can use it for optimisations */ 1813 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. 1814 */ 1815 SSize_t fake_deltap; 1816 STRLEN longest_length[2]; 1817 regnode_ssc ch_class; /* pointed to by data */ 1818 int stclass_flag; 1819 SSize_t last_close = 0; /* pointed to by data */ 1820 regnode *first= scan; 1821 regnode *first_next= regnext(first); 1822 regnode *last_close_op= NULL; 1823 int i; 1824 1825 /* 1826 * Skip introductions and multiplicators >= 1 1827 * so that we can extract the 'meat' of the pattern that must 1828 * match in the large if() sequence following. 1829 * NOTE that EXACT is NOT covered here, as it is normally 1830 * picked up by the optimiser separately. 1831 * 1832 * This is unfortunate as the optimiser isnt handling lookahead 1833 * properly currently. 1834 * 1835 */ 1836 while (1) 1837 { 1838 if (OP(first) == OPEN) 1839 sawopen = 1; 1840 else 1841 if (OP(first) == IFMATCH && !FLAGS(first)) 1842 /* for now we can't handle lookbehind IFMATCH */ 1843 sawlookahead = 1; 1844 else 1845 if (OP(first) == PLUS) 1846 sawplus = 1; 1847 else 1848 if (OP(first) == MINMOD) 1849 sawminmod = 1; 1850 else 1851 if (!( 1852 /* An OR of *one* alternative - should not happen now. */ 1853 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 1854 /* An {n,m} with n>0 */ 1855 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) || 1856 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END) 1857 )){ 1858 break; 1859 } 1860 1861 first = REGNODE_AFTER(first); 1862 first_next= regnext(first); 1863 } 1864 1865 /* Starting-point info. */ 1866 again: 1867 DEBUG_PEEP("first:", first, 0, 0); 1868 /* Ignore EXACT as we deal with it later. */ 1869 if (REGNODE_TYPE(OP(first)) == EXACT) { 1870 if (! isEXACTFish(OP(first))) { 1871 NOOP; /* Empty, get anchored substr later. */ 1872 } 1873 else 1874 RExC_rxi->regstclass = first; 1875 } 1876#ifdef TRIE_STCLASS 1877 else if (REGNODE_TYPE(OP(first)) == TRIE && 1878 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0) 1879 { 1880 /* this can happen only on restudy 1881 * Search for "restudy" in this file to find 1882 * a comment with details. */ 1883 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); 1884 } 1885#endif 1886 else if (REGNODE_SIMPLE(OP(first))) 1887 RExC_rxi->regstclass = first; 1888 else if (REGNODE_TYPE(OP(first)) == BOUND || 1889 REGNODE_TYPE(OP(first)) == NBOUND) 1890 RExC_rxi->regstclass = first; 1891 else if (REGNODE_TYPE(OP(first)) == BOL) { 1892 RExC_rx->intflags |= (OP(first) == MBOL 1893 ? PREGf_ANCH_MBOL 1894 : PREGf_ANCH_SBOL); 1895 first = REGNODE_AFTER(first); 1896 goto again; 1897 } 1898 else if (OP(first) == GPOS) { 1899 RExC_rx->intflags |= PREGf_ANCH_GPOS; 1900 first = REGNODE_AFTER_type(first,tregnode_GPOS); 1901 goto again; 1902 } 1903 else if ((!sawopen || !RExC_sawback) && 1904 !sawlookahead && 1905 (OP(first) == STAR && 1906 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) && 1907 !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN)) 1908 { 1909 /* turn .* into ^.* with an implied $*=1 */ 1910 const int type = 1911 (OP(REGNODE_AFTER(first)) == REG_ANY) 1912 ? PREGf_ANCH_MBOL 1913 : PREGf_ANCH_SBOL; 1914 RExC_rx->intflags |= (type | PREGf_IMPLICIT); 1915 first = REGNODE_AFTER(first); 1916 goto again; 1917 } 1918 if (sawplus && !sawminmod && !sawlookahead 1919 && (!sawopen || !RExC_sawback) 1920 && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */ 1921 /* x+ must match at the 1st pos of run of x's */ 1922 RExC_rx->intflags |= PREGf_SKIP; 1923 1924 /* Scan is after the zeroth branch, first is atomic matcher. */ 1925#ifdef TRIE_STUDY_OPT 1926 /* search for "restudy" in this file for a detailed explanation */ 1927 DEBUG_PARSE_r( 1928 if (!restudied) 1929 Perl_re_printf( aTHX_ "first at %" IVdf "\n", 1930 (IV)(first - scan + 1)) 1931 ); 1932#else 1933 DEBUG_PARSE_r( 1934 Perl_re_printf( aTHX_ "first at %" IVdf "\n", 1935 (IV)(first - scan + 1)) 1936 ); 1937#endif 1938 1939 1940 /* 1941 * If there's something expensive in the r.e., find the 1942 * longest literal string that must appear and make it the 1943 * regmust. Resolve ties in favor of later strings, since 1944 * the regstart check works with the beginning of the r.e. 1945 * and avoiding duplication strengthens checking. Not a 1946 * strong reason, but sufficient in the absence of others. 1947 * [Now we resolve ties in favor of the earlier string if 1948 * it happens that c_offset_min has been invalidated, since the 1949 * earlier string may buy us something the later one won't.] 1950 */ 1951 1952 data.substrs[0].str = newSVpvs(""); 1953 data.substrs[1].str = newSVpvs(""); 1954 data.last_found = newSVpvs(""); 1955 data.cur_is_floating = 0; /* initially any found substring is fixed */ 1956 ENTER_with_name("study_chunk"); 1957 SAVEFREESV(data.substrs[0].str); 1958 SAVEFREESV(data.substrs[1].str); 1959 SAVEFREESV(data.last_found); 1960 first = scan; 1961 if (!RExC_rxi->regstclass) { 1962 ssc_init(pRExC_state, &ch_class); 1963 data.start_class = &ch_class; 1964 stclass_flag = SCF_DO_STCLASS_AND; 1965 } else /* XXXX Check for BOUND? */ 1966 stclass_flag = 0; 1967 data.last_closep = &last_close; 1968 data.last_close_opp = &last_close_op; 1969 1970 DEBUG_RExC_seen(); 1971 /* 1972 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/ 1973 * (NO top level branches) 1974 */ 1975 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap, 1976 scan + RExC_size, /* Up to end */ 1977 &data, -1, 0, NULL, 1978 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag 1979 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 1980 0, TRUE); 1981 /* search for "restudy" in this file for a detailed explanation 1982 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */ 1983 1984 1985 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); 1986 1987 1988 if ( RExC_total_parens == 1 && !data.cur_is_floating 1989 && data.last_start_min == 0 && data.last_end > 0 1990 && !RExC_seen_zerolen 1991 && !(RExC_seen & REG_VERBARG_SEEN) 1992 && !(RExC_seen & REG_GPOS_SEEN) 1993 ){ 1994 RExC_rx->extflags |= RXf_CHECK_ALL; 1995 } 1996 scan_commit(pRExC_state, &data,&minlen, 0); 1997 1998 1999 /* XXX this is done in reverse order because that's the way the 2000 * code was before it was parameterised. Don't know whether it 2001 * actually needs doing in reverse order. DAPM */ 2002 for (i = 1; i >= 0; i--) { 2003 longest_length[i] = CHR_SVLEN(data.substrs[i].str); 2004 2005 if ( !( i 2006 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */ 2007 && data.substrs[0].min_offset 2008 == data.substrs[1].min_offset 2009 && SvCUR(data.substrs[0].str) 2010 == SvCUR(data.substrs[1].str) 2011 ) 2012 && S_setup_longest (aTHX_ pRExC_state, 2013 &(RExC_rx->substrs->data[i]), 2014 &(data.substrs[i]), 2015 longest_length[i])) 2016 { 2017 RExC_rx->substrs->data[i].min_offset = 2018 data.substrs[i].min_offset - data.substrs[i].lookbehind; 2019 2020 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset; 2021 /* Don't offset infinity */ 2022 if (data.substrs[i].max_offset < OPTIMIZE_INFTY) 2023 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind; 2024 SvREFCNT_inc_simple_void_NN(data.substrs[i].str); 2025 } 2026 else { 2027 RExC_rx->substrs->data[i].substr = NULL; 2028 RExC_rx->substrs->data[i].utf8_substr = NULL; 2029 longest_length[i] = 0; 2030 } 2031 } 2032 2033 LEAVE_with_name("study_chunk"); 2034 2035 if (RExC_rxi->regstclass 2036 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY)) 2037 RExC_rxi->regstclass = NULL; 2038 2039 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr) 2040 || RExC_rx->substrs->data[0].min_offset) 2041 && stclass_flag 2042 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 2043 && is_ssc_worth_it(pRExC_state, data.start_class)) 2044 { 2045 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); 2046 2047 ssc_finalize(pRExC_state, data.start_class); 2048 2049 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 2050 StructCopy(data.start_class, 2051 (regnode_ssc*)RExC_rxi->data->data[n], 2052 regnode_ssc); 2053 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; 2054 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 2055 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 2056 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); 2057 Perl_re_printf( aTHX_ 2058 "synthetic stclass \"%s\".\n", 2059 SvPVX_const(sv));}); 2060 data.start_class = NULL; 2061 } 2062 2063 /* A temporary algorithm prefers floated substr to fixed one of 2064 * same length to dig more info. */ 2065 i = (longest_length[0] <= longest_length[1]); 2066 RExC_rx->substrs->check_ix = i; 2067 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift; 2068 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr; 2069 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr; 2070 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset; 2071 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset; 2072 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))) 2073 RExC_rx->intflags |= PREGf_NOSCAN; 2074 2075 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) { 2076 RExC_rx->extflags |= RXf_USE_INTUIT; 2077 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8)) 2078 RExC_rx->extflags |= RXf_INTUIT_TAIL; 2079 } 2080 2081 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 2082 if ( (STRLEN)minlen < longest_length[1] ) 2083 minlen= longest_length[1]; 2084 if ( (STRLEN)minlen < longest_length[0] ) 2085 minlen= longest_length[0]; 2086 */ 2087 } 2088 else { 2089 /* Several toplevels. Best we can is to set minlen. */ 2090 SSize_t fake_deltap; 2091 regnode_ssc ch_class; 2092 SSize_t last_close = 0; 2093 regnode *last_close_op = NULL; 2094 2095 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n")); 2096 2097 scan = RExC_rxi->program + 1; 2098 ssc_init(pRExC_state, &ch_class); 2099 data.start_class = &ch_class; 2100 data.last_closep = &last_close; 2101 data.last_close_opp = &last_close_op; 2102 2103 DEBUG_RExC_seen(); 2104 /* 2105 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../ 2106 * (patterns WITH top level branches) 2107 */ 2108 minlen = study_chunk(pRExC_state, 2109 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL, 2110 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied 2111 ? SCF_TRIE_DOING_RESTUDY 2112 : 0), 2113 0, TRUE); 2114 /* search for "restudy" in this file for a detailed explanation 2115 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */ 2116 2117 CHECK_RESTUDY_GOTO_butfirst(NOOP); 2118 2119 RExC_rx->check_substr = NULL; 2120 RExC_rx->check_utf8 = NULL; 2121 RExC_rx->substrs->data[0].substr = NULL; 2122 RExC_rx->substrs->data[0].utf8_substr = NULL; 2123 RExC_rx->substrs->data[1].substr = NULL; 2124 RExC_rx->substrs->data[1].utf8_substr = NULL; 2125 2126 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 2127 && is_ssc_worth_it(pRExC_state, data.start_class)) 2128 { 2129 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); 2130 2131 ssc_finalize(pRExC_state, data.start_class); 2132 2133 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 2134 StructCopy(data.start_class, 2135 (regnode_ssc*)RExC_rxi->data->data[n], 2136 regnode_ssc); 2137 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; 2138 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 2139 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 2140 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); 2141 Perl_re_printf( aTHX_ 2142 "synthetic stclass \"%s\".\n", 2143 SvPVX_const(sv));}); 2144 data.start_class = NULL; 2145 } 2146 } 2147 2148 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { 2149 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; 2150 RExC_rx->maxlen = REG_INFTY; 2151 } 2152 else { 2153 RExC_rx->maxlen = RExC_maxlen; 2154 } 2155 2156 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 2157 the "real" pattern. */ 2158 DEBUG_OPTIMISE_r({ 2159 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n", 2160 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen); 2161 }); 2162 RExC_rx->minlenret = minlen; 2163 if (RExC_rx->minlen < minlen) 2164 RExC_rx->minlen = minlen; 2165 2166 if (RExC_seen & REG_RECURSE_SEEN ) { 2167 RExC_rx->intflags |= PREGf_RECURSE_SEEN; 2168 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *); 2169 } 2170 if (RExC_seen & REG_GPOS_SEEN) 2171 RExC_rx->intflags |= PREGf_GPOS_SEEN; 2172 2173 if (RExC_seen & REG_PESSIMIZE_SEEN) 2174 RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN; 2175 2176 if (RExC_seen & REG_LOOKBEHIND_SEEN) 2177 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the 2178 lookbehind */ 2179 if (pRExC_state->code_blocks) 2180 RExC_rx->extflags |= RXf_EVAL_SEEN; 2181 2182 if (RExC_seen & REG_VERBARG_SEEN) { 2183 RExC_rx->intflags |= PREGf_VERBARG_SEEN; 2184 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ 2185 } 2186 2187 if (RExC_seen & REG_CUTGROUP_SEEN) 2188 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN; 2189 2190 if (pm_flags & PMf_USE_RE_EVAL) 2191 RExC_rx->intflags |= PREGf_USE_RE_EVAL; 2192 2193 if (RExC_paren_names) 2194 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); 2195 else 2196 RXp_PAREN_NAMES(RExC_rx) = NULL; 2197 2198 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED 2199 * so it can be used in pp.c */ 2200 if (RExC_rx->intflags & PREGf_ANCH) 2201 RExC_rx->extflags |= RXf_IS_ANCHORED; 2202 2203 2204 { 2205 /* this is used to identify "special" patterns that might result 2206 * in Perl NOT calling the regex engine and instead doing the match "itself", 2207 * particularly special cases in split//. By having the regex compiler 2208 * do this pattern matching at a regop level (instead of by inspecting the pattern) 2209 * we avoid weird issues with equivalent patterns resulting in different behavior, 2210 * AND we allow non Perl engines to get the same optimizations by the setting the 2211 * flags appropriately - Yves */ 2212 regnode *first = RExC_rxi->program + 1; 2213 U8 fop = OP(first); 2214 regnode *next = NULL; 2215 U8 nop = 0; 2216 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) { 2217 next = REGNODE_AFTER(first); 2218 nop = OP(next); 2219 } 2220 /* It's safe to read through *next only if OP(first) is a regop of 2221 * the right type (not EXACT, for example). 2222 */ 2223 if (REGNODE_TYPE(fop) == NOTHING && nop == END) 2224 RExC_rx->extflags |= RXf_NULL; 2225 else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END) 2226 /* when fop is SBOL first->flags will be true only when it was 2227 * produced by parsing /\A/, and not when parsing /^/. This is 2228 * very important for the split code as there we want to 2229 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m. 2230 * See rt #122761 for more details. -- Yves */ 2231 RExC_rx->extflags |= RXf_START_ONLY; 2232 else if (fop == PLUS 2233 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_ 2234 && OP(regnext(first)) == END) 2235 RExC_rx->extflags |= RXf_WHITE; 2236 else if ( RExC_rx->extflags & RXf_SPLIT 2237 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop)) 2238 && STR_LEN(first) == 1 2239 && *(STRING(first)) == ' ' 2240 && OP(regnext(first)) == END ) 2241 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 2242 2243 } 2244 2245 if (RExC_contains_locale) { 2246 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED; 2247 } 2248 2249#ifdef DEBUGGING 2250 if (RExC_paren_names) { 2251 RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a")); 2252 RExC_rxi->data->data[RExC_rxi->name_list_idx] 2253 = (void*)SvREFCNT_inc(RExC_paren_name_list); 2254 } else 2255#endif 2256 RExC_rxi->name_list_idx = 0; 2257 2258 while ( RExC_recurse_count > 0 ) { 2259 const regnode *scan = RExC_recurse[ --RExC_recurse_count ]; 2260 /* 2261 * This data structure is set up in study_chunk() and is used 2262 * to calculate the distance between a GOSUB regopcode and 2263 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's) 2264 * it refers to. 2265 * 2266 * If for some reason someone writes code that optimises 2267 * away a GOSUB opcode then the assert should be changed to 2268 * an if(scan) to guard the ARG2i_SET() - Yves 2269 * 2270 */ 2271 assert(scan && OP(scan) == GOSUB); 2272 ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan)); 2273 } 2274 if (RExC_logical_total_parens != RExC_total_parens) { 2275 Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32); 2276 /* we rebuild this below */ 2277 Zero(RExC_logical_to_parno, RExC_total_parens, I32); 2278 for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) { 2279 int logical_parno= RExC_parno_to_logical[parno]; 2280 assert(logical_parno); 2281 RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno]; 2282 RExC_logical_to_parno[logical_parno] = parno; 2283 } 2284 RExC_rx->logical_to_parno = RExC_logical_to_parno; 2285 RExC_rx->parno_to_logical = RExC_parno_to_logical; 2286 RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next; 2287 RExC_logical_to_parno = NULL; 2288 RExC_parno_to_logical = NULL; 2289 RExC_parno_to_logical_next = NULL; 2290 } else { 2291 RExC_rx->logical_to_parno = NULL; 2292 RExC_rx->parno_to_logical = NULL; 2293 RExC_rx->parno_to_logical_next = NULL; 2294 } 2295 2296 Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair); 2297 /* assume we don't need to swap parens around before we match */ 2298 DEBUG_TEST_r({ 2299 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n", 2300 (unsigned long)RExC_study_chunk_recursed_count); 2301 }); 2302 DEBUG_DUMP_r({ 2303 DEBUG_RExC_seen(); 2304 Perl_re_printf( aTHX_ "Final program:\n"); 2305 regdump(RExC_rx); 2306 }); 2307 2308 if (RExC_open_parens) { 2309 Safefree(RExC_open_parens); 2310 RExC_open_parens = NULL; 2311 } 2312 if (RExC_close_parens) { 2313 Safefree(RExC_close_parens); 2314 RExC_close_parens = NULL; 2315 } 2316 if (RExC_logical_to_parno) { 2317 Safefree(RExC_logical_to_parno); 2318 RExC_logical_to_parno = NULL; 2319 } 2320 if (RExC_parno_to_logical) { 2321 Safefree(RExC_parno_to_logical); 2322 RExC_parno_to_logical = NULL; 2323 } 2324 2325#ifdef USE_ITHREADS 2326 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated 2327 * by setting the regexp SV to readonly-only instead. If the 2328 * pattern's been recompiled, the USEDness should remain. */ 2329 if (old_re && SvREADONLY(old_re)) 2330 SvREADONLY_on(Rx); 2331#endif 2332 return Rx; 2333} 2334 2335 2336 2337SV* 2338Perl_reg_qr_package(pTHX_ REGEXP * const rx) 2339{ 2340 PERL_ARGS_ASSERT_REG_QR_PACKAGE; 2341 PERL_UNUSED_ARG(rx); 2342 if (0) 2343 return NULL; 2344 else 2345 return newSVpvs("Regexp"); 2346} 2347 2348/* Scans the name of a named buffer from the pattern. 2349 * If flags is REG_RSN_RETURN_NULL returns null. 2350 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 2351 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 2352 * to the parsed name as looked up in the RExC_paren_names hash. 2353 * If there is an error throws a vFAIL().. type exception. 2354 */ 2355 2356#define REG_RSN_RETURN_NULL 0 2357#define REG_RSN_RETURN_NAME 1 2358#define REG_RSN_RETURN_DATA 2 2359 2360STATIC SV* 2361S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) 2362{ 2363 char *name_start = RExC_parse; 2364 SV* sv_name; 2365 2366 PERL_ARGS_ASSERT_REG_SCAN_NAME; 2367 2368 assert (RExC_parse <= RExC_end); 2369 if (RExC_parse == RExC_end) NOOP; 2370 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) { 2371 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by 2372 * using do...while */ 2373 if (UTF) 2374 do { 2375 RExC_parse_inc_utf8(); 2376 } while ( RExC_parse < RExC_end 2377 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end)); 2378 else 2379 do { 2380 RExC_parse_inc_by(1); 2381 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse)); 2382 } else { 2383 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending 2384 character */ 2385 vFAIL("Group name must start with a non-digit word character"); 2386 } 2387 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), 2388 SVs_TEMP | (UTF ? SVf_UTF8 : 0)); 2389 if ( flags == REG_RSN_RETURN_NAME) 2390 return sv_name; 2391 else if (flags==REG_RSN_RETURN_DATA) { 2392 HE *he_str = NULL; 2393 SV *sv_dat = NULL; 2394 if ( ! sv_name ) /* should not happen*/ 2395 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 2396 if (RExC_paren_names) 2397 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 2398 if ( he_str ) 2399 sv_dat = HeVAL(he_str); 2400 if ( ! sv_dat ) { /* Didn't find group */ 2401 2402 /* It might be a forward reference; we can't fail until we 2403 * know, by completing the parse to get all the groups, and 2404 * then reparsing */ 2405 if (ALL_PARENS_COUNTED) { 2406 vFAIL("Reference to nonexistent named group"); 2407 } 2408 else { 2409 REQUIRE_PARENS_PASS; 2410 } 2411 } 2412 return sv_dat; 2413 } 2414 2415 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", 2416 (unsigned long) flags); 2417} 2418 2419#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 2420 if (RExC_lastparse!=RExC_parse) { \ 2421 Perl_re_printf( aTHX_ "%s", \ 2422 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \ 2423 RExC_end - RExC_parse, 16, \ 2424 "", "", \ 2425 PERL_PV_ESCAPE_UNI_DETECT | \ 2426 PERL_PV_PRETTY_ELLIPSES | \ 2427 PERL_PV_PRETTY_LTGT | \ 2428 PERL_PV_ESCAPE_RE | \ 2429 PERL_PV_PRETTY_EXACTSIZE \ 2430 ) \ 2431 ); \ 2432 } else \ 2433 Perl_re_printf( aTHX_ "%16s",""); \ 2434 \ 2435 if (RExC_lastnum!=RExC_emit) \ 2436 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \ 2437 else \ 2438 Perl_re_printf( aTHX_ "|%4s",""); \ 2439 Perl_re_printf( aTHX_ "|%*s%-4s", \ 2440 (int)((depth*2)), "", \ 2441 (funcname) \ 2442 ); \ 2443 RExC_lastnum=RExC_emit; \ 2444 RExC_lastparse=RExC_parse; \ 2445}) 2446 2447 2448 2449#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 2450 DEBUG_PARSE_MSG((funcname)); \ 2451 Perl_re_printf( aTHX_ "%4s","\n"); \ 2452}) 2453#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\ 2454 DEBUG_PARSE_MSG((funcname)); \ 2455 Perl_re_printf( aTHX_ fmt "\n",args); \ 2456}) 2457 2458 2459STATIC void 2460S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) 2461{ 2462 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' 2463 * constructs, and updates RExC_flags with them. On input, RExC_parse 2464 * should point to the first flag; it is updated on output to point to the 2465 * final ')' or ':'. There needs to be at least one flag, or this will 2466 * abort */ 2467 2468 /* for (?g), (?gc), and (?o) warnings; warning 2469 about (?c) will warn about (?g) -- japhy */ 2470 2471#define WASTED_O 0x01 2472#define WASTED_G 0x02 2473#define WASTED_C 0x04 2474#define WASTED_GC (WASTED_G|WASTED_C) 2475 I32 wastedflags = 0x00; 2476 U32 posflags = 0, negflags = 0; 2477 U32 *flagsp = &posflags; 2478 char has_charset_modifier = '\0'; 2479 regex_charset cs; 2480 bool has_use_defaults = FALSE; 2481 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ 2482 int x_mod_count = 0; 2483 2484 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; 2485 2486 /* '^' as an initial flag sets certain defaults */ 2487 if (UCHARAT(RExC_parse) == '^') { 2488 RExC_parse_inc_by(1); 2489 has_use_defaults = TRUE; 2490 STD_PMMOD_FLAGS_CLEAR(&RExC_flags); 2491 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS) 2492 ? REGEX_UNICODE_CHARSET 2493 : REGEX_DEPENDS_CHARSET; 2494 set_regex_charset(&RExC_flags, cs); 2495 } 2496 else { 2497 cs = get_regex_charset(RExC_flags); 2498 if ( cs == REGEX_DEPENDS_CHARSET 2499 && (toUSE_UNI_CHARSET_NOT_DEPENDS)) 2500 { 2501 cs = REGEX_UNICODE_CHARSET; 2502 } 2503 } 2504 2505 while (RExC_parse < RExC_end) { 2506 /* && memCHRs("iogcmsx", *RExC_parse) */ 2507 /* (?g), (?gc) and (?o) are useless here 2508 and must be globally applied -- japhy */ 2509 if ((RExC_pm_flags & PMf_WILDCARD)) { 2510 if (flagsp == & negflags) { 2511 if (*RExC_parse == 'm') { 2512 RExC_parse_inc_by(1); 2513 /* diag_listed_as: Use of %s is not allowed in Unicode 2514 property wildcard subpatterns in regex; marked by <-- 2515 HERE in m/%s/ */ 2516 vFAIL("Use of modifier '-m' is not allowed in Unicode" 2517 " property wildcard subpatterns"); 2518 } 2519 } 2520 else { 2521 if (*RExC_parse == 's') { 2522 goto modifier_illegal_in_wildcard; 2523 } 2524 } 2525 } 2526 2527 switch (*RExC_parse) { 2528 2529 /* Code for the imsxn flags */ 2530 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); 2531 2532 case LOCALE_PAT_MOD: 2533 if (has_charset_modifier) { 2534 goto excess_modifier; 2535 } 2536 else if (flagsp == &negflags) { 2537 goto neg_modifier; 2538 } 2539 cs = REGEX_LOCALE_CHARSET; 2540 has_charset_modifier = LOCALE_PAT_MOD; 2541 break; 2542 case UNICODE_PAT_MOD: 2543 if (has_charset_modifier) { 2544 goto excess_modifier; 2545 } 2546 else if (flagsp == &negflags) { 2547 goto neg_modifier; 2548 } 2549 cs = REGEX_UNICODE_CHARSET; 2550 has_charset_modifier = UNICODE_PAT_MOD; 2551 break; 2552 case ASCII_RESTRICT_PAT_MOD: 2553 if (flagsp == &negflags) { 2554 goto neg_modifier; 2555 } 2556 if (has_charset_modifier) { 2557 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { 2558 goto excess_modifier; 2559 } 2560 /* Doubled modifier implies more restricted */ 2561 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; 2562 } 2563 else { 2564 cs = REGEX_ASCII_RESTRICTED_CHARSET; 2565 } 2566 has_charset_modifier = ASCII_RESTRICT_PAT_MOD; 2567 break; 2568 case DEPENDS_PAT_MOD: 2569 if (has_use_defaults) { 2570 goto fail_modifiers; 2571 } 2572 else if (flagsp == &negflags) { 2573 goto neg_modifier; 2574 } 2575 else if (has_charset_modifier) { 2576 goto excess_modifier; 2577 } 2578 2579 /* The dual charset means unicode semantics if the 2580 * pattern (or target, not known until runtime) are 2581 * utf8, or something in the pattern indicates unicode 2582 * semantics */ 2583 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS) 2584 ? REGEX_UNICODE_CHARSET 2585 : REGEX_DEPENDS_CHARSET; 2586 has_charset_modifier = DEPENDS_PAT_MOD; 2587 break; 2588 excess_modifier: 2589 RExC_parse_inc_by(1); 2590 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { 2591 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); 2592 } 2593 else if (has_charset_modifier == *(RExC_parse - 1)) { 2594 vFAIL2("Regexp modifier \"%c\" may not appear twice", 2595 *(RExC_parse - 1)); 2596 } 2597 else { 2598 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); 2599 } 2600 NOT_REACHED; /*NOTREACHED*/ 2601 neg_modifier: 2602 RExC_parse_inc_by(1); 2603 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", 2604 *(RExC_parse - 1)); 2605 NOT_REACHED; /*NOTREACHED*/ 2606 case GLOBAL_PAT_MOD: /* 'g' */ 2607 if (RExC_pm_flags & PMf_WILDCARD) { 2608 goto modifier_illegal_in_wildcard; 2609 } 2610 /*FALLTHROUGH*/ 2611 case ONCE_PAT_MOD: /* 'o' */ 2612 if (ckWARN(WARN_REGEXP)) { 2613 const I32 wflagbit = *RExC_parse == 'o' 2614 ? WASTED_O 2615 : WASTED_G; 2616 if (! (wastedflags & wflagbit) ) { 2617 wastedflags |= wflagbit; 2618 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 2619 vWARN5( 2620 RExC_parse + 1, 2621 "Useless (%s%c) - %suse /%c modifier", 2622 flagsp == &negflags ? "?-" : "?", 2623 *RExC_parse, 2624 flagsp == &negflags ? "don't " : "", 2625 *RExC_parse 2626 ); 2627 } 2628 } 2629 break; 2630 2631 case CONTINUE_PAT_MOD: /* 'c' */ 2632 if (RExC_pm_flags & PMf_WILDCARD) { 2633 goto modifier_illegal_in_wildcard; 2634 } 2635 if (ckWARN(WARN_REGEXP)) { 2636 if (! (wastedflags & WASTED_C) ) { 2637 wastedflags |= WASTED_GC; 2638 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 2639 vWARN3( 2640 RExC_parse + 1, 2641 "Useless (%sc) - %suse /gc modifier", 2642 flagsp == &negflags ? "?-" : "?", 2643 flagsp == &negflags ? "don't " : "" 2644 ); 2645 } 2646 } 2647 break; 2648 case KEEPCOPY_PAT_MOD: /* 'p' */ 2649 if (RExC_pm_flags & PMf_WILDCARD) { 2650 goto modifier_illegal_in_wildcard; 2651 } 2652 if (flagsp == &negflags) { 2653 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); 2654 } else { 2655 *flagsp |= RXf_PMf_KEEPCOPY; 2656 } 2657 break; 2658 case '-': 2659 /* A flag is a default iff it is following a minus, so 2660 * if there is a minus, it means will be trying to 2661 * re-specify a default which is an error */ 2662 if (has_use_defaults || flagsp == &negflags) { 2663 goto fail_modifiers; 2664 } 2665 flagsp = &negflags; 2666 wastedflags = 0; /* reset so (?g-c) warns twice */ 2667 x_mod_count = 0; 2668 break; 2669 case ':': 2670 case ')': 2671 2672 if ( (RExC_pm_flags & PMf_WILDCARD) 2673 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET) 2674 { 2675 RExC_parse_inc_by(1); 2676 /* diag_listed_as: Use of %s is not allowed in Unicode 2677 property wildcard subpatterns in regex; marked by <-- 2678 HERE in m/%s/ */ 2679 vFAIL2("Use of modifier '%c' is not allowed in Unicode" 2680 " property wildcard subpatterns", 2681 has_charset_modifier); 2682 } 2683 2684 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) { 2685 negflags |= RXf_PMf_EXTENDED_MORE; 2686 } 2687 RExC_flags |= posflags; 2688 2689 if (negflags & RXf_PMf_EXTENDED) { 2690 negflags |= RXf_PMf_EXTENDED_MORE; 2691 } 2692 RExC_flags &= ~negflags; 2693 set_regex_charset(&RExC_flags, cs); 2694 2695 return; 2696 default: 2697 fail_modifiers: 2698 RExC_parse_inc_if_char(); 2699 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 2700 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized", 2701 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 2702 NOT_REACHED; /*NOTREACHED*/ 2703 } 2704 2705 RExC_parse_inc(); 2706 } 2707 2708 vFAIL("Sequence (?... not terminated"); 2709 2710 modifier_illegal_in_wildcard: 2711 RExC_parse_inc_by(1); 2712 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard 2713 subpatterns in regex; marked by <-- HERE in m/%s/ */ 2714 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard" 2715 " subpatterns", *(RExC_parse - 1)); 2716} 2717 2718/* 2719 - reg - regular expression, i.e. main body or parenthesized thing 2720 * 2721 * Caller must absorb opening parenthesis. 2722 * 2723 * Combining parenthesis handling with the base level of regular expression 2724 * is a trifle forced, but the need to tie the tails of the branches to what 2725 * follows makes it hard to avoid. 2726 */ 2727 2728STATIC regnode_offset 2729S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, 2730 I32 *flagp, 2731 char * backref_parse_start, 2732 char ch 2733 ) 2734{ 2735 regnode_offset ret; 2736 char* name_start = RExC_parse; 2737 U32 num = 0; 2738 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); 2739 DECLARE_AND_GET_RE_DEBUG_FLAGS; 2740 2741 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF; 2742 2743 if (RExC_parse != name_start && ch == '}') { 2744 while (isBLANK(*RExC_parse)) { 2745 RExC_parse_inc_by(1); 2746 } 2747 } 2748 if (RExC_parse == name_start || *RExC_parse != ch) { 2749 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 2750 vFAIL2("Sequence %.3s... not terminated", backref_parse_start); 2751 } 2752 2753 if (sv_dat) { 2754 num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); 2755 RExC_rxi->data->data[num]=(void*)sv_dat; 2756 SvREFCNT_inc_simple_void_NN(sv_dat); 2757 } 2758 RExC_sawback = 1; 2759 ret = reg2node(pRExC_state, 2760 ((! FOLD) 2761 ? REFN 2762 : (ASCII_FOLD_RESTRICTED) 2763 ? REFFAN 2764 : (AT_LEAST_UNI_SEMANTICS) 2765 ? REFFUN 2766 : (LOC) 2767 ? REFFLN 2768 : REFFN), 2769 num, RExC_nestroot); 2770 if (RExC_nestroot && num >= (U32)RExC_nestroot) 2771 FLAGS(REGNODE_p(ret)) = VOLATILE_REF; 2772 *flagp |= HASWIDTH; 2773 2774 nextchar(pRExC_state); 2775 return ret; 2776} 2777 2778/* reg_la_NOTHING() 2779 * 2780 * Maybe parse a parenthesized lookaround construct that is equivalent to a 2781 * NOTHING regop when the construct is empty. 2782 * 2783 * Calls skip_to_be_ignored_text() before checking if the construct is empty. 2784 * 2785 * Checks for unterminated constructs and throws a "not terminated" error 2786 * with the appropriate type if necessary 2787 * 2788 * Assuming it does not throw an exception increments RExC_seen_zerolen. 2789 * 2790 * If the construct is empty generates a NOTHING op and returns its 2791 * regnode_offset, which the caller would then return to its caller. 2792 * 2793 * If the construct is not empty increments RExC_in_lookaround, and turns 2794 * on any flags provided in RExC_seen, and then returns 0 to signify 2795 * that parsing should continue. 2796 * 2797 * PS: I would have called this reg_parse_lookaround_NOTHING() but then 2798 * any use of it would have had to be broken onto multiple lines, hence 2799 * the abbreviation. 2800 */ 2801STATIC regnode_offset 2802S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags, 2803 const char *type) 2804{ 2805 2806 PERL_ARGS_ASSERT_REG_LA_NOTHING; 2807 2808 /* false below so we do not force /x */ 2809 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE); 2810 2811 if (RExC_parse >= RExC_end) 2812 vFAIL2("Sequence (%s... not terminated", type); 2813 2814 /* Always increment as NOTHING regops are zerolen */ 2815 RExC_seen_zerolen++; 2816 2817 if (*RExC_parse == ')') { 2818 regnode_offset ret= reg_node(pRExC_state, NOTHING); 2819 nextchar(pRExC_state); 2820 return ret; 2821 } 2822 2823 RExC_seen |= flags; 2824 RExC_in_lookaround++; 2825 return 0; /* keep parsing! */ 2826} 2827 2828/* reg_la_OPFAIL() 2829 * 2830 * Maybe parse a parenthesized lookaround construct that is equivalent to a 2831 * OPFAIL regop when the construct is empty. 2832 * 2833 * Calls skip_to_be_ignored_text() before checking if the construct is empty. 2834 * 2835 * Checks for unterminated constructs and throws a "not terminated" error 2836 * if necessary. 2837 * 2838 * If the construct is empty generates an OPFAIL op and returns its 2839 * regnode_offset which the caller should then return to its caller. 2840 * 2841 * If the construct is not empty increments RExC_in_lookaround, and also 2842 * increments RExC_seen_zerolen, and turns on the flags provided in 2843 * RExC_seen, and then returns 0 to signify that parsing should continue. 2844 * 2845 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then 2846 * any use of it would have had to be broken onto multiple lines, hence 2847 * the abbreviation. 2848 */ 2849 2850STATIC regnode_offset 2851S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags, 2852 const char *type) 2853{ 2854 2855 PERL_ARGS_ASSERT_REG_LA_OPFAIL; 2856 2857 /* FALSE so we don't force to /x below */; 2858 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE); 2859 2860 if (RExC_parse >= RExC_end) 2861 vFAIL2("Sequence (%s... not terminated", type); 2862 2863 if (*RExC_parse == ')') { 2864 regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0); 2865 nextchar(pRExC_state); 2866 return ret; /* return produced regop */ 2867 } 2868 2869 /* only increment zerolen *after* we check if we produce an OPFAIL 2870 * as an OPFAIL does not match a zero length construct, as it 2871 * does not match ever. */ 2872 RExC_seen_zerolen++; 2873 RExC_seen |= flags; 2874 RExC_in_lookaround++; 2875 return 0; /* keep parsing! */ 2876} 2877 2878/* Below are the main parsing routines. 2879 * 2880 * S_reg() parses a whole pattern or subpattern. It itself handles things 2881 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each 2882 * alternation '|' in the '...' pattern. 2883 * S_regbranch() effectively implements the concatenation operator, handling 2884 * one alternative of '|', repeatedly calling S_regpiece on each 2885 * segment of the input. 2886 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input, 2887 * and then adds any quantifier for that chunk. 2888 * S_regatom() parses the next chunk of the input, returning when it 2889 * determines it has found a complete atomic chunk. The chunk may 2890 * be a nested subpattern, in which case S_reg is called 2891 * recursively 2892 * 2893 * The functions generate regnodes as they go along, appending each to the 2894 * pattern data structure so far. They return the offset of the current final 2895 * node into that structure, or 0 on failure. 2896 * 2897 * There are three parameters common to all of them: 2898 * pRExC_state is a structure with much information about the current 2899 * state of the parse. It's easy to add new elements to 2900 * convey new information, but beware that an error return may 2901 * require clearing the element. 2902 * flagp is a pointer to bit flags set in a lower level to pass up 2903 * to higher levels information, such as the cause of a 2904 * failure, or some characteristic about the generated node 2905 * depth is roughly the recursion depth, mostly unused except for 2906 * pretty printing debugging info. 2907 * 2908 * There are ancillary functions that these may farm work out to, using the 2909 * same parameters. 2910 * 2911 * The protocol for handling flags is that each function will, before 2912 * returning, add into *flagp the flags it needs to pass up. Each function has 2913 * a second flags variable, typically named 'flags', which it sets and clears 2914 * at will. Flag bits in it are used in that function, and it calls the next 2915 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return, 2916 * 'flags' will contain whatever it had before the call, plus whatever that 2917 * function passed up. If it wants to pass any of these up to its caller, it 2918 * has to add them to its *flagp. This means that it takes extra steps to keep 2919 * passing a flag upwards, and otherwise the flag bit is cleared for higher 2920 * functions. 2921 */ 2922 2923/* On success, returns the offset at which any next node should be placed into 2924 * the regex engine program being compiled. 2925 * 2926 * Returns 0 otherwise, with *flagp set to indicate why: 2927 * TRYAGAIN at the end of (?) that only sets flags. 2928 * RESTART_PARSE if the parse needs to be restarted, or'd with 2929 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8. 2930 * Otherwise would only return 0 if regbranch() returns 0, which cannot 2931 * happen. */ 2932STATIC regnode_offset 2933S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) 2934 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. 2935 * 2 is like 1, but indicates that nextchar() has been called to advance 2936 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and 2937 * this flag alerts us to the need to check for that */ 2938{ 2939 regnode_offset ret = 0; /* Will be the head of the group. */ 2940 regnode_offset br; 2941 regnode_offset lastbr; 2942 regnode_offset ender = 0; 2943 I32 logical_parno = 0; 2944 I32 parno = 0; 2945 I32 flags; 2946 U32 oregflags = RExC_flags; 2947 bool have_branch = 0; 2948 bool is_open = 0; 2949 I32 freeze_paren = 0; 2950 I32 after_freeze = 0; 2951 I32 num; /* numeric backreferences */ 2952 SV * max_open; /* Max number of unclosed parens */ 2953 I32 was_in_lookaround = RExC_in_lookaround; 2954 I32 fake_eval = 0; /* matches paren */ 2955 2956 /* The difference between the following variables can be seen with * 2957 * the broken pattern /(?:foo/ where segment_parse_start will point * 2958 * at the 'f', and reg_parse_start will point at the '(' */ 2959 2960 /* the following is used for unmatched '(' errors */ 2961 char * const reg_parse_start = RExC_parse; 2962 2963 /* the following is used to track where various segments of 2964 * the pattern that we parse out started. */ 2965 char * segment_parse_start = RExC_parse; 2966 2967 DECLARE_AND_GET_RE_DEBUG_FLAGS; 2968 2969 PERL_ARGS_ASSERT_REG; 2970 DEBUG_PARSE("reg "); 2971 2972 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD); 2973 assert(max_open); 2974 if (!SvIOK(max_open)) { 2975 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT); 2976 } 2977 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each 2978 open paren */ 2979 vFAIL("Too many nested open parens"); 2980 } 2981 2982 *flagp = 0; /* Initialize. */ 2983 2984 /* Having this true makes it feasible to have a lot fewer tests for the 2985 * parse pointer being in scope. For example, we can write 2986 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1); 2987 * instead of 2988 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1); 2989 */ 2990 assert(*RExC_end == '\0'); 2991 2992 /* Make an OPEN node, if parenthesized. */ 2993 if (paren) { 2994 2995 /* Under /x, space and comments can be gobbled up between the '(' and 2996 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such 2997 * intervening space, as the sequence is a token, and a token should be 2998 * indivisible */ 2999 bool has_intervening_patws = (paren == 2) 3000 && *(RExC_parse - 1) != '('; 3001 3002 if (RExC_parse >= RExC_end) { 3003 vFAIL("Unmatched ("); 3004 } 3005 3006 if (paren == 'r') { /* Atomic script run */ 3007 paren = '>'; 3008 goto parse_rest; 3009 } 3010 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */ 3011 if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */ 3012 fake_eval = '{'; 3013 goto handle_qmark; 3014 } 3015 3016 char *start_verb = RExC_parse + 1; 3017 STRLEN verb_len; 3018 char *start_arg = NULL; 3019 unsigned char op = 0; 3020 int arg_required = 0; 3021 int internal_argval = -1; /* if > -1 no argument allowed */ 3022 bool has_upper = FALSE; 3023 U32 seen_flag_set = 0; /* RExC_seen flags we must set */ 3024 3025 if (has_intervening_patws) { 3026 RExC_parse_inc_by(1); /* past the '*' */ 3027 3028 /* For strict backwards compatibility, don't change the message 3029 * now that we also have lowercase operands */ 3030 if (isUPPER(*RExC_parse)) { 3031 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); 3032 } 3033 else { 3034 vFAIL("In '(*...)', the '(' and '*' must be adjacent"); 3035 } 3036 } 3037 while (RExC_parse < RExC_end && *RExC_parse != ')' ) { 3038 if ( *RExC_parse == ':' ) { 3039 start_arg = RExC_parse + 1; 3040 break; 3041 } 3042 else if (! UTF) { 3043 if (isUPPER(*RExC_parse)) { 3044 has_upper = TRUE; 3045 } 3046 RExC_parse_inc_by(1); 3047 } 3048 else { 3049 RExC_parse_inc_utf8(); 3050 } 3051 } 3052 verb_len = RExC_parse - start_verb; 3053 if ( start_arg ) { 3054 if (RExC_parse >= RExC_end) { 3055 goto unterminated_verb_pattern; 3056 } 3057 3058 RExC_parse_inc(); 3059 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) { 3060 RExC_parse_inc(); 3061 } 3062 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) { 3063 unterminated_verb_pattern: 3064 if (has_upper) { 3065 vFAIL("Unterminated verb pattern argument"); 3066 } 3067 else { 3068 vFAIL("Unterminated '(*...' argument"); 3069 } 3070 } 3071 } else { 3072 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) { 3073 if (has_upper) { 3074 vFAIL("Unterminated verb pattern"); 3075 } 3076 else { 3077 vFAIL("Unterminated '(*...' construct"); 3078 } 3079 } 3080 } 3081 3082 /* Here, we know that RExC_parse < RExC_end */ 3083 3084 switch ( *start_verb ) { 3085 case 'A': /* (*ACCEPT) */ 3086 if ( memEQs(start_verb, verb_len,"ACCEPT") ) { 3087 op = ACCEPT; 3088 internal_argval = RExC_nestroot; 3089 } 3090 break; 3091 case 'C': /* (*COMMIT) */ 3092 if ( memEQs(start_verb, verb_len,"COMMIT") ) 3093 op = COMMIT; 3094 break; 3095 case 'F': /* (*FAIL) */ 3096 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) { 3097 op = OPFAIL; 3098 } 3099 break; 3100 case ':': /* (*:NAME) */ 3101 case 'M': /* (*MARK:NAME) */ 3102 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) { 3103 op = MARKPOINT; 3104 arg_required = 1; 3105 } 3106 break; 3107 case 'P': /* (*PRUNE) */ 3108 if ( memEQs(start_verb, verb_len,"PRUNE") ) 3109 op = PRUNE; 3110 break; 3111 case 'S': /* (*SKIP) */ 3112 if ( memEQs(start_verb, verb_len,"SKIP") ) 3113 op = SKIP; 3114 break; 3115 case 'T': /* (*THEN) */ 3116 /* [19:06] <TimToady> :: is then */ 3117 if ( memEQs(start_verb, verb_len,"THEN") ) { 3118 op = CUTGROUP; 3119 RExC_seen |= REG_CUTGROUP_SEEN; 3120 } 3121 break; 3122 case 'a': 3123 if ( memEQs(start_verb, verb_len, "asr") 3124 || memEQs(start_verb, verb_len, "atomic_script_run")) 3125 { 3126 paren = 'r'; /* Mnemonic: recursed run */ 3127 goto script_run; 3128 } 3129 else if (memEQs(start_verb, verb_len, "atomic")) { 3130 paren = 't'; /* AtOMIC */ 3131 goto alpha_assertions; 3132 } 3133 break; 3134 case 'p': 3135 if ( memEQs(start_verb, verb_len, "plb") 3136 || memEQs(start_verb, verb_len, "positive_lookbehind")) 3137 { 3138 paren = 'b'; 3139 goto lookbehind_alpha_assertions; 3140 } 3141 else if ( memEQs(start_verb, verb_len, "pla") 3142 || memEQs(start_verb, verb_len, "positive_lookahead")) 3143 { 3144 paren = 'a'; 3145 goto alpha_assertions; 3146 } 3147 break; 3148 case 'n': 3149 if ( memEQs(start_verb, verb_len, "nlb") 3150 || memEQs(start_verb, verb_len, "negative_lookbehind")) 3151 { 3152 paren = 'B'; 3153 goto lookbehind_alpha_assertions; 3154 } 3155 else if ( memEQs(start_verb, verb_len, "nla") 3156 || memEQs(start_verb, verb_len, "negative_lookahead")) 3157 { 3158 paren = 'A'; 3159 goto alpha_assertions; 3160 } 3161 break; 3162 case 's': 3163 if ( memEQs(start_verb, verb_len, "sr") 3164 || memEQs(start_verb, verb_len, "script_run")) 3165 { 3166 regnode_offset atomic; 3167 3168 paren = 's'; 3169 3170 script_run: 3171 3172 /* This indicates Unicode rules. */ 3173 REQUIRE_UNI_RULES(flagp, 0); 3174 3175 if (! start_arg) { 3176 goto no_colon; 3177 } 3178 3179 RExC_parse_set(start_arg); 3180 3181 if (RExC_in_script_run) { 3182 3183 /* Nested script runs are treated as no-ops, because 3184 * if the nested one fails, the outer one must as 3185 * well. It could fail sooner, and avoid (??{} with 3186 * side effects, but that is explicitly documented as 3187 * undefined behavior. */ 3188 3189 ret = 0; 3190 3191 if (paren == 's') { 3192 paren = ':'; 3193 goto parse_rest; 3194 } 3195 3196 /* But, the atomic part of a nested atomic script run 3197 * isn't a no-op, but can be treated just like a '(?>' 3198 * */ 3199 paren = '>'; 3200 goto parse_rest; 3201 } 3202 3203 if (paren == 's') { 3204 /* Here, we're starting a new regular script run */ 3205 ret = reg_node(pRExC_state, SROPEN); 3206 RExC_in_script_run = 1; 3207 is_open = 1; 3208 goto parse_rest; 3209 } 3210 3211 /* Here, we are starting an atomic script run. This is 3212 * handled by recursing to deal with the atomic portion 3213 * separately, enclosed in SROPEN ... SRCLOSE nodes */ 3214 3215 ret = reg_node(pRExC_state, SROPEN); 3216 3217 RExC_in_script_run = 1; 3218 3219 atomic = reg(pRExC_state, 'r', &flags, depth); 3220 if (flags & (RESTART_PARSE|NEED_UTF8)) { 3221 *flagp = flags & (RESTART_PARSE|NEED_UTF8); 3222 return 0; 3223 } 3224 3225 if (! REGTAIL(pRExC_state, ret, atomic)) { 3226 REQUIRE_BRANCHJ(flagp, 0); 3227 } 3228 3229 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state, 3230 SRCLOSE))) 3231 { 3232 REQUIRE_BRANCHJ(flagp, 0); 3233 } 3234 3235 RExC_in_script_run = 0; 3236 return ret; 3237 } 3238 3239 break; 3240 3241 lookbehind_alpha_assertions: 3242 seen_flag_set = REG_LOOKBEHIND_SEEN; 3243 /*FALLTHROUGH*/ 3244 3245 alpha_assertions: 3246 3247 if ( !start_arg ) { 3248 goto no_colon; 3249 } 3250 3251 if ( RExC_parse == start_arg ) { 3252 if ( paren == 'A' || paren == 'B' ) { 3253 /* An empty negative lookaround assertion is failure. 3254 * See also: S_reg_la_OPFAIL() */ 3255 3256 /* Note: OPFAIL is *not* zerolen. */ 3257 ret = reg1node(pRExC_state, OPFAIL, 0); 3258 nextchar(pRExC_state); 3259 return ret; 3260 } 3261 else 3262 if ( paren == 'a' || paren == 'b' ) { 3263 /* An empty positive lookaround assertion is success. 3264 * See also: S_reg_la_NOTHING() */ 3265 3266 /* Note: NOTHING is zerolen, so increment here */ 3267 RExC_seen_zerolen++; 3268 ret = reg_node(pRExC_state, NOTHING); 3269 nextchar(pRExC_state); 3270 return ret; 3271 } 3272 } 3273 3274 RExC_seen_zerolen++; 3275 RExC_in_lookaround++; 3276 RExC_seen |= seen_flag_set; 3277 3278 RExC_parse_set(start_arg); 3279 goto parse_rest; 3280 3281 no_colon: 3282 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'", 3283 UTF8fARG(UTF, verb_len, start_verb)); 3284 NOT_REACHED; /*NOTREACHED*/ 3285 3286 } /* End of switch */ 3287 if ( ! op ) { 3288 RExC_parse_inc_safe(); 3289 if (has_upper || verb_len == 0) { 3290 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'", 3291 UTF8fARG(UTF, verb_len, start_verb)); 3292 } 3293 else { 3294 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'", 3295 UTF8fARG(UTF, verb_len, start_verb)); 3296 } 3297 } 3298 if ( RExC_parse == start_arg ) { 3299 start_arg = NULL; 3300 } 3301 if ( arg_required && !start_arg ) { 3302 vFAIL3( "Verb pattern '%.*s' has a mandatory argument", 3303 (int) verb_len, start_verb); 3304 } 3305 if (internal_argval == -1) { 3306 ret = reg1node(pRExC_state, op, 0); 3307 } else { 3308 ret = reg2node(pRExC_state, op, 0, internal_argval); 3309 } 3310 RExC_seen |= REG_VERBARG_SEEN; 3311 if (start_arg) { 3312 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); 3313 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state, 3314 STR_WITH_LEN("S")); 3315 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv; 3316 FLAGS(REGNODE_p(ret)) = 1; 3317 } else { 3318 FLAGS(REGNODE_p(ret)) = 0; 3319 } 3320 if ( internal_argval != -1 ) 3321 ARG2i_SET(REGNODE_p(ret), internal_argval); 3322 nextchar(pRExC_state); 3323 return ret; 3324 } 3325 else if (*RExC_parse == '?') { /* (?...) */ 3326 handle_qmark: 3327 ; /* make sure the label has a statement associated with it*/ 3328 bool is_logical = 0, is_optimistic = 0; 3329 const char * const seqstart = RExC_parse; 3330 const char * endptr; 3331 const char non_existent_group_msg[] 3332 = "Reference to nonexistent group"; 3333 const char impossible_group[] = "Invalid reference to group"; 3334 3335 if (has_intervening_patws) { 3336 RExC_parse_inc_by(1); 3337 vFAIL("In '(?...)', the '(' and '?' must be adjacent"); 3338 } 3339 3340 RExC_parse_inc_by(1); /* past the '?' */ 3341 if (!fake_eval) { 3342 paren = *RExC_parse; /* might be a trailing NUL, if not 3343 well-formed */ 3344 is_optimistic = 0; 3345 } else { 3346 is_optimistic = 1; 3347 paren = fake_eval; 3348 } 3349 RExC_parse_inc(); 3350 if (RExC_parse > RExC_end) { 3351 paren = '\0'; 3352 } 3353 ret = 0; /* For look-ahead/behind. */ 3354 switch (paren) { 3355 3356 case 'P': /* (?P...) variants for those used to PCRE/Python */ 3357 paren = *RExC_parse; 3358 if ( paren == '<') { /* (?P<...>) named capture */ 3359 RExC_parse_inc_by(1); 3360 if (RExC_parse >= RExC_end) { 3361 vFAIL("Sequence (?P<... not terminated"); 3362 } 3363 goto named_capture; 3364 } 3365 else if (paren == '>') { /* (?P>name) named recursion */ 3366 RExC_parse_inc_by(1); 3367 if (RExC_parse >= RExC_end) { 3368 vFAIL("Sequence (?P>... not terminated"); 3369 } 3370 goto named_recursion; 3371 } 3372 else if (paren == '=') { /* (?P=...) named backref */ 3373 RExC_parse_inc_by(1); 3374 return handle_named_backref(pRExC_state, flagp, 3375 segment_parse_start, ')'); 3376 } 3377 RExC_parse_inc_if_char(); 3378 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 3379 vFAIL3("Sequence (%.*s...) not recognized", 3380 (int) (RExC_parse - seqstart), seqstart); 3381 NOT_REACHED; /*NOTREACHED*/ 3382 case '<': /* (?<...) */ 3383 /* If you want to support (?<*...), first reconcile with GH #17363 */ 3384 if (*RExC_parse == '!') { 3385 paren = ','; /* negative lookbehind (?<! ... ) */ 3386 RExC_parse_inc_by(1); 3387 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!"))) 3388 return ret; 3389 break; 3390 } 3391 else 3392 if (*RExC_parse == '=') { 3393 /* paren = '<' - negative lookahead (?<= ... ) */ 3394 RExC_parse_inc_by(1); 3395 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<="))) 3396 return ret; 3397 break; 3398 } 3399 else 3400 named_capture: 3401 { /* (?<...>) */ 3402 char *name_start; 3403 SV *svname; 3404 paren= '>'; 3405 /* FALLTHROUGH */ 3406 case '\'': /* (?'...') */ 3407 name_start = RExC_parse; 3408 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME); 3409 if ( RExC_parse == name_start 3410 || RExC_parse >= RExC_end 3411 || *RExC_parse != paren) 3412 { 3413 vFAIL2("Sequence (?%c... not terminated", 3414 paren=='>' ? '<' : (char) paren); 3415 } 3416 { 3417 HE *he_str; 3418 SV *sv_dat = NULL; 3419 if (!svname) /* shouldn't happen */ 3420 Perl_croak(aTHX_ 3421 "panic: reg_scan_name returned NULL"); 3422 if (!RExC_paren_names) { 3423 RExC_paren_names= newHV(); 3424 sv_2mortal(MUTABLE_SV(RExC_paren_names)); 3425#ifdef DEBUGGING 3426 RExC_paren_name_list= newAV(); 3427 sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); 3428#endif 3429 } 3430 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 3431 if ( he_str ) 3432 sv_dat = HeVAL(he_str); 3433 if ( ! sv_dat ) { 3434 /* croak baby croak */ 3435 Perl_croak(aTHX_ 3436 "panic: paren_name hash element allocation failed"); 3437 } else if ( SvPOK(sv_dat) ) { 3438 /* (?|...) can mean we have dupes so scan to check 3439 its already been stored. Maybe a flag indicating 3440 we are inside such a construct would be useful, 3441 but the arrays are likely to be quite small, so 3442 for now we punt -- dmq */ 3443 IV count = SvIV(sv_dat); 3444 I32 *pv = (I32*)SvPVX(sv_dat); 3445 IV i; 3446 for ( i = 0 ; i < count ; i++ ) { 3447 if ( pv[i] == RExC_npar ) { 3448 count = 0; 3449 break; 3450 } 3451 } 3452 if ( count ) { 3453 pv = (I32*)SvGROW(sv_dat, 3454 SvCUR(sv_dat) + sizeof(I32)+1); 3455 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 3456 pv[count] = RExC_npar; 3457 SvIV_set(sv_dat, SvIVX(sv_dat) + 1); 3458 } 3459 } else { 3460 (void)SvUPGRADE(sv_dat, SVt_PVNV); 3461 sv_setpvn(sv_dat, (char *)&(RExC_npar), 3462 sizeof(I32)); 3463 SvIOK_on(sv_dat); 3464 SvIV_set(sv_dat, 1); 3465 } 3466#ifdef DEBUGGING 3467 /* No, this does not cause a memory leak under 3468 * debugging. RExC_paren_name_list is freed later 3469 * on in the dump process. - Yves 3470 */ 3471 if (!av_store(RExC_paren_name_list, 3472 RExC_npar, SvREFCNT_inc_NN(svname))) 3473 SvREFCNT_dec_NN(svname); 3474#endif 3475 3476 } 3477 nextchar(pRExC_state); 3478 paren = 1; 3479 goto capturing_parens; 3480 } 3481 NOT_REACHED; /*NOTREACHED*/ 3482 case '=': /* (?=...) */ 3483 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?="))) 3484 return ret; 3485 break; 3486 case '!': /* (?!...) */ 3487 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!"))) 3488 return ret; 3489 break; 3490 case '|': /* (?|...) */ 3491 /* branch reset, behave like a (?:...) except that 3492 buffers in alternations share the same numbers */ 3493 paren = ':'; 3494 after_freeze = freeze_paren = RExC_logical_npar; 3495 3496 /* XXX This construct currently requires an extra pass. 3497 * Investigation would be required to see if that could be 3498 * changed */ 3499 REQUIRE_PARENS_PASS; 3500 break; 3501 case ':': /* (?:...) */ 3502 case '>': /* (?>...) */ 3503 break; 3504 case '$': /* (?$...) */ 3505 case '@': /* (?@...) */ 3506 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 3507 break; 3508 case '0' : /* (?0) */ 3509 case 'R' : /* (?R) */ 3510 if (RExC_parse == RExC_end || *RExC_parse != ')') 3511 FAIL("Sequence (?R) not terminated"); 3512 num = 0; 3513 RExC_seen |= REG_RECURSE_SEEN; 3514 3515 /* XXX These constructs currently require an extra pass. 3516 * It probably could be changed */ 3517 REQUIRE_PARENS_PASS; 3518 3519 *flagp |= POSTPONED; 3520 goto gen_recurse_regop; 3521 /*notreached*/ 3522 /* named and numeric backreferences */ 3523 case '&': /* (?&NAME) */ 3524 segment_parse_start = RExC_parse - 1; 3525 named_recursion: 3526 { 3527 SV *sv_dat = reg_scan_name(pRExC_state, 3528 REG_RSN_RETURN_DATA); 3529 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 3530 } 3531 if (RExC_parse >= RExC_end || *RExC_parse != ')') 3532 vFAIL("Sequence (?&... not terminated"); 3533 goto gen_recurse_regop; 3534 /* NOTREACHED */ 3535 case '+': 3536 if (! inRANGE(RExC_parse[0], '1', '9')) { 3537 RExC_parse_inc_by(1); 3538 vFAIL("Illegal pattern"); 3539 } 3540 goto parse_recursion; 3541 /* NOTREACHED*/ 3542 case '-': /* (?-1) */ 3543 if (! inRANGE(RExC_parse[0], '1', '9')) { 3544 RExC_parse--; /* rewind to let it be handled later */ 3545 goto parse_flags; 3546 } 3547 /* FALLTHROUGH */ 3548 case '1': case '2': case '3': case '4': /* (?1) */ 3549 case '5': case '6': case '7': case '8': case '9': 3550 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */ 3551 parse_recursion: 3552 { 3553 bool is_neg = FALSE; 3554 UV unum; 3555 segment_parse_start = RExC_parse - 1; 3556 if (*RExC_parse == '-') { 3557 RExC_parse_inc_by(1); 3558 is_neg = TRUE; 3559 } 3560 endptr = RExC_end; 3561 if (grok_atoUV(RExC_parse, &unum, &endptr) 3562 && unum <= I32_MAX 3563 ) { 3564 num = (I32)unum; 3565 RExC_parse_set((char*)endptr); 3566 } 3567 else { /* Overflow, or something like that. Position 3568 beyond all digits for the message */ 3569 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) { 3570 RExC_parse_inc_by(1); 3571 } 3572 vFAIL(impossible_group); 3573 } 3574 if (is_neg) { 3575 /* -num is always representable on 1 and 2's complement 3576 * machines */ 3577 num = -num; 3578 } 3579 } 3580 if (*RExC_parse!=')') 3581 vFAIL("Expecting close bracket"); 3582 3583 if (paren == '-' || paren == '+') { 3584 3585 /* Don't overflow */ 3586 if (UNLIKELY(I32_MAX - RExC_npar < num)) { 3587 RExC_parse_inc_by(1); 3588 vFAIL(impossible_group); 3589 } 3590 3591 /* 3592 Diagram of capture buffer numbering. 3593 Top line is the normal capture buffer numbers 3594 Bottom line is the negative indexing as from 3595 the X (the (?-2)) 3596 3597 1 2 3 4 5 X Y 6 7 3598 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/ 3599 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 3600 - 5 4 3 2 1 X Y x x 3601 3602 Resolve to absolute group. Recall that RExC_npar is +1 of 3603 the actual parenthesis group number. For lookahead, we 3604 have to compensate for that. Using the above example, when 3605 we get to Y in the parse, num is 2 and RExC_npar is 6. We 3606 want 7 for +2, and 4 for -2. 3607 */ 3608 if ( paren == '+' ) { 3609 num--; 3610 } 3611 3612 num += RExC_npar; 3613 3614 if (paren == '-' && num < 1) { 3615 RExC_parse_inc_by(1); 3616 vFAIL(non_existent_group_msg); 3617 } 3618 } 3619 else 3620 if (num && num < RExC_logical_npar) { 3621 num = RExC_logical_to_parno[num]; 3622 } 3623 else 3624 if (ALL_PARENS_COUNTED) { 3625 if (num < RExC_logical_total_parens) { 3626 num = RExC_logical_to_parno[num]; 3627 } 3628 else { 3629 RExC_parse_inc_by(1); 3630 vFAIL(non_existent_group_msg); 3631 } 3632 } 3633 else { 3634 REQUIRE_PARENS_PASS; 3635 } 3636 3637 3638 gen_recurse_regop: 3639 if (num >= RExC_npar) { 3640 3641 /* It might be a forward reference; we can't fail until we 3642 * know, by completing the parse to get all the groups, and 3643 * then reparsing */ 3644 if (ALL_PARENS_COUNTED) { 3645 if (num >= RExC_total_parens) { 3646 RExC_parse_inc_by(1); 3647 vFAIL(non_existent_group_msg); 3648 } 3649 } 3650 else { 3651 REQUIRE_PARENS_PASS; 3652 } 3653 } 3654 3655 /* We keep track how many GOSUB items we have produced. 3656 To start off the ARG2i() of the GOSUB holds its "id", 3657 which is used later in conjunction with RExC_recurse 3658 to calculate the offset we need to jump for the GOSUB, 3659 which it will store in the final representation. 3660 We have to defer the actual calculation until much later 3661 as the regop may move. 3662 */ 3663 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count); 3664 RExC_recurse_count++; 3665 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 3666 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n", 3667 22, "| |", (int)(depth * 2 + 1), "", 3668 (UV)ARG1u(REGNODE_p(ret)), 3669 (IV)ARG2i(REGNODE_p(ret)))); 3670 RExC_seen |= REG_RECURSE_SEEN; 3671 3672 *flagp |= POSTPONED; 3673 assert(*RExC_parse == ')'); 3674 nextchar(pRExC_state); 3675 return ret; 3676 3677 /* NOTREACHED */ 3678 3679 case '?': /* (??...) */ 3680 is_logical = 1; 3681 if (*RExC_parse != '{') { 3682 RExC_parse_inc_if_char(); 3683 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 3684 vFAIL2utf8f( 3685 "Sequence (%" UTF8f "...) not recognized", 3686 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 3687 NOT_REACHED; /*NOTREACHED*/ 3688 } 3689 *flagp |= POSTPONED; 3690 paren = '{'; 3691 RExC_parse_inc_by(1); 3692 /* FALLTHROUGH */ 3693 case '{': /* (?{...}) */ 3694 { 3695 U32 n = 0; 3696 struct reg_code_block *cb; 3697 OP * o; 3698 3699 RExC_seen_zerolen++; 3700 3701 if ( !pRExC_state->code_blocks 3702 || pRExC_state->code_index 3703 >= pRExC_state->code_blocks->count 3704 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start 3705 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) 3706 - RExC_start) 3707 ) { 3708 if (RExC_pm_flags & PMf_USE_RE_EVAL) 3709 FAIL("panic: Sequence (?{...}): no code block found\n"); 3710 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 3711 } 3712 /* this is a pre-compiled code block (?{...}) */ 3713 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index]; 3714 RExC_parse_set(RExC_start + cb->end); 3715 o = cb->block; 3716 if (cb->src_regex) { 3717 n = reg_add_data(pRExC_state, STR_WITH_LEN("rl")); 3718 RExC_rxi->data->data[n] = 3719 (void*)SvREFCNT_inc((SV*)cb->src_regex); 3720 RExC_rxi->data->data[n+1] = (void*)o; 3721 } 3722 else { 3723 n = reg_add_data(pRExC_state, 3724 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); 3725 RExC_rxi->data->data[n] = (void*)o; 3726 } 3727 pRExC_state->code_index++; 3728 nextchar(pRExC_state); 3729 if (!is_optimistic) 3730 RExC_seen |= REG_PESSIMIZE_SEEN; 3731 3732 if (is_logical) { 3733 regnode_offset eval; 3734 ret = reg_node(pRExC_state, LOGICAL); 3735 FLAGS(REGNODE_p(ret)) = 2; 3736 3737 eval = reg2node(pRExC_state, EVAL, 3738 n, 3739 3740 /* for later propagation into (??{}) 3741 * return value */ 3742 RExC_flags & RXf_PMf_COMPILETIME 3743 ); 3744 FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG; 3745 if (! REGTAIL(pRExC_state, ret, eval)) { 3746 REQUIRE_BRANCHJ(flagp, 0); 3747 } 3748 return ret; 3749 } 3750 ret = reg2node(pRExC_state, EVAL, n, 0); 3751 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG; 3752 3753 return ret; 3754 } 3755 case '(': /* (?(?{...})...) and (?(?=...)...) */ 3756 { 3757 int is_define= 0; 3758 const int DEFINE_len = sizeof("DEFINE") - 1; 3759 if ( RExC_parse < RExC_end - 1 3760 && ( ( RExC_parse[0] == '?' /* (?(?...)) */ 3761 && ( RExC_parse[1] == '=' 3762 || RExC_parse[1] == '!' 3763 || RExC_parse[1] == '<' 3764 || RExC_parse[1] == '{')) 3765 || ( RExC_parse[0] == '*' /* (?(*...)) */ 3766 && ( RExC_parse[1] == '{' 3767 || ( memBEGINs(RExC_parse + 1, 3768 (Size_t) (RExC_end - (RExC_parse + 1)), 3769 "pla:") 3770 || memBEGINs(RExC_parse + 1, 3771 (Size_t) (RExC_end - (RExC_parse + 1)), 3772 "plb:") 3773 || memBEGINs(RExC_parse + 1, 3774 (Size_t) (RExC_end - (RExC_parse + 1)), 3775 "nla:") 3776 || memBEGINs(RExC_parse + 1, 3777 (Size_t) (RExC_end - (RExC_parse + 1)), 3778 "nlb:") 3779 || memBEGINs(RExC_parse + 1, 3780 (Size_t) (RExC_end - (RExC_parse + 1)), 3781 "positive_lookahead:") 3782 || memBEGINs(RExC_parse + 1, 3783 (Size_t) (RExC_end - (RExC_parse + 1)), 3784 "positive_lookbehind:") 3785 || memBEGINs(RExC_parse + 1, 3786 (Size_t) (RExC_end - (RExC_parse + 1)), 3787 "negative_lookahead:") 3788 || memBEGINs(RExC_parse + 1, 3789 (Size_t) (RExC_end - (RExC_parse + 1)), 3790 "negative_lookbehind:"))))) 3791 ) { /* Lookahead or eval. */ 3792 I32 flag; 3793 regnode_offset tail; 3794 3795 ret = reg_node(pRExC_state, LOGICAL); 3796 FLAGS(REGNODE_p(ret)) = 1; 3797 3798 tail = reg(pRExC_state, 1, &flag, depth+1); 3799 RETURN_FAIL_ON_RESTART(flag, flagp); 3800 if (! REGTAIL(pRExC_state, ret, tail)) { 3801 REQUIRE_BRANCHJ(flagp, 0); 3802 } 3803 goto insert_if; 3804 } 3805 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 3806 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 3807 { 3808 char ch = RExC_parse[0] == '<' ? '>' : '\''; 3809 char *name_start= RExC_parse; 3810 RExC_parse_inc_by(1); 3811 U32 num = 0; 3812 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); 3813 if ( RExC_parse == name_start 3814 || RExC_parse >= RExC_end 3815 || *RExC_parse != ch) 3816 { 3817 vFAIL2("Sequence (?(%c... not terminated", 3818 (ch == '>' ? '<' : ch)); 3819 } 3820 RExC_parse_inc_by(1); 3821 if (sv_dat) { 3822 num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); 3823 RExC_rxi->data->data[num]=(void*)sv_dat; 3824 SvREFCNT_inc_simple_void_NN(sv_dat); 3825 } 3826 ret = reg1node(pRExC_state, GROUPPN, num); 3827 goto insert_if_check_paren; 3828 } 3829 else if (memBEGINs(RExC_parse, 3830 (STRLEN) (RExC_end - RExC_parse), 3831 "DEFINE")) 3832 { 3833 ret = reg1node(pRExC_state, DEFINEP, 0); 3834 RExC_parse_inc_by(DEFINE_len); 3835 is_define = 1; 3836 goto insert_if_check_paren; 3837 } 3838 else if (RExC_parse[0] == 'R') { 3839 RExC_parse_inc_by(1); 3840 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval" 3841 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)" 3842 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)" 3843 */ 3844 parno = 0; 3845 if (RExC_parse[0] == '0') { 3846 parno = 1; 3847 RExC_parse_inc_by(1); 3848 } 3849 else if (inRANGE(RExC_parse[0], '1', '9')) { 3850 UV uv; 3851 endptr = RExC_end; 3852 if (grok_atoUV(RExC_parse, &uv, &endptr) 3853 && uv <= I32_MAX 3854 ) { 3855 parno = (I32)uv + 1; 3856 RExC_parse_set((char*)endptr); 3857 } 3858 /* else "Switch condition not recognized" below */ 3859 } else if (RExC_parse[0] == '&') { 3860 SV *sv_dat; 3861 RExC_parse_inc_by(1); 3862 sv_dat = reg_scan_name(pRExC_state, 3863 REG_RSN_RETURN_DATA); 3864 if (sv_dat) 3865 parno = 1 + *((I32 *)SvPVX(sv_dat)); 3866 } 3867 ret = reg1node(pRExC_state, INSUBP, parno); 3868 goto insert_if_check_paren; 3869 } 3870 else if (inRANGE(RExC_parse[0], '1', '9')) { 3871 /* (?(1)...) */ 3872 char c; 3873 UV uv; 3874 endptr = RExC_end; 3875 if (grok_atoUV(RExC_parse, &uv, &endptr) 3876 && uv <= I32_MAX 3877 ) { 3878 parno = (I32)uv; 3879 RExC_parse_set((char*)endptr); 3880 } 3881 else { 3882 vFAIL("panic: grok_atoUV returned FALSE"); 3883 } 3884 ret = reg1node(pRExC_state, GROUPP, parno); 3885 3886 insert_if_check_paren: 3887 if (UCHARAT(RExC_parse) != ')') { 3888 RExC_parse_inc_safe(); 3889 vFAIL("Switch condition not recognized"); 3890 } 3891 nextchar(pRExC_state); 3892 insert_if: 3893 if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state, 3894 IFTHEN, 0))) 3895 { 3896 REQUIRE_BRANCHJ(flagp, 0); 3897 } 3898 br = regbranch(pRExC_state, &flags, 1, depth+1); 3899 if (br == 0) { 3900 RETURN_FAIL_ON_RESTART(flags,flagp); 3901 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, 3902 (UV) flags); 3903 } else 3904 if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state, 3905 LONGJMP, 0))) 3906 { 3907 REQUIRE_BRANCHJ(flagp, 0); 3908 } 3909 c = UCHARAT(RExC_parse); 3910 nextchar(pRExC_state); 3911 if (flags&HASWIDTH) 3912 *flagp |= HASWIDTH; 3913 if (c == '|') { 3914 if (is_define) 3915 vFAIL("(?(DEFINE)....) does not allow branches"); 3916 3917 /* Fake one for optimizer. */ 3918 lastbr = reg1node(pRExC_state, IFTHEN, 0); 3919 3920 if (!regbranch(pRExC_state, &flags, 1, depth+1)) { 3921 RETURN_FAIL_ON_RESTART(flags, flagp); 3922 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, 3923 (UV) flags); 3924 } 3925 if (! REGTAIL(pRExC_state, ret, lastbr)) { 3926 REQUIRE_BRANCHJ(flagp, 0); 3927 } 3928 if (flags&HASWIDTH) 3929 *flagp |= HASWIDTH; 3930 c = UCHARAT(RExC_parse); 3931 nextchar(pRExC_state); 3932 } 3933 else 3934 lastbr = 0; 3935 if (c != ')') { 3936 if (RExC_parse >= RExC_end) 3937 vFAIL("Switch (?(condition)... not terminated"); 3938 else 3939 vFAIL("Switch (?(condition)... contains too many branches"); 3940 } 3941 ender = reg_node(pRExC_state, TAIL); 3942 if (! REGTAIL(pRExC_state, br, ender)) { 3943 REQUIRE_BRANCHJ(flagp, 0); 3944 } 3945 if (lastbr) { 3946 if (! REGTAIL(pRExC_state, lastbr, ender)) { 3947 REQUIRE_BRANCHJ(flagp, 0); 3948 } 3949 if (! REGTAIL(pRExC_state, 3950 REGNODE_OFFSET( 3951 REGNODE_AFTER(REGNODE_p(lastbr))), 3952 ender)) 3953 { 3954 REQUIRE_BRANCHJ(flagp, 0); 3955 } 3956 } 3957 else 3958 if (! REGTAIL(pRExC_state, ret, ender)) { 3959 REQUIRE_BRANCHJ(flagp, 0); 3960 } 3961#if 0 /* Removing this doesn't cause failures in the test suite -- khw */ 3962 RExC_size++; /* XXX WHY do we need this?!! 3963 For large programs it seems to be required 3964 but I can't figure out why. -- dmq*/ 3965#endif 3966 return ret; 3967 } 3968 RExC_parse_inc_safe(); 3969 vFAIL("Unknown switch condition (?(...))"); 3970 } 3971 case '[': /* (?[ ... ]) */ 3972 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1); 3973 case 0: /* A NUL */ 3974 RExC_parse--; /* for vFAIL to print correctly */ 3975 vFAIL("Sequence (? incomplete"); 3976 break; 3977 3978 case ')': 3979 if (RExC_strict) { /* [perl #132851] */ 3980 ckWARNreg(RExC_parse, "Empty (?) without any modifiers"); 3981 } 3982 /* FALLTHROUGH */ 3983 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */ 3984 /* FALLTHROUGH */ 3985 default: /* e.g., (?i) */ 3986 RExC_parse_set((char *) seqstart + 1); 3987 parse_flags: 3988 parse_lparen_question_flags(pRExC_state); 3989 if (UCHARAT(RExC_parse) != ':') { 3990 if (RExC_parse < RExC_end) 3991 nextchar(pRExC_state); 3992 *flagp = TRYAGAIN; 3993 return 0; 3994 } 3995 paren = ':'; 3996 nextchar(pRExC_state); 3997 ret = 0; 3998 goto parse_rest; 3999 } /* end switch */ 4000 } 4001 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ 4002 capturing_parens: 4003 parno = RExC_npar; 4004 RExC_npar++; 4005 if (RExC_npar >= U16_MAX) 4006 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar); 4007 4008 logical_parno = RExC_logical_npar; 4009 RExC_logical_npar++; 4010 if (! ALL_PARENS_COUNTED) { 4011 /* If we are in our first pass through (and maybe only pass), 4012 * we need to allocate memory for the capturing parentheses 4013 * data structures. 4014 */ 4015 4016 if (!RExC_parens_buf_size) { 4017 /* first guess at number of parens we might encounter */ 4018 RExC_parens_buf_size = 10; 4019 4020 /* setup RExC_open_parens, which holds the address of each 4021 * OPEN tag, and to make things simpler for the 0 index the 4022 * start of the program - this is used later for offsets */ 4023 Newxz(RExC_open_parens, RExC_parens_buf_size, 4024 regnode_offset); 4025 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ 4026 4027 /* setup RExC_close_parens, which holds the address of each 4028 * CLOSE tag, and to make things simpler for the 0 index 4029 * the end of the program - this is used later for offsets 4030 * */ 4031 Newxz(RExC_close_parens, RExC_parens_buf_size, 4032 regnode_offset); 4033 /* we don't know where end op starts yet, so we don't need to 4034 * set RExC_close_parens[0] like we do RExC_open_parens[0] 4035 * above */ 4036 4037 Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32); 4038 Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32); 4039 } 4040 else if (RExC_npar > RExC_parens_buf_size) { 4041 I32 old_size = RExC_parens_buf_size; 4042 4043 RExC_parens_buf_size *= 2; 4044 4045 Renew(RExC_open_parens, RExC_parens_buf_size, 4046 regnode_offset); 4047 Zero(RExC_open_parens + old_size, 4048 RExC_parens_buf_size - old_size, regnode_offset); 4049 4050 Renew(RExC_close_parens, RExC_parens_buf_size, 4051 regnode_offset); 4052 Zero(RExC_close_parens + old_size, 4053 RExC_parens_buf_size - old_size, regnode_offset); 4054 4055 Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32); 4056 Zero(RExC_logical_to_parno + old_size, 4057 RExC_parens_buf_size - old_size, I32); 4058 4059 Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32); 4060 Zero(RExC_parno_to_logical + old_size, 4061 RExC_parens_buf_size - old_size, I32); 4062 } 4063 } 4064 4065 ret = reg1node(pRExC_state, OPEN, parno); 4066 if (!RExC_nestroot) 4067 RExC_nestroot = parno; 4068 if (RExC_open_parens && !RExC_open_parens[parno]) 4069 { 4070 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 4071 "%*s%*s Setting open paren #%" IVdf " to %zu\n", 4072 22, "| |", (int)(depth * 2 + 1), "", 4073 (IV)parno, ret)); 4074 RExC_open_parens[parno]= ret; 4075 } 4076 if (RExC_parno_to_logical) { 4077 RExC_parno_to_logical[parno] = logical_parno; 4078 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno]) 4079 RExC_logical_to_parno[logical_parno] = parno; 4080 } 4081 is_open = 1; 4082 } else { 4083 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ 4084 paren = ':'; 4085 ret = 0; 4086 } 4087 } 4088 else /* ! paren */ 4089 ret = 0; 4090 4091 parse_rest: 4092 /* Pick up the branches, linking them together. */ 4093 segment_parse_start = RExC_parse; 4094 I32 npar_before_regbranch = RExC_npar - 1; 4095 br = regbranch(pRExC_state, &flags, 1, depth+1); 4096 4097 /* branch_len = (paren != 0); */ 4098 4099 if (br == 0) { 4100 RETURN_FAIL_ON_RESTART(flags, flagp); 4101 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); 4102 } 4103 if (*RExC_parse == '|') { 4104 if (RExC_use_BRANCHJ) { 4105 reginsert(pRExC_state, BRANCHJ, br, depth+1); 4106 ARG2a_SET(REGNODE_p(br), npar_before_regbranch); 4107 ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1); 4108 } 4109 else { 4110 reginsert(pRExC_state, BRANCH, br, depth+1); 4111 ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch); 4112 ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1); 4113 } 4114 have_branch = 1; 4115 } 4116 else if (paren == ':') { 4117 *flagp |= flags&SIMPLE; 4118 } 4119 if (is_open) { /* Starts with OPEN. */ 4120 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */ 4121 REQUIRE_BRANCHJ(flagp, 0); 4122 } 4123 } 4124 else if (paren != '?') /* Not Conditional */ 4125 ret = br; 4126 *flagp |= flags & (HASWIDTH | POSTPONED); 4127 lastbr = br; 4128 while (*RExC_parse == '|') { 4129 if (RExC_use_BRANCHJ) { 4130 bool shut_gcc_up; 4131 4132 ender = reg1node(pRExC_state, LONGJMP, 0); 4133 4134 /* Append to the previous. */ 4135 shut_gcc_up = REGTAIL(pRExC_state, 4136 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))), 4137 ender); 4138 PERL_UNUSED_VAR(shut_gcc_up); 4139 } 4140 nextchar(pRExC_state); 4141 if (freeze_paren) { 4142 if (RExC_logical_npar > after_freeze) 4143 after_freeze = RExC_logical_npar; 4144 RExC_logical_npar = freeze_paren; 4145 } 4146 br = regbranch(pRExC_state, &flags, 0, depth+1); 4147 4148 if (br == 0) { 4149 RETURN_FAIL_ON_RESTART(flags, flagp); 4150 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); 4151 } 4152 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */ 4153 REQUIRE_BRANCHJ(flagp, 0); 4154 } 4155 assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ); 4156 assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ); 4157 if (OP(REGNODE_p(br)) == BRANCH) { 4158 if (OP(REGNODE_p(lastbr)) == BRANCH) 4159 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br))); 4160 else 4161 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br))); 4162 } 4163 else 4164 if (OP(REGNODE_p(br)) == BRANCHJ) { 4165 if (OP(REGNODE_p(lastbr)) == BRANCH) 4166 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br))); 4167 else 4168 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br))); 4169 } 4170 4171 lastbr = br; 4172 *flagp |= flags & (HASWIDTH | POSTPONED); 4173 } 4174 4175 if (have_branch || paren != ':') { 4176 regnode * br; 4177 4178 /* Make a closing node, and hook it on the end. */ 4179 switch (paren) { 4180 case ':': 4181 ender = reg_node(pRExC_state, TAIL); 4182 break; 4183 case 1: case 2: 4184 ender = reg1node(pRExC_state, CLOSE, parno); 4185 if ( RExC_close_parens ) { 4186 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 4187 "%*s%*s Setting close paren #%" IVdf " to %zu\n", 4188 22, "| |", (int)(depth * 2 + 1), "", 4189 (IV)parno, ender)); 4190 RExC_close_parens[parno]= ender; 4191 if (RExC_nestroot == parno) 4192 RExC_nestroot = 0; 4193 } 4194 break; 4195 case 's': 4196 ender = reg_node(pRExC_state, SRCLOSE); 4197 RExC_in_script_run = 0; 4198 break; 4199 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */ 4200 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */ 4201 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */ 4202 case '<': /* (?<= ... ) */ 4203 case ',': /* (?<! ... ) */ 4204 *flagp &= ~HASWIDTH; 4205 ender = reg_node(pRExC_state, LOOKBEHIND_END); 4206 break; 4207 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */ 4208 case 'a': 4209 case 'A': 4210 case '=': 4211 case '!': 4212 *flagp &= ~HASWIDTH; 4213 /* FALLTHROUGH */ 4214 case 't': /* aTomic */ 4215 case '>': 4216 ender = reg_node(pRExC_state, SUCCEED); 4217 break; 4218 case 0: 4219 ender = reg_node(pRExC_state, END); 4220 assert(!RExC_end_op); /* there can only be one! */ 4221 RExC_end_op = REGNODE_p(ender); 4222 if (RExC_close_parens) { 4223 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 4224 "%*s%*s Setting close paren #0 (END) to %zu\n", 4225 22, "| |", (int)(depth * 2 + 1), "", 4226 ender)); 4227 4228 RExC_close_parens[0]= ender; 4229 } 4230 break; 4231 } 4232 DEBUG_PARSE_r({ 4233 DEBUG_PARSE_MSG("lsbr"); 4234 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state); 4235 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state); 4236 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", 4237 SvPV_nolen_const(RExC_mysv1), 4238 (IV)lastbr, 4239 SvPV_nolen_const(RExC_mysv2), 4240 (IV)ender, 4241 (IV)(ender - lastbr) 4242 ); 4243 }); 4244 if (OP(REGNODE_p(lastbr)) == BRANCH) { 4245 ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1); 4246 } 4247 else 4248 if (OP(REGNODE_p(lastbr)) == BRANCHJ) { 4249 ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1); 4250 } 4251 4252 if (! REGTAIL(pRExC_state, lastbr, ender)) { 4253 REQUIRE_BRANCHJ(flagp, 0); 4254 } 4255 4256 if (have_branch) { 4257 char is_nothing= 1; 4258 if (depth==1) 4259 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 4260 4261 /* Hook the tails of the branches to the closing node. */ 4262 for (br = REGNODE_p(ret); br; br = regnext(br)) { 4263 const U8 op = REGNODE_TYPE(OP(br)); 4264 regnode *nextoper = REGNODE_AFTER(br); 4265 if (op == BRANCH) { 4266 if (! REGTAIL_STUDY(pRExC_state, 4267 REGNODE_OFFSET(nextoper), 4268 ender)) 4269 { 4270 REQUIRE_BRANCHJ(flagp, 0); 4271 } 4272 if ( OP(nextoper) != NOTHING 4273 || regnext(nextoper) != REGNODE_p(ender)) 4274 is_nothing= 0; 4275 } 4276 else if (op == BRANCHJ) { 4277 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state, 4278 REGNODE_OFFSET(nextoper), 4279 ender); 4280 PERL_UNUSED_VAR(shut_gcc_up); 4281 /* for now we always disable this optimisation * / 4282 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ); 4283 if ( OP(nopr) != NOTHING 4284 || regnext(nopr) != REGNODE_p(ender)) 4285 */ 4286 is_nothing= 0; 4287 } 4288 } 4289 if (is_nothing) { 4290 regnode * ret_as_regnode = REGNODE_p(ret); 4291 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH 4292 ? regnext(ret_as_regnode) 4293 : ret_as_regnode; 4294 DEBUG_PARSE_r({ 4295 DEBUG_PARSE_MSG("NADA"); 4296 regprop(RExC_rx, RExC_mysv1, ret_as_regnode, 4297 NULL, pRExC_state); 4298 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), 4299 NULL, pRExC_state); 4300 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", 4301 SvPV_nolen_const(RExC_mysv1), 4302 (IV)REG_NODE_NUM(ret_as_regnode), 4303 SvPV_nolen_const(RExC_mysv2), 4304 (IV)ender, 4305 (IV)(ender - ret) 4306 ); 4307 }); 4308 OP(br)= NOTHING; 4309 if (OP(REGNODE_p(ender)) == TAIL) { 4310 NEXT_OFF(br)= 0; 4311 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE; 4312 } else { 4313 regnode *opt; 4314 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ ) 4315 OP(opt)= OPTIMIZED; 4316 NEXT_OFF(br)= REGNODE_p(ender) - br; 4317 } 4318 } 4319 } 4320 } 4321 4322 { 4323 const char *p; 4324 /* Even/odd or x=don't care: 010101x10x */ 4325 static const char parens[] = "=!aA<,>Bbt"; 4326 /* flag below is set to 0 up through 'A'; 1 for larger */ 4327 4328 if (paren && (p = strchr(parens, paren))) { 4329 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 4330 int flag = (p - parens) > 3; 4331 4332 if (paren == '>' || paren == 't') { 4333 node = SUSPEND, flag = 0; 4334 } 4335 4336 reginsert(pRExC_state, node, ret, depth+1); 4337 FLAGS(REGNODE_p(ret)) = flag; 4338 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL))) 4339 { 4340 REQUIRE_BRANCHJ(flagp, 0); 4341 } 4342 } 4343 } 4344 4345 /* Check for proper termination. */ 4346 if (paren) { 4347 /* restore original flags, but keep (?p) and, if we've encountered 4348 * something in the parse that changes /d rules into /u, keep the /u */ 4349 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); 4350 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) { 4351 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 4352 } 4353 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { 4354 RExC_parse_set(reg_parse_start); 4355 vFAIL("Unmatched ("); 4356 } 4357 nextchar(pRExC_state); 4358 } 4359 else if (!paren && RExC_parse < RExC_end) { 4360 if (*RExC_parse == ')') { 4361 RExC_parse_inc_by(1); 4362 vFAIL("Unmatched )"); 4363 } 4364 else 4365 FAIL("Junk on end of regexp"); /* "Can't happen". */ 4366 NOT_REACHED; /* NOTREACHED */ 4367 } 4368 4369 if (after_freeze > RExC_logical_npar) 4370 RExC_logical_npar = after_freeze; 4371 4372 RExC_in_lookaround = was_in_lookaround; 4373 4374 return(ret); 4375} 4376 4377/* 4378 - regbranch - one alternative of an | operator 4379 * 4380 * Implements the concatenation operator. 4381 * 4382 * On success, returns the offset at which any next node should be placed into 4383 * the regex engine program being compiled. 4384 * 4385 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs 4386 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to 4387 * UTF-8 4388 */ 4389STATIC regnode_offset 4390S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 4391{ 4392 regnode_offset ret; 4393 regnode_offset chain = 0; 4394 regnode_offset latest; 4395 regnode *branch_node = NULL; 4396 I32 flags = 0, c = 0; 4397 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4398 4399 PERL_ARGS_ASSERT_REGBRANCH; 4400 4401 DEBUG_PARSE("brnc"); 4402 4403 if (first) 4404 ret = 0; 4405 else { 4406 if (RExC_use_BRANCHJ) { 4407 ret = reg2node(pRExC_state, BRANCHJ, 0, 0); 4408 branch_node = REGNODE_p(ret); 4409 ARG2a_SET(branch_node, (U16)RExC_npar-1); 4410 } else { 4411 ret = reg1node(pRExC_state, BRANCH, 0); 4412 branch_node = REGNODE_p(ret); 4413 ARG1a_SET(branch_node, (U16)RExC_npar-1); 4414 } 4415 } 4416 4417 *flagp = 0; /* Initialize. */ 4418 4419 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 4420 FALSE /* Don't force to /x */ ); 4421 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 4422 flags &= ~TRYAGAIN; 4423 latest = regpiece(pRExC_state, &flags, depth+1); 4424 if (latest == 0) { 4425 if (flags & TRYAGAIN) 4426 continue; 4427 RETURN_FAIL_ON_RESTART(flags, flagp); 4428 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags); 4429 } 4430 else if (ret == 0) 4431 ret = latest; 4432 *flagp |= flags&(HASWIDTH|POSTPONED); 4433 if (chain != 0) { 4434 /* FIXME adding one for every branch after the first is probably 4435 * excessive now we have TRIE support. (hv) */ 4436 MARK_NAUGHTY(1); 4437 if (! REGTAIL(pRExC_state, chain, latest)) { 4438 /* XXX We could just redo this branch, but figuring out what 4439 * bookkeeping needs to be reset is a pain, and it's likely 4440 * that other branches that goto END will also be too large */ 4441 REQUIRE_BRANCHJ(flagp, 0); 4442 } 4443 } 4444 chain = latest; 4445 c++; 4446 } 4447 if (chain == 0) { /* Loop ran zero times. */ 4448 chain = reg_node(pRExC_state, NOTHING); 4449 if (ret == 0) 4450 ret = chain; 4451 } 4452 if (c == 1) { 4453 *flagp |= flags & SIMPLE; 4454 } 4455 return ret; 4456} 4457 4458#define RBRACE 0 4459#define MIN_S 1 4460#define MIN_E 2 4461#define MAX_S 3 4462#define MAX_E 4 4463 4464#ifndef PERL_IN_XSUB_RE 4465bool 4466Perl_regcurly(const char *s, const char *e, const char * result[5]) 4467{ 4468 /* This function matches a {m,n} quantifier. When called with a NULL final 4469 * argument, it simply parses the input from 's' up through 'e-1', and 4470 * returns a boolean as to whether or not this input is syntactically a 4471 * {m,n} quantifier. 4472 * 4473 * When called with a non-NULL final parameter, and when the function 4474 * returns TRUE, it additionally stores information into the array 4475 * specified by that parameter about what it found in the parse. The 4476 * parameter must be a pointer into a 5 element array of 'const char *' 4477 * elements. The returned information is as follows: 4478 * result[RBRACE] points to the closing brace 4479 * result[MIN_S] points to the first byte of the lower bound 4480 * result[MIN_E] points to one beyond the final byte of the lower bound 4481 * result[MAX_S] points to the first byte of the upper bound 4482 * result[MAX_E] points to one beyond the final byte of the upper bound 4483 * 4484 * If the quantifier is of the form {m,} (meaning an infinite upper 4485 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point 4486 * to is irrelevant, just that it's the same place 4487 * 4488 * If instead the quantifier is of the form {m} there is actually only 4489 * one bound, and both the upper and lower result[] elements are set to 4490 * point to it. 4491 * 4492 * This function checks only for syntactic validity; it leaves checking for 4493 * semantic validity and raising any diagnostics to the caller. This 4494 * function is called in multiple places to check for syntax, but only from 4495 * one for semantics. It makes it as simple as possible for the 4496 * syntax-only callers, while furnishing just enough information for the 4497 * semantic caller. 4498 */ 4499 4500 const char * min_start = NULL; 4501 const char * max_start = NULL; 4502 const char * min_end = NULL; 4503 const char * max_end = NULL; 4504 4505 bool has_comma = FALSE; 4506 4507 PERL_ARGS_ASSERT_REGCURLY; 4508 4509 if (s >= e || *s++ != '{') 4510 return FALSE; 4511 4512 while (s < e && isBLANK(*s)) { 4513 s++; 4514 } 4515 4516 if isDIGIT(*s) { 4517 min_start = s; 4518 do { 4519 s++; 4520 } while (s < e && isDIGIT(*s)); 4521 min_end = s; 4522 } 4523 4524 while (s < e && isBLANK(*s)) { 4525 s++; 4526 } 4527 4528 if (*s == ',') { 4529 has_comma = TRUE; 4530 s++; 4531 4532 while (s < e && isBLANK(*s)) { 4533 s++; 4534 } 4535 4536 if isDIGIT(*s) { 4537 max_start = s; 4538 do { 4539 s++; 4540 } while (s < e && isDIGIT(*s)); 4541 max_end = s; 4542 } 4543 } 4544 4545 while (s < e && isBLANK(*s)) { 4546 s++; 4547 } 4548 /* Need at least one number */ 4549 if (s >= e || *s != '}' || (! min_start && ! max_end)) { 4550 return FALSE; 4551 } 4552 4553 if (result) { 4554 4555 result[RBRACE] = s; 4556 4557 result[MIN_S] = min_start; 4558 result[MIN_E] = min_end; 4559 if (has_comma) { 4560 if (max_start) { 4561 result[MAX_S] = max_start; 4562 result[MAX_E] = max_end; 4563 } 4564 else { 4565 /* Having no value after the comma is signalled by setting 4566 * start and end to the same value. What that value is isn't 4567 * relevant; NULL is chosen simply because it will fail if the 4568 * caller mistakenly uses it */ 4569 result[MAX_S] = result[MAX_E] = NULL; 4570 } 4571 } 4572 else { /* No comma means lower and upper bounds are the same */ 4573 result[MAX_S] = min_start; 4574 result[MAX_E] = min_end; 4575 } 4576 } 4577 4578 return TRUE; 4579} 4580#endif 4581 4582U32 4583S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state, 4584 const char * start, const char * end) 4585{ 4586 /* This is a helper function for regpiece() to compute, given the 4587 * quantifier {m,n}, the value of either m or n, based on the starting 4588 * position 'start' in the string, through the byte 'end-1', returning it 4589 * if valid, and failing appropriately if not. It knows the restrictions 4590 * imposed on quantifier values */ 4591 4592 UV uv; 4593 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX); 4594 4595 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE; 4596 4597 if (grok_atoUV(start, &uv, &end)) { 4598 if (uv < REG_INFTY) { /* A valid, small-enough number */ 4599 return (U32) uv; 4600 } 4601 } 4602 else if (*start == '0') { /* grok_atoUV() fails for only two reasons: 4603 leading zeros or overflow */ 4604 RExC_parse_set((char * ) end); 4605 4606 /* Perhaps too generic a msg for what is only failure from having 4607 * leading zeros, but this is how it's always behaved. */ 4608 vFAIL("Invalid quantifier in {,}"); 4609 NOT_REACHED; /*NOTREACHED*/ 4610 } 4611 4612 /* Here, found a quantifier, but was too large; either it overflowed or was 4613 * too big a legal number */ 4614 RExC_parse_set((char * ) end); 4615 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 4616 4617 NOT_REACHED; /*NOTREACHED*/ 4618 return U32_MAX; /* Perhaps some compilers will be expecting a return */ 4619} 4620 4621/* 4622 - regpiece - something followed by possible quantifier * + ? {n,m} 4623 * 4624 * Note that the branching code sequences used for ? and the general cases 4625 * of * and + are somewhat optimized: they use the same NOTHING node as 4626 * both the endmarker for their branch list and the body of the last branch. 4627 * It might seem that this node could be dispensed with entirely, but the 4628 * endmarker role is not redundant. 4629 * 4630 * On success, returns the offset at which any next node should be placed into 4631 * the regex engine program being compiled. 4632 * 4633 * Returns 0 otherwise, with *flagp set to indicate why: 4634 * TRYAGAIN if regatom() returns 0 with TRYAGAIN. 4635 * RESTART_PARSE if the parse needs to be restarted, or'd with 4636 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8. 4637 */ 4638STATIC regnode_offset 4639S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 4640{ 4641 regnode_offset ret; 4642 char op; 4643 I32 flags; 4644 const char * const origparse = RExC_parse; 4645 I32 min; 4646 I32 max = REG_INFTY; 4647 I32 npar_before = RExC_npar-1; 4648 4649 /* Save the original in case we change the emitted regop to a FAIL. */ 4650 const regnode_offset orig_emit = RExC_emit; 4651 4652 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4653 4654 PERL_ARGS_ASSERT_REGPIECE; 4655 4656 DEBUG_PARSE("piec"); 4657 4658 ret = regatom(pRExC_state, &flags, depth+1); 4659 if (ret == 0) { 4660 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN); 4661 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); 4662 } 4663 I32 npar_after = RExC_npar-1; 4664 4665 op = *RExC_parse; 4666 switch (op) { 4667 const char * regcurly_return[5]; 4668 4669 case '*': 4670 nextchar(pRExC_state); 4671 min = 0; 4672 break; 4673 4674 case '+': 4675 nextchar(pRExC_state); 4676 min = 1; 4677 break; 4678 4679 case '?': 4680 nextchar(pRExC_state); 4681 min = 0; max = 1; 4682 break; 4683 4684 case '{': /* A '{' may or may not indicate a quantifier; call regcurly() 4685 to determine which */ 4686 if (regcurly(RExC_parse, RExC_end, regcurly_return)) { 4687 const char * min_start = regcurly_return[MIN_S]; 4688 const char * min_end = regcurly_return[MIN_E]; 4689 const char * max_start = regcurly_return[MAX_S]; 4690 const char * max_end = regcurly_return[MAX_E]; 4691 4692 if (min_start) { 4693 min = get_quantifier_value(pRExC_state, min_start, min_end); 4694 } 4695 else { 4696 min = 0; 4697 } 4698 4699 if (max_start == max_end) { /* Was of the form {m,} */ 4700 max = REG_INFTY; 4701 } 4702 else if (max_start == min_start) { /* Was of the form {m} */ 4703 max = min; 4704 } 4705 else { /* Was of the form {m,n} */ 4706 assert(max_end >= max_start); 4707 4708 max = get_quantifier_value(pRExC_state, max_start, max_end); 4709 } 4710 4711 RExC_parse_set((char *) regcurly_return[RBRACE]); 4712 nextchar(pRExC_state); 4713 4714 if (max < min) { /* If can't match, warn and optimize to fail 4715 unconditionally */ 4716 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); 4717 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); 4718 NEXT_OFF(REGNODE_p(orig_emit)) = 4719 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE; 4720 return ret; 4721 } 4722 else if (min == max && *RExC_parse == '?') { 4723 ckWARN2reg(RExC_parse + 1, 4724 "Useless use of greediness modifier '%c'", 4725 *RExC_parse); 4726 } 4727 4728 break; 4729 } /* End of is {m,n} */ 4730 4731 /* Here was a '{', but what followed it didn't form a quantifier. */ 4732 /* FALLTHROUGH */ 4733 4734 default: 4735 *flagp = flags; 4736 return(ret); 4737 NOT_REACHED; /*NOTREACHED*/ 4738 } 4739 4740 /* Here we have a quantifier, and have calculated 'min' and 'max'. 4741 * 4742 * Check and possibly adjust a zero width operand */ 4743 if (! (flags & (HASWIDTH|POSTPONED))) { 4744 if (max > REG_INFTY/3) { 4745 ckWARN2reg(RExC_parse, 4746 "%" UTF8f " matches null string many times", 4747 UTF8fARG(UTF, (RExC_parse >= origparse 4748 ? RExC_parse - origparse 4749 : 0), 4750 origparse)); 4751 } 4752 4753 /* There's no point in trying to match something 0 length more than 4754 * once except for extra side effects, which we don't have here since 4755 * not POSTPONED */ 4756 if (max > 1) { 4757 max = 1; 4758 if (min > max) { 4759 min = max; 4760 } 4761 } 4762 } 4763 4764 /* If this is a code block pass it up */ 4765 *flagp |= (flags & POSTPONED); 4766 4767 if (max > 0) { 4768 *flagp |= (flags & HASWIDTH); 4769 if (max == REG_INFTY) 4770 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 4771 } 4772 4773 /* 'SIMPLE' operands don't require full generality */ 4774 if ((flags&SIMPLE)) { 4775 if (max == REG_INFTY) { 4776 if (min == 0) { 4777 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) { 4778 goto min0_maxINF_wildcard_forbidden; 4779 } 4780 4781 reginsert(pRExC_state, STAR, ret, depth+1); 4782 MARK_NAUGHTY(4); 4783 goto done_main_op; 4784 } 4785 else if (min == 1) { 4786 reginsert(pRExC_state, PLUS, ret, depth+1); 4787 MARK_NAUGHTY(3); 4788 goto done_main_op; 4789 } 4790 } 4791 4792 /* Here, SIMPLE, but not the '*' and '+' special cases */ 4793 4794 MARK_NAUGHTY_EXP(2, 2); 4795 reginsert(pRExC_state, CURLY, ret, depth+1); 4796 } 4797 else { /* not SIMPLE */ 4798 const regnode_offset w = reg_node(pRExC_state, WHILEM); 4799 4800 FLAGS(REGNODE_p(w)) = 0; 4801 if (! REGTAIL(pRExC_state, ret, w)) { 4802 REQUIRE_BRANCHJ(flagp, 0); 4803 } 4804 if (RExC_use_BRANCHJ) { 4805 reginsert(pRExC_state, LONGJMP, ret, depth+1); 4806 reginsert(pRExC_state, NOTHING, ret, depth+1); 4807 REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP); 4808 } 4809 reginsert(pRExC_state, CURLYX, ret, depth+1); 4810 if (RExC_use_BRANCHJ) 4811 /* Go over NOTHING to LONGJMP. */ 4812 REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING); 4813 4814 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state, 4815 NOTHING))) 4816 { 4817 REQUIRE_BRANCHJ(flagp, 0); 4818 } 4819 RExC_whilem_seen++; 4820 MARK_NAUGHTY_EXP(1, 4); /* compound interest */ 4821 } 4822 4823 /* Finish up the CURLY/CURLYX case */ 4824 FLAGS(REGNODE_p(ret)) = 0; 4825 4826 ARG1i_SET(REGNODE_p(ret), min); 4827 ARG2i_SET(REGNODE_p(ret), max); 4828 4829 /* if we had a npar_after then we need to increment npar_before, 4830 * we want to track the range of parens we need to reset each iteration 4831 */ 4832 if (npar_after!=npar_before) { 4833 ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1); 4834 ARG3b_SET(REGNODE_p(ret), (U16)npar_after); 4835 } else { 4836 ARG3a_SET(REGNODE_p(ret), 0); 4837 ARG3b_SET(REGNODE_p(ret), 0); 4838 } 4839 4840 done_main_op: 4841 4842 /* Process any greediness modifiers */ 4843 if (*RExC_parse == '?') { 4844 nextchar(pRExC_state); 4845 reginsert(pRExC_state, MINMOD, ret, depth+1); 4846 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) { 4847 REQUIRE_BRANCHJ(flagp, 0); 4848 } 4849 } 4850 else if (*RExC_parse == '+') { 4851 regnode_offset ender; 4852 nextchar(pRExC_state); 4853 ender = reg_node(pRExC_state, SUCCEED); 4854 if (! REGTAIL(pRExC_state, ret, ender)) { 4855 REQUIRE_BRANCHJ(flagp, 0); 4856 } 4857 reginsert(pRExC_state, SUSPEND, ret, depth+1); 4858 ender = reg_node(pRExC_state, TAIL); 4859 if (! REGTAIL(pRExC_state, ret, ender)) { 4860 REQUIRE_BRANCHJ(flagp, 0); 4861 } 4862 } 4863 4864 /* Forbid extra quantifiers */ 4865 if (isQUANTIFIER(RExC_parse, RExC_end)) { 4866 RExC_parse_inc_by(1); 4867 vFAIL("Nested quantifiers"); 4868 } 4869 4870 return(ret); 4871 4872 min0_maxINF_wildcard_forbidden: 4873 4874 /* Here we are in a wildcard match, and the minimum match length is 0, and 4875 * the max could be infinity. This is currently forbidden. The only 4876 * reason is to make it harder to write patterns that take a long long time 4877 * to halt, and because the use of this construct isn't necessary in 4878 * matching Unicode property values */ 4879 RExC_parse_inc_by(1); 4880 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard 4881 subpatterns in regex; marked by <-- HERE in m/%s/ 4882 */ 4883 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard" 4884 " subpatterns"); 4885 4886 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't 4887 * legal at all in wildcards, so can't get this far */ 4888 4889 NOT_REACHED; /*NOTREACHED*/ 4890} 4891 4892STATIC bool 4893S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, 4894 regnode_offset * node_p, 4895 UV * code_point_p, 4896 int * cp_count, 4897 I32 * flagp, 4898 const bool strict, 4899 const U32 depth 4900 ) 4901{ 4902 /* This routine teases apart the various meanings of \N and returns 4903 * accordingly. The input parameters constrain which meaning(s) is/are valid 4904 * in the current context. 4905 * 4906 * Exactly one of <node_p> and <code_point_p> must be non-NULL. 4907 * 4908 * If <code_point_p> is not NULL, the context is expecting the result to be a 4909 * single code point. If this \N instance turns out to a single code point, 4910 * the function returns TRUE and sets *code_point_p to that code point. 4911 * 4912 * If <node_p> is not NULL, the context is expecting the result to be one of 4913 * the things representable by a regnode. If this \N instance turns out to be 4914 * one such, the function generates the regnode, returns TRUE and sets *node_p 4915 * to point to the offset of that regnode into the regex engine program being 4916 * compiled. 4917 * 4918 * If this instance of \N isn't legal in any context, this function will 4919 * generate a fatal error and not return. 4920 * 4921 * On input, RExC_parse should point to the first char following the \N at the 4922 * time of the call. On successful return, RExC_parse will have been updated 4923 * to point to just after the sequence identified by this routine. Also 4924 * *flagp has been updated as needed. 4925 * 4926 * When there is some problem with the current context and this \N instance, 4927 * the function returns FALSE, without advancing RExC_parse, nor setting 4928 * *node_p, nor *code_point_p, nor *flagp. 4929 * 4930 * If <cp_count> is not NULL, the caller wants to know the length (in code 4931 * points) that this \N sequence matches. This is set, and the input is 4932 * parsed for errors, even if the function returns FALSE, as detailed below. 4933 * 4934 * There are 6 possibilities here, as detailed in the next 6 paragraphs. 4935 * 4936 * Probably the most common case is for the \N to specify a single code point. 4937 * *cp_count will be set to 1, and *code_point_p will be set to that code 4938 * point. 4939 * 4940 * Another possibility is for the input to be an empty \N{}. This is no 4941 * longer accepted, and will generate a fatal error. 4942 * 4943 * Another possibility is for a custom charnames handler to be in effect which 4944 * translates the input name to an empty string. *cp_count will be set to 0. 4945 * *node_p will be set to a generated NOTHING node. 4946 * 4947 * Still another possibility is for the \N to mean [^\n]. *cp_count will be 4948 * set to 0. *node_p will be set to a generated REG_ANY node. 4949 * 4950 * The fifth possibility is that \N resolves to a sequence of more than one 4951 * code points. *cp_count will be set to the number of code points in the 4952 * sequence. *node_p will be set to a generated node returned by this 4953 * function calling S_reg(). 4954 * 4955 * The sixth and final possibility is that it is premature to be calling this 4956 * function; the parse needs to be restarted. This can happen when this 4957 * changes from /d to /u rules, or when the pattern needs to be upgraded to 4958 * UTF-8. The latter occurs only when the fifth possibility would otherwise 4959 * be in effect, and is because one of those code points requires the pattern 4960 * to be recompiled as UTF-8. The function returns FALSE, and sets the 4961 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this 4962 * happens, the caller needs to desist from continuing parsing, and return 4963 * this information to its caller. This is not set for when there is only one 4964 * code point, as this can be called as part of an ANYOF node, and they can 4965 * store above-Latin1 code points without the pattern having to be in UTF-8. 4966 * 4967 * For non-single-quoted regexes, the tokenizer has resolved character and 4968 * sequence names inside \N{...} into their Unicode values, normalizing the 4969 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the 4970 * hex-represented code points in the sequence. This is done there because 4971 * the names can vary based on what charnames pragma is in scope at the time, 4972 * so we need a way to take a snapshot of what they resolve to at the time of 4973 * the original parse. [perl #56444]. 4974 * 4975 * That parsing is skipped for single-quoted regexes, so here we may get 4976 * '\N{NAME}', which is parsed now. If the single-quoted regex is something 4977 * like '\N{U+41}', that code point is Unicode, and has to be translated into 4978 * the native character set for non-ASCII platforms. The other possibilities 4979 * are already native, so no translation is done. */ 4980 4981 char * endbrace; /* points to '}' following the name */ 4982 char * e; /* points to final non-blank before endbrace */ 4983 char* p = RExC_parse; /* Temporary */ 4984 4985 SV * substitute_parse = NULL; 4986 char *orig_end; 4987 char *save_start; 4988 I32 flags; 4989 4990 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4991 4992 PERL_ARGS_ASSERT_GROK_BSLASH_N; 4993 4994 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */ 4995 assert(! (node_p && cp_count)); /* At most 1 should be set */ 4996 4997 if (cp_count) { /* Initialize return for the most common case */ 4998 *cp_count = 1; 4999 } 5000 5001 /* The [^\n] meaning of \N ignores spaces and comments under the /x 5002 * modifier. The other meanings do not (except blanks adjacent to and 5003 * within the braces), so use a temporary until we find out which we are 5004 * being called with */ 5005 skip_to_be_ignored_text(pRExC_state, &p, 5006 FALSE /* Don't force to /x */ ); 5007 5008 /* Disambiguate between \N meaning a named character versus \N meaning 5009 * [^\n]. The latter is assumed when the {...} following the \N is a legal 5010 * quantifier, or if there is no '{' at all */ 5011 if (*p != '{' || regcurly(p, RExC_end, NULL)) { 5012 RExC_parse_set(p); 5013 if (cp_count) { 5014 *cp_count = -1; 5015 } 5016 5017 if (! node_p) { 5018 return FALSE; 5019 } 5020 5021 *node_p = reg_node(pRExC_state, REG_ANY); 5022 *flagp |= HASWIDTH|SIMPLE; 5023 MARK_NAUGHTY(1); 5024 return TRUE; 5025 } 5026 5027 /* The test above made sure that the next real character is a '{', but 5028 * under the /x modifier, it could be separated by space (or a comment and 5029 * \n) and this is not allowed (for consistency with \x{...} and the 5030 * tokenizer handling of \N{NAME}). */ 5031 if (*RExC_parse != '{') { 5032 vFAIL("Missing braces on \\N{}"); 5033 } 5034 5035 RExC_parse_inc_by(1); /* Skip past the '{' */ 5036 5037 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); 5038 if (! endbrace) { /* no trailing brace */ 5039 vFAIL2("Missing right brace on \\%c{}", 'N'); 5040 } 5041 5042 /* Here, we have decided it should be a named character or sequence. These 5043 * imply Unicode semantics */ 5044 REQUIRE_UNI_RULES(flagp, FALSE); 5045 5046 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to 5047 * nothing at all (not allowed under strict) */ 5048 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') { 5049 RExC_parse_set(endbrace); 5050 if (strict) { 5051 RExC_parse_inc_by(1); /* Position after the "}" */ 5052 vFAIL("Zero length \\N{}"); 5053 } 5054 5055 if (cp_count) { 5056 *cp_count = 0; 5057 } 5058 nextchar(pRExC_state); 5059 if (! node_p) { 5060 return FALSE; 5061 } 5062 5063 *node_p = reg_node(pRExC_state, NOTHING); 5064 return TRUE; 5065 } 5066 5067 while (isBLANK(*RExC_parse)) { 5068 RExC_parse_inc_by(1); 5069 } 5070 5071 e = endbrace; 5072 while (RExC_parse < e && isBLANK(*(e-1))) { 5073 e--; 5074 } 5075 5076 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) { 5077 5078 /* Here, the name isn't of the form U+.... This can happen if the 5079 * pattern is single-quoted, so didn't get evaluated in toke.c. Now 5080 * is the time to find out what the name means */ 5081 5082 const STRLEN name_len = e - RExC_parse; 5083 SV * value_sv; /* What does this name evaluate to */ 5084 SV ** value_svp; 5085 const U8 * value; /* string of name's value */ 5086 STRLEN value_len; /* and its length */ 5087 5088 /* RExC_unlexed_names is a hash of names that weren't evaluated by 5089 * toke.c, and their values. Make sure is initialized */ 5090 if (! RExC_unlexed_names) { 5091 RExC_unlexed_names = newHV(); 5092 } 5093 5094 /* If we have already seen this name in this pattern, use that. This 5095 * allows us to only call the charnames handler once per name per 5096 * pattern. A broken or malicious handler could return something 5097 * different each time, which could cause the results to vary depending 5098 * on if something gets added or subtracted from the pattern that 5099 * causes the number of passes to change, for example */ 5100 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse, 5101 name_len, 0))) 5102 { 5103 value_sv = *value_svp; 5104 } 5105 else { /* Otherwise we have to go out and get the name */ 5106 const char * error_msg = NULL; 5107 value_sv = get_and_check_backslash_N_name(RExC_parse, e, 5108 UTF, 5109 &error_msg); 5110 if (error_msg) { 5111 RExC_parse_set(endbrace); 5112 vFAIL(error_msg); 5113 } 5114 5115 /* If no error message, should have gotten a valid return */ 5116 assert (value_sv); 5117 5118 /* Save the name's meaning for later use */ 5119 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len, 5120 value_sv, 0)) 5121 { 5122 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 5123 } 5124 } 5125 5126 /* Here, we have the value the name evaluates to in 'value_sv' */ 5127 value = (U8 *) SvPV(value_sv, value_len); 5128 5129 /* See if the result is one code point vs 0 or multiple */ 5130 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv) 5131 ? UTF8SKIP(value) 5132 : 1))) 5133 { 5134 /* Here, exactly one code point. If that isn't what is wanted, 5135 * fail */ 5136 if (! code_point_p) { 5137 RExC_parse_set(p); 5138 return FALSE; 5139 } 5140 5141 /* Convert from string to numeric code point */ 5142 *code_point_p = (SvUTF8(value_sv)) 5143 ? valid_utf8_to_uvchr(value, NULL) 5144 : *value; 5145 5146 /* Have parsed this entire single code point \N{...}. *cp_count 5147 * has already been set to 1, so don't do it again. */ 5148 RExC_parse_set(endbrace); 5149 nextchar(pRExC_state); 5150 return TRUE; 5151 } /* End of is a single code point */ 5152 5153 /* Count the code points, if caller desires. The API says to do this 5154 * even if we will later return FALSE */ 5155 if (cp_count) { 5156 *cp_count = 0; 5157 5158 *cp_count = (SvUTF8(value_sv)) 5159 ? utf8_length(value, value + value_len) 5160 : value_len; 5161 } 5162 5163 /* Fail if caller doesn't want to handle a multi-code-point sequence. 5164 * But don't back the pointer up if the caller wants to know how many 5165 * code points there are (they need to handle it themselves in this 5166 * case). */ 5167 if (! node_p) { 5168 if (! cp_count) { 5169 RExC_parse_set(p); 5170 } 5171 return FALSE; 5172 } 5173 5174 /* Convert this to a sub-pattern of the form "(?: ... )", and then call 5175 * reg recursively to parse it. That way, it retains its atomicness, 5176 * while not having to worry about any special handling that some code 5177 * points may have. */ 5178 5179 substitute_parse = newSVpvs("?:"); 5180 sv_catsv(substitute_parse, value_sv); 5181 sv_catpv(substitute_parse, ")"); 5182 5183 /* The value should already be native, so no need to convert on EBCDIC 5184 * platforms.*/ 5185 assert(! RExC_recode_x_to_native); 5186 5187 } 5188 else { /* \N{U+...} */ 5189 Size_t count = 0; /* code point count kept internally */ 5190 5191 /* We can get to here when the input is \N{U+...} or when toke.c has 5192 * converted a name to the \N{U+...} form. This include changing a 5193 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */ 5194 5195 RExC_parse_inc_by(2); /* Skip past the 'U+' */ 5196 5197 /* Code points are separated by dots. The '}' terminates the whole 5198 * thing. */ 5199 5200 do { /* Loop until the ending brace */ 5201 I32 flags = PERL_SCAN_SILENT_OVERFLOW 5202 | PERL_SCAN_SILENT_ILLDIGIT 5203 | PERL_SCAN_NOTIFY_ILLDIGIT 5204 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES 5205 | PERL_SCAN_DISALLOW_PREFIX; 5206 STRLEN len = e - RExC_parse; 5207 NV overflow_value; 5208 char * start_digit = RExC_parse; 5209 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value); 5210 5211 if (len == 0) { 5212 RExC_parse_inc_by(1); 5213 bad_NU: 5214 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 5215 } 5216 5217 RExC_parse_inc_by(len); 5218 5219 if (cp > MAX_LEGAL_CP) { 5220 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0)); 5221 } 5222 5223 if (RExC_parse >= e) { /* Got to the closing '}' */ 5224 if (count) { 5225 goto do_concat; 5226 } 5227 5228 /* Here, is a single code point; fail if doesn't want that */ 5229 if (! code_point_p) { 5230 RExC_parse_set(p); 5231 return FALSE; 5232 } 5233 5234 /* A single code point is easy to handle; just return it */ 5235 *code_point_p = UNI_TO_NATIVE(cp); 5236 RExC_parse_set(endbrace); 5237 nextchar(pRExC_state); 5238 return TRUE; 5239 } 5240 5241 /* Here, the parse stopped bfore the ending brace. This is legal 5242 * only if that character is a dot separating code points, like a 5243 * multiple character sequence (of the form "\N{U+c1.c2. ... }". 5244 * So the next character must be a dot (and the one after that 5245 * can't be the ending brace, or we'd have something like 5246 * \N{U+100.} ) 5247 * */ 5248 if (*RExC_parse != '.' || RExC_parse + 1 >= e) { 5249 /*point to after 1st invalid */ 5250 RExC_parse_incf(RExC_orig_utf8); 5251 /*Guard against malformed utf8*/ 5252 RExC_parse_set(MIN(e, RExC_parse)); 5253 goto bad_NU; 5254 } 5255 5256 /* Here, looks like its really a multiple character sequence. Fail 5257 * if that's not what the caller wants. But continue with counting 5258 * and error checking if they still want a count */ 5259 if (! node_p && ! cp_count) { 5260 return FALSE; 5261 } 5262 5263 /* What is done here is to convert this to a sub-pattern of the 5264 * form \x{char1}\x{char2}... and then call reg recursively to 5265 * parse it (enclosing in "(?: ... )" ). That way, it retains its 5266 * atomicness, while not having to worry about special handling 5267 * that some code points may have. We don't create a subpattern, 5268 * but go through the motions of code point counting and error 5269 * checking, if the caller doesn't want a node returned. */ 5270 5271 if (node_p && ! substitute_parse) { 5272 substitute_parse = newSVpvs("?:"); 5273 } 5274 5275 do_concat: 5276 5277 if (node_p) { 5278 /* Convert to notation the rest of the code understands */ 5279 sv_catpvs(substitute_parse, "\\x{"); 5280 sv_catpvn(substitute_parse, start_digit, 5281 RExC_parse - start_digit); 5282 sv_catpvs(substitute_parse, "}"); 5283 } 5284 5285 /* Move to after the dot (or ending brace the final time through.) 5286 * */ 5287 RExC_parse_inc_by(1); 5288 count++; 5289 5290 } while (RExC_parse < e); 5291 5292 if (! node_p) { /* Doesn't want the node */ 5293 assert (cp_count); 5294 5295 *cp_count = count; 5296 return FALSE; 5297 } 5298 5299 sv_catpvs(substitute_parse, ")"); 5300 5301 /* The values are Unicode, and therefore have to be converted to native 5302 * on a non-Unicode (meaning non-ASCII) platform. */ 5303 SET_recode_x_to_native(1); 5304 } 5305 5306 /* Here, we have the string the name evaluates to, ready to be parsed, 5307 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}" 5308 * constructs. This can be called from within a substitute parse already. 5309 * The error reporting mechanism doesn't work for 2 levels of this, but the 5310 * code above has validated this new construct, so there should be no 5311 * errors generated by the below. And this isn't an exact copy, so the 5312 * mechanism to seamlessly deal with this won't work, so turn off warnings 5313 * during it */ 5314 save_start = RExC_start; 5315 orig_end = RExC_end; 5316 5317 RExC_start = SvPVX(substitute_parse); 5318 RExC_parse_set(RExC_start); 5319 RExC_end = RExC_parse + SvCUR(substitute_parse); 5320 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE; 5321 5322 *node_p = reg(pRExC_state, 1, &flags, depth+1); 5323 5324 /* Restore the saved values */ 5325 RESTORE_WARNINGS; 5326 RExC_start = save_start; 5327 RExC_parse_set(endbrace); 5328 RExC_end = orig_end; 5329 SET_recode_x_to_native(0); 5330 5331 SvREFCNT_dec_NN(substitute_parse); 5332 5333 if (! *node_p) { 5334 RETURN_FAIL_ON_RESTART(flags, flagp); 5335 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf, 5336 (UV) flags); 5337 } 5338 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED); 5339 5340 nextchar(pRExC_state); 5341 5342 return TRUE; 5343} 5344 5345 5346STATIC U8 5347S_compute_EXACTish(RExC_state_t *pRExC_state) 5348{ 5349 U8 op; 5350 5351 PERL_ARGS_ASSERT_COMPUTE_EXACTISH; 5352 5353 if (! FOLD) { 5354 return (LOC) 5355 ? EXACTL 5356 : EXACT; 5357 } 5358 5359 op = get_regex_charset(RExC_flags); 5360 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { 5361 op--; /* /a is same as /u, and map /aa's offset to what /a's would have 5362 been, so there is no hole */ 5363 } 5364 5365 return op + EXACTF; 5366} 5367 5368/* Parse backref decimal value, unless it's too big to sensibly be a backref, 5369 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ 5370 5371static I32 5372S_backref_value(char *p, char *e) 5373{ 5374 const char* endptr = e; 5375 UV val; 5376 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX) 5377 return (I32)val; 5378 return I32_MAX; 5379} 5380 5381 5382/* 5383 - regatom - the lowest level 5384 5385 Try to identify anything special at the start of the current parse position. 5386 If there is, then handle it as required. This may involve generating a 5387 single regop, such as for an assertion; or it may involve recursing, such as 5388 to handle a () structure. 5389 5390 If the string doesn't start with something special then we gobble up 5391 as much literal text as we can. If we encounter a quantifier, we have to 5392 back off the final literal character, as that quantifier applies to just it 5393 and not to the whole string of literals. 5394 5395 Once we have been able to handle whatever type of thing started the 5396 sequence, we return the offset into the regex engine program being compiled 5397 at which any next regnode should be placed. 5398 5399 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN. 5400 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be 5401 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 5402 Otherwise does not return 0. 5403 5404 Note: we have to be careful with escapes, as they can be both literal 5405 and special, and in the case of \10 and friends, context determines which. 5406 5407 A summary of the code structure is: 5408 5409 switch (first_byte) { 5410 cases for each special: 5411 handle this special; 5412 break; 5413 case '\\': 5414 switch (2nd byte) { 5415 cases for each unambiguous special: 5416 handle this special; 5417 break; 5418 cases for each ambiguous special/literal: 5419 disambiguate; 5420 if (special) handle here 5421 else goto defchar; 5422 default: // unambiguously literal: 5423 goto defchar; 5424 } 5425 default: // is a literal char 5426 // FALL THROUGH 5427 defchar: 5428 create EXACTish node for literal; 5429 while (more input and node isn't full) { 5430 switch (input_byte) { 5431 cases for each special; 5432 make sure parse pointer is set so that the next call to 5433 regatom will see this special first 5434 goto loopdone; // EXACTish node terminated by prev. char 5435 default: 5436 append char to EXACTISH node; 5437 } 5438 get next input byte; 5439 } 5440 loopdone: 5441 } 5442 return the generated node; 5443 5444 Specifically there are two separate switches for handling 5445 escape sequences, with the one for handling literal escapes requiring 5446 a dummy entry for all of the special escapes that are actually handled 5447 by the other. 5448 5449*/ 5450 5451STATIC regnode_offset 5452S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 5453{ 5454 regnode_offset ret = 0; 5455 I32 flags = 0; 5456 char *atom_parse_start; 5457 U8 op; 5458 int invert = 0; 5459 5460 DECLARE_AND_GET_RE_DEBUG_FLAGS; 5461 5462 *flagp = 0; /* Initialize. */ 5463 5464 DEBUG_PARSE("atom"); 5465 5466 PERL_ARGS_ASSERT_REGATOM; 5467 5468 tryagain: 5469 atom_parse_start = RExC_parse; 5470 assert(RExC_parse < RExC_end); 5471 switch ((U8)*RExC_parse) { 5472 case '^': 5473 RExC_seen_zerolen++; 5474 nextchar(pRExC_state); 5475 if (RExC_flags & RXf_PMf_MULTILINE) 5476 ret = reg_node(pRExC_state, MBOL); 5477 else 5478 ret = reg_node(pRExC_state, SBOL); 5479 break; 5480 case '$': 5481 nextchar(pRExC_state); 5482 if (*RExC_parse) 5483 RExC_seen_zerolen++; 5484 if (RExC_flags & RXf_PMf_MULTILINE) 5485 ret = reg_node(pRExC_state, MEOL); 5486 else 5487 ret = reg_node(pRExC_state, SEOL); 5488 break; 5489 case '.': 5490 nextchar(pRExC_state); 5491 if (RExC_flags & RXf_PMf_SINGLELINE) 5492 ret = reg_node(pRExC_state, SANY); 5493 else 5494 ret = reg_node(pRExC_state, REG_ANY); 5495 *flagp |= HASWIDTH|SIMPLE; 5496 MARK_NAUGHTY(1); 5497 break; 5498 case '[': 5499 { 5500 char * const cc_parse_start = ++RExC_parse; 5501 ret = regclass(pRExC_state, flagp, depth+1, 5502 FALSE, /* means parse the whole char class */ 5503 TRUE, /* allow multi-char folds */ 5504 FALSE, /* don't silence non-portable warnings. */ 5505 (bool) RExC_strict, 5506 TRUE, /* Allow an optimized regnode result */ 5507 NULL); 5508 if (ret == 0) { 5509 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 5510 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, 5511 (UV) *flagp); 5512 } 5513 if (*RExC_parse != ']') { 5514 RExC_parse_set(cc_parse_start); 5515 vFAIL("Unmatched ["); 5516 } 5517 nextchar(pRExC_state); 5518 break; 5519 } 5520 case '(': 5521 nextchar(pRExC_state); 5522 ret = reg(pRExC_state, 2, &flags, depth+1); 5523 if (ret == 0) { 5524 if (flags & TRYAGAIN) { 5525 if (RExC_parse >= RExC_end) { 5526 /* Make parent create an empty node if needed. */ 5527 *flagp |= TRYAGAIN; 5528 return(0); 5529 } 5530 goto tryagain; 5531 } 5532 RETURN_FAIL_ON_RESTART(flags, flagp); 5533 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf, 5534 (UV) flags); 5535 } 5536 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED); 5537 break; 5538 case '|': 5539 case ')': 5540 if (flags & TRYAGAIN) { 5541 *flagp |= TRYAGAIN; 5542 return 0; 5543 } 5544 vFAIL("Internal urp"); 5545 /* Supposed to be caught earlier. */ 5546 break; 5547 case '?': 5548 case '+': 5549 case '*': 5550 RExC_parse_inc_by(1); 5551 vFAIL("Quantifier follows nothing"); 5552 break; 5553 case '\\': 5554 /* Special Escapes 5555 5556 This switch handles escape sequences that resolve to some kind 5557 of special regop and not to literal text. Escape sequences that 5558 resolve to literal text are handled below in the switch marked 5559 "Literal Escapes". 5560 5561 Every entry in this switch *must* have a corresponding entry 5562 in the literal escape switch. However, the opposite is not 5563 required, as the default for this switch is to jump to the 5564 literal text handling code. 5565 */ 5566 RExC_parse_inc_by(1); 5567 switch ((U8)*RExC_parse) { 5568 /* Special Escapes */ 5569 case 'A': 5570 RExC_seen_zerolen++; 5571 /* Under wildcards, this is changed to match \n; should be 5572 * invisible to the user, as they have to compile under /m */ 5573 if (RExC_pm_flags & PMf_WILDCARD) { 5574 ret = reg_node(pRExC_state, MBOL); 5575 } 5576 else { 5577 ret = reg_node(pRExC_state, SBOL); 5578 /* SBOL is shared with /^/ so we set the flags so we can tell 5579 * /\A/ from /^/ in split. */ 5580 FLAGS(REGNODE_p(ret)) = 1; 5581 } 5582 goto finish_meta_pat; 5583 case 'G': 5584 if (RExC_pm_flags & PMf_WILDCARD) { 5585 RExC_parse_inc_by(1); 5586 /* diag_listed_as: Use of %s is not allowed in Unicode property 5587 wildcard subpatterns in regex; marked by <-- HERE in m/%s/ 5588 */ 5589 vFAIL("Use of '\\G' is not allowed in Unicode property" 5590 " wildcard subpatterns"); 5591 } 5592 ret = reg_node(pRExC_state, GPOS); 5593 RExC_seen |= REG_GPOS_SEEN; 5594 goto finish_meta_pat; 5595 case 'K': 5596 if (!RExC_in_lookaround) { 5597 RExC_seen_zerolen++; 5598 ret = reg_node(pRExC_state, KEEPS); 5599 /* XXX:dmq : disabling in-place substitution seems to 5600 * be necessary here to avoid cases of memory corruption, as 5601 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 5602 */ 5603 RExC_seen |= REG_LOOKBEHIND_SEEN; 5604 goto finish_meta_pat; 5605 } 5606 else { 5607 ++RExC_parse; /* advance past the 'K' */ 5608 vFAIL("\\K not permitted in lookahead/lookbehind"); 5609 } 5610 case 'Z': 5611 if (RExC_pm_flags & PMf_WILDCARD) { 5612 /* See comment under \A above */ 5613 ret = reg_node(pRExC_state, MEOL); 5614 } 5615 else { 5616 ret = reg_node(pRExC_state, SEOL); 5617 } 5618 RExC_seen_zerolen++; /* Do not optimize RE away */ 5619 goto finish_meta_pat; 5620 case 'z': 5621 if (RExC_pm_flags & PMf_WILDCARD) { 5622 /* See comment under \A above */ 5623 ret = reg_node(pRExC_state, MEOL); 5624 } 5625 else { 5626 ret = reg_node(pRExC_state, EOS); 5627 } 5628 RExC_seen_zerolen++; /* Do not optimize RE away */ 5629 goto finish_meta_pat; 5630 case 'C': 5631 vFAIL("\\C no longer supported"); 5632 case 'X': 5633 ret = reg_node(pRExC_state, CLUMP); 5634 *flagp |= HASWIDTH; 5635 goto finish_meta_pat; 5636 5637 case 'B': 5638 invert = 1; 5639 /* FALLTHROUGH */ 5640 case 'b': 5641 { 5642 U8 flags = 0; 5643 regex_charset charset = get_regex_charset(RExC_flags); 5644 5645 RExC_seen_zerolen++; 5646 RExC_seen |= REG_LOOKBEHIND_SEEN; 5647 op = BOUND + charset; 5648 5649 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { 5650 flags = TRADITIONAL_BOUND; 5651 if (op > BOUNDA) { /* /aa is same as /a */ 5652 op = BOUNDA; 5653 } 5654 } 5655 else { 5656 STRLEN length; 5657 char name = *RExC_parse; 5658 char * endbrace = (char *) memchr(RExC_parse, '}', 5659 RExC_end - RExC_parse); 5660 char * e = endbrace; 5661 5662 RExC_parse_inc_by(2); 5663 5664 if (! endbrace) { 5665 vFAIL2("Missing right brace on \\%c{}", name); 5666 } 5667 5668 while (isBLANK(*RExC_parse)) { 5669 RExC_parse_inc_by(1); 5670 } 5671 5672 while (RExC_parse < e && isBLANK(*(e - 1))) { 5673 e--; 5674 } 5675 5676 if (e == RExC_parse) { 5677 RExC_parse_set(endbrace + 1); /* After the '}' */ 5678 vFAIL2("Empty \\%c{}", name); 5679 } 5680 5681 length = e - RExC_parse; 5682 5683 switch (*RExC_parse) { 5684 case 'g': 5685 if ( length != 1 5686 && (memNEs(RExC_parse + 1, length - 1, "cb"))) 5687 { 5688 goto bad_bound_type; 5689 } 5690 flags = GCB_BOUND; 5691 break; 5692 case 'l': 5693 if (length != 2 || *(RExC_parse + 1) != 'b') { 5694 goto bad_bound_type; 5695 } 5696 flags = LB_BOUND; 5697 break; 5698 case 's': 5699 if (length != 2 || *(RExC_parse + 1) != 'b') { 5700 goto bad_bound_type; 5701 } 5702 flags = SB_BOUND; 5703 break; 5704 case 'w': 5705 if (length != 2 || *(RExC_parse + 1) != 'b') { 5706 goto bad_bound_type; 5707 } 5708 flags = WB_BOUND; 5709 break; 5710 default: 5711 bad_bound_type: 5712 RExC_parse_set(e); 5713 vFAIL2utf8f( 5714 "'%" UTF8f "' is an unknown bound type", 5715 UTF8fARG(UTF, length, e - length)); 5716 NOT_REACHED; /*NOTREACHED*/ 5717 } 5718 RExC_parse_set(endbrace); 5719 REQUIRE_UNI_RULES(flagp, 0); 5720 5721 if (op == BOUND) { 5722 op = BOUNDU; 5723 } 5724 else if (op >= BOUNDA) { /* /aa is same as /a */ 5725 op = BOUNDU; 5726 length += 4; 5727 5728 /* Don't have to worry about UTF-8, in this message because 5729 * to get here the contents of the \b must be ASCII */ 5730 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */ 5731 "Using /u for '%.*s' instead of /%s", 5732 (unsigned) length, 5733 endbrace - length + 1, 5734 (charset == REGEX_ASCII_RESTRICTED_CHARSET) 5735 ? ASCII_RESTRICT_PAT_MODS 5736 : ASCII_MORE_RESTRICT_PAT_MODS); 5737 } 5738 } 5739 5740 if (op == BOUND) { 5741 RExC_seen_d_op = TRUE; 5742 } 5743 else if (op == BOUNDL) { 5744 RExC_contains_locale = 1; 5745 } 5746 5747 if (invert) { 5748 op += NBOUND - BOUND; 5749 } 5750 5751 ret = reg_node(pRExC_state, op); 5752 FLAGS(REGNODE_p(ret)) = flags; 5753 5754 goto finish_meta_pat; 5755 } 5756 5757 case 'R': 5758 ret = reg_node(pRExC_state, LNBREAK); 5759 *flagp |= HASWIDTH|SIMPLE; 5760 goto finish_meta_pat; 5761 5762 case 'd': 5763 case 'D': 5764 case 'h': 5765 case 'H': 5766 case 'p': 5767 case 'P': 5768 case 's': 5769 case 'S': 5770 case 'v': 5771 case 'V': 5772 case 'w': 5773 case 'W': 5774 /* These all have the same meaning inside [brackets], and it knows 5775 * how to do the best optimizations for them. So, pretend we found 5776 * these within brackets, and let it do the work */ 5777 RExC_parse--; 5778 5779 ret = regclass(pRExC_state, flagp, depth+1, 5780 TRUE, /* means just parse this element */ 5781 FALSE, /* don't allow multi-char folds */ 5782 FALSE, /* don't silence non-portable warnings. It 5783 would be a bug if these returned 5784 non-portables */ 5785 (bool) RExC_strict, 5786 TRUE, /* Allow an optimized regnode result */ 5787 NULL); 5788 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 5789 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 5790 * multi-char folds are allowed. */ 5791 if (!ret) 5792 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, 5793 (UV) *flagp); 5794 5795 RExC_parse--; /* regclass() leaves this one too far ahead */ 5796 5797 finish_meta_pat: 5798 /* The escapes above that don't take a parameter can't be 5799 * followed by a '{'. But 'pX', 'p{foo}' and 5800 * correspondingly 'P' can be */ 5801 if ( RExC_parse - atom_parse_start == 1 5802 && UCHARAT(RExC_parse + 1) == '{' 5803 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL))) 5804 { 5805 RExC_parse_inc_by(2); 5806 vFAIL("Unescaped left brace in regex is illegal here"); 5807 } 5808 nextchar(pRExC_state); 5809 break; 5810 case 'N': 5811 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the 5812 * \N{...} evaluates to a sequence of more than one code points). 5813 * The function call below returns a regnode, which is our result. 5814 * The parameters cause it to fail if the \N{} evaluates to a 5815 * single code point; we handle those like any other literal. The 5816 * reason that the multicharacter case is handled here and not as 5817 * part of the EXACtish code is because of quantifiers. In 5818 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it 5819 * this way makes that Just Happen. dmq. 5820 * join_exact() will join this up with adjacent EXACTish nodes 5821 * later on, if appropriate. */ 5822 ++RExC_parse; 5823 if (grok_bslash_N(pRExC_state, 5824 &ret, /* Want a regnode returned */ 5825 NULL, /* Fail if evaluates to a single code 5826 point */ 5827 NULL, /* Don't need a count of how many code 5828 points */ 5829 flagp, 5830 RExC_strict, 5831 depth) 5832 ) { 5833 break; 5834 } 5835 5836 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 5837 5838 /* Here, evaluates to a single code point. Go get that */ 5839 RExC_parse_set(atom_parse_start); 5840 goto defchar; 5841 5842 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */ 5843 parse_named_seq: /* Also handle non-numeric \g{...} */ 5844 { 5845 char ch; 5846 if ( RExC_parse >= RExC_end - 1 5847 || (( ch = RExC_parse[1]) != '<' 5848 && ch != '\'' 5849 && ch != '{')) 5850 { 5851 RExC_parse_inc_by(1); 5852 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 5853 vFAIL2("Sequence %.2s... not terminated", atom_parse_start); 5854 } else { 5855 RExC_parse_inc_by(2); 5856 if (ch == '{') { 5857 while (isBLANK(*RExC_parse)) { 5858 RExC_parse_inc_by(1); 5859 } 5860 } 5861 ret = handle_named_backref(pRExC_state, 5862 flagp, 5863 atom_parse_start, 5864 (ch == '<') 5865 ? '>' 5866 : (ch == '{') 5867 ? '}' 5868 : '\''); 5869 } 5870 break; 5871 } 5872 case 'g': 5873 case '1': case '2': case '3': case '4': 5874 case '5': case '6': case '7': case '8': case '9': 5875 { 5876 I32 num; 5877 char * endbrace = NULL; 5878 char * s = RExC_parse; 5879 char * e = RExC_end; 5880 5881 if (*s == 'g') { 5882 bool isrel = 0; 5883 5884 s++; 5885 if (*s == '{') { 5886 endbrace = (char *) memchr(s, '}', RExC_end - s); 5887 if (! endbrace ) { 5888 5889 /* Missing '}'. Position after the number to give 5890 * a better indication to the user of where the 5891 * problem is. */ 5892 s++; 5893 if (*s == '-') { 5894 s++; 5895 } 5896 5897 /* If it looks to be a name and not a number, go 5898 * handle it there */ 5899 if (! isDIGIT(*s)) { 5900 goto parse_named_seq; 5901 } 5902 5903 do { 5904 s++; 5905 } while isDIGIT(*s); 5906 5907 RExC_parse_set(s); 5908 vFAIL("Unterminated \\g{...} pattern"); 5909 } 5910 5911 s++; /* Past the '{' */ 5912 5913 while (isBLANK(*s)) { 5914 s++; 5915 } 5916 5917 /* Ignore trailing blanks */ 5918 e = endbrace; 5919 while (s < e && isBLANK(*(e - 1))) { 5920 e--; 5921 } 5922 } 5923 5924 /* Here, have isolated the meat of the construct from any 5925 * surrounding braces */ 5926 5927 if (*s == '-') { 5928 isrel = 1; 5929 s++; 5930 } 5931 5932 if (endbrace && !isDIGIT(*s)) { 5933 goto parse_named_seq; 5934 } 5935 5936 RExC_parse_set(s); 5937 num = S_backref_value(RExC_parse, RExC_end); 5938 if (num == 0) 5939 vFAIL("Reference to invalid group 0"); 5940 else if (num == I32_MAX) { 5941 if (isDIGIT(*RExC_parse)) 5942 vFAIL("Reference to nonexistent group"); 5943 else 5944 vFAIL("Unterminated \\g... pattern"); 5945 } 5946 5947 if (isrel) { 5948 num = RExC_npar - num; 5949 if (num < 1) 5950 vFAIL("Reference to nonexistent or unclosed group"); 5951 } 5952 else 5953 if (num < RExC_logical_npar) { 5954 num = RExC_logical_to_parno[num]; 5955 } 5956 else 5957 if (ALL_PARENS_COUNTED) { 5958 if (num < RExC_logical_total_parens) 5959 num = RExC_logical_to_parno[num]; 5960 else { 5961 num = -1; 5962 } 5963 } 5964 else{ 5965 REQUIRE_PARENS_PASS; 5966 } 5967 } 5968 else { 5969 num = S_backref_value(RExC_parse, RExC_end); 5970 /* bare \NNN might be backref or octal - if it is larger 5971 * than or equal RExC_npar then it is assumed to be an 5972 * octal escape. Note RExC_npar is +1 from the actual 5973 * number of parens. */ 5974 /* Note we do NOT check if num == I32_MAX here, as that is 5975 * handled by the RExC_npar check */ 5976 5977 if ( /* any numeric escape < 10 is always a backref */ 5978 num > 9 5979 /* any numeric escape < RExC_npar is a backref */ 5980 && num >= RExC_logical_npar 5981 /* cannot be an octal escape if it starts with [89] 5982 * */ 5983 && ! inRANGE(*RExC_parse, '8', '9') 5984 ) { 5985 /* Probably not meant to be a backref, instead likely 5986 * to be an octal character escape, e.g. \35 or \777. 5987 * The above logic should make it obvious why using 5988 * octal escapes in patterns is problematic. - Yves */ 5989 RExC_parse_set(atom_parse_start); 5990 goto defchar; 5991 } 5992 if (num < RExC_logical_npar) { 5993 num = RExC_logical_to_parno[num]; 5994 } 5995 else 5996 if (ALL_PARENS_COUNTED) { 5997 if (num < RExC_logical_total_parens) { 5998 num = RExC_logical_to_parno[num]; 5999 } else { 6000 num = -1; 6001 } 6002 } else { 6003 REQUIRE_PARENS_PASS; 6004 } 6005 } 6006 6007 /* At this point RExC_parse points at a numeric escape like 6008 * \12 or \88 or the digits in \g{34} or \g34 or something 6009 * similar, which we should NOT treat as an octal escape. It 6010 * may or may not be a valid backref escape. For instance 6011 * \88888888 is unlikely to be a valid backref. 6012 * 6013 * We've already figured out what value the digits represent. 6014 * Now, move the parse to beyond them. */ 6015 if (endbrace) { 6016 RExC_parse_set(endbrace + 1); 6017 } 6018 else while (isDIGIT(*RExC_parse)) { 6019 RExC_parse_inc_by(1); 6020 } 6021 if (num < 0) 6022 vFAIL("Reference to nonexistent group"); 6023 6024 if (num >= (I32)RExC_npar) { 6025 /* It might be a forward reference; we can't fail until we 6026 * know, by completing the parse to get all the groups, and 6027 * then reparsing */ 6028 if (ALL_PARENS_COUNTED) { 6029 if (num >= RExC_total_parens) { 6030 vFAIL("Reference to nonexistent group"); 6031 } 6032 } 6033 else { 6034 REQUIRE_PARENS_PASS; 6035 } 6036 } 6037 RExC_sawback = 1; 6038 ret = reg2node(pRExC_state, 6039 ((! FOLD) 6040 ? REF 6041 : (ASCII_FOLD_RESTRICTED) 6042 ? REFFA 6043 : (AT_LEAST_UNI_SEMANTICS) 6044 ? REFFU 6045 : (LOC) 6046 ? REFFL 6047 : REFF), 6048 num, RExC_nestroot); 6049 if (RExC_nestroot && num >= RExC_nestroot) 6050 FLAGS(REGNODE_p(ret)) = VOLATILE_REF; 6051 if (OP(REGNODE_p(ret)) == REFF) { 6052 RExC_seen_d_op = TRUE; 6053 } 6054 *flagp |= HASWIDTH; 6055 6056 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 6057 FALSE /* Don't force to /x */ ); 6058 } 6059 break; 6060 case '\0': 6061 if (RExC_parse >= RExC_end) 6062 FAIL("Trailing \\"); 6063 /* FALLTHROUGH */ 6064 default: 6065 /* Do not generate "unrecognized" warnings here, we fall 6066 back into the quick-grab loop below */ 6067 RExC_parse_set(atom_parse_start); 6068 goto defchar; 6069 } /* end of switch on a \foo sequence */ 6070 break; 6071 6072 case '#': 6073 6074 /* '#' comments should have been spaced over before this function was 6075 * called */ 6076 assert((RExC_flags & RXf_PMf_EXTENDED) == 0); 6077 /* 6078 if (RExC_flags & RXf_PMf_EXTENDED) { 6079 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) ); 6080 if (RExC_parse < RExC_end) 6081 goto tryagain; 6082 } 6083 */ 6084 6085 /* FALLTHROUGH */ 6086 6087 default: 6088 defchar: { 6089 6090 /* Here, we have determined that the next thing is probably a 6091 * literal character. RExC_parse points to the first byte of its 6092 * definition. (It still may be an escape sequence that evaluates 6093 * to a single character) */ 6094 6095 STRLEN len = 0; 6096 UV ender = 0; 6097 char *p; 6098 char *s, *old_s = NULL, *old_old_s = NULL; 6099 char *s0; 6100 U32 max_string_len = 255; 6101 6102 /* We may have to reparse the node, artificially stopping filling 6103 * it early, based on info gleaned in the first parse. This 6104 * variable gives where we stop. Make it above the normal stopping 6105 * place first time through; otherwise it would stop too early */ 6106 U32 upper_fill = max_string_len + 1; 6107 6108 /* We start out as an EXACT node, even if under /i, until we find a 6109 * character which is in a fold. The algorithm now segregates into 6110 * separate nodes, characters that fold from those that don't under 6111 * /i. (This hopefully will create nodes that are fixed strings 6112 * even under /i, giving the optimizer something to grab on to.) 6113 * So, if a node has something in it and the next character is in 6114 * the opposite category, that node is closed up, and the function 6115 * returns. Then regatom is called again, and a new node is 6116 * created for the new category. */ 6117 U8 node_type = EXACT; 6118 6119 /* Assume the node will be fully used; the excess is given back at 6120 * the end. Under /i, we may need to temporarily add the fold of 6121 * an extra character or two at the end to check for splitting 6122 * multi-char folds, so allocate extra space for that. We can't 6123 * make any other length assumptions, as a byte input sequence 6124 * could shrink down. */ 6125 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len 6126 + ((! FOLD) 6127 ? 0 6128 : 2 * ((UTF) 6129 ? UTF8_MAXBYTES_CASE 6130 /* Max non-UTF-8 expansion is 2 */ : 2))); 6131 6132 bool next_is_quantifier; 6133 char * oldp = NULL; 6134 6135 /* We can convert EXACTF nodes to EXACTFU if they contain only 6136 * characters that match identically regardless of the target 6137 * string's UTF8ness. The reason to do this is that EXACTF is not 6138 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at 6139 * runtime. 6140 * 6141 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they 6142 * contain only above-Latin1 characters (hence must be in UTF8), 6143 * which don't participate in folds with Latin1-range characters, 6144 * as the latter's folds aren't known until runtime. */ 6145 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC); 6146 6147 /* Single-character EXACTish nodes are almost always SIMPLE. This 6148 * allows us to override this as encountered */ 6149 U8 maybe_SIMPLE = SIMPLE; 6150 6151 /* Does this node contain something that can't match unless the 6152 * target string is (also) in UTF-8 */ 6153 bool requires_utf8_target = FALSE; 6154 6155 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */ 6156 bool has_ss = FALSE; 6157 6158 /* So is the MICRO SIGN */ 6159 bool has_micro_sign = FALSE; 6160 6161 /* Set when we fill up the current node and there is still more 6162 * text to process */ 6163 bool overflowed; 6164 6165 /* Allocate an EXACT node. The node_type may change below to 6166 * another EXACTish node, but since the size of the node doesn't 6167 * change, it works */ 6168 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes); 6169 FILL_NODE(ret, node_type); 6170 RExC_emit += NODE_STEP_REGNODE; 6171 6172 s = STRING(REGNODE_p(ret)); 6173 6174 s0 = s; 6175 6176 reparse: 6177 6178 p = RExC_parse; 6179 len = 0; 6180 s = s0; 6181 node_type = EXACT; 6182 oldp = NULL; 6183 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC); 6184 maybe_SIMPLE = SIMPLE; 6185 requires_utf8_target = FALSE; 6186 has_ss = FALSE; 6187 has_micro_sign = FALSE; 6188 6189 continue_parse: 6190 6191 /* This breaks under rare circumstances. If folding, we do not 6192 * want to split a node at a character that is a non-final in a 6193 * multi-char fold, as an input string could just happen to want to 6194 * match across the node boundary. The code at the end of the loop 6195 * looks for this, and backs off until it finds not such a 6196 * character, but it is possible (though extremely, extremely 6197 * unlikely) for all characters in the node to be non-final fold 6198 * ones, in which case we just leave the node fully filled, and 6199 * hope that it doesn't match the string in just the wrong place */ 6200 6201 assert( ! UTF /* Is at the beginning of a character */ 6202 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) 6203 || UTF8_IS_START(UCHARAT(RExC_parse))); 6204 6205 overflowed = FALSE; 6206 6207 /* Here, we have a literal character. Find the maximal string of 6208 * them in the input that we can fit into a single EXACTish node. 6209 * We quit at the first non-literal or when the node gets full, or 6210 * under /i the categorization of folding/non-folding character 6211 * changes */ 6212 while (p < RExC_end && len < upper_fill) { 6213 6214 /* In most cases each iteration adds one byte to the output. 6215 * The exceptions override this */ 6216 Size_t added_len = 1; 6217 6218 oldp = p; 6219 old_old_s = old_s; 6220 old_s = s; 6221 6222 /* White space has already been ignored */ 6223 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0 6224 || ! is_PATWS_safe((p), RExC_end, UTF)); 6225 6226 switch ((U8)*p) { 6227 const char* message; 6228 U32 packed_warn; 6229 U8 grok_c_char; 6230 6231 case '^': 6232 case '$': 6233 case '.': 6234 case '[': 6235 case '(': 6236 case ')': 6237 case '|': 6238 goto loopdone; 6239 case '\\': 6240 /* Literal Escapes Switch 6241 6242 This switch is meant to handle escape sequences that 6243 resolve to a literal character. 6244 6245 Every escape sequence that represents something 6246 else, like an assertion or a char class, is handled 6247 in the switch marked 'Special Escapes' above in this 6248 routine, but also has an entry here as anything that 6249 isn't explicitly mentioned here will be treated as 6250 an unescaped equivalent literal. 6251 */ 6252 6253 switch ((U8)*++p) { 6254 6255 /* These are all the special escapes. */ 6256 case 'A': /* Start assertion */ 6257 case 'b': case 'B': /* Word-boundary assertion*/ 6258 case 'C': /* Single char !DANGEROUS! */ 6259 case 'd': case 'D': /* digit class */ 6260 case 'g': case 'G': /* generic-backref, pos assertion */ 6261 case 'h': case 'H': /* HORIZWS */ 6262 case 'k': case 'K': /* named backref, keep marker */ 6263 case 'p': case 'P': /* Unicode property */ 6264 case 'R': /* LNBREAK */ 6265 case 's': case 'S': /* space class */ 6266 case 'v': case 'V': /* VERTWS */ 6267 case 'w': case 'W': /* word class */ 6268 case 'X': /* eXtended Unicode "combining 6269 character sequence" */ 6270 case 'z': case 'Z': /* End of line/string assertion */ 6271 --p; 6272 goto loopdone; 6273 6274 /* Anything after here is an escape that resolves to a 6275 literal. (Except digits, which may or may not) 6276 */ 6277 case 'n': 6278 ender = '\n'; 6279 p++; 6280 break; 6281 case 'N': /* Handle a single-code point named character. */ 6282 RExC_parse_set( p + 1 ); 6283 if (! grok_bslash_N(pRExC_state, 6284 NULL, /* Fail if evaluates to 6285 anything other than a 6286 single code point */ 6287 &ender, /* The returned single code 6288 point */ 6289 NULL, /* Don't need a count of 6290 how many code points */ 6291 flagp, 6292 RExC_strict, 6293 depth) 6294 ) { 6295 if (*flagp & NEED_UTF8) 6296 FAIL("panic: grok_bslash_N set NEED_UTF8"); 6297 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 6298 6299 /* Here, it wasn't a single code point. Go close 6300 * up this EXACTish node. The switch() prior to 6301 * this switch handles the other cases */ 6302 p = oldp; 6303 RExC_parse_set(p); 6304 goto loopdone; 6305 } 6306 p = RExC_parse; 6307 RExC_parse_set(atom_parse_start); 6308 6309 /* The \N{} means the pattern, if previously /d, 6310 * becomes /u. That means it can't be an EXACTF node, 6311 * but an EXACTFU */ 6312 if (node_type == EXACTF) { 6313 node_type = EXACTFU; 6314 6315 /* If the node already contains something that 6316 * differs between EXACTF and EXACTFU, reparse it 6317 * as EXACTFU */ 6318 if (! maybe_exactfu) { 6319 len = 0; 6320 s = s0; 6321 goto reparse; 6322 } 6323 } 6324 6325 break; 6326 case 'r': 6327 ender = '\r'; 6328 p++; 6329 break; 6330 case 't': 6331 ender = '\t'; 6332 p++; 6333 break; 6334 case 'f': 6335 ender = '\f'; 6336 p++; 6337 break; 6338 case 'e': 6339 ender = ESC_NATIVE; 6340 p++; 6341 break; 6342 case 'a': 6343 ender = '\a'; 6344 p++; 6345 break; 6346 case 'o': 6347 if (! grok_bslash_o(&p, 6348 RExC_end, 6349 &ender, 6350 &message, 6351 &packed_warn, 6352 (bool) RExC_strict, 6353 FALSE, /* No illegal cp's */ 6354 UTF)) 6355 { 6356 RExC_parse_set(p); /* going to die anyway; point to 6357 exact spot of failure */ 6358 vFAIL(message); 6359 } 6360 6361 if (message && TO_OUTPUT_WARNINGS(p)) { 6362 warn_non_literal_string(p, packed_warn, message); 6363 } 6364 break; 6365 case 'x': 6366 if (! grok_bslash_x(&p, 6367 RExC_end, 6368 &ender, 6369 &message, 6370 &packed_warn, 6371 (bool) RExC_strict, 6372 FALSE, /* No illegal cp's */ 6373 UTF)) 6374 { 6375 RExC_parse_set(p); /* going to die anyway; point 6376 to exact spot of failure */ 6377 vFAIL(message); 6378 } 6379 6380 if (message && TO_OUTPUT_WARNINGS(p)) { 6381 warn_non_literal_string(p, packed_warn, message); 6382 } 6383 6384#ifdef EBCDIC 6385 if (ender < 0x100) { 6386 if (RExC_recode_x_to_native) { 6387 ender = LATIN1_TO_NATIVE(ender); 6388 } 6389 } 6390#endif 6391 break; 6392 case 'c': 6393 p++; 6394 if (! grok_bslash_c(*p, &grok_c_char, 6395 &message, &packed_warn)) 6396 { 6397 /* going to die anyway; point to exact spot of 6398 * failure */ 6399 char *new_p= p + ((UTF) 6400 ? UTF8_SAFE_SKIP(p, RExC_end) 6401 : 1); 6402 RExC_parse_set(new_p); 6403 vFAIL(message); 6404 } 6405 6406 ender = grok_c_char; 6407 p++; 6408 if (message && TO_OUTPUT_WARNINGS(p)) { 6409 warn_non_literal_string(p, packed_warn, message); 6410 } 6411 6412 break; 6413 case '8': case '9': /* must be a backreference */ 6414 --p; 6415 /* we have an escape like \8 which cannot be an octal escape 6416 * so we exit the loop, and let the outer loop handle this 6417 * escape which may or may not be a legitimate backref. */ 6418 goto loopdone; 6419 case '1': case '2': case '3':case '4': 6420 case '5': case '6': case '7': 6421 6422 /* When we parse backslash escapes there is ambiguity 6423 * between backreferences and octal escapes. Any escape 6424 * from \1 - \9 is a backreference, any multi-digit 6425 * escape which does not start with 0 and which when 6426 * evaluated as decimal could refer to an already 6427 * parsed capture buffer is a back reference. Anything 6428 * else is octal. 6429 * 6430 * Note this implies that \118 could be interpreted as 6431 * 118 OR as "\11" . "8" depending on whether there 6432 * were 118 capture buffers defined already in the 6433 * pattern. */ 6434 6435 /* NOTE, RExC_npar is 1 more than the actual number of 6436 * parens we have seen so far, hence the "<" as opposed 6437 * to "<=" */ 6438 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar) 6439 { /* Not to be treated as an octal constant, go 6440 find backref */ 6441 p = oldp; 6442 goto loopdone; 6443 } 6444 /* FALLTHROUGH */ 6445 case '0': 6446 { 6447 I32 flags = PERL_SCAN_SILENT_ILLDIGIT 6448 | PERL_SCAN_NOTIFY_ILLDIGIT; 6449 STRLEN numlen = 3; 6450 ender = grok_oct(p, &numlen, &flags, NULL); 6451 p += numlen; 6452 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT) 6453 && isDIGIT(*p) /* like \08, \178 */ 6454 && ckWARN(WARN_REGEXP)) 6455 { 6456 reg_warn_non_literal_string( 6457 p + 1, 6458 form_alien_digit_msg(8, numlen, p, 6459 RExC_end, UTF, FALSE)); 6460 } 6461 } 6462 break; 6463 case '\0': 6464 if (p >= RExC_end) 6465 FAIL("Trailing \\"); 6466 /* FALLTHROUGH */ 6467 default: 6468 if (isALPHANUMERIC(*p)) { 6469 /* An alpha followed by '{' is going to fail next 6470 * iteration, so don't output this warning in that 6471 * case */ 6472 if (! isALPHA(*p) || *(p + 1) != '{') { 6473 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s" 6474 " passed through", p); 6475 } 6476 } 6477 goto normal_default; 6478 } /* End of switch on '\' */ 6479 break; 6480 case '{': 6481 /* Trying to gain new uses for '{' without breaking too 6482 * much existing code is hard. The solution currently 6483 * adopted is: 6484 * 1) If there is no ambiguity that a '{' should always 6485 * be taken literally, at the start of a construct, we 6486 * just do so. 6487 * 2) If the literal '{' conflicts with our desired use 6488 * of it as a metacharacter, we die. The deprecation 6489 * cycles for this have come and gone. 6490 * 3) If there is ambiguity, we raise a simple warning. 6491 * This could happen, for example, if the user 6492 * intended it to introduce a quantifier, but slightly 6493 * misspelled the quantifier. Without this warning, 6494 * the quantifier would silently be taken as a literal 6495 * string of characters instead of a meta construct */ 6496 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) { 6497 if ( RExC_strict 6498 || ( p > atom_parse_start + 1 6499 && isALPHA_A(*(p - 1)) 6500 && *(p - 2) == '\\')) 6501 { 6502 RExC_parse_set(p + 1); 6503 vFAIL("Unescaped left brace in regex is " 6504 "illegal here"); 6505 } 6506 ckWARNreg(p + 1, "Unescaped left brace in regex is" 6507 " passed through"); 6508 } 6509 goto normal_default; 6510 case '}': 6511 case ']': 6512 if (p > RExC_parse && RExC_strict) { 6513 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p); 6514 } 6515 /*FALLTHROUGH*/ 6516 default: /* A literal character */ 6517 normal_default: 6518 if (! UTF8_IS_INVARIANT(*p) && UTF) { 6519 STRLEN numlen; 6520 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 6521 &numlen, UTF8_ALLOW_DEFAULT); 6522 p += numlen; 6523 } 6524 else 6525 ender = (U8) *p++; 6526 break; 6527 } /* End of switch on the literal */ 6528 6529 /* Here, have looked at the literal character, and <ender> 6530 * contains its ordinal; <p> points to the character after it. 6531 * */ 6532 6533 if (ender > 255) { 6534 REQUIRE_UTF8(flagp); 6535 if ( UNICODE_IS_PERL_EXTENDED(ender) 6536 && TO_OUTPUT_WARNINGS(p)) 6537 { 6538 ckWARN2_non_literal_string(p, 6539 packWARN(WARN_PORTABLE), 6540 PL_extended_cp_format, 6541 ender); 6542 } 6543 } 6544 6545 /* We need to check if the next non-ignored thing is a 6546 * quantifier. Move <p> to after anything that should be 6547 * ignored, which, as a side effect, positions <p> for the next 6548 * loop iteration */ 6549 skip_to_be_ignored_text(pRExC_state, &p, 6550 FALSE /* Don't force to /x */ ); 6551 6552 /* If the next thing is a quantifier, it applies to this 6553 * character only, which means that this character has to be in 6554 * its own node and can't just be appended to the string in an 6555 * existing node, so if there are already other characters in 6556 * the node, close the node with just them, and set up to do 6557 * this character again next time through, when it will be the 6558 * only thing in its new node */ 6559 6560 next_is_quantifier = LIKELY(p < RExC_end) 6561 && UNLIKELY(isQUANTIFIER(p, RExC_end)); 6562 6563 if (next_is_quantifier && LIKELY(len)) { 6564 p = oldp; 6565 goto loopdone; 6566 } 6567 6568 /* Ready to add 'ender' to the node */ 6569 6570 if (! FOLD) { /* The simple case, just append the literal */ 6571 not_fold_common: 6572 6573 /* Don't output if it would overflow */ 6574 if (UNLIKELY(len > max_string_len - ((UTF) 6575 ? UVCHR_SKIP(ender) 6576 : 1))) 6577 { 6578 overflowed = TRUE; 6579 break; 6580 } 6581 6582 if (UVCHR_IS_INVARIANT(ender) || ! UTF) { 6583 *(s++) = (char) ender; 6584 } 6585 else { 6586 U8 * new_s = uvchr_to_utf8((U8*)s, ender); 6587 added_len = (char *) new_s - s; 6588 s = (char *) new_s; 6589 6590 if (ender > 255) { 6591 requires_utf8_target = TRUE; 6592 } 6593 } 6594 } 6595 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { 6596 6597 /* Here are folding under /l, and the code point is 6598 * problematic. If this is the first character in the 6599 * node, change the node type to folding. Otherwise, if 6600 * this is the first problematic character, close up the 6601 * existing node, so can start a new node with this one */ 6602 if (! len) { 6603 node_type = EXACTFL; 6604 RExC_contains_locale = 1; 6605 } 6606 else if (node_type == EXACT) { 6607 p = oldp; 6608 goto loopdone; 6609 } 6610 6611 /* This problematic code point means we can't simplify 6612 * things */ 6613 maybe_exactfu = FALSE; 6614 6615 /* Although these two characters have folds that are 6616 * locale-problematic, they also have folds to above Latin1 6617 * that aren't a problem. Doing these now helps at 6618 * runtime. */ 6619 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU 6620 || ender == LATIN_CAPITAL_LETTER_SHARP_S)) 6621 { 6622 goto fold_anyway; 6623 } 6624 6625 /* Here, we are adding a problematic fold character. 6626 * "Problematic" in this context means that its fold isn't 6627 * known until runtime. (The non-problematic code points 6628 * are the above-Latin1 ones that fold to also all 6629 * above-Latin1. Their folds don't vary no matter what the 6630 * locale is.) But here we have characters whose fold 6631 * depends on the locale. We just add in the unfolded 6632 * character, and wait until runtime to fold it */ 6633 goto not_fold_common; 6634 } 6635 else /* regular fold; see if actually is in a fold */ 6636 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender)) 6637 || (ender > 255 6638 && ! _invlist_contains_cp(PL_in_some_fold, ender))) 6639 { 6640 /* Here, folding, but the character isn't in a fold. 6641 * 6642 * Start a new node if previous characters in the node were 6643 * folded */ 6644 if (len && node_type != EXACT) { 6645 p = oldp; 6646 goto loopdone; 6647 } 6648 6649 /* Here, continuing a node with non-folded characters. Add 6650 * this one */ 6651 goto not_fold_common; 6652 } 6653 else { /* Here, does participate in some fold */ 6654 6655 /* If this is the first character in the node, change its 6656 * type to folding. Otherwise, if this is the first 6657 * folding character in the node, close up the existing 6658 * node, so can start a new node with this one. */ 6659 if (! len) { 6660 node_type = compute_EXACTish(pRExC_state); 6661 } 6662 else if (node_type == EXACT) { 6663 p = oldp; 6664 goto loopdone; 6665 } 6666 6667 if (UTF) { /* Alway use the folded value for UTF-8 6668 patterns */ 6669 if (UVCHR_IS_INVARIANT(ender)) { 6670 if (UNLIKELY(len + 1 > max_string_len)) { 6671 overflowed = TRUE; 6672 break; 6673 } 6674 6675 *(s)++ = (U8) toFOLD(ender); 6676 } 6677 else { 6678 UV folded; 6679 6680 fold_anyway: 6681 folded = _to_uni_fold_flags( 6682 ender, 6683 (U8 *) s, /* We have allocated extra space 6684 in 's' so can't run off the 6685 end */ 6686 &added_len, 6687 FOLD_FLAGS_FULL 6688 | (( ASCII_FOLD_RESTRICTED 6689 || node_type == EXACTFL) 6690 ? FOLD_FLAGS_NOMIX_ASCII 6691 : 0)); 6692 if (UNLIKELY(len + added_len > max_string_len)) { 6693 overflowed = TRUE; 6694 break; 6695 } 6696 6697 s += added_len; 6698 6699 if ( folded > 255 6700 && LIKELY(folded != GREEK_SMALL_LETTER_MU)) 6701 { 6702 /* U+B5 folds to the MU, so its possible for a 6703 * non-UTF-8 target to match it */ 6704 requires_utf8_target = TRUE; 6705 } 6706 } 6707 } 6708 else { /* Here is non-UTF8. */ 6709 6710 /* The fold will be one or (rarely) two characters. 6711 * Check that there's room for at least a single one 6712 * before setting any flags, etc. Because otherwise an 6713 * overflowing character could cause a flag to be set 6714 * even though it doesn't end up in this node. (For 6715 * the two character fold, we check again, before 6716 * setting any flags) */ 6717 if (UNLIKELY(len + 1 > max_string_len)) { 6718 overflowed = TRUE; 6719 break; 6720 } 6721 6722#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 6723 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 6724 || UNICODE_DOT_DOT_VERSION > 0) 6725 6726 /* On non-ancient Unicodes, check for the only possible 6727 * multi-char fold */ 6728 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { 6729 6730 /* This potential multi-char fold means the node 6731 * can't be simple (because it could match more 6732 * than a single char). And in some cases it will 6733 * match 'ss', so set that flag */ 6734 maybe_SIMPLE = 0; 6735 has_ss = TRUE; 6736 6737 /* It can't change to be an EXACTFU (unless already 6738 * is one). We fold it iff under /u rules. */ 6739 if (node_type != EXACTFU) { 6740 maybe_exactfu = FALSE; 6741 } 6742 else { 6743 if (UNLIKELY(len + 2 > max_string_len)) { 6744 overflowed = TRUE; 6745 break; 6746 } 6747 6748 *(s++) = 's'; 6749 *(s++) = 's'; 6750 added_len = 2; 6751 6752 goto done_with_this_char; 6753 } 6754 } 6755 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's')) 6756 && LIKELY(len > 0) 6757 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's'))) 6758 { 6759 /* Also, the sequence 'ss' is special when not 6760 * under /u. If the target string is UTF-8, it 6761 * should match SHARP S; otherwise it won't. So, 6762 * here we have to exclude the possibility of this 6763 * node moving to /u.*/ 6764 has_ss = TRUE; 6765 maybe_exactfu = FALSE; 6766 } 6767#endif 6768 /* Here, the fold will be a single character */ 6769 6770 if (UNLIKELY(ender == MICRO_SIGN)) { 6771 has_micro_sign = TRUE; 6772 } 6773 else if (PL_fold[ender] != PL_fold_latin1[ender]) { 6774 6775 /* If the character's fold differs between /d and 6776 * /u, this can't change to be an EXACTFU node */ 6777 maybe_exactfu = FALSE; 6778 } 6779 6780 *(s++) = (DEPENDS_SEMANTICS) 6781 ? (char) toFOLD(ender) 6782 6783 /* Under /u, the fold of any character in 6784 * the 0-255 range happens to be its 6785 * lowercase equivalent, except for LATIN 6786 * SMALL LETTER SHARP S, which was handled 6787 * above, and the MICRO SIGN, whose fold 6788 * requires UTF-8 to represent. */ 6789 : (char) toLOWER_L1(ender); 6790 } 6791 } /* End of adding current character to the node */ 6792 6793 done_with_this_char: 6794 6795 len += added_len; 6796 6797 if (next_is_quantifier) { 6798 6799 /* Here, the next input is a quantifier, and to get here, 6800 * the current character is the only one in the node. */ 6801 goto loopdone; 6802 } 6803 6804 } /* End of loop through literal characters */ 6805 6806 /* Here we have either exhausted the input or run out of room in 6807 * the node. If the former, we are done. (If we encountered a 6808 * character that can't be in the node, transfer is made directly 6809 * to <loopdone>, and so we wouldn't have fallen off the end of the 6810 * loop.) */ 6811 if (LIKELY(! overflowed)) { 6812 goto loopdone; 6813 } 6814 6815 /* Here we have run out of room. We can grow plain EXACT and 6816 * LEXACT nodes. If the pattern is gigantic enough, though, 6817 * eventually we'll have to artificially chunk the pattern into 6818 * multiple nodes. */ 6819 if (! LOC && (node_type == EXACT || node_type == LEXACT)) { 6820 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret))); 6821 Size_t overhead_expansion = 0; 6822 char temp[256]; 6823 Size_t max_nodes_for_string; 6824 Size_t achievable; 6825 SSize_t delta; 6826 6827 /* Here we couldn't fit the final character in the current 6828 * node, so it will have to be reparsed, no matter what else we 6829 * do */ 6830 p = oldp; 6831 6832 /* If would have overflowed a regular EXACT node, switch 6833 * instead to an LEXACT. The code below is structured so that 6834 * the actual growing code is common to changing from an EXACT 6835 * or just increasing the LEXACT size. This means that we have 6836 * to save the string in the EXACT case before growing, and 6837 * then copy it afterwards to its new location */ 6838 if (node_type == EXACT) { 6839 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT); 6840 RExC_emit += overhead_expansion; 6841 Copy(s0, temp, len, char); 6842 } 6843 6844 /* Ready to grow. If it was a plain EXACT, the string was 6845 * saved, and the first few bytes of it overwritten by adding 6846 * an argument field. We assume, as we do elsewhere in this 6847 * file, that one byte of remaining input will translate into 6848 * one byte of output, and if that's too small, we grow again, 6849 * if too large the excess memory is freed at the end */ 6850 6851 max_nodes_for_string = U16_MAX - overhead - overhead_expansion; 6852 achievable = MIN(max_nodes_for_string, 6853 current_string_nodes + STR_SZ(RExC_end - p)); 6854 delta = achievable - current_string_nodes; 6855 6856 /* If there is just no more room, go finish up this chunk of 6857 * the pattern. */ 6858 if (delta <= 0) { 6859 goto loopdone; 6860 } 6861 6862 change_engine_size(pRExC_state, delta + overhead_expansion); 6863 current_string_nodes += delta; 6864 max_string_len 6865 = sizeof(struct regnode) * current_string_nodes; 6866 upper_fill = max_string_len + 1; 6867 6868 /* If the length was small, we know this was originally an 6869 * EXACT node now converted to LEXACT, and the string has to be 6870 * restored. Otherwise the string was untouched. 260 is just 6871 * a number safely above 255 so don't have to worry about 6872 * getting it precise */ 6873 if (len < 260) { 6874 node_type = LEXACT; 6875 FILL_NODE(ret, node_type); 6876 s0 = STRING(REGNODE_p(ret)); 6877 Copy(temp, s0, len, char); 6878 s = s0 + len; 6879 } 6880 6881 goto continue_parse; 6882 } 6883 else if (FOLD) { 6884 bool splittable = FALSE; 6885 bool backed_up = FALSE; 6886 char * e; /* should this be U8? */ 6887 char * s_start; /* should this be U8? */ 6888 6889 /* Here is /i. Running out of room creates a problem if we are 6890 * folding, and the split happens in the middle of a 6891 * multi-character fold, as a match that should have occurred, 6892 * won't, due to the way nodes are matched, and our artificial 6893 * boundary. So back off until we aren't splitting such a 6894 * fold. If there is no such place to back off to, we end up 6895 * taking the entire node as-is. This can happen if the node 6896 * consists entirely of 'f' or entirely of 's' characters (or 6897 * things that fold to them) as 'ff' and 'ss' are 6898 * multi-character folds. 6899 * 6900 * The Unicode standard says that multi character folds consist 6901 * of either two or three characters. That means we would be 6902 * splitting one if the final character in the node is at the 6903 * beginning of either type, or is the second of a three 6904 * character fold. 6905 * 6906 * At this point: 6907 * ender is the code point of the character that won't fit 6908 * in the node 6909 * s points to just beyond the final byte in the node. 6910 * It's where we would place ender if there were 6911 * room, and where in fact we do place ender's fold 6912 * in the code below, as we've over-allocated space 6913 * for s0 (hence s) to allow for this 6914 * e starts at 's' and advances as we append things. 6915 * old_s is the same as 's'. (If ender had fit, 's' would 6916 * have been advanced to beyond it). 6917 * old_old_s points to the beginning byte of the final 6918 * character in the node 6919 * p points to the beginning byte in the input of the 6920 * character beyond 'ender'. 6921 * oldp points to the beginning byte in the input of 6922 * 'ender'. 6923 * 6924 * In the case of /il, we haven't folded anything that could be 6925 * affected by the locale. That means only above-Latin1 6926 * characters that fold to other above-latin1 characters get 6927 * folded at compile time. To check where a good place to 6928 * split nodes is, everything in it will have to be folded. 6929 * The boolean 'maybe_exactfu' keeps track in /il if there are 6930 * any unfolded characters in the node. */ 6931 bool need_to_fold_loc = LOC && ! maybe_exactfu; 6932 6933 /* If we do need to fold the node, we need a place to store the 6934 * folded copy, and a way to map back to the unfolded original 6935 * */ 6936 char * locfold_buf = NULL; 6937 Size_t * loc_correspondence = NULL; 6938 6939 if (! need_to_fold_loc) { /* The normal case. Just 6940 initialize to the actual node */ 6941 e = s; 6942 s_start = s0; 6943 s = old_old_s; /* Point to the beginning of the final char 6944 that fits in the node */ 6945 } 6946 else { 6947 6948 /* Here, we have filled a /il node, and there are unfolded 6949 * characters in it. If the runtime locale turns out to be 6950 * UTF-8, there are possible multi-character folds, just 6951 * like when not under /l. The node hence can't terminate 6952 * in the middle of such a fold. To determine this, we 6953 * have to create a folded copy of this node. That means 6954 * reparsing the node, folding everything assuming a UTF-8 6955 * locale. (If at runtime it isn't such a locale, the 6956 * actions here wouldn't have been necessary, but we have 6957 * to assume the worst case.) If we find we need to back 6958 * off the folded string, we do so, and then map that 6959 * position back to the original unfolded node, which then 6960 * gets output, truncated at that spot */ 6961 6962 char * redo_p = RExC_parse; 6963 char * redo_e; 6964 char * old_redo_e; 6965 6966 /* Allow enough space assuming a single byte input folds to 6967 * a single byte output, plus assume that the two unparsed 6968 * characters (that we may need) fold to the largest number 6969 * of bytes possible, plus extra for one more worst case 6970 * scenario. In the loop below, if we start eating into 6971 * that final spare space, we enlarge this initial space */ 6972 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1; 6973 6974 Newxz(locfold_buf, size, char); 6975 Newxz(loc_correspondence, size, Size_t); 6976 6977 /* Redo this node's parse, folding into 'locfold_buf' */ 6978 redo_p = RExC_parse; 6979 old_redo_e = redo_e = locfold_buf; 6980 while (redo_p <= oldp) { 6981 6982 old_redo_e = redo_e; 6983 loc_correspondence[redo_e - locfold_buf] 6984 = redo_p - RExC_parse; 6985 6986 if (UTF) { 6987 Size_t added_len; 6988 6989 (void) _to_utf8_fold_flags((U8 *) redo_p, 6990 (U8 *) RExC_end, 6991 (U8 *) redo_e, 6992 &added_len, 6993 FOLD_FLAGS_FULL); 6994 redo_e += added_len; 6995 redo_p += UTF8SKIP(redo_p); 6996 } 6997 else { 6998 6999 /* Note that if this code is run on some ancient 7000 * Unicode versions, SHARP S doesn't fold to 'ss', 7001 * but rather than clutter the code with #ifdef's, 7002 * as is done above, we ignore that possibility. 7003 * This is ok because this code doesn't affect what 7004 * gets matched, but merely where the node gets 7005 * split */ 7006 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) { 7007 *redo_e++ = toLOWER_L1(UCHARAT(redo_p)); 7008 } 7009 else { 7010 *redo_e++ = 's'; 7011 *redo_e++ = 's'; 7012 } 7013 redo_p++; 7014 } 7015 7016 7017 /* If we're getting so close to the end that a 7018 * worst-case fold in the next character would cause us 7019 * to overflow, increase, assuming one byte output byte 7020 * per one byte input one, plus room for another worst 7021 * case fold */ 7022 if ( redo_p <= oldp 7023 && redo_e > locfold_buf + size 7024 - (UTF8_MAXBYTES_CASE + 1)) 7025 { 7026 Size_t new_size = size 7027 + (oldp - redo_p) 7028 + UTF8_MAXBYTES_CASE + 1; 7029 Ptrdiff_t e_offset = redo_e - locfold_buf; 7030 7031 Renew(locfold_buf, new_size, char); 7032 Renew(loc_correspondence, new_size, Size_t); 7033 size = new_size; 7034 7035 redo_e = locfold_buf + e_offset; 7036 } 7037 } 7038 7039 /* Set so that things are in terms of the folded, temporary 7040 * string */ 7041 s = old_redo_e; 7042 s_start = locfold_buf; 7043 e = redo_e; 7044 7045 } 7046 7047 /* Here, we have 's', 's_start' and 'e' set up to point to the 7048 * input that goes into the node, folded. 7049 * 7050 * If the final character of the node and the fold of ender 7051 * form the first two characters of a three character fold, we 7052 * need to peek ahead at the next (unparsed) character in the 7053 * input to determine if the three actually do form such a 7054 * fold. Just looking at that character is not generally 7055 * sufficient, as it could be, for example, an escape sequence 7056 * that evaluates to something else, and it needs to be folded. 7057 * 7058 * khw originally thought to just go through the parse loop one 7059 * extra time, but that doesn't work easily as that iteration 7060 * could cause things to think that the parse is over and to 7061 * goto loopdone. The character could be a '$' for example, or 7062 * the character beyond could be a quantifier, and other 7063 * glitches as well. 7064 * 7065 * The solution used here for peeking ahead is to look at that 7066 * next character. If it isn't ASCII punctuation, then it will 7067 * be something that would continue on in an EXACTish node if 7068 * there were space. We append the fold of it to s, having 7069 * reserved enough room in s0 for the purpose. If we can't 7070 * reasonably peek ahead, we instead assume the worst case: 7071 * that it is something that would form the completion of a 7072 * multi-char fold. 7073 * 7074 * If we can't split between s and ender, we work backwards 7075 * character-by-character down to s0. At each current point 7076 * see if we are at the beginning of a multi-char fold. If so, 7077 * that means we would be splitting the fold across nodes, and 7078 * so we back up one and try again. 7079 * 7080 * If we're not at the beginning, we still could be at the 7081 * final two characters of a (rare) three character fold. We 7082 * check if the sequence starting at the character before the 7083 * current position (and including the current and next 7084 * characters) is a three character fold. If not, the node can 7085 * be split here. If it is, we have to backup two characters 7086 * and try again. 7087 * 7088 * Otherwise, the node can be split at the current position. 7089 * 7090 * The same logic is used for UTF-8 patterns and not */ 7091 if (UTF) { 7092 Size_t added_len; 7093 7094 /* Append the fold of ender */ 7095 (void) _to_uni_fold_flags( 7096 ender, 7097 (U8 *) e, 7098 &added_len, 7099 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 7100 ? FOLD_FLAGS_NOMIX_ASCII 7101 : 0)); 7102 e += added_len; 7103 7104 /* 's' and the character folded to by ender may be the 7105 * first two of a three-character fold, in which case the 7106 * node should not be split here. That may mean examining 7107 * the so-far unparsed character starting at 'p'. But if 7108 * ender folded to more than one character, we already have 7109 * three characters to look at. Also, we first check if 7110 * the sequence consisting of s and the next character form 7111 * the first two of some three character fold. If not, 7112 * there's no need to peek ahead. */ 7113 if ( added_len <= UTF8SKIP(e - added_len) 7114 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e))) 7115 { 7116 /* Here, the two do form the beginning of a potential 7117 * three character fold. The unexamined character may 7118 * or may not complete it. Peek at it. It might be 7119 * something that ends the node or an escape sequence, 7120 * in which case we don't know without a lot of work 7121 * what it evaluates to, so we have to assume the worst 7122 * case: that it does complete the fold, and so we 7123 * can't split here. All such instances will have 7124 * that character be an ASCII punctuation character, 7125 * like a backslash. So, for that case, backup one and 7126 * drop down to try at that position */ 7127 if (isPUNCT(*p)) { 7128 s = (char *) utf8_hop_back((U8 *) s, -1, 7129 (U8 *) s_start); 7130 backed_up = TRUE; 7131 } 7132 else { 7133 /* Here, since it's not punctuation, it must be a 7134 * real character, and we can append its fold to 7135 * 'e' (having deliberately reserved enough space 7136 * for this eventuality) and drop down to check if 7137 * the three actually do form a folded sequence */ 7138 (void) _to_utf8_fold_flags( 7139 (U8 *) p, (U8 *) RExC_end, 7140 (U8 *) e, 7141 &added_len, 7142 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 7143 ? FOLD_FLAGS_NOMIX_ASCII 7144 : 0)); 7145 e += added_len; 7146 } 7147 } 7148 7149 /* Here, we either have three characters available in 7150 * sequence starting at 's', or we have two characters and 7151 * know that the following one can't possibly be part of a 7152 * three character fold. We go through the node backwards 7153 * until we find a place where we can split it without 7154 * breaking apart a multi-character fold. At any given 7155 * point we have to worry about if such a fold begins at 7156 * the current 's', and also if a three-character fold 7157 * begins at s-1, (containing s and s+1). Splitting in 7158 * either case would break apart a fold */ 7159 do { 7160 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1, 7161 (U8 *) s_start); 7162 7163 /* If is a multi-char fold, can't split here. Backup 7164 * one char and try again */ 7165 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) { 7166 s = prev_s; 7167 backed_up = TRUE; 7168 continue; 7169 } 7170 7171 /* If the two characters beginning at 's' are part of a 7172 * three character fold starting at the character 7173 * before s, we can't split either before or after s. 7174 * Backup two chars and try again */ 7175 if ( LIKELY(s > s_start) 7176 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e))) 7177 { 7178 s = prev_s; 7179 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start); 7180 backed_up = TRUE; 7181 continue; 7182 } 7183 7184 /* Here there's no multi-char fold between s and the 7185 * next character following it. We can split */ 7186 splittable = TRUE; 7187 break; 7188 7189 } while (s > s_start); /* End of loops backing up through the node */ 7190 7191 /* Here we either couldn't find a place to split the node, 7192 * or else we broke out of the loop setting 'splittable' to 7193 * true. In the latter case, the place to split is between 7194 * the first and second characters in the sequence starting 7195 * at 's' */ 7196 if (splittable) { 7197 s += UTF8SKIP(s); 7198 } 7199 } 7200 else { /* Pattern not UTF-8 */ 7201 if ( ender != LATIN_SMALL_LETTER_SHARP_S 7202 || ASCII_FOLD_RESTRICTED) 7203 { 7204 assert( toLOWER_L1(ender) < 256 ); 7205 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */ 7206 } 7207 else { 7208 *e++ = 's'; 7209 *e++ = 's'; 7210 } 7211 7212 if ( e - s <= 1 7213 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e))) 7214 { 7215 if (isPUNCT(*p)) { 7216 s--; 7217 backed_up = TRUE; 7218 } 7219 else { 7220 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S 7221 || ASCII_FOLD_RESTRICTED) 7222 { 7223 assert( toLOWER_L1(ender) < 256 ); 7224 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */ 7225 } 7226 else { 7227 *e++ = 's'; 7228 *e++ = 's'; 7229 } 7230 } 7231 } 7232 7233 do { 7234 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) { 7235 s--; 7236 backed_up = TRUE; 7237 continue; 7238 } 7239 7240 if ( LIKELY(s > s_start) 7241 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e))) 7242 { 7243 s -= 2; 7244 backed_up = TRUE; 7245 continue; 7246 } 7247 7248 splittable = TRUE; 7249 break; 7250 7251 } while (s > s_start); 7252 7253 if (splittable) { 7254 s++; 7255 } 7256 } 7257 7258 /* Here, we are done backing up. If we didn't backup at all 7259 * (the likely case), just proceed */ 7260 if (backed_up) { 7261 7262 /* If we did find a place to split, reparse the entire node 7263 * stopping where we have calculated. */ 7264 if (splittable) { 7265 7266 /* If we created a temporary folded string under /l, we 7267 * have to map that back to the original */ 7268 if (need_to_fold_loc) { 7269 upper_fill = loc_correspondence[s - s_start]; 7270 if (upper_fill == 0) { 7271 FAIL2("panic: loc_correspondence[%d] is 0", 7272 (int) (s - s_start)); 7273 } 7274 Safefree(locfold_buf); 7275 Safefree(loc_correspondence); 7276 } 7277 else { 7278 upper_fill = s - s0; 7279 } 7280 goto reparse; 7281 } 7282 7283 /* Here the node consists entirely of non-final multi-char 7284 * folds. (Likely it is all 'f's or all 's's.) There's no 7285 * decent place to split it, so give up and just take the 7286 * whole thing */ 7287 len = old_s - s0; 7288 } 7289 7290 if (need_to_fold_loc) { 7291 Safefree(locfold_buf); 7292 Safefree(loc_correspondence); 7293 } 7294 } /* End of verifying node ends with an appropriate char */ 7295 7296 /* We need to start the next node at the character that didn't fit 7297 * in this one */ 7298 p = oldp; 7299 7300 loopdone: /* Jumped to when encounters something that shouldn't be 7301 in the node */ 7302 7303 /* Free up any over-allocated space; cast is to silence bogus 7304 * warning in MS VC */ 7305 change_engine_size(pRExC_state, 7306 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len))); 7307 7308 /* I (khw) don't know if you can get here with zero length, but the 7309 * old code handled this situation by creating a zero-length EXACT 7310 * node. Might as well be NOTHING instead */ 7311 if (len == 0) { 7312 OP(REGNODE_p(ret)) = NOTHING; 7313 } 7314 else { 7315 7316 /* If the node type is EXACT here, check to see if it 7317 * should be EXACTL, or EXACT_REQ8. */ 7318 if (node_type == EXACT) { 7319 if (LOC) { 7320 node_type = EXACTL; 7321 } 7322 else if (requires_utf8_target) { 7323 node_type = EXACT_REQ8; 7324 } 7325 } 7326 else if (node_type == LEXACT) { 7327 if (requires_utf8_target) { 7328 node_type = LEXACT_REQ8; 7329 } 7330 } 7331 else if (FOLD) { 7332 if ( UNLIKELY(has_micro_sign || has_ss) 7333 && (node_type == EXACTFU || ( node_type == EXACTF 7334 && maybe_exactfu))) 7335 { /* These two conditions are problematic in non-UTF-8 7336 EXACTFU nodes. */ 7337 assert(! UTF); 7338 node_type = EXACTFUP; 7339 } 7340 else if (node_type == EXACTFL) { 7341 7342 /* 'maybe_exactfu' is deliberately set above to 7343 * indicate this node type, where all code points in it 7344 * are above 255 */ 7345 if (maybe_exactfu) { 7346 node_type = EXACTFLU8; 7347 } 7348 else if (UNLIKELY( 7349 _invlist_contains_cp(PL_HasMultiCharFold, ender))) 7350 { 7351 /* A character that folds to more than one will 7352 * match multiple characters, so can't be SIMPLE. 7353 * We don't have to worry about this with EXACTFLU8 7354 * nodes just above, as they have already been 7355 * folded (since the fold doesn't vary at run 7356 * time). Here, if the final character in the node 7357 * folds to multiple, it can't be simple. (This 7358 * only has an effect if the node has only a single 7359 * character, hence the final one, as elsewhere we 7360 * turn off simple for nodes whose length > 1 */ 7361 maybe_SIMPLE = 0; 7362 } 7363 } 7364 else if (node_type == EXACTF) { /* Means is /di */ 7365 7366 /* This intermediate variable is needed solely because 7367 * the asserts in the macro where used exceed Win32's 7368 * literal string capacity */ 7369 char first_char = * STRING(REGNODE_p(ret)); 7370 7371 /* If 'maybe_exactfu' is clear, then we need to stay 7372 * /di. If it is set, it means there are no code 7373 * points that match differently depending on UTF8ness 7374 * of the target string, so it can become an EXACTFU 7375 * node */ 7376 if (! maybe_exactfu) { 7377 RExC_seen_d_op = TRUE; 7378 } 7379 else if ( isALPHA_FOLD_EQ(first_char, 's') 7380 || isALPHA_FOLD_EQ(ender, 's')) 7381 { 7382 /* But, if the node begins or ends in an 's' we 7383 * have to defer changing it into an EXACTFU, as 7384 * the node could later get joined with another one 7385 * that ends or begins with 's' creating an 'ss' 7386 * sequence which would then wrongly match the 7387 * sharp s without the target being UTF-8. We 7388 * create a special node that we resolve later when 7389 * we join nodes together */ 7390 7391 node_type = EXACTFU_S_EDGE; 7392 } 7393 else { 7394 node_type = EXACTFU; 7395 } 7396 } 7397 7398 if (requires_utf8_target && node_type == EXACTFU) { 7399 node_type = EXACTFU_REQ8; 7400 } 7401 } 7402 7403 OP(REGNODE_p(ret)) = node_type; 7404 setSTR_LEN(REGNODE_p(ret), len); 7405 RExC_emit += STR_SZ(len); 7406 7407 /* If the node isn't a single character, it can't be SIMPLE */ 7408 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) { 7409 maybe_SIMPLE = 0; 7410 } 7411 7412 *flagp |= HASWIDTH | maybe_SIMPLE; 7413 } 7414 7415 RExC_parse_set(p); 7416 7417 { 7418 /* len is STRLEN which is unsigned, need to copy to signed */ 7419 IV iv = len; 7420 if (iv < 0) 7421 vFAIL("Internal disaster"); 7422 } 7423 7424 } /* End of label 'defchar:' */ 7425 break; 7426 } /* End of giant switch on input character */ 7427 7428 /* Position parse to next real character */ 7429 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 7430 FALSE /* Don't force to /x */ ); 7431 if ( *RExC_parse == '{' 7432 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL)) 7433 { 7434 if (RExC_strict) { 7435 RExC_parse_inc_by(1); 7436 vFAIL("Unescaped left brace in regex is illegal here"); 7437 } 7438 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is" 7439 " passed through"); 7440 } 7441 7442 return(ret); 7443} 7444 7445 7446void 7447Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) 7448{ 7449 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It 7450 * sets up the bitmap and any flags, removing those code points from the 7451 * inversion list, setting it to NULL should it become completely empty */ 7452 7453 7454 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST; 7455 7456 /* There is no bitmap for this node type */ 7457 if (REGNODE_TYPE(OP(node)) != ANYOF) { 7458 return; 7459 } 7460 7461 ANYOF_BITMAP_ZERO(node); 7462 if (*invlist_ptr) { 7463 7464 /* This gets set if we actually need to modify things */ 7465 bool change_invlist = FALSE; 7466 7467 UV start, end; 7468 7469 /* Start looking through *invlist_ptr */ 7470 invlist_iterinit(*invlist_ptr); 7471 while (invlist_iternext(*invlist_ptr, &start, &end)) { 7472 UV high; 7473 int i; 7474 7475 /* Quit if are above what we should change */ 7476 if (start >= NUM_ANYOF_CODE_POINTS) { 7477 break; 7478 } 7479 7480 change_invlist = TRUE; 7481 7482 /* Set all the bits in the range, up to the max that we are doing */ 7483 high = (end < NUM_ANYOF_CODE_POINTS - 1) 7484 ? end 7485 : NUM_ANYOF_CODE_POINTS - 1; 7486 for (i = start; i <= (int) high; i++) { 7487 ANYOF_BITMAP_SET(node, i); 7488 } 7489 } 7490 invlist_iterfinish(*invlist_ptr); 7491 7492 /* Done with loop; remove any code points that are in the bitmap from 7493 * *invlist_ptr */ 7494 if (change_invlist) { 7495 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); 7496 } 7497 7498 /* If have completely emptied it, remove it completely */ 7499 if (_invlist_len(*invlist_ptr) == 0) { 7500 SvREFCNT_dec_NN(*invlist_ptr); 7501 *invlist_ptr = NULL; 7502 } 7503 } 7504} 7505 7506/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 7507 Character classes ([:foo:]) can also be negated ([:^foo:]). 7508 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 7509 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 7510 but trigger failures because they are currently unimplemented. */ 7511 7512#define POSIXCC_DONE(c) ((c) == ':') 7513#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 7514#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 7515#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';') 7516 7517#define WARNING_PREFIX "Assuming NOT a POSIX class since " 7518#define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one" 7519#define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon" 7520 7521#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1) 7522 7523/* 'posix_warnings' and 'warn_text' are names of variables in the following 7524 * routine. q.v. */ 7525#define ADD_POSIX_WARNING(p, text) STMT_START { \ 7526 if (posix_warnings) { \ 7527 if (! RExC_warn_text ) RExC_warn_text = \ 7528 (AV *) sv_2mortal((SV *) newAV()); \ 7529 av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_ \ 7530 WARNING_PREFIX \ 7531 text \ 7532 REPORT_LOCATION, \ 7533 REPORT_LOCATION_ARGS(p))); \ 7534 } \ 7535 } STMT_END 7536#define CLEAR_POSIX_WARNINGS() \ 7537 STMT_START { \ 7538 if (posix_warnings && RExC_warn_text) \ 7539 av_clear(RExC_warn_text); \ 7540 } STMT_END 7541 7542#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \ 7543 STMT_START { \ 7544 CLEAR_POSIX_WARNINGS(); \ 7545 return ret; \ 7546 } STMT_END 7547 7548STATIC int 7549S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, 7550 7551 const char * const s, /* Where the putative posix class begins. 7552 Normally, this is one past the '['. This 7553 parameter exists so it can be somewhere 7554 besides RExC_parse. */ 7555 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or 7556 NULL */ 7557 AV ** posix_warnings, /* Where to place any generated warnings, or 7558 NULL */ 7559 const bool check_only /* Don't die if error */ 7560) 7561{ 7562 /* This parses what the caller thinks may be one of the three POSIX 7563 * constructs: 7564 * 1) a character class, like [:blank:] 7565 * 2) a collating symbol, like [. .] 7566 * 3) an equivalence class, like [= =] 7567 * In the latter two cases, it croaks if it finds a syntactically legal 7568 * one, as these are not handled by Perl. 7569 * 7570 * The main purpose is to look for a POSIX character class. It returns: 7571 * a) the class number 7572 * if it is a completely syntactically and semantically legal class. 7573 * 'updated_parse_ptr', if not NULL, is set to point to just after the 7574 * closing ']' of the class 7575 * b) OOB_NAMEDCLASS 7576 * if it appears that one of the three POSIX constructs was meant, but 7577 * its specification was somehow defective. 'updated_parse_ptr', if 7578 * not NULL, is set to point to the character just after the end 7579 * character of the class. See below for handling of warnings. 7580 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS 7581 * if it doesn't appear that a POSIX construct was intended. 7582 * 'updated_parse_ptr' is not changed. No warnings nor errors are 7583 * raised. 7584 * 7585 * In b) there may be errors or warnings generated. If 'check_only' is 7586 * TRUE, then any errors are discarded. Warnings are returned to the 7587 * caller via an AV* created into '*posix_warnings' if it is not NULL. If 7588 * instead it is NULL, warnings are suppressed. 7589 * 7590 * The reason for this function, and its complexity is that a bracketed 7591 * character class can contain just about anything. But it's easy to 7592 * mistype the very specific posix class syntax but yielding a valid 7593 * regular bracketed class, so it silently gets compiled into something 7594 * quite unintended. 7595 * 7596 * The solution adopted here maintains backward compatibility except that 7597 * it adds a warning if it looks like a posix class was intended but 7598 * improperly specified. The warning is not raised unless what is input 7599 * very closely resembles one of the 14 legal posix classes. To do this, 7600 * it uses fuzzy parsing. It calculates how many single-character edits it 7601 * would take to transform what was input into a legal posix class. Only 7602 * if that number is quite small does it think that the intention was a 7603 * posix class. Obviously these are heuristics, and there will be cases 7604 * where it errs on one side or another, and they can be tweaked as 7605 * experience informs. 7606 * 7607 * The syntax for a legal posix class is: 7608 * 7609 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/ 7610 * 7611 * What this routine considers syntactically to be an intended posix class 7612 * is this (the comments indicate some restrictions that the pattern 7613 * doesn't show): 7614 * 7615 * qr/(?x: \[? # The left bracket, possibly 7616 * # omitted 7617 * \h* # possibly followed by blanks 7618 * (?: \^ \h* )? # possibly a misplaced caret 7619 * [:;]? # The opening class character, 7620 * # possibly omitted. A typo 7621 * # semi-colon can also be used. 7622 * \h* 7623 * \^? # possibly a correctly placed 7624 * # caret, but not if there was also 7625 * # a misplaced one 7626 * \h* 7627 * .{3,15} # The class name. If there are 7628 * # deviations from the legal syntax, 7629 * # its edit distance must be close 7630 * # to a real class name in order 7631 * # for it to be considered to be 7632 * # an intended posix class. 7633 * \h* 7634 * [[:punct:]]? # The closing class character, 7635 * # possibly omitted. If not a colon 7636 * # nor semi colon, the class name 7637 * # must be even closer to a valid 7638 * # one 7639 * \h* 7640 * \]? # The right bracket, possibly 7641 * # omitted. 7642 * )/ 7643 * 7644 * In the above, \h must be ASCII-only. 7645 * 7646 * These are heuristics, and can be tweaked as field experience dictates. 7647 * There will be cases when someone didn't intend to specify a posix class 7648 * that this warns as being so. The goal is to minimize these, while 7649 * maximizing the catching of things intended to be a posix class that 7650 * aren't parsed as such. 7651 */ 7652 7653 const char* p = s; 7654 const char * const e = RExC_end; 7655 unsigned complement = 0; /* If to complement the class */ 7656 bool found_problem = FALSE; /* Assume OK until proven otherwise */ 7657 bool has_opening_bracket = FALSE; 7658 bool has_opening_colon = FALSE; 7659 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find 7660 valid class */ 7661 const char * possible_end = NULL; /* used for a 2nd parse pass */ 7662 const char* name_start; /* ptr to class name first char */ 7663 7664 /* If the number of single-character typos the input name is away from a 7665 * legal name is no more than this number, it is considered to have meant 7666 * the legal name */ 7667 int max_distance = 2; 7668 7669 /* to store the name. The size determines the maximum length before we 7670 * decide that no posix class was intended. Should be at least 7671 * sizeof("alphanumeric") */ 7672 UV input_text[15]; 7673 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric"); 7674 7675 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX; 7676 7677 CLEAR_POSIX_WARNINGS(); 7678 7679 if (p >= e) { 7680 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 7681 } 7682 7683 if (*(p - 1) != '[') { 7684 ADD_POSIX_WARNING(p, "it doesn't start with a '['"); 7685 found_problem = TRUE; 7686 } 7687 else { 7688 has_opening_bracket = TRUE; 7689 } 7690 7691 /* They could be confused and think you can put spaces between the 7692 * components */ 7693 if (isBLANK(*p)) { 7694 found_problem = TRUE; 7695 7696 do { 7697 p++; 7698 } while (p < e && isBLANK(*p)); 7699 7700 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7701 } 7702 7703 /* For [. .] and [= =]. These are quite different internally from [: :], 7704 * so they are handled separately. */ 7705 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']' 7706 and 1 for at least one char in it 7707 */ 7708 { 7709 const char open_char = *p; 7710 const char * temp_ptr = p + 1; 7711 7712 /* These two constructs are not handled by perl, and if we find a 7713 * syntactically valid one, we croak. khw, who wrote this code, finds 7714 * this explanation of them very unclear: 7715 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html 7716 * And searching the rest of the internet wasn't very helpful either. 7717 * It looks like just about any byte can be in these constructs, 7718 * depending on the locale. But unless the pattern is being compiled 7719 * under /l, which is very rare, Perl runs under the C or POSIX locale. 7720 * In that case, it looks like [= =] isn't allowed at all, and that 7721 * [. .] could be any single code point, but for longer strings the 7722 * constituent characters would have to be the ASCII alphabetics plus 7723 * the minus-hyphen. Any sensible locale definition would limit itself 7724 * to these. And any portable one definitely should. Trying to parse 7725 * the general case is a nightmare (see [perl #127604]). So, this code 7726 * looks only for interiors of these constructs that match: 7727 * qr/.|[-\w]{2,}/ 7728 * Using \w relaxes the apparent rules a little, without adding much 7729 * danger of mistaking something else for one of these constructs. 7730 * 7731 * [. .] in some implementations described on the internet is usable to 7732 * escape a character that otherwise is special in bracketed character 7733 * classes. For example [.].] means a literal right bracket instead of 7734 * the ending of the class 7735 * 7736 * [= =] can legitimately contain a [. .] construct, but we don't 7737 * handle this case, as that [. .] construct will later get parsed 7738 * itself and croak then. And [= =] is checked for even when not under 7739 * /l, as Perl has long done so. 7740 * 7741 * The code below relies on there being a trailing NUL, so it doesn't 7742 * have to keep checking if the parse ptr < e. 7743 */ 7744 if (temp_ptr[1] == open_char) { 7745 temp_ptr++; 7746 } 7747 else while ( temp_ptr < e 7748 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-')) 7749 { 7750 temp_ptr++; 7751 } 7752 7753 if (*temp_ptr == open_char) { 7754 temp_ptr++; 7755 if (*temp_ptr == ']') { 7756 temp_ptr++; 7757 if (! found_problem && ! check_only) { 7758 RExC_parse_set((char *) temp_ptr); 7759 vFAIL3("POSIX syntax [%c %c] is reserved for future " 7760 "extensions", open_char, open_char); 7761 } 7762 7763 /* Here, the syntax wasn't completely valid, or else the call 7764 * is to check-only */ 7765 if (updated_parse_ptr) { 7766 *updated_parse_ptr = (char *) temp_ptr; 7767 } 7768 7769 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS); 7770 } 7771 } 7772 7773 /* If we find something that started out to look like one of these 7774 * constructs, but isn't, we continue below so that it can be checked 7775 * for being a class name with a typo of '.' or '=' instead of a colon. 7776 * */ 7777 } 7778 7779 /* Here, we think there is a possibility that a [: :] class was meant, and 7780 * we have the first real character. It could be they think the '^' comes 7781 * first */ 7782 if (*p == '^') { 7783 found_problem = TRUE; 7784 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon"); 7785 complement = 1; 7786 p++; 7787 7788 if (isBLANK(*p)) { 7789 found_problem = TRUE; 7790 7791 do { 7792 p++; 7793 } while (p < e && isBLANK(*p)); 7794 7795 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7796 } 7797 } 7798 7799 /* But the first character should be a colon, which they could have easily 7800 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to 7801 * distinguish from a colon, so treat that as a colon). */ 7802 if (*p == ':') { 7803 p++; 7804 has_opening_colon = TRUE; 7805 } 7806 else if (*p == ';') { 7807 found_problem = TRUE; 7808 p++; 7809 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 7810 has_opening_colon = TRUE; 7811 } 7812 else { 7813 found_problem = TRUE; 7814 ADD_POSIX_WARNING(p, "there must be a starting ':'"); 7815 7816 /* Consider an initial punctuation (not one of the recognized ones) to 7817 * be a left terminator */ 7818 if (*p != '^' && *p != ']' && isPUNCT(*p)) { 7819 p++; 7820 } 7821 } 7822 7823 /* They may think that you can put spaces between the components */ 7824 if (isBLANK(*p)) { 7825 found_problem = TRUE; 7826 7827 do { 7828 p++; 7829 } while (p < e && isBLANK(*p)); 7830 7831 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7832 } 7833 7834 if (*p == '^') { 7835 7836 /* We consider something like [^:^alnum:]] to not have been intended to 7837 * be a posix class, but XXX maybe we should */ 7838 if (complement) { 7839 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 7840 } 7841 7842 complement = 1; 7843 p++; 7844 } 7845 7846 /* Again, they may think that you can put spaces between the components */ 7847 if (isBLANK(*p)) { 7848 found_problem = TRUE; 7849 7850 do { 7851 p++; 7852 } while (p < e && isBLANK(*p)); 7853 7854 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7855 } 7856 7857 if (*p == ']') { 7858 7859 /* XXX This ']' may be a typo, and something else was meant. But 7860 * treating it as such creates enough complications, that that 7861 * possibility isn't currently considered here. So we assume that the 7862 * ']' is what is intended, and if we've already found an initial '[', 7863 * this leaves this construct looking like [:] or [:^], which almost 7864 * certainly weren't intended to be posix classes */ 7865 if (has_opening_bracket) { 7866 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 7867 } 7868 7869 /* But this function can be called when we parse the colon for 7870 * something like qr/[alpha:]]/, so we back up to look for the 7871 * beginning */ 7872 p--; 7873 7874 if (*p == ';') { 7875 found_problem = TRUE; 7876 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 7877 } 7878 else if (*p != ':') { 7879 7880 /* XXX We are currently very restrictive here, so this code doesn't 7881 * consider the possibility that, say, /[alpha.]]/ was intended to 7882 * be a posix class. */ 7883 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 7884 } 7885 7886 /* Here we have something like 'foo:]'. There was no initial colon, 7887 * and we back up over 'foo. XXX Unlike the going forward case, we 7888 * don't handle typos of non-word chars in the middle */ 7889 has_opening_colon = FALSE; 7890 p--; 7891 7892 while (p > RExC_start && isWORDCHAR(*p)) { 7893 p--; 7894 } 7895 p++; 7896 7897 /* Here, we have positioned ourselves to where we think the first 7898 * character in the potential class is */ 7899 } 7900 7901 /* Now the interior really starts. There are certain key characters that 7902 * can end the interior, or these could just be typos. To catch both 7903 * cases, we may have to do two passes. In the first pass, we keep on 7904 * going unless we come to a sequence that matches 7905 * qr/ [[:punct:]] [[:blank:]]* \] /xa 7906 * This means it takes a sequence to end the pass, so two typos in a row if 7907 * that wasn't what was intended. If the class is perfectly formed, just 7908 * this one pass is needed. We also stop if there are too many characters 7909 * being accumulated, but this number is deliberately set higher than any 7910 * real class. It is set high enough so that someone who thinks that 7911 * 'alphanumeric' is a correct name would get warned that it wasn't. 7912 * While doing the pass, we keep track of where the key characters were in 7913 * it. If we don't find an end to the class, and one of the key characters 7914 * was found, we redo the pass, but stop when we get to that character. 7915 * Thus the key character was considered a typo in the first pass, but a 7916 * terminator in the second. If two key characters are found, we stop at 7917 * the second one in the first pass. Again this can miss two typos, but 7918 * catches a single one 7919 * 7920 * In the first pass, 'possible_end' starts as NULL, and then gets set to 7921 * point to the first key character. For the second pass, it starts as -1. 7922 * */ 7923 7924 name_start = p; 7925 parse_name: 7926 { 7927 bool has_blank = FALSE; 7928 bool has_upper = FALSE; 7929 bool has_terminating_colon = FALSE; 7930 bool has_terminating_bracket = FALSE; 7931 bool has_semi_colon = FALSE; 7932 unsigned int name_len = 0; 7933 int punct_count = 0; 7934 7935 while (p < e) { 7936 7937 /* Squeeze out blanks when looking up the class name below */ 7938 if (isBLANK(*p) ) { 7939 has_blank = TRUE; 7940 found_problem = TRUE; 7941 p++; 7942 continue; 7943 } 7944 7945 /* The name will end with a punctuation */ 7946 if (isPUNCT(*p)) { 7947 const char * peek = p + 1; 7948 7949 /* Treat any non-']' punctuation followed by a ']' (possibly 7950 * with intervening blanks) as trying to terminate the class. 7951 * ']]' is very likely to mean a class was intended (but 7952 * missing the colon), but the warning message that gets 7953 * generated shows the error position better if we exit the 7954 * loop at the bottom (eventually), so skip it here. */ 7955 if (*p != ']') { 7956 if (peek < e && isBLANK(*peek)) { 7957 has_blank = TRUE; 7958 found_problem = TRUE; 7959 do { 7960 peek++; 7961 } while (peek < e && isBLANK(*peek)); 7962 } 7963 7964 if (peek < e && *peek == ']') { 7965 has_terminating_bracket = TRUE; 7966 if (*p == ':') { 7967 has_terminating_colon = TRUE; 7968 } 7969 else if (*p == ';') { 7970 has_semi_colon = TRUE; 7971 has_terminating_colon = TRUE; 7972 } 7973 else { 7974 found_problem = TRUE; 7975 } 7976 p = peek + 1; 7977 goto try_posix; 7978 } 7979 } 7980 7981 /* Here we have punctuation we thought didn't end the class. 7982 * Keep track of the position of the key characters that are 7983 * more likely to have been class-enders */ 7984 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') { 7985 7986 /* Allow just one such possible class-ender not actually 7987 * ending the class. */ 7988 if (possible_end) { 7989 break; 7990 } 7991 possible_end = p; 7992 } 7993 7994 /* If we have too many punctuation characters, no use in 7995 * keeping going */ 7996 if (++punct_count > max_distance) { 7997 break; 7998 } 7999 8000 /* Treat the punctuation as a typo. */ 8001 input_text[name_len++] = *p; 8002 p++; 8003 } 8004 else if (isUPPER(*p)) { /* Use lowercase for lookup */ 8005 input_text[name_len++] = toLOWER(*p); 8006 has_upper = TRUE; 8007 found_problem = TRUE; 8008 p++; 8009 } else if (! UTF || UTF8_IS_INVARIANT(*p)) { 8010 input_text[name_len++] = *p; 8011 p++; 8012 } 8013 else { 8014 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL); 8015 p+= UTF8SKIP(p); 8016 } 8017 8018 /* The declaration of 'input_text' is how long we allow a potential 8019 * class name to be, before saying they didn't mean a class name at 8020 * all */ 8021 if (name_len >= C_ARRAY_LENGTH(input_text)) { 8022 break; 8023 } 8024 } 8025 8026 /* We get to here when the possible class name hasn't been properly 8027 * terminated before: 8028 * 1) we ran off the end of the pattern; or 8029 * 2) found two characters, each of which might have been intended to 8030 * be the name's terminator 8031 * 3) found so many punctuation characters in the purported name, 8032 * that the edit distance to a valid one is exceeded 8033 * 4) we decided it was more characters than anyone could have 8034 * intended to be one. */ 8035 8036 found_problem = TRUE; 8037 8038 /* In the final two cases, we know that looking up what we've 8039 * accumulated won't lead to a match, even a fuzzy one. */ 8040 if ( name_len >= C_ARRAY_LENGTH(input_text) 8041 || punct_count > max_distance) 8042 { 8043 /* If there was an intermediate key character that could have been 8044 * an intended end, redo the parse, but stop there */ 8045 if (possible_end && possible_end != (char *) -1) { 8046 possible_end = (char *) -1; /* Special signal value to say 8047 we've done a first pass */ 8048 p = name_start; 8049 goto parse_name; 8050 } 8051 8052 /* Otherwise, it can't have meant to have been a class */ 8053 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 8054 } 8055 8056 /* If we ran off the end, and the final character was a punctuation 8057 * one, back up one, to look at that final one just below. Later, we 8058 * will restore the parse pointer if appropriate */ 8059 if (name_len && p == e && isPUNCT(*(p-1))) { 8060 p--; 8061 name_len--; 8062 } 8063 8064 if (p < e && isPUNCT(*p)) { 8065 if (*p == ']') { 8066 has_terminating_bracket = TRUE; 8067 8068 /* If this is a 2nd ']', and the first one is just below this 8069 * one, consider that to be the real terminator. This gives a 8070 * uniform and better positioning for the warning message */ 8071 if ( possible_end 8072 && possible_end != (char *) -1 8073 && *possible_end == ']' 8074 && name_len && input_text[name_len - 1] == ']') 8075 { 8076 name_len--; 8077 p = possible_end; 8078 8079 /* And this is actually equivalent to having done the 2nd 8080 * pass now, so set it to not try again */ 8081 possible_end = (char *) -1; 8082 } 8083 } 8084 else { 8085 if (*p == ':') { 8086 has_terminating_colon = TRUE; 8087 } 8088 else if (*p == ';') { 8089 has_semi_colon = TRUE; 8090 has_terminating_colon = TRUE; 8091 } 8092 p++; 8093 } 8094 } 8095 8096 try_posix: 8097 8098 /* Here, we have a class name to look up. We can short circuit the 8099 * stuff below for short names that can't possibly be meant to be a 8100 * class name. (We can do this on the first pass, as any second pass 8101 * will yield an even shorter name) */ 8102 if (name_len < 3) { 8103 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 8104 } 8105 8106 /* Find which class it is. Initially switch on the length of the name. 8107 * */ 8108 switch (name_len) { 8109 case 4: 8110 if (memEQs(name_start, 4, "word")) { 8111 /* this is not POSIX, this is the Perl \w */ 8112 class_number = ANYOF_WORDCHAR; 8113 } 8114 break; 8115 case 5: 8116 /* Names all of length 5: alnum alpha ascii blank cntrl digit 8117 * graph lower print punct space upper 8118 * Offset 4 gives the best switch position. */ 8119 switch (name_start[4]) { 8120 case 'a': 8121 if (memBEGINs(name_start, 5, "alph")) /* alpha */ 8122 class_number = ANYOF_ALPHA; 8123 break; 8124 case 'e': 8125 if (memBEGINs(name_start, 5, "spac")) /* space */ 8126 class_number = ANYOF_SPACE; 8127 break; 8128 case 'h': 8129 if (memBEGINs(name_start, 5, "grap")) /* graph */ 8130 class_number = ANYOF_GRAPH; 8131 break; 8132 case 'i': 8133 if (memBEGINs(name_start, 5, "asci")) /* ascii */ 8134 class_number = ANYOF_ASCII; 8135 break; 8136 case 'k': 8137 if (memBEGINs(name_start, 5, "blan")) /* blank */ 8138 class_number = ANYOF_BLANK; 8139 break; 8140 case 'l': 8141 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */ 8142 class_number = ANYOF_CNTRL; 8143 break; 8144 case 'm': 8145 if (memBEGINs(name_start, 5, "alnu")) /* alnum */ 8146 class_number = ANYOF_ALPHANUMERIC; 8147 break; 8148 case 'r': 8149 if (memBEGINs(name_start, 5, "lowe")) /* lower */ 8150 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; 8151 else if (memBEGINs(name_start, 5, "uppe")) /* upper */ 8152 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; 8153 break; 8154 case 't': 8155 if (memBEGINs(name_start, 5, "digi")) /* digit */ 8156 class_number = ANYOF_DIGIT; 8157 else if (memBEGINs(name_start, 5, "prin")) /* print */ 8158 class_number = ANYOF_PRINT; 8159 else if (memBEGINs(name_start, 5, "punc")) /* punct */ 8160 class_number = ANYOF_PUNCT; 8161 break; 8162 } 8163 break; 8164 case 6: 8165 if (memEQs(name_start, 6, "xdigit")) 8166 class_number = ANYOF_XDIGIT; 8167 break; 8168 } 8169 8170 /* If the name exactly matches a posix class name the class number will 8171 * here be set to it, and the input almost certainly was meant to be a 8172 * posix class, so we can skip further checking. If instead the syntax 8173 * is exactly correct, but the name isn't one of the legal ones, we 8174 * will return that as an error below. But if neither of these apply, 8175 * it could be that no posix class was intended at all, or that one 8176 * was, but there was a typo. We tease these apart by doing fuzzy 8177 * matching on the name */ 8178 if (class_number == OOB_NAMEDCLASS && found_problem) { 8179 const UV posix_names[][6] = { 8180 { 'a', 'l', 'n', 'u', 'm' }, 8181 { 'a', 'l', 'p', 'h', 'a' }, 8182 { 'a', 's', 'c', 'i', 'i' }, 8183 { 'b', 'l', 'a', 'n', 'k' }, 8184 { 'c', 'n', 't', 'r', 'l' }, 8185 { 'd', 'i', 'g', 'i', 't' }, 8186 { 'g', 'r', 'a', 'p', 'h' }, 8187 { 'l', 'o', 'w', 'e', 'r' }, 8188 { 'p', 'r', 'i', 'n', 't' }, 8189 { 'p', 'u', 'n', 'c', 't' }, 8190 { 's', 'p', 'a', 'c', 'e' }, 8191 { 'u', 'p', 'p', 'e', 'r' }, 8192 { 'w', 'o', 'r', 'd' }, 8193 { 'x', 'd', 'i', 'g', 'i', 't' } 8194 }; 8195 /* The names of the above all have added NULs to make them the same 8196 * size, so we need to also have the real lengths */ 8197 const UV posix_name_lengths[] = { 8198 sizeof("alnum") - 1, 8199 sizeof("alpha") - 1, 8200 sizeof("ascii") - 1, 8201 sizeof("blank") - 1, 8202 sizeof("cntrl") - 1, 8203 sizeof("digit") - 1, 8204 sizeof("graph") - 1, 8205 sizeof("lower") - 1, 8206 sizeof("print") - 1, 8207 sizeof("punct") - 1, 8208 sizeof("space") - 1, 8209 sizeof("upper") - 1, 8210 sizeof("word") - 1, 8211 sizeof("xdigit")- 1 8212 }; 8213 unsigned int i; 8214 int temp_max = max_distance; /* Use a temporary, so if we 8215 reparse, we haven't changed the 8216 outer one */ 8217 8218 /* Use a smaller max edit distance if we are missing one of the 8219 * delimiters */ 8220 if ( has_opening_bracket + has_opening_colon < 2 8221 || has_terminating_bracket + has_terminating_colon < 2) 8222 { 8223 temp_max--; 8224 } 8225 8226 /* See if the input name is close to a legal one */ 8227 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) { 8228 8229 /* Short circuit call if the lengths are too far apart to be 8230 * able to match */ 8231 if (abs( (int) (name_len - posix_name_lengths[i])) 8232 > temp_max) 8233 { 8234 continue; 8235 } 8236 8237 if (edit_distance(input_text, 8238 posix_names[i], 8239 name_len, 8240 posix_name_lengths[i], 8241 temp_max 8242 ) 8243 > -1) 8244 { /* If it is close, it probably was intended to be a class */ 8245 goto probably_meant_to_be; 8246 } 8247 } 8248 8249 /* Here the input name is not close enough to a valid class name 8250 * for us to consider it to be intended to be a posix class. If 8251 * we haven't already done so, and the parse found a character that 8252 * could have been terminators for the name, but which we absorbed 8253 * as typos during the first pass, repeat the parse, signalling it 8254 * to stop at that character */ 8255 if (possible_end && possible_end != (char *) -1) { 8256 possible_end = (char *) -1; 8257 p = name_start; 8258 goto parse_name; 8259 } 8260 8261 /* Here neither pass found a close-enough class name */ 8262 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 8263 } 8264 8265 probably_meant_to_be: 8266 8267 /* Here we think that a posix specification was intended. Update any 8268 * parse pointer */ 8269 if (updated_parse_ptr) { 8270 *updated_parse_ptr = (char *) p; 8271 } 8272 8273 /* If a posix class name was intended but incorrectly specified, we 8274 * output or return the warnings */ 8275 if (found_problem) { 8276 8277 /* We set flags for these issues in the parse loop above instead of 8278 * adding them to the list of warnings, because we can parse it 8279 * twice, and we only want one warning instance */ 8280 if (has_upper) { 8281 ADD_POSIX_WARNING(p, "the name must be all lowercase letters"); 8282 } 8283 if (has_blank) { 8284 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 8285 } 8286 if (has_semi_colon) { 8287 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 8288 } 8289 else if (! has_terminating_colon) { 8290 ADD_POSIX_WARNING(p, "there is no terminating ':'"); 8291 } 8292 if (! has_terminating_bracket) { 8293 ADD_POSIX_WARNING(p, "there is no terminating ']'"); 8294 } 8295 8296 if ( posix_warnings 8297 && RExC_warn_text 8298 && av_count(RExC_warn_text) > 0) 8299 { 8300 *posix_warnings = RExC_warn_text; 8301 } 8302 } 8303 else if (class_number != OOB_NAMEDCLASS) { 8304 /* If it is a known class, return the class. The class number 8305 * #defines are structured so each complement is +1 to the normal 8306 * one */ 8307 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement); 8308 } 8309 else if (! check_only) { 8310 8311 /* Here, it is an unrecognized class. This is an error (unless the 8312 * call is to check only, which we've already handled above) */ 8313 const char * const complement_string = (complement) 8314 ? "^" 8315 : ""; 8316 RExC_parse_set((char *) p); 8317 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown", 8318 complement_string, 8319 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start)); 8320 } 8321 } 8322 8323 return OOB_NAMEDCLASS; 8324} 8325#undef ADD_POSIX_WARNING 8326 8327STATIC unsigned int 8328S_regex_set_precedence(const U8 my_operator) { 8329 8330 /* Returns the precedence in the (?[...]) construct of the input operator, 8331 * specified by its character representation. The precedence follows 8332 * general Perl rules, but it extends this so that ')' and ']' have (low) 8333 * precedence even though they aren't really operators */ 8334 8335 switch (my_operator) { 8336 case '!': 8337 return 5; 8338 case '&': 8339 return 4; 8340 case '^': 8341 case '|': 8342 case '+': 8343 case '-': 8344 return 3; 8345 case ')': 8346 return 2; 8347 case ']': 8348 return 1; 8349 } 8350 8351 NOT_REACHED; /* NOTREACHED */ 8352 return 0; /* Silence compiler warning */ 8353} 8354 8355STATIC regnode_offset 8356S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, 8357 I32 *flagp, U32 depth) 8358{ 8359 /* Handle the (?[...]) construct to do set operations */ 8360 8361 U8 curchar; /* Current character being parsed */ 8362 UV start, end; /* End points of code point ranges */ 8363 SV* final = NULL; /* The end result inversion list */ 8364 SV* result_string; /* 'final' stringified */ 8365 AV* stack; /* stack of operators and operands not yet 8366 resolved */ 8367 AV* fence_stack = NULL; /* A stack containing the positions in 8368 'stack' of where the undealt-with left 8369 parens would be if they were actually 8370 put there */ 8371 /* The 'volatile' is a workaround for an optimiser bug 8372 * in Solaris Studio 12.3. See RT #127455 */ 8373 volatile IV fence = 0; /* Position of where most recent undealt- 8374 with left paren in stack is; -1 if none. 8375 */ 8376 STRLEN len; /* Temporary */ 8377 regnode_offset node; /* Temporary, and final regnode returned by 8378 this function */ 8379 const bool save_fold = FOLD; /* Temporary */ 8380 char *save_end, *save_parse; /* Temporaries */ 8381 const bool in_locale = LOC; /* we turn off /l during processing */ 8382 8383 DECLARE_AND_GET_RE_DEBUG_FLAGS; 8384 8385 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; 8386 8387 DEBUG_PARSE("xcls"); 8388 8389 if (in_locale) { 8390 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 8391 } 8392 8393 /* The use of this operator implies /u. This is required so that the 8394 * compile time values are valid in all runtime cases */ 8395 REQUIRE_UNI_RULES(flagp, 0); 8396 8397 /* Everything in this construct is a metacharacter. Operands begin with 8398 * either a '\' (for an escape sequence), or a '[' for a bracketed 8399 * character class. Any other character should be an operator, or 8400 * parenthesis for grouping. Both types of operands are handled by calling 8401 * regclass() to parse them. It is called with a parameter to indicate to 8402 * return the computed inversion list. The parsing here is implemented via 8403 * a stack. Each entry on the stack is a single character representing one 8404 * of the operators; or else a pointer to an operand inversion list. */ 8405 8406#define IS_OPERATOR(a) SvIOK(a) 8407#define IS_OPERAND(a) (! IS_OPERATOR(a)) 8408 8409 /* The stack is kept in ��ukasiewicz order. (That's pronounced similar 8410 * to luke-a-shave-itch (or -itz), but people who didn't want to bother 8411 * with pronouncing it called it Reverse Polish instead, but now that YOU 8412 * know how to pronounce it you can use the correct term, thus giving due 8413 * credit to the person who invented it, and impressing your geek friends. 8414 * Wikipedia says that the pronunciation of "��" has been changing so that 8415 * it is now more like an English initial W (as in wonk) than an L.) 8416 * 8417 * This means that, for example, 'a | b & c' is stored on the stack as 8418 * 8419 * c [4] 8420 * b [3] 8421 * & [2] 8422 * a [1] 8423 * | [0] 8424 * 8425 * where the numbers in brackets give the stack [array] element number. 8426 * In this implementation, parentheses are not stored on the stack. 8427 * Instead a '(' creates a "fence" so that the part of the stack below the 8428 * fence is invisible except to the corresponding ')' (this allows us to 8429 * replace testing for parens, by using instead subtraction of the fence 8430 * position). As new operands are processed they are pushed onto the stack 8431 * (except as noted in the next paragraph). New operators of higher 8432 * precedence than the current final one are inserted on the stack before 8433 * the lhs operand (so that when the rhs is pushed next, everything will be 8434 * in the correct positions shown above. When an operator of equal or 8435 * lower precedence is encountered in parsing, all the stacked operations 8436 * of equal or higher precedence are evaluated, leaving the result as the 8437 * top entry on the stack. This makes higher precedence operations 8438 * evaluate before lower precedence ones, and causes operations of equal 8439 * precedence to left associate. 8440 * 8441 * The only unary operator '!' is immediately pushed onto the stack when 8442 * encountered. When an operand is encountered, if the top of the stack is 8443 * a '!", the complement is immediately performed, and the '!' popped. The 8444 * resulting value is treated as a new operand, and the logic in the 8445 * previous paragraph is executed. Thus in the expression 8446 * [a] + ! [b] 8447 * the stack looks like 8448 * 8449 * ! 8450 * a 8451 * + 8452 * 8453 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack 8454 * becomes 8455 * 8456 * !b 8457 * a 8458 * + 8459 * 8460 * A ')' is treated as an operator with lower precedence than all the 8461 * aforementioned ones, which causes all operations on the stack above the 8462 * corresponding '(' to be evaluated down to a single resultant operand. 8463 * Then the fence for the '(' is removed, and the operand goes through the 8464 * algorithm above, without the fence. 8465 * 8466 * A separate stack is kept of the fence positions, so that the position of 8467 * the latest so-far unbalanced '(' is at the top of it. 8468 * 8469 * The ']' ending the construct is treated as the lowest operator of all, 8470 * so that everything gets evaluated down to a single operand, which is the 8471 * result */ 8472 8473 stack = (AV*)newSV_type_mortal(SVt_PVAV); 8474 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV); 8475 8476 while (RExC_parse < RExC_end) { 8477 I32 top_index; /* Index of top-most element in 'stack' */ 8478 SV** top_ptr; /* Pointer to top 'stack' element */ 8479 SV* current = NULL; /* To contain the current inversion list 8480 operand */ 8481 SV* only_to_avoid_leaks; 8482 8483 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 8484 TRUE /* Force /x */ ); 8485 if (RExC_parse >= RExC_end) { /* Fail */ 8486 break; 8487 } 8488 8489 curchar = UCHARAT(RExC_parse); 8490 8491redo_curchar: 8492 8493#ifdef ENABLE_REGEX_SETS_DEBUGGING 8494 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */ 8495 DEBUG_U(dump_regex_sets_structures(pRExC_state, 8496 stack, fence, fence_stack)); 8497#endif 8498 8499 top_index = av_tindex_skip_len_mg(stack); 8500 8501 switch (curchar) { 8502 SV** stacked_ptr; /* Ptr to something already on 'stack' */ 8503 char stacked_operator; /* The topmost operator on the 'stack'. */ 8504 SV* lhs; /* Operand to the left of the operator */ 8505 SV* rhs; /* Operand to the right of the operator */ 8506 SV* fence_ptr; /* Pointer to top element of the fence 8507 stack */ 8508 case '(': 8509 8510 if ( RExC_parse < RExC_end - 2 8511 && UCHARAT(RExC_parse + 1) == '?' 8512 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2))) 8513 { 8514 const regnode_offset orig_emit = RExC_emit; 8515 SV * resultant_invlist; 8516 8517 /* Here it could be an embedded '(?flags:(?[...])'. 8518 * This happens when we have some thing like 8519 * 8520 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; 8521 * ... 8522 * qr/(?[ \p{Digit} & $thai_or_lao ])/; 8523 * 8524 * Here we would be handling the interpolated 8525 * '$thai_or_lao'. We handle this by a recursive call to 8526 * reg which returns the inversion list the 8527 * interpolated expression evaluates to. Actually, the 8528 * return is a special regnode containing a pointer to that 8529 * inversion list. If the return isn't that regnode alone, 8530 * we know that this wasn't such an interpolation, which is 8531 * an error: we need to get a single inversion list back 8532 * from the recursion */ 8533 8534 RExC_parse_inc_by(1); 8535 RExC_sets_depth++; 8536 8537 node = reg(pRExC_state, 2, flagp, depth+1); 8538 RETURN_FAIL_ON_RESTART(*flagp, flagp); 8539 8540 if ( OP(REGNODE_p(node)) != REGEX_SET 8541 /* If more than a single node returned, the nested 8542 * parens evaluated to more than just a (?[...]), 8543 * which isn't legal */ 8544 || RExC_emit != orig_emit 8545 + NODE_STEP_REGNODE 8546 + REGNODE_ARG_LEN(REGEX_SET)) 8547 { 8548 vFAIL("Expecting interpolated extended charclass"); 8549 } 8550 resultant_invlist = (SV *) ARGp(REGNODE_p(node)); 8551 current = invlist_clone(resultant_invlist, NULL); 8552 SvREFCNT_dec(resultant_invlist); 8553 8554 RExC_sets_depth--; 8555 RExC_emit = orig_emit; 8556 goto handle_operand; 8557 } 8558 8559 /* A regular '('. Look behind for illegal syntax */ 8560 if (top_index - fence >= 0) { 8561 /* If the top entry on the stack is an operator, it had 8562 * better be a '!', otherwise the entry below the top 8563 * operand should be an operator */ 8564 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) 8565 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') 8566 || ( IS_OPERAND(*top_ptr) 8567 && ( top_index - fence < 1 8568 || ! (stacked_ptr = av_fetch(stack, 8569 top_index - 1, 8570 FALSE)) 8571 || ! IS_OPERATOR(*stacked_ptr)))) 8572 { 8573 RExC_parse_inc_by(1); 8574 vFAIL("Unexpected '(' with no preceding operator"); 8575 } 8576 } 8577 8578 /* Stack the position of this undealt-with left paren */ 8579 av_push_simple(fence_stack, newSViv(fence)); 8580 fence = top_index + 1; 8581 break; 8582 8583 case '\\': 8584 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 8585 * multi-char folds are allowed. */ 8586 if (!regclass(pRExC_state, flagp, depth+1, 8587 TRUE, /* means parse just the next thing */ 8588 FALSE, /* don't allow multi-char folds */ 8589 FALSE, /* don't silence non-portable warnings. */ 8590 TRUE, /* strict */ 8591 FALSE, /* Require return to be an ANYOF */ 8592 ¤t)) 8593 { 8594 RETURN_FAIL_ON_RESTART(*flagp, flagp); 8595 goto regclass_failed; 8596 } 8597 8598 assert(current); 8599 8600 /* regclass() will return with parsing just the \ sequence, 8601 * leaving the parse pointer at the next thing to parse */ 8602 RExC_parse--; 8603 goto handle_operand; 8604 8605 case '[': /* Is a bracketed character class */ 8606 { 8607 /* See if this is a [:posix:] class. */ 8608 bool is_posix_class = (OOB_NAMEDCLASS 8609 < handle_possible_posix(pRExC_state, 8610 RExC_parse + 1, 8611 NULL, 8612 NULL, 8613 TRUE /* checking only */)); 8614 /* If it is a posix class, leave the parse pointer at the '[' 8615 * to fool regclass() into thinking it is part of a 8616 * '[[:posix:]]'. */ 8617 if (! is_posix_class) { 8618 RExC_parse_inc_by(1); 8619 } 8620 8621 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 8622 * multi-char folds are allowed. */ 8623 if (!regclass(pRExC_state, flagp, depth+1, 8624 is_posix_class, /* parse the whole char 8625 class only if not a 8626 posix class */ 8627 FALSE, /* don't allow multi-char folds */ 8628 TRUE, /* silence non-portable warnings. */ 8629 TRUE, /* strict */ 8630 FALSE, /* Require return to be an ANYOF */ 8631 ¤t)) 8632 { 8633 RETURN_FAIL_ON_RESTART(*flagp, flagp); 8634 goto regclass_failed; 8635 } 8636 8637 assert(current); 8638 8639 /* function call leaves parse pointing to the ']', except if we 8640 * faked it */ 8641 if (is_posix_class) { 8642 RExC_parse--; 8643 } 8644 8645 goto handle_operand; 8646 } 8647 8648 case ']': 8649 if (top_index >= 1) { 8650 goto join_operators; 8651 } 8652 8653 /* Only a single operand on the stack: are done */ 8654 goto done; 8655 8656 case ')': 8657 if (av_tindex_skip_len_mg(fence_stack) < 0) { 8658 if (UCHARAT(RExC_parse - 1) == ']') { 8659 break; 8660 } 8661 RExC_parse_inc_by(1); 8662 vFAIL("Unexpected ')'"); 8663 } 8664 8665 /* If nothing after the fence, is missing an operand */ 8666 if (top_index - fence < 0) { 8667 RExC_parse_inc_by(1); 8668 goto bad_syntax; 8669 } 8670 /* If at least two things on the stack, treat this as an 8671 * operator */ 8672 if (top_index - fence >= 1) { 8673 goto join_operators; 8674 } 8675 8676 /* Here only a single thing on the fenced stack, and there is a 8677 * fence. Get rid of it */ 8678 fence_ptr = av_pop(fence_stack); 8679 assert(fence_ptr); 8680 fence = SvIV(fence_ptr); 8681 SvREFCNT_dec_NN(fence_ptr); 8682 fence_ptr = NULL; 8683 8684 if (fence < 0) { 8685 fence = 0; 8686 } 8687 8688 /* Having gotten rid of the fence, we pop the operand at the 8689 * stack top and process it as a newly encountered operand */ 8690 current = av_pop(stack); 8691 if (IS_OPERAND(current)) { 8692 goto handle_operand; 8693 } 8694 8695 RExC_parse_inc_by(1); 8696 goto bad_syntax; 8697 8698 case '&': 8699 case '|': 8700 case '+': 8701 case '-': 8702 case '^': 8703 8704 /* These binary operators should have a left operand already 8705 * parsed */ 8706 if ( top_index - fence < 0 8707 || top_index - fence == 1 8708 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) 8709 || ! IS_OPERAND(*top_ptr)) 8710 { 8711 goto unexpected_binary; 8712 } 8713 8714 /* If only the one operand is on the part of the stack visible 8715 * to us, we just place this operator in the proper position */ 8716 if (top_index - fence < 2) { 8717 8718 /* Place the operator before the operand */ 8719 8720 SV* lhs = av_pop(stack); 8721 av_push_simple(stack, newSVuv(curchar)); 8722 av_push_simple(stack, lhs); 8723 break; 8724 } 8725 8726 /* But if there is something else on the stack, we need to 8727 * process it before this new operator if and only if the 8728 * stacked operation has equal or higher precedence than the 8729 * new one */ 8730 8731 join_operators: 8732 8733 /* The operator on the stack is supposed to be below both its 8734 * operands */ 8735 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE)) 8736 || IS_OPERAND(*stacked_ptr)) 8737 { 8738 /* But if not, it's legal and indicates we are completely 8739 * done if and only if we're currently processing a ']', 8740 * which should be the final thing in the expression */ 8741 if (curchar == ']') { 8742 goto done; 8743 } 8744 8745 unexpected_binary: 8746 RExC_parse_inc_by(1); 8747 vFAIL2("Unexpected binary operator '%c' with no " 8748 "preceding operand", curchar); 8749 } 8750 stacked_operator = (char) SvUV(*stacked_ptr); 8751 8752 if (regex_set_precedence(curchar) 8753 > regex_set_precedence(stacked_operator)) 8754 { 8755 /* Here, the new operator has higher precedence than the 8756 * stacked one. This means we need to add the new one to 8757 * the stack to await its rhs operand (and maybe more 8758 * stuff). We put it before the lhs operand, leaving 8759 * untouched the stacked operator and everything below it 8760 * */ 8761 lhs = av_pop(stack); 8762 assert(IS_OPERAND(lhs)); 8763 av_push_simple(stack, newSVuv(curchar)); 8764 av_push_simple(stack, lhs); 8765 break; 8766 } 8767 8768 /* Here, the new operator has equal or lower precedence than 8769 * what's already there. This means the operation already 8770 * there should be performed now, before the new one. */ 8771 8772 rhs = av_pop(stack); 8773 if (! IS_OPERAND(rhs)) { 8774 8775 /* This can happen when a ! is not followed by an operand, 8776 * like in /(?[\t &!])/ */ 8777 goto bad_syntax; 8778 } 8779 8780 lhs = av_pop(stack); 8781 8782 if (! IS_OPERAND(lhs)) { 8783 8784 /* This can happen when there is an empty (), like in 8785 * /(?[[0]+()+])/ */ 8786 goto bad_syntax; 8787 } 8788 8789 switch (stacked_operator) { 8790 case '&': 8791 _invlist_intersection(lhs, rhs, &rhs); 8792 break; 8793 8794 case '|': 8795 case '+': 8796 _invlist_union(lhs, rhs, &rhs); 8797 break; 8798 8799 case '-': 8800 _invlist_subtract(lhs, rhs, &rhs); 8801 break; 8802 8803 case '^': /* The union minus the intersection */ 8804 { 8805 SV* i = NULL; 8806 SV* u = NULL; 8807 8808 _invlist_union(lhs, rhs, &u); 8809 _invlist_intersection(lhs, rhs, &i); 8810 _invlist_subtract(u, i, &rhs); 8811 SvREFCNT_dec_NN(i); 8812 SvREFCNT_dec_NN(u); 8813 break; 8814 } 8815 } 8816 SvREFCNT_dec(lhs); 8817 8818 /* Here, the higher precedence operation has been done, and the 8819 * result is in 'rhs'. We overwrite the stacked operator with 8820 * the result. Then we redo this code to either push the new 8821 * operator onto the stack or perform any higher precedence 8822 * stacked operation */ 8823 only_to_avoid_leaks = av_pop(stack); 8824 SvREFCNT_dec(only_to_avoid_leaks); 8825 av_push_simple(stack, rhs); 8826 goto redo_curchar; 8827 8828 case '!': /* Highest priority, right associative */ 8829 8830 /* If what's already at the top of the stack is another '!", 8831 * they just cancel each other out */ 8832 if ( (top_ptr = av_fetch(stack, top_index, FALSE)) 8833 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!')) 8834 { 8835 only_to_avoid_leaks = av_pop(stack); 8836 SvREFCNT_dec(only_to_avoid_leaks); 8837 } 8838 else { /* Otherwise, since it's right associative, just push 8839 onto the stack */ 8840 av_push_simple(stack, newSVuv(curchar)); 8841 } 8842 break; 8843 8844 default: 8845 RExC_parse_inc(); 8846 if (RExC_parse >= RExC_end) { 8847 break; 8848 } 8849 vFAIL("Unexpected character"); 8850 8851 handle_operand: 8852 8853 /* Here 'current' is the operand. If something is already on the 8854 * stack, we have to check if it is a !. But first, the code above 8855 * may have altered the stack in the time since we earlier set 8856 * 'top_index'. */ 8857 8858 top_index = av_tindex_skip_len_mg(stack); 8859 if (top_index - fence >= 0) { 8860 /* If the top entry on the stack is an operator, it had better 8861 * be a '!', otherwise the entry below the top operand should 8862 * be an operator */ 8863 top_ptr = av_fetch(stack, top_index, FALSE); 8864 assert(top_ptr); 8865 if (IS_OPERATOR(*top_ptr)) { 8866 8867 /* The only permissible operator at the top of the stack is 8868 * '!', which is applied immediately to this operand. */ 8869 curchar = (char) SvUV(*top_ptr); 8870 if (curchar != '!') { 8871 SvREFCNT_dec(current); 8872 vFAIL2("Unexpected binary operator '%c' with no " 8873 "preceding operand", curchar); 8874 } 8875 8876 _invlist_invert(current); 8877 8878 only_to_avoid_leaks = av_pop(stack); 8879 SvREFCNT_dec(only_to_avoid_leaks); 8880 8881 /* And we redo with the inverted operand. This allows 8882 * handling multiple ! in a row */ 8883 goto handle_operand; 8884 } 8885 /* Single operand is ok only for the non-binary ')' 8886 * operator */ 8887 else if ((top_index - fence == 0 && curchar != ')') 8888 || (top_index - fence > 0 8889 && (! (stacked_ptr = av_fetch(stack, 8890 top_index - 1, 8891 FALSE)) 8892 || IS_OPERAND(*stacked_ptr)))) 8893 { 8894 SvREFCNT_dec(current); 8895 vFAIL("Operand with no preceding operator"); 8896 } 8897 } 8898 8899 /* Here there was nothing on the stack or the top element was 8900 * another operand. Just add this new one */ 8901 av_push_simple(stack, current); 8902 8903 } /* End of switch on next parse token */ 8904 8905 RExC_parse_inc(); 8906 } /* End of loop parsing through the construct */ 8907 8908 vFAIL("Syntax error in (?[...])"); 8909 8910 done: 8911 8912 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') { 8913 if (RExC_parse < RExC_end) { 8914 RExC_parse_inc_by(1); 8915 } 8916 8917 vFAIL("Unexpected ']' with no following ')' in (?[..."); 8918 } 8919 8920 if (av_tindex_skip_len_mg(fence_stack) >= 0) { 8921 vFAIL("Unmatched ("); 8922 } 8923 8924 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */ 8925 || ((final = av_pop(stack)) == NULL) 8926 || ! IS_OPERAND(final) 8927 || ! is_invlist(final) 8928 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */ 8929 { 8930 bad_syntax: 8931 SvREFCNT_dec(final); 8932 vFAIL("Incomplete expression within '(?[ ])'"); 8933 } 8934 8935 /* Here, 'final' is the resultant inversion list from evaluating the 8936 * expression. Return it if so requested */ 8937 if (return_invlist) { 8938 *return_invlist = final; 8939 return END; 8940 } 8941 8942 if (RExC_sets_depth) { /* If within a recursive call, return in a special 8943 regnode */ 8944 RExC_parse_inc_by(1); 8945 node = regpnode(pRExC_state, REGEX_SET, final); 8946 } 8947 else { 8948 8949 /* Otherwise generate a resultant node, based on 'final'. regclass() 8950 * is expecting a string of ranges and individual code points */ 8951 invlist_iterinit(final); 8952 result_string = newSVpvs(""); 8953 while (invlist_iternext(final, &start, &end)) { 8954 if (start == end) { 8955 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start); 8956 } 8957 else { 8958 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" 8959 UVXf "}", start, end); 8960 } 8961 } 8962 8963 /* About to generate an ANYOF (or similar) node from the inversion list 8964 * we have calculated */ 8965 save_parse = RExC_parse; 8966 RExC_parse_set(SvPV(result_string, len)); 8967 save_end = RExC_end; 8968 RExC_end = RExC_parse + len; 8969 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE; 8970 8971 /* We turn off folding around the call, as the class we have 8972 * constructed already has all folding taken into consideration, and we 8973 * don't want regclass() to add to that */ 8974 RExC_flags &= ~RXf_PMf_FOLD; 8975 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char 8976 * folds are allowed. */ 8977 node = regclass(pRExC_state, flagp, depth+1, 8978 FALSE, /* means parse the whole char class */ 8979 FALSE, /* don't allow multi-char folds */ 8980 TRUE, /* silence non-portable warnings. The above may 8981 very well have generated non-portable code 8982 points, but they're valid on this machine */ 8983 FALSE, /* similarly, no need for strict */ 8984 8985 /* We can optimize into something besides an ANYOF, 8986 * except under /l, which needs to be ANYOF because of 8987 * runtime checks for locale sanity, etc */ 8988 ! in_locale, 8989 NULL 8990 ); 8991 8992 RESTORE_WARNINGS; 8993 RExC_parse_set(save_parse + 1); 8994 RExC_end = save_end; 8995 SvREFCNT_dec_NN(final); 8996 SvREFCNT_dec_NN(result_string); 8997 8998 if (save_fold) { 8999 RExC_flags |= RXf_PMf_FOLD; 9000 } 9001 9002 if (!node) { 9003 RETURN_FAIL_ON_RESTART(*flagp, flagp); 9004 goto regclass_failed; 9005 } 9006 9007 /* Fix up the node type if we are in locale. (We have pretended we are 9008 * under /u for the purposes of regclass(), as this construct will only 9009 * work under UTF-8 locales. But now we change the opcode to be ANYOFL 9010 * (so as to cause any warnings about bad locales to be output in 9011 * regexec.c), and add the flag that indicates to check if not in a 9012 * UTF-8 locale. The reason we above forbid optimization into 9013 * something other than an ANYOF node is simply to minimize the number 9014 * of code changes in regexec.c. Otherwise we would have to create new 9015 * EXACTish node types and deal with them. This decision could be 9016 * revisited should this construct become popular. 9017 * 9018 * (One might think we could look at the resulting ANYOF node and 9019 * suppress the flag if everything is above 255, as those would be 9020 * UTF-8 only, but this isn't true, as the components that led to that 9021 * result could have been locale-affected, and just happen to cancel 9022 * each other out under UTF-8 locales.) */ 9023 if (in_locale) { 9024 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); 9025 9026 assert(OP(REGNODE_p(node)) == ANYOF); 9027 9028 OP(REGNODE_p(node)) = ANYOFL; 9029 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD; 9030 } 9031 } 9032 9033 nextchar(pRExC_state); 9034 return node; 9035 9036 regclass_failed: 9037 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf, 9038 (UV) *flagp); 9039} 9040 9041#ifdef ENABLE_REGEX_SETS_DEBUGGING 9042 9043STATIC void 9044S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state, 9045 AV * stack, const IV fence, AV * fence_stack) 9046{ /* Dumps the stacks in handle_regex_sets() */ 9047 9048 const SSize_t stack_top = av_tindex_skip_len_mg(stack); 9049 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack); 9050 SSize_t i; 9051 9052 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES; 9053 9054 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse); 9055 9056 if (stack_top < 0) { 9057 PerlIO_printf(Perl_debug_log, "Nothing on stack\n"); 9058 } 9059 else { 9060 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence); 9061 for (i = stack_top; i >= 0; i--) { 9062 SV ** element_ptr = av_fetch(stack, i, FALSE); 9063 if (! element_ptr) { 9064 } 9065 9066 if (IS_OPERATOR(*element_ptr)) { 9067 PerlIO_printf(Perl_debug_log, "[%d]: %c\n", 9068 (int) i, (int) SvIV(*element_ptr)); 9069 } 9070 else { 9071 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i); 9072 sv_dump(*element_ptr); 9073 } 9074 } 9075 } 9076 9077 if (fence_stack_top < 0) { 9078 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n"); 9079 } 9080 else { 9081 PerlIO_printf(Perl_debug_log, "Fence_stack: \n"); 9082 for (i = fence_stack_top; i >= 0; i--) { 9083 SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE); 9084 if (! element_ptr) { 9085 } 9086 9087 PerlIO_printf(Perl_debug_log, "[%d]: %d\n", 9088 (int) i, (int) SvIV(*element_ptr)); 9089 } 9090 } 9091} 9092 9093#endif 9094 9095#undef IS_OPERATOR 9096#undef IS_OPERAND 9097 9098void 9099Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) 9100{ 9101 /* This adds the Latin1/above-Latin1 folding rules. 9102 * 9103 * This should be called only for a Latin1-range code points, cp, which is 9104 * known to be involved in a simple fold with other code points above 9105 * Latin1. It would give false results if /aa has been specified. 9106 * Multi-char folds are outside the scope of this, and must be handled 9107 * specially. */ 9108 9109 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; 9110 9111 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); 9112 9113 /* The rules that are valid for all Unicode versions are hard-coded in */ 9114 switch (cp) { 9115 case 'k': 9116 case 'K': 9117 *invlist = 9118 add_cp_to_invlist(*invlist, KELVIN_SIGN); 9119 break; 9120 case 's': 9121 case 'S': 9122 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); 9123 break; 9124 case MICRO_SIGN: 9125 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); 9126 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); 9127 break; 9128 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: 9129 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: 9130 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); 9131 break; 9132 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 9133 *invlist = add_cp_to_invlist(*invlist, 9134 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); 9135 break; 9136 9137 default: /* Other code points are checked against the data for the 9138 current Unicode version */ 9139 { 9140 Size_t folds_count; 9141 U32 first_fold; 9142 const U32 * remaining_folds; 9143 UV folded_cp; 9144 9145 if (isASCII(cp)) { 9146 folded_cp = toFOLD(cp); 9147 } 9148 else { 9149 U8 dummy_fold[UTF8_MAXBYTES_CASE+1]; 9150 Size_t dummy_len; 9151 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0); 9152 } 9153 9154 if (folded_cp > 255) { 9155 *invlist = add_cp_to_invlist(*invlist, folded_cp); 9156 } 9157 9158 folds_count = _inverse_folds(folded_cp, &first_fold, 9159 &remaining_folds); 9160 if (folds_count == 0) { 9161 9162 /* Use deprecated warning to increase the chances of this being 9163 * output */ 9164 ckWARN2reg_d(RExC_parse, 9165 "Perl folding rules are not up-to-date for 0x%02X;" 9166 " please use the perlbug utility to report;", cp); 9167 } 9168 else { 9169 unsigned int i; 9170 9171 if (first_fold > 255) { 9172 *invlist = add_cp_to_invlist(*invlist, first_fold); 9173 } 9174 for (i = 0; i < folds_count - 1; i++) { 9175 if (remaining_folds[i] > 255) { 9176 *invlist = add_cp_to_invlist(*invlist, 9177 remaining_folds[i]); 9178 } 9179 } 9180 } 9181 break; 9182 } 9183 } 9184} 9185 9186STATIC void 9187S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings) 9188{ 9189 /* Output the elements of the array given by '*posix_warnings' as REGEXP 9190 * warnings. */ 9191 9192 SV * msg; 9193 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP)); 9194 9195 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS; 9196 9197 if (! TO_OUTPUT_WARNINGS(RExC_parse)) { 9198 CLEAR_POSIX_WARNINGS(); 9199 return; 9200 } 9201 9202 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { 9203 if (first_is_fatal) { /* Avoid leaking this */ 9204 av_undef(posix_warnings); /* This isn't necessary if the 9205 array is mortal, but is a 9206 fail-safe */ 9207 (void) sv_2mortal(msg); 9208 PREPARE_TO_DIE; 9209 } 9210 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); 9211 SvREFCNT_dec_NN(msg); 9212 } 9213 9214 UPDATE_WARNINGS_LOC(RExC_parse); 9215} 9216 9217PERL_STATIC_INLINE Size_t 9218S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max) 9219{ 9220 const U8 * const start = s1; 9221 const U8 * const send = start + max; 9222 9223 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS; 9224 9225 while (s1 < send && *s1 == *s2) { 9226 s1++; s2++; 9227 } 9228 9229 return s1 - start; 9230} 9231 9232STATIC AV * 9233S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) 9234{ 9235 /* This adds the string scalar <multi_string> to the array 9236 * <multi_char_matches>. <multi_string> is known to have exactly 9237 * <cp_count> code points in it. This is used when constructing a 9238 * bracketed character class and we find something that needs to match more 9239 * than a single character. 9240 * 9241 * <multi_char_matches> is actually an array of arrays. Each top-level 9242 * element is an array that contains all the strings known so far that are 9243 * the same length. And that length (in number of code points) is the same 9244 * as the index of the top-level array. Hence, the [2] element is an 9245 * array, each element thereof is a string containing TWO code points; 9246 * while element [3] is for strings of THREE characters, and so on. Since 9247 * this is for multi-char strings there can never be a [0] nor [1] element. 9248 * 9249 * When we rewrite the character class below, we will do so such that the 9250 * longest strings are written first, so that it prefers the longest 9251 * matching strings first. This is done even if it turns out that any 9252 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom 9253 * Christiansen has agreed that this is ok. This makes the test for the 9254 * ligature 'ffi' come before the test for 'ff', for example */ 9255 9256 AV* this_array; 9257 AV** this_array_ptr; 9258 9259 PERL_ARGS_ASSERT_ADD_MULTI_MATCH; 9260 9261 if (! multi_char_matches) { 9262 multi_char_matches = newAV(); 9263 } 9264 9265 if (av_exists(multi_char_matches, cp_count)) { 9266 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE); 9267 this_array = *this_array_ptr; 9268 } 9269 else { 9270 this_array = newAV(); 9271 av_store_simple(multi_char_matches, cp_count, 9272 (SV*) this_array); 9273 } 9274 av_push_simple(this_array, multi_string); 9275 9276 return multi_char_matches; 9277} 9278 9279/* The names of properties whose definitions are not known at compile time are 9280 * stored in this SV, after a constant heading. So if the length has been 9281 * changed since initialization, then there is a run-time definition. */ 9282#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ 9283 (SvCUR(listsv) != initial_listsv_len) 9284 9285/* There is a restricted set of white space characters that are legal when 9286 * ignoring white space in a bracketed character class. This generates the 9287 * code to skip them. 9288 * 9289 * There is a line below that uses the same white space criteria but is outside 9290 * this macro. Both here and there must use the same definition */ 9291#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \ 9292 STMT_START { \ 9293 if (do_skip) { \ 9294 while (p < stop_p && isBLANK_A(UCHARAT(p))) \ 9295 { \ 9296 p++; \ 9297 } \ 9298 } \ 9299 } STMT_END 9300 9301STATIC regnode_offset 9302S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, 9303 const bool stop_at_1, /* Just parse the next thing, don't 9304 look for a full character class */ 9305 bool allow_mutiple_chars, 9306 const bool silence_non_portable, /* Don't output warnings 9307 about too large 9308 characters */ 9309 const bool strict, 9310 bool optimizable, /* ? Allow a non-ANYOF return 9311 node */ 9312 SV** ret_invlist /* Return an inversion list, not a node */ 9313 ) 9314{ 9315 /* parse a bracketed class specification. Most of these will produce an 9316 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an 9317 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex 9318 * under /i with multi-character folds: it will be rewritten following the 9319 * paradigm of this example, where the <multi-fold>s are characters which 9320 * fold to multiple character sequences: 9321 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i 9322 * gets effectively rewritten as: 9323 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i 9324 * reg() gets called (recursively) on the rewritten version, and this 9325 * function will return what it constructs. (Actually the <multi-fold>s 9326 * aren't physically removed from the [abcdefghi], it's just that they are 9327 * ignored in the recursion by means of a flag: 9328 * <RExC_in_multi_char_class>.) 9329 * 9330 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS 9331 * characters, with the corresponding bit set if that character is in the 9332 * list. For characters above this, an inversion list is used. There 9333 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not 9334 * determinable at compile time 9335 * 9336 * On success, returns the offset at which any next node should be placed 9337 * into the regex engine program being compiled. 9338 * 9339 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs 9340 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to 9341 * UTF-8 9342 */ 9343 9344 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; 9345 IV range = 0; 9346 UV value = OOB_UNICODE, save_value = OOB_UNICODE; 9347 regnode_offset ret = -1; /* Initialized to an illegal value */ 9348 STRLEN numlen; 9349 int namedclass = OOB_NAMEDCLASS; 9350 char *rangebegin = NULL; 9351 SV *listsv = NULL; /* List of \p{user-defined} whose definitions 9352 aren't available at the time this was called */ 9353 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more 9354 than just initialized. */ 9355 SV* properties = NULL; /* Code points that match \p{} \P{} */ 9356 SV* posixes = NULL; /* Code points that match classes like [:word:], 9357 extended beyond the Latin1 range. These have to 9358 be kept separate from other code points for much 9359 of this function because their handling is 9360 different under /i, and for most classes under 9361 /d as well */ 9362 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept 9363 separate for a while from the non-complemented 9364 versions because of complications with /d 9365 matching */ 9366 SV* simple_posixes = NULL; /* But under some conditions, the classes can be 9367 treated more simply than the general case, 9368 leading to less compilation and execution 9369 work */ 9370 UV element_count = 0; /* Number of distinct elements in the class. 9371 Optimizations may be possible if this is tiny */ 9372 AV * multi_char_matches = NULL; /* Code points that fold to more than one 9373 character; used under /i */ 9374 UV n; 9375 char * stop_ptr = RExC_end; /* where to stop parsing */ 9376 9377 /* ignore unescaped whitespace? */ 9378 const bool skip_white = cBOOL( ret_invlist 9379 || (RExC_flags & RXf_PMf_EXTENDED_MORE)); 9380 9381 /* inversion list of code points this node matches only when the target 9382 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under 9383 * /d) */ 9384 SV* upper_latin1_only_utf8_matches = NULL; 9385 9386 /* Inversion list of code points this node matches regardless of things 9387 * like locale, folding, utf8ness of the target string */ 9388 SV* cp_list = NULL; 9389 9390 /* Like cp_list, but code points on this list need to be checked for things 9391 * that fold to/from them under /i */ 9392 SV* cp_foldable_list = NULL; 9393 9394 /* Like cp_list, but code points on this list are valid only when the 9395 * runtime locale is UTF-8 */ 9396 SV* only_utf8_locale_list = NULL; 9397 9398 /* In a range, if one of the endpoints is non-character-set portable, 9399 * meaning that it hard-codes a code point that may mean a different 9400 * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a 9401 * mnemonic '\t' which each mean the same character no matter which 9402 * character set the platform is on. */ 9403 unsigned int non_portable_endpoint = 0; 9404 9405 /* Is the range unicode? which means on a platform that isn't 1-1 native 9406 * to Unicode (i.e. non-ASCII), each code point in it should be considered 9407 * to be a Unicode value. */ 9408 bool unicode_range = FALSE; 9409 bool invert = FALSE; /* Is this class to be complemented */ 9410 9411 bool warn_super = ALWAYS_WARN_SUPER; 9412 9413 const char * orig_parse = RExC_parse; 9414 9415 /* This variable is used to mark where the end in the input is of something 9416 * that looks like a POSIX construct but isn't. During the parse, when 9417 * something looks like it could be such a construct is encountered, it is 9418 * checked for being one, but not if we've already checked this area of the 9419 * input. Only after this position is reached do we check again */ 9420 char *not_posix_region_end = RExC_parse - 1; 9421 9422 AV* posix_warnings = NULL; 9423 const bool do_posix_warnings = ckWARN(WARN_REGEXP); 9424 U8 op = ANYOF; /* The returned node-type, initialized to the expected 9425 type. */ 9426 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */ 9427 U32 posixl = 0; /* bit field of posix classes matched under /l */ 9428 9429 9430/* Flags as to what things aren't knowable until runtime. (Note that these are 9431 * mutually exclusive.) */ 9432#define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that 9433 haven't been defined as of yet */ 9434#define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is 9435 UTF-8 or not */ 9436#define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and 9437 what gets folded */ 9438 U32 has_runtime_dependency = 0; /* OR of the above flags */ 9439 9440 DECLARE_AND_GET_RE_DEBUG_FLAGS; 9441 9442 PERL_ARGS_ASSERT_REGCLASS; 9443#ifndef DEBUGGING 9444 PERL_UNUSED_ARG(depth); 9445#endif 9446 9447 assert(! (ret_invlist && allow_mutiple_chars)); 9448 9449 /* If wants an inversion list returned, we can't optimize to something 9450 * else. */ 9451 if (ret_invlist) { 9452 optimizable = FALSE; 9453 } 9454 9455 DEBUG_PARSE("clas"); 9456 9457#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ 9458 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ 9459 && UNICODE_DOT_DOT_VERSION == 0) 9460 allow_mutiple_chars = FALSE; 9461#endif 9462 9463 /* We include the /i status at the beginning of this so that we can 9464 * know it at runtime */ 9465 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD))); 9466 initial_listsv_len = SvCUR(listsv); 9467 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ 9468 9469 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 9470 9471 assert(RExC_parse <= RExC_end); 9472 9473 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ 9474 RExC_parse_inc_by(1); 9475 invert = TRUE; 9476 allow_mutiple_chars = FALSE; 9477 MARK_NAUGHTY(1); 9478 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 9479 } 9480 9481 /* Check that they didn't say [:posix:] instead of [[:posix:]] */ 9482 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) { 9483 int maybe_class = handle_possible_posix(pRExC_state, 9484 RExC_parse, 9485 ¬_posix_region_end, 9486 NULL, 9487 TRUE /* checking only */); 9488 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { 9489 ckWARN4reg(not_posix_region_end, 9490 "POSIX syntax [%c %c] belongs inside character classes%s", 9491 *RExC_parse, *RExC_parse, 9492 (maybe_class == OOB_NAMEDCLASS) 9493 ? ((POSIXCC_NOTYET(*RExC_parse)) 9494 ? " (but this one isn't implemented)" 9495 : " (but this one isn't fully valid)") 9496 : "" 9497 ); 9498 } 9499 } 9500 9501 /* If the caller wants us to just parse a single element, accomplish this 9502 * by faking the loop ending condition */ 9503 if (stop_at_1 && RExC_end > RExC_parse) { 9504 stop_ptr = RExC_parse + 1; 9505 } 9506 9507 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ 9508 if (UCHARAT(RExC_parse) == ']') 9509 goto charclassloop; 9510 9511 while (1) { 9512 9513 if ( posix_warnings 9514 && av_tindex_skip_len_mg(posix_warnings) >= 0 9515 && RExC_parse > not_posix_region_end) 9516 { 9517 /* Warnings about posix class issues are considered tentative until 9518 * we are far enough along in the parse that we can no longer 9519 * change our mind, at which point we output them. This is done 9520 * each time through the loop so that a later class won't zap them 9521 * before they have been dealt with. */ 9522 output_posix_warnings(pRExC_state, posix_warnings); 9523 } 9524 9525 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 9526 9527 if (RExC_parse >= stop_ptr) { 9528 break; 9529 } 9530 9531 if (UCHARAT(RExC_parse) == ']') { 9532 break; 9533 } 9534 9535 charclassloop: 9536 9537 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 9538 save_value = value; 9539 save_prevvalue = prevvalue; 9540 9541 if (!range) { 9542 rangebegin = RExC_parse; 9543 element_count++; 9544 non_portable_endpoint = 0; 9545 } 9546 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) { 9547 value = utf8n_to_uvchr((U8*)RExC_parse, 9548 RExC_end - RExC_parse, 9549 &numlen, UTF8_ALLOW_DEFAULT); 9550 RExC_parse_inc_by(numlen); 9551 } 9552 else { 9553 value = UCHARAT(RExC_parse); 9554 RExC_parse_inc_by(1); 9555 } 9556 9557 if (value == '[') { 9558 char * posix_class_end; 9559 namedclass = handle_possible_posix(pRExC_state, 9560 RExC_parse, 9561 &posix_class_end, 9562 do_posix_warnings ? &posix_warnings : NULL, 9563 FALSE /* die if error */); 9564 if (namedclass > OOB_NAMEDCLASS) { 9565 9566 /* If there was an earlier attempt to parse this particular 9567 * posix class, and it failed, it was a false alarm, as this 9568 * successful one proves */ 9569 if ( posix_warnings 9570 && av_tindex_skip_len_mg(posix_warnings) >= 0 9571 && not_posix_region_end >= RExC_parse 9572 && not_posix_region_end <= posix_class_end) 9573 { 9574 av_undef(posix_warnings); 9575 } 9576 9577 RExC_parse_set(posix_class_end); 9578 } 9579 else if (namedclass == OOB_NAMEDCLASS) { 9580 not_posix_region_end = posix_class_end; 9581 } 9582 else { 9583 namedclass = OOB_NAMEDCLASS; 9584 } 9585 } 9586 else if ( RExC_parse - 1 > not_posix_region_end 9587 && MAYBE_POSIXCC(value)) 9588 { 9589 (void) handle_possible_posix( 9590 pRExC_state, 9591 RExC_parse - 1, /* -1 because parse has already been 9592 advanced */ 9593 ¬_posix_region_end, 9594 do_posix_warnings ? &posix_warnings : NULL, 9595 TRUE /* checking only */); 9596 } 9597 else if ( strict && ! skip_white 9598 && ( generic_isCC_(value, CC_VERTSPACE_) 9599 || is_VERTWS_cp_high(value))) 9600 { 9601 vFAIL("Literal vertical space in [] is illegal except under /x"); 9602 } 9603 else if (value == '\\') { 9604 /* Is a backslash; get the code point of the char after it */ 9605 9606 if (RExC_parse >= RExC_end) { 9607 vFAIL("Unmatched ["); 9608 } 9609 9610 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { 9611 value = utf8n_to_uvchr((U8*)RExC_parse, 9612 RExC_end - RExC_parse, 9613 &numlen, UTF8_ALLOW_DEFAULT); 9614 RExC_parse_inc_by(numlen); 9615 } 9616 else { 9617 value = UCHARAT(RExC_parse); 9618 RExC_parse_inc_by(1); 9619 } 9620 9621 /* Some compilers cannot handle switching on 64-bit integer 9622 * values, therefore value cannot be an UV. Yes, this will 9623 * be a problem later if we want switch on Unicode. 9624 * A similar issue a little bit later when switching on 9625 * namedclass. --jhi */ 9626 9627 /* If the \ is escaping white space when white space is being 9628 * skipped, it means that that white space is wanted literally, and 9629 * is already in 'value'. Otherwise, need to translate the escape 9630 * into what it signifies. */ 9631 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) { 9632 const char * message; 9633 U32 packed_warn; 9634 U8 grok_c_char; 9635 9636 case 'w': namedclass = ANYOF_WORDCHAR; break; 9637 case 'W': namedclass = ANYOF_NWORDCHAR; break; 9638 case 's': namedclass = ANYOF_SPACE; break; 9639 case 'S': namedclass = ANYOF_NSPACE; break; 9640 case 'd': namedclass = ANYOF_DIGIT; break; 9641 case 'D': namedclass = ANYOF_NDIGIT; break; 9642 case 'v': namedclass = ANYOF_VERTWS; break; 9643 case 'V': namedclass = ANYOF_NVERTWS; break; 9644 case 'h': namedclass = ANYOF_HORIZWS; break; 9645 case 'H': namedclass = ANYOF_NHORIZWS; break; 9646 case 'N': /* Handle \N{NAME} in class */ 9647 { 9648 const char * const backslash_N_beg = RExC_parse - 2; 9649 int cp_count; 9650 9651 if (! grok_bslash_N(pRExC_state, 9652 NULL, /* No regnode */ 9653 &value, /* Yes single value */ 9654 &cp_count, /* Multiple code pt count */ 9655 flagp, 9656 strict, 9657 depth) 9658 ) { 9659 9660 if (*flagp & NEED_UTF8) 9661 FAIL("panic: grok_bslash_N set NEED_UTF8"); 9662 9663 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 9664 9665 if (cp_count < 0) { 9666 vFAIL("\\N in a character class must be a named character: \\N{...}"); 9667 } 9668 else if (cp_count == 0) { 9669 ckWARNreg(RExC_parse, 9670 "Ignoring zero length \\N{} in character class"); 9671 } 9672 else { /* cp_count > 1 */ 9673 assert(cp_count > 1); 9674 if (! RExC_in_multi_char_class) { 9675 if ( ! allow_mutiple_chars 9676 || invert 9677 || range 9678 || *RExC_parse == '-') 9679 { 9680 if (strict) { 9681 RExC_parse--; 9682 vFAIL("\\N{} here is restricted to one character"); 9683 } 9684 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); 9685 break; /* <value> contains the first code 9686 point. Drop out of the switch to 9687 process it */ 9688 } 9689 else { 9690 SV * multi_char_N = newSVpvn(backslash_N_beg, 9691 RExC_parse - backslash_N_beg); 9692 multi_char_matches 9693 = add_multi_match(multi_char_matches, 9694 multi_char_N, 9695 cp_count); 9696 } 9697 } 9698 } /* End of cp_count != 1 */ 9699 9700 /* This element should not be processed further in this 9701 * class */ 9702 element_count--; 9703 value = save_value; 9704 prevvalue = save_prevvalue; 9705 continue; /* Back to top of loop to get next char */ 9706 } 9707 9708 /* Here, is a single code point, and <value> contains it */ 9709 unicode_range = TRUE; /* \N{} are Unicode */ 9710 } 9711 break; 9712 case 'p': 9713 case 'P': 9714 { 9715 char *e; 9716 9717 if (RExC_pm_flags & PMf_WILDCARD) { 9718 RExC_parse_inc_by(1); 9719 /* diag_listed_as: Use of %s is not allowed in Unicode 9720 property wildcard subpatterns in regex; marked by <-- 9721 HERE in m/%s/ */ 9722 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property" 9723 " wildcard subpatterns", (char) value, *(RExC_parse - 1)); 9724 } 9725 9726 /* \p means they want Unicode semantics */ 9727 REQUIRE_UNI_RULES(flagp, 0); 9728 9729 if (RExC_parse >= RExC_end) 9730 vFAIL2("Empty \\%c", (U8)value); 9731 if (*RExC_parse == '{') { 9732 const U8 c = (U8)value; 9733 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); 9734 if (!e) { 9735 RExC_parse_inc_by(1); 9736 vFAIL2("Missing right brace on \\%c{}", c); 9737 } 9738 9739 RExC_parse_inc_by(1); 9740 9741 /* White space is allowed adjacent to the braces and after 9742 * any '^', even when not under /x */ 9743 while (isSPACE(*RExC_parse)) { 9744 RExC_parse_inc_by(1); 9745 } 9746 9747 if (UCHARAT(RExC_parse) == '^') { 9748 9749 /* toggle. (The rhs xor gets the single bit that 9750 * differs between P and p; the other xor inverts just 9751 * that bit) */ 9752 value ^= 'P' ^ 'p'; 9753 9754 RExC_parse_inc_by(1); 9755 while (isSPACE(*RExC_parse)) { 9756 RExC_parse_inc_by(1); 9757 } 9758 } 9759 9760 if (e == RExC_parse) 9761 vFAIL2("Empty \\%c{}", c); 9762 9763 n = e - RExC_parse; 9764 while (isSPACE(*(RExC_parse + n - 1))) 9765 n--; 9766 9767 } /* The \p isn't immediately followed by a '{' */ 9768 else if (! isALPHA(*RExC_parse)) { 9769 RExC_parse_inc_safe(); 9770 vFAIL2("Character following \\%c must be '{' or a " 9771 "single-character Unicode property name", 9772 (U8) value); 9773 } 9774 else { 9775 e = RExC_parse; 9776 n = 1; 9777 } 9778 { 9779 char* name = RExC_parse; 9780 9781 /* Any message returned about expanding the definition */ 9782 SV* msg = newSVpvs_flags("", SVs_TEMP); 9783 9784 /* If set TRUE, the property is user-defined as opposed to 9785 * official Unicode */ 9786 bool user_defined = FALSE; 9787 AV * strings = NULL; 9788 9789 SV * prop_definition = parse_uniprop_string( 9790 name, n, UTF, FOLD, 9791 FALSE, /* This is compile-time */ 9792 9793 /* We can't defer this defn when 9794 * the full result is required in 9795 * this call */ 9796 ! cBOOL(ret_invlist), 9797 9798 &strings, 9799 &user_defined, 9800 msg, 9801 0 /* Base level */ 9802 ); 9803 if (SvCUR(msg)) { /* Assumes any error causes a msg */ 9804 assert(prop_definition == NULL); 9805 RExC_parse_set(e + 1); 9806 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole 9807 thing so, or else the display is 9808 mojibake */ 9809 RExC_utf8 = TRUE; 9810 } 9811 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */ 9812 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg), 9813 SvCUR(msg), SvPVX(msg))); 9814 } 9815 9816 assert(prop_definition || strings); 9817 9818 if (strings) { 9819 if (ret_invlist) { 9820 if (! prop_definition) { 9821 RExC_parse_set(e + 1); 9822 vFAIL("Unicode string properties are not implemented in (?[...])"); 9823 } 9824 else { 9825 ckWARNreg(e + 1, 9826 "Using just the single character results" 9827 " returned by \\p{} in (?[...])"); 9828 } 9829 } 9830 else if (! RExC_in_multi_char_class) { 9831 if (invert ^ (value == 'P')) { 9832 RExC_parse_set(e + 1); 9833 vFAIL("Inverting a character class which contains" 9834 " a multi-character sequence is illegal"); 9835 } 9836 9837 /* For each multi-character string ... */ 9838 while (av_count(strings) > 0) { 9839 /* ... Each entry is itself an array of code 9840 * points. */ 9841 AV * this_string = (AV *) av_shift( strings); 9842 STRLEN cp_count = av_count(this_string); 9843 SV * final = newSV(cp_count ? cp_count * 4 : 1); 9844 SvPVCLEAR_FRESH(final); 9845 9846 /* Create another string of sequences of \x{...} */ 9847 while (av_count(this_string) > 0) { 9848 SV * character = av_shift(this_string); 9849 UV cp = SvUV(character); 9850 9851 if (cp > 255) { 9852 REQUIRE_UTF8(flagp); 9853 } 9854 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}", 9855 cp); 9856 SvREFCNT_dec_NN(character); 9857 } 9858 SvREFCNT_dec_NN(this_string); 9859 9860 /* And add that to the list of such things */ 9861 multi_char_matches 9862 = add_multi_match(multi_char_matches, 9863 final, 9864 cp_count); 9865 } 9866 } 9867 SvREFCNT_dec_NN(strings); 9868 } 9869 9870 if (! prop_definition) { /* If we got only a string, 9871 this iteration didn't really 9872 find a character */ 9873 element_count--; 9874 } 9875 else if (! is_invlist(prop_definition)) { 9876 9877 /* Here, the definition isn't known, so we have gotten 9878 * returned a string that will be evaluated if and when 9879 * encountered at runtime. We add it to the list of 9880 * such properties, along with whether it should be 9881 * complemented or not */ 9882 if (value == 'P') { 9883 sv_catpvs(listsv, "!"); 9884 } 9885 else { 9886 sv_catpvs(listsv, "+"); 9887 } 9888 sv_catsv(listsv, prop_definition); 9889 9890 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY; 9891 9892 /* We don't know yet what this matches, so have to flag 9893 * it */ 9894 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 9895 } 9896 else { 9897 assert (prop_definition && is_invlist(prop_definition)); 9898 9899 /* Here we do have the complete property definition 9900 * 9901 * Temporary workaround for [GH #16520]. For this 9902 * precise input that is in the .t that is failing, 9903 * load utf8.pm, which is what the test wants, so that 9904 * that .t passes */ 9905 if ( memEQs(RExC_start, e + 1 - RExC_start, 9906 "foo\\p{Alnum}") 9907 && ! hv_common(GvHVn(PL_incgv), 9908 NULL, 9909 "utf8.pm", sizeof("utf8.pm") - 1, 9910 0, HV_FETCH_ISEXISTS, NULL, 0)) 9911 { 9912 require_pv("utf8.pm"); 9913 } 9914 9915 if (! user_defined && 9916 /* We warn on matching an above-Unicode code point 9917 * if the match would return true, except don't 9918 * warn for \p{All}, which has exactly one element 9919 * = 0 */ 9920 (_invlist_contains_cp(prop_definition, 0x110000) 9921 && (! (_invlist_len(prop_definition) == 1 9922 && *invlist_array(prop_definition) == 0)))) 9923 { 9924 warn_super = TRUE; 9925 } 9926 9927 /* Invert if asking for the complement */ 9928 if (value == 'P') { 9929 _invlist_union_complement_2nd(properties, 9930 prop_definition, 9931 &properties); 9932 } 9933 else { 9934 _invlist_union(properties, prop_definition, &properties); 9935 } 9936 } 9937 } 9938 9939 RExC_parse_set(e + 1); 9940 namedclass = ANYOF_UNIPROP; /* no official name, but it's 9941 named */ 9942 } 9943 break; 9944 case 'n': value = '\n'; break; 9945 case 'r': value = '\r'; break; 9946 case 't': value = '\t'; break; 9947 case 'f': value = '\f'; break; 9948 case 'b': value = '\b'; break; 9949 case 'e': value = ESC_NATIVE; break; 9950 case 'a': value = '\a'; break; 9951 case 'o': 9952 RExC_parse--; /* function expects to be pointed at the 'o' */ 9953 if (! grok_bslash_o(&RExC_parse, 9954 RExC_end, 9955 &value, 9956 &message, 9957 &packed_warn, 9958 strict, 9959 cBOOL(range), /* MAX_UV allowed for range 9960 upper limit */ 9961 UTF)) 9962 { 9963 vFAIL(message); 9964 } 9965 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { 9966 warn_non_literal_string(RExC_parse, packed_warn, message); 9967 } 9968 9969 if (value < 256) { 9970 non_portable_endpoint++; 9971 } 9972 break; 9973 case 'x': 9974 RExC_parse--; /* function expects to be pointed at the 'x' */ 9975 if (! grok_bslash_x(&RExC_parse, 9976 RExC_end, 9977 &value, 9978 &message, 9979 &packed_warn, 9980 strict, 9981 cBOOL(range), /* MAX_UV allowed for range 9982 upper limit */ 9983 UTF)) 9984 { 9985 vFAIL(message); 9986 } 9987 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { 9988 warn_non_literal_string(RExC_parse, packed_warn, message); 9989 } 9990 9991 if (value < 256) { 9992 non_portable_endpoint++; 9993 } 9994 break; 9995 case 'c': 9996 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message, 9997 &packed_warn)) 9998 { 9999 /* going to die anyway; point to exact spot of 10000 * failure */ 10001 RExC_parse_inc_safe(); 10002 vFAIL(message); 10003 } 10004 10005 value = grok_c_char; 10006 RExC_parse_inc_by(1); 10007 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { 10008 warn_non_literal_string(RExC_parse, packed_warn, message); 10009 } 10010 10011 non_portable_endpoint++; 10012 break; 10013 case '0': case '1': case '2': case '3': case '4': 10014 case '5': case '6': case '7': 10015 { 10016 /* Take 1-3 octal digits */ 10017 I32 flags = PERL_SCAN_SILENT_ILLDIGIT 10018 | PERL_SCAN_NOTIFY_ILLDIGIT; 10019 numlen = (strict) ? 4 : 3; 10020 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 10021 RExC_parse_inc_by(numlen); 10022 if (numlen != 3) { 10023 if (strict) { 10024 RExC_parse_inc_safe(); 10025 vFAIL("Need exactly 3 octal digits"); 10026 } 10027 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT) 10028 && RExC_parse < RExC_end 10029 && isDIGIT(*RExC_parse) 10030 && ckWARN(WARN_REGEXP)) 10031 { 10032 reg_warn_non_literal_string( 10033 RExC_parse + 1, 10034 form_alien_digit_msg(8, numlen, RExC_parse, 10035 RExC_end, UTF, FALSE)); 10036 } 10037 } 10038 if (value < 256) { 10039 non_portable_endpoint++; 10040 } 10041 break; 10042 } 10043 default: 10044 /* Allow \_ to not give an error */ 10045 if (isWORDCHAR(value) && value != '_') { 10046 if (strict) { 10047 vFAIL2("Unrecognized escape \\%c in character class", 10048 (int)value); 10049 } 10050 else { 10051 ckWARN2reg(RExC_parse, 10052 "Unrecognized escape \\%c in character class passed through", 10053 (int)value); 10054 } 10055 } 10056 break; 10057 } /* End of switch on char following backslash */ 10058 } /* end of handling backslash escape sequences */ 10059 10060 /* Here, we have the current token in 'value' */ 10061 10062 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 10063 U8 classnum; 10064 10065 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a 10066 * literal, as is the character that began the false range, i.e. 10067 * the 'a' in the examples */ 10068 if (range) { 10069 const int w = (RExC_parse >= rangebegin) 10070 ? RExC_parse - rangebegin 10071 : 0; 10072 if (strict) { 10073 vFAIL2utf8f( 10074 "False [] range \"%" UTF8f "\"", 10075 UTF8fARG(UTF, w, rangebegin)); 10076 } 10077 else { 10078 ckWARN2reg(RExC_parse, 10079 "False [] range \"%" UTF8f "\"", 10080 UTF8fARG(UTF, w, rangebegin)); 10081 cp_list = add_cp_to_invlist(cp_list, '-'); 10082 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, 10083 prevvalue); 10084 } 10085 10086 range = 0; /* this was not a true range */ 10087 element_count += 2; /* So counts for three values */ 10088 } 10089 10090 classnum = namedclass_to_classnum(namedclass); 10091 10092 if (LOC && namedclass < ANYOF_POSIXL_MAX 10093#ifndef HAS_ISASCII 10094 && classnum != CC_ASCII_ 10095#endif 10096 ) { 10097 SV* scratch_list = NULL; 10098 10099 /* What the Posix classes (like \w, [:space:]) match isn't 10100 * generally knowable under locale until actual match time. A 10101 * special node is used for these which has extra space for a 10102 * bitmap, with a bit reserved for each named class that is to 10103 * be matched against. (This isn't needed for \p{} and 10104 * pseudo-classes, as they are not affected by locale, and 10105 * hence are dealt with separately.) However, if a named class 10106 * and its complement are both present, then it matches 10107 * everything, and there is no runtime dependency. Odd numbers 10108 * are the complements of the next lower number, so xor works. 10109 * (Note that something like [\w\D] should match everything, 10110 * because \d should be a proper subset of \w. But rather than 10111 * trust that the locale is well behaved, we leave this to 10112 * runtime to sort out) */ 10113 if (POSIXL_TEST(posixl, namedclass ^ 1)) { 10114 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX); 10115 POSIXL_ZERO(posixl); 10116 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY; 10117 anyof_flags &= ~ANYOF_MATCHES_POSIXL; 10118 continue; /* We could ignore the rest of the class, but 10119 best to parse it for any errors */ 10120 } 10121 else { /* Here, isn't the complement of any already parsed 10122 class */ 10123 POSIXL_SET(posixl, namedclass); 10124 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 10125 anyof_flags |= ANYOF_MATCHES_POSIXL; 10126 10127 /* The above-Latin1 characters are not subject to locale 10128 * rules. Just add them to the unconditionally-matched 10129 * list */ 10130 10131 /* Get the list of the above-Latin1 code points this 10132 * matches */ 10133 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, 10134 PL_XPosix_ptrs[classnum], 10135 10136 /* Odd numbers are complements, 10137 * like NDIGIT, NASCII, ... */ 10138 namedclass % 2 != 0, 10139 &scratch_list); 10140 /* Checking if 'cp_list' is NULL first saves an extra 10141 * clone. Its reference count will be decremented at the 10142 * next union, etc, or if this is the only instance, at the 10143 * end of the routine */ 10144 if (! cp_list) { 10145 cp_list = scratch_list; 10146 } 10147 else { 10148 _invlist_union(cp_list, scratch_list, &cp_list); 10149 SvREFCNT_dec_NN(scratch_list); 10150 } 10151 continue; /* Go get next character */ 10152 } 10153 } 10154 else { 10155 10156 /* Here, is not /l, or is a POSIX class for which /l doesn't 10157 * matter (or is a Unicode property, which is skipped here). */ 10158 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ 10159 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ 10160 10161 /* Here, should be \h, \H, \v, or \V. None of /d, /i 10162 * nor /l make a difference in what these match, 10163 * therefore we just add what they match to cp_list. */ 10164 if (classnum != CC_VERTSPACE_) { 10165 assert( namedclass == ANYOF_HORIZWS 10166 || namedclass == ANYOF_NHORIZWS); 10167 10168 /* It turns out that \h is just a synonym for 10169 * XPosixBlank */ 10170 classnum = CC_BLANK_; 10171 } 10172 10173 _invlist_union_maybe_complement_2nd( 10174 cp_list, 10175 PL_XPosix_ptrs[classnum], 10176 namedclass % 2 != 0, /* Complement if odd 10177 (NHORIZWS, NVERTWS) 10178 */ 10179 &cp_list); 10180 } 10181 } 10182 else if ( AT_LEAST_UNI_SEMANTICS 10183 || classnum == CC_ASCII_ 10184 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_ 10185 || classnum == CC_XDIGIT_))) 10186 { 10187 /* We usually have to worry about /d affecting what POSIX 10188 * classes match, with special code needed because we won't 10189 * know until runtime what all matches. But there is no 10190 * extra work needed under /u and /a; and [:ascii:] is 10191 * unaffected by /d; and :digit: and :xdigit: don't have 10192 * runtime differences under /d. So we can special case 10193 * these, and avoid some extra work below, and at runtime. 10194 * */ 10195 _invlist_union_maybe_complement_2nd( 10196 simple_posixes, 10197 ((AT_LEAST_ASCII_RESTRICTED) 10198 ? PL_Posix_ptrs[classnum] 10199 : PL_XPosix_ptrs[classnum]), 10200 namedclass % 2 != 0, 10201 &simple_posixes); 10202 } 10203 else { /* Garden variety class. If is NUPPER, NALPHA, ... 10204 complement and use nposixes */ 10205 SV** posixes_ptr = namedclass % 2 == 0 10206 ? &posixes 10207 : &nposixes; 10208 _invlist_union_maybe_complement_2nd( 10209 *posixes_ptr, 10210 PL_XPosix_ptrs[classnum], 10211 namedclass % 2 != 0, 10212 posixes_ptr); 10213 } 10214 } 10215 } /* end of namedclass \blah */ 10216 10217 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 10218 10219 /* If 'range' is set, 'value' is the ending of a range--check its 10220 * validity. (If value isn't a single code point in the case of a 10221 * range, we should have figured that out above in the code that 10222 * catches false ranges). Later, we will handle each individual code 10223 * point in the range. If 'range' isn't set, this could be the 10224 * beginning of a range, so check for that by looking ahead to see if 10225 * the next real character to be processed is the range indicator--the 10226 * minus sign */ 10227 10228 if (range) { 10229#ifdef EBCDIC 10230 /* For unicode ranges, we have to test that the Unicode as opposed 10231 * to the native values are not decreasing. (Above 255, there is 10232 * no difference between native and Unicode) */ 10233 if (unicode_range && prevvalue < 255 && value < 255) { 10234 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) { 10235 goto backwards_range; 10236 } 10237 } 10238 else 10239#endif 10240 if (prevvalue > value) /* b-a */ { 10241 int w; 10242#ifdef EBCDIC 10243 backwards_range: 10244#endif 10245 w = RExC_parse - rangebegin; 10246 vFAIL2utf8f( 10247 "Invalid [] range \"%" UTF8f "\"", 10248 UTF8fARG(UTF, w, rangebegin)); 10249 NOT_REACHED; /* NOTREACHED */ 10250 } 10251 } 10252 else { 10253 prevvalue = value; /* save the beginning of the potential range */ 10254 if (! stop_at_1 /* Can't be a range if parsing just one thing */ 10255 && *RExC_parse == '-') 10256 { 10257 char* next_char_ptr = RExC_parse + 1; 10258 10259 /* Get the next real char after the '-' */ 10260 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end); 10261 10262 /* If the '-' is at the end of the class (just before the ']', 10263 * it is a literal minus; otherwise it is a range */ 10264 if (next_char_ptr < RExC_end && *next_char_ptr != ']') { 10265 RExC_parse_set(next_char_ptr); 10266 10267 /* a bad range like \w-, [:word:]- ? */ 10268 if (namedclass > OOB_NAMEDCLASS) { 10269 if (strict || ckWARN(WARN_REGEXP)) { 10270 const int w = RExC_parse >= rangebegin 10271 ? RExC_parse - rangebegin 10272 : 0; 10273 if (strict) { 10274 vFAIL4("False [] range \"%*.*s\"", 10275 w, w, rangebegin); 10276 } 10277 else { 10278 vWARN4(RExC_parse, 10279 "False [] range \"%*.*s\"", 10280 w, w, rangebegin); 10281 } 10282 } 10283 cp_list = add_cp_to_invlist(cp_list, '-'); 10284 element_count++; 10285 } else 10286 range = 1; /* yeah, it's a range! */ 10287 continue; /* but do it the next time */ 10288 } 10289 } 10290 } 10291 10292 if (namedclass > OOB_NAMEDCLASS) { 10293 continue; 10294 } 10295 10296 /* Here, we have a single value this time through the loop, and 10297 * <prevvalue> is the beginning of the range, if any; or <value> if 10298 * not. */ 10299 10300 /* non-Latin1 code point implies unicode semantics. */ 10301 if (value > 255) { 10302 if (value > MAX_LEGAL_CP && ( value != UV_MAX 10303 || prevvalue > MAX_LEGAL_CP)) 10304 { 10305 vFAIL(form_cp_too_large_msg(16, NULL, 0, value)); 10306 } 10307 REQUIRE_UNI_RULES(flagp, 0); 10308 if ( ! silence_non_portable 10309 && UNICODE_IS_PERL_EXTENDED(value) 10310 && TO_OUTPUT_WARNINGS(RExC_parse)) 10311 { 10312 ckWARN2_non_literal_string(RExC_parse, 10313 packWARN(WARN_PORTABLE), 10314 PL_extended_cp_format, 10315 value); 10316 } 10317 } 10318 10319 /* Ready to process either the single value, or the completed range. 10320 * For single-valued non-inverted ranges, we consider the possibility 10321 * of multi-char folds. (We made a conscious decision to not do this 10322 * for the other cases because it can often lead to non-intuitive 10323 * results. For example, you have the peculiar case that: 10324 * "s s" =~ /^[^\xDF]+$/i => Y 10325 * "ss" =~ /^[^\xDF]+$/i => N 10326 * 10327 * See [perl #89750] */ 10328 if (FOLD && allow_mutiple_chars && value == prevvalue) { 10329 if ( value == LATIN_SMALL_LETTER_SHARP_S 10330 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, 10331 value))) 10332 { 10333 /* Here <value> is indeed a multi-char fold. Get what it is */ 10334 10335 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 10336 STRLEN foldlen; 10337 10338 UV folded = _to_uni_fold_flags( 10339 value, 10340 foldbuf, 10341 &foldlen, 10342 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED 10343 ? FOLD_FLAGS_NOMIX_ASCII 10344 : 0) 10345 ); 10346 10347 /* Here, <folded> should be the first character of the 10348 * multi-char fold of <value>, with <foldbuf> containing the 10349 * whole thing. But, if this fold is not allowed (because of 10350 * the flags), <fold> will be the same as <value>, and should 10351 * be processed like any other character, so skip the special 10352 * handling */ 10353 if (folded != value) { 10354 10355 /* Skip if we are recursed, currently parsing the class 10356 * again. Otherwise add this character to the list of 10357 * multi-char folds. */ 10358 if (! RExC_in_multi_char_class) { 10359 STRLEN cp_count = utf8_length(foldbuf, 10360 foldbuf + foldlen); 10361 SV* multi_fold = newSVpvs_flags("", SVs_TEMP); 10362 10363 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value); 10364 10365 multi_char_matches 10366 = add_multi_match(multi_char_matches, 10367 multi_fold, 10368 cp_count); 10369 10370 } 10371 10372 /* This element should not be processed further in this 10373 * class */ 10374 element_count--; 10375 value = save_value; 10376 prevvalue = save_prevvalue; 10377 continue; 10378 } 10379 } 10380 } 10381 10382 if (strict && ckWARN(WARN_REGEXP)) { 10383 if (range) { 10384 10385 /* If the range starts above 255, everything is portable and 10386 * likely to be so for any forseeable character set, so don't 10387 * warn. */ 10388 if (unicode_range && non_portable_endpoint && prevvalue < 256) { 10389 vWARN(RExC_parse, "Both or neither range ends should be Unicode"); 10390 } 10391 else if (prevvalue != value) { 10392 10393 /* Under strict, ranges that stop and/or end in an ASCII 10394 * printable should have each end point be a portable value 10395 * for it (preferably like 'A', but we don't warn if it is 10396 * a (portable) Unicode name or code point), and the range 10397 * must be all digits or all letters of the same case. 10398 * Otherwise, the range is non-portable and unclear as to 10399 * what it contains */ 10400 if ( (isPRINT_A(prevvalue) || isPRINT_A(value)) 10401 && ( non_portable_endpoint 10402 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value)) 10403 || (isLOWER_A(prevvalue) && isLOWER_A(value)) 10404 || (isUPPER_A(prevvalue) && isUPPER_A(value)) 10405 ))) { 10406 vWARN(RExC_parse, "Ranges of ASCII printables should" 10407 " be some subset of \"0-9\"," 10408 " \"A-Z\", or \"a-z\""); 10409 } 10410 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) { 10411 SSize_t index_start; 10412 SSize_t index_final; 10413 10414 /* But the nature of Unicode and languages mean we 10415 * can't do the same checks for above-ASCII ranges, 10416 * except in the case of digit ones. These should 10417 * contain only digits from the same group of 10. The 10418 * ASCII case is handled just above. Hence here, the 10419 * range could be a range of digits. First some 10420 * unlikely special cases. Grandfather in that a range 10421 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad 10422 * if its starting value is one of the 10 digits prior 10423 * to it. This is because it is an alternate way of 10424 * writing 19D1, and some people may expect it to be in 10425 * that group. But it is bad, because it won't give 10426 * the expected results. In Unicode 5.2 it was 10427 * considered to be in that group (of 11, hence), but 10428 * this was fixed in the next version */ 10429 10430 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) { 10431 goto warn_bad_digit_range; 10432 } 10433 else if (UNLIKELY( prevvalue >= 0x1D7CE 10434 && value <= 0x1D7FF)) 10435 { 10436 /* This is the only other case currently in Unicode 10437 * where the algorithm below fails. The code 10438 * points just above are the end points of a single 10439 * range containing only decimal digits. It is 5 10440 * different series of 0-9. All other ranges of 10441 * digits currently in Unicode are just a single 10442 * series. (And mktables will notify us if a later 10443 * Unicode version breaks this.) 10444 * 10445 * If the range being checked is at most 9 long, 10446 * and the digit values represented are in 10447 * numerical order, they are from the same series. 10448 * */ 10449 if ( value - prevvalue > 9 10450 || ((( value - 0x1D7CE) % 10) 10451 <= (prevvalue - 0x1D7CE) % 10)) 10452 { 10453 goto warn_bad_digit_range; 10454 } 10455 } 10456 else { 10457 10458 /* For all other ranges of digits in Unicode, the 10459 * algorithm is just to check if both end points 10460 * are in the same series, which is the same range. 10461 * */ 10462 index_start = _invlist_search( 10463 PL_XPosix_ptrs[CC_DIGIT_], 10464 prevvalue); 10465 10466 /* Warn if the range starts and ends with a digit, 10467 * and they are not in the same group of 10. */ 10468 if ( index_start >= 0 10469 && ELEMENT_RANGE_MATCHES_INVLIST(index_start) 10470 && (index_final = 10471 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_], 10472 value)) != index_start 10473 && index_final >= 0 10474 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)) 10475 { 10476 warn_bad_digit_range: 10477 vWARN(RExC_parse, "Ranges of digits should be" 10478 " from the same group of" 10479 " 10"); 10480 } 10481 } 10482 } 10483 } 10484 } 10485 if ((! range || prevvalue == value) && non_portable_endpoint) { 10486 if (isPRINT_A(value)) { 10487 char literal[3]; 10488 unsigned d = 0; 10489 if (isBACKSLASHED_PUNCT(value)) { 10490 literal[d++] = '\\'; 10491 } 10492 literal[d++] = (char) value; 10493 literal[d++] = '\0'; 10494 10495 vWARN4(RExC_parse, 10496 "\"%.*s\" is more clearly written simply as \"%s\"", 10497 (int) (RExC_parse - rangebegin), 10498 rangebegin, 10499 literal 10500 ); 10501 } 10502 else if (isMNEMONIC_CNTRL(value)) { 10503 vWARN4(RExC_parse, 10504 "\"%.*s\" is more clearly written simply as \"%s\"", 10505 (int) (RExC_parse - rangebegin), 10506 rangebegin, 10507 cntrl_to_mnemonic((U8) value) 10508 ); 10509 } 10510 } 10511 } 10512 10513 /* Deal with this element of the class */ 10514 10515#ifndef EBCDIC 10516 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 10517 prevvalue, value); 10518#else 10519 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones 10520 * that don't require special handling, we can just add the range like 10521 * we do for ASCII platforms */ 10522 if ((UNLIKELY(prevvalue == 0) && value >= 255) 10523 || ! (prevvalue < 256 10524 && (unicode_range 10525 || (! non_portable_endpoint 10526 && ((isLOWER_A(prevvalue) && isLOWER_A(value)) 10527 || (isUPPER_A(prevvalue) 10528 && isUPPER_A(value))))))) 10529 { 10530 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 10531 prevvalue, value); 10532 } 10533 else { 10534 /* Here, requires special handling. This can be because it is a 10535 * range whose code points are considered to be Unicode, and so 10536 * must be individually translated into native, or because its a 10537 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in 10538 * EBCDIC, but we have defined them to include only the "expected" 10539 * upper or lower case ASCII alphabetics. Subranges above 255 are 10540 * the same in native and Unicode, so can be added as a range */ 10541 U8 start = NATIVE_TO_LATIN1(prevvalue); 10542 unsigned j; 10543 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; 10544 for (j = start; j <= end; j++) { 10545 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); 10546 } 10547 if (value > 255) { 10548 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 10549 256, value); 10550 } 10551 } 10552#endif 10553 10554 range = 0; /* this range (if it was one) is done now */ 10555 } /* End of loop through all the text within the brackets */ 10556 10557 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) { 10558 output_posix_warnings(pRExC_state, posix_warnings); 10559 } 10560 10561 /* If anything in the class expands to more than one character, we have to 10562 * deal with them by building up a substitute parse string, and recursively 10563 * calling reg() on it, instead of proceeding */ 10564 if (multi_char_matches) { 10565 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); 10566 I32 cp_count; 10567 STRLEN len; 10568 char *save_end = RExC_end; 10569 char *save_parse = RExC_parse; 10570 char *save_start = RExC_start; 10571 Size_t constructed_prefix_len = 0; /* This gives the length of the 10572 constructed portion of the 10573 substitute parse. */ 10574 bool first_time = TRUE; /* First multi-char occurrence doesn't get 10575 a "|" */ 10576 I32 reg_flags; 10577 10578 assert(! invert); 10579 /* Only one level of recursion allowed */ 10580 assert(RExC_copy_start_in_constructed == RExC_precomp); 10581 10582#if 0 /* Have decided not to deal with multi-char folds in inverted classes, 10583 because too confusing */ 10584 if (invert) { 10585 sv_catpvs(substitute_parse, "(?:"); 10586 } 10587#endif 10588 10589 /* Look at the longest strings first */ 10590 for (cp_count = av_tindex_skip_len_mg(multi_char_matches); 10591 cp_count > 0; 10592 cp_count--) 10593 { 10594 10595 if (av_exists(multi_char_matches, cp_count)) { 10596 AV** this_array_ptr; 10597 SV* this_sequence; 10598 10599 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, 10600 cp_count, FALSE); 10601 while ((this_sequence = av_pop(*this_array_ptr)) != 10602 &PL_sv_undef) 10603 { 10604 if (! first_time) { 10605 sv_catpvs(substitute_parse, "|"); 10606 } 10607 first_time = FALSE; 10608 10609 sv_catpv(substitute_parse, SvPVX(this_sequence)); 10610 } 10611 } 10612 } 10613 10614 /* If the character class contains anything else besides these 10615 * multi-character strings, have to include it in recursive parsing */ 10616 if (element_count) { 10617 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '['; 10618 10619 sv_catpvs(substitute_parse, "|"); 10620 if (has_l_bracket) { /* Add an [ if the original had one */ 10621 sv_catpvs(substitute_parse, "["); 10622 } 10623 constructed_prefix_len = SvCUR(substitute_parse); 10624 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); 10625 10626 /* Put in a closing ']' to match any opening one, but not if going 10627 * off the end, as otherwise we are adding something that really 10628 * isn't there */ 10629 if (has_l_bracket && RExC_parse < RExC_end) { 10630 sv_catpvs(substitute_parse, "]"); 10631 } 10632 } 10633 10634 sv_catpvs(substitute_parse, ")"); 10635#if 0 10636 if (invert) { 10637 /* This is a way to get the parse to skip forward a whole named 10638 * sequence instead of matching the 2nd character when it fails the 10639 * first */ 10640 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); 10641 } 10642#endif 10643 10644 /* Set up the data structure so that any errors will be properly 10645 * reported. See the comments at the definition of 10646 * REPORT_LOCATION_ARGS for details */ 10647 RExC_copy_start_in_input = (char *) orig_parse; 10648 RExC_start = SvPV(substitute_parse, len); 10649 RExC_parse_set( RExC_start ); 10650 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len; 10651 RExC_end = RExC_parse + len; 10652 RExC_in_multi_char_class = 1; 10653 10654 ret = reg(pRExC_state, 1, ®_flags, depth+1); 10655 10656 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8); 10657 10658 /* And restore so can parse the rest of the pattern */ 10659 RExC_parse_set(save_parse); 10660 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start; 10661 RExC_end = save_end; 10662 RExC_in_multi_char_class = 0; 10663 SvREFCNT_dec_NN(multi_char_matches); 10664 SvREFCNT_dec(properties); 10665 SvREFCNT_dec(cp_list); 10666 SvREFCNT_dec(simple_posixes); 10667 SvREFCNT_dec(posixes); 10668 SvREFCNT_dec(nposixes); 10669 SvREFCNT_dec(cp_foldable_list); 10670 return ret; 10671 } 10672 10673 /* If folding, we calculate all characters that could fold to or from the 10674 * ones already on the list */ 10675 if (cp_foldable_list) { 10676 if (FOLD) { 10677 UV start, end; /* End points of code point ranges */ 10678 10679 SV* fold_intersection = NULL; 10680 SV** use_list; 10681 10682 /* Our calculated list will be for Unicode rules. For locale 10683 * matching, we have to keep a separate list that is consulted at 10684 * runtime only when the locale indicates Unicode rules (and we 10685 * don't include potential matches in the ASCII/Latin1 range, as 10686 * any code point could fold to any other, based on the run-time 10687 * locale). For non-locale, we just use the general list */ 10688 if (LOC) { 10689 use_list = &only_utf8_locale_list; 10690 } 10691 else { 10692 use_list = &cp_list; 10693 } 10694 10695 /* Only the characters in this class that participate in folds need 10696 * be checked. Get the intersection of this class and all the 10697 * possible characters that are foldable. This can quickly narrow 10698 * down a large class */ 10699 _invlist_intersection(PL_in_some_fold, cp_foldable_list, 10700 &fold_intersection); 10701 10702 /* Now look at the foldable characters in this class individually */ 10703 invlist_iterinit(fold_intersection); 10704 while (invlist_iternext(fold_intersection, &start, &end)) { 10705 UV j; 10706 UV folded; 10707 10708 /* Look at every character in the range */ 10709 for (j = start; j <= end; j++) { 10710 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 10711 STRLEN foldlen; 10712 unsigned int k; 10713 Size_t folds_count; 10714 U32 first_fold; 10715 const U32 * remaining_folds; 10716 10717 if (j < 256) { 10718 10719 /* Under /l, we don't know what code points below 256 10720 * fold to, except we do know the MICRO SIGN folds to 10721 * an above-255 character if the locale is UTF-8, so we 10722 * add it to the special list (in *use_list) Otherwise 10723 * we know now what things can match, though some folds 10724 * are valid under /d only if the target is UTF-8. 10725 * Those go in a separate list */ 10726 if ( IS_IN_SOME_FOLD_L1(j) 10727 && ! (LOC && j != MICRO_SIGN)) 10728 { 10729 10730 /* ASCII is always matched; non-ASCII is matched 10731 * only under Unicode rules (which could happen 10732 * under /l if the locale is a UTF-8 one */ 10733 if (isASCII(j) || ! DEPENDS_SEMANTICS) { 10734 *use_list = add_cp_to_invlist(*use_list, 10735 PL_fold_latin1[j]); 10736 } 10737 else if (j != PL_fold_latin1[j]) { 10738 upper_latin1_only_utf8_matches 10739 = add_cp_to_invlist( 10740 upper_latin1_only_utf8_matches, 10741 PL_fold_latin1[j]); 10742 } 10743 } 10744 10745 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) 10746 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) 10747 { 10748 add_above_Latin1_folds(pRExC_state, 10749 (U8) j, 10750 use_list); 10751 } 10752 continue; 10753 } 10754 10755 /* Here is an above Latin1 character. We don't have the 10756 * rules hard-coded for it. First, get its fold. This is 10757 * the simple fold, as the multi-character folds have been 10758 * handled earlier and separated out */ 10759 folded = _to_uni_fold_flags(j, foldbuf, &foldlen, 10760 (ASCII_FOLD_RESTRICTED) 10761 ? FOLD_FLAGS_NOMIX_ASCII 10762 : 0); 10763 10764 /* Single character fold of above Latin1. Add everything 10765 * in its fold closure to the list that this node should 10766 * match. */ 10767 folds_count = _inverse_folds(folded, &first_fold, 10768 &remaining_folds); 10769 for (k = 0; k <= folds_count; k++) { 10770 UV c = (k == 0) /* First time through use itself */ 10771 ? folded 10772 : (k == 1) /* 2nd time use, the first fold */ 10773 ? first_fold 10774 10775 /* Then the remaining ones */ 10776 : remaining_folds[k-2]; 10777 10778 /* /aa doesn't allow folds between ASCII and non- */ 10779 if (( ASCII_FOLD_RESTRICTED 10780 && (isASCII(c) != isASCII(j)))) 10781 { 10782 continue; 10783 } 10784 10785 /* Folds under /l which cross the 255/256 boundary are 10786 * added to a separate list. (These are valid only 10787 * when the locale is UTF-8.) */ 10788 if (c < 256 && LOC) { 10789 *use_list = add_cp_to_invlist(*use_list, c); 10790 continue; 10791 } 10792 10793 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) 10794 { 10795 cp_list = add_cp_to_invlist(cp_list, c); 10796 } 10797 else { 10798 /* Similarly folds involving non-ascii Latin1 10799 * characters under /d are added to their list */ 10800 upper_latin1_only_utf8_matches 10801 = add_cp_to_invlist( 10802 upper_latin1_only_utf8_matches, 10803 c); 10804 } 10805 } 10806 } 10807 } 10808 SvREFCNT_dec_NN(fold_intersection); 10809 } 10810 10811 /* Now that we have finished adding all the folds, there is no reason 10812 * to keep the foldable list separate */ 10813 _invlist_union(cp_list, cp_foldable_list, &cp_list); 10814 SvREFCNT_dec_NN(cp_foldable_list); 10815 } 10816 10817 /* And combine the result (if any) with any inversion lists from posix 10818 * classes. The lists are kept separate up to now because we don't want to 10819 * fold the classes */ 10820 if (simple_posixes) { /* These are the classes known to be unaffected by 10821 /a, /aa, and /d */ 10822 if (cp_list) { 10823 _invlist_union(cp_list, simple_posixes, &cp_list); 10824 SvREFCNT_dec_NN(simple_posixes); 10825 } 10826 else { 10827 cp_list = simple_posixes; 10828 } 10829 } 10830 if (posixes || nposixes) { 10831 if (! DEPENDS_SEMANTICS) { 10832 10833 /* For everything but /d, we can just add the current 'posixes' and 10834 * 'nposixes' to the main list */ 10835 if (posixes) { 10836 if (cp_list) { 10837 _invlist_union(cp_list, posixes, &cp_list); 10838 SvREFCNT_dec_NN(posixes); 10839 } 10840 else { 10841 cp_list = posixes; 10842 } 10843 } 10844 if (nposixes) { 10845 if (cp_list) { 10846 _invlist_union(cp_list, nposixes, &cp_list); 10847 SvREFCNT_dec_NN(nposixes); 10848 } 10849 else { 10850 cp_list = nposixes; 10851 } 10852 } 10853 } 10854 else { 10855 /* Under /d, things like \w match upper Latin1 characters only if 10856 * the target string is in UTF-8. But things like \W match all the 10857 * upper Latin1 characters if the target string is not in UTF-8. 10858 * 10859 * Handle the case with something like \W separately */ 10860 if (nposixes) { 10861 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL); 10862 10863 /* A complemented posix class matches all upper Latin1 10864 * characters if not in UTF-8. And it matches just certain 10865 * ones when in UTF-8. That means those certain ones are 10866 * matched regardless, so can just be added to the 10867 * unconditional list */ 10868 if (cp_list) { 10869 _invlist_union(cp_list, nposixes, &cp_list); 10870 SvREFCNT_dec_NN(nposixes); 10871 nposixes = NULL; 10872 } 10873 else { 10874 cp_list = nposixes; 10875 } 10876 10877 /* Likewise for 'posixes' */ 10878 _invlist_union(posixes, cp_list, &cp_list); 10879 SvREFCNT_dec(posixes); 10880 10881 /* Likewise for anything else in the range that matched only 10882 * under UTF-8 */ 10883 if (upper_latin1_only_utf8_matches) { 10884 _invlist_union(cp_list, 10885 upper_latin1_only_utf8_matches, 10886 &cp_list); 10887 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 10888 upper_latin1_only_utf8_matches = NULL; 10889 } 10890 10891 /* If we don't match all the upper Latin1 characters regardless 10892 * of UTF-8ness, we have to set a flag to match the rest when 10893 * not in UTF-8 */ 10894 _invlist_subtract(only_non_utf8_list, cp_list, 10895 &only_non_utf8_list); 10896 if (_invlist_len(only_non_utf8_list) != 0) { 10897 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared; 10898 } 10899 SvREFCNT_dec_NN(only_non_utf8_list); 10900 } 10901 else { 10902 /* Here there were no complemented posix classes. That means 10903 * the upper Latin1 characters in 'posixes' match only when the 10904 * target string is in UTF-8. So we have to add them to the 10905 * list of those types of code points, while adding the 10906 * remainder to the unconditional list. 10907 * 10908 * First calculate what they are */ 10909 SV* nonascii_but_latin1_properties = NULL; 10910 _invlist_intersection(posixes, PL_UpperLatin1, 10911 &nonascii_but_latin1_properties); 10912 10913 /* And add them to the final list of such characters. */ 10914 _invlist_union(upper_latin1_only_utf8_matches, 10915 nonascii_but_latin1_properties, 10916 &upper_latin1_only_utf8_matches); 10917 10918 /* Remove them from what now becomes the unconditional list */ 10919 _invlist_subtract(posixes, nonascii_but_latin1_properties, 10920 &posixes); 10921 10922 /* And add those unconditional ones to the final list */ 10923 if (cp_list) { 10924 _invlist_union(cp_list, posixes, &cp_list); 10925 SvREFCNT_dec_NN(posixes); 10926 posixes = NULL; 10927 } 10928 else { 10929 cp_list = posixes; 10930 } 10931 10932 SvREFCNT_dec(nonascii_but_latin1_properties); 10933 10934 /* Get rid of any characters from the conditional list that we 10935 * now know are matched unconditionally, which may make that 10936 * list empty */ 10937 _invlist_subtract(upper_latin1_only_utf8_matches, 10938 cp_list, 10939 &upper_latin1_only_utf8_matches); 10940 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) { 10941 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 10942 upper_latin1_only_utf8_matches = NULL; 10943 } 10944 } 10945 } 10946 } 10947 10948 /* And combine the result (if any) with any inversion list from properties. 10949 * The lists are kept separate up to now so that we can distinguish the two 10950 * in regards to matching above-Unicode. A run-time warning is generated 10951 * if a Unicode property is matched against a non-Unicode code point. But, 10952 * we allow user-defined properties to match anything, without any warning, 10953 * and we also suppress the warning if there is a portion of the character 10954 * class that isn't a Unicode property, and which matches above Unicode, \W 10955 * or [\x{110000}] for example. 10956 * (Note that in this case, unlike the Posix one above, there is no 10957 * <upper_latin1_only_utf8_matches>, because having a Unicode property 10958 * forces Unicode semantics */ 10959 if (properties) { 10960 if (cp_list) { 10961 10962 /* If it matters to the final outcome, see if a non-property 10963 * component of the class matches above Unicode. If so, the 10964 * warning gets suppressed. This is true even if just a single 10965 * such code point is specified, as, though not strictly correct if 10966 * another such code point is matched against, the fact that they 10967 * are using above-Unicode code points indicates they should know 10968 * the issues involved */ 10969 if (warn_super) { 10970 warn_super = ! (invert 10971 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list)))); 10972 } 10973 10974 _invlist_union(properties, cp_list, &cp_list); 10975 SvREFCNT_dec_NN(properties); 10976 } 10977 else { 10978 cp_list = properties; 10979 } 10980 10981 if (warn_super) { 10982 anyof_flags |= ANYOF_WARN_SUPER__shared; 10983 10984 /* Because an ANYOF node is the only one that warns, this node 10985 * can't be optimized into something else */ 10986 optimizable = FALSE; 10987 } 10988 } 10989 10990 /* Here, we have calculated what code points should be in the character 10991 * class. 10992 * 10993 * Now we can see about various optimizations. Fold calculation (which we 10994 * did above) needs to take place before inversion. Otherwise /[^k]/i 10995 * would invert to include K, which under /i would match k, which it 10996 * shouldn't. Therefore we can't invert folded locale now, as it won't be 10997 * folded until runtime */ 10998 10999 /* If we didn't do folding, it's because some information isn't available 11000 * until runtime; set the run-time fold flag for these We know to set the 11001 * flag if we have a non-NULL list for UTF-8 locales, or the class matches 11002 * at least one 0-255 range code point */ 11003 if (LOC && FOLD) { 11004 11005 /* Some things on the list might be unconditionally included because of 11006 * other components. Remove them, and clean up the list if it goes to 11007 * 0 elements */ 11008 if (only_utf8_locale_list && cp_list) { 11009 _invlist_subtract(only_utf8_locale_list, cp_list, 11010 &only_utf8_locale_list); 11011 11012 if (_invlist_len(only_utf8_locale_list) == 0) { 11013 SvREFCNT_dec_NN(only_utf8_locale_list); 11014 only_utf8_locale_list = NULL; 11015 } 11016 } 11017 if ( only_utf8_locale_list 11018 || ( cp_list 11019 && ( _invlist_contains_cp(cp_list, 11020 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) 11021 || _invlist_contains_cp(cp_list, 11022 LATIN_SMALL_LETTER_DOTLESS_I)))) 11023 { 11024 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 11025 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 11026 } 11027 else if (cp_list && invlist_lowest(cp_list) < 256) { 11028 /* If nothing is below 256, has no locale dependency; otherwise it 11029 * does */ 11030 anyof_flags |= ANYOFL_FOLD; 11031 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 11032 11033 /* In a Turkish locale these could match, notify the run-time code 11034 * to check for that */ 11035 if ( _invlist_contains_cp(cp_list, 'I') 11036 || _invlist_contains_cp(cp_list, 'i')) 11037 { 11038 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 11039 } 11040 } 11041 } 11042 else if ( DEPENDS_SEMANTICS 11043 && ( upper_latin1_only_utf8_matches 11044 || ( anyof_flags 11045 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))) 11046 { 11047 RExC_seen_d_op = TRUE; 11048 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY; 11049 } 11050 11051 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at 11052 * compile time. */ 11053 if ( cp_list 11054 && invert 11055 && ! has_runtime_dependency) 11056 { 11057 _invlist_invert(cp_list); 11058 11059 /* Clear the invert flag since have just done it here */ 11060 invert = FALSE; 11061 } 11062 11063 /* All possible optimizations below still have these characteristics. 11064 * (Multi-char folds aren't SIMPLE, but they don't get this far in this 11065 * routine) */ 11066 *flagp |= HASWIDTH|SIMPLE; 11067 11068 if (ret_invlist) { 11069 *ret_invlist = cp_list; 11070 11071 return (cp_list) ? RExC_emit : 0; 11072 } 11073 11074 if (anyof_flags & ANYOF_LOCALE_FLAGS) { 11075 RExC_contains_locale = 1; 11076 } 11077 11078 if (optimizable) { 11079 11080 /* Some character classes are equivalent to other nodes. Such nodes 11081 * take up less room, and some nodes require fewer operations to 11082 * execute, than ANYOF nodes. EXACTish nodes may be joinable with 11083 * adjacent nodes to improve efficiency. */ 11084 op = optimize_regclass(pRExC_state, cp_list, 11085 only_utf8_locale_list, 11086 upper_latin1_only_utf8_matches, 11087 has_runtime_dependency, 11088 posixl, 11089 &anyof_flags, &invert, &ret, flagp); 11090 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 11091 11092 /* If optimized to something else and emitted, clean up and return */ 11093 if (ret >= 0) { 11094 SvREFCNT_dec(cp_list);; 11095 SvREFCNT_dec(only_utf8_locale_list); 11096 SvREFCNT_dec(upper_latin1_only_utf8_matches); 11097 return ret; 11098 } 11099 11100 /* If no optimization was found, an END was returned and we will now 11101 * emit an ANYOF */ 11102 if (op == END) { 11103 op = ANYOF; 11104 } 11105 } 11106 11107 /* Here are going to emit an ANYOF; set the particular type */ 11108 if (op == ANYOF) { 11109 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) { 11110 op = ANYOFD; 11111 } 11112 else if (posixl) { 11113 op = ANYOFPOSIXL; 11114 } 11115 else if (LOC) { 11116 op = ANYOFL; 11117 } 11118 } 11119 11120 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 11121 FILL_NODE(ret, op); /* We set the argument later */ 11122 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op); 11123 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags; 11124 11125 /* Here, <cp_list> contains all the code points we can determine at 11126 * compile time that match under all conditions. Go through it, and 11127 * for things that belong in the bitmap, put them there, and delete from 11128 * <cp_list>. While we are at it, see if everything above 255 is in the 11129 * list, and if so, set a flag to speed up execution */ 11130 11131 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list); 11132 11133 if (posixl) { 11134 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl); 11135 } 11136 11137 if (invert) { 11138 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT; 11139 } 11140 11141 /* Here, the bitmap has been populated with all the Latin1 code points that 11142 * always match. Can now add to the overall list those that match only 11143 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>). 11144 * */ 11145 if (upper_latin1_only_utf8_matches) { 11146 if (cp_list) { 11147 _invlist_union(cp_list, 11148 upper_latin1_only_utf8_matches, 11149 &cp_list); 11150 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 11151 } 11152 else { 11153 cp_list = upper_latin1_only_utf8_matches; 11154 } 11155 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 11156 } 11157 11158 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list, 11159 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) 11160 ? listsv 11161 : NULL, 11162 only_utf8_locale_list); 11163 11164 SvREFCNT_dec(cp_list);; 11165 SvREFCNT_dec(only_utf8_locale_list); 11166 return ret; 11167} 11168 11169STATIC U8 11170S_optimize_regclass(pTHX_ 11171 RExC_state_t *pRExC_state, 11172 SV * cp_list, 11173 SV* only_utf8_locale_list, 11174 SV* upper_latin1_only_utf8_matches, 11175 const U32 has_runtime_dependency, 11176 const U32 posixl, 11177 U8 * anyof_flags, 11178 bool * invert, 11179 regnode_offset * ret, 11180 I32 *flagp 11181 ) 11182{ 11183 /* This function exists just to make S_regclass() smaller. It extracts out 11184 * the code that looks for potential optimizations away from a full generic 11185 * ANYOF node. The parameter names are the same as the corresponding 11186 * variables in S_regclass. 11187 * 11188 * It returns the new op (the impossible END one if no optimization found) 11189 * and sets *ret to any created regnode. If the new op is sufficiently 11190 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass. 11191 * 11192 * Certain of the parameters may be updated as a result of the changes 11193 * herein */ 11194 11195 U8 op = END; /* The returned node-type, initialized to an impossible 11196 one. */ 11197 UV value = 0; 11198 PERL_UINT_FAST8_T i; 11199 UV partial_cp_count = 0; 11200 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */ 11201 UV end[MAX_FOLD_FROMS+1] = { 0 }; 11202 bool single_range = FALSE; 11203 UV lowest_cp = 0, highest_cp = 0; 11204 11205 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS; 11206 11207 if (cp_list) { /* Count the code points in enough ranges that we would see 11208 all the ones possible in any fold in this version of 11209 Unicode */ 11210 11211 invlist_iterinit(cp_list); 11212 for (i = 0; i <= MAX_FOLD_FROMS; i++) { 11213 if (! invlist_iternext(cp_list, &start[i], &end[i])) { 11214 break; 11215 } 11216 partial_cp_count += end[i] - start[i] + 1; 11217 } 11218 11219 if (i == 1) { 11220 single_range = TRUE; 11221 } 11222 invlist_iterfinish(cp_list); 11223 11224 /* If we know at compile time that this matches every possible code 11225 * point, any run-time dependencies don't matter */ 11226 if (start[0] == 0 && end[0] == UV_MAX) { 11227 if (*invert) { 11228 goto return_OPFAIL; 11229 } 11230 else { 11231 goto return_SANY; 11232 } 11233 } 11234 11235 /* Use a clearer mnemonic for below */ 11236 lowest_cp = start[0]; 11237 11238 highest_cp = invlist_highest(cp_list); 11239 } 11240 11241 /* Similarly, for /l posix classes, if both a class and its complement 11242 * match, any run-time dependencies don't matter */ 11243 if (posixl) { 11244 int namedclass; 11245 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) { 11246 if ( POSIXL_TEST(posixl, namedclass) /* class */ 11247 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */ 11248 { 11249 if (*invert) { 11250 goto return_OPFAIL; 11251 } 11252 goto return_SANY; 11253 } 11254 } 11255 11256 /* For well-behaved locales, some classes are subsets of others, so 11257 * complementing the subset and including the non-complemented superset 11258 * should match everything, like [\D[:alnum:]], and 11259 * [[:^alpha:][:alnum:]], but some implementations of locales are 11260 * buggy, and khw thinks its a bad idea to have optimization change 11261 * behavior, even if it avoids an OS bug in a given case */ 11262 11263#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n) 11264 11265 /* If is a single posix /l class, can optimize to just that op. Such a 11266 * node will not match anything in the Latin1 range, as that is not 11267 * determinable until runtime, but will match whatever the class does 11268 * outside that range. (Note that some classes won't match anything 11269 * outside the range, like [:ascii:]) */ 11270 if ( isSINGLE_BIT_SET(posixl) 11271 && (partial_cp_count == 0 || lowest_cp > 255)) 11272 { 11273 U8 classnum; 11274 SV * class_above_latin1 = NULL; 11275 bool already_inverted; 11276 bool are_equivalent; 11277 11278 11279 namedclass = single_1bit_pos32(posixl); 11280 classnum = namedclass_to_classnum(namedclass); 11281 11282 /* The named classes are such that the inverted number is one 11283 * larger than the non-inverted one */ 11284 already_inverted = namedclass - classnum_to_namedclass(classnum); 11285 11286 /* Create an inversion list of the official property, inverted if 11287 * the constructed node list is inverted, and restricted to only 11288 * the above latin1 code points, which are the only ones known at 11289 * compile time */ 11290 _invlist_intersection_maybe_complement_2nd( 11291 PL_AboveLatin1, 11292 PL_XPosix_ptrs[classnum], 11293 already_inverted, 11294 &class_above_latin1); 11295 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE); 11296 SvREFCNT_dec_NN(class_above_latin1); 11297 11298 if (are_equivalent) { 11299 11300 /* Resolve the run-time inversion flag with this possibly 11301 * inverted class */ 11302 *invert = *invert ^ already_inverted; 11303 11304 op = POSIXL + *invert * (NPOSIXL - POSIXL); 11305 *ret = reg_node(pRExC_state, op); 11306 FLAGS(REGNODE_p(*ret)) = classnum; 11307 return op; 11308 } 11309 } 11310 } 11311 11312 /* khw can't think of any other possible transformation involving these. */ 11313 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) { 11314 return END; 11315 } 11316 11317 if (! has_runtime_dependency) { 11318 11319 /* If the list is empty, nothing matches. This happens, for example, 11320 * when a Unicode property that doesn't match anything is the only 11321 * element in the character class (perluniprops.pod notes such 11322 * properties). */ 11323 if (partial_cp_count == 0) { 11324 if (*invert) { 11325 goto return_SANY; 11326 } 11327 else { 11328 goto return_OPFAIL; 11329 } 11330 } 11331 11332 /* If matches everything but \n */ 11333 if ( start[0] == 0 && end[0] == '\n' - 1 11334 && start[1] == '\n' + 1 && end[1] == UV_MAX) 11335 { 11336 assert (! *invert); 11337 op = REG_ANY; 11338 *ret = reg_node(pRExC_state, op); 11339 MARK_NAUGHTY(1); 11340 return op; 11341 } 11342 } 11343 11344 /* Next see if can optimize classes that contain just a few code points 11345 * into an EXACTish node. The reason to do this is to let the optimizer 11346 * join this node with adjacent EXACTish ones, and ANYOF nodes require 11347 * runtime conversion to code point from UTF-8, which we'd like to avoid. 11348 * 11349 * An EXACTFish node can be generated even if not under /i, and vice versa. 11350 * But care must be taken. An EXACTFish node has to be such that it only 11351 * matches precisely the code points in the class, but we want to generate 11352 * the least restrictive one that does that, to increase the odds of being 11353 * able to join with an adjacent node. For example, if the class contains 11354 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN 11355 * from matching. Whether we are under /i or not is irrelevant in this 11356 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER 11357 * LETTER APOSTROPHE. That is supposed to match the single character U+0149 11358 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there 11359 * is no simple fold that includes \X{02BC}, there is a multi-char fold 11360 * that does, and so the node generated for it must be an EXACTFish one. 11361 * On the other hand qr/:/i should generate a plain EXACT node since the 11362 * colon participates in no fold whatsoever, and having it be EXACT tells 11363 * the optimizer the target string cannot match unless it has a colon in 11364 * it. */ 11365 if ( ! posixl 11366 && ! *invert 11367 11368 /* Only try if there are no more code points in the class than in 11369 * the max possible fold */ 11370 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1)) 11371 { 11372 /* We can always make a single code point class into an EXACTish node. 11373 * */ 11374 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) { 11375 if (LOC) { 11376 11377 /* Here is /l: Use EXACTL, except if there is a fold not known 11378 * until runtime so shows as only a single code point here. 11379 * For code points above 255, we know which can cause problems 11380 * by having a potential fold to the Latin1 range. */ 11381 if ( ! FOLD 11382 || ( lowest_cp > 255 11383 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp))) 11384 { 11385 op = EXACTL; 11386 } 11387 else { 11388 op = EXACTFL; 11389 } 11390 } 11391 else if (! FOLD) { /* Not /l and not /i */ 11392 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8; 11393 } 11394 else if (lowest_cp < 256) { /* /i, not /l, and the code point is 11395 small */ 11396 11397 /* Under /i, it gets a little tricky. A code point that 11398 * doesn't participate in a fold should be an EXACT node. We 11399 * know this one isn't the result of a simple fold, or there'd 11400 * be more than one code point in the list, but it could be 11401 * part of a multi-character fold. In that case we better not 11402 * create an EXACT node, as we would wrongly be telling the 11403 * optimizer that this code point must be in the target string, 11404 * and that is wrong. This is because if the sequence around 11405 * this code point forms a multi-char fold, what needs to be in 11406 * the string could be the code point that folds to the 11407 * sequence. 11408 * 11409 * This handles the case of below-255 code points, as we have 11410 * an easy look up for those. The next clause handles the 11411 * above-256 one */ 11412 op = IS_IN_SOME_FOLD_L1(lowest_cp) 11413 ? EXACTFU 11414 : EXACT; 11415 } 11416 else { /* /i, larger code point. Since we are under /i, and have 11417 just this code point, we know that it can't fold to 11418 something else, so PL_InMultiCharFold applies to it */ 11419 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp)) 11420 ? EXACTFU_REQ8 11421 : EXACT_REQ8; 11422 } 11423 11424 value = lowest_cp; 11425 } 11426 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY) 11427 && _invlist_contains_cp(PL_in_some_fold, lowest_cp)) 11428 { 11429 /* Here, the only runtime dependency, if any, is from /d, and the 11430 * class matches more than one code point, and the lowest code 11431 * point participates in some fold. It might be that the other 11432 * code points are /i equivalent to this one, and hence they would 11433 * be representable by an EXACTFish node. Above, we eliminated 11434 * classes that contain too many code points to be EXACTFish, with 11435 * the test for MAX_FOLD_FROMS 11436 * 11437 * First, special case the ASCII fold pairs, like 'B' and 'b'. We 11438 * do this because we have EXACTFAA at our disposal for the ASCII 11439 * range */ 11440 if (partial_cp_count == 2 && isASCII(lowest_cp)) { 11441 11442 /* The only ASCII characters that participate in folds are 11443 * alphabetics */ 11444 assert(isALPHA(lowest_cp)); 11445 if ( end[0] == start[0] /* First range is a single 11446 character, so 2nd exists */ 11447 && isALPHA_FOLD_EQ(start[0], start[1])) 11448 { 11449 /* Here, is part of an ASCII fold pair */ 11450 11451 if ( ASCII_FOLD_RESTRICTED 11452 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp)) 11453 { 11454 /* If the second clause just above was true, it means 11455 * we can't be under /i, or else the list would have 11456 * included more than this fold pair. Therefore we 11457 * have to exclude the possibility of whatever else it 11458 * is that folds to these, by using EXACTFAA */ 11459 op = EXACTFAA; 11460 } 11461 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) { 11462 11463 /* Here, there's no simple fold that lowest_cp is part 11464 * of, but there is a multi-character one. If we are 11465 * not under /i, we want to exclude that possibility; 11466 * if under /i, we want to include it */ 11467 op = (FOLD) ? EXACTFU : EXACTFAA; 11468 } 11469 else { 11470 11471 /* Here, the only possible fold lowest_cp participates in 11472 * is with start[1]. /i or not isn't relevant */ 11473 op = EXACTFU; 11474 } 11475 11476 value = toFOLD(lowest_cp); 11477 } 11478 } 11479 else if ( ! upper_latin1_only_utf8_matches 11480 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2 11481 && PL_fold_latin1[ 11482 invlist_highest(upper_latin1_only_utf8_matches)] 11483 == lowest_cp)) 11484 { 11485 /* Here, the smallest character is non-ascii or there are more 11486 * than 2 code points matched by this node. Also, we either 11487 * don't have /d UTF-8 dependent matches, or if we do, they 11488 * look like they could be a single character that is the fold 11489 * of the lowest one is in the always-match list. This test 11490 * quickly excludes most of the false positives when there are 11491 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL 11492 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE 11493 * iff the target string is UTF-8. (We don't have to worry 11494 * above about exceeding the array bounds of PL_fold_latin1[] 11495 * because any code point in 'upper_latin1_only_utf8_matches' 11496 * is below 256.) 11497 * 11498 * EXACTFAA would apply only to pairs (hence exactly 2 code 11499 * points) in the ASCII range, so we can't use it here to 11500 * artificially restrict the fold domain, so we check if the 11501 * class does or does not match some EXACTFish node. Further, 11502 * if we aren't under /i, and and the folded-to character is 11503 * part of a multi-character fold, we can't do this 11504 * optimization, as the sequence around it could be that 11505 * multi-character fold, and we don't here know the context, so 11506 * we have to assume it is that multi-char fold, to prevent 11507 * potential bugs. 11508 * 11509 * To do the general case, we first find the fold of the lowest 11510 * code point (which may be higher than that lowest unfolded 11511 * one), then find everything that folds to it. (The data 11512 * structure we have only maps from the folded code points, so 11513 * we have to do the earlier step.) */ 11514 11515 Size_t foldlen; 11516 U8 foldbuf[UTF8_MAXBYTES_CASE]; 11517 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0); 11518 U32 first_fold; 11519 const U32 * remaining_folds; 11520 Size_t folds_to_this_cp_count = _inverse_folds( 11521 folded, 11522 &first_fold, 11523 &remaining_folds); 11524 Size_t folds_count = folds_to_this_cp_count + 1; 11525 SV * fold_list = _new_invlist(folds_count); 11526 unsigned int i; 11527 11528 /* If there are UTF-8 dependent matches, create a temporary 11529 * list of what this node matches, including them. */ 11530 SV * all_cp_list = NULL; 11531 SV ** use_this_list = &cp_list; 11532 11533 if (upper_latin1_only_utf8_matches) { 11534 all_cp_list = _new_invlist(0); 11535 use_this_list = &all_cp_list; 11536 _invlist_union(cp_list, 11537 upper_latin1_only_utf8_matches, 11538 use_this_list); 11539 } 11540 11541 /* Having gotten everything that participates in the fold 11542 * containing the lowest code point, we turn that into an 11543 * inversion list, making sure everything is included. */ 11544 fold_list = add_cp_to_invlist(fold_list, lowest_cp); 11545 fold_list = add_cp_to_invlist(fold_list, folded); 11546 if (folds_to_this_cp_count > 0) { 11547 fold_list = add_cp_to_invlist(fold_list, first_fold); 11548 for (i = 0; i + 1 < folds_to_this_cp_count; i++) { 11549 fold_list = add_cp_to_invlist(fold_list, 11550 remaining_folds[i]); 11551 } 11552 } 11553 11554 /* If the fold list is identical to what's in this ANYOF node, 11555 * the node can be represented by an EXACTFish one instead */ 11556 if (_invlistEQ(*use_this_list, fold_list, 11557 0 /* Don't complement */ ) 11558 ) { 11559 11560 /* But, we have to be careful, as mentioned above. Just 11561 * the right sequence of characters could match this if it 11562 * is part of a multi-character fold. That IS what we want 11563 * if we are under /i. But it ISN'T what we want if not 11564 * under /i, as it could match when it shouldn't. So, when 11565 * we aren't under /i and this character participates in a 11566 * multi-char fold, we don't optimize into an EXACTFish 11567 * node. So, for each case below we have to check if we 11568 * are folding, and if not, if it is not part of a 11569 * multi-char fold. */ 11570 if (lowest_cp > 255) { /* Highish code point */ 11571 if (FOLD || ! _invlist_contains_cp( 11572 PL_InMultiCharFold, folded)) 11573 { 11574 op = (LOC) 11575 ? EXACTFLU8 11576 : (ASCII_FOLD_RESTRICTED) 11577 ? EXACTFAA 11578 : EXACTFU_REQ8; 11579 value = folded; 11580 } 11581 } /* Below, the lowest code point < 256 */ 11582 else if ( FOLD 11583 && folded == 's' 11584 && DEPENDS_SEMANTICS) 11585 { /* An EXACTF node containing a single character 's', 11586 can be an EXACTFU if it doesn't get joined with an 11587 adjacent 's' */ 11588 op = EXACTFU_S_EDGE; 11589 value = folded; 11590 } 11591 else if ( FOLD 11592 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) 11593 { 11594 if (upper_latin1_only_utf8_matches) { 11595 op = EXACTF; 11596 11597 /* We can't use the fold, as that only matches 11598 * under UTF-8 */ 11599 value = lowest_cp; 11600 } 11601 else if ( UNLIKELY(lowest_cp == MICRO_SIGN) 11602 && ! UTF) 11603 { /* EXACTFUP is a special node for this character */ 11604 op = (ASCII_FOLD_RESTRICTED) 11605 ? EXACTFAA 11606 : EXACTFUP; 11607 value = MICRO_SIGN; 11608 } 11609 else if ( ASCII_FOLD_RESTRICTED 11610 && ! isASCII(lowest_cp)) 11611 { /* For ASCII under /iaa, we can use EXACTFU below 11612 */ 11613 op = EXACTFAA; 11614 value = folded; 11615 } 11616 else { 11617 op = EXACTFU; 11618 value = folded; 11619 } 11620 } 11621 } 11622 11623 SvREFCNT_dec_NN(fold_list); 11624 SvREFCNT_dec(all_cp_list); 11625 } 11626 } 11627 11628 if (op != END) { 11629 U8 len; 11630 11631 /* Here, we have calculated what EXACTish node to use. Have to 11632 * convert to UTF-8 if not already there */ 11633 if (value > 255) { 11634 if (! UTF) { 11635 SvREFCNT_dec(cp_list);; 11636 REQUIRE_UTF8(flagp); 11637 } 11638 11639 /* This is a kludge to the special casing issues with this 11640 * ligature under /aa. FB05 should fold to FB06, but the call 11641 * above to _to_uni_fold_flags() didn't find this, as it didn't 11642 * use the /aa restriction in order to not miss other folds 11643 * that would be affected. This is the only instance likely to 11644 * ever be a problem in all of Unicode. So special case it. */ 11645 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T 11646 && ASCII_FOLD_RESTRICTED) 11647 { 11648 value = LATIN_SMALL_LIGATURE_ST; 11649 } 11650 } 11651 11652 len = (UTF) ? UVCHR_SKIP(value) : 1; 11653 11654 *ret = REGNODE_GUTS(pRExC_state, op, len); 11655 FILL_NODE(*ret, op); 11656 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len); 11657 setSTR_LEN(REGNODE_p(*ret), len); 11658 if (len == 1) { 11659 *STRINGs(REGNODE_p(*ret)) = (U8) value; 11660 } 11661 else { 11662 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value); 11663 } 11664 11665 return op; 11666 } 11667 } 11668 11669 if (! has_runtime_dependency) { 11670 11671 /* See if this can be turned into an ANYOFM node. Think about the bit 11672 * patterns in two different bytes. In some positions, the bits in 11673 * each will be 1; and in other positions both will be 0; and in some 11674 * positions the bit will be 1 in one byte, and 0 in the other. Let 11675 * 'n' be the number of positions where the bits differ. We create a 11676 * mask which has exactly 'n' 0 bits, each in a position where the two 11677 * bytes differ. Now take the set of all bytes that when ANDed with 11678 * the mask yield the same result. That set has 2**n elements, and is 11679 * representable by just two 8 bit numbers: the result and the mask. 11680 * Importantly, matching the set can be vectorized by creating a word 11681 * full of the result bytes, and a word full of the mask bytes, 11682 * yielding a significant speed up. Here, see if this node matches 11683 * such a set. As a concrete example consider [01], and the byte 11684 * representing '0' which is 0x30 on ASCII machines. It has the bits 11685 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with 11686 * that mask we get 0x30. Any other bytes ANDed yield something else. 11687 * So [01], which is a common usage, is optimizable into ANYOFM, and 11688 * can benefit from the speed up. We can only do this on UTF-8 11689 * invariant bytes, because they have the same bit patterns under UTF-8 11690 * as not. */ 11691 PERL_UINT_FAST8_T inverted = 0; 11692 11693 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on 11694 * EBCDIC */ 11695 const PERL_UINT_FAST8_T max_permissible 11696 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT); 11697 11698 /* If doesn't fit the criteria for ANYOFM, invert and try again. If 11699 * that works we will instead later generate an NANYOFM, and invert 11700 * back when through */ 11701 if (highest_cp > max_permissible) { 11702 _invlist_invert(cp_list); 11703 inverted = 1; 11704 } 11705 11706 if (invlist_highest(cp_list) <= max_permissible) { 11707 UV this_start, this_end; 11708 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */ 11709 U8 bits_differing = 0; 11710 Size_t full_cp_count = 0; 11711 bool first_time = TRUE; 11712 11713 /* Go through the bytes and find the bit positions that differ */ 11714 invlist_iterinit(cp_list); 11715 while (invlist_iternext(cp_list, &this_start, &this_end)) { 11716 unsigned int i = this_start; 11717 11718 if (first_time) { 11719 if (! UVCHR_IS_INVARIANT(i)) { 11720 goto done_anyofm; 11721 } 11722 11723 first_time = FALSE; 11724 lowest_cp = this_start; 11725 11726 /* We have set up the code point to compare with. Don't 11727 * compare it with itself */ 11728 i++; 11729 } 11730 11731 /* Find the bit positions that differ from the lowest code 11732 * point in the node. Keep track of all such positions by 11733 * OR'ing */ 11734 for (; i <= this_end; i++) { 11735 if (! UVCHR_IS_INVARIANT(i)) { 11736 goto done_anyofm; 11737 } 11738 11739 bits_differing |= i ^ lowest_cp; 11740 } 11741 11742 full_cp_count += this_end - this_start + 1; 11743 } 11744 11745 /* At the end of the loop, we count how many bits differ from the 11746 * bits in lowest code point, call the count 'd'. If the set we 11747 * found contains 2**d elements, it is the closure of all code 11748 * points that differ only in those bit positions. To convince 11749 * yourself of that, first note that the number in the closure must 11750 * be a power of 2, which we test for. The only way we could have 11751 * that count and it be some differing set, is if we got some code 11752 * points that don't differ from the lowest code point in any 11753 * position, but do differ from each other in some other position. 11754 * That means one code point has a 1 in that position, and another 11755 * has a 0. But that would mean that one of them differs from the 11756 * lowest code point in that position, which possibility we've 11757 * already excluded. */ 11758 if ( (inverted || full_cp_count > 1) 11759 && full_cp_count == 1U << PL_bitcount[bits_differing]) 11760 { 11761 U8 ANYOFM_mask; 11762 11763 op = ANYOFM + inverted;; 11764 11765 /* We need to make the bits that differ be 0's */ 11766 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */ 11767 11768 /* The argument is the lowest code point */ 11769 *ret = reg1node(pRExC_state, op, lowest_cp); 11770 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask; 11771 } 11772 11773 done_anyofm: 11774 invlist_iterfinish(cp_list); 11775 } 11776 11777 if (inverted) { 11778 _invlist_invert(cp_list); 11779 } 11780 11781 if (op != END) { 11782 return op; 11783 } 11784 11785 /* XXX We could create an ANYOFR_LOW node here if we saved above if all 11786 * were invariants, it wasn't inverted, and there is a single range. 11787 * This would be faster than some of the posix nodes we create below 11788 * like /\d/a, but would be twice the size. Without having actually 11789 * measured the gain, khw doesn't think the tradeoff is really worth it 11790 * */ 11791 } 11792 11793 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) { 11794 PERL_UINT_FAST8_T type; 11795 SV * intersection = NULL; 11796 SV* d_invlist = NULL; 11797 11798 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD 11799 * ones are about the same speed as ANYOF ops, but take less room; the 11800 * ones that have above-Latin1 code point matches are somewhat faster 11801 * than ANYOF. */ 11802 11803 for (type = POSIXA; type >= POSIXD; type--) { 11804 int posix_class; 11805 11806 if (type == POSIXL) { /* But not /l posix classes */ 11807 continue; 11808 } 11809 11810 for (posix_class = 0; 11811 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_; 11812 posix_class++) 11813 { 11814 SV** our_code_points = &cp_list; 11815 SV** official_code_points; 11816 int try_inverted; 11817 11818 if (type == POSIXA) { 11819 official_code_points = &PL_Posix_ptrs[posix_class]; 11820 } 11821 else { 11822 official_code_points = &PL_XPosix_ptrs[posix_class]; 11823 } 11824 11825 /* Skip non-existent classes of this type. e.g. \v only has an 11826 * entry in PL_XPosix_ptrs */ 11827 if (! *official_code_points) { 11828 continue; 11829 } 11830 11831 /* Try both the regular class, and its inversion */ 11832 for (try_inverted = 0; try_inverted < 2; try_inverted++) { 11833 bool this_inverted = *invert ^ try_inverted; 11834 11835 if (type != POSIXD) { 11836 11837 /* This class that isn't /d can't match if we have /d 11838 * dependencies */ 11839 if (has_runtime_dependency 11840 & HAS_D_RUNTIME_DEPENDENCY) 11841 { 11842 continue; 11843 } 11844 } 11845 else /* is /d */ if (! this_inverted) { 11846 11847 /* /d classes don't match anything non-ASCII below 256 11848 * unconditionally (which cp_list contains) */ 11849 _invlist_intersection(cp_list, PL_UpperLatin1, 11850 &intersection); 11851 if (_invlist_len(intersection) != 0) { 11852 continue; 11853 } 11854 11855 SvREFCNT_dec(d_invlist); 11856 d_invlist = invlist_clone(cp_list, NULL); 11857 11858 /* But under UTF-8 it turns into using /u rules. Add 11859 * the things it matches under these conditions so that 11860 * we check below that these are identical to what the 11861 * tested class should match */ 11862 if (upper_latin1_only_utf8_matches) { 11863 _invlist_union( 11864 d_invlist, 11865 upper_latin1_only_utf8_matches, 11866 &d_invlist); 11867 } 11868 our_code_points = &d_invlist; 11869 } 11870 else { /* POSIXD, inverted. If this doesn't have this 11871 flag set, it isn't /d. */ 11872 if (! ( *anyof_flags 11873 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)) 11874 { 11875 continue; 11876 } 11877 11878 our_code_points = &cp_list; 11879 } 11880 11881 /* Here, have weeded out some things. We want to see if 11882 * the list of characters this node contains 11883 * ('*our_code_points') precisely matches those of the 11884 * class we are currently checking against 11885 * ('*official_code_points'). */ 11886 if (_invlistEQ(*our_code_points, 11887 *official_code_points, 11888 try_inverted)) 11889 { 11890 /* Here, they precisely match. Optimize this ANYOF 11891 * node into its equivalent POSIX one of the correct 11892 * type, possibly inverted. 11893 * 11894 * Some of these nodes match a single range of 11895 * characters (or [:alpha:] matches two parallel ranges 11896 * on ASCII platforms). The array lookup at execution 11897 * time could be replaced by a range check for such 11898 * nodes. But regnodes are a finite resource, and the 11899 * possible performance boost isn't large, so this 11900 * hasn't been done. An attempt to use just one node 11901 * (and its inverse) to encompass all such cases was 11902 * made in d62feba66bf43f35d092bb026694f927e9f94d38. 11903 * But the shifting/masking it used ended up being 11904 * slower than the array look up, so it was reverted */ 11905 op = (try_inverted) 11906 ? type + NPOSIXA - POSIXA 11907 : type; 11908 *ret = reg_node(pRExC_state, op); 11909 FLAGS(REGNODE_p(*ret)) = posix_class; 11910 SvREFCNT_dec(d_invlist); 11911 SvREFCNT_dec(intersection); 11912 return op; 11913 } 11914 } 11915 } 11916 } 11917 SvREFCNT_dec(d_invlist); 11918 SvREFCNT_dec(intersection); 11919 } 11920 11921 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both 11922 * in size and speed. Currently, a 20 bit range base (smallest code point 11923 * in the range), and a 12 bit maximum delta are packed into a 32 bit word. 11924 * This allows for using it on all of the Unicode code points except for 11925 * the highest plane, which is only for private use code points. khw 11926 * doubts that a bigger delta is likely in real world applications */ 11927 if ( single_range 11928 && ! has_runtime_dependency 11929 && *anyof_flags == 0 11930 && start[0] < (1 << ANYOFR_BASE_BITS) 11931 && end[0] - start[0] 11932 < ((1U << (sizeof(ARG1u_LOC(NULL)) 11933 * CHARBITS - ANYOFR_BASE_BITS)))) 11934 11935 { 11936 U8 low_utf8[UTF8_MAXBYTES+1]; 11937 U8 high_utf8[UTF8_MAXBYTES+1]; 11938 11939 op = ANYOFR; 11940 *ret = reg1node(pRExC_state, op, 11941 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS)); 11942 11943 /* Place the lowest UTF-8 start byte in the flags field, so as to allow 11944 * efficient ruling out at run time of many possible inputs. */ 11945 (void) uvchr_to_utf8(low_utf8, start[0]); 11946 (void) uvchr_to_utf8(high_utf8, end[0]); 11947 11948 /* If all code points share the same first byte, this can be an 11949 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can 11950 * quickly rule out many inputs at run-time without having to compute 11951 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that 11952 * transformation would not rule out nearly so many things */ 11953 if (low_utf8[0] == high_utf8[0]) { 11954 op = ANYOFRb; 11955 OP(REGNODE_p(*ret)) = op; 11956 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0]; 11957 } 11958 else { 11959 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]); 11960 } 11961 11962 return op; 11963 } 11964 11965 /* If didn't find an optimization and there is no need for a bitmap, 11966 * of the lowest code points, optimize to indicate that */ 11967 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS 11968 && ! LOC 11969 && ! upper_latin1_only_utf8_matches 11970 && *anyof_flags == 0) 11971 { 11972 U8 low_utf8[UTF8_MAXBYTES+1]; 11973 UV highest_cp = invlist_highest(cp_list); 11974 11975 /* Currently the maximum allowed code point by the system is IV_MAX. 11976 * Higher ones are reserved for future internal use. This particular 11977 * regnode can be used for higher ones, but we can't calculate the code 11978 * point of those. IV_MAX suffices though, as it will be a large first 11979 * byte */ 11980 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX)) 11981 - low_utf8; 11982 11983 /* We store the lowest possible first byte of the UTF-8 representation, 11984 * using the flags field. This allows for quick ruling out of some 11985 * inputs without having to convert from UTF-8 to code point. For 11986 * EBCDIC, we use I8, as not doing that transformation would not rule 11987 * out nearly so many things */ 11988 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]); 11989 11990 op = ANYOFH; 11991 11992 /* If the first UTF-8 start byte for the highest code point in the 11993 * range is suitably small, we may be able to get an upper bound as 11994 * well */ 11995 if (highest_cp <= IV_MAX) { 11996 U8 high_utf8[UTF8_MAXBYTES+1]; 11997 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8; 11998 11999 /* If the lowest and highest are the same, we can get an exact 12000 * first byte instead of a just minimum or even a sequence of exact 12001 * leading bytes. We signal these with different regnodes */ 12002 if (low_utf8[0] == high_utf8[0]) { 12003 Size_t len = find_first_differing_byte_pos(low_utf8, 12004 high_utf8, 12005 MIN(low_len, high_len)); 12006 if (len == 1) { 12007 12008 /* No need to convert to I8 for EBCDIC as this is an exact 12009 * match */ 12010 *anyof_flags = low_utf8[0]; 12011 12012 if (high_len == 2) { 12013 /* If the elements matched all have a 2-byte UTF-8 12014 * representation, with the first byte being the same, 12015 * we can use a compact, fast regnode. capable of 12016 * matching any combination of continuation byte 12017 * patterns. 12018 * 12019 * (A similar regnode could be created for the Latin1 12020 * range; the complication being that it could match 12021 * non-UTF8 targets. The internal bitmap would serve 12022 * both cases; with some extra code in regexec.c) */ 12023 op = ANYOFHbbm; 12024 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12025 FILL_NODE(*ret, op); 12026 FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0], 12027 12028 /* The 64 bit (or 32 on EBCCDIC) map can be looked up 12029 * directly based on the continuation byte, without 12030 * needing to convert to code point */ 12031 populate_bitmap_from_invlist( 12032 cp_list, 12033 12034 /* The base code point is from the start byte */ 12035 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0], 12036 UTF_CONTINUATION_MARK | 0), 12037 12038 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap, 12039 REGNODE_BBM_BITMAP_LEN); 12040 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op); 12041 return op; 12042 } 12043 else { 12044 op = ANYOFHb; 12045 } 12046 } 12047 else { 12048 op = ANYOFHs; 12049 *ret = REGNODE_GUTS(pRExC_state, op, 12050 REGNODE_ARG_LEN(op) + STR_SZ(len)); 12051 FILL_NODE(*ret, op); 12052 STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret)) 12053 = len; 12054 Copy(low_utf8, /* Add the common bytes */ 12055 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string, 12056 len, U8); 12057 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret))); 12058 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list, 12059 NULL, only_utf8_locale_list); 12060 return op; 12061 } 12062 } 12063 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) { 12064 12065 /* Here, the high byte is not the same as the low, but is small 12066 * enough that its reasonable to have a loose upper bound, 12067 * which is packed in with the strict lower bound. See 12068 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC 12069 * platforms, I8 is used. On ASCII platforms I8 is the same 12070 * thing as UTF-8 */ 12071 12072 U8 bits = 0; 12073 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags; 12074 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0]) 12075 - *anyof_flags; 12076 12077 if (range_diff <= max_range_diff / 8) { 12078 bits = 3; 12079 } 12080 else if (range_diff <= max_range_diff / 4) { 12081 bits = 2; 12082 } 12083 else if (range_diff <= max_range_diff / 2) { 12084 bits = 1; 12085 } 12086 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits; 12087 op = ANYOFHr; 12088 } 12089 } 12090 } 12091 12092 return op; 12093 12094 return_OPFAIL: 12095 op = OPFAIL; 12096 *ret = reg1node(pRExC_state, op, 0); 12097 return op; 12098 12099 return_SANY: 12100 op = SANY; 12101 *ret = reg_node(pRExC_state, op); 12102 MARK_NAUGHTY(1); 12103 return op; 12104} 12105 12106#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION 12107 12108void 12109Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, 12110 regnode* const node, 12111 SV* const cp_list, 12112 SV* const runtime_defns, 12113 SV* const only_utf8_locale_list) 12114{ 12115 /* Sets the arg field of an ANYOF-type node 'node', using information about 12116 * the node passed-in. If only the bitmap is needed to determine what 12117 * matches, the arg is set appropriately to either 12118 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE 12119 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE 12120 * 12121 * Otherwise, it sets the argument to the count returned by reg_add_data(), 12122 * having allocated and stored an array, av, as follows: 12123 * av[0] stores the inversion list defining this class as far as known at 12124 * this time, or PL_sv_undef if nothing definite is now known. 12125 * av[1] stores the inversion list of code points that match only if the 12126 * current locale is UTF-8, or if none, PL_sv_undef if there is an 12127 * av[2], or no entry otherwise. 12128 * av[2] stores the list of user-defined properties whose subroutine 12129 * definitions aren't known at this time, or no entry if none. */ 12130 12131 UV n; 12132 12133 PERL_ARGS_ASSERT_SET_ANYOF_ARG; 12134 12135 /* If this is set, the final disposition won't be known until runtime, so 12136 * we can't do any of the compile time optimizations */ 12137 if (! runtime_defns) { 12138 12139 /* On plain ANYOF nodes without the possibility of a runtime locale 12140 * making a difference, maybe there's no information to be gleaned 12141 * except for what's in the bitmap */ 12142 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) { 12143 12144 /* There are two such cases: 12145 * 1) there is no list of code points matched outside the bitmap 12146 */ 12147 if (! cp_list) { 12148 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE); 12149 return; 12150 } 12151 12152 /* 2) the list indicates everything outside the bitmap matches */ 12153 if ( invlist_highest(cp_list) == UV_MAX 12154 && invlist_highest_range_start(cp_list) 12155 <= NUM_ANYOF_CODE_POINTS) 12156 { 12157 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); 12158 return; 12159 } 12160 12161 /* In all other cases there are things outside the bitmap that we 12162 * may need to check at runtime. */ 12163 } 12164 12165 /* Here, we have resolved all the possible run-time matches, and they 12166 * are stored in one or both of two possible lists. (While some match 12167 * only under certain runtime circumstances, we know all the possible 12168 * ones for each such circumstance.) 12169 * 12170 * It may very well be that the pattern being compiled contains an 12171 * identical class, already encountered. Reusing that class here saves 12172 * space. Look through all classes so far encountered. */ 12173 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0; 12174 for (unsigned int i = 0; i < existing_items; i++) { 12175 12176 /* Only look at auxiliary data of this type */ 12177 if (RExC_rxi->data->what[i] != 's') { 12178 continue; 12179 } 12180 12181 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]); 12182 AV * const av = MUTABLE_AV(SvRV(rv)); 12183 12184 /* If the already encountered class has data that won't be known 12185 * until runtime (stored in the final element of the array), we 12186 * can't share */ 12187 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) { 12188 continue; 12189 } 12190 12191 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX, 12192 false /* no lvalue */); 12193 12194 /* The new and the existing one both have to have or both not 12195 * have this element, for this one to duplicate that one */ 12196 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) { 12197 continue; 12198 } 12199 12200 /* If the inversion lists aren't equivalent, can't share */ 12201 if (cp_list && ! _invlistEQ(cp_list, 12202 *stored_cp_list_ptr, 12203 FALSE /* don't complement */)) 12204 { 12205 continue; 12206 } 12207 12208 /* Similarly for the other list */ 12209 SV ** stored_only_utf8_locale_list_ptr = av_fetch( 12210 av, 12211 ONLY_LOCALE_MATCHES_INDEX, 12212 false /* no lvalue */); 12213 if ( cBOOL(only_utf8_locale_list) 12214 != cBOOL(stored_only_utf8_locale_list_ptr)) 12215 { 12216 continue; 12217 } 12218 12219 if (only_utf8_locale_list && ! _invlistEQ( 12220 only_utf8_locale_list, 12221 *stored_only_utf8_locale_list_ptr, 12222 FALSE /* don't complement */)) 12223 { 12224 continue; 12225 } 12226 12227 /* Here, the existence and contents of both compile-time lists 12228 * are identical between the new and existing data. Re-use the 12229 * existing one */ 12230 ARG1u_SET(node, i); 12231 return; 12232 } /* end of loop through existing classes */ 12233 } 12234 12235 /* Here, we need to create a new auxiliary data element; either because 12236 * this doesn't duplicate an existing one, or we can't tell at this time if 12237 * it eventually will */ 12238 12239 AV * const av = newAV(); 12240 SV *rv; 12241 12242 if (cp_list) { 12243 av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list)); 12244 } 12245 12246 /* (Note that if any of this changes, the size calculations in 12247 * S_optimize_regclass() might need to be updated.) */ 12248 12249 if (only_utf8_locale_list) { 12250 av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX, 12251 SvREFCNT_inc_NN(only_utf8_locale_list)); 12252 } 12253 12254 if (runtime_defns) { 12255 av_store_simple(av, DEFERRED_USER_DEFINED_INDEX, 12256 SvREFCNT_inc_NN(runtime_defns)); 12257 } 12258 12259 rv = newRV_noinc(MUTABLE_SV(av)); 12260 n = reg_add_data(pRExC_state, STR_WITH_LEN("s")); 12261 RExC_rxi->data->data[n] = (void*)rv; 12262 ARG1u_SET(node, n); 12263} 12264 12265SV * 12266 12267#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 12268Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist) 12269#else 12270Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist) 12271#endif 12272 12273{ 12274 /* For internal core use only. 12275 * Returns the inversion list for the input 'node' in the regex 'prog'. 12276 * If <doinit> is 'true', will attempt to create the inversion list if not 12277 * already done. If it is created, it will add to the normal inversion 12278 * list any that comes from user-defined properties. It croaks if this 12279 * is called before such a list is ready to be generated, that is when a 12280 * user-defined property has been declared, buyt still not yet defined. 12281 * If <listsvp> is non-null, will return the printable contents of the 12282 * property definition. This can be used to get debugging information 12283 * even before the inversion list exists, by calling this function with 12284 * 'doinit' set to false, in which case the components that will be used 12285 * to eventually create the inversion list are returned (in a printable 12286 * form). 12287 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to 12288 * store an inversion list of code points that should match only if the 12289 * execution-time locale is a UTF-8 one. 12290 * If <output_invlist> is not NULL, it is where this routine is to store an 12291 * inversion list of the code points that would be instead returned in 12292 * <listsvp> if this were NULL. Thus, what gets output in <listsvp> 12293 * when this parameter is used, is just the non-code point data that 12294 * will go into creating the inversion list. This currently should be just 12295 * user-defined properties whose definitions were not known at compile 12296 * time. Using this parameter allows for easier manipulation of the 12297 * inversion list's data by the caller. It is illegal to call this 12298 * function with this parameter set, but not <listsvp> 12299 * 12300 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note 12301 * that, in spite of this function's name, the inversion list it returns 12302 * may include the bitmap data as well */ 12303 12304 SV *si = NULL; /* Input initialization string */ 12305 SV* invlist = NULL; 12306 12307 RXi_GET_DECL_NULL(prog, progi); 12308 const struct reg_data * const data = prog ? progi->data : NULL; 12309 12310#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 12311 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA; 12312#else 12313 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA; 12314#endif 12315 assert(! output_invlist || listsvp); 12316 12317 if (data && data->count) { 12318 const U32 n = ARG1u(node); 12319 12320 if (data->what[n] == 's') { 12321 SV * const rv = MUTABLE_SV(data->data[n]); 12322 AV * const av = MUTABLE_AV(SvRV(rv)); 12323 SV **const ary = AvARRAY(av); 12324 12325 invlist = ary[INVLIST_INDEX]; 12326 12327 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) { 12328 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX]; 12329 } 12330 12331 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { 12332 si = ary[DEFERRED_USER_DEFINED_INDEX]; 12333 } 12334 12335 if (doinit && (si || invlist)) { 12336 if (si) { 12337 bool user_defined; 12338 SV * msg = newSVpvs_flags("", SVs_TEMP); 12339 12340 SV * prop_definition = handle_user_defined_property( 12341 "", 0, FALSE, /* There is no \p{}, \P{} */ 12342 SvPVX_const(si)[1] - '0', /* /i or not has been 12343 stored here for just 12344 this occasion */ 12345 TRUE, /* run time */ 12346 FALSE, /* This call must find the defn */ 12347 si, /* The property definition */ 12348 &user_defined, 12349 msg, 12350 0 /* base level call */ 12351 ); 12352 12353 if (SvCUR(msg)) { 12354 assert(prop_definition == NULL); 12355 12356 Perl_croak(aTHX_ "%" UTF8f, 12357 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg))); 12358 } 12359 12360 if (invlist) { 12361 _invlist_union(invlist, prop_definition, &invlist); 12362 SvREFCNT_dec_NN(prop_definition); 12363 } 12364 else { 12365 invlist = prop_definition; 12366 } 12367 12368 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX); 12369 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX); 12370 12371 ary[INVLIST_INDEX] = invlist; 12372 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX]) 12373 ? ONLY_LOCALE_MATCHES_INDEX 12374 : INVLIST_INDEX); 12375 si = NULL; 12376 } 12377 } 12378 } 12379 } 12380 12381 /* If requested, return a printable version of what this ANYOF node matches 12382 * */ 12383 if (listsvp) { 12384 SV* matches_string = NULL; 12385 12386 /* This function can be called at compile-time, before everything gets 12387 * resolved, in which case we return the currently best available 12388 * information, which is the string that will eventually be used to do 12389 * that resolving, 'si' */ 12390 if (si) { 12391 /* Here, we only have 'si' (and possibly some passed-in data in 12392 * 'invlist', which is handled below) If the caller only wants 12393 * 'si', use that. */ 12394 if (! output_invlist) { 12395 matches_string = newSVsv(si); 12396 } 12397 else { 12398 /* But if the caller wants an inversion list of the node, we 12399 * need to parse 'si' and place as much as possible in the 12400 * desired output inversion list, making 'matches_string' only 12401 * contain the currently unresolvable things */ 12402 const char *si_string = SvPVX(si); 12403 STRLEN remaining = SvCUR(si); 12404 UV prev_cp = 0; 12405 U8 count = 0; 12406 12407 /* Ignore everything before and including the first new-line */ 12408 si_string = (const char *) memchr(si_string, '\n', SvCUR(si)); 12409 assert (si_string != NULL); 12410 si_string++; 12411 remaining = SvPVX(si) + SvCUR(si) - si_string; 12412 12413 while (remaining > 0) { 12414 12415 /* The data consists of just strings defining user-defined 12416 * property names, but in prior incarnations, and perhaps 12417 * somehow from pluggable regex engines, it could still 12418 * hold hex code point definitions, all of which should be 12419 * legal (or it wouldn't have gotten this far). Each 12420 * component of a range would be separated by a tab, and 12421 * each range by a new-line. If these are found, instead 12422 * add them to the inversion list */ 12423 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT 12424 |PERL_SCAN_SILENT_NON_PORTABLE; 12425 STRLEN len = remaining; 12426 UV cp = grok_hex(si_string, &len, &grok_flags, NULL); 12427 12428 /* If the hex decode routine found something, it should go 12429 * up to the next \n */ 12430 if ( *(si_string + len) == '\n') { 12431 if (count) { /* 2nd code point on line */ 12432 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp); 12433 } 12434 else { 12435 *output_invlist = add_cp_to_invlist(*output_invlist, cp); 12436 } 12437 count = 0; 12438 goto prepare_for_next_iteration; 12439 } 12440 12441 /* If the hex decode was instead for the lower range limit, 12442 * save it, and go parse the upper range limit */ 12443 if (*(si_string + len) == '\t') { 12444 assert(count == 0); 12445 12446 prev_cp = cp; 12447 count = 1; 12448 prepare_for_next_iteration: 12449 si_string += len + 1; 12450 remaining -= len + 1; 12451 continue; 12452 } 12453 12454 /* Here, didn't find a legal hex number. Just add the text 12455 * from here up to the next \n, omitting any trailing 12456 * markers. */ 12457 12458 remaining -= len; 12459 len = strcspn(si_string, 12460 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n"); 12461 remaining -= len; 12462 if (matches_string) { 12463 sv_catpvn(matches_string, si_string, len); 12464 } 12465 else { 12466 matches_string = newSVpvn(si_string, len); 12467 } 12468 sv_catpvs(matches_string, " "); 12469 12470 si_string += len; 12471 if ( remaining 12472 && UCHARAT(si_string) 12473 == DEFERRED_COULD_BE_OFFICIAL_MARKERc) 12474 { 12475 si_string++; 12476 remaining--; 12477 } 12478 if (remaining && UCHARAT(si_string) == '\n') { 12479 si_string++; 12480 remaining--; 12481 } 12482 } /* end of loop through the text */ 12483 12484 assert(matches_string); 12485 if (SvCUR(matches_string)) { /* Get rid of trailing blank */ 12486 SvCUR_set(matches_string, SvCUR(matches_string) - 1); 12487 } 12488 } /* end of has an 'si' */ 12489 } 12490 12491 /* Add the stuff that's already known */ 12492 if (invlist) { 12493 12494 /* Again, if the caller doesn't want the output inversion list, put 12495 * everything in 'matches-string' */ 12496 if (! output_invlist) { 12497 if ( ! matches_string) { 12498 matches_string = newSVpvs("\n"); 12499 } 12500 sv_catsv(matches_string, invlist_contents(invlist, 12501 TRUE /* traditional style */ 12502 )); 12503 } 12504 else if (! *output_invlist) { 12505 *output_invlist = invlist_clone(invlist, NULL); 12506 } 12507 else { 12508 _invlist_union(*output_invlist, invlist, output_invlist); 12509 } 12510 } 12511 12512 *listsvp = matches_string; 12513 } 12514 12515 return invlist; 12516} 12517 12518/* reg_skipcomment() 12519 12520 Absorbs an /x style # comment from the input stream, 12521 returning a pointer to the first character beyond the comment, or if the 12522 comment terminates the pattern without anything following it, this returns 12523 one past the final character of the pattern (in other words, RExC_end) and 12524 sets the REG_RUN_ON_COMMENT_SEEN flag. 12525 12526 Note it's the callers responsibility to ensure that we are 12527 actually in /x mode 12528 12529*/ 12530 12531PERL_STATIC_INLINE char* 12532S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) 12533{ 12534 PERL_ARGS_ASSERT_REG_SKIPCOMMENT; 12535 12536 assert(*p == '#'); 12537 12538 while (p < RExC_end) { 12539 if (*(++p) == '\n') { 12540 return p+1; 12541 } 12542 } 12543 12544 /* we ran off the end of the pattern without ending the comment, so we have 12545 * to add an \n when wrapping */ 12546 RExC_seen |= REG_RUN_ON_COMMENT_SEEN; 12547 return p; 12548} 12549 12550STATIC void 12551S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, 12552 char ** p, 12553 const bool force_to_xmod 12554 ) 12555{ 12556 /* If the text at the current parse position '*p' is a '(?#...)' comment, 12557 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p' 12558 * is /x whitespace, advance '*p' so that on exit it points to the first 12559 * byte past all such white space and comments */ 12560 12561 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED); 12562 12563 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; 12564 12565 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p)); 12566 12567 for (;;) { 12568 if (RExC_end - (*p) >= 3 12569 && *(*p) == '(' 12570 && *(*p + 1) == '?' 12571 && *(*p + 2) == '#') 12572 { 12573 while (*(*p) != ')') { 12574 if ((*p) == RExC_end) 12575 FAIL("Sequence (?#... not terminated"); 12576 (*p)++; 12577 } 12578 (*p)++; 12579 continue; 12580 } 12581 12582 if (use_xmod) { 12583 const char * save_p = *p; 12584 while ((*p) < RExC_end) { 12585 STRLEN len; 12586 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) { 12587 (*p) += len; 12588 } 12589 else if (*(*p) == '#') { 12590 (*p) = reg_skipcomment(pRExC_state, (*p)); 12591 } 12592 else { 12593 break; 12594 } 12595 } 12596 if (*p != save_p) { 12597 continue; 12598 } 12599 } 12600 12601 break; 12602 } 12603 12604 return; 12605} 12606 12607/* nextchar() 12608 12609 Advances the parse position by one byte, unless that byte is the beginning 12610 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In 12611 those two cases, the parse position is advanced beyond all such comments and 12612 white space. 12613 12614 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1). 12615*/ 12616 12617STATIC void 12618S_nextchar(pTHX_ RExC_state_t *pRExC_state) 12619{ 12620 PERL_ARGS_ASSERT_NEXTCHAR; 12621 12622 if (RExC_parse < RExC_end) { 12623 assert( ! UTF 12624 || UTF8_IS_INVARIANT(*RExC_parse) 12625 || UTF8_IS_START(*RExC_parse)); 12626 12627 RExC_parse_inc_safe(); 12628 12629 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 12630 FALSE /* Don't force /x */ ); 12631 } 12632} 12633 12634STATIC void 12635S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) 12636{ 12637 /* 'size' is the delta number of smallest regnode equivalents to add or 12638 * subtract from the current memory allocated to the regex engine being 12639 * constructed. */ 12640 12641 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE; 12642 12643 RExC_size += size; 12644 12645 Renewc(RExC_rxi, 12646 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode), 12647 /* +1 for REG_MAGIC */ 12648 char, 12649 regexp_internal); 12650 if ( RExC_rxi == NULL ) 12651 FAIL("Regexp out of space"); 12652 RXi_SET(RExC_rx, RExC_rxi); 12653 12654 RExC_emit_start = RExC_rxi->program; 12655 if (size > 0) { 12656 Zero(REGNODE_p(RExC_emit), size, regnode); 12657 } 12658} 12659 12660STATIC regnode_offset 12661S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size) 12662{ 12663 /* Allocate a regnode that is (1 + extra_size) times as big as the 12664 * smallest regnode worth of space, and also aligns and increments 12665 * RExC_size appropriately. 12666 * 12667 * It returns the regnode's offset into the regex engine program */ 12668 12669 const regnode_offset ret = RExC_emit; 12670 12671 PERL_ARGS_ASSERT_REGNODE_GUTS; 12672 12673 SIZE_ALIGN(RExC_size); 12674 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size); 12675 NODE_ALIGN_FILL(REGNODE_p(ret)); 12676 return(ret); 12677} 12678 12679#ifdef DEBUGGING 12680 12681STATIC regnode_offset 12682S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) { 12683 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG; 12684 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF); 12685 return S_regnode_guts(aTHX_ pRExC_state, extra_size); 12686} 12687 12688#endif 12689 12690 12691 12692/* 12693- reg_node - emit a node 12694*/ 12695STATIC regnode_offset /* Location. */ 12696S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 12697{ 12698 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12699 regnode_offset ptr = ret; 12700 12701 PERL_ARGS_ASSERT_REG_NODE; 12702 12703 assert(REGNODE_ARG_LEN(op) == 0); 12704 12705 FILL_ADVANCE_NODE(ptr, op); 12706 RExC_emit = ptr; 12707 return(ret); 12708} 12709 12710/* 12711- reg1node - emit a node with an argument 12712*/ 12713STATIC regnode_offset /* Location. */ 12714S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 12715{ 12716 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12717 regnode_offset ptr = ret; 12718 12719 PERL_ARGS_ASSERT_REG1NODE; 12720 12721 /* ANYOF are special cased to allow non-length 1 args */ 12722 assert(REGNODE_ARG_LEN(op) == 1); 12723 12724 FILL_ADVANCE_NODE_ARG1u(ptr, op, arg); 12725 RExC_emit = ptr; 12726 return(ret); 12727} 12728 12729/* 12730- regpnode - emit a temporary node with a SV* argument 12731*/ 12732STATIC regnode_offset /* Location. */ 12733S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg) 12734{ 12735 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12736 regnode_offset ptr = ret; 12737 12738 PERL_ARGS_ASSERT_REGPNODE; 12739 12740 FILL_ADVANCE_NODE_ARGp(ptr, op, arg); 12741 RExC_emit = ptr; 12742 return(ret); 12743} 12744 12745STATIC regnode_offset 12746S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) 12747{ 12748 /* emit a node with U32 and I32 arguments */ 12749 12750 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12751 regnode_offset ptr = ret; 12752 12753 PERL_ARGS_ASSERT_REG2NODE; 12754 12755 assert(REGNODE_ARG_LEN(op) == 2); 12756 12757 FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2); 12758 RExC_emit = ptr; 12759 return(ret); 12760} 12761 12762/* 12763- reginsert - insert an operator in front of already-emitted operand 12764* 12765* That means that on exit 'operand' is the offset of the newly inserted 12766* operator, and the original operand has been relocated. 12767* 12768* IMPORTANT NOTE - it is the *callers* responsibility to correctly 12769* set up NEXT_OFF() of the inserted node if needed. Something like this: 12770* 12771* reginsert(pRExC, OPFAIL, orig_emit, depth+1); 12772* NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE; 12773* 12774* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well. 12775*/ 12776STATIC void 12777S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, 12778 const regnode_offset operand, const U32 depth) 12779{ 12780 regnode *src; 12781 regnode *dst; 12782 regnode *place; 12783 const int offset = REGNODE_ARG_LEN((U8)op); 12784 const int size = NODE_STEP_REGNODE + offset; 12785 DECLARE_AND_GET_RE_DEBUG_FLAGS; 12786 12787 PERL_ARGS_ASSERT_REGINSERT; 12788 PERL_UNUSED_CONTEXT; 12789 PERL_UNUSED_ARG(depth); 12790 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op)); 12791 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started 12792 studying. If this is wrong then we need to adjust RExC_recurse 12793 below like we do with RExC_open_parens/RExC_close_parens. */ 12794 change_engine_size(pRExC_state, (Ptrdiff_t) size); 12795 src = REGNODE_p(RExC_emit); 12796 RExC_emit += size; 12797 dst = REGNODE_p(RExC_emit); 12798 12799 /* If we are in a "count the parentheses" pass, the numbers are unreliable, 12800 * and [perl #133871] shows this can lead to problems, so skip this 12801 * realignment of parens until a later pass when they are reliable */ 12802 if (! IN_PARENS_PASS && RExC_open_parens) { 12803 int paren; 12804 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/ 12805 /* remember that RExC_npar is rex->nparens + 1, 12806 * iow it is 1 more than the number of parens seen in 12807 * the pattern so far. */ 12808 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 12809 /* note, RExC_open_parens[0] is the start of the 12810 * regex, it can't move. RExC_close_parens[0] is the end 12811 * of the regex, it *can* move. */ 12812 if ( paren && RExC_open_parens[paren] >= operand ) { 12813 /*DEBUG_PARSE_FMT("open"," - %d", size);*/ 12814 RExC_open_parens[paren] += size; 12815 } else { 12816 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 12817 } 12818 if ( RExC_close_parens[paren] >= operand ) { 12819 /*DEBUG_PARSE_FMT("close"," - %d", size);*/ 12820 RExC_close_parens[paren] += size; 12821 } else { 12822 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 12823 } 12824 } 12825 } 12826 if (RExC_end_op) 12827 RExC_end_op += size; 12828 12829 while (src > REGNODE_p(operand)) { 12830 StructCopy(--src, --dst, regnode); 12831 } 12832 12833 place = REGNODE_p(operand); /* Op node, where operand used to be. */ 12834 src = place + 1; /* NOT REGNODE_AFTER! */ 12835 FLAGS(place) = 0; 12836 FILL_NODE(operand, op); 12837 12838 /* Zero out any arguments in the new node */ 12839 Zero(src, offset, regnode); 12840} 12841 12842/* 12843- regtail - set the next-pointer at the end of a node chain of p to val. If 12844 that value won't fit in the space available, instead returns FALSE. 12845 (Except asserts if we can't fit in the largest space the regex 12846 engine is designed for.) 12847- SEE ALSO: regtail_study 12848*/ 12849STATIC bool 12850S_regtail(pTHX_ RExC_state_t * pRExC_state, 12851 const regnode_offset p, 12852 const regnode_offset val, 12853 const U32 depth) 12854{ 12855 regnode_offset scan; 12856 DECLARE_AND_GET_RE_DEBUG_FLAGS; 12857 12858 PERL_ARGS_ASSERT_REGTAIL; 12859#ifndef DEBUGGING 12860 PERL_UNUSED_ARG(depth); 12861#endif 12862 12863 /* The final node in the chain is the first one with a nonzero next pointer 12864 * */ 12865 scan = (regnode_offset) p; 12866 for (;;) { 12867 regnode * const temp = regnext(REGNODE_p(scan)); 12868 DEBUG_PARSE_r({ 12869 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 12870 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); 12871 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n", 12872 SvPV_nolen_const(RExC_mysv), scan, 12873 (temp == NULL ? "->" : ""), 12874 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "") 12875 ); 12876 }); 12877 if (temp == NULL) 12878 break; 12879 scan = REGNODE_OFFSET(temp); 12880 } 12881 12882 /* Populate this node's next pointer */ 12883 assert(val >= scan); 12884 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) { 12885 assert((UV) (val - scan) <= U32_MAX); 12886 ARG1u_SET(REGNODE_p(scan), val - scan); 12887 } 12888 else { 12889 if (val - scan > U16_MAX) { 12890 /* Populate this with something that won't loop and will likely 12891 * lead to a crash if the caller ignores the failure return, and 12892 * execution continues */ 12893 NEXT_OFF(REGNODE_p(scan)) = U16_MAX; 12894 return FALSE; 12895 } 12896 NEXT_OFF(REGNODE_p(scan)) = val - scan; 12897 } 12898 12899 return TRUE; 12900} 12901 12902#ifdef DEBUGGING 12903/* 12904- regtail_study - set the next-pointer at the end of a node chain of p to val. 12905- Look for optimizable sequences at the same time. 12906- currently only looks for EXACT chains. 12907 12908This is experimental code. The idea is to use this routine to perform 12909in place optimizations on branches and groups as they are constructed, 12910with the long term intention of removing optimization from study_chunk so 12911that it is purely analytical. 12912 12913Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 12914to control which is which. 12915 12916This used to return a value that was ignored. It was a problem that it is 12917#ifdef'd to be another function that didn't return a value. khw has changed it 12918so both currently return a pass/fail return. 12919 12920*/ 12921/* TODO: All four parms should be const */ 12922 12923STATIC bool 12924S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, 12925 const regnode_offset val, U32 depth) 12926{ 12927 regnode_offset scan; 12928 U8 exact = PSEUDO; 12929#ifdef EXPERIMENTAL_INPLACESCAN 12930 I32 min = 0; 12931#endif 12932 DECLARE_AND_GET_RE_DEBUG_FLAGS; 12933 12934 PERL_ARGS_ASSERT_REGTAIL_STUDY; 12935 12936 12937 /* Find last node. */ 12938 12939 scan = p; 12940 for (;;) { 12941 regnode * const temp = regnext(REGNODE_p(scan)); 12942#ifdef EXPERIMENTAL_INPLACESCAN 12943 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) { 12944 bool unfolded_multi_char; /* Unexamined in this routine */ 12945 if (join_exact(pRExC_state, scan, &min, 12946 &unfolded_multi_char, 1, REGNODE_p(val), depth+1)) 12947 return TRUE; /* Was return EXACT */ 12948 } 12949#endif 12950 if ( exact ) { 12951 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) { 12952 if (exact == PSEUDO ) 12953 exact= OP(REGNODE_p(scan)); 12954 else if (exact != OP(REGNODE_p(scan)) ) 12955 exact= 0; 12956 } 12957 else if (OP(REGNODE_p(scan)) != NOTHING) { 12958 exact= 0; 12959 } 12960 } 12961 DEBUG_PARSE_r({ 12962 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 12963 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); 12964 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n", 12965 SvPV_nolen_const(RExC_mysv), 12966 scan, 12967 REGNODE_NAME(exact)); 12968 }); 12969 if (temp == NULL) 12970 break; 12971 scan = REGNODE_OFFSET(temp); 12972 } 12973 DEBUG_PARSE_r({ 12974 DEBUG_PARSE_MSG(""); 12975 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state); 12976 Perl_re_printf( aTHX_ 12977 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n", 12978 SvPV_nolen_const(RExC_mysv), 12979 (IV)val, 12980 (IV)(val - scan) 12981 ); 12982 }); 12983 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) { 12984 assert((UV) (val - scan) <= U32_MAX); 12985 ARG1u_SET(REGNODE_p(scan), val - scan); 12986 } 12987 else { 12988 if (val - scan > U16_MAX) { 12989 /* Populate this with something that won't loop and will likely 12990 * lead to a crash if the caller ignores the failure return, and 12991 * execution continues */ 12992 NEXT_OFF(REGNODE_p(scan)) = U16_MAX; 12993 return FALSE; 12994 } 12995 NEXT_OFF(REGNODE_p(scan)) = val - scan; 12996 } 12997 12998 return TRUE; /* Was 'return exact' */ 12999} 13000#endif 13001 13002SV* 13003Perl_get_ANYOFM_contents(pTHX_ const regnode * n) { 13004 13005 /* Returns an inversion list of all the code points matched by the 13006 * ANYOFM/NANYOFM node 'n' */ 13007 13008 SV * cp_list = _new_invlist(-1); 13009 const U8 lowest = (U8) ARG1u(n); 13010 unsigned int i; 13011 U8 count = 0; 13012 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)]; 13013 13014 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS; 13015 13016 /* Starting with the lowest code point, any code point that ANDed with the 13017 * mask yields the lowest code point is in the set */ 13018 for (i = lowest; i <= 0xFF; i++) { 13019 if ((i & FLAGS(n)) == ARG1u(n)) { 13020 cp_list = add_cp_to_invlist(cp_list, i); 13021 count++; 13022 13023 /* We know how many code points (a power of two) that are in the 13024 * set. No use looking once we've got that number */ 13025 if (count >= needed) break; 13026 } 13027 } 13028 13029 if (OP(n) == NANYOFM) { 13030 _invlist_invert(cp_list); 13031 } 13032 return cp_list; 13033} 13034 13035SV * 13036Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { 13037 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS; 13038 13039 SV * cp_list = NULL; 13040 populate_invlist_from_bitmap( 13041 ((struct regnode_bbm *) n)->bitmap, 13042 REGNODE_BBM_BITMAP_LEN * CHARBITS, 13043 &cp_list, 13044 13045 /* The base cp is from the start byte plus a zero continuation */ 13046 TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n), 13047 UTF_CONTINUATION_MARK | 0)); 13048 return cp_list; 13049} 13050 13051 13052 13053SV * 13054Perl_re_intuit_string(pTHX_ REGEXP * const r) 13055{ /* Assume that RE_INTUIT is set */ 13056 /* Returns an SV containing a string that must appear in the target for it 13057 * to match, or NULL if nothing is known that must match. 13058 * 13059 * CAUTION: the SV can be freed during execution of the regex engine */ 13060 13061 struct regexp *const prog = ReANY(r); 13062 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13063 13064 PERL_ARGS_ASSERT_RE_INTUIT_STRING; 13065 PERL_UNUSED_CONTEXT; 13066 13067 DEBUG_COMPILE_r( 13068 { 13069 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) { 13070 const char * const s = SvPV_nolen_const(RX_UTF8(r) 13071 ? prog->check_utf8 : prog->check_substr); 13072 13073 if (!PL_colorset) reginitcolors(); 13074 Perl_re_printf( aTHX_ 13075 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 13076 PL_colors[4], 13077 RX_UTF8(r) ? "utf8 " : "", 13078 PL_colors[5], PL_colors[0], 13079 s, 13080 PL_colors[1], 13081 (strlen(s) > PL_dump_re_max_len ? "..." : "")); 13082 } 13083 } ); 13084 13085 /* use UTF8 check substring if regexp pattern itself is in UTF8 */ 13086 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr; 13087} 13088 13089/* 13090 pregfree() 13091 13092 handles refcounting and freeing the perl core regexp structure. When 13093 it is necessary to actually free the structure the first thing it 13094 does is call the 'free' method of the regexp_engine associated to 13095 the regexp, allowing the handling of the void *pprivate; member 13096 first. (This routine is not overridable by extensions, which is why 13097 the extensions free is called first.) 13098 13099 See regdupe and regdupe_internal if you change anything here. 13100*/ 13101#ifndef PERL_IN_XSUB_RE 13102void 13103Perl_pregfree(pTHX_ REGEXP *r) 13104{ 13105 SvREFCNT_dec(r); 13106} 13107 13108void 13109Perl_pregfree2(pTHX_ REGEXP *rx) 13110{ 13111 struct regexp *const r = ReANY(rx); 13112 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13113 13114 PERL_ARGS_ASSERT_PREGFREE2; 13115 13116 if (! r) 13117 return; 13118 13119 if (r->mother_re) { 13120 ReREFCNT_dec(r->mother_re); 13121 } else { 13122 CALLREGFREE_PVT(rx); /* free the private data */ 13123 SvREFCNT_dec(RXp_PAREN_NAMES(r)); 13124 } 13125 if (r->substrs) { 13126 int i; 13127 for (i = 0; i < 2; i++) { 13128 SvREFCNT_dec(r->substrs->data[i].substr); 13129 SvREFCNT_dec(r->substrs->data[i].utf8_substr); 13130 } 13131 Safefree(r->substrs); 13132 } 13133 RX_MATCH_COPY_FREE(rx); 13134#ifdef PERL_ANY_COW 13135 SvREFCNT_dec(r->saved_copy); 13136#endif 13137 Safefree(RXp_OFFSp(r)); 13138 if (r->logical_to_parno) { 13139 Safefree(r->logical_to_parno); 13140 Safefree(r->parno_to_logical); 13141 Safefree(r->parno_to_logical_next); 13142 } 13143 13144 SvREFCNT_dec(r->qr_anoncv); 13145 if (r->recurse_locinput) 13146 Safefree(r->recurse_locinput); 13147} 13148 13149 13150/* reg_temp_copy() 13151 13152 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV, 13153 except that dsv will be created if NULL. 13154 13155 This function is used in two main ways. First to implement 13156 $r = qr/....; $s = $$r; 13157 13158 Secondly, it is used as a hacky workaround to the structural issue of 13159 match results 13160 being stored in the regexp structure which is in turn stored in 13161 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 13162 could be PL_curpm in multiple contexts, and could require multiple 13163 result sets being associated with the pattern simultaneously, such 13164 as when doing a recursive match with (??{$qr}) 13165 13166 The solution is to make a lightweight copy of the regexp structure 13167 when a qr// is returned from the code executed by (??{$qr}) this 13168 lightweight copy doesn't actually own any of its data except for 13169 the starp/end and the actual regexp structure itself. 13170 13171*/ 13172 13173 13174REGEXP * 13175Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv) 13176{ 13177 struct regexp *drx; 13178 struct regexp *const srx = ReANY(ssv); 13179 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV; 13180 13181 PERL_ARGS_ASSERT_REG_TEMP_COPY; 13182 13183 if (!dsv) 13184 dsv = (REGEXP*) newSV_type(SVt_REGEXP); 13185 else { 13186 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV)); 13187 13188 /* our only valid caller, sv_setsv_flags(), should have done 13189 * a SV_CHECK_THINKFIRST_COW_DROP() by now */ 13190 assert(!SvOOK(dsv)); 13191 assert(!SvIsCOW(dsv)); 13192 assert(!SvROK(dsv)); 13193 13194 if (SvPVX_const(dsv)) { 13195 if (SvLEN(dsv)) 13196 Safefree(SvPVX(dsv)); 13197 SvPVX(dsv) = NULL; 13198 } 13199 SvLEN_set(dsv, 0); 13200 SvCUR_set(dsv, 0); 13201 SvOK_off((SV *)dsv); 13202 13203 if (islv) { 13204 /* For PVLVs, the head (sv_any) points to an XPVLV, while 13205 * the LV's xpvlenu_rx will point to a regexp body, which 13206 * we allocate here */ 13207 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); 13208 assert(!SvPVX(dsv)); 13209 /* We "steal" the body from the newly allocated SV temp, changing 13210 * the pointer in its HEAD to NULL. We then change its type to 13211 * SVt_NULL so that when we immediately release its only reference, 13212 * no memory deallocation happens. 13213 * 13214 * The body will eventually be freed (from the PVLV) either in 13215 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and 13216 * the regexp body needs to be removed) 13217 * or in Perl_sv_clear() (if the PVLV still holds the pointer until 13218 * the PVLV itself is deallocated). */ 13219 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any; 13220 temp->sv_any = NULL; 13221 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; 13222 SvREFCNT_dec_NN(temp); 13223 /* SvCUR still resides in the xpvlv struct, so the regexp copy- 13224 ing below will not set it. */ 13225 SvCUR_set(dsv, SvCUR(ssv)); 13226 } 13227 } 13228 /* This ensures that SvTHINKFIRST(sv) is true, and hence that 13229 sv_force_normal(sv) is called. */ 13230 SvFAKE_on(dsv); 13231 drx = ReANY(dsv); 13232 13233 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8); 13234 SvPV_set(dsv, RX_WRAPPED(ssv)); 13235 /* We share the same string buffer as the original regexp, on which we 13236 hold a reference count, incremented when mother_re is set below. 13237 The string pointer is copied here, being part of the regexp struct. 13238 */ 13239 memcpy(&(drx->xpv_cur), &(srx->xpv_cur), 13240 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); 13241 13242 if (!islv) 13243 SvLEN_set(dsv, 0); 13244 if (RXp_OFFSp(srx)) { 13245 const I32 npar = srx->nparens+1; 13246 NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair); 13247 } 13248 if (srx->substrs) { 13249 int i; 13250 Newx(drx->substrs, 1, struct reg_substr_data); 13251 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data); 13252 13253 for (i = 0; i < 2; i++) { 13254 SvREFCNT_inc_void(drx->substrs->data[i].substr); 13255 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr); 13256 } 13257 13258 /* check_substr and check_utf8, if non-NULL, point to either their 13259 anchored or float namesakes, and don't hold a second reference. */ 13260 } 13261 if (srx->logical_to_parno) { 13262 NewCopy(srx->logical_to_parno, 13263 drx->logical_to_parno, 13264 srx->nparens+1, I32); 13265 NewCopy(srx->parno_to_logical, 13266 drx->parno_to_logical, 13267 srx->nparens+1, I32); 13268 NewCopy(srx->parno_to_logical_next, 13269 drx->parno_to_logical_next, 13270 srx->nparens+1, I32); 13271 } else { 13272 drx->logical_to_parno = NULL; 13273 drx->parno_to_logical = NULL; 13274 drx->parno_to_logical_next = NULL; 13275 } 13276 drx->logical_nparens = srx->logical_nparens; 13277 13278 RX_MATCH_COPIED_off(dsv); 13279#ifdef PERL_ANY_COW 13280 RXp_SAVED_COPY(drx) = NULL; 13281#endif 13282 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv); 13283 SvREFCNT_inc_void(drx->qr_anoncv); 13284 if (srx->recurse_locinput) 13285 Newx(drx->recurse_locinput, srx->nparens + 1, char *); 13286 13287 return dsv; 13288} 13289#endif 13290 13291 13292/* regfree_internal() 13293 13294 Free the private data in a regexp. This is overloadable by 13295 extensions. Perl takes care of the regexp structure in pregfree(), 13296 this covers the *pprivate pointer which technically perl doesn't 13297 know about, however of course we have to handle the 13298 regexp_internal structure when no extension is in use. 13299 13300 Note this is called before freeing anything in the regexp 13301 structure. 13302 */ 13303 13304void 13305Perl_regfree_internal(pTHX_ REGEXP * const rx) 13306{ 13307 struct regexp *const r = ReANY(rx); 13308 RXi_GET_DECL(r, ri); 13309 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13310 13311 PERL_ARGS_ASSERT_REGFREE_INTERNAL; 13312 13313 if (! ri) { 13314 return; 13315 } 13316 13317 DEBUG_COMPILE_r({ 13318 if (!PL_colorset) 13319 reginitcolors(); 13320 { 13321 SV *dsv= sv_newmortal(); 13322 RE_PV_QUOTED_DECL(s, RX_UTF8(rx), 13323 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len); 13324 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n", 13325 PL_colors[4], PL_colors[5], s); 13326 } 13327 }); 13328 13329 if (ri->code_blocks) 13330 S_free_codeblocks(aTHX_ ri->code_blocks); 13331 13332 if (ri->data) { 13333 int n = ri->data->count; 13334 13335 while (--n >= 0) { 13336 /* If you add a ->what type here, update the comment in regcomp.h */ 13337 switch (ri->data->what[n]) { 13338 case 'a': 13339 case 'r': 13340 case 's': 13341 case 'S': 13342 case 'u': 13343 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); 13344 break; 13345 case 'f': 13346 Safefree(ri->data->data[n]); 13347 break; 13348 case 'l': 13349 case 'L': 13350 break; 13351 case 'T': 13352 { /* Aho Corasick add-on structure for a trie node. 13353 Used in stclass optimization only */ 13354 U32 refcount; 13355 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 13356 OP_REFCNT_LOCK; 13357 refcount = --aho->refcount; 13358 OP_REFCNT_UNLOCK; 13359 if ( !refcount ) { 13360 PerlMemShared_free(aho->states); 13361 PerlMemShared_free(aho->fail); 13362 /* do this last!!!! */ 13363 PerlMemShared_free(ri->data->data[n]); 13364 /* we should only ever get called once, so 13365 * assert as much, and also guard the free 13366 * which /might/ happen twice. At the least 13367 * it will make code anlyzers happy and it 13368 * doesn't cost much. - Yves */ 13369 assert(ri->regstclass); 13370 if (ri->regstclass) { 13371 PerlMemShared_free(ri->regstclass); 13372 ri->regstclass = 0; 13373 } 13374 } 13375 } 13376 break; 13377 case 't': 13378 { 13379 /* trie structure. */ 13380 U32 refcount; 13381 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 13382 OP_REFCNT_LOCK; 13383 refcount = --trie->refcount; 13384 OP_REFCNT_UNLOCK; 13385 if ( !refcount ) { 13386 PerlMemShared_free(trie->charmap); 13387 PerlMemShared_free(trie->states); 13388 PerlMemShared_free(trie->trans); 13389 if (trie->bitmap) 13390 PerlMemShared_free(trie->bitmap); 13391 if (trie->jump) 13392 PerlMemShared_free(trie->jump); 13393 if (trie->j_before_paren) 13394 PerlMemShared_free(trie->j_before_paren); 13395 if (trie->j_after_paren) 13396 PerlMemShared_free(trie->j_after_paren); 13397 PerlMemShared_free(trie->wordinfo); 13398 /* do this last!!!! */ 13399 PerlMemShared_free(ri->data->data[n]); 13400 } 13401 } 13402 break; 13403 case '%': 13404 /* NO-OP a '%' data contains a null pointer, so that reg_add_data 13405 * always returns non-zero, this should only ever happen in the 13406 * 0 index */ 13407 assert(n==0); 13408 break; 13409 default: 13410 Perl_croak(aTHX_ "panic: regfree data code '%c'", 13411 ri->data->what[n]); 13412 } 13413 } 13414 Safefree(ri->data->what); 13415 Safefree(ri->data); 13416 } 13417 13418 Safefree(ri); 13419} 13420 13421#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) 13422 13423/* 13424=for apidoc re_dup_guts 13425Duplicate a regexp. 13426 13427This routine is expected to clone a given regexp structure. It is only 13428compiled under USE_ITHREADS. 13429 13430After all of the core data stored in struct regexp is duplicated 13431the C<regexp_engine.dupe> method is used to copy any private data 13432stored in the *pprivate pointer. This allows extensions to handle 13433any duplication they need to do. 13434 13435=cut 13436 13437 See pregfree() and regfree_internal() if you change anything here. 13438*/ 13439#if defined(USE_ITHREADS) 13440#ifndef PERL_IN_XSUB_RE 13441void 13442Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) 13443{ 13444 I32 npar; 13445 const struct regexp *r = ReANY(sstr); 13446 struct regexp *ret = ReANY(dstr); 13447 13448 PERL_ARGS_ASSERT_RE_DUP_GUTS; 13449 13450 npar = r->nparens+1; 13451 NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair); 13452 13453 if (ret->substrs) { 13454 /* Do it this way to avoid reading from *r after the StructCopy(). 13455 That way, if any of the sv_dup_inc()s dislodge *r from the L1 13456 cache, it doesn't matter. */ 13457 int i; 13458 const bool anchored = r->check_substr 13459 ? r->check_substr == r->substrs->data[0].substr 13460 : r->check_utf8 == r->substrs->data[0].utf8_substr; 13461 Newx(ret->substrs, 1, struct reg_substr_data); 13462 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 13463 13464 for (i = 0; i < 2; i++) { 13465 ret->substrs->data[i].substr = 13466 sv_dup_inc(ret->substrs->data[i].substr, param); 13467 ret->substrs->data[i].utf8_substr = 13468 sv_dup_inc(ret->substrs->data[i].utf8_substr, param); 13469 } 13470 13471 /* check_substr and check_utf8, if non-NULL, point to either their 13472 anchored or float namesakes, and don't hold a second reference. */ 13473 13474 if (ret->check_substr) { 13475 if (anchored) { 13476 assert(r->check_utf8 == r->substrs->data[0].utf8_substr); 13477 13478 ret->check_substr = ret->substrs->data[0].substr; 13479 ret->check_utf8 = ret->substrs->data[0].utf8_substr; 13480 } else { 13481 assert(r->check_substr == r->substrs->data[1].substr); 13482 assert(r->check_utf8 == r->substrs->data[1].utf8_substr); 13483 13484 ret->check_substr = ret->substrs->data[1].substr; 13485 ret->check_utf8 = ret->substrs->data[1].utf8_substr; 13486 } 13487 } else if (ret->check_utf8) { 13488 if (anchored) { 13489 ret->check_utf8 = ret->substrs->data[0].utf8_substr; 13490 } else { 13491 ret->check_utf8 = ret->substrs->data[1].utf8_substr; 13492 } 13493 } 13494 } 13495 13496 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); 13497 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); 13498 if (r->recurse_locinput) 13499 Newx(ret->recurse_locinput, r->nparens + 1, char *); 13500 13501 if (ret->pprivate) 13502 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param)); 13503 13504 if (RX_MATCH_COPIED(dstr)) 13505 RXp_SUBBEG(ret) = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret)); 13506 else 13507 RXp_SUBBEG(ret) = NULL; 13508#ifdef PERL_ANY_COW 13509 RXp_SAVED_COPY(ret) = NULL; 13510#endif 13511 13512 if (r->logical_to_parno) { 13513 /* we use total_parens for all three just for symmetry */ 13514 ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32)); 13515 ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32)); 13516 ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32)); 13517 } else { 13518 ret->logical_to_parno = NULL; 13519 ret->parno_to_logical = NULL; 13520 ret->parno_to_logical_next = NULL; 13521 } 13522 13523 ret->logical_nparens = r->logical_nparens; 13524 13525 /* Whether mother_re be set or no, we need to copy the string. We 13526 cannot refrain from copying it when the storage points directly to 13527 our mother regexp, because that's 13528 1: a buffer in a different thread 13529 2: something we no longer hold a reference on 13530 so we need to copy it locally. */ 13531 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1); 13532 /* set malloced length to a non-zero value so it will be freed 13533 * (otherwise in combination with SVf_FAKE it looks like an alien 13534 * buffer). It doesn't have to be the actual malloced size, since it 13535 * should never be grown */ 13536 SvLEN_set(dstr, SvCUR(sstr)+1); 13537 ret->mother_re = NULL; 13538} 13539#endif /* PERL_IN_XSUB_RE */ 13540 13541/* 13542 regdupe_internal() 13543 13544 This is the internal complement to regdupe() which is used to copy 13545 the structure pointed to by the *pprivate pointer in the regexp. 13546 This is the core version of the extension overridable cloning hook. 13547 The regexp structure being duplicated will be copied by perl prior 13548 to this and will be provided as the regexp *r argument, however 13549 with the /old/ structures pprivate pointer value. Thus this routine 13550 may override any copying normally done by perl. 13551 13552 It returns a pointer to the new regexp_internal structure. 13553*/ 13554 13555void * 13556Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) 13557{ 13558 struct regexp *const r = ReANY(rx); 13559 regexp_internal *reti; 13560 int len; 13561 RXi_GET_DECL(r, ri); 13562 13563 PERL_ARGS_ASSERT_REGDUPE_INTERNAL; 13564 13565 len = ProgLen(ri); 13566 13567 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), 13568 char, regexp_internal); 13569 Copy(ri->program, reti->program, len+1, regnode); 13570 13571 13572 if (ri->code_blocks) { 13573 int n; 13574 Newx(reti->code_blocks, 1, struct reg_code_blocks); 13575 Newx(reti->code_blocks->cb, ri->code_blocks->count, 13576 struct reg_code_block); 13577 Copy(ri->code_blocks->cb, reti->code_blocks->cb, 13578 ri->code_blocks->count, struct reg_code_block); 13579 for (n = 0; n < ri->code_blocks->count; n++) 13580 reti->code_blocks->cb[n].src_regex = (REGEXP*) 13581 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param); 13582 reti->code_blocks->count = ri->code_blocks->count; 13583 reti->code_blocks->refcnt = 1; 13584 } 13585 else 13586 reti->code_blocks = NULL; 13587 13588 reti->regstclass = NULL; 13589 13590 if (ri->data) { 13591 struct reg_data *d; 13592 const int count = ri->data->count; 13593 int i; 13594 13595 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 13596 char, struct reg_data); 13597 Newx(d->what, count, U8); 13598 13599 d->count = count; 13600 for (i = 0; i < count; i++) { 13601 d->what[i] = ri->data->what[i]; 13602 switch (d->what[i]) { 13603 /* see also regcomp.h and regfree_internal() */ 13604 case 'a': /* actually an AV, but the dup function is identical. 13605 values seem to be "plain sv's" generally. */ 13606 case 'r': /* a compiled regex (but still just another SV) */ 13607 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code) 13608 this use case should go away, the code could have used 13609 'a' instead - see S_set_ANYOF_arg() for array contents. */ 13610 case 'S': /* actually an SV, but the dup function is identical. */ 13611 case 'u': /* actually an HV, but the dup function is identical. 13612 values are "plain sv's" */ 13613 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); 13614 break; 13615 case 'f': 13616 /* Synthetic Start Class - "Fake" charclass we generate to optimize 13617 * patterns which could start with several different things. Pre-TRIE 13618 * this was more important than it is now, however this still helps 13619 * in some places, for instance /x?a+/ might produce a SSC equivalent 13620 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass() 13621 * in regexec.c 13622 */ 13623 /* This is cheating. */ 13624 Newx(d->data[i], 1, regnode_ssc); 13625 StructCopy(ri->data->data[i], d->data[i], regnode_ssc); 13626 reti->regstclass = (regnode*)d->data[i]; 13627 break; 13628 case 'T': 13629 /* AHO-CORASICK fail table */ 13630 /* Trie stclasses are readonly and can thus be shared 13631 * without duplication. We free the stclass in pregfree 13632 * when the corresponding reg_ac_data struct is freed. 13633 */ 13634 reti->regstclass= ri->regstclass; 13635 /* FALLTHROUGH */ 13636 case 't': 13637 /* TRIE transition table */ 13638 OP_REFCNT_LOCK; 13639 ((reg_trie_data*)ri->data->data[i])->refcount++; 13640 OP_REFCNT_UNLOCK; 13641 /* FALLTHROUGH */ 13642 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */ 13643 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code 13644 is not from another regexp */ 13645 d->data[i] = ri->data->data[i]; 13646 break; 13647 case '%': 13648 /* this is a placeholder type, it exists purely so that 13649 * reg_add_data always returns a non-zero value, this type of 13650 * entry should ONLY be present in the 0 slot of the array */ 13651 assert(i == 0); 13652 d->data[i]= ri->data->data[i]; 13653 break; 13654 default: 13655 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'", 13656 ri->data->what[i]); 13657 } 13658 } 13659 13660 reti->data = d; 13661 } 13662 else 13663 reti->data = NULL; 13664 13665 if (ri->regstclass && !reti->regstclass) { 13666 /* Assume that the regstclass is a regnode which is inside of the 13667 * program which we have to copy over */ 13668 regnode *node= ri->regstclass; 13669 assert(node >= ri->program && (node - ri->program) < len); 13670 reti->regstclass = reti->program + (node - ri->program); 13671 } 13672 13673 13674 reti->name_list_idx = ri->name_list_idx; 13675 13676 SetProgLen(reti, len); 13677 13678 return (void*)reti; 13679} 13680 13681#endif /* USE_ITHREADS */ 13682 13683STATIC void 13684S_re_croak(pTHX_ bool utf8, const char* pat,...) 13685{ 13686 va_list args; 13687 STRLEN len = strlen(pat); 13688 char buf[512]; 13689 SV *msv; 13690 const char *message; 13691 13692 PERL_ARGS_ASSERT_RE_CROAK; 13693 13694 if (len > 510) 13695 len = 510; 13696 Copy(pat, buf, len , char); 13697 buf[len] = '\n'; 13698 buf[len + 1] = '\0'; 13699 va_start(args, pat); 13700 msv = vmess(buf, &args); 13701 va_end(args); 13702 message = SvPV_const(msv, len); 13703 if (len > 512) 13704 len = 512; 13705 Copy(message, buf, len , char); 13706 /* len-1 to avoid \n */ 13707 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf)); 13708} 13709 13710/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 13711 13712#ifndef PERL_IN_XSUB_RE 13713void 13714Perl_save_re_context(pTHX) 13715{ 13716 I32 nparens = -1; 13717 I32 i; 13718 13719 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 13720 13721 if (PL_curpm) { 13722 const REGEXP * const rx = PM_GETRE(PL_curpm); 13723 if (rx) 13724 nparens = RX_NPARENS(rx); 13725 } 13726 13727 /* RT #124109. This is a complete hack; in the SWASHNEW case we know 13728 * that PL_curpm will be null, but that utf8.pm and the modules it 13729 * loads will only use $1..$3. 13730 * The t/porting/re_context.t test file checks this assumption. 13731 */ 13732 if (nparens == -1) 13733 nparens = 3; 13734 13735 for (i = 1; i <= nparens; i++) { 13736 char digits[TYPE_CHARS(long)]; 13737 const STRLEN len = my_snprintf(digits, sizeof(digits), 13738 "%lu", (long)i); 13739 GV *const *const gvp 13740 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 13741 13742 if (gvp) { 13743 GV * const gv = *gvp; 13744 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 13745 save_scalar(gv); 13746 } 13747 } 13748} 13749#endif 13750 13751#ifndef PERL_IN_XSUB_RE 13752 13753# include "uni_keywords.h" 13754 13755void 13756Perl_init_uniprops(pTHX) 13757{ 13758 13759# ifdef DEBUGGING 13760 char * dump_len_string; 13761 13762 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); 13763 if ( ! dump_len_string 13764 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) 13765 { 13766 PL_dump_re_max_len = 60; /* A reasonable default */ 13767 } 13768# endif 13769 13770 PL_user_def_props = newHV(); 13771 13772# ifdef USE_ITHREADS 13773 13774 HvSHAREKEYS_off(PL_user_def_props); 13775 PL_user_def_props_aTHX = aTHX; 13776 13777# endif 13778 13779 /* Set up the inversion list interpreter-level variables */ 13780 13781 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); 13782 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]); 13783 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]); 13784 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]); 13785 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]); 13786 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]); 13787 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]); 13788 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]); 13789 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]); 13790 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]); 13791 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]); 13792 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]); 13793 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]); 13794 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]); 13795 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]); 13796 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]); 13797 13798 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); 13799 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]); 13800 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]); 13801 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]); 13802 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_]; 13803 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]); 13804 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]); 13805 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]); 13806 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]); 13807 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]); 13808 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]); 13809 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]); 13810 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]); 13811 PL_Posix_ptrs[CC_VERTSPACE_] = NULL; 13812 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]); 13813 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]); 13814 13815 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist); 13816 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist); 13817 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist); 13818 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist); 13819 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist); 13820 13821 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist); 13822 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); 13823 PL_Latin1 = _new_invlist_C_array(Latin1_invlist); 13824 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); 13825 13826 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]); 13827 13828 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]); 13829 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]); 13830 13831 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]); 13832 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]); 13833 13834 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]); 13835 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ 13836 UNI__PERL_FOLDS_TO_MULTI_CHAR]); 13837 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ 13838 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]); 13839 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist); 13840 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist); 13841 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist); 13842 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist); 13843 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist); 13844 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); 13845 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]); 13846 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist); 13847 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]); 13848 13849# ifdef UNI_XIDC 13850 /* The below are used only by deprecated functions. They could be removed */ 13851 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]); 13852 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]); 13853 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]); 13854# endif 13855} 13856 13857/* These four functions are compiled only in regcomp.c, where they have access 13858 * to the data they return. They are a way for re_comp.c to get access to that 13859 * data without having to compile the whole data structures. */ 13860 13861I16 13862Perl_do_uniprop_match(const char * const key, const U16 key_len) 13863{ 13864 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH; 13865 13866 return match_uniprop((U8 *) key, key_len); 13867} 13868 13869SV * 13870Perl_get_prop_definition(pTHX_ const int table_index) 13871{ 13872 PERL_ARGS_ASSERT_GET_PROP_DEFINITION; 13873 13874 /* Create and return the inversion list */ 13875 return _new_invlist_C_array(uni_prop_ptrs[table_index]); 13876} 13877 13878const char * const * 13879Perl_get_prop_values(const int table_index) 13880{ 13881 PERL_ARGS_ASSERT_GET_PROP_VALUES; 13882 13883 return UNI_prop_value_ptrs[table_index]; 13884} 13885 13886const char * 13887Perl_get_deprecated_property_msg(const Size_t warning_offset) 13888{ 13889 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG; 13890 13891 return deprecated_property_msgs[warning_offset]; 13892} 13893 13894# if 0 13895 13896This code was mainly added for backcompat to give a warning for non-portable 13897code points in user-defined properties. But experiments showed that the 13898warning in earlier perls were only omitted on overflow, which should be an 13899error, so there really isnt a backcompat issue, and actually adding the 13900warning when none was present before might cause breakage, for little gain. So 13901khw left this code in, but not enabled. Tests were never added. 13902 13903embed.fnc entry: 13904Ei |const char *|get_extended_utf8_msg|const UV cp 13905 13906PERL_STATIC_INLINE const char * 13907S_get_extended_utf8_msg(pTHX_ const UV cp) 13908{ 13909 U8 dummy[UTF8_MAXBYTES + 1]; 13910 HV *msgs; 13911 SV **msg; 13912 13913 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED, 13914 &msgs); 13915 13916 msg = hv_fetchs(msgs, "text", 0); 13917 assert(msg); 13918 13919 (void) sv_2mortal((SV *) msgs); 13920 13921 return SvPVX(*msg); 13922} 13923 13924# endif 13925#endif /* end of ! PERL_IN_XSUB_RE */ 13926 13927STATIC REGEXP * 13928S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len, 13929 const bool ignore_case) 13930{ 13931 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it 13932 * possibly with /i if the 'ignore_case' parameter is true. Use /aa 13933 * because nothing outside of ASCII will match. Use /m because the input 13934 * string may be a bunch of lines strung together. 13935 * 13936 * Also sets up the debugging info */ 13937 13938 U32 flags = PMf_MULTILINE|PMf_WILDCARD; 13939 U32 rx_flags; 13940 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP); 13941 REGEXP * subpattern_re; 13942 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13943 13944 PERL_ARGS_ASSERT_COMPILE_WILDCARD; 13945 13946 if (ignore_case) { 13947 flags |= PMf_FOLD; 13948 } 13949 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET); 13950 13951 /* Like in op.c, we copy the compile time pm flags to the rx ones */ 13952 rx_flags = flags & RXf_PMf_COMPILETIME; 13953 13954#ifndef PERL_IN_XSUB_RE 13955 /* Use the core engine if this file is regcomp.c. That means no 13956 * 'use re "Debug ..." is in effect, so the core engine is sufficient */ 13957 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL, 13958 &PL_core_reg_engine, 13959 NULL, NULL, 13960 rx_flags, flags); 13961#else 13962 if (isDEBUG_WILDCARD) { 13963 /* Use the special debugging engine if this file is re_comp.c and wants 13964 * to output the wildcard matching. This uses whatever 13965 * 'use re "Debug ..." is in effect */ 13966 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL, 13967 &my_reg_engine, 13968 NULL, NULL, 13969 rx_flags, flags); 13970 } 13971 else { 13972 /* Use the special wildcard engine if this file is re_comp.c and 13973 * doesn't want to output the wildcard matching. This uses whatever 13974 * 'use re "Debug ..." is in effect for compilation, but this engine 13975 * structure has been set up so that it uses the core engine for 13976 * execution, so no execution debugging as a result of re.pm will be 13977 * displayed. */ 13978 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL, 13979 &wild_reg_engine, 13980 NULL, NULL, 13981 rx_flags, flags); 13982 /* XXX The above has the effect that any user-supplied regex engine 13983 * won't be called for matching wildcards. That might be good, or bad. 13984 * It could be changed in several ways. The reason it is done the 13985 * current way is to avoid having to save and restore 13986 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps 13987 * could be used. Another suggestion is to keep the authoritative 13988 * value of the debug flags in a thread-local variable and add set/get 13989 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date. 13990 * Still another is to pass a flag, say in the engine's intflags that 13991 * would be checked each time before doing the debug output */ 13992 } 13993#endif 13994 13995 assert(subpattern_re); /* Should have died if didn't compile successfully */ 13996 return subpattern_re; 13997} 13998 13999STATIC I32 14000S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend, 14001 char *strbeg, SSize_t minend, SV *screamer, U32 nosave) 14002{ 14003 I32 result; 14004 DECLARE_AND_GET_RE_DEBUG_FLAGS; 14005 14006 PERL_ARGS_ASSERT_EXECUTE_WILDCARD; 14007 14008 ENTER; 14009 14010 /* The compilation has set things up so that if the program doesn't want to 14011 * see the wildcard matching procedure, it will get the core execution 14012 * engine, which is subject only to -Dr. So we have to turn that off 14013 * around this procedure */ 14014 if (! isDEBUG_WILDCARD) { 14015 /* Note! Casts away 'volatile' */ 14016 SAVEI32(PL_debug); 14017 PL_debug &= ~ DEBUG_r_FLAG; 14018 } 14019 14020 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer, 14021 NULL, nosave); 14022 LEAVE; 14023 14024 return result; 14025} 14026 14027SV * 14028S_handle_user_defined_property(pTHX_ 14029 14030 /* Parses the contents of a user-defined property definition; returning the 14031 * expanded definition if possible. If so, the return is an inversion 14032 * list. 14033 * 14034 * If there are subroutines that are part of the expansion and which aren't 14035 * known at the time of the call to this function, this returns what 14036 * parse_uniprop_string() returned for the first one encountered. 14037 * 14038 * If an error was found, NULL is returned, and 'msg' gets a suitable 14039 * message appended to it. (Appending allows the back trace of how we got 14040 * to the faulty definition to be displayed through nested calls of 14041 * user-defined subs.) 14042 * 14043 * The caller IS responsible for freeing any returned SV. 14044 * 14045 * The syntax of the contents is pretty much described in perlunicode.pod, 14046 * but we also allow comments on each line */ 14047 14048 const char * name, /* Name of property */ 14049 const STRLEN name_len, /* The name's length in bytes */ 14050 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 14051 const bool to_fold, /* ? Is this under /i */ 14052 const bool runtime, /* ? Are we in compile- or run-time */ 14053 const bool deferrable, /* Is it ok for this property's full definition 14054 to be deferred until later? */ 14055 SV* contents, /* The property's definition */ 14056 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be 14057 getting called unless this is thought to be 14058 a user-defined property */ 14059 SV * msg, /* Any error or warning msg(s) are appended to 14060 this */ 14061 const STRLEN level) /* Recursion level of this call */ 14062{ 14063 STRLEN len; 14064 const char * string = SvPV_const(contents, len); 14065 const char * const e = string + len; 14066 const bool is_contents_utf8 = cBOOL(SvUTF8(contents)); 14067 const STRLEN msgs_length_on_entry = SvCUR(msg); 14068 14069 const char * s0 = string; /* Points to first byte in the current line 14070 being parsed in 'string' */ 14071 const char overflow_msg[] = "Code point too large in \""; 14072 SV* running_definition = NULL; 14073 14074 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY; 14075 14076 *user_defined_ptr = TRUE; 14077 14078 /* Look at each line */ 14079 while (s0 < e) { 14080 const char * s; /* Current byte */ 14081 char op = '+'; /* Default operation is 'union' */ 14082 IV min = 0; /* range begin code point */ 14083 IV max = -1; /* and range end */ 14084 SV* this_definition; 14085 14086 /* Skip comment lines */ 14087 if (*s0 == '#') { 14088 s0 = strchr(s0, '\n'); 14089 if (s0 == NULL) { 14090 break; 14091 } 14092 s0++; 14093 continue; 14094 } 14095 14096 /* For backcompat, allow an empty first line */ 14097 if (*s0 == '\n') { 14098 s0++; 14099 continue; 14100 } 14101 14102 /* First character in the line may optionally be the operation */ 14103 if ( *s0 == '+' 14104 || *s0 == '!' 14105 || *s0 == '-' 14106 || *s0 == '&') 14107 { 14108 op = *s0++; 14109 } 14110 14111 /* If the line is one or two hex digits separated by blank space, its 14112 * a range; otherwise it is either another user-defined property or an 14113 * error */ 14114 14115 s = s0; 14116 14117 if (! isXDIGIT(*s)) { 14118 goto check_if_property; 14119 } 14120 14121 do { /* Each new hex digit will add 4 bits. */ 14122 if (min > ( (IV) MAX_LEGAL_CP >> 4)) { 14123 s = strchr(s, '\n'); 14124 if (s == NULL) { 14125 s = e; 14126 } 14127 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14128 sv_catpv(msg, overflow_msg); 14129 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14130 UTF8fARG(is_contents_utf8, s - s0, s0)); 14131 sv_catpvs(msg, "\""); 14132 goto return_failure; 14133 } 14134 14135 /* Accumulate this digit into the value */ 14136 min = (min << 4) + READ_XDIGIT(s); 14137 } while (isXDIGIT(*s)); 14138 14139 while (isBLANK(*s)) { s++; } 14140 14141 /* We allow comments at the end of the line */ 14142 if (*s == '#') { 14143 s = strchr(s, '\n'); 14144 if (s == NULL) { 14145 s = e; 14146 } 14147 s++; 14148 } 14149 else if (s < e && *s != '\n') { 14150 if (! isXDIGIT(*s)) { 14151 goto check_if_property; 14152 } 14153 14154 /* Look for the high point of the range */ 14155 max = 0; 14156 do { 14157 if (max > ( (IV) MAX_LEGAL_CP >> 4)) { 14158 s = strchr(s, '\n'); 14159 if (s == NULL) { 14160 s = e; 14161 } 14162 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14163 sv_catpv(msg, overflow_msg); 14164 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14165 UTF8fARG(is_contents_utf8, s - s0, s0)); 14166 sv_catpvs(msg, "\""); 14167 goto return_failure; 14168 } 14169 14170 max = (max << 4) + READ_XDIGIT(s); 14171 } while (isXDIGIT(*s)); 14172 14173 while (isBLANK(*s)) { s++; } 14174 14175 if (*s == '#') { 14176 s = strchr(s, '\n'); 14177 if (s == NULL) { 14178 s = e; 14179 } 14180 } 14181 else if (s < e && *s != '\n') { 14182 goto check_if_property; 14183 } 14184 } 14185 14186 if (max == -1) { /* The line only had one entry */ 14187 max = min; 14188 } 14189 else if (max < min) { 14190 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14191 sv_catpvs(msg, "Illegal range in \""); 14192 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14193 UTF8fARG(is_contents_utf8, s - s0, s0)); 14194 sv_catpvs(msg, "\""); 14195 goto return_failure; 14196 } 14197 14198# if 0 /* See explanation at definition above of get_extended_utf8_msg() */ 14199 14200 if ( UNICODE_IS_PERL_EXTENDED(min) 14201 || UNICODE_IS_PERL_EXTENDED(max)) 14202 { 14203 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14204 14205 /* If both code points are non-portable, warn only on the lower 14206 * one. */ 14207 sv_catpv(msg, get_extended_utf8_msg( 14208 (UNICODE_IS_PERL_EXTENDED(min)) 14209 ? min : max)); 14210 sv_catpvs(msg, " in \""); 14211 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14212 UTF8fARG(is_contents_utf8, s - s0, s0)); 14213 sv_catpvs(msg, "\""); 14214 } 14215 14216# endif 14217 14218 /* Here, this line contains a legal range */ 14219 this_definition = sv_2mortal(_new_invlist(2)); 14220 this_definition = _add_range_to_invlist(this_definition, min, max); 14221 goto calculate; 14222 14223 check_if_property: 14224 14225 /* Here it isn't a legal range line. See if it is a legal property 14226 * line. First find the end of the meat of the line */ 14227 s = strpbrk(s, "#\n"); 14228 if (s == NULL) { 14229 s = e; 14230 } 14231 14232 /* Ignore trailing blanks in keeping with the requirements of 14233 * parse_uniprop_string() */ 14234 s--; 14235 while (s > s0 && isBLANK_A(*s)) { 14236 s--; 14237 } 14238 s++; 14239 14240 this_definition = parse_uniprop_string(s0, s - s0, 14241 is_utf8, to_fold, runtime, 14242 deferrable, 14243 NULL, 14244 user_defined_ptr, msg, 14245 (name_len == 0) 14246 ? level /* Don't increase level 14247 if input is empty */ 14248 : level + 1 14249 ); 14250 if (this_definition == NULL) { 14251 goto return_failure; /* 'msg' should have had the reason 14252 appended to it by the above call */ 14253 } 14254 14255 if (! is_invlist(this_definition)) { /* Unknown at this time */ 14256 return newSVsv(this_definition); 14257 } 14258 14259 if (*s != '\n') { 14260 s = strchr(s, '\n'); 14261 if (s == NULL) { 14262 s = e; 14263 } 14264 } 14265 14266 calculate: 14267 14268 switch (op) { 14269 case '+': 14270 _invlist_union(running_definition, this_definition, 14271 &running_definition); 14272 break; 14273 case '-': 14274 _invlist_subtract(running_definition, this_definition, 14275 &running_definition); 14276 break; 14277 case '&': 14278 _invlist_intersection(running_definition, this_definition, 14279 &running_definition); 14280 break; 14281 case '!': 14282 _invlist_union_complement_2nd(running_definition, 14283 this_definition, &running_definition); 14284 break; 14285 default: 14286 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d", 14287 __FILE__, __LINE__, op); 14288 break; 14289 } 14290 14291 /* Position past the '\n' */ 14292 s0 = s + 1; 14293 } /* End of loop through the lines of 'contents' */ 14294 14295 /* Here, we processed all the lines in 'contents' without error. If we 14296 * didn't add any warnings, simply return success */ 14297 if (msgs_length_on_entry == SvCUR(msg)) { 14298 14299 /* If the expansion was empty, the answer isn't nothing: its an empty 14300 * inversion list */ 14301 if (running_definition == NULL) { 14302 running_definition = _new_invlist(1); 14303 } 14304 14305 return running_definition; 14306 } 14307 14308 /* Otherwise, add some explanatory text, but we will return success */ 14309 goto return_msg; 14310 14311 return_failure: 14312 running_definition = NULL; 14313 14314 return_msg: 14315 14316 if (name_len > 0) { 14317 sv_catpvs(msg, " in expansion of "); 14318 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); 14319 } 14320 14321 return running_definition; 14322} 14323 14324/* As explained below, certain operations need to take place in the first 14325 * thread created. These macros switch contexts */ 14326# ifdef USE_ITHREADS 14327# define DECLARATION_FOR_GLOBAL_CONTEXT \ 14328 PerlInterpreter * save_aTHX = aTHX; 14329# define SWITCH_TO_GLOBAL_CONTEXT \ 14330 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX)) 14331# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX)); 14332# define CUR_CONTEXT aTHX 14333# define ORIGINAL_CONTEXT save_aTHX 14334# else 14335# define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP 14336# define SWITCH_TO_GLOBAL_CONTEXT NOOP 14337# define RESTORE_CONTEXT NOOP 14338# define CUR_CONTEXT NULL 14339# define ORIGINAL_CONTEXT NULL 14340# endif 14341 14342STATIC void 14343S_delete_recursion_entry(pTHX_ void *key) 14344{ 14345 /* Deletes the entry used to detect recursion when expanding user-defined 14346 * properties. This is a function so it can be set up to be called even if 14347 * the program unexpectedly quits */ 14348 14349 SV ** current_entry; 14350 const STRLEN key_len = strlen((const char *) key); 14351 DECLARATION_FOR_GLOBAL_CONTEXT; 14352 14353 SWITCH_TO_GLOBAL_CONTEXT; 14354 14355 /* If the entry is one of these types, it is a permanent entry, and not the 14356 * one used to detect recursions. This function should delete only the 14357 * recursion entry */ 14358 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0); 14359 if ( current_entry 14360 && ! is_invlist(*current_entry) 14361 && ! SvPOK(*current_entry)) 14362 { 14363 (void) hv_delete(PL_user_def_props, (const char *) key, key_len, 14364 G_DISCARD); 14365 } 14366 14367 RESTORE_CONTEXT; 14368} 14369 14370STATIC SV * 14371S_get_fq_name(pTHX_ 14372 const char * const name, /* The first non-blank in the \p{}, \P{} */ 14373 const Size_t name_len, /* Its length in bytes, not including any trailing space */ 14374 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 14375 const bool has_colon_colon 14376 ) 14377{ 14378 /* Returns a mortal SV containing the fully qualified version of the input 14379 * name */ 14380 14381 SV * fq_name; 14382 14383 fq_name = newSVpvs_flags("", SVs_TEMP); 14384 14385 /* Use the current package if it wasn't included in our input */ 14386 if (! has_colon_colon) { 14387 const HV * pkg = (IN_PERL_COMPILETIME) 14388 ? PL_curstash 14389 : CopSTASH(PL_curcop); 14390 const char* pkgname = HvNAME(pkg); 14391 14392 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f, 14393 UTF8fARG(is_utf8, strlen(pkgname), pkgname)); 14394 sv_catpvs(fq_name, "::"); 14395 } 14396 14397 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f, 14398 UTF8fARG(is_utf8, name_len, name)); 14399 return fq_name; 14400} 14401 14402STATIC SV * 14403S_parse_uniprop_string(pTHX_ 14404 14405 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable 14406 * now. If so, the return is an inversion list. 14407 * 14408 * If the property is user-defined, it is a subroutine, which in turn 14409 * may call other subroutines. This function will call the whole nest of 14410 * them to get the definition they return; if some aren't known at the time 14411 * of the call to this function, the fully qualified name of the highest 14412 * level sub is returned. It is an error to call this function at runtime 14413 * without every sub defined. 14414 * 14415 * If an error was found, NULL is returned, and 'msg' gets a suitable 14416 * message appended to it. (Appending allows the back trace of how we got 14417 * to the faulty definition to be displayed through nested calls of 14418 * user-defined subs.) 14419 * 14420 * The caller should NOT try to free any returned inversion list. 14421 * 14422 * Other parameters will be set on return as described below */ 14423 14424 const char * const name, /* The first non-blank in the \p{}, \P{} */ 14425 Size_t name_len, /* Its length in bytes, not including any 14426 trailing space */ 14427 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 14428 const bool to_fold, /* ? Is this under /i */ 14429 const bool runtime, /* TRUE if this is being called at run time */ 14430 const bool deferrable, /* TRUE if it's ok for the definition to not be 14431 known at this call */ 14432 AV ** strings, /* To return string property values, like named 14433 sequences */ 14434 bool *user_defined_ptr, /* Upon return from this function it will be 14435 set to TRUE if any component is a 14436 user-defined property */ 14437 SV * msg, /* Any error or warning msg(s) are appended to 14438 this */ 14439 const STRLEN level) /* Recursion level of this call */ 14440{ 14441 char* lookup_name; /* normalized name for lookup in our tables */ 14442 unsigned lookup_len; /* Its length */ 14443 enum { Not_Strict = 0, /* Some properties have stricter name */ 14444 Strict, /* normalization rules, which we decide */ 14445 As_Is /* upon based on parsing */ 14446 } stricter = Not_Strict; 14447 14448 /* nv= or numeric_value=, or possibly one of the cjk numeric properties 14449 * (though it requires extra effort to download them from Unicode and 14450 * compile perl to know about them) */ 14451 bool is_nv_type = FALSE; 14452 14453 unsigned int i = 0, i_zero = 0, j = 0; 14454 int equals_pos = -1; /* Where the '=' is found, or negative if none */ 14455 int slash_pos = -1; /* Where the '/' is found, or negative if none */ 14456 int table_index = 0; /* The entry number for this property in the table 14457 of all Unicode property names */ 14458 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */ 14459 Size_t lookup_offset = 0; /* Used to ignore the first few characters of 14460 the normalized name in certain situations */ 14461 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't 14462 part of a package name */ 14463 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */ 14464 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined 14465 property rather than a Unicode 14466 one. */ 14467 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL 14468 if an error. If it is an inversion list, 14469 it is the definition. Otherwise it is a 14470 string containing the fully qualified sub 14471 name of 'name' */ 14472 SV * fq_name = NULL; /* For user-defined properties, the fully 14473 qualified name */ 14474 bool invert_return = FALSE; /* ? Do we need to complement the result before 14475 returning it */ 14476 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an 14477 explicit utf8:: package that we strip 14478 off */ 14479 /* The expansion of properties that could be either user-defined or 14480 * official unicode ones is deferred until runtime, including a marker for 14481 * those that might be in the latter category. This boolean indicates if 14482 * we've seen that marker. If not, what we're parsing can't be such an 14483 * official Unicode property whose expansion was deferred */ 14484 bool could_be_deferred_official = FALSE; 14485 14486 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING; 14487 14488 /* The input will be normalized into 'lookup_name' */ 14489 Newx(lookup_name, name_len, char); 14490 SAVEFREEPV(lookup_name); 14491 14492 /* Parse the input. */ 14493 for (i = 0; i < name_len; i++) { 14494 char cur = name[i]; 14495 14496 /* Most of the characters in the input will be of this ilk, being parts 14497 * of a name */ 14498 if (isIDCONT_A(cur)) { 14499 14500 /* Case differences are ignored. Our lookup routine assumes 14501 * everything is lowercase, so normalize to that */ 14502 if (isUPPER_A(cur)) { 14503 lookup_name[j++] = toLOWER_A(cur); 14504 continue; 14505 } 14506 14507 if (cur == '_') { /* Don't include these in the normalized name */ 14508 continue; 14509 } 14510 14511 lookup_name[j++] = cur; 14512 14513 /* The first character in a user-defined name must be of this type. 14514 * */ 14515 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) { 14516 could_be_user_defined = FALSE; 14517 } 14518 14519 continue; 14520 } 14521 14522 /* Here, the character is not something typically in a name, But these 14523 * two types of characters (and the '_' above) can be freely ignored in 14524 * most situations. Later it may turn out we shouldn't have ignored 14525 * them, and we have to reparse, but we don't have enough information 14526 * yet to make that decision */ 14527 if (cur == '-' || isSPACE_A(cur)) { 14528 could_be_user_defined = FALSE; 14529 continue; 14530 } 14531 14532 /* An equals sign or single colon mark the end of the first part of 14533 * the property name */ 14534 if ( cur == '=' 14535 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':'))) 14536 { 14537 lookup_name[j++] = '='; /* Treat the colon as an '=' */ 14538 equals_pos = j; /* Note where it occurred in the input */ 14539 could_be_user_defined = FALSE; 14540 break; 14541 } 14542 14543 /* If this looks like it is a marker we inserted at compile time, 14544 * set a flag and otherwise ignore it. If it isn't in the final 14545 * position, keep it as it would have been user input. */ 14546 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc) 14547 && ! deferrable 14548 && could_be_user_defined 14549 && i == name_len - 1) 14550 { 14551 name_len--; 14552 could_be_deferred_official = TRUE; 14553 continue; 14554 } 14555 14556 /* Otherwise, this character is part of the name. */ 14557 lookup_name[j++] = cur; 14558 14559 /* Here it isn't a single colon, so if it is a colon, it must be a 14560 * double colon */ 14561 if (cur == ':') { 14562 14563 /* A double colon should be a package qualifier. We note its 14564 * position and continue. Note that one could have 14565 * pkg1::pkg2::...::foo 14566 * so that the position at the end of the loop will be just after 14567 * the final qualifier */ 14568 14569 i++; 14570 non_pkg_begin = i + 1; 14571 lookup_name[j++] = ':'; 14572 lun_non_pkg_begin = j; 14573 } 14574 else { /* Only word chars (and '::') can be in a user-defined name */ 14575 could_be_user_defined = FALSE; 14576 } 14577 } /* End of parsing through the lhs of the property name (or all of it if 14578 no rhs) */ 14579 14580 /* If there is a single package name 'utf8::', it is ambiguous. It could 14581 * be for a user-defined property, or it could be a Unicode property, as 14582 * all of them are considered to be for that package. For the purposes of 14583 * parsing the rest of the property, strip it off */ 14584 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) { 14585 lookup_name += STRLENs("utf8::"); 14586 j -= STRLENs("utf8::"); 14587 equals_pos -= STRLENs("utf8::"); 14588 i_zero = STRLENs("utf8::"); /* When resetting 'i' to reparse 14589 from the beginning, it has to be 14590 set past what we're stripping 14591 off */ 14592 stripped_utf8_pkg = TRUE; 14593 } 14594 14595 /* Here, we are either done with the whole property name, if it was simple; 14596 * or are positioned just after the '=' if it is compound. */ 14597 14598 if (equals_pos >= 0) { 14599 assert(stricter == Not_Strict); /* We shouldn't have set this yet */ 14600 14601 /* Space immediately after the '=' is ignored */ 14602 i++; 14603 for (; i < name_len; i++) { 14604 if (! isSPACE_A(name[i])) { 14605 break; 14606 } 14607 } 14608 14609 /* Most punctuation after the equals indicates a subpattern, like 14610 * \p{foo=/bar/} */ 14611 if ( isPUNCT_A(name[i]) 14612 && name[i] != '-' 14613 && name[i] != '+' 14614 && name[i] != '_' 14615 && name[i] != '{' 14616 /* A backslash means the real delimiter is the next character, 14617 * but it must be punctuation */ 14618 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1])))) 14619 { 14620 bool special_property = memEQs(lookup_name, j - 1, "name") 14621 || memEQs(lookup_name, j - 1, "na"); 14622 if (! special_property) { 14623 /* Find the property. The table includes the equals sign, so 14624 * we use 'j' as-is */ 14625 table_index = do_uniprop_match(lookup_name, j); 14626 } 14627 if (special_property || table_index) { 14628 REGEXP * subpattern_re; 14629 char open = name[i++]; 14630 char close; 14631 const char * pos_in_brackets; 14632 const char * const * prop_values; 14633 bool escaped = 0; 14634 14635 /* Backslash => delimiter is the character following. We 14636 * already checked that it is punctuation */ 14637 if (open == '\\') { 14638 open = name[i++]; 14639 escaped = 1; 14640 } 14641 14642 /* This data structure is constructed so that the matching 14643 * closing bracket is 3 past its matching opening. The second 14644 * set of closing is so that if the opening is something like 14645 * ']', the closing will be that as well. Something similar is 14646 * done in toke.c */ 14647 pos_in_brackets = memCHRs("([<)]>)]>", open); 14648 close = (pos_in_brackets) ? pos_in_brackets[3] : open; 14649 14650 if ( i >= name_len 14651 || name[name_len-1] != close 14652 || (escaped && name[name_len-2] != '\\') 14653 /* Also make sure that there are enough characters. 14654 * e.g., '\\\' would show up incorrectly as legal even 14655 * though it is too short */ 14656 || (SSize_t) (name_len - i - 1 - escaped) < 0) 14657 { 14658 sv_catpvs(msg, "Unicode property wildcard not terminated"); 14659 goto append_name_to_msg; 14660 } 14661 14662 Perl_ck_warner_d(aTHX_ 14663 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS), 14664 "The Unicode property wildcards feature is experimental"); 14665 14666 if (special_property) { 14667 const char * error_msg; 14668 const char * revised_name = name + i; 14669 Size_t revised_name_len = name_len - (i + 1 + escaped); 14670 14671 /* Currently, the only 'special_property' is name, which we 14672 * lookup in _charnames.pm */ 14673 14674 if (! load_charnames(newSVpvs("placeholder"), 14675 revised_name, revised_name_len, 14676 &error_msg)) 14677 { 14678 sv_catpv(msg, error_msg); 14679 goto append_name_to_msg; 14680 } 14681 14682 /* Farm this out to a function just to make the current 14683 * function less unwieldy */ 14684 if (handle_names_wildcard(revised_name, revised_name_len, 14685 &prop_definition, 14686 strings)) 14687 { 14688 return prop_definition; 14689 } 14690 14691 goto failed; 14692 } 14693 14694 prop_values = get_prop_values(table_index); 14695 14696 /* Now create and compile the wildcard subpattern. Use /i 14697 * because the property values are supposed to match with case 14698 * ignored. */ 14699 subpattern_re = compile_wildcard(name + i, 14700 name_len - i - 1 - escaped, 14701 TRUE /* /i */ 14702 ); 14703 14704 /* For each legal property value, see if the supplied pattern 14705 * matches it. */ 14706 while (*prop_values) { 14707 const char * const entry = *prop_values; 14708 const Size_t len = strlen(entry); 14709 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP); 14710 14711 if (execute_wildcard(subpattern_re, 14712 (char *) entry, 14713 (char *) entry + len, 14714 (char *) entry, 0, 14715 entry_sv, 14716 0)) 14717 { /* Here, matched. Add to the returned list */ 14718 Size_t total_len = j + len; 14719 SV * sub_invlist = NULL; 14720 char * this_string; 14721 14722 /* We know this is a legal \p{property=value}. Call 14723 * the function to return the list of code points that 14724 * match it */ 14725 Newxz(this_string, total_len + 1, char); 14726 Copy(lookup_name, this_string, j, char); 14727 my_strlcat(this_string, entry, total_len + 1); 14728 SAVEFREEPV(this_string); 14729 sub_invlist = parse_uniprop_string(this_string, 14730 total_len, 14731 is_utf8, 14732 to_fold, 14733 runtime, 14734 deferrable, 14735 NULL, 14736 user_defined_ptr, 14737 msg, 14738 level + 1); 14739 _invlist_union(prop_definition, sub_invlist, 14740 &prop_definition); 14741 } 14742 14743 prop_values++; /* Next iteration, look at next propvalue */ 14744 } /* End of looking through property values; (the data 14745 structure is terminated by a NULL ptr) */ 14746 14747 SvREFCNT_dec_NN(subpattern_re); 14748 14749 if (prop_definition) { 14750 return prop_definition; 14751 } 14752 14753 sv_catpvs(msg, "No Unicode property value wildcard matches:"); 14754 goto append_name_to_msg; 14755 } 14756 14757 /* Here's how khw thinks we should proceed to handle the properties 14758 * not yet done: Bidi Mirroring Glyph can map to "" 14759 Bidi Paired Bracket can map to "" 14760 Case Folding (both full and simple) 14761 Shouldn't /i be good enough for Full 14762 Decomposition Mapping 14763 Equivalent Unified Ideograph can map to "" 14764 Lowercase Mapping (both full and simple) 14765 NFKC Case Fold can map to "" 14766 Titlecase Mapping (both full and simple) 14767 Uppercase Mapping (both full and simple) 14768 * Handle these the same way Name is done, using say, _wild.pm, but 14769 * having both loose and full, like in charclass_invlists.h. 14770 * Perhaps move block and script to that as they are somewhat large 14771 * in charclass_invlists.h. 14772 * For properties where the default is the code point itself, such 14773 * as any of the case changing mappings, the string would otherwise 14774 * consist of all Unicode code points in UTF-8 strung together. 14775 * This would be impractical. So instead, examine their compiled 14776 * pattern, looking at the ssc. If none, reject the pattern as an 14777 * error. Otherwise run the pattern against every code point in 14778 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets 14779 * And it might be good to create an API to return the ssc. 14780 * Or handle them like the algorithmic names are done 14781 */ 14782 } /* End of is a wildcard subppattern */ 14783 14784 /* \p{name=...} is handled specially. Instead of using the normal 14785 * mechanism involving charclass_invlists.h, it uses _charnames.pm 14786 * which has the necessary (huge) data accessible to it, and which 14787 * doesn't get loaded unless necessary. The legal syntax for names is 14788 * somewhat different than other properties due both to the vagaries of 14789 * a few outlier official names, and the fact that only a few ASCII 14790 * characters are permitted in them */ 14791 if ( memEQs(lookup_name, j - 1, "name") 14792 || memEQs(lookup_name, j - 1, "na")) 14793 { 14794 dSP; 14795 HV * table; 14796 SV * character; 14797 const char * error_msg; 14798 CV* lookup_loose; 14799 SV * character_name; 14800 STRLEN character_len; 14801 UV cp; 14802 14803 stricter = As_Is; 14804 14805 /* Since the RHS (after skipping initial space) is passed unchanged 14806 * to charnames, and there are different criteria for what are 14807 * legal characters in the name, just parse it here. A character 14808 * name must begin with an ASCII alphabetic */ 14809 if (! isALPHA(name[i])) { 14810 goto failed; 14811 } 14812 lookup_name[j++] = name[i]; 14813 14814 for (++i; i < name_len; i++) { 14815 /* Official names can only be in the ASCII range, and only 14816 * certain characters */ 14817 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) { 14818 goto failed; 14819 } 14820 lookup_name[j++] = name[i]; 14821 } 14822 14823 /* Finished parsing, save the name into an SV */ 14824 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos); 14825 14826 /* Make sure _charnames is loaded. (The parameters give context 14827 * for any errors generated */ 14828 table = load_charnames(character_name, name, name_len, &error_msg); 14829 if (table == NULL) { 14830 sv_catpv(msg, error_msg); 14831 goto append_name_to_msg; 14832 } 14833 14834 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0); 14835 if (! lookup_loose) { 14836 Perl_croak(aTHX_ 14837 "panic: Can't find '_charnames::_loose_regcomp_lookup"); 14838 } 14839 14840 PUSHSTACKi(PERLSI_REGCOMP); 14841 ENTER ; 14842 SAVETMPS; 14843 save_re_context(); 14844 14845 PUSHMARK(SP) ; 14846 XPUSHs(character_name); 14847 PUTBACK; 14848 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR); 14849 14850 SPAGAIN ; 14851 14852 character = POPs; 14853 SvREFCNT_inc_simple_void_NN(character); 14854 14855 PUTBACK ; 14856 FREETMPS ; 14857 LEAVE ; 14858 POPSTACK; 14859 14860 if (! SvOK(character)) { 14861 goto failed; 14862 } 14863 14864 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len); 14865 if (character_len == SvCUR(character)) { 14866 prop_definition = add_cp_to_invlist(NULL, cp); 14867 } 14868 else { 14869 AV * this_string; 14870 14871 /* First of the remaining characters in the string. */ 14872 char * remaining = SvPVX(character) + character_len; 14873 14874 if (strings == NULL) { 14875 goto failed; /* XXX Perhaps a specific msg instead, like 14876 'not available here' */ 14877 } 14878 14879 if (*strings == NULL) { 14880 *strings = newAV(); 14881 } 14882 14883 this_string = newAV(); 14884 av_push_simple(this_string, newSVuv(cp)); 14885 14886 do { 14887 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len); 14888 av_push_simple(this_string, newSVuv(cp)); 14889 remaining += character_len; 14890 } while (remaining < SvEND(character)); 14891 14892 av_push_simple(*strings, (SV *) this_string); 14893 } 14894 14895 return prop_definition; 14896 } 14897 14898 /* Certain properties whose values are numeric need special handling. 14899 * They may optionally be prefixed by 'is'. Ignore that prefix for the 14900 * purposes of checking if this is one of those properties */ 14901 if (memBEGINPs(lookup_name, j, "is")) { 14902 lookup_offset = 2; 14903 } 14904 14905 /* Then check if it is one of these specially-handled properties. The 14906 * possibilities are hard-coded because easier this way, and the list 14907 * is unlikely to change. 14908 * 14909 * All numeric value type properties are of this ilk, and are also 14910 * special in a different way later on. So find those first. There 14911 * are several numeric value type properties in the Unihan DB (which is 14912 * unlikely to be compiled with perl, but we handle it here in case it 14913 * does get compiled). They all end with 'numeric'. The interiors 14914 * aren't checked for the precise property. This would stop working if 14915 * a cjk property were to be created that ended with 'numeric' and 14916 * wasn't a numeric type */ 14917 is_nv_type = memEQs(lookup_name + lookup_offset, 14918 j - 1 - lookup_offset, "numericvalue") 14919 || memEQs(lookup_name + lookup_offset, 14920 j - 1 - lookup_offset, "nv") 14921 || ( memENDPs(lookup_name + lookup_offset, 14922 j - 1 - lookup_offset, "numeric") 14923 && ( memBEGINPs(lookup_name + lookup_offset, 14924 j - 1 - lookup_offset, "cjk") 14925 || memBEGINPs(lookup_name + lookup_offset, 14926 j - 1 - lookup_offset, "k"))); 14927 if ( is_nv_type 14928 || memEQs(lookup_name + lookup_offset, 14929 j - 1 - lookup_offset, "canonicalcombiningclass") 14930 || memEQs(lookup_name + lookup_offset, 14931 j - 1 - lookup_offset, "ccc") 14932 || memEQs(lookup_name + lookup_offset, 14933 j - 1 - lookup_offset, "age") 14934 || memEQs(lookup_name + lookup_offset, 14935 j - 1 - lookup_offset, "in") 14936 || memEQs(lookup_name + lookup_offset, 14937 j - 1 - lookup_offset, "presentin")) 14938 { 14939 unsigned int k; 14940 14941 /* Since the stuff after the '=' is a number, we can't throw away 14942 * '-' willy-nilly, as those could be a minus sign. Other stricter 14943 * rules also apply. However, these properties all can have the 14944 * rhs not be a number, in which case they contain at least one 14945 * alphabetic. In those cases, the stricter rules don't apply. 14946 * But the numeric type properties can have the alphas [Ee] to 14947 * signify an exponent, and it is still a number with stricter 14948 * rules. So look for an alpha that signifies not-strict */ 14949 stricter = Strict; 14950 for (k = i; k < name_len; k++) { 14951 if ( isALPHA_A(name[k]) 14952 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E'))) 14953 { 14954 stricter = Not_Strict; 14955 break; 14956 } 14957 } 14958 } 14959 14960 if (stricter) { 14961 14962 /* A number may have a leading '+' or '-'. The latter is retained 14963 * */ 14964 if (name[i] == '+') { 14965 i++; 14966 } 14967 else if (name[i] == '-') { 14968 lookup_name[j++] = '-'; 14969 i++; 14970 } 14971 14972 /* Skip leading zeros including single underscores separating the 14973 * zeros, or between the final leading zero and the first other 14974 * digit */ 14975 for (; i < name_len - 1; i++) { 14976 if ( name[i] != '0' 14977 && (name[i] != '_' || ! isDIGIT_A(name[i+1]))) 14978 { 14979 break; 14980 } 14981 } 14982 14983 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by 14984 * underling libc implementation. */ 14985 if ( i == name_len - 1 14986 && name[name_len-1] == '0' 14987 && lookup_name[j-1] == '-') 14988 { 14989 j--; 14990 } 14991 } 14992 } 14993 else { /* No '=' */ 14994 14995 /* Only a few properties without an '=' should be parsed with stricter 14996 * rules. The list is unlikely to change. */ 14997 if ( memBEGINPs(lookup_name, j, "perl") 14998 && memNEs(lookup_name + 4, j - 4, "space") 14999 && memNEs(lookup_name + 4, j - 4, "word")) 15000 { 15001 stricter = Strict; 15002 15003 /* We set the inputs back to 0 and the code below will reparse, 15004 * using strict */ 15005 i = i_zero; 15006 j = 0; 15007 } 15008 } 15009 15010 /* Here, we have either finished the property, or are positioned to parse 15011 * the remainder, and we know if stricter rules apply. Finish out, if not 15012 * already done */ 15013 for (; i < name_len; i++) { 15014 char cur = name[i]; 15015 15016 /* In all instances, case differences are ignored, and we normalize to 15017 * lowercase */ 15018 if (isUPPER_A(cur)) { 15019 lookup_name[j++] = toLOWER(cur); 15020 continue; 15021 } 15022 15023 /* An underscore is skipped, but not under strict rules unless it 15024 * separates two digits */ 15025 if (cur == '_') { 15026 if ( stricter 15027 && ( i == i_zero || (int) i == equals_pos || i == name_len- 1 15028 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1]))) 15029 { 15030 lookup_name[j++] = '_'; 15031 } 15032 continue; 15033 } 15034 15035 /* Hyphens are skipped except under strict */ 15036 if (cur == '-' && ! stricter) { 15037 continue; 15038 } 15039 15040 /* XXX Bug in documentation. It says white space skipped adjacent to 15041 * non-word char. Maybe we should, but shouldn't skip it next to a dot 15042 * in a number */ 15043 if (isSPACE_A(cur) && ! stricter) { 15044 continue; 15045 } 15046 15047 lookup_name[j++] = cur; 15048 15049 /* Unless this is a non-trailing slash, we are done with it */ 15050 if (i >= name_len - 1 || cur != '/') { 15051 continue; 15052 } 15053 15054 slash_pos = j; 15055 15056 /* A slash in the 'numeric value' property indicates that what follows 15057 * is a denominator. It can have a leading '+' and '0's that should be 15058 * skipped. But we have never allowed a negative denominator, so treat 15059 * a minus like every other character. (No need to rule out a second 15060 * '/', as that won't match anything anyway */ 15061 if (is_nv_type) { 15062 i++; 15063 if (i < name_len && name[i] == '+') { 15064 i++; 15065 } 15066 15067 /* Skip leading zeros including underscores separating digits */ 15068 for (; i < name_len - 1; i++) { 15069 if ( name[i] != '0' 15070 && (name[i] != '_' || ! isDIGIT_A(name[i+1]))) 15071 { 15072 break; 15073 } 15074 } 15075 15076 /* Store the first real character in the denominator */ 15077 if (i < name_len) { 15078 lookup_name[j++] = name[i]; 15079 } 15080 } 15081 } 15082 15083 /* Here are completely done parsing the input 'name', and 'lookup_name' 15084 * contains a copy, normalized. 15085 * 15086 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and 15087 * different from without the underscores. */ 15088 if ( ( UNLIKELY(memEQs(lookup_name, j, "l")) 15089 || UNLIKELY(memEQs(lookup_name, j, "gc=l"))) 15090 && UNLIKELY(name[name_len-1] == '_')) 15091 { 15092 lookup_name[j++] = '&'; 15093 } 15094 15095 /* If the original input began with 'In' or 'Is', it could be a subroutine 15096 * call to a user-defined property instead of a Unicode property name. */ 15097 if ( name_len - non_pkg_begin > 2 15098 && name[non_pkg_begin+0] == 'I' 15099 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's')) 15100 { 15101 /* Names that start with In have different characteristics than those 15102 * that start with Is */ 15103 if (name[non_pkg_begin+1] == 's') { 15104 starts_with_Is = TRUE; 15105 } 15106 } 15107 else { 15108 could_be_user_defined = FALSE; 15109 } 15110 15111 if (could_be_user_defined) { 15112 CV* user_sub; 15113 15114 /* If the user defined property returns the empty string, it could 15115 * easily be because the pattern is being compiled before the data it 15116 * actually needs to compile is available. This could be argued to be 15117 * a bug in the perl code, but this is a change of behavior for Perl, 15118 * so we handle it. This means that intentionally returning nothing 15119 * will not be resolved until runtime */ 15120 bool empty_return = FALSE; 15121 15122 /* Here, the name could be for a user defined property, which are 15123 * implemented as subs. */ 15124 user_sub = get_cvn_flags(name, name_len, 0); 15125 if (! user_sub) { 15126 15127 /* Here, the property name could be a user-defined one, but there 15128 * is no subroutine to handle it (as of now). Defer handling it 15129 * until runtime. Otherwise, a block defined by Unicode in a later 15130 * release would get the synonym InFoo added for it, and existing 15131 * code that used that name would suddenly break if it referred to 15132 * the property before the sub was declared. See [perl #134146] */ 15133 if (deferrable) { 15134 goto definition_deferred; 15135 } 15136 15137 /* Here, we are at runtime, and didn't find the user property. It 15138 * could be an official property, but only if no package was 15139 * specified, or just the utf8:: package. */ 15140 if (could_be_deferred_official) { 15141 lookup_name += lun_non_pkg_begin; 15142 j -= lun_non_pkg_begin; 15143 } 15144 else if (! stripped_utf8_pkg) { 15145 goto unknown_user_defined; 15146 } 15147 15148 /* Drop down to look up in the official properties */ 15149 } 15150 else { 15151 const char insecure[] = "Insecure user-defined property"; 15152 15153 /* Here, there is a sub by the correct name. Normally we call it 15154 * to get the property definition */ 15155 dSP; 15156 SV * user_sub_sv = MUTABLE_SV(user_sub); 15157 SV * error; /* Any error returned by calling 'user_sub' */ 15158 SV * key; /* The key into the hash of user defined sub names 15159 */ 15160 SV * placeholder; 15161 SV ** saved_user_prop_ptr; /* Hash entry for this property */ 15162 15163 /* How many times to retry when another thread is in the middle of 15164 * expanding the same definition we want */ 15165 PERL_INT_FAST8_T retry_countdown = 10; 15166 15167 DECLARATION_FOR_GLOBAL_CONTEXT; 15168 15169 /* If we get here, we know this property is user-defined */ 15170 *user_defined_ptr = TRUE; 15171 15172 /* We refuse to call a potentially tainted subroutine; returning an 15173 * error instead */ 15174 if (TAINT_get) { 15175 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15176 sv_catpvn(msg, insecure, sizeof(insecure) - 1); 15177 goto append_name_to_msg; 15178 } 15179 15180 /* In principal, we only call each subroutine property definition 15181 * once during the life of the program. This guarantees that the 15182 * property definition never changes. The results of the single 15183 * sub call are stored in a hash, which is used instead for future 15184 * references to this property. The property definition is thus 15185 * immutable. But, to allow the user to have a /i-dependent 15186 * definition, we call the sub once for non-/i, and once for /i, 15187 * should the need arise, passing the /i status as a parameter. 15188 * 15189 * We start by constructing the hash key name, consisting of the 15190 * fully qualified subroutine name, preceded by the /i status, so 15191 * that there is a key for /i and a different key for non-/i */ 15192 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP); 15193 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8, 15194 non_pkg_begin != 0); 15195 sv_catsv(key, fq_name); 15196 15197 /* We only call the sub once throughout the life of the program 15198 * (with the /i, non-/i exception noted above). That means the 15199 * hash must be global and accessible to all threads. It is 15200 * created at program start-up, before any threads are created, so 15201 * is accessible to all children. But this creates some 15202 * complications. 15203 * 15204 * 1) The keys can't be shared, or else problems arise; sharing is 15205 * turned off at hash creation time 15206 * 2) All SVs in it are there for the remainder of the life of the 15207 * program, and must be created in the same interpreter context 15208 * as the hash, or else they will be freed from the wrong pool 15209 * at global destruction time. This is handled by switching to 15210 * the hash's context to create each SV going into it, and then 15211 * immediately switching back 15212 * 3) All accesses to the hash must be controlled by a mutex, to 15213 * prevent two threads from getting an unstable state should 15214 * they simultaneously be accessing it. The code below is 15215 * crafted so that the mutex is locked whenever there is an 15216 * access and unlocked only when the next stable state is 15217 * achieved. 15218 * 15219 * The hash stores either the definition of the property if it was 15220 * valid, or, if invalid, the error message that was raised. We 15221 * use the type of SV to distinguish. 15222 * 15223 * There's also the need to guard against the definition expansion 15224 * from infinitely recursing. This is handled by storing the aTHX 15225 * of the expanding thread during the expansion. Again the SV type 15226 * is used to distinguish this from the other two cases. If we 15227 * come to here and the hash entry for this property is our aTHX, 15228 * it means we have recursed, and the code assumes that we would 15229 * infinitely recurse, so instead stops and raises an error. 15230 * (Any recursion has always been treated as infinite recursion in 15231 * this feature.) 15232 * 15233 * If instead, the entry is for a different aTHX, it means that 15234 * that thread has gotten here first, and hasn't finished expanding 15235 * the definition yet. We just have to wait until it is done. We 15236 * sleep and retry a few times, returning an error if the other 15237 * thread doesn't complete. */ 15238 15239 re_fetch: 15240 USER_PROP_MUTEX_LOCK; 15241 15242 /* If we have an entry for this key, the subroutine has already 15243 * been called once with this /i status. */ 15244 saved_user_prop_ptr = hv_fetch(PL_user_def_props, 15245 SvPVX(key), SvCUR(key), 0); 15246 if (saved_user_prop_ptr) { 15247 15248 /* If the saved result is an inversion list, it is the valid 15249 * definition of this property */ 15250 if (is_invlist(*saved_user_prop_ptr)) { 15251 prop_definition = *saved_user_prop_ptr; 15252 15253 /* The SV in the hash won't be removed until global 15254 * destruction, so it is stable and we can unlock */ 15255 USER_PROP_MUTEX_UNLOCK; 15256 15257 /* The caller shouldn't try to free this SV */ 15258 return prop_definition; 15259 } 15260 15261 /* Otherwise, if it is a string, it is the error message 15262 * that was returned when we first tried to evaluate this 15263 * property. Fail, and append the message */ 15264 if (SvPOK(*saved_user_prop_ptr)) { 15265 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15266 sv_catsv(msg, *saved_user_prop_ptr); 15267 15268 /* The SV in the hash won't be removed until global 15269 * destruction, so it is stable and we can unlock */ 15270 USER_PROP_MUTEX_UNLOCK; 15271 15272 return NULL; 15273 } 15274 15275 assert(SvIOK(*saved_user_prop_ptr)); 15276 15277 /* Here, we have an unstable entry in the hash. Either another 15278 * thread is in the middle of expanding the property's 15279 * definition, or we are ourselves recursing. We use the aTHX 15280 * in it to distinguish */ 15281 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) { 15282 15283 /* Here, it's another thread doing the expanding. We've 15284 * looked as much as we are going to at the contents of the 15285 * hash entry. It's safe to unlock. */ 15286 USER_PROP_MUTEX_UNLOCK; 15287 15288 /* Retry a few times */ 15289 if (retry_countdown-- > 0) { 15290 PerlProc_sleep(1); 15291 goto re_fetch; 15292 } 15293 15294 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15295 sv_catpvs(msg, "Timeout waiting for another thread to " 15296 "define"); 15297 goto append_name_to_msg; 15298 } 15299 15300 /* Here, we are recursing; don't dig any deeper */ 15301 USER_PROP_MUTEX_UNLOCK; 15302 15303 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15304 sv_catpvs(msg, 15305 "Infinite recursion in user-defined property"); 15306 goto append_name_to_msg; 15307 } 15308 15309 /* Here, this thread has exclusive control, and there is no entry 15310 * for this property in the hash. So we have the go ahead to 15311 * expand the definition ourselves. */ 15312 15313 PUSHSTACKi(PERLSI_REGCOMP); 15314 ENTER; 15315 15316 /* Create a temporary placeholder in the hash to detect recursion 15317 * */ 15318 SWITCH_TO_GLOBAL_CONTEXT; 15319 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT)); 15320 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0); 15321 RESTORE_CONTEXT; 15322 15323 /* Now that we have a placeholder, we can let other threads 15324 * continue */ 15325 USER_PROP_MUTEX_UNLOCK; 15326 15327 /* Make sure the placeholder always gets destroyed */ 15328 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key)); 15329 15330 PUSHMARK(SP); 15331 SAVETMPS; 15332 15333 /* Call the user's function, with the /i status as a parameter. 15334 * Note that we have gone to a lot of trouble to keep this call 15335 * from being within the locked mutex region. */ 15336 XPUSHs(boolSV(to_fold)); 15337 PUTBACK; 15338 15339 /* The following block was taken from swash_init(). Presumably 15340 * they apply to here as well, though we no longer use a swash -- 15341 * khw */ 15342 SAVEHINTS(); 15343 save_re_context(); 15344 /* We might get here via a subroutine signature which uses a utf8 15345 * parameter name, at which point PL_subname will have been set 15346 * but not yet used. */ 15347 save_item(PL_subname); 15348 15349 /* G_SCALAR guarantees a single return value */ 15350 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR); 15351 15352 SPAGAIN; 15353 15354 error = ERRSV; 15355 if (TAINT_get || SvTRUE(error)) { 15356 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15357 if (SvTRUE(error)) { 15358 sv_catpvs(msg, "Error \""); 15359 sv_catsv(msg, error); 15360 sv_catpvs(msg, "\""); 15361 } 15362 if (TAINT_get) { 15363 if (SvTRUE(error)) sv_catpvs(msg, "; "); 15364 sv_catpvn(msg, insecure, sizeof(insecure) - 1); 15365 } 15366 15367 if (name_len > 0) { 15368 sv_catpvs(msg, " in expansion of "); 15369 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, 15370 name_len, 15371 name)); 15372 } 15373 15374 (void) POPs; 15375 prop_definition = NULL; 15376 } 15377 else { 15378 SV * contents = POPs; 15379 15380 /* The contents is supposed to be the expansion of the property 15381 * definition. If the definition is deferrable, and we got an 15382 * empty string back, set a flag to later defer it (after clean 15383 * up below). */ 15384 if ( deferrable 15385 && (! SvPOK(contents) || SvCUR(contents) == 0)) 15386 { 15387 empty_return = TRUE; 15388 } 15389 else { /* Otherwise, call a function to check for valid syntax, 15390 and handle it */ 15391 15392 prop_definition = handle_user_defined_property( 15393 name, name_len, 15394 is_utf8, to_fold, runtime, 15395 deferrable, 15396 contents, user_defined_ptr, 15397 msg, 15398 level); 15399 } 15400 } 15401 15402 /* Here, we have the results of the expansion. Delete the 15403 * placeholder, and if the definition is now known, replace it with 15404 * that definition. We need exclusive access to the hash, and we 15405 * can't let anyone else in, between when we delete the placeholder 15406 * and add the permanent entry */ 15407 USER_PROP_MUTEX_LOCK; 15408 15409 S_delete_recursion_entry(aTHX_ SvPVX(key)); 15410 15411 if ( ! empty_return 15412 && (! prop_definition || is_invlist(prop_definition))) 15413 { 15414 /* If we got success we use the inversion list defining the 15415 * property; otherwise use the error message */ 15416 SWITCH_TO_GLOBAL_CONTEXT; 15417 (void) hv_store_ent(PL_user_def_props, 15418 key, 15419 ((prop_definition) 15420 ? newSVsv(prop_definition) 15421 : newSVsv(msg)), 15422 0); 15423 RESTORE_CONTEXT; 15424 } 15425 15426 /* All done, and the hash now has a permanent entry for this 15427 * property. Give up exclusive control */ 15428 USER_PROP_MUTEX_UNLOCK; 15429 15430 FREETMPS; 15431 LEAVE; 15432 POPSTACK; 15433 15434 if (empty_return) { 15435 goto definition_deferred; 15436 } 15437 15438 if (prop_definition) { 15439 15440 /* If the definition is for something not known at this time, 15441 * we toss it, and go return the main property name, as that's 15442 * the one the user will be aware of */ 15443 if (! is_invlist(prop_definition)) { 15444 SvREFCNT_dec_NN(prop_definition); 15445 goto definition_deferred; 15446 } 15447 15448 sv_2mortal(prop_definition); 15449 } 15450 15451 /* And return */ 15452 return prop_definition; 15453 15454 } /* End of calling the subroutine for the user-defined property */ 15455 } /* End of it could be a user-defined property */ 15456 15457 /* Here it wasn't a user-defined property that is known at this time. See 15458 * if it is a Unicode property */ 15459 15460 lookup_len = j; /* This is a more mnemonic name than 'j' */ 15461 15462 /* Get the index into our pointer table of the inversion list corresponding 15463 * to the property */ 15464 table_index = do_uniprop_match(lookup_name, lookup_len); 15465 15466 /* If it didn't find the property ... */ 15467 if (table_index == 0) { 15468 15469 /* Try again stripping off any initial 'Is'. This is because we 15470 * promise that an initial Is is optional. The same isn't true of 15471 * names that start with 'In'. Those can match only blocks, and the 15472 * lookup table already has those accounted for. The lookup table also 15473 * has already accounted for Perl extensions (without and = sign) 15474 * starting with 'i's'. */ 15475 if (starts_with_Is && equals_pos >= 0) { 15476 lookup_name += 2; 15477 lookup_len -= 2; 15478 equals_pos -= 2; 15479 slash_pos -= 2; 15480 15481 table_index = do_uniprop_match(lookup_name, lookup_len); 15482 } 15483 15484 if (table_index == 0) { 15485 char * canonical; 15486 15487 /* Here, we didn't find it. If not a numeric type property, and 15488 * can't be a user-defined one, it isn't a legal property */ 15489 if (! is_nv_type) { 15490 if (! could_be_user_defined) { 15491 goto failed; 15492 } 15493 15494 /* Here, the property name is legal as a user-defined one. At 15495 * compile time, it might just be that the subroutine for that 15496 * property hasn't been encountered yet, but at runtime, it's 15497 * an error to try to use an undefined one */ 15498 if (! deferrable) { 15499 goto unknown_user_defined;; 15500 } 15501 15502 goto definition_deferred; 15503 } /* End of isn't a numeric type property */ 15504 15505 /* The numeric type properties need more work to decide. What we 15506 * do is make sure we have the number in canonical form and look 15507 * that up. */ 15508 15509 if (slash_pos < 0) { /* No slash */ 15510 15511 /* When it isn't a rational, take the input, convert it to a 15512 * NV, then create a canonical string representation of that 15513 * NV. */ 15514 15515 NV value; 15516 SSize_t value_len = lookup_len - equals_pos; 15517 15518 /* Get the value */ 15519 if ( value_len <= 0 15520 || my_atof3(lookup_name + equals_pos, &value, 15521 value_len) 15522 != lookup_name + lookup_len) 15523 { 15524 goto failed; 15525 } 15526 15527 /* If the value is an integer, the canonical value is integral 15528 * */ 15529 if (Perl_ceil(value) == value) { 15530 canonical = Perl_form(aTHX_ "%.*s%.0" NVff, 15531 equals_pos, lookup_name, value); 15532 } 15533 else { /* Otherwise, it is %e with a known precision */ 15534 char * exp_ptr; 15535 15536 canonical = Perl_form(aTHX_ "%.*s%.*" NVef, 15537 equals_pos, lookup_name, 15538 PL_E_FORMAT_PRECISION, value); 15539 15540 /* The exponent generated is expecting two digits, whereas 15541 * %e on some systems will generate three. Remove leading 15542 * zeros in excess of 2 from the exponent. We start 15543 * looking for them after the '=' */ 15544 exp_ptr = strchr(canonical + equals_pos, 'e'); 15545 if (exp_ptr) { 15546 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */ 15547 SSize_t excess_exponent_len = strlen(cur_ptr) - 2; 15548 15549 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+'); 15550 15551 if (excess_exponent_len > 0) { 15552 SSize_t leading_zeros = strspn(cur_ptr, "0"); 15553 SSize_t excess_leading_zeros 15554 = MIN(leading_zeros, excess_exponent_len); 15555 if (excess_leading_zeros > 0) { 15556 Move(cur_ptr + excess_leading_zeros, 15557 cur_ptr, 15558 strlen(cur_ptr) - excess_leading_zeros 15559 + 1, /* Copy the NUL as well */ 15560 char); 15561 } 15562 } 15563 } 15564 } 15565 } 15566 else { /* Has a slash. Create a rational in canonical form */ 15567 UV numerator, denominator, gcd, trial; 15568 const char * end_ptr; 15569 const char * sign = ""; 15570 15571 /* We can't just find the numerator, denominator, and do the 15572 * division, then use the method above, because that is 15573 * inexact. And the input could be a rational that is within 15574 * epsilon (given our precision) of a valid rational, and would 15575 * then incorrectly compare valid. 15576 * 15577 * We're only interested in the part after the '=' */ 15578 const char * this_lookup_name = lookup_name + equals_pos; 15579 lookup_len -= equals_pos; 15580 slash_pos -= equals_pos; 15581 15582 /* Handle any leading minus */ 15583 if (this_lookup_name[0] == '-') { 15584 sign = "-"; 15585 this_lookup_name++; 15586 lookup_len--; 15587 slash_pos--; 15588 } 15589 15590 /* Convert the numerator to numeric */ 15591 end_ptr = this_lookup_name + slash_pos; 15592 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) { 15593 goto failed; 15594 } 15595 15596 /* It better have included all characters before the slash */ 15597 if (*end_ptr != '/') { 15598 goto failed; 15599 } 15600 15601 /* Set to look at just the denominator */ 15602 this_lookup_name += slash_pos; 15603 lookup_len -= slash_pos; 15604 end_ptr = this_lookup_name + lookup_len; 15605 15606 /* Convert the denominator to numeric */ 15607 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) { 15608 goto failed; 15609 } 15610 15611 /* It better be the rest of the characters, and don't divide by 15612 * 0 */ 15613 if ( end_ptr != this_lookup_name + lookup_len 15614 || denominator == 0) 15615 { 15616 goto failed; 15617 } 15618 15619 /* Get the greatest common denominator using 15620 http://en.wikipedia.org/wiki/Euclidean_algorithm */ 15621 gcd = numerator; 15622 trial = denominator; 15623 while (trial != 0) { 15624 UV temp = trial; 15625 trial = gcd % trial; 15626 gcd = temp; 15627 } 15628 15629 /* If already in lowest possible terms, we have already tried 15630 * looking this up */ 15631 if (gcd == 1) { 15632 goto failed; 15633 } 15634 15635 /* Reduce the rational, which should put it in canonical form 15636 * */ 15637 numerator /= gcd; 15638 denominator /= gcd; 15639 15640 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf, 15641 equals_pos, lookup_name, sign, numerator, denominator); 15642 } 15643 15644 /* Here, we have the number in canonical form. Try that */ 15645 table_index = do_uniprop_match(canonical, strlen(canonical)); 15646 if (table_index == 0) { 15647 goto failed; 15648 } 15649 } /* End of still didn't find the property in our table */ 15650 } /* End of didn't find the property in our table */ 15651 15652 /* Here, we have a non-zero return, which is an index into a table of ptrs. 15653 * A negative return signifies that the real index is the absolute value, 15654 * but the result needs to be inverted */ 15655 if (table_index < 0) { 15656 invert_return = TRUE; 15657 table_index = -table_index; 15658 } 15659 15660 /* Out-of band indices indicate a deprecated property. The proper index is 15661 * modulo it with the table size. And dividing by the table size yields 15662 * an offset into a table constructed by regen/mk_invlists.pl to contain 15663 * the corresponding warning message */ 15664 if (table_index > MAX_UNI_KEYWORD_INDEX) { 15665 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX; 15666 table_index %= MAX_UNI_KEYWORD_INDEX; 15667 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME), 15668 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s", 15669 (int) name_len, name, 15670 get_deprecated_property_msg(warning_offset)); 15671 } 15672 15673 /* In a few properties, a different property is used under /i. These are 15674 * unlikely to change, so are hard-coded here. */ 15675 if (to_fold) { 15676 if ( table_index == UNI_XPOSIXUPPER 15677 || table_index == UNI_XPOSIXLOWER 15678 || table_index == UNI_TITLE) 15679 { 15680 table_index = UNI_CASED; 15681 } 15682 else if ( table_index == UNI_UPPERCASELETTER 15683 || table_index == UNI_LOWERCASELETTER 15684# ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */ 15685 || table_index == UNI_TITLECASELETTER 15686# endif 15687 ) { 15688 table_index = UNI_CASEDLETTER; 15689 } 15690 else if ( table_index == UNI_POSIXUPPER 15691 || table_index == UNI_POSIXLOWER) 15692 { 15693 table_index = UNI_POSIXALPHA; 15694 } 15695 } 15696 15697 /* Create and return the inversion list */ 15698 prop_definition = get_prop_definition(table_index); 15699 sv_2mortal(prop_definition); 15700 15701 /* See if there is a private use override to add to this definition */ 15702 { 15703 COPHH * hinthash = (IN_PERL_COMPILETIME) 15704 ? CopHINTHASH_get(&PL_compiling) 15705 : CopHINTHASH_get(PL_curcop); 15706 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0); 15707 15708 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) { 15709 15710 /* See if there is an element in the hints hash for this table */ 15711 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index); 15712 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup)); 15713 15714 if (pos) { 15715 bool dummy; 15716 SV * pu_definition; 15717 SV * pu_invlist; 15718 SV * expanded_prop_definition = 15719 sv_2mortal(invlist_clone(prop_definition, NULL)); 15720 15721 /* If so, it's definition is the string from here to the next 15722 * \a character. And its format is the same as a user-defined 15723 * property */ 15724 pos += SvCUR(pu_lookup); 15725 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos); 15726 pu_invlist = handle_user_defined_property(lookup_name, 15727 lookup_len, 15728 0, /* Not UTF-8 */ 15729 0, /* Not folded */ 15730 runtime, 15731 deferrable, 15732 pu_definition, 15733 &dummy, 15734 msg, 15735 level); 15736 if (TAINT_get) { 15737 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15738 sv_catpvs(msg, "Insecure private-use override"); 15739 goto append_name_to_msg; 15740 } 15741 15742 /* For now, as a safety measure, make sure that it doesn't 15743 * override non-private use code points */ 15744 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist); 15745 15746 /* Add it to the list to be returned */ 15747 _invlist_union(prop_definition, pu_invlist, 15748 &expanded_prop_definition); 15749 prop_definition = expanded_prop_definition; 15750 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental"); 15751 } 15752 } 15753 } 15754 15755 if (invert_return) { 15756 _invlist_invert(prop_definition); 15757 } 15758 return prop_definition; 15759 15760 unknown_user_defined: 15761 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15762 sv_catpvs(msg, "Unknown user-defined property name"); 15763 goto append_name_to_msg; 15764 15765 failed: 15766 if (non_pkg_begin != 0) { 15767 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15768 sv_catpvs(msg, "Illegal user-defined property name"); 15769 } 15770 else { 15771 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15772 sv_catpvs(msg, "Can't find Unicode property definition"); 15773 } 15774 /* FALLTHROUGH */ 15775 15776 append_name_to_msg: 15777 { 15778 const char * prefix = (runtime && level == 0) ? " \\p{" : " \""; 15779 const char * suffix = (runtime && level == 0) ? "}" : "\""; 15780 15781 sv_catpv(msg, prefix); 15782 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); 15783 sv_catpv(msg, suffix); 15784 } 15785 15786 return NULL; 15787 15788 definition_deferred: 15789 15790 { 15791 bool is_qualified = non_pkg_begin != 0; /* If has "::" */ 15792 15793 /* Here it could yet to be defined, so defer evaluation of this until 15794 * its needed at runtime. We need the fully qualified property name to 15795 * avoid ambiguity */ 15796 if (! fq_name) { 15797 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8, 15798 is_qualified); 15799 } 15800 15801 /* If it didn't come with a package, or the package is utf8::, this 15802 * actually could be an official Unicode property whose inclusion we 15803 * are deferring until runtime to make sure that it isn't overridden by 15804 * a user-defined property of the same name (which we haven't 15805 * encountered yet). Add a marker to indicate this possibility, for 15806 * use at such time when we first need the definition during pattern 15807 * matching execution */ 15808 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) { 15809 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs); 15810 } 15811 15812 /* We also need a trailing newline */ 15813 sv_catpvs(fq_name, "\n"); 15814 15815 *user_defined_ptr = TRUE; 15816 return fq_name; 15817 } 15818} 15819 15820STATIC bool 15821S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ 15822 const STRLEN wname_len, /* Its length */ 15823 SV ** prop_definition, 15824 AV ** strings) 15825{ 15826 /* Deal with Name property wildcard subpatterns; returns TRUE if there were 15827 * any matches, adding them to prop_definition */ 15828 15829 dSP; 15830 15831 CV * get_names_info; /* entry to charnames.pm to get info we need */ 15832 SV * names_string; /* Contains all character names, except algo */ 15833 SV * algorithmic_names; /* Contains info about algorithmically 15834 generated character names */ 15835 REGEXP * subpattern_re; /* The user's pattern to match with */ 15836 struct regexp * prog; /* The compiled pattern */ 15837 char * all_names_start; /* lib/unicore/Name.pl string of every 15838 (non-algorithmic) character name */ 15839 char * cur_pos; /* We match, effectively using /gc; this is 15840 where we are now */ 15841 bool found_matches = FALSE; /* Did any name match so far? */ 15842 SV * empty; /* For matching zero length names */ 15843 SV * must_sv; /* Contains the substring, if any, that must be 15844 in a name for the subpattern to match */ 15845 const char * must; /* The PV of 'must' */ 15846 STRLEN must_len; /* And its length */ 15847 SV * syllable_name = NULL; /* For Hangul syllables */ 15848 const char hangul_prefix[] = "HANGUL SYLLABLE "; 15849 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1; 15850 15851 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul 15852 * syllable name, and these are immutable and guaranteed by the Unicode 15853 * standard to never be extended */ 15854 const STRLEN syl_max_len = hangul_prefix_len + 7; 15855 15856 IV i; 15857 15858 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD; 15859 15860 /* Make sure _charnames is loaded. (The parameters give context 15861 * for any errors generated */ 15862 get_names_info = get_cv("_charnames::_get_names_info", 0); 15863 if (! get_names_info) { 15864 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info"); 15865 } 15866 15867 /* Get the charnames data */ 15868 PUSHSTACKi(PERLSI_REGCOMP); 15869 ENTER ; 15870 SAVETMPS; 15871 save_re_context(); 15872 15873 PUSHMARK(SP) ; 15874 PUTBACK; 15875 15876 /* Special _charnames entry point that returns the info this routine 15877 * requires */ 15878 call_sv(MUTABLE_SV(get_names_info), G_LIST); 15879 15880 SPAGAIN ; 15881 15882 /* Data structure for names which end in their very own code points */ 15883 algorithmic_names = POPs; 15884 SvREFCNT_inc_simple_void_NN(algorithmic_names); 15885 15886 /* The lib/unicore/Name.pl string */ 15887 names_string = POPs; 15888 SvREFCNT_inc_simple_void_NN(names_string); 15889 15890 PUTBACK ; 15891 FREETMPS ; 15892 LEAVE ; 15893 POPSTACK; 15894 15895 if ( ! SvROK(names_string) 15896 || ! SvROK(algorithmic_names)) 15897 { /* Perhaps should panic instead XXX */ 15898 SvREFCNT_dec(names_string); 15899 SvREFCNT_dec(algorithmic_names); 15900 return FALSE; 15901 } 15902 15903 names_string = sv_2mortal(SvRV(names_string)); 15904 all_names_start = SvPVX(names_string); 15905 cur_pos = all_names_start; 15906 15907 algorithmic_names= sv_2mortal(SvRV(algorithmic_names)); 15908 15909 /* Compile the subpattern consisting of the name being looked for */ 15910 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ ); 15911 15912 must_sv = re_intuit_string(subpattern_re); 15913 if (must_sv) { 15914 /* regexec.c can free the re_intuit_string() return. GH #17734 */ 15915 must_sv = sv_2mortal(newSVsv(must_sv)); 15916 must = SvPV(must_sv, must_len); 15917 } 15918 else { 15919 must = ""; 15920 must_len = 0; 15921 } 15922 15923 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it. 15924 * This works because the NUL causes the function to return early, thus 15925 * showing that there are characters in it other than the acceptable ones, 15926 * which is our desired result.) */ 15927 15928 prog = ReANY(subpattern_re); 15929 15930 /* If only nothing is matched, skip to where empty names are looked for */ 15931 if (prog->maxlen == 0) { 15932 goto check_empty; 15933 } 15934 15935 /* And match against the string of all names /gc. Don't even try if it 15936 * must match a character not found in any name. */ 15937 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len) 15938 { 15939 while (execute_wildcard(subpattern_re, 15940 cur_pos, 15941 SvEND(names_string), 15942 all_names_start, 0, 15943 names_string, 15944 0)) 15945 { /* Here, matched. */ 15946 15947 /* Note the string entries look like 15948 * 00001\nSTART OF HEADING\n\n 15949 * so we could match anywhere in that string. We have to rule out 15950 * matching a code point line */ 15951 char * this_name_start = all_names_start 15952 + RX_OFFS_START(subpattern_re,0); 15953 char * this_name_end = all_names_start 15954 + RX_OFFS_END(subpattern_re,0); 15955 char * cp_start; 15956 char * cp_end; 15957 UV cp = 0; /* Silences some compilers */ 15958 AV * this_string = NULL; 15959 bool is_multi = FALSE; 15960 15961 /* If matched nothing, advance to next possible match */ 15962 if (this_name_start == this_name_end) { 15963 cur_pos = (char *) memchr(this_name_end + 1, '\n', 15964 SvEND(names_string) - this_name_end); 15965 if (cur_pos == NULL) { 15966 break; 15967 } 15968 } 15969 else { 15970 /* Position the next match to start beyond the current returned 15971 * entry */ 15972 cur_pos = (char *) memchr(this_name_end, '\n', 15973 SvEND(names_string) - this_name_end); 15974 } 15975 15976 /* Back up to the \n just before the beginning of the character. */ 15977 cp_end = (char *) my_memrchr(all_names_start, 15978 '\n', 15979 this_name_start - all_names_start); 15980 15981 /* If we didn't find a \n, it means it matched somewhere in the 15982 * initial '00000' in the string, so isn't a real match */ 15983 if (cp_end == NULL) { 15984 continue; 15985 } 15986 15987 this_name_start = cp_end + 1; /* The name starts just after */ 15988 cp_end--; /* the \n, and the code point */ 15989 /* ends just before it */ 15990 15991 /* All code points are 5 digits long */ 15992 cp_start = cp_end - 4; 15993 15994 /* This shouldn't happen, as we found a \n, and the first \n is 15995 * further along than what we subtracted */ 15996 assert(cp_start >= all_names_start); 15997 15998 if (cp_start == all_names_start) { 15999 *prop_definition = add_cp_to_invlist(*prop_definition, 0); 16000 continue; 16001 } 16002 16003 /* If the character is a blank, we either have a named sequence, or 16004 * something is wrong */ 16005 if (*(cp_start - 1) == ' ') { 16006 cp_start = (char *) my_memrchr(all_names_start, 16007 '\n', 16008 cp_start - all_names_start); 16009 cp_start++; 16010 } 16011 16012 assert(cp_start != NULL && cp_start >= all_names_start + 2); 16013 16014 /* Except for the first line in the string, the sequence before the 16015 * code point is \n\n. If that isn't the case here, we didn't 16016 * match the name of a character. (We could have matched a named 16017 * sequence, not currently handled */ 16018 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') { 16019 continue; 16020 } 16021 16022 /* We matched! Add this to the list */ 16023 found_matches = TRUE; 16024 16025 /* Loop through all the code points in the sequence */ 16026 while (cp_start < cp_end) { 16027 16028 /* Calculate this code point from its 5 digits */ 16029 cp = (XDIGIT_VALUE(cp_start[0]) << 16) 16030 + (XDIGIT_VALUE(cp_start[1]) << 12) 16031 + (XDIGIT_VALUE(cp_start[2]) << 8) 16032 + (XDIGIT_VALUE(cp_start[3]) << 4) 16033 + XDIGIT_VALUE(cp_start[4]); 16034 16035 cp_start += 6; /* Go past any blank */ 16036 16037 if (cp_start < cp_end || is_multi) { 16038 if (this_string == NULL) { 16039 this_string = newAV(); 16040 } 16041 16042 is_multi = TRUE; 16043 av_push_simple(this_string, newSVuv(cp)); 16044 } 16045 } 16046 16047 if (is_multi) { /* Was more than one code point */ 16048 if (*strings == NULL) { 16049 *strings = newAV(); 16050 } 16051 16052 av_push_simple(*strings, (SV *) this_string); 16053 } 16054 else { /* Only a single code point */ 16055 *prop_definition = add_cp_to_invlist(*prop_definition, cp); 16056 } 16057 } /* End of loop through the non-algorithmic names string */ 16058 } 16059 16060 /* There are also character names not in 'names_string'. These are 16061 * algorithmically generatable. Try this pattern on each possible one. 16062 * (khw originally planned to leave this out given the large number of 16063 * matches attempted; but the speed turned out to be quite acceptable 16064 * 16065 * There are plenty of opportunities to optimize to skip many of the tests. 16066 * beyond the rudimentary ones already here */ 16067 16068 /* First see if the subpattern matches any of the algorithmic generatable 16069 * Hangul syllable names. 16070 * 16071 * We know none of these syllable names will match if the input pattern 16072 * requires more bytes than any syllable has, or if the input pattern only 16073 * matches an empty name, or if the pattern has something it must match and 16074 * one of the characters in that isn't in any Hangul syllable. */ 16075 if ( prog->minlen <= (SSize_t) syl_max_len 16076 && prog->maxlen > 0 16077 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len)) 16078 { 16079 /* These constants, names, values, and algorithm are adapted from the 16080 * Unicode standard, version 5.1, section 3.12, and should never 16081 * change. */ 16082 const char * JamoL[] = { 16083 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", 16084 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H" 16085 }; 16086 const int LCount = C_ARRAY_LENGTH(JamoL); 16087 16088 const char * JamoV[] = { 16089 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", 16090 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", 16091 "I" 16092 }; 16093 const int VCount = C_ARRAY_LENGTH(JamoV); 16094 16095 const char * JamoT[] = { 16096 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", 16097 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", 16098 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H" 16099 }; 16100 const int TCount = C_ARRAY_LENGTH(JamoT); 16101 16102 int L, V, T; 16103 16104 /* This is the initial Hangul syllable code point; each time through the 16105 * inner loop, it maps to the next higher code point. For more info, 16106 * see the Hangul syllable section of the Unicode standard. */ 16107 int cp = 0xAC00; 16108 16109 syllable_name = sv_2mortal(newSV(syl_max_len)); 16110 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len); 16111 16112 for (L = 0; L < LCount; L++) { 16113 for (V = 0; V < VCount; V++) { 16114 for (T = 0; T < TCount; T++) { 16115 16116 /* Truncate back to the prefix, which is unvarying */ 16117 SvCUR_set(syllable_name, hangul_prefix_len); 16118 16119 sv_catpv(syllable_name, JamoL[L]); 16120 sv_catpv(syllable_name, JamoV[V]); 16121 sv_catpv(syllable_name, JamoT[T]); 16122 16123 if (execute_wildcard(subpattern_re, 16124 SvPVX(syllable_name), 16125 SvEND(syllable_name), 16126 SvPVX(syllable_name), 0, 16127 syllable_name, 16128 0)) 16129 { 16130 *prop_definition = add_cp_to_invlist(*prop_definition, 16131 cp); 16132 found_matches = TRUE; 16133 } 16134 16135 cp++; 16136 } 16137 } 16138 } 16139 } 16140 16141 /* The rest of the algorithmically generatable names are of the form 16142 * "PREFIX-code_point". The prefixes and the code point limits of each 16143 * were returned to us in the array 'algorithmic_names' from data in 16144 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */ 16145 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) { 16146 IV j; 16147 16148 /* Each element of the array is a hash, giving the details for the 16149 * series of names it covers. There is the base name of the characters 16150 * in the series, and the low and high code points in the series. And, 16151 * for optimization purposes a string containing all the legal 16152 * characters that could possibly be in a name in this series. */ 16153 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0)); 16154 SV * prefix = * hv_fetchs(this_series, "name", 0); 16155 IV low = SvIV(* hv_fetchs(this_series, "low", 0)); 16156 IV high = SvIV(* hv_fetchs(this_series, "high", 0)); 16157 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0)); 16158 16159 /* Pre-allocate an SV with enough space */ 16160 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000", 16161 SvPVX(prefix))); 16162 if (high >= 0x10000) { 16163 sv_catpvs(algo_name, "0"); 16164 } 16165 16166 /* This series can be skipped entirely if the pattern requires 16167 * something longer than any name in the series, or can only match an 16168 * empty name, or contains a character not found in any name in the 16169 * series */ 16170 if ( prog->minlen <= (SSize_t) SvCUR(algo_name) 16171 && prog->maxlen > 0 16172 && (strspn(must, legal) == must_len)) 16173 { 16174 for (j = low; j <= high; j++) { /* For each code point in the series */ 16175 16176 /* Get its name, and see if it matches the subpattern */ 16177 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix), 16178 (unsigned) j); 16179 16180 if (execute_wildcard(subpattern_re, 16181 SvPVX(algo_name), 16182 SvEND(algo_name), 16183 SvPVX(algo_name), 0, 16184 algo_name, 16185 0)) 16186 { 16187 *prop_definition = add_cp_to_invlist(*prop_definition, j); 16188 found_matches = TRUE; 16189 } 16190 } 16191 } 16192 } 16193 16194 check_empty: 16195 /* Finally, see if the subpattern matches an empty string */ 16196 empty = newSVpvs(""); 16197 if (execute_wildcard(subpattern_re, 16198 SvPVX(empty), 16199 SvEND(empty), 16200 SvPVX(empty), 0, 16201 empty, 16202 0)) 16203 { 16204 /* Many code points have empty names. Currently these are the \p{GC=C} 16205 * ones, minus CC and CF */ 16206 16207 SV * empty_names_ref = get_prop_definition(UNI_C); 16208 SV * empty_names = invlist_clone(empty_names_ref, NULL); 16209 16210 SV * subtract = get_prop_definition(UNI_CC); 16211 16212 _invlist_subtract(empty_names, subtract, &empty_names); 16213 SvREFCNT_dec_NN(empty_names_ref); 16214 SvREFCNT_dec_NN(subtract); 16215 16216 subtract = get_prop_definition(UNI_CF); 16217 _invlist_subtract(empty_names, subtract, &empty_names); 16218 SvREFCNT_dec_NN(subtract); 16219 16220 _invlist_union(*prop_definition, empty_names, prop_definition); 16221 found_matches = TRUE; 16222 SvREFCNT_dec_NN(empty_names); 16223 } 16224 SvREFCNT_dec_NN(empty); 16225 16226#if 0 16227 /* If we ever were to accept aliases for, say private use names, we would 16228 * need to do something fancier to find empty names. The code below works 16229 * (at the time it was written), and is slower than the above */ 16230 const char empties_pat[] = "^."; 16231 if (strNE(name, empties_pat)) { 16232 SV * empty = newSVpvs(""); 16233 if (execute_wildcard(subpattern_re, 16234 SvPVX(empty), 16235 SvEND(empty), 16236 SvPVX(empty), 0, 16237 empty, 16238 0)) 16239 { 16240 SV * empties = NULL; 16241 16242 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties); 16243 16244 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition); 16245 SvREFCNT_dec_NN(empties); 16246 16247 found_matches = TRUE; 16248 } 16249 SvREFCNT_dec_NN(empty); 16250 } 16251#endif 16252 16253 SvREFCNT_dec_NN(subpattern_re); 16254 return found_matches; 16255} 16256 16257/* 16258 * ex: set ts=8 sts=4 sw=4 et: 16259 */ 16260