1#ifdef PERL_EXT_RE_BUILD 2#include "re_top.h" 3#endif 4 5#include "EXTERN.h" 6#define PERL_IN_REGEX_ENGINE 7#define PERL_IN_REGCOMP_ANY 8#define PERL_IN_REGCOMP_DEBUG_C 9#include "perl.h" 10 11#ifdef PERL_IN_XSUB_RE 12# include "re_comp.h" 13#else 14# include "regcomp.h" 15#endif 16 17#include "invlist_inline.h" 18#include "unicode_constants.h" 19#include "regcomp_internal.h" 20 21#ifdef DEBUGGING 22 23int 24Perl_re_printf(pTHX_ const char *fmt, ...) 25{ 26 va_list ap; 27 int result; 28 PerlIO *f= Perl_debug_log; 29 PERL_ARGS_ASSERT_RE_PRINTF; 30 va_start(ap, fmt); 31 result = PerlIO_vprintf(f, fmt, ap); 32 va_end(ap); 33 return result; 34} 35 36int 37Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) 38{ 39 va_list ap; 40 int result; 41 PerlIO *f= Perl_debug_log; 42 PERL_ARGS_ASSERT_RE_INDENTF; 43 va_start(ap, depth); 44 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); 45 result = PerlIO_vprintf(f, fmt, ap); 46 va_end(ap); 47 return result; 48} 49 50void 51Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, 52 const char *close_str) 53{ 54 PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS; 55 if (!flags) 56 return; 57 58 Perl_re_printf( aTHX_ "%s", open_str); 59 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL); 60 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL); 61 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF); 62 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR); 63 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR); 64 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL); 65 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR); 66 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND); 67 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR); 68 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS); 69 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS); 70 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY); 71 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT); 72 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY); 73 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE); 74 Perl_re_printf( aTHX_ "%s", close_str); 75} 76 77void 78Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data, 79 U32 depth, int is_inf, 80 SSize_t min, SSize_t stopmin, SSize_t delta) 81{ 82 PERL_ARGS_ASSERT_DEBUG_STUDYDATA; 83 DECLARE_AND_GET_RE_DEBUG_FLAGS; 84 85 DEBUG_OPTIMISE_MORE_r({ 86 if (!data) { 87 Perl_re_indentf(aTHX_ "%s: NO DATA", 88 depth, 89 where); 90 return; 91 } 92 Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf, 93 depth, 94 where, 95 min, stopmin, delta, 96 (IV)data->pos_min, 97 (IV)data->pos_delta, 98 (UV)data->flags 99 ); 100 101 Perl_debug_show_study_flags(aTHX_ data->flags," [","]"); 102 103 Perl_re_printf( aTHX_ 104 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", 105 (IV)data->whilem_c, 106 (IV)(data->last_closep ? *((data)->last_closep) : -1), 107 is_inf ? "INF " : "" 108 ); 109 110 if (data->last_found) { 111 int i; 112 Perl_re_printf(aTHX_ 113 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf, 114 SvPVX_const(data->last_found), 115 (IV)data->last_end, 116 (IV)data->last_start_min, 117 (IV)data->last_start_max 118 ); 119 120 for (i = 0; i < 2; i++) { 121 Perl_re_printf(aTHX_ 122 " %s%s: '%s' @ %" IVdf "/%" IVdf, 123 data->cur_is_floating == i ? "*" : "", 124 i ? "Float" : "Fixed", 125 SvPVX_const(data->substrs[i].str), 126 (IV)data->substrs[i].min_offset, 127 (IV)data->substrs[i].max_offset 128 ); 129 Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]"); 130 } 131 } 132 133 Perl_re_printf( aTHX_ "\n"); 134 }); 135} 136 137 138void 139Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, 140 regnode *scan, U32 depth, U32 flags) 141{ 142 PERL_ARGS_ASSERT_DEBUG_PEEP; 143 DECLARE_AND_GET_RE_DEBUG_FLAGS; 144 145 DEBUG_OPTIMISE_r({ 146 regnode *Next; 147 148 if (!scan) 149 return; 150 Next = regnext(scan); 151 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); 152 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)", 153 depth, 154 str, 155 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv), 156 Next ? (REG_NODE_NUM(Next)) : 0 ); 157 Perl_debug_show_study_flags(aTHX_ flags," [ ","]"); 158 Perl_re_printf( aTHX_ "\n"); 159 }); 160} 161 162#endif /* DEBUGGING */ 163 164/* 165 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 166 */ 167#ifdef DEBUGGING 168 169static void 170S_regdump_intflags(pTHX_ const char *lead, const U32 flags) 171{ 172 int bit; 173 int set=0; 174 175 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); 176 177 for (bit=0; bit<=REG_INTFLAGS_NAME_SIZE; bit++) { 178 if (flags & (1<<bit)) { 179 if (!set++ && lead) 180 Perl_re_printf( aTHX_ "%s", lead); 181 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]); 182 } 183 } 184 if (lead) { 185 if (set) 186 Perl_re_printf( aTHX_ "\n"); 187 else 188 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 189 } 190} 191 192static void 193S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 194{ 195 int bit; 196 int set=0; 197 regex_charset cs; 198 199 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8); 200 201 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { 202 if (flags & (1U<<bit)) { 203 if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ 204 continue; 205 } 206 if (!set++ && lead) 207 Perl_re_printf( aTHX_ "%s", lead); 208 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]); 209 } 210 } 211 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { 212 if (!set++ && lead) { 213 Perl_re_printf( aTHX_ "%s", lead); 214 } 215 switch (cs) { 216 case REGEX_UNICODE_CHARSET: 217 Perl_re_printf( aTHX_ "UNICODE"); 218 break; 219 case REGEX_LOCALE_CHARSET: 220 Perl_re_printf( aTHX_ "LOCALE"); 221 break; 222 case REGEX_ASCII_RESTRICTED_CHARSET: 223 Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); 224 break; 225 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 226 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); 227 break; 228 default: 229 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); 230 break; 231 } 232 } 233 if (lead) { 234 if (set) 235 Perl_re_printf( aTHX_ "\n"); 236 else 237 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 238 } 239} 240#endif 241 242void 243Perl_regdump(pTHX_ const regexp *r) 244{ 245#ifdef DEBUGGING 246 int i; 247 SV * const sv = sv_newmortal(); 248 SV *dsv= sv_newmortal(); 249 RXi_GET_DECL(r, ri); 250 DECLARE_AND_GET_RE_DEBUG_FLAGS; 251 252 PERL_ARGS_ASSERT_REGDUMP; 253 254 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 255 256 /* Header fields of interest. */ 257 for (i = 0; i < 2; i++) { 258 if (r->substrs->data[i].substr) { 259 RE_PV_QUOTED_DECL(s, 0, dsv, 260 SvPVX_const(r->substrs->data[i].substr), 261 RE_SV_DUMPLEN(r->substrs->data[i].substr), 262 PL_dump_re_max_len); 263 Perl_re_printf( aTHX_ 264 "%s %s%s at %" IVdf "..%" UVuf " ", 265 i ? "floating" : "anchored", 266 s, 267 RE_SV_TAIL(r->substrs->data[i].substr), 268 (IV)r->substrs->data[i].min_offset, 269 (UV)r->substrs->data[i].max_offset); 270 } 271 else if (r->substrs->data[i].utf8_substr) { 272 RE_PV_QUOTED_DECL(s, 1, dsv, 273 SvPVX_const(r->substrs->data[i].utf8_substr), 274 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr), 275 30); 276 Perl_re_printf( aTHX_ 277 "%s utf8 %s%s at %" IVdf "..%" UVuf " ", 278 i ? "floating" : "anchored", 279 s, 280 RE_SV_TAIL(r->substrs->data[i].utf8_substr), 281 (IV)r->substrs->data[i].min_offset, 282 (UV)r->substrs->data[i].max_offset); 283 } 284 } 285 286 if (r->check_substr || r->check_utf8) 287 Perl_re_printf( aTHX_ 288 (const char *) 289 ( r->check_substr == r->substrs->data[1].substr 290 && r->check_utf8 == r->substrs->data[1].utf8_substr 291 ? "(checking floating" : "(checking anchored")); 292 if (r->intflags & PREGf_NOSCAN) 293 Perl_re_printf( aTHX_ " noscan"); 294 if (r->extflags & RXf_CHECK_ALL) 295 Perl_re_printf( aTHX_ " isall"); 296 if (r->check_substr || r->check_utf8) 297 Perl_re_printf( aTHX_ ") "); 298 299 if (ri->regstclass) { 300 regprop(r, sv, ri->regstclass, NULL, NULL); 301 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); 302 } 303 if (r->intflags & PREGf_ANCH) { 304 Perl_re_printf( aTHX_ "anchored"); 305 if (r->intflags & PREGf_ANCH_MBOL) 306 Perl_re_printf( aTHX_ "(MBOL)"); 307 if (r->intflags & PREGf_ANCH_SBOL) 308 Perl_re_printf( aTHX_ "(SBOL)"); 309 if (r->intflags & PREGf_ANCH_GPOS) 310 Perl_re_printf( aTHX_ "(GPOS)"); 311 Perl_re_printf( aTHX_ " "); 312 } 313 if (r->intflags & PREGf_GPOS_SEEN) 314 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); 315 if (r->intflags & PREGf_SKIP) 316 Perl_re_printf( aTHX_ "plus "); 317 if (r->intflags & PREGf_IMPLICIT) 318 Perl_re_printf( aTHX_ "implicit "); 319 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); 320 if (r->extflags & RXf_EVAL_SEEN) 321 Perl_re_printf( aTHX_ "with eval "); 322 Perl_re_printf( aTHX_ "\n"); 323 DEBUG_FLAGS_r({ 324 regdump_extflags("r->extflags: ", r->extflags); 325 regdump_intflags("r->intflags: ", r->intflags); 326 }); 327#else 328 PERL_ARGS_ASSERT_REGDUMP; 329 PERL_UNUSED_CONTEXT; 330 PERL_UNUSED_ARG(r); 331#endif /* DEBUGGING */ 332} 333 334/* Should be synchronized with ANYOF_ #defines in regcomp.h */ 335#ifdef DEBUGGING 336 337# if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \ 338 || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \ 339 || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \ 340 || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \ 341 || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \ 342 || CC_VERTSPACE_ != 15 343# error Need to adjust order of anyofs[] 344# endif 345static const char * const anyofs[] = { 346 "\\w", 347 "\\W", 348 "\\d", 349 "\\D", 350 "[:alpha:]", 351 "[:^alpha:]", 352 "[:lower:]", 353 "[:^lower:]", 354 "[:upper:]", 355 "[:^upper:]", 356 "[:punct:]", 357 "[:^punct:]", 358 "[:print:]", 359 "[:^print:]", 360 "[:alnum:]", 361 "[:^alnum:]", 362 "[:graph:]", 363 "[:^graph:]", 364 "[:cased:]", 365 "[:^cased:]", 366 "\\s", 367 "\\S", 368 "[:blank:]", 369 "[:^blank:]", 370 "[:xdigit:]", 371 "[:^xdigit:]", 372 "[:cntrl:]", 373 "[:^cntrl:]", 374 "[:ascii:]", 375 "[:^ascii:]", 376 "\\v", 377 "\\V" 378}; 379#endif 380 381/* 382- regprop - printable representation of opcode, with run time support 383*/ 384 385void 386Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) 387{ 388#ifdef DEBUGGING 389 U8 k; 390 const U8 op = OP(o); 391 RXi_GET_DECL(prog, progi); 392 DECLARE_AND_GET_RE_DEBUG_FLAGS; 393 394 PERL_ARGS_ASSERT_REGPROP; 395 396 SvPVCLEAR(sv); 397 398 if (op > REGNODE_MAX) { /* regnode.type is unsigned */ 399 if (pRExC_state) { /* This gives more info, if we have it */ 400 FAIL3("panic: corrupted regexp opcode %d > %d", 401 (int)op, (int)REGNODE_MAX); 402 } 403 else { 404 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d", 405 (int)op, (int)REGNODE_MAX); 406 } 407 } 408 sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */ 409 410 k = REGNODE_TYPE(op); 411 if (op == BRANCH) { 412 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o)); 413 } 414 else if (op == BRANCHJ) { 415 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o)); 416 } 417 else if (k == EXACT) { 418 sv_catpvs(sv, " "); 419 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 420 * is a crude hack but it may be the best for now since 421 * we have no flag "this EXACTish node was UTF-8" 422 * --jhi */ 423 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len, 424 PL_colors[0], PL_colors[1], 425 PERL_PV_ESCAPE_UNI_DETECT | 426 PERL_PV_ESCAPE_NONASCII | 427 PERL_PV_PRETTY_ELLIPSES | 428 PERL_PV_PRETTY_LTGT | 429 PERL_PV_PRETTY_NOCLEAR 430 ); 431 } else if (k == TRIE) { 432 /* print the details of the trie in dumpuntil instead, as 433 * progi->data isn't available here */ 434 const U32 n = ARG1u(o); 435 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 436 (reg_ac_data *)progi->data->data[n] : 437 NULL; 438 const reg_trie_data * const trie 439 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 440 441 Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o))); 442 DEBUG_TRIE_COMPILE_r({ 443 if (trie->jump) 444 sv_catpvs(sv, "(JUMP)"); 445 Perl_sv_catpvf(aTHX_ sv, 446 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">", 447 (UV)trie->startstate, 448 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 449 (UV)trie->wordcount, 450 (UV)trie->minlen, 451 (UV)trie->maxlen, 452 (UV)TRIE_CHARCOUNT(trie), 453 (UV)trie->uniquecharcount 454 ); 455 }); 456 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 457 sv_catpvs(sv, "["); 458 (void) put_charclass_bitmap_innards(sv, 459 ((IS_ANYOF_TRIE(op)) 460 ? ANYOF_BITMAP(o) 461 : TRIE_BITMAP(trie)), 462 NULL, 463 NULL, 464 NULL, 465 0, 466 FALSE 467 ); 468 sv_catpvs(sv, "]"); 469 } 470 if (trie->before_paren || trie->after_paren) 471 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", 472 (IV)trie->before_paren,(IV)trie->after_paren); 473 } else if (k == CURLY) { 474 U32 lo = ARG1i(o), hi = ARG2i(o); 475 if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */ 476 Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */ 477 if (op == CURLYM || op == CURLYN || op == CURLYX) 478 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */ 479 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); 480 if (hi == REG_INFTY) 481 sv_catpvs(sv, "INFTY"); 482 else 483 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); 484 sv_catpvs(sv, "}"); 485 } 486 else if (k == WHILEM && FLAGS(o)) /* Ordinal/of */ 487 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4); 488 else if (k == REF || k == OPEN || k == CLOSE 489 || k == GROUPP || op == ACCEPT) 490 { 491 AV *name_list= NULL; 492 U32 parno= (op == ACCEPT) ? ARG2u(o) : 493 (op == OPEN || op == CLOSE) ? PARNO(o) : 494 ARG1u(o); 495 if ( RXp_PAREN_NAMES(prog) ) { 496 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 497 } else if ( pRExC_state ) { 498 name_list= RExC_paren_name_list; 499 } 500 if ( name_list ) { 501 if ( k != REF || (op < REFN)) { 502 UV logical_parno = parno; 503 if (prog->parno_to_logical) 504 logical_parno = prog->parno_to_logical[parno]; 505 506 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ 507 if (parno != logical_parno) 508 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ 509 510 SV **name= av_fetch_simple(name_list, parno, 0 ); 511 if (name) 512 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 513 } 514 else 515 if (parno > 0) { 516 /* parno must always be larger than 0 for this block 517 * as it represents a slot into the data array, which 518 * has the 0 slot reserved for a placeholder so any valid 519 * index into it is always true, eg non-zero 520 * see the '%' "what" type and the implementation of 521 * S_reg_add_data() 522 */ 523 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); 524 I32 *nums=(I32*)SvPVX(sv_dat); 525 SV **name= av_fetch_simple(name_list, nums[0], 0 ); 526 I32 n; 527 if (name) { 528 for ( n=0; n<SvIVX(sv_dat); n++ ) { 529 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf, 530 (n ? "," : ""), (IV)nums[n]); 531 } 532 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 533 } 534 } 535 } else if (parno>0) { 536 UV logical_parno = parno; 537 if (prog->parno_to_logical) 538 logical_parno = prog->parno_to_logical[parno]; 539 540 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ 541 if (logical_parno != parno) 542 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ 543 544 } 545 if ( k == REF ) { 546 Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o)); 547 } 548 if ( k == REF && reginfo) { 549 U32 n = ARG1u(o); /* which paren pair */ 550 I32 ln = RXp_OFFS_START(prog,n); 551 if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1) 552 Perl_sv_catpvf(aTHX_ sv, ": FAIL"); 553 else if (ln == RXp_OFFS_END(prog,n)) 554 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); 555 else { 556 const char *s = reginfo->strbeg + ln; 557 Perl_sv_catpvf(aTHX_ sv, ": "); 558 Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0, 559 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); 560 } 561 } 562 } else if (k == GOSUB) { 563 AV *name_list= NULL; 564 IV parno = ARG1u(o); 565 IV logical_parno = (parno && prog->parno_to_logical) 566 ? prog->parno_to_logical[parno] 567 : parno; 568 if ( RXp_PAREN_NAMES(prog) ) { 569 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 570 } else if ( pRExC_state ) { 571 name_list= RExC_paren_name_list; 572 } 573 574 /* Paren and offset */ 575 Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno); 576 if (logical_parno != parno) 577 Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno); 578 579 Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o), 580 (int)((o + (int)ARG2i(o)) - progi->program) ); 581 if (name_list) { 582 SV **name= av_fetch_simple(name_list, ARG1u(o), 0 ); 583 if (name) 584 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 585 } 586 } 587 else if (k == LOGICAL) 588 /* 2: embedded, otherwise 1 */ 589 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); 590 else if (k == ANYOF || k == ANYOFH || k == ANYOFR) { 591 U8 flags; 592 char * bitmap; 593 U8 do_sep = 0; /* Do we need to separate various components of the 594 output? */ 595 /* Set if there is still an unresolved user-defined property */ 596 SV *unresolved = NULL; 597 598 /* Things that are ignored except when the runtime locale is UTF-8 */ 599 SV *only_utf8_locale_invlist = NULL; 600 601 /* Code points that don't fit in the bitmap */ 602 SV *nonbitmap_invlist = NULL; 603 604 /* And things that aren't in the bitmap, but are small enough to be */ 605 SV* bitmap_range_not_in_bitmap = NULL; 606 607 bool inverted; 608 609 if (k != ANYOF) { 610 flags = 0; 611 bitmap = NULL; 612 } 613 else { 614 flags = ANYOF_FLAGS(o); 615 bitmap = ANYOF_BITMAP(o); 616 } 617 618 if (op == ANYOFL || op == ANYOFPOSIXL) { 619 if ((flags & ANYOFL_UTF8_LOCALE_REQD)) { 620 sv_catpvs(sv, "{utf8-locale-reqd}"); 621 } 622 if (flags & ANYOFL_FOLD) { 623 sv_catpvs(sv, "{i}"); 624 } 625 } 626 627 inverted = flags & ANYOF_INVERT; 628 629 /* If there is stuff outside the bitmap, get it */ 630 if (k == ANYOFR) { 631 632 /* For a single range, split into the parts inside vs outside the 633 * bitmap. */ 634 UV start = ANYOFRbase(o); 635 UV end = ANYOFRbase(o) + ANYOFRdelta(o); 636 637 if (start < NUM_ANYOF_CODE_POINTS) { 638 if (end < NUM_ANYOF_CODE_POINTS) { 639 bitmap_range_not_in_bitmap 640 = _add_range_to_invlist(bitmap_range_not_in_bitmap, 641 start, end); 642 } 643 else { 644 bitmap_range_not_in_bitmap 645 = _add_range_to_invlist(bitmap_range_not_in_bitmap, 646 start, NUM_ANYOF_CODE_POINTS); 647 start = NUM_ANYOF_CODE_POINTS; 648 } 649 } 650 651 if (start >= NUM_ANYOF_CODE_POINTS) { 652 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 653 ANYOFRbase(o), 654 ANYOFRbase(o) + ANYOFRdelta(o)); 655 } 656 } 657 else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) { 658 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 659 NUM_ANYOF_CODE_POINTS, 660 UV_MAX); 661 } 662 else if (ANYOF_HAS_AUX(o)) { 663 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE, 664 &unresolved, 665 &only_utf8_locale_invlist, 666 &nonbitmap_invlist); 667 668 /* The aux data may contain stuff that could fit in the bitmap. 669 * This could come from a user-defined property being finally 670 * resolved when this call was done; or much more likely because 671 * there are matches that require UTF-8 to be valid, and so aren't 672 * in the bitmap (or ANYOFR). This is teased apart later */ 673 _invlist_intersection(nonbitmap_invlist, 674 PL_InBitmap, 675 &bitmap_range_not_in_bitmap); 676 /* Leave just the things that don't fit into the bitmap */ 677 _invlist_subtract(nonbitmap_invlist, 678 PL_InBitmap, 679 &nonbitmap_invlist); 680 } 681 682 /* Ready to start outputting. First, the initial left bracket */ 683 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 684 685 if ( bitmap 686 || bitmap_range_not_in_bitmap 687 || only_utf8_locale_invlist 688 || unresolved) 689 { 690 /* Then all the things that could fit in the bitmap */ 691 do_sep = put_charclass_bitmap_innards( 692 sv, 693 bitmap, 694 bitmap_range_not_in_bitmap, 695 only_utf8_locale_invlist, 696 o, 697 flags, 698 699 /* Can't try inverting for a 700 * better display if there 701 * are things that haven't 702 * been resolved */ 703 (unresolved != NULL || k == ANYOFR)); 704 SvREFCNT_dec(bitmap_range_not_in_bitmap); 705 706 /* If there are user-defined properties which haven't been defined 707 * yet, output them. If the result is not to be inverted, it is 708 * clearest to output them in a separate [] from the bitmap range 709 * stuff. If the result is to be complemented, we have to show 710 * everything in one [], as the inversion applies to the whole 711 * thing. Use {braces} to separate them from anything in the 712 * bitmap and anything above the bitmap. */ 713 if (unresolved) { 714 if (inverted) { 715 if (! do_sep) { /* If didn't output anything in the bitmap 716 */ 717 sv_catpvs(sv, "^"); 718 } 719 sv_catpvs(sv, "{"); 720 } 721 else if (do_sep) { 722 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], 723 PL_colors[0]); 724 } 725 sv_catsv(sv, unresolved); 726 if (inverted) { 727 sv_catpvs(sv, "}"); 728 } 729 do_sep = ! inverted; 730 } 731 else if ( do_sep == 2 732 && ! nonbitmap_invlist 733 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o)) 734 { 735 /* Here, the display shows the class as inverted, and 736 * everything above the lower display should also match, but 737 * there is no indication of that. Add this range so the code 738 * below will add it to the display */ 739 _invlist_union_complement_2nd(nonbitmap_invlist, 740 PL_InBitmap, 741 &nonbitmap_invlist); 742 } 743 } 744 745 /* And, finally, add the above-the-bitmap stuff */ 746 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { 747 SV* contents; 748 749 /* See if truncation size is overridden */ 750 const STRLEN dump_len = (PL_dump_re_max_len > 256) 751 ? PL_dump_re_max_len 752 : 256; 753 754 /* This is output in a separate [] */ 755 if (do_sep) { 756 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); 757 } 758 759 /* And, for easy of understanding, it is shown in the 760 * uncomplemented form if possible. The one exception being if 761 * there are unresolved items, where the inversion has to be 762 * delayed until runtime */ 763 if (inverted && ! unresolved) { 764 _invlist_invert(nonbitmap_invlist); 765 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); 766 } 767 768 contents = invlist_contents(nonbitmap_invlist, 769 FALSE /* output suitable for catsv */ 770 ); 771 772 /* If the output is shorter than the permissible maximum, just do it. */ 773 if (SvCUR(contents) <= dump_len) { 774 sv_catsv(sv, contents); 775 } 776 else { 777 const char * contents_string = SvPVX(contents); 778 STRLEN i = dump_len; 779 780 /* Otherwise, start at the permissible max and work back to the 781 * first break possibility */ 782 while (i > 0 && contents_string[i] != ' ') { 783 i--; 784 } 785 if (i == 0) { /* Fail-safe. Use the max if we couldn't 786 find a legal break */ 787 i = dump_len; 788 } 789 790 sv_catpvn(sv, contents_string, i); 791 sv_catpvs(sv, "..."); 792 } 793 794 SvREFCNT_dec_NN(contents); 795 SvREFCNT_dec_NN(nonbitmap_invlist); 796 } 797 798 /* And finally the matching, closing ']' */ 799 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 800 801 if (op == ANYOFHs) { 802 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1)); 803 } 804 else if (REGNODE_TYPE(op) != ANYOF) { 805 U8 lowest = (op != ANYOFHr) 806 ? FLAGS(o) 807 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o)); 808 U8 highest = (op == ANYOFHr) 809 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)) 810 : (op == ANYOFH || op == ANYOFR) 811 ? 0xFF 812 : lowest; 813#ifndef EBCDIC 814 if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o))) 815#endif 816 { 817 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); 818 if (lowest != highest) { 819 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); 820 } 821 Perl_sv_catpvf(aTHX_ sv, ")"); 822 } 823 } 824 825 SvREFCNT_dec(unresolved); 826 } 827 else if (k == ANYOFM) { 828 SV * cp_list = get_ANYOFM_contents(o); 829 830 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 831 if (op == NANYOFM) { 832 _invlist_invert(cp_list); 833 } 834 835 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE); 836 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 837 838 SvREFCNT_dec(cp_list); 839 } 840 else if (k == ANYOFHbbm) { 841 SV * cp_list = get_ANYOFHbbm_contents(o); 842 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 843 844 sv_catsv(sv, invlist_contents(cp_list, 845 FALSE /* output suitable for catsv */ 846 )); 847 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 848 849 SvREFCNT_dec(cp_list); 850 } 851 else if (k == POSIXD || k == NPOSIXD) { 852 U8 index = FLAGS(o) * 2; 853 if (index < C_ARRAY_LENGTH(anyofs)) { 854 if (*anyofs[index] != '[') { 855 sv_catpvs(sv, "["); 856 } 857 sv_catpv(sv, anyofs[index]); 858 if (*anyofs[index] != '[') { 859 sv_catpvs(sv, "]"); 860 } 861 } 862 else { 863 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); 864 } 865 } 866 else if (k == BOUND || k == NBOUND) { 867 /* Must be synced with order of 'bound_type' in regcomp.h */ 868 const char * const bounds[] = { 869 "", /* Traditional */ 870 "{gcb}", 871 "{lb}", 872 "{sb}", 873 "{wb}" 874 }; 875 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); 876 sv_catpv(sv, bounds[FLAGS(o)]); 877 } 878 else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) { 879 Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o))); 880 if (NEXT_OFF(o)) { 881 Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o)); 882 } 883 Perl_sv_catpvf(aTHX_ sv, "]"); 884 } 885 else if (op == SBOL) 886 Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^"); 887 else if (op == EVAL) { 888 if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG) 889 Perl_sv_catpvf(aTHX_ sv, " optimistic"); 890 } 891 892 /* add on the verb argument if there is one */ 893 if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) { 894 if ( ARG1u(o) ) 895 Perl_sv_catpvf(aTHX_ sv, ":%" SVf, 896 SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ])))); 897 else 898 sv_catpvs(sv, ":NULL"); 899 } 900#else 901 PERL_UNUSED_CONTEXT; 902 PERL_UNUSED_ARG(sv); 903 PERL_UNUSED_ARG(o); 904 PERL_UNUSED_ARG(prog); 905 PERL_UNUSED_ARG(reginfo); 906 PERL_UNUSED_ARG(pRExC_state); 907#endif /* DEBUGGING */ 908} 909 910#ifdef DEBUGGING 911 912STATIC void 913S_put_code_point(pTHX_ SV *sv, UV c) 914{ 915 PERL_ARGS_ASSERT_PUT_CODE_POINT; 916 917 if (c > 255) { 918 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); 919 } 920 else if (isPRINT(c)) { 921 const char string = (char) c; 922 923 /* We use {phrase} as metanotation in the class, so also escape literal 924 * braces */ 925 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') 926 sv_catpvs(sv, "\\"); 927 sv_catpvn(sv, &string, 1); 928 } 929 else if (isMNEMONIC_CNTRL(c)) { 930 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); 931 } 932 else { 933 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); 934 } 935} 936 937STATIC void 938S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) 939{ 940 /* Appends to 'sv' a displayable version of the range of code points from 941 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls 942 * that have them, when they occur at the beginning or end of the range. 943 * It uses hex to output the remaining code points, unless 'allow_literals' 944 * is true, in which case the printable ASCII ones are output as-is (though 945 * some of these will be escaped by put_code_point()). 946 * 947 * NOTE: This is designed only for printing ranges of code points that fit 948 * inside an ANYOF bitmap. Higher code points are simply suppressed 949 */ 950 951 const unsigned int min_range_count = 3; 952 953 assert(start <= end); 954 955 PERL_ARGS_ASSERT_PUT_RANGE; 956 957 while (start <= end) { 958 UV this_end; 959 const char * format; 960 961 if ( end - start < min_range_count 962 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end)))) 963 { 964 /* Output a range of 1 or 2 chars individually, or longer ranges 965 * when printable */ 966 for (; start <= end; start++) { 967 put_code_point(sv, start); 968 } 969 break; 970 } 971 972 /* If permitted by the input options, and there is a possibility that 973 * this range contains a printable literal, look to see if there is 974 * one. */ 975 if (allow_literals && start <= MAX_PRINT_A) { 976 977 /* If the character at the beginning of the range isn't an ASCII 978 * printable, effectively split the range into two parts: 979 * 1) the portion before the first such printable, 980 * 2) the rest 981 * and output them separately. */ 982 if (! isPRINT_A(start)) { 983 UV temp_end = start + 1; 984 985 /* There is no point looking beyond the final possible 986 * printable, in MAX_PRINT_A */ 987 UV max = MIN(end, MAX_PRINT_A); 988 989 while (temp_end <= max && ! isPRINT_A(temp_end)) { 990 temp_end++; 991 } 992 993 /* Here, temp_end points to one beyond the first printable if 994 * found, or to one beyond 'max' if not. If none found, make 995 * sure that we use the entire range */ 996 if (temp_end > MAX_PRINT_A) { 997 temp_end = end + 1; 998 } 999 1000 /* Output the first part of the split range: the part that 1001 * doesn't have printables, with the parameter set to not look 1002 * for literals (otherwise we would infinitely recurse) */ 1003 put_range(sv, start, temp_end - 1, FALSE); 1004 1005 /* The 2nd part of the range (if any) starts here. */ 1006 start = temp_end; 1007 1008 /* We do a continue, instead of dropping down, because even if 1009 * the 2nd part is non-empty, it could be so short that we want 1010 * to output it as individual characters, as tested for at the 1011 * top of this loop. */ 1012 continue; 1013 } 1014 1015 /* Here, 'start' is a printable ASCII. If it is an alphanumeric, 1016 * output a sub-range of just the digits or letters, then process 1017 * the remaining portion as usual. */ 1018 if (isALPHANUMERIC_A(start)) { 1019 UV mask = (isDIGIT_A(start)) 1020 ? CC_DIGIT_ 1021 : isUPPER_A(start) 1022 ? CC_UPPER_ 1023 : CC_LOWER_; 1024 UV temp_end = start + 1; 1025 1026 /* Find the end of the sub-range that includes just the 1027 * characters in the same class as the first character in it */ 1028 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) { 1029 temp_end++; 1030 } 1031 temp_end--; 1032 1033 /* For short ranges, don't duplicate the code above to output 1034 * them; just call recursively */ 1035 if (temp_end - start < min_range_count) { 1036 put_range(sv, start, temp_end, FALSE); 1037 } 1038 else { /* Output as a range */ 1039 put_code_point(sv, start); 1040 sv_catpvs(sv, "-"); 1041 put_code_point(sv, temp_end); 1042 } 1043 start = temp_end + 1; 1044 continue; 1045 } 1046 1047 /* We output any other printables as individual characters */ 1048 if (isPUNCT_A(start) || isSPACE_A(start)) { 1049 while (start <= end && (isPUNCT_A(start) 1050 || isSPACE_A(start))) 1051 { 1052 put_code_point(sv, start); 1053 start++; 1054 } 1055 continue; 1056 } 1057 } /* End of looking for literals */ 1058 1059 /* Here is not to output as a literal. Some control characters have 1060 * mnemonic names. Split off any of those at the beginning and end of 1061 * the range to print mnemonically. It isn't possible for many of 1062 * these to be in a row, so this won't overwhelm with output */ 1063 if ( start <= end 1064 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) 1065 { 1066 while (isMNEMONIC_CNTRL(start) && start <= end) { 1067 put_code_point(sv, start); 1068 start++; 1069 } 1070 1071 /* If this didn't take care of the whole range ... */ 1072 if (start <= end) { 1073 1074 /* Look backwards from the end to find the final non-mnemonic 1075 * */ 1076 UV temp_end = end; 1077 while (isMNEMONIC_CNTRL(temp_end)) { 1078 temp_end--; 1079 } 1080 1081 /* And separately output the interior range that doesn't start 1082 * or end with mnemonics */ 1083 put_range(sv, start, temp_end, FALSE); 1084 1085 /* Then output the mnemonic trailing controls */ 1086 start = temp_end + 1; 1087 while (start <= end) { 1088 put_code_point(sv, start); 1089 start++; 1090 } 1091 break; 1092 } 1093 } 1094 1095 /* As a final resort, output the range or subrange as hex. */ 1096 1097 if (start >= NUM_ANYOF_CODE_POINTS) { 1098 this_end = end; 1099 } 1100 else { /* Have to split range at the bitmap boundary */ 1101 this_end = (end < NUM_ANYOF_CODE_POINTS) 1102 ? end 1103 : NUM_ANYOF_CODE_POINTS - 1; 1104 } 1105#if NUM_ANYOF_CODE_POINTS > 256 1106 format = (this_end < 256) 1107 ? "\\x%02" UVXf "-\\x%02" UVXf 1108 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; 1109#else 1110 format = "\\x%02" UVXf "-\\x%02" UVXf; 1111#endif 1112 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); 1113 Perl_sv_catpvf(aTHX_ sv, format, start, this_end); 1114 GCC_DIAG_RESTORE_STMT; 1115 break; 1116 } 1117} 1118 1119STATIC void 1120S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) 1121{ 1122 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list 1123 * 'invlist' */ 1124 1125 UV start, end; 1126 bool allow_literals = TRUE; 1127 1128 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; 1129 1130 /* Generally, it is more readable if printable characters are output as 1131 * literals, but if a range (nearly) spans all of them, it's best to output 1132 * it as a single range. This code will use a single range if all but 2 1133 * ASCII printables are in it */ 1134 invlist_iterinit(invlist); 1135 while (invlist_iternext(invlist, &start, &end)) { 1136 1137 /* If the range starts beyond the final printable, it doesn't have any 1138 * in it */ 1139 if (start > MAX_PRINT_A) { 1140 break; 1141 } 1142 1143 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span 1144 * all but two, the range must start and end no later than 2 from 1145 * either end */ 1146 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { 1147 if (end > MAX_PRINT_A) { 1148 end = MAX_PRINT_A; 1149 } 1150 if (start < ' ') { 1151 start = ' '; 1152 } 1153 if (end - start >= MAX_PRINT_A - ' ' - 2) { 1154 allow_literals = FALSE; 1155 } 1156 break; 1157 } 1158 } 1159 invlist_iterfinish(invlist); 1160 1161 /* Here we have figured things out. Output each range */ 1162 invlist_iterinit(invlist); 1163 while (invlist_iternext(invlist, &start, &end)) { 1164 if (start >= NUM_ANYOF_CODE_POINTS) { 1165 break; 1166 } 1167 put_range(sv, start, end, allow_literals); 1168 } 1169 invlist_iterfinish(invlist); 1170 1171 return; 1172} 1173 1174STATIC SV* 1175S_put_charclass_bitmap_innards_common(pTHX_ 1176 SV* invlist, /* The bitmap */ 1177 SV* posixes, /* Under /l, things like [:word:], \S */ 1178 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ 1179 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ 1180 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ 1181 const bool invert /* Is the result to be inverted? */ 1182) 1183{ 1184 /* Create and return an SV containing a displayable version of the bitmap 1185 * and associated information determined by the input parameters. If the 1186 * output would have been only the inversion indicator '^', NULL is instead 1187 * returned. */ 1188 1189 SV * output; 1190 1191 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; 1192 1193 if (invert) { 1194 output = newSVpvs("^"); 1195 } 1196 else { 1197 output = newSVpvs(""); 1198 } 1199 1200 /* First, the code points in the bitmap that are unconditionally there */ 1201 put_charclass_bitmap_innards_invlist(output, invlist); 1202 1203 /* Traditionally, these have been placed after the main code points */ 1204 if (posixes) { 1205 sv_catsv(output, posixes); 1206 } 1207 1208 if (only_utf8 && _invlist_len(only_utf8)) { 1209 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); 1210 put_charclass_bitmap_innards_invlist(output, only_utf8); 1211 } 1212 1213 if (not_utf8 && _invlist_len(not_utf8)) { 1214 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); 1215 put_charclass_bitmap_innards_invlist(output, not_utf8); 1216 } 1217 1218 if (only_utf8_locale && _invlist_len(only_utf8_locale)) { 1219 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); 1220 put_charclass_bitmap_innards_invlist(output, only_utf8_locale); 1221 1222 /* This is the only list in this routine that can legally contain code 1223 * points outside the bitmap range. The call just above to 1224 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so 1225 * output them here. There's about a half-dozen possible, and none in 1226 * contiguous ranges longer than 2 */ 1227 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 1228 UV start, end; 1229 SV* above_bitmap = NULL; 1230 1231 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); 1232 1233 invlist_iterinit(above_bitmap); 1234 while (invlist_iternext(above_bitmap, &start, &end)) { 1235 UV i; 1236 1237 for (i = start; i <= end; i++) { 1238 put_code_point(output, i); 1239 } 1240 } 1241 invlist_iterfinish(above_bitmap); 1242 SvREFCNT_dec_NN(above_bitmap); 1243 } 1244 } 1245 1246 if (invert && SvCUR(output) == 1) { 1247 return NULL; 1248 } 1249 1250 return output; 1251} 1252 1253STATIC U8 1254S_put_charclass_bitmap_innards(pTHX_ SV *sv, 1255 char *bitmap, 1256 SV *nonbitmap_invlist, 1257 SV *only_utf8_locale_invlist, 1258 const regnode * const node, 1259 const U8 flags, 1260 const bool force_as_is_display) 1261{ 1262 /* Appends to 'sv' a displayable version of the innards of the bracketed 1263 * character class defined by the other arguments: 1264 * 'bitmap' points to the bitmap, or NULL if to ignore that. 1265 * 'nonbitmap_invlist' is an inversion list of the code points that are in 1266 * the bitmap range, but for some reason aren't in the bitmap; NULL if 1267 * none. The reasons for this could be that they require some 1268 * condition such as the target string being or not being in UTF-8 1269 * (under /d), or because they came from a user-defined property that 1270 * was not resolved at the time of the regex compilation (under /u) 1271 * 'only_utf8_locale_invlist' is an inversion list of the code points that 1272 * are valid only if the runtime locale is a UTF-8 one; NULL if none 1273 * 'node' is the regex pattern ANYOF node. It is needed only when the 1274 * above two parameters are not null, and is passed so that this 1275 * routine can tease apart the various reasons for them. 1276 * 'flags' is the flags field of 'node' 1277 * 'force_as_is_display' is TRUE if this routine should definitely NOT try 1278 * to invert things to see if that leads to a cleaner display. If 1279 * FALSE, this routine is free to use its judgment about doing this. 1280 * 1281 * It returns 0 if nothing was actually output. (It may be that 1282 * the bitmap, etc is empty.) 1283 * 1 if the output wasn't inverted (didn't begin with a '^') 1284 * 2 if the output was inverted (did begin with a '^') 1285 * 1286 * When called for outputting the bitmap of a non-ANYOF node, just pass the 1287 * bitmap, with the succeeding parameters set to NULL, and the final one to 1288 * FALSE. 1289 */ 1290 1291 /* In general, it tries to display the 'cleanest' representation of the 1292 * innards, choosing whether to display them inverted or not, regardless of 1293 * whether the class itself is to be inverted. However, there are some 1294 * cases where it can't try inverting, as what actually matches isn't known 1295 * until runtime, and hence the inversion isn't either. */ 1296 1297 bool inverting_allowed = ! force_as_is_display; 1298 1299 int i; 1300 STRLEN orig_sv_cur = SvCUR(sv); 1301 1302 SV* invlist; /* Inversion list we accumulate of code points that 1303 are unconditionally matched */ 1304 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is 1305 UTF-8 */ 1306 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 1307 */ 1308 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ 1309 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale 1310 is UTF-8 */ 1311 1312 SV* as_is_display; /* The output string when we take the inputs 1313 literally */ 1314 SV* inverted_display; /* The output string when we invert the inputs */ 1315 1316 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted 1317 to match? */ 1318 /* We are biased in favor of displaying things without them being inverted, 1319 * as that is generally easier to understand */ 1320 const int bias = 5; 1321 1322 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; 1323 1324 /* Start off with whatever code points are passed in. (We clone, so we 1325 * don't change the caller's list) */ 1326 if (nonbitmap_invlist) { 1327 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); 1328 invlist = invlist_clone(nonbitmap_invlist, NULL); 1329 } 1330 else { /* Worst case size is every other code point is matched */ 1331 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); 1332 } 1333 1334 if (flags) { 1335 if (OP(node) == ANYOFD) { 1336 1337 /* This flag indicates that the code points below 0x100 in the 1338 * nonbitmap list are precisely the ones that match only when the 1339 * target is UTF-8 (they should all be non-ASCII). */ 1340 if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) { 1341 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); 1342 _invlist_subtract(invlist, only_utf8, &invlist); 1343 } 1344 1345 /* And this flag for matching all non-ASCII 0xFF and below */ 1346 if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) { 1347 not_utf8 = invlist_clone(PL_UpperLatin1, NULL); 1348 } 1349 } 1350 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) { 1351 1352 /* If either of these flags are set, what matches isn't 1353 * determinable except during execution, so don't know enough here 1354 * to invert */ 1355 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { 1356 inverting_allowed = FALSE; 1357 } 1358 1359 /* What the posix classes match also varies at runtime, so these 1360 * will be output symbolically. */ 1361 if (ANYOF_POSIXL_TEST_ANY_SET(node)) { 1362 int i; 1363 1364 posixes = newSVpvs(""); 1365 for (i = 0; i < ANYOF_POSIXL_MAX; i++) { 1366 if (ANYOF_POSIXL_TEST(node, i)) { 1367 sv_catpv(posixes, anyofs[i]); 1368 } 1369 } 1370 } 1371 } 1372 } 1373 1374 /* Accumulate the bit map into the unconditional match list */ 1375 if (bitmap) { 1376 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 1377 if (BITMAP_TEST(bitmap, i)) { 1378 int start = i++; 1379 for (; 1380 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); 1381 i++) 1382 { /* empty */ } 1383 invlist = _add_range_to_invlist(invlist, start, i-1); 1384 } 1385 } 1386 } 1387 1388 /* Make sure that the conditional match lists don't have anything in them 1389 * that match unconditionally; otherwise the output is quite confusing. 1390 * This could happen if the code that populates these misses some 1391 * duplication. */ 1392 if (only_utf8) { 1393 _invlist_subtract(only_utf8, invlist, &only_utf8); 1394 } 1395 if (not_utf8) { 1396 _invlist_subtract(not_utf8, invlist, ¬_utf8); 1397 } 1398 1399 if (only_utf8_locale_invlist) { 1400 1401 /* Since this list is passed in, we have to make a copy before 1402 * modifying it */ 1403 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL); 1404 1405 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); 1406 1407 /* And, it can get really weird for us to try outputting an inverted 1408 * form of this list when it has things above the bitmap, so don't even 1409 * try */ 1410 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 1411 inverting_allowed = FALSE; 1412 } 1413 } 1414 1415 /* Calculate what the output would be if we take the input as-is */ 1416 as_is_display = put_charclass_bitmap_innards_common(invlist, 1417 posixes, 1418 only_utf8, 1419 not_utf8, 1420 only_utf8_locale, 1421 invert); 1422 1423 /* If have to take the output as-is, just do that */ 1424 if (! inverting_allowed) { 1425 if (as_is_display) { 1426 sv_catsv(sv, as_is_display); 1427 SvREFCNT_dec_NN(as_is_display); 1428 } 1429 } 1430 else { /* But otherwise, create the output again on the inverted input, and 1431 use whichever version is shorter */ 1432 1433 int inverted_bias, as_is_bias; 1434 1435 /* We will apply our bias to whichever of the results doesn't have 1436 * the '^' */ 1437 bool trial_invert; 1438 if (invert) { 1439 trial_invert = FALSE; 1440 as_is_bias = bias; 1441 inverted_bias = 0; 1442 } 1443 else { 1444 trial_invert = TRUE; 1445 as_is_bias = 0; 1446 inverted_bias = bias; 1447 } 1448 1449 /* Now invert each of the lists that contribute to the output, 1450 * excluding from the result things outside the possible range */ 1451 1452 /* For the unconditional inversion list, we have to add in all the 1453 * conditional code points, so that when inverted, they will be gone 1454 * from it */ 1455 _invlist_union(only_utf8, invlist, &invlist); 1456 _invlist_union(not_utf8, invlist, &invlist); 1457 _invlist_union(only_utf8_locale, invlist, &invlist); 1458 _invlist_invert(invlist); 1459 _invlist_intersection(invlist, PL_InBitmap, &invlist); 1460 1461 if (only_utf8) { 1462 _invlist_invert(only_utf8); 1463 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); 1464 } 1465 else if (not_utf8) { 1466 1467 /* If a code point matches iff the target string is not in UTF-8, 1468 * then complementing the result has it not match iff not in UTF-8, 1469 * which is the same thing as matching iff it is UTF-8. */ 1470 only_utf8 = not_utf8; 1471 not_utf8 = NULL; 1472 } 1473 1474 if (only_utf8_locale) { 1475 _invlist_invert(only_utf8_locale); 1476 _invlist_intersection(only_utf8_locale, 1477 PL_InBitmap, 1478 &only_utf8_locale); 1479 } 1480 1481 inverted_display = put_charclass_bitmap_innards_common( 1482 invlist, 1483 posixes, 1484 only_utf8, 1485 not_utf8, 1486 only_utf8_locale, trial_invert); 1487 1488 /* Use the shortest representation, taking into account our bias 1489 * against showing it inverted */ 1490 if ( inverted_display 1491 && ( ! as_is_display 1492 || ( SvCUR(inverted_display) + inverted_bias 1493 < SvCUR(as_is_display) + as_is_bias))) 1494 { 1495 sv_catsv(sv, inverted_display); 1496 invert = ! invert; 1497 } 1498 else if (as_is_display) { 1499 sv_catsv(sv, as_is_display); 1500 } 1501 1502 SvREFCNT_dec(as_is_display); 1503 SvREFCNT_dec(inverted_display); 1504 } 1505 1506 SvREFCNT_dec_NN(invlist); 1507 SvREFCNT_dec(only_utf8); 1508 SvREFCNT_dec(not_utf8); 1509 SvREFCNT_dec(posixes); 1510 SvREFCNT_dec(only_utf8_locale); 1511 1512 U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur); 1513 if (did_output_something) { 1514 /* Distinguish between non and inverted cases */ 1515 did_output_something += invert; 1516 } 1517 1518 return did_output_something; 1519} 1520 1521 1522const regnode * 1523Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 1524 const regnode *last, const regnode *plast, 1525 SV* sv, I32 indent, U32 depth) 1526{ 1527 const regnode *next; 1528 const regnode *optstart= NULL; 1529 1530 RXi_GET_DECL(r, ri); 1531 DECLARE_AND_GET_RE_DEBUG_FLAGS; 1532 1533 PERL_ARGS_ASSERT_DUMPUNTIL; 1534 1535#ifdef DEBUG_DUMPUNTIL 1536 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, 1537 last ? last-start : 0, plast ? plast-start : 0); 1538#endif 1539 1540 if (plast && plast < last) 1541 last= plast; 1542 1543 while (node && (!last || node < last)) { 1544 const U8 op = OP(node); 1545 1546 if (op == CLOSE || op == SRCLOSE || op == WHILEM) 1547 indent--; 1548 next = regnext((regnode *)node); 1549 const regnode *after = regnode_after((regnode *)node,0); 1550 1551 /* Where, what. */ 1552 if (op == OPTIMIZED) { 1553 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 1554 optstart = node; 1555 else 1556 goto after_print; 1557 } else 1558 CLEAR_OPTSTART; 1559 1560 regprop(r, sv, node, NULL, NULL); 1561 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), 1562 (int)(2*indent + 1), "", SvPVX_const(sv)); 1563 1564 if (op != OPTIMIZED) { 1565 if (next == NULL) /* Next ptr. */ 1566 Perl_re_printf( aTHX_ " (0)"); 1567 else if (REGNODE_TYPE(op) == BRANCH 1568 && REGNODE_TYPE(OP(next)) != BRANCH ) 1569 Perl_re_printf( aTHX_ " (FAIL)"); 1570 else 1571 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); 1572 Perl_re_printf( aTHX_ "\n"); 1573 } 1574 1575 after_print: 1576 if (REGNODE_TYPE(op) == BRANCHJ) { 1577 assert(next); 1578 const regnode *nnode = (OP(next) == LONGJMP 1579 ? regnext((regnode *)next) 1580 : next); 1581 if (last && nnode > last) 1582 nnode = last; 1583 DUMPUNTIL(after, nnode); 1584 } 1585 else if (REGNODE_TYPE(op) == BRANCH) { 1586 assert(next); 1587 DUMPUNTIL(after, next); 1588 } 1589 else if ( REGNODE_TYPE(op) == TRIE ) { 1590 const regnode *this_trie = node; 1591 const U32 n = ARG1u(node); 1592 const reg_ac_data * const ac = op>=AHOCORASICK ? 1593 (reg_ac_data *)ri->data->data[n] : 1594 NULL; 1595 const reg_trie_data * const trie = 1596 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 1597#ifdef DEBUGGING 1598 AV *const trie_words 1599 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 1600#endif 1601 const regnode *nextbranch= NULL; 1602 I32 word_idx; 1603 SvPVCLEAR(sv); 1604 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 1605 SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); 1606 1607 Perl_re_indentf( aTHX_ "%s ", 1608 indent+3, 1609 elem_ptr 1610 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), 1611 SvCUR(*elem_ptr), PL_dump_re_max_len, 1612 PL_colors[0], PL_colors[1], 1613 (SvUTF8(*elem_ptr) 1614 ? PERL_PV_ESCAPE_UNI 1615 : 0) 1616 | PERL_PV_PRETTY_ELLIPSES 1617 | PERL_PV_PRETTY_LTGT 1618 ) 1619 : "???" 1620 ); 1621 if (trie->jump) { 1622 U16 dist= trie->jump[word_idx+1]; 1623 Perl_re_printf( aTHX_ "(%" UVuf ")\n", 1624 (UV)((dist ? this_trie + dist : next) - start)); 1625 if (dist) { 1626 if (!nextbranch) 1627 nextbranch= this_trie + trie->jump[0]; 1628 DUMPUNTIL(this_trie + dist, nextbranch); 1629 } 1630 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) 1631 nextbranch= regnext((regnode *)nextbranch); 1632 } else { 1633 Perl_re_printf( aTHX_ "\n"); 1634 } 1635 } 1636 if (last && next > last) 1637 node= last; 1638 else 1639 node= next; 1640 } 1641 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 1642 DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ 1643 } 1644 else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { 1645 assert(next); 1646 DUMPUNTIL(after, next); 1647 } 1648 else if ( op == PLUS || op == STAR) { 1649 DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ 1650 } 1651 else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { 1652 /* Literal string, where present. */ 1653 node = (const regnode *)REGNODE_AFTER_varies(node); 1654 } 1655 else { 1656 node = REGNODE_AFTER_opcode(node,op); 1657 } 1658 if (op == CURLYX || op == OPEN || op == SROPEN) 1659 indent++; 1660 if (REGNODE_TYPE(op) == END) 1661 break; 1662 } 1663 CLEAR_OPTSTART; 1664#ifdef DEBUG_DUMPUNTIL 1665 Perl_re_printf( aTHX_ "--- %d\n", (int)indent); 1666#endif 1667 return node; 1668} 1669 1670#endif /* DEBUGGING */ 1671