1#ifdef PERL_EXT_RE_BUILD
2#include "re_top.h"
3#endif
4
5#include "EXTERN.h"
6#define PERL_IN_REGEX_ENGINE
7#define PERL_IN_REGCOMP_ANY
8#define PERL_IN_REGCOMP_DEBUG_C
9#include "perl.h"
10
11#ifdef PERL_IN_XSUB_RE
12#  include "re_comp.h"
13#else
14#  include "regcomp.h"
15#endif
16
17#include "invlist_inline.h"
18#include "unicode_constants.h"
19#include "regcomp_internal.h"
20
21#ifdef DEBUGGING
22
23int
24Perl_re_printf(pTHX_ const char *fmt, ...)
25{
26    va_list ap;
27    int result;
28    PerlIO *f= Perl_debug_log;
29    PERL_ARGS_ASSERT_RE_PRINTF;
30    va_start(ap, fmt);
31    result = PerlIO_vprintf(f, fmt, ap);
32    va_end(ap);
33    return result;
34}
35
36int
37Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
38{
39    va_list ap;
40    int result;
41    PerlIO *f= Perl_debug_log;
42    PERL_ARGS_ASSERT_RE_INDENTF;
43    va_start(ap, depth);
44    PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
45    result = PerlIO_vprintf(f, fmt, ap);
46    va_end(ap);
47    return result;
48}
49
50void
51Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
52                                    const char *close_str)
53{
54    PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS;
55    if (!flags)
56        return;
57
58    Perl_re_printf( aTHX_  "%s", open_str);
59    DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
60    DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
61    DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
62    DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
63    DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
64    DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
65    DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
66    DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
67    DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
68    DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
69    DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
70    DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
71    DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
72    DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
73    DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
74    Perl_re_printf( aTHX_  "%s", close_str);
75}
76
77void
78Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data,
79                    U32 depth, int is_inf,
80                    SSize_t min, SSize_t stopmin, SSize_t delta)
81{
82    PERL_ARGS_ASSERT_DEBUG_STUDYDATA;
83    DECLARE_AND_GET_RE_DEBUG_FLAGS;
84
85    DEBUG_OPTIMISE_MORE_r({
86        if (!data) {
87            Perl_re_indentf(aTHX_  "%s: NO DATA",
88                depth,
89                where);
90            return;
91        }
92        Perl_re_indentf(aTHX_  "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
93            depth,
94            where,
95            min, stopmin, delta,
96            (IV)data->pos_min,
97            (IV)data->pos_delta,
98            (UV)data->flags
99        );
100
101        Perl_debug_show_study_flags(aTHX_ data->flags," [","]");
102
103        Perl_re_printf( aTHX_
104            " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
105            (IV)data->whilem_c,
106            (IV)(data->last_closep ? *((data)->last_closep) : -1),
107            is_inf ? "INF " : ""
108        );
109
110        if (data->last_found) {
111            int i;
112            Perl_re_printf(aTHX_
113                "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
114                    SvPVX_const(data->last_found),
115                    (IV)data->last_end,
116                    (IV)data->last_start_min,
117                    (IV)data->last_start_max
118            );
119
120            for (i = 0; i < 2; i++) {
121                Perl_re_printf(aTHX_
122                    " %s%s: '%s' @ %" IVdf "/%" IVdf,
123                    data->cur_is_floating == i ? "*" : "",
124                    i ? "Float" : "Fixed",
125                    SvPVX_const(data->substrs[i].str),
126                    (IV)data->substrs[i].min_offset,
127                    (IV)data->substrs[i].max_offset
128                );
129                Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
130            }
131        }
132
133        Perl_re_printf( aTHX_ "\n");
134    });
135}
136
137
138void
139Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
140                regnode *scan, U32 depth, U32 flags)
141{
142    PERL_ARGS_ASSERT_DEBUG_PEEP;
143    DECLARE_AND_GET_RE_DEBUG_FLAGS;
144
145    DEBUG_OPTIMISE_r({
146        regnode *Next;
147
148        if (!scan)
149            return;
150        Next = regnext(scan);
151        regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
152        Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
153            depth,
154            str,
155            REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
156            Next ? (REG_NODE_NUM(Next)) : 0 );
157        Perl_debug_show_study_flags(aTHX_ flags," [ ","]");
158        Perl_re_printf( aTHX_  "\n");
159   });
160}
161
162#endif /* DEBUGGING */
163
164/*
165 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
166 */
167#ifdef DEBUGGING
168
169static void
170S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
171{
172    int bit;
173    int set=0;
174
175    ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
176
177    for (bit=0; bit<=REG_INTFLAGS_NAME_SIZE; bit++) {
178        if (flags & (1<<bit)) {
179            if (!set++ && lead)
180                Perl_re_printf( aTHX_  "%s", lead);
181            Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
182        }
183    }
184    if (lead)  {
185        if (set)
186            Perl_re_printf( aTHX_  "\n");
187        else
188            Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
189    }
190}
191
192static void
193S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
194{
195    int bit;
196    int set=0;
197    regex_charset cs;
198
199    ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
200
201    for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
202        if (flags & (1U<<bit)) {
203            if ((1U<<bit) & RXf_PMf_CHARSET) {  /* Output separately, below */
204                continue;
205            }
206            if (!set++ && lead)
207                Perl_re_printf( aTHX_  "%s", lead);
208            Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
209        }
210    }
211    if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
212            if (!set++ && lead) {
213                Perl_re_printf( aTHX_  "%s", lead);
214            }
215            switch (cs) {
216                case REGEX_UNICODE_CHARSET:
217                    Perl_re_printf( aTHX_  "UNICODE");
218                    break;
219                case REGEX_LOCALE_CHARSET:
220                    Perl_re_printf( aTHX_  "LOCALE");
221                    break;
222                case REGEX_ASCII_RESTRICTED_CHARSET:
223                    Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
224                    break;
225                case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
226                    Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
227                    break;
228                default:
229                    Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
230                    break;
231            }
232    }
233    if (lead)  {
234        if (set)
235            Perl_re_printf( aTHX_  "\n");
236        else
237            Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
238    }
239}
240#endif
241
242void
243Perl_regdump(pTHX_ const regexp *r)
244{
245#ifdef DEBUGGING
246    int i;
247    SV * const sv = sv_newmortal();
248    SV *dsv= sv_newmortal();
249    RXi_GET_DECL(r, ri);
250    DECLARE_AND_GET_RE_DEBUG_FLAGS;
251
252    PERL_ARGS_ASSERT_REGDUMP;
253
254    (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
255
256    /* Header fields of interest. */
257    for (i = 0; i < 2; i++) {
258        if (r->substrs->data[i].substr) {
259            RE_PV_QUOTED_DECL(s, 0, dsv,
260                            SvPVX_const(r->substrs->data[i].substr),
261                            RE_SV_DUMPLEN(r->substrs->data[i].substr),
262                            PL_dump_re_max_len);
263            Perl_re_printf( aTHX_
264                          "%s %s%s at %" IVdf "..%" UVuf " ",
265                          i ? "floating" : "anchored",
266                          s,
267                          RE_SV_TAIL(r->substrs->data[i].substr),
268                          (IV)r->substrs->data[i].min_offset,
269                          (UV)r->substrs->data[i].max_offset);
270        }
271        else if (r->substrs->data[i].utf8_substr) {
272            RE_PV_QUOTED_DECL(s, 1, dsv,
273                            SvPVX_const(r->substrs->data[i].utf8_substr),
274                            RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
275                            30);
276            Perl_re_printf( aTHX_
277                          "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
278                          i ? "floating" : "anchored",
279                          s,
280                          RE_SV_TAIL(r->substrs->data[i].utf8_substr),
281                          (IV)r->substrs->data[i].min_offset,
282                          (UV)r->substrs->data[i].max_offset);
283        }
284    }
285
286    if (r->check_substr || r->check_utf8)
287        Perl_re_printf( aTHX_
288                      (const char *)
289                      (   r->check_substr == r->substrs->data[1].substr
290                       && r->check_utf8   == r->substrs->data[1].utf8_substr
291                       ? "(checking floating" : "(checking anchored"));
292    if (r->intflags & PREGf_NOSCAN)
293        Perl_re_printf( aTHX_  " noscan");
294    if (r->extflags & RXf_CHECK_ALL)
295        Perl_re_printf( aTHX_  " isall");
296    if (r->check_substr || r->check_utf8)
297        Perl_re_printf( aTHX_  ") ");
298
299    if (ri->regstclass) {
300        regprop(r, sv, ri->regstclass, NULL, NULL);
301        Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
302    }
303    if (r->intflags & PREGf_ANCH) {
304        Perl_re_printf( aTHX_  "anchored");
305        if (r->intflags & PREGf_ANCH_MBOL)
306            Perl_re_printf( aTHX_  "(MBOL)");
307        if (r->intflags & PREGf_ANCH_SBOL)
308            Perl_re_printf( aTHX_  "(SBOL)");
309        if (r->intflags & PREGf_ANCH_GPOS)
310            Perl_re_printf( aTHX_  "(GPOS)");
311        Perl_re_printf( aTHX_ " ");
312    }
313    if (r->intflags & PREGf_GPOS_SEEN)
314        Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
315    if (r->intflags & PREGf_SKIP)
316        Perl_re_printf( aTHX_  "plus ");
317    if (r->intflags & PREGf_IMPLICIT)
318        Perl_re_printf( aTHX_  "implicit ");
319    Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
320    if (r->extflags & RXf_EVAL_SEEN)
321        Perl_re_printf( aTHX_  "with eval ");
322    Perl_re_printf( aTHX_  "\n");
323    DEBUG_FLAGS_r({
324        regdump_extflags("r->extflags: ", r->extflags);
325        regdump_intflags("r->intflags: ", r->intflags);
326    });
327#else
328    PERL_ARGS_ASSERT_REGDUMP;
329    PERL_UNUSED_CONTEXT;
330    PERL_UNUSED_ARG(r);
331#endif  /* DEBUGGING */
332}
333
334/* Should be synchronized with ANYOF_ #defines in regcomp.h */
335#ifdef DEBUGGING
336
337#  if   CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1        || CC_ALPHA_ != 2    \
338     || CC_LOWER_ != 3    || CC_UPPER_ != 4        || CC_PUNCT_ != 5    \
339     || CC_PRINT_ != 6    || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8    \
340     || CC_CASED_ != 9    || CC_SPACE_ != 10       || CC_BLANK_ != 11   \
341     || CC_XDIGIT_ != 12  || CC_CNTRL_ != 13       || CC_ASCII_ != 14   \
342     || CC_VERTSPACE_ != 15
343#   error Need to adjust order of anyofs[]
344#  endif
345static const char * const anyofs[] = {
346    "\\w",
347    "\\W",
348    "\\d",
349    "\\D",
350    "[:alpha:]",
351    "[:^alpha:]",
352    "[:lower:]",
353    "[:^lower:]",
354    "[:upper:]",
355    "[:^upper:]",
356    "[:punct:]",
357    "[:^punct:]",
358    "[:print:]",
359    "[:^print:]",
360    "[:alnum:]",
361    "[:^alnum:]",
362    "[:graph:]",
363    "[:^graph:]",
364    "[:cased:]",
365    "[:^cased:]",
366    "\\s",
367    "\\S",
368    "[:blank:]",
369    "[:^blank:]",
370    "[:xdigit:]",
371    "[:^xdigit:]",
372    "[:cntrl:]",
373    "[:^cntrl:]",
374    "[:ascii:]",
375    "[:^ascii:]",
376    "\\v",
377    "\\V"
378};
379#endif
380
381/*
382- regprop - printable representation of opcode, with run time support
383*/
384
385void
386Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
387{
388#ifdef DEBUGGING
389    U8 k;
390    const U8 op = OP(o);
391    RXi_GET_DECL(prog, progi);
392    DECLARE_AND_GET_RE_DEBUG_FLAGS;
393
394    PERL_ARGS_ASSERT_REGPROP;
395
396    SvPVCLEAR(sv);
397
398    if (op > REGNODE_MAX) {          /* regnode.type is unsigned */
399        if (pRExC_state) {  /* This gives more info, if we have it */
400            FAIL3("panic: corrupted regexp opcode %d > %d",
401                  (int)op, (int)REGNODE_MAX);
402        }
403        else {
404            Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
405                             (int)op, (int)REGNODE_MAX);
406        }
407    }
408    sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
409
410    k = REGNODE_TYPE(op);
411    if (op == BRANCH) {
412        Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o));
413    }
414    else if (op == BRANCHJ) {
415        Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o));
416    }
417    else if (k == EXACT) {
418        sv_catpvs(sv, " ");
419        /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
420         * is a crude hack but it may be the best for now since
421         * we have no flag "this EXACTish node was UTF-8"
422         * --jhi */
423        pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
424                  PL_colors[0], PL_colors[1],
425                  PERL_PV_ESCAPE_UNI_DETECT |
426                  PERL_PV_ESCAPE_NONASCII   |
427                  PERL_PV_PRETTY_ELLIPSES   |
428                  PERL_PV_PRETTY_LTGT       |
429                  PERL_PV_PRETTY_NOCLEAR
430                  );
431    } else if (k == TRIE) {
432        /* print the details of the trie in dumpuntil instead, as
433         * progi->data isn't available here */
434        const U32 n = ARG1u(o);
435        const reg_ac_data * const ac = IS_TRIE_AC(op) ?
436               (reg_ac_data *)progi->data->data[n] :
437               NULL;
438        const reg_trie_data * const trie
439            = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
440
441        Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o)));
442        DEBUG_TRIE_COMPILE_r({
443          if (trie->jump)
444            sv_catpvs(sv, "(JUMP)");
445          Perl_sv_catpvf(aTHX_ sv,
446            "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
447            (UV)trie->startstate,
448            (IV)trie->statecount-1, /* -1 because of the unused 0 element */
449            (UV)trie->wordcount,
450            (UV)trie->minlen,
451            (UV)trie->maxlen,
452            (UV)TRIE_CHARCOUNT(trie),
453            (UV)trie->uniquecharcount
454          );
455        });
456        if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
457            sv_catpvs(sv, "[");
458            (void) put_charclass_bitmap_innards(sv,
459                                                ((IS_ANYOF_TRIE(op))
460                                                 ? ANYOF_BITMAP(o)
461                                                 : TRIE_BITMAP(trie)),
462                                                NULL,
463                                                NULL,
464                                                NULL,
465                                                0,
466                                                FALSE
467                                               );
468            sv_catpvs(sv, "]");
469        }
470        if (trie->before_paren || trie->after_paren)
471            Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")",
472                    (IV)trie->before_paren,(IV)trie->after_paren);
473    } else if (k == CURLY) {
474        U32 lo = ARG1i(o), hi = ARG2i(o);
475        if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */
476            Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */
477        if (op == CURLYM || op == CURLYN || op == CURLYX)
478            Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */
479        Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
480        if (hi == REG_INFTY)
481            sv_catpvs(sv, "INFTY");
482        else
483            Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
484        sv_catpvs(sv, "}");
485    }
486    else if (k == WHILEM && FLAGS(o))                   /* Ordinal/of */
487        Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4);
488    else if (k == REF || k == OPEN || k == CLOSE
489             || k == GROUPP || op == ACCEPT)
490    {
491        AV *name_list= NULL;
492        U32 parno= (op == ACCEPT)              ? ARG2u(o) :
493                   (op == OPEN || op == CLOSE) ? PARNO(o) :
494                                                 ARG1u(o);
495        if ( RXp_PAREN_NAMES(prog) ) {
496            name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
497        } else if ( pRExC_state ) {
498            name_list= RExC_paren_name_list;
499        }
500        if ( name_list ) {
501            if ( k != REF || (op < REFN)) {
502                UV logical_parno = parno;
503                if (prog->parno_to_logical)
504                    logical_parno = prog->parno_to_logical[parno];
505
506                Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno);     /* Parenth number */
507                if (parno != logical_parno)
508                    Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno);        /* Parenth number */
509
510                SV **name= av_fetch_simple(name_list, parno, 0 );
511                if (name)
512                    Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
513            }
514            else
515            if (parno > 0) {
516                /* parno must always be larger than 0 for this block
517                 * as it represents a slot into the data array, which
518                 * has the 0 slot reserved for a placeholder so any valid
519                 * index into it is always true, eg non-zero
520                 * see the '%' "what" type and the implementation of
521                 * S_reg_add_data()
522                 */
523                SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
524                I32 *nums=(I32*)SvPVX(sv_dat);
525                SV **name= av_fetch_simple(name_list, nums[0], 0 );
526                I32 n;
527                if (name) {
528                    for ( n=0; n<SvIVX(sv_dat); n++ ) {
529                        Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
530                                    (n ? "," : ""), (IV)nums[n]);
531                    }
532                    Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
533                }
534            }
535        } else if (parno>0) {
536            UV logical_parno = parno;
537            if (prog->parno_to_logical)
538                logical_parno = prog->parno_to_logical[parno];
539
540            Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno);     /* Parenth number */
541            if (logical_parno != parno)
542                Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno);     /* Parenth number */
543
544        }
545        if ( k == REF ) {
546            Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o));
547        }
548        if ( k == REF && reginfo) {
549            U32 n = ARG1u(o);  /* which paren pair */
550            I32 ln = RXp_OFFS_START(prog,n);
551            if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1)
552                Perl_sv_catpvf(aTHX_ sv, ": FAIL");
553            else if (ln == RXp_OFFS_END(prog,n))
554                Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
555            else {
556                const char *s = reginfo->strbeg + ln;
557                Perl_sv_catpvf(aTHX_ sv, ": ");
558                Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0,
559                    PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
560            }
561        }
562    } else if (k == GOSUB) {
563        AV *name_list= NULL;
564        IV parno = ARG1u(o);
565        IV logical_parno = (parno && prog->parno_to_logical)
566                         ? prog->parno_to_logical[parno]
567                         : parno;
568        if ( RXp_PAREN_NAMES(prog) ) {
569            name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
570        } else if ( pRExC_state ) {
571            name_list= RExC_paren_name_list;
572        }
573
574        /* Paren and offset */
575        Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno);
576        if (logical_parno != parno)
577            Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno);
578
579        Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o),
580                (int)((o + (int)ARG2i(o)) - progi->program) );
581        if (name_list) {
582            SV **name= av_fetch_simple(name_list, ARG1u(o), 0 );
583            if (name)
584                Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
585        }
586    }
587    else if (k == LOGICAL)
588        /* 2: embedded, otherwise 1 */
589        Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o));
590    else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
591        U8 flags;
592        char * bitmap;
593        U8 do_sep = 0;    /* Do we need to separate various components of the
594                             output? */
595        /* Set if there is still an unresolved user-defined property */
596        SV *unresolved                = NULL;
597
598        /* Things that are ignored except when the runtime locale is UTF-8 */
599        SV *only_utf8_locale_invlist = NULL;
600
601        /* Code points that don't fit in the bitmap */
602        SV *nonbitmap_invlist = NULL;
603
604        /* And things that aren't in the bitmap, but are small enough to be */
605        SV* bitmap_range_not_in_bitmap = NULL;
606
607        bool inverted;
608
609        if (k != ANYOF) {
610            flags = 0;
611            bitmap = NULL;
612        }
613        else {
614            flags = ANYOF_FLAGS(o);
615            bitmap = ANYOF_BITMAP(o);
616        }
617
618        if (op == ANYOFL || op == ANYOFPOSIXL) {
619            if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
620                sv_catpvs(sv, "{utf8-locale-reqd}");
621            }
622            if (flags & ANYOFL_FOLD) {
623                sv_catpvs(sv, "{i}");
624            }
625        }
626
627        inverted = flags & ANYOF_INVERT;
628
629        /* If there is stuff outside the bitmap, get it */
630        if (k == ANYOFR) {
631
632            /* For a single range, split into the parts inside vs outside the
633             * bitmap. */
634            UV start = ANYOFRbase(o);
635            UV end   = ANYOFRbase(o) + ANYOFRdelta(o);
636
637            if (start < NUM_ANYOF_CODE_POINTS) {
638                if (end < NUM_ANYOF_CODE_POINTS) {
639                    bitmap_range_not_in_bitmap
640                          = _add_range_to_invlist(bitmap_range_not_in_bitmap,
641                                                  start, end);
642                }
643                else {
644                    bitmap_range_not_in_bitmap
645                          = _add_range_to_invlist(bitmap_range_not_in_bitmap,
646                                                  start, NUM_ANYOF_CODE_POINTS);
647                    start = NUM_ANYOF_CODE_POINTS;
648                }
649            }
650
651            if (start >= NUM_ANYOF_CODE_POINTS) {
652                nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
653                                                ANYOFRbase(o),
654                                                ANYOFRbase(o) + ANYOFRdelta(o));
655            }
656        }
657        else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
658            nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
659                                                      NUM_ANYOF_CODE_POINTS,
660                                                      UV_MAX);
661        }
662        else if (ANYOF_HAS_AUX(o)) {
663                (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
664                                                &unresolved,
665                                                &only_utf8_locale_invlist,
666                                                &nonbitmap_invlist);
667
668            /* The aux data may contain stuff that could fit in the bitmap.
669             * This could come from a user-defined property being finally
670             * resolved when this call was done; or much more likely because
671             * there are matches that require UTF-8 to be valid, and so aren't
672             * in the bitmap (or ANYOFR).  This is teased apart later */
673            _invlist_intersection(nonbitmap_invlist,
674                                  PL_InBitmap,
675                                  &bitmap_range_not_in_bitmap);
676            /* Leave just the things that don't fit into the bitmap */
677            _invlist_subtract(nonbitmap_invlist,
678                              PL_InBitmap,
679                              &nonbitmap_invlist);
680        }
681
682        /* Ready to start outputting.  First, the initial left bracket */
683        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
684
685        if (   bitmap
686            || bitmap_range_not_in_bitmap
687            || only_utf8_locale_invlist
688            || unresolved)
689        {
690            /* Then all the things that could fit in the bitmap */
691            do_sep = put_charclass_bitmap_innards(
692                                    sv,
693                                    bitmap,
694                                    bitmap_range_not_in_bitmap,
695                                    only_utf8_locale_invlist,
696                                    o,
697                                    flags,
698
699                                    /* Can't try inverting for a
700                                                   * better display if there
701                                                   * are things that haven't
702                                                   * been resolved */
703                                    (unresolved != NULL || k == ANYOFR));
704            SvREFCNT_dec(bitmap_range_not_in_bitmap);
705
706            /* If there are user-defined properties which haven't been defined
707             * yet, output them.  If the result is not to be inverted, it is
708             * clearest to output them in a separate [] from the bitmap range
709             * stuff.  If the result is to be complemented, we have to show
710             * everything in one [], as the inversion applies to the whole
711             * thing.  Use {braces} to separate them from anything in the
712             * bitmap and anything above the bitmap. */
713            if (unresolved) {
714                if (inverted) {
715                    if (! do_sep) { /* If didn't output anything in the bitmap
716                                     */
717                        sv_catpvs(sv, "^");
718                    }
719                    sv_catpvs(sv, "{");
720                }
721                else if (do_sep) {
722                    Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
723                                                      PL_colors[0]);
724                }
725                sv_catsv(sv, unresolved);
726                if (inverted) {
727                    sv_catpvs(sv, "}");
728                }
729                do_sep = ! inverted;
730            }
731            else if (     do_sep == 2
732                     && ! nonbitmap_invlist
733                     &&   ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
734            {
735                /* Here, the display shows the class as inverted, and
736                 * everything above the lower display should also match, but
737                 * there is no indication of that.  Add this range so the code
738                 * below will add it to the display */
739                _invlist_union_complement_2nd(nonbitmap_invlist,
740                                              PL_InBitmap,
741                                              &nonbitmap_invlist);
742            }
743        }
744
745        /* And, finally, add the above-the-bitmap stuff */
746        if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
747            SV* contents;
748
749            /* See if truncation size is overridden */
750            const STRLEN dump_len = (PL_dump_re_max_len > 256)
751                                    ? PL_dump_re_max_len
752                                    : 256;
753
754            /* This is output in a separate [] */
755            if (do_sep) {
756                Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
757            }
758
759            /* And, for easy of understanding, it is shown in the
760             * uncomplemented form if possible.  The one exception being if
761             * there are unresolved items, where the inversion has to be
762             * delayed until runtime */
763            if (inverted && ! unresolved) {
764                _invlist_invert(nonbitmap_invlist);
765                _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
766            }
767
768            contents = invlist_contents(nonbitmap_invlist,
769                                        FALSE /* output suitable for catsv */
770                                       );
771
772            /* If the output is shorter than the permissible maximum, just do it. */
773            if (SvCUR(contents) <= dump_len) {
774                sv_catsv(sv, contents);
775            }
776            else {
777                const char * contents_string = SvPVX(contents);
778                STRLEN i = dump_len;
779
780                /* Otherwise, start at the permissible max and work back to the
781                 * first break possibility */
782                while (i > 0 && contents_string[i] != ' ') {
783                    i--;
784                }
785                if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
786                                       find a legal break */
787                    i = dump_len;
788                }
789
790                sv_catpvn(sv, contents_string, i);
791                sv_catpvs(sv, "...");
792            }
793
794            SvREFCNT_dec_NN(contents);
795            SvREFCNT_dec_NN(nonbitmap_invlist);
796        }
797
798        /* And finally the matching, closing ']' */
799        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
800
801        if (op == ANYOFHs) {
802            Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
803        }
804        else if (REGNODE_TYPE(op) != ANYOF) {
805            U8 lowest = (op != ANYOFHr)
806                         ? FLAGS(o)
807                         : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
808            U8 highest = (op == ANYOFHr)
809                         ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
810                         : (op == ANYOFH || op == ANYOFR)
811                           ? 0xFF
812                           : lowest;
813#ifndef EBCDIC
814            if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
815#endif
816            {
817                Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
818                if (lowest != highest) {
819                    Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
820                }
821                Perl_sv_catpvf(aTHX_ sv, ")");
822            }
823        }
824
825        SvREFCNT_dec(unresolved);
826    }
827    else if (k == ANYOFM) {
828        SV * cp_list = get_ANYOFM_contents(o);
829
830        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
831        if (op == NANYOFM) {
832            _invlist_invert(cp_list);
833        }
834
835        put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
836        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
837
838        SvREFCNT_dec(cp_list);
839    }
840    else if (k == ANYOFHbbm) {
841        SV * cp_list = get_ANYOFHbbm_contents(o);
842        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
843
844        sv_catsv(sv, invlist_contents(cp_list,
845                                      FALSE /* output suitable for catsv */
846                                     ));
847        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
848
849        SvREFCNT_dec(cp_list);
850    }
851    else if (k == POSIXD || k == NPOSIXD) {
852        U8 index = FLAGS(o) * 2;
853        if (index < C_ARRAY_LENGTH(anyofs)) {
854            if (*anyofs[index] != '[')  {
855                sv_catpvs(sv, "[");
856            }
857            sv_catpv(sv, anyofs[index]);
858            if (*anyofs[index] != '[')  {
859                sv_catpvs(sv, "]");
860            }
861        }
862        else {
863            Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
864        }
865    }
866    else if (k == BOUND || k == NBOUND) {
867        /* Must be synced with order of 'bound_type' in regcomp.h */
868        const char * const bounds[] = {
869            "",      /* Traditional */
870            "{gcb}",
871            "{lb}",
872            "{sb}",
873            "{wb}"
874        };
875        assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
876        sv_catpv(sv, bounds[FLAGS(o)]);
877    }
878    else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
879        Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o)));
880        if (NEXT_OFF(o)) {
881            Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o));
882        }
883        Perl_sv_catpvf(aTHX_ sv, "]");
884    }
885    else if (op == SBOL)
886        Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^");
887    else if (op == EVAL) {
888        if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG)
889            Perl_sv_catpvf(aTHX_ sv, " optimistic");
890    }
891
892    /* add on the verb argument if there is one */
893    if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) {
894        if ( ARG1u(o) )
895            Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
896                       SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ]))));
897        else
898            sv_catpvs(sv, ":NULL");
899    }
900#else
901    PERL_UNUSED_CONTEXT;
902    PERL_UNUSED_ARG(sv);
903    PERL_UNUSED_ARG(o);
904    PERL_UNUSED_ARG(prog);
905    PERL_UNUSED_ARG(reginfo);
906    PERL_UNUSED_ARG(pRExC_state);
907#endif  /* DEBUGGING */
908}
909
910#ifdef DEBUGGING
911
912STATIC void
913S_put_code_point(pTHX_ SV *sv, UV c)
914{
915    PERL_ARGS_ASSERT_PUT_CODE_POINT;
916
917    if (c > 255) {
918        Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
919    }
920    else if (isPRINT(c)) {
921        const char string = (char) c;
922
923        /* We use {phrase} as metanotation in the class, so also escape literal
924         * braces */
925        if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
926            sv_catpvs(sv, "\\");
927        sv_catpvn(sv, &string, 1);
928    }
929    else if (isMNEMONIC_CNTRL(c)) {
930        Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
931    }
932    else {
933        Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
934    }
935}
936
937STATIC void
938S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
939{
940    /* Appends to 'sv' a displayable version of the range of code points from
941     * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
942     * that have them, when they occur at the beginning or end of the range.
943     * It uses hex to output the remaining code points, unless 'allow_literals'
944     * is true, in which case the printable ASCII ones are output as-is (though
945     * some of these will be escaped by put_code_point()).
946     *
947     * NOTE:  This is designed only for printing ranges of code points that fit
948     *        inside an ANYOF bitmap.  Higher code points are simply suppressed
949     */
950
951    const unsigned int min_range_count = 3;
952
953    assert(start <= end);
954
955    PERL_ARGS_ASSERT_PUT_RANGE;
956
957    while (start <= end) {
958        UV this_end;
959        const char * format;
960
961        if (    end - start < min_range_count
962            && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
963        {
964            /* Output a range of 1 or 2 chars individually, or longer ranges
965             * when printable */
966            for (; start <= end; start++) {
967                put_code_point(sv, start);
968            }
969            break;
970        }
971
972        /* If permitted by the input options, and there is a possibility that
973         * this range contains a printable literal, look to see if there is
974         * one. */
975        if (allow_literals && start <= MAX_PRINT_A) {
976
977            /* If the character at the beginning of the range isn't an ASCII
978             * printable, effectively split the range into two parts:
979             *  1) the portion before the first such printable,
980             *  2) the rest
981             * and output them separately. */
982            if (! isPRINT_A(start)) {
983                UV temp_end = start + 1;
984
985                /* There is no point looking beyond the final possible
986                 * printable, in MAX_PRINT_A */
987                UV max = MIN(end, MAX_PRINT_A);
988
989                while (temp_end <= max && ! isPRINT_A(temp_end)) {
990                    temp_end++;
991                }
992
993                /* Here, temp_end points to one beyond the first printable if
994                 * found, or to one beyond 'max' if not.  If none found, make
995                 * sure that we use the entire range */
996                if (temp_end > MAX_PRINT_A) {
997                    temp_end = end + 1;
998                }
999
1000                /* Output the first part of the split range: the part that
1001                 * doesn't have printables, with the parameter set to not look
1002                 * for literals (otherwise we would infinitely recurse) */
1003                put_range(sv, start, temp_end - 1, FALSE);
1004
1005                /* The 2nd part of the range (if any) starts here. */
1006                start = temp_end;
1007
1008                /* We do a continue, instead of dropping down, because even if
1009                 * the 2nd part is non-empty, it could be so short that we want
1010                 * to output it as individual characters, as tested for at the
1011                 * top of this loop.  */
1012                continue;
1013            }
1014
1015            /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
1016             * output a sub-range of just the digits or letters, then process
1017             * the remaining portion as usual. */
1018            if (isALPHANUMERIC_A(start)) {
1019                UV mask = (isDIGIT_A(start))
1020                           ? CC_DIGIT_
1021                             : isUPPER_A(start)
1022                               ? CC_UPPER_
1023                               : CC_LOWER_;
1024                UV temp_end = start + 1;
1025
1026                /* Find the end of the sub-range that includes just the
1027                 * characters in the same class as the first character in it */
1028                while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
1029                    temp_end++;
1030                }
1031                temp_end--;
1032
1033                /* For short ranges, don't duplicate the code above to output
1034                 * them; just call recursively */
1035                if (temp_end - start < min_range_count) {
1036                    put_range(sv, start, temp_end, FALSE);
1037                }
1038                else {  /* Output as a range */
1039                    put_code_point(sv, start);
1040                    sv_catpvs(sv, "-");
1041                    put_code_point(sv, temp_end);
1042                }
1043                start = temp_end + 1;
1044                continue;
1045            }
1046
1047            /* We output any other printables as individual characters */
1048            if (isPUNCT_A(start) || isSPACE_A(start)) {
1049                while (start <= end && (isPUNCT_A(start)
1050                                        || isSPACE_A(start)))
1051                {
1052                    put_code_point(sv, start);
1053                    start++;
1054                }
1055                continue;
1056            }
1057        } /* End of looking for literals */
1058
1059        /* Here is not to output as a literal.  Some control characters have
1060         * mnemonic names.  Split off any of those at the beginning and end of
1061         * the range to print mnemonically.  It isn't possible for many of
1062         * these to be in a row, so this won't overwhelm with output */
1063        if (   start <= end
1064            && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
1065        {
1066            while (isMNEMONIC_CNTRL(start) && start <= end) {
1067                put_code_point(sv, start);
1068                start++;
1069            }
1070
1071            /* If this didn't take care of the whole range ... */
1072            if (start <= end) {
1073
1074                /* Look backwards from the end to find the final non-mnemonic
1075                 * */
1076                UV temp_end = end;
1077                while (isMNEMONIC_CNTRL(temp_end)) {
1078                    temp_end--;
1079                }
1080
1081                /* And separately output the interior range that doesn't start
1082                 * or end with mnemonics */
1083                put_range(sv, start, temp_end, FALSE);
1084
1085                /* Then output the mnemonic trailing controls */
1086                start = temp_end + 1;
1087                while (start <= end) {
1088                    put_code_point(sv, start);
1089                    start++;
1090                }
1091                break;
1092            }
1093        }
1094
1095        /* As a final resort, output the range or subrange as hex. */
1096
1097        if (start >= NUM_ANYOF_CODE_POINTS) {
1098            this_end = end;
1099        }
1100        else {  /* Have to split range at the bitmap boundary */
1101            this_end = (end < NUM_ANYOF_CODE_POINTS)
1102                        ? end
1103                        : NUM_ANYOF_CODE_POINTS - 1;
1104        }
1105#if NUM_ANYOF_CODE_POINTS > 256
1106        format = (this_end < 256)
1107                 ? "\\x%02" UVXf "-\\x%02" UVXf
1108                 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
1109#else
1110        format = "\\x%02" UVXf "-\\x%02" UVXf;
1111#endif
1112        GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
1113        Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
1114        GCC_DIAG_RESTORE_STMT;
1115        break;
1116    }
1117}
1118
1119STATIC void
1120S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
1121{
1122    /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
1123     * 'invlist' */
1124
1125    UV start, end;
1126    bool allow_literals = TRUE;
1127
1128    PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
1129
1130    /* Generally, it is more readable if printable characters are output as
1131     * literals, but if a range (nearly) spans all of them, it's best to output
1132     * it as a single range.  This code will use a single range if all but 2
1133     * ASCII printables are in it */
1134    invlist_iterinit(invlist);
1135    while (invlist_iternext(invlist, &start, &end)) {
1136
1137        /* If the range starts beyond the final printable, it doesn't have any
1138         * in it */
1139        if (start > MAX_PRINT_A) {
1140            break;
1141        }
1142
1143        /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
1144         * all but two, the range must start and end no later than 2 from
1145         * either end */
1146        if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
1147            if (end > MAX_PRINT_A) {
1148                end = MAX_PRINT_A;
1149            }
1150            if (start < ' ') {
1151                start = ' ';
1152            }
1153            if (end - start >= MAX_PRINT_A - ' ' - 2) {
1154                allow_literals = FALSE;
1155            }
1156            break;
1157        }
1158    }
1159    invlist_iterfinish(invlist);
1160
1161    /* Here we have figured things out.  Output each range */
1162    invlist_iterinit(invlist);
1163    while (invlist_iternext(invlist, &start, &end)) {
1164        if (start >= NUM_ANYOF_CODE_POINTS) {
1165            break;
1166        }
1167        put_range(sv, start, end, allow_literals);
1168    }
1169    invlist_iterfinish(invlist);
1170
1171    return;
1172}
1173
1174STATIC SV*
1175S_put_charclass_bitmap_innards_common(pTHX_
1176        SV* invlist,            /* The bitmap */
1177        SV* posixes,            /* Under /l, things like [:word:], \S */
1178        SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
1179        SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
1180        SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
1181        const bool invert       /* Is the result to be inverted? */
1182)
1183{
1184    /* Create and return an SV containing a displayable version of the bitmap
1185     * and associated information determined by the input parameters.  If the
1186     * output would have been only the inversion indicator '^', NULL is instead
1187     * returned. */
1188
1189    SV * output;
1190
1191    PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
1192
1193    if (invert) {
1194        output = newSVpvs("^");
1195    }
1196    else {
1197        output = newSVpvs("");
1198    }
1199
1200    /* First, the code points in the bitmap that are unconditionally there */
1201    put_charclass_bitmap_innards_invlist(output, invlist);
1202
1203    /* Traditionally, these have been placed after the main code points */
1204    if (posixes) {
1205        sv_catsv(output, posixes);
1206    }
1207
1208    if (only_utf8 && _invlist_len(only_utf8)) {
1209        Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
1210        put_charclass_bitmap_innards_invlist(output, only_utf8);
1211    }
1212
1213    if (not_utf8 && _invlist_len(not_utf8)) {
1214        Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
1215        put_charclass_bitmap_innards_invlist(output, not_utf8);
1216    }
1217
1218    if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
1219        Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
1220        put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
1221
1222        /* This is the only list in this routine that can legally contain code
1223         * points outside the bitmap range.  The call just above to
1224         * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
1225         * output them here.  There's about a half-dozen possible, and none in
1226         * contiguous ranges longer than 2 */
1227        if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1228            UV start, end;
1229            SV* above_bitmap = NULL;
1230
1231            _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
1232
1233            invlist_iterinit(above_bitmap);
1234            while (invlist_iternext(above_bitmap, &start, &end)) {
1235                UV i;
1236
1237                for (i = start; i <= end; i++) {
1238                    put_code_point(output, i);
1239                }
1240            }
1241            invlist_iterfinish(above_bitmap);
1242            SvREFCNT_dec_NN(above_bitmap);
1243        }
1244    }
1245
1246    if (invert && SvCUR(output) == 1) {
1247        return NULL;
1248    }
1249
1250    return output;
1251}
1252
1253STATIC U8
1254S_put_charclass_bitmap_innards(pTHX_ SV *sv,
1255                                     char *bitmap,
1256                                     SV *nonbitmap_invlist,
1257                                     SV *only_utf8_locale_invlist,
1258                                     const regnode * const node,
1259                                     const U8 flags,
1260                                     const bool force_as_is_display)
1261{
1262    /* Appends to 'sv' a displayable version of the innards of the bracketed
1263     * character class defined by the other arguments:
1264     *  'bitmap' points to the bitmap, or NULL if to ignore that.
1265     *  'nonbitmap_invlist' is an inversion list of the code points that are in
1266     *      the bitmap range, but for some reason aren't in the bitmap; NULL if
1267     *      none.  The reasons for this could be that they require some
1268     *      condition such as the target string being or not being in UTF-8
1269     *      (under /d), or because they came from a user-defined property that
1270     *      was not resolved at the time of the regex compilation (under /u)
1271     *  'only_utf8_locale_invlist' is an inversion list of the code points that
1272     *      are valid only if the runtime locale is a UTF-8 one; NULL if none
1273     *  'node' is the regex pattern ANYOF node.  It is needed only when the
1274     *      above two parameters are not null, and is passed so that this
1275     *      routine can tease apart the various reasons for them.
1276     *  'flags' is the flags field of 'node'
1277     *  'force_as_is_display' is TRUE if this routine should definitely NOT try
1278     *      to invert things to see if that leads to a cleaner display.  If
1279     *      FALSE, this routine is free to use its judgment about doing this.
1280     *
1281     * It returns 0 if nothing was actually output.  (It may be that
1282     *              the bitmap, etc is empty.)
1283     *            1 if the output wasn't inverted (didn't begin with a '^')
1284     *            2 if the output was inverted (did begin with a '^')
1285     *
1286     * When called for outputting the bitmap of a non-ANYOF node, just pass the
1287     * bitmap, with the succeeding parameters set to NULL, and the final one to
1288     * FALSE.
1289     */
1290
1291    /* In general, it tries to display the 'cleanest' representation of the
1292     * innards, choosing whether to display them inverted or not, regardless of
1293     * whether the class itself is to be inverted.  However,  there are some
1294     * cases where it can't try inverting, as what actually matches isn't known
1295     * until runtime, and hence the inversion isn't either. */
1296
1297    bool inverting_allowed = ! force_as_is_display;
1298
1299    int i;
1300    STRLEN orig_sv_cur = SvCUR(sv);
1301
1302    SV* invlist;            /* Inversion list we accumulate of code points that
1303                               are unconditionally matched */
1304    SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
1305                               UTF-8 */
1306    SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
1307                             */
1308    SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
1309    SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
1310                                       is UTF-8 */
1311
1312    SV* as_is_display;      /* The output string when we take the inputs
1313                               literally */
1314    SV* inverted_display;   /* The output string when we invert the inputs */
1315
1316    bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
1317                                                   to match? */
1318    /* We are biased in favor of displaying things without them being inverted,
1319     * as that is generally easier to understand */
1320    const int bias = 5;
1321
1322    PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
1323
1324    /* Start off with whatever code points are passed in.  (We clone, so we
1325     * don't change the caller's list) */
1326    if (nonbitmap_invlist) {
1327        assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
1328        invlist = invlist_clone(nonbitmap_invlist, NULL);
1329    }
1330    else {  /* Worst case size is every other code point is matched */
1331        invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
1332    }
1333
1334    if (flags) {
1335        if (OP(node) == ANYOFD) {
1336
1337            /* This flag indicates that the code points below 0x100 in the
1338             * nonbitmap list are precisely the ones that match only when the
1339             * target is UTF-8 (they should all be non-ASCII). */
1340            if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
1341                _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
1342                _invlist_subtract(invlist, only_utf8, &invlist);
1343            }
1344
1345            /* And this flag for matching all non-ASCII 0xFF and below */
1346            if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
1347                not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
1348            }
1349        }
1350        else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
1351
1352            /* If either of these flags are set, what matches isn't
1353             * determinable except during execution, so don't know enough here
1354             * to invert */
1355            if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
1356                inverting_allowed = FALSE;
1357            }
1358
1359            /* What the posix classes match also varies at runtime, so these
1360             * will be output symbolically. */
1361            if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
1362                int i;
1363
1364                posixes = newSVpvs("");
1365                for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
1366                    if (ANYOF_POSIXL_TEST(node, i)) {
1367                        sv_catpv(posixes, anyofs[i]);
1368                    }
1369                }
1370            }
1371        }
1372    }
1373
1374    /* Accumulate the bit map into the unconditional match list */
1375    if (bitmap) {
1376        for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1377            if (BITMAP_TEST(bitmap, i)) {
1378                int start = i++;
1379                for (;
1380                     i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
1381                     i++)
1382                { /* empty */ }
1383                invlist = _add_range_to_invlist(invlist, start, i-1);
1384            }
1385        }
1386    }
1387
1388    /* Make sure that the conditional match lists don't have anything in them
1389     * that match unconditionally; otherwise the output is quite confusing.
1390     * This could happen if the code that populates these misses some
1391     * duplication. */
1392    if (only_utf8) {
1393        _invlist_subtract(only_utf8, invlist, &only_utf8);
1394    }
1395    if (not_utf8) {
1396        _invlist_subtract(not_utf8, invlist, &not_utf8);
1397    }
1398
1399    if (only_utf8_locale_invlist) {
1400
1401        /* Since this list is passed in, we have to make a copy before
1402         * modifying it */
1403        only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
1404
1405        _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
1406
1407        /* And, it can get really weird for us to try outputting an inverted
1408         * form of this list when it has things above the bitmap, so don't even
1409         * try */
1410        if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1411            inverting_allowed = FALSE;
1412        }
1413    }
1414
1415    /* Calculate what the output would be if we take the input as-is */
1416    as_is_display = put_charclass_bitmap_innards_common(invlist,
1417                                                    posixes,
1418                                                    only_utf8,
1419                                                    not_utf8,
1420                                                    only_utf8_locale,
1421                                                    invert);
1422
1423    /* If have to take the output as-is, just do that */
1424    if (! inverting_allowed) {
1425        if (as_is_display) {
1426            sv_catsv(sv, as_is_display);
1427            SvREFCNT_dec_NN(as_is_display);
1428        }
1429    }
1430    else { /* But otherwise, create the output again on the inverted input, and
1431              use whichever version is shorter */
1432
1433        int inverted_bias, as_is_bias;
1434
1435        /* We will apply our bias to whichever of the results doesn't have
1436         * the '^' */
1437        bool trial_invert;
1438        if (invert) {
1439            trial_invert = FALSE;
1440            as_is_bias = bias;
1441            inverted_bias = 0;
1442        }
1443        else {
1444            trial_invert = TRUE;
1445            as_is_bias = 0;
1446            inverted_bias = bias;
1447        }
1448
1449        /* Now invert each of the lists that contribute to the output,
1450         * excluding from the result things outside the possible range */
1451
1452        /* For the unconditional inversion list, we have to add in all the
1453         * conditional code points, so that when inverted, they will be gone
1454         * from it */
1455        _invlist_union(only_utf8, invlist, &invlist);
1456        _invlist_union(not_utf8, invlist, &invlist);
1457        _invlist_union(only_utf8_locale, invlist, &invlist);
1458        _invlist_invert(invlist);
1459        _invlist_intersection(invlist, PL_InBitmap, &invlist);
1460
1461        if (only_utf8) {
1462            _invlist_invert(only_utf8);
1463            _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
1464        }
1465        else if (not_utf8) {
1466
1467            /* If a code point matches iff the target string is not in UTF-8,
1468             * then complementing the result has it not match iff not in UTF-8,
1469             * which is the same thing as matching iff it is UTF-8. */
1470            only_utf8 = not_utf8;
1471            not_utf8 = NULL;
1472        }
1473
1474        if (only_utf8_locale) {
1475            _invlist_invert(only_utf8_locale);
1476            _invlist_intersection(only_utf8_locale,
1477                                  PL_InBitmap,
1478                                  &only_utf8_locale);
1479        }
1480
1481        inverted_display = put_charclass_bitmap_innards_common(
1482                                            invlist,
1483                                            posixes,
1484                                            only_utf8,
1485                                            not_utf8,
1486                                            only_utf8_locale, trial_invert);
1487
1488        /* Use the shortest representation, taking into account our bias
1489         * against showing it inverted */
1490        if (   inverted_display
1491            && (   ! as_is_display
1492                || (  SvCUR(inverted_display) + inverted_bias
1493                    < SvCUR(as_is_display)    + as_is_bias)))
1494        {
1495            sv_catsv(sv, inverted_display);
1496            invert = ! invert;
1497        }
1498        else if (as_is_display) {
1499            sv_catsv(sv, as_is_display);
1500        }
1501
1502        SvREFCNT_dec(as_is_display);
1503        SvREFCNT_dec(inverted_display);
1504    }
1505
1506    SvREFCNT_dec_NN(invlist);
1507    SvREFCNT_dec(only_utf8);
1508    SvREFCNT_dec(not_utf8);
1509    SvREFCNT_dec(posixes);
1510    SvREFCNT_dec(only_utf8_locale);
1511
1512    U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
1513    if (did_output_something) {
1514        /* Distinguish between non and inverted cases */
1515        did_output_something += invert;
1516    }
1517
1518    return did_output_something;
1519}
1520
1521
1522const regnode *
1523Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
1524            const regnode *last, const regnode *plast,
1525            SV* sv, I32 indent, U32 depth)
1526{
1527    const regnode *next;
1528    const regnode *optstart= NULL;
1529
1530    RXi_GET_DECL(r, ri);
1531    DECLARE_AND_GET_RE_DEBUG_FLAGS;
1532
1533    PERL_ARGS_ASSERT_DUMPUNTIL;
1534
1535#ifdef DEBUG_DUMPUNTIL
1536    Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
1537        last ? last-start : 0, plast ? plast-start : 0);
1538#endif
1539
1540    if (plast && plast < last)
1541        last= plast;
1542
1543    while (node && (!last || node < last)) {
1544        const U8 op = OP(node);
1545
1546        if (op == CLOSE || op == SRCLOSE || op == WHILEM)
1547            indent--;
1548        next = regnext((regnode *)node);
1549        const regnode *after = regnode_after((regnode *)node,0);
1550
1551        /* Where, what. */
1552        if (op == OPTIMIZED) {
1553            if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
1554                optstart = node;
1555            else
1556                goto after_print;
1557        } else
1558            CLEAR_OPTSTART;
1559
1560        regprop(r, sv, node, NULL, NULL);
1561        Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
1562                      (int)(2*indent + 1), "", SvPVX_const(sv));
1563
1564        if (op != OPTIMIZED) {
1565            if (next == NULL)           /* Next ptr. */
1566                Perl_re_printf( aTHX_  " (0)");
1567            else if (REGNODE_TYPE(op) == BRANCH
1568                     && REGNODE_TYPE(OP(next)) != BRANCH )
1569                Perl_re_printf( aTHX_  " (FAIL)");
1570            else
1571                Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
1572            Perl_re_printf( aTHX_ "\n");
1573        }
1574
1575      after_print:
1576        if (REGNODE_TYPE(op) == BRANCHJ) {
1577            assert(next);
1578            const regnode *nnode = (OP(next) == LONGJMP
1579                                   ? regnext((regnode *)next)
1580                                   : next);
1581            if (last && nnode > last)
1582                nnode = last;
1583            DUMPUNTIL(after, nnode);
1584        }
1585        else if (REGNODE_TYPE(op) == BRANCH) {
1586            assert(next);
1587            DUMPUNTIL(after, next);
1588        }
1589        else if ( REGNODE_TYPE(op)  == TRIE ) {
1590            const regnode *this_trie = node;
1591            const U32 n = ARG1u(node);
1592            const reg_ac_data * const ac = op>=AHOCORASICK ?
1593               (reg_ac_data *)ri->data->data[n] :
1594               NULL;
1595            const reg_trie_data * const trie =
1596                (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
1597#ifdef DEBUGGING
1598            AV *const trie_words
1599                           = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
1600#endif
1601            const regnode *nextbranch= NULL;
1602            I32 word_idx;
1603            SvPVCLEAR(sv);
1604            for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
1605                SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0);
1606
1607                Perl_re_indentf( aTHX_  "%s ",
1608                    indent+3,
1609                    elem_ptr
1610                    ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
1611                                SvCUR(*elem_ptr), PL_dump_re_max_len,
1612                                PL_colors[0], PL_colors[1],
1613                                (SvUTF8(*elem_ptr)
1614                                 ? PERL_PV_ESCAPE_UNI
1615                                 : 0)
1616                                | PERL_PV_PRETTY_ELLIPSES
1617                                | PERL_PV_PRETTY_LTGT
1618                            )
1619                    : "???"
1620                );
1621                if (trie->jump) {
1622                    U16 dist= trie->jump[word_idx+1];
1623                    Perl_re_printf( aTHX_  "(%" UVuf ")\n",
1624                               (UV)((dist ? this_trie + dist : next) - start));
1625                    if (dist) {
1626                        if (!nextbranch)
1627                            nextbranch= this_trie + trie->jump[0];
1628                        DUMPUNTIL(this_trie + dist, nextbranch);
1629                    }
1630                    if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
1631                        nextbranch= regnext((regnode *)nextbranch);
1632                } else {
1633                    Perl_re_printf( aTHX_  "\n");
1634                }
1635            }
1636            if (last && next > last)
1637                node= last;
1638            else
1639                node= next;
1640        }
1641        else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
1642            DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
1643        }
1644        else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
1645            assert(next);
1646            DUMPUNTIL(after, next);
1647        }
1648        else if ( op == PLUS || op == STAR) {
1649            DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
1650        }
1651        else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
1652            /* Literal string, where present. */
1653            node = (const regnode *)REGNODE_AFTER_varies(node);
1654        }
1655        else {
1656            node = REGNODE_AFTER_opcode(node,op);
1657        }
1658        if (op == CURLYX || op == OPEN || op == SROPEN)
1659            indent++;
1660        if (REGNODE_TYPE(op) == END)
1661            break;
1662    }
1663    CLEAR_OPTSTART;
1664#ifdef DEBUG_DUMPUNTIL
1665    Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
1666#endif
1667    return node;
1668}
1669
1670#endif  /* DEBUGGING */
1671