1/*    regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
6 *
7 *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression.  See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below).  Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places.  In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34/*
35 * pregcomp and pregexec -- regsub and regerror are not used in perl
36 *
37 *	Copyright (c) 1986 by University of Toronto.
38 *	Written by Henry Spencer.  Not derived from licensed software.
39 *
40 *	Permission is granted to anyone to use this software for any
41 *	purpose on any computer system, and to redistribute it freely,
42 *	subject to the following restrictions:
43 *
44 *	1. The author is not responsible for the consequences of use of
45 *		this software, no matter how awful, even if they arise
46 *		from defects in it.
47 *
48 *	2. The origin of this software must not be misrepresented, either
49 *		by explicit claim or by omission.
50 *
51 *	3. Altered versions must be plainly marked as such, and must not
52 *		be misrepresented as being the original software.
53 *
54 *
55 ****    Alterations to Henry's code are...
56 ****
57 ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
58 ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
59 ****    by Larry Wall and others
60 ****
61 ****    You may distribute under the terms of either the GNU General Public
62 ****    License or the Artistic License, as specified in the README file.
63
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions.  Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69
70/* Note on debug output:
71 *
72 * This is set up so that -Dr turns on debugging like all other flags that are
73 * enabled by -DDEBUGGING.  -Drv gives more verbose output.  This applies to
74 * all regular expressions encountered in a program, and gives a huge amount of
75 * output for all but the shortest programs.
76 *
77 * The ability to output pattern debugging information lexically, and with much
78 * finer grained control was added, with 'use re qw(Debug ....);' available even
79 * in non-DEBUGGING builds.  This is accomplished by copying the contents of
80 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
81 * Those files are compiled and linked into the perl executable, and they are
82 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
83 * to re.pm.
84 *
85 * That would normally mean linking errors when two functions of the same name
86 * are attempted to be placed into the same executable.  That is solved in one
87 * of four ways:
88 *  1)  Static functions aren't known outside the file they are in, so for the
89 *      many functions of that type in this file, it just isn't a problem.
90 *  2)  Most externally known functions are enclosed in
91 *          #ifndef PERL_IN_XSUB_RE
92 *          ...
93 *          #endif
94 *      blocks, so there is only one definition for them in the whole
95 *      executable, the one in regcomp.c (or regexec.c).  The implication of
96 *      that is any debugging info that comes from them is controlled only by
97 *      -Dr.  Further, any static function they call will also be the version
98 *      in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
99 *  3)  About a dozen external functions are re-#defined in ext/re/re_top.h, to
100 *      have different names, so that what gets loaded in the executable is
101 *      'Perl_foo' from regcomp.c (and regexec.c), and the identical function
102 *      from re_comp.c (and re_exec.c), but with the name 'my_foo'  Debugging
103 *      in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
104 *      versions and their callees are under control of re.pm.   The catch is
105 *      that references to all these go through the regexp_engine structure,
106 *      which is initialized in regcomp.h to the Perl_foo versions, and
107 *      substituted out in lexical scopes where 'use re' is in effect to the
108 *      'my_foo' ones.   That structure is public API, so it would be a hard
109 *      sell to add any additional members.
110 *  4)  For functions in regcomp.c and re_comp.c that are called only from,
111 *      respectively, regexec.c and re_exec.c, they can have two different
112 *      names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
113 *      embed.fnc.
114 *
115 * The bottom line is that if you add code to one of the public functions
116 * listed in ext/re/re_top.h, debugging automagically works.  But if you write
117 * a new function that needs to do debugging or there is a chain of calls from
118 * it that need to do debugging, all functions in the chain should use options
119 * 2) or 4) above.
120 *
121 * A function may have to be split so that debugging stuff is static, but it
122 * calls out to some other function that only gets compiled in regcomp.c to
123 * access data that we don't want to duplicate.
124 */
125
126#ifdef PERL_EXT_RE_BUILD
127#include "re_top.h"
128#endif
129
130#include "EXTERN.h"
131#define PERL_IN_REGEX_ENGINE
132#define PERL_IN_REGCOMP_ANY
133#define PERL_IN_REGCOMP_C
134#include "perl.h"
135
136#ifdef PERL_IN_XSUB_RE
137#  include "re_comp.h"
138EXTERN_C const struct regexp_engine my_reg_engine;
139EXTERN_C const struct regexp_engine wild_reg_engine;
140#else
141#  include "regcomp.h"
142#endif
143
144#include "invlist_inline.h"
145#include "unicode_constants.h"
146#include "regcomp_internal.h"
147
148/* =========================================================
149 * BEGIN edit_distance stuff.
150 *
151 * This calculates how many single character changes of any type are needed to
152 * transform a string into another one.  It is taken from version 3.1 of
153 *
154 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
155 */
156
157/* Our unsorted dictionary linked list.   */
158/* Note we use UVs, not chars. */
159
160struct dictionary{
161  UV key;
162  UV value;
163  struct dictionary* next;
164};
165typedef struct dictionary item;
166
167
168PERL_STATIC_INLINE item*
169push(UV key, item* curr)
170{
171    item* head;
172    Newx(head, 1, item);
173    head->key = key;
174    head->value = 0;
175    head->next = curr;
176    return head;
177}
178
179
180PERL_STATIC_INLINE item*
181find(item* head, UV key)
182{
183    item* iterator = head;
184    while (iterator){
185        if (iterator->key == key){
186            return iterator;
187        }
188        iterator = iterator->next;
189    }
190
191    return NULL;
192}
193
194PERL_STATIC_INLINE item*
195uniquePush(item* head, UV key)
196{
197    item* iterator = head;
198
199    while (iterator){
200        if (iterator->key == key) {
201            return head;
202        }
203        iterator = iterator->next;
204    }
205
206    return push(key, head);
207}
208
209PERL_STATIC_INLINE void
210dict_free(item* head)
211{
212    item* iterator = head;
213
214    while (iterator) {
215        item* temp = iterator;
216        iterator = iterator->next;
217        Safefree(temp);
218    }
219
220    head = NULL;
221}
222
223/* End of Dictionary Stuff */
224
225/* All calculations/work are done here */
226STATIC int
227S_edit_distance(const UV* src,
228                const UV* tgt,
229                const STRLEN x,             /* length of src[] */
230                const STRLEN y,             /* length of tgt[] */
231                const SSize_t maxDistance
232)
233{
234    item *head = NULL;
235    UV swapCount, swapScore, targetCharCount, i, j;
236    UV *scores;
237    UV score_ceil = x + y;
238
239    PERL_ARGS_ASSERT_EDIT_DISTANCE;
240
241    /* initialize matrix start values */
242    Newx(scores, ( (x + 2) * (y + 2)), UV);
243    scores[0] = score_ceil;
244    scores[1 * (y + 2) + 0] = score_ceil;
245    scores[0 * (y + 2) + 1] = score_ceil;
246    scores[1 * (y + 2) + 1] = 0;
247    head = uniquePush(uniquePush(head, src[0]), tgt[0]);
248
249    /* work loops    */
250    /* i = src index */
251    /* j = tgt index */
252    for (i=1;i<=x;i++) {
253        if (i < x)
254            head = uniquePush(head, src[i]);
255        scores[(i+1) * (y + 2) + 1] = i;
256        scores[(i+1) * (y + 2) + 0] = score_ceil;
257        swapCount = 0;
258
259        for (j=1;j<=y;j++) {
260            if (i == 1) {
261                if(j < y)
262                head = uniquePush(head, tgt[j]);
263                scores[1 * (y + 2) + (j + 1)] = j;
264                scores[0 * (y + 2) + (j + 1)] = score_ceil;
265            }
266
267            targetCharCount = find(head, tgt[j-1])->value;
268            swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
269
270            if (src[i-1] != tgt[j-1]){
271                scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
272            }
273            else {
274                swapCount = j;
275                scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
276            }
277        }
278
279        find(head, src[i-1])->value = i;
280    }
281
282    {
283        IV score = scores[(x+1) * (y + 2) + (y + 1)];
284        dict_free(head);
285        Safefree(scores);
286        return (maxDistance != 0 && maxDistance < score)?(-1):score;
287    }
288}
289
290/* END of edit_distance() stuff
291 * ========================================================= */
292
293/* add a data member to the struct reg_data attached to this regex, it should
294 * always return a non-zero return. the 's' argument is the type of the items
295 * being added and the n is the number of items. The length of 's' should match
296 * the number of items. */
297U32
298Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
299{
300    U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
301
302    PERL_ARGS_ASSERT_REG_ADD_DATA;
303
304    /* in the below expression we have (count + n - 1), the minus one is there
305     * because the struct that we allocate already contains a slot for 1 data
306     * item, so we do not need to allocate it the first time. IOW, the
307     * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
308     * to allocate. See struct reg_data in regcomp.h
309     */
310    Renewc(RExC_rxi->data,
311           sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
312           char, struct reg_data);
313    /* however in the data->what expression we use (count + n) and do not
314     * subtract one from the result because the data structure contains a
315     * pointer to an array, and does not allocate the first element as part of
316     * the data struct. */
317    if (count > 1)
318        Renew(RExC_rxi->data->what, (count + n), U8);
319    else {
320        /* when count == 1 it means we have not initialized anything.
321         * we always fill the 0 slot of the data array with a '%' entry, which
322         * means "zero" (all the other types are letters) which exists purely
323         * so the return from reg_add_data is ALWAYS true, so we can tell it apart
324         * from a "no value" idx=0 in places where we would return an index
325         * into reg_add_data.  This is particularly important with the new "single
326         * pass, usually, but not always" strategy that we use, where the code
327         * will use a 0 to represent "not able to compute this yet".
328         */
329        Newx(RExC_rxi->data->what, n+1, U8);
330        /* fill in the placeholder slot of 0 with a what of '%', we use
331         * this because it sorta looks like a zero (0/0) and it is not a letter
332         * like any of the other "whats", this type should never be created
333         * any other way but here. '%' happens to also not appear in this
334         * file for any other reason (at the time of writing this comment)*/
335        RExC_rxi->data->what[0]= '%';
336        RExC_rxi->data->data[0]= NULL;
337    }
338    RExC_rxi->data->count = count + n;
339    Copy(s, RExC_rxi->data->what + count, n, U8);
340    assert(count>0);
341    return count;
342}
343
344/*XXX: todo make this not included in a non debugging perl, but appears to be
345 * used anyway there, in 'use re' */
346#ifndef PERL_IN_XSUB_RE
347void
348Perl_reginitcolors(pTHX)
349{
350    const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
351    if (s) {
352        char *t = savepv(s);
353        int i = 0;
354        PL_colors[0] = t;
355        while (++i < 6) {
356            t = strchr(t, '\t');
357            if (t) {
358                *t = '\0';
359                PL_colors[i] = ++t;
360            }
361            else
362                PL_colors[i] = t = (char *)"";
363        }
364    } else {
365        int i = 0;
366        while (i < 6)
367            PL_colors[i++] = (char *)"";
368    }
369    PL_colorset = 1;
370}
371#endif
372
373
374#ifdef TRIE_STUDY_OPT
375/* search for "restudy" in this file for a detailed explanation */
376#define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
377    STMT_START {                                            \
378        if (                                                \
379              (data.flags & SCF_TRIE_RESTUDY)               \
380              && ! restudied++                              \
381        ) {                                                 \
382            dOsomething;                                    \
383            goto reStudy;                                   \
384        }                                                   \
385    } STMT_END
386#else
387#define CHECK_RESTUDY_GOTO_butfirst
388#endif
389
390/*
391 * pregcomp - compile a regular expression into internal code
392 *
393 * Decides which engine's compiler to call based on the hint currently in
394 * scope
395 */
396
397#ifndef PERL_IN_XSUB_RE
398
399/* return the currently in-scope regex engine (or the default if none)  */
400
401regexp_engine const *
402Perl_current_re_engine(pTHX)
403{
404    if (IN_PERL_COMPILETIME) {
405        HV * const table = GvHV(PL_hintgv);
406        SV **ptr;
407
408        if (!table || !(PL_hints & HINT_LOCALIZE_HH))
409            return &PL_core_reg_engine;
410        ptr = hv_fetchs(table, "regcomp", FALSE);
411        if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
412            return &PL_core_reg_engine;
413        return INT2PTR(regexp_engine*, SvIV(*ptr));
414    }
415    else {
416        SV *ptr;
417        if (!PL_curcop->cop_hints_hash)
418            return &PL_core_reg_engine;
419        ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
420        if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
421            return &PL_core_reg_engine;
422        return INT2PTR(regexp_engine*, SvIV(ptr));
423    }
424}
425
426
427REGEXP *
428Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
429{
430    regexp_engine const *eng = current_re_engine();
431    DECLARE_AND_GET_RE_DEBUG_FLAGS;
432
433    PERL_ARGS_ASSERT_PREGCOMP;
434
435    /* Dispatch a request to compile a regexp to correct regexp engine. */
436    DEBUG_COMPILE_r({
437        Perl_re_printf( aTHX_  "Using engine %" UVxf "\n",
438                        PTR2UV(eng));
439    });
440    return CALLREGCOMP_ENG(eng, pattern, flags);
441}
442#endif
443
444/*
445=for apidoc re_compile
446
447Compile the regular expression pattern C<pattern>, returning a pointer to the
448compiled object for later matching with the internal regex engine.
449
450This function is typically used by a custom regexp engine C<.comp()> function
451to hand off to the core regexp engine those patterns it doesn't want to handle
452itself (typically passing through the same flags it was called with).  In
453almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
454to compile using the currently active regexp engine.
455
456If C<pattern> is already a C<REGEXP>, this function does nothing but return a
457pointer to the input.  Otherwise the PV is extracted and treated like a string
458representing a pattern.  See L<perlre>.
459
460The possible flags for C<rx_flags> are documented in L<perlreapi>.  Their names
461all begin with C<RXf_>.
462
463=cut
464
465 * public entry point for the perl core's own regex compiling code.
466 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
467 * pattern rather than a list of OPs, and uses the internal engine rather
468 * than the current one */
469
470REGEXP *
471Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
472{
473    SV *pat = pattern; /* defeat constness! */
474
475    PERL_ARGS_ASSERT_RE_COMPILE;
476
477    return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
478#ifdef PERL_IN_XSUB_RE
479                                &my_reg_engine,
480#else
481                                &PL_core_reg_engine,
482#endif
483                                NULL, NULL, rx_flags, 0);
484}
485
486static void
487S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
488{
489    int n;
490
491    if (--cbs->refcnt > 0)
492        return;
493    for (n = 0; n < cbs->count; n++) {
494        REGEXP *rx = cbs->cb[n].src_regex;
495        if (rx) {
496            cbs->cb[n].src_regex = NULL;
497            SvREFCNT_dec_NN(rx);
498        }
499    }
500    Safefree(cbs->cb);
501    Safefree(cbs);
502}
503
504
505static struct reg_code_blocks *
506S_alloc_code_blocks(pTHX_  int ncode)
507{
508     struct reg_code_blocks *cbs;
509    Newx(cbs, 1, struct reg_code_blocks);
510    cbs->count = ncode;
511    cbs->refcnt = 1;
512    SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
513    if (ncode)
514        Newx(cbs->cb, ncode, struct reg_code_block);
515    else
516        cbs->cb = NULL;
517    return cbs;
518}
519
520
521/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
522 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
523 * point to the realloced string and length.
524 *
525 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
526 * stuff added */
527
528static void
529S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
530                    char **pat_p, STRLEN *plen_p, int num_code_blocks)
531{
532    U8 *const src = (U8*)*pat_p;
533    U8 *dst, *d;
534    int n=0;
535    STRLEN s = 0;
536    bool do_end = 0;
537    DECLARE_AND_GET_RE_DEBUG_FLAGS;
538
539    DEBUG_PARSE_r(Perl_re_printf( aTHX_
540        "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
541
542    /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
543    Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
544    d = dst;
545
546    while (s < *plen_p) {
547        append_utf8_from_native_byte(src[s], &d);
548
549        if (n < num_code_blocks) {
550            assert(pRExC_state->code_blocks);
551            if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
552                pRExC_state->code_blocks->cb[n].start = d - dst - 1;
553                assert(*(d - 1) == '(');
554                do_end = 1;
555            }
556            else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
557                pRExC_state->code_blocks->cb[n].end = d - dst - 1;
558                assert(*(d - 1) == ')');
559                do_end = 0;
560                n++;
561            }
562        }
563        s++;
564    }
565    *d = '\0';
566    *plen_p = d - dst;
567    *pat_p = (char*) dst;
568    SAVEFREEPV(*pat_p);
569    RExC_orig_utf8 = RExC_utf8 = 1;
570}
571
572
573
574/* S_concat_pat(): concatenate a list of args to the pattern string pat,
575 * while recording any code block indices, and handling overloading,
576 * nested qr// objects etc.  If pat is null, it will allocate a new
577 * string, or just return the first arg, if there's only one.
578 *
579 * Returns the malloced/updated pat.
580 * patternp and pat_count is the array of SVs to be concatted;
581 * oplist is the optional list of ops that generated the SVs;
582 * recompile_p is a pointer to a boolean that will be set if
583 *   the regex will need to be recompiled.
584 * delim, if non-null is an SV that will be inserted between each element
585 */
586
587static SV*
588S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
589                SV *pat, SV ** const patternp, int pat_count,
590                OP *oplist, bool *recompile_p, SV *delim)
591{
592    SV **svp;
593    int n = 0;
594    bool use_delim = FALSE;
595    bool alloced = FALSE;
596
597    /* if we know we have at least two args, create an empty string,
598     * then concatenate args to that. For no args, return an empty string */
599    if (!pat && pat_count != 1) {
600        pat = newSVpvs("");
601        SAVEFREESV(pat);
602        alloced = TRUE;
603    }
604
605    for (svp = patternp; svp < patternp + pat_count; svp++) {
606        SV *sv;
607        SV *rx  = NULL;
608        STRLEN orig_patlen = 0;
609        bool code = 0;
610        SV *msv = use_delim ? delim : *svp;
611        if (!msv) msv = &PL_sv_undef;
612
613        /* if we've got a delimiter, we go round the loop twice for each
614         * svp slot (except the last), using the delimiter the second
615         * time round */
616        if (use_delim) {
617            svp--;
618            use_delim = FALSE;
619        }
620        else if (delim)
621            use_delim = TRUE;
622
623        if (SvTYPE(msv) == SVt_PVAV) {
624            /* we've encountered an interpolated array within
625             * the pattern, e.g. /...@a..../. Expand the list of elements,
626             * then recursively append elements.
627             * The code in this block is based on S_pushav() */
628
629            AV *const av = (AV*)msv;
630            const SSize_t maxarg = AvFILL(av) + 1;
631            SV **array;
632
633            if (oplist) {
634                assert(oplist->op_type == OP_PADAV
635                    || oplist->op_type == OP_RV2AV);
636                oplist = OpSIBLING(oplist);
637            }
638
639            if (SvRMAGICAL(av)) {
640                SSize_t i;
641
642                Newx(array, maxarg, SV*);
643                SAVEFREEPV(array);
644                for (i=0; i < maxarg; i++) {
645                    SV ** const svp = av_fetch(av, i, FALSE);
646                    array[i] = svp ? *svp : &PL_sv_undef;
647                }
648            }
649            else
650                array = AvARRAY(av);
651
652            if (maxarg > 0) {
653                pat = S_concat_pat(aTHX_ pRExC_state, pat,
654                                   array, maxarg, NULL, recompile_p,
655                                   /* $" */
656                                   GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
657            }
658            else if (!pat) {
659                pat = newSVpvs_flags("", SVs_TEMP);
660            }
661
662            continue;
663        }
664
665
666        /* we make the assumption here that each op in the list of
667         * op_siblings maps to one SV pushed onto the stack,
668         * except for code blocks, with have both an OP_NULL and
669         * an OP_CONST.
670         * This allows us to match up the list of SVs against the
671         * list of OPs to find the next code block.
672         *
673         * Note that       PUSHMARK PADSV PADSV ..
674         * is optimised to
675         *                 PADRANGE PADSV  PADSV  ..
676         * so the alignment still works. */
677
678        if (oplist) {
679            if (oplist->op_type == OP_NULL
680                && (oplist->op_flags & OPf_SPECIAL))
681            {
682                assert(n < pRExC_state->code_blocks->count);
683                pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
684                pRExC_state->code_blocks->cb[n].block = oplist;
685                pRExC_state->code_blocks->cb[n].src_regex = NULL;
686                n++;
687                code = 1;
688                oplist = OpSIBLING(oplist); /* skip CONST */
689                assert(oplist);
690            }
691            oplist = OpSIBLING(oplist);;
692        }
693
694        /* apply magic and QR overloading to arg */
695
696        SvGETMAGIC(msv);
697        if (SvROK(msv) && SvAMAGIC(msv)) {
698            SV *sv = AMG_CALLunary(msv, regexp_amg);
699            if (sv) {
700                if (SvROK(sv))
701                    sv = SvRV(sv);
702                if (SvTYPE(sv) != SVt_REGEXP)
703                    Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
704                msv = sv;
705            }
706        }
707
708        /* try concatenation overload ... */
709        if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
710                (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
711        {
712            sv_setsv(pat, sv);
713            /* overloading involved: all bets are off over literal
714             * code. Pretend we haven't seen it */
715            if (n)
716                pRExC_state->code_blocks->count -= n;
717            n = 0;
718        }
719        else {
720            /* ... or failing that, try "" overload */
721            while (SvAMAGIC(msv)
722                    && (sv = AMG_CALLunary(msv, string_amg))
723                    && sv != msv
724                    &&  !(   SvROK(msv)
725                          && SvROK(sv)
726                          && SvRV(msv) == SvRV(sv))
727            ) {
728                msv = sv;
729                SvGETMAGIC(msv);
730            }
731            if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
732                msv = SvRV(msv);
733
734            if (pat) {
735                /* this is a partially unrolled
736                 *     sv_catsv_nomg(pat, msv);
737                 * that allows us to adjust code block indices if
738                 * needed */
739                STRLEN dlen;
740                char *dst = SvPV_force_nomg(pat, dlen);
741                orig_patlen = dlen;
742                if (SvUTF8(msv) && !SvUTF8(pat)) {
743                    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
744                    sv_setpvn(pat, dst, dlen);
745                    SvUTF8_on(pat);
746                }
747                sv_catsv_nomg(pat, msv);
748                rx = msv;
749            }
750            else {
751                /* We have only one SV to process, but we need to verify
752                 * it is properly null terminated or we will fail asserts
753                 * later. In theory we probably shouldn't get such SV's,
754                 * but if we do we should handle it gracefully. */
755                if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
756                    /* not a string, or a string with a trailing null */
757                    pat = msv;
758                } else {
759                    /* a string with no trailing null, we need to copy it
760                     * so it has a trailing null */
761                    pat = sv_2mortal(newSVsv(msv));
762                }
763            }
764
765            if (code)
766                pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
767        }
768
769        /* extract any code blocks within any embedded qr//'s */
770        if (rx && SvTYPE(rx) == SVt_REGEXP
771            && RX_ENGINE((REGEXP*)rx)->op_comp)
772        {
773
774            RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
775            if (ri->code_blocks && ri->code_blocks->count) {
776                int i;
777                /* the presence of an embedded qr// with code means
778                 * we should always recompile: the text of the
779                 * qr// may not have changed, but it may be a
780                 * different closure than last time */
781                *recompile_p = 1;
782                if (pRExC_state->code_blocks) {
783                    int new_count = pRExC_state->code_blocks->count
784                            + ri->code_blocks->count;
785                    Renew(pRExC_state->code_blocks->cb,
786                            new_count, struct reg_code_block);
787                    pRExC_state->code_blocks->count = new_count;
788                }
789                else
790                    pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
791                                                    ri->code_blocks->count);
792
793                for (i=0; i < ri->code_blocks->count; i++) {
794                    struct reg_code_block *src, *dst;
795                    STRLEN offset =  orig_patlen
796                        + ReANY((REGEXP *)rx)->pre_prefix;
797                    assert(n < pRExC_state->code_blocks->count);
798                    src = &ri->code_blocks->cb[i];
799                    dst = &pRExC_state->code_blocks->cb[n];
800                    dst->start	    = src->start + offset;
801                    dst->end	    = src->end   + offset;
802                    dst->block	    = src->block;
803                    dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
804                                            src->src_regex
805                                                ? src->src_regex
806                                                : (REGEXP*)rx);
807                    n++;
808                }
809            }
810        }
811    }
812    /* avoid calling magic multiple times on a single element e.g. =~ $qr */
813    if (alloced)
814        SvSETMAGIC(pat);
815
816    return pat;
817}
818
819
820
821/* see if there are any run-time code blocks in the pattern.
822 * False positives are allowed */
823
824static bool
825S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
826                    char *pat, STRLEN plen)
827{
828    int n = 0;
829    STRLEN s;
830
831    PERL_UNUSED_CONTEXT;
832
833    for (s = 0; s < plen; s++) {
834        if (   pRExC_state->code_blocks
835            && n < pRExC_state->code_blocks->count
836            && s == pRExC_state->code_blocks->cb[n].start)
837        {
838            s = pRExC_state->code_blocks->cb[n].end;
839            n++;
840            continue;
841        }
842        /* TODO ideally should handle [..], (#..), /#.../x to reduce false
843         * positives here */
844        if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
845            (pat[s+2] == '{'
846                || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
847        )
848            return 1;
849    }
850    return 0;
851}
852
853/* Handle run-time code blocks. We will already have compiled any direct
854 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
855 * copy of it, but with any literal code blocks blanked out and
856 * appropriate chars escaped; then feed it into
857 *
858 *    eval "qr'modified_pattern'"
859 *
860 * For example,
861 *
862 *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
863 *
864 * becomes
865 *
866 *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
867 *
868 * After eval_sv()-ing that, grab any new code blocks from the returned qr
869 * and merge them with any code blocks of the original regexp.
870 *
871 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
872 * instead, just save the qr and return FALSE; this tells our caller that
873 * the original pattern needs upgrading to utf8.
874 */
875
876static bool
877S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
878    char *pat, STRLEN plen)
879{
880    SV *qr;
881
882    DECLARE_AND_GET_RE_DEBUG_FLAGS;
883
884    if (pRExC_state->runtime_code_qr) {
885        /* this is the second time we've been called; this should
886         * only happen if the main pattern got upgraded to utf8
887         * during compilation; re-use the qr we compiled first time
888         * round (which should be utf8 too)
889         */
890        qr = pRExC_state->runtime_code_qr;
891        pRExC_state->runtime_code_qr = NULL;
892        assert(RExC_utf8 && SvUTF8(qr));
893    }
894    else {
895        int n = 0;
896        STRLEN s;
897        char *p, *newpat;
898        int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
899        SV *sv, *qr_ref;
900        dSP;
901
902        /* determine how many extra chars we need for ' and \ escaping */
903        for (s = 0; s < plen; s++) {
904            if (pat[s] == '\'' || pat[s] == '\\')
905                newlen++;
906        }
907
908        Newx(newpat, newlen, char);
909        p = newpat;
910        *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
911
912        for (s = 0; s < plen; s++) {
913            if (   pRExC_state->code_blocks
914                && n < pRExC_state->code_blocks->count
915                && s == pRExC_state->code_blocks->cb[n].start)
916            {
917                /* blank out literal code block so that they aren't
918                 * recompiled: eg change from/to:
919                 *     /(?{xyz})/
920                 *     /(?=====)/
921                 * and
922                 *     /(??{xyz})/
923                 *     /(?======)/
924                 * and
925                 *     /(?(?{xyz}))/
926                 *     /(?(?=====))/
927                */
928                assert(pat[s]   == '(');
929                assert(pat[s+1] == '?');
930                *p++ = '(';
931                *p++ = '?';
932                s += 2;
933                while (s < pRExC_state->code_blocks->cb[n].end) {
934                    *p++ = '=';
935                    s++;
936                }
937                *p++ = ')';
938                n++;
939                continue;
940            }
941            if (pat[s] == '\'' || pat[s] == '\\')
942                *p++ = '\\';
943            *p++ = pat[s];
944        }
945        *p++ = '\'';
946        if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
947            *p++ = 'x';
948            if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
949                *p++ = 'x';
950            }
951        }
952        *p++ = '\0';
953        DEBUG_COMPILE_r({
954            Perl_re_printf( aTHX_
955                "%sre-parsing pattern for runtime code:%s %s\n",
956                PL_colors[4], PL_colors[5], newpat);
957        });
958
959        sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
960        Safefree(newpat);
961
962        ENTER;
963        SAVETMPS;
964        save_re_context();
965        PUSHSTACKi(PERLSI_REQUIRE);
966        /* G_RE_REPARSING causes the toker to collapse \\ into \ when
967         * parsing qr''; normally only q'' does this. It also alters
968         * hints handling */
969        eval_sv(sv, G_SCALAR|G_RE_REPARSING);
970        SvREFCNT_dec_NN(sv);
971        SPAGAIN;
972        qr_ref = POPs;
973        PUTBACK;
974        {
975            SV * const errsv = ERRSV;
976            if (SvTRUE_NN(errsv))
977                /* use croak_sv ? */
978                Perl_croak_nocontext("%" SVf, SVfARG(errsv));
979        }
980        assert(SvROK(qr_ref));
981        qr = SvRV(qr_ref);
982        assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
983        /* the leaving below frees the tmp qr_ref.
984         * Give qr a life of its own */
985        SvREFCNT_inc(qr);
986        POPSTACK;
987        FREETMPS;
988        LEAVE;
989
990    }
991
992    if (!RExC_utf8 && SvUTF8(qr)) {
993        /* first time through; the pattern got upgraded; save the
994         * qr for the next time through */
995        assert(!pRExC_state->runtime_code_qr);
996        pRExC_state->runtime_code_qr = qr;
997        return 0;
998    }
999
1000
1001    /* extract any code blocks within the returned qr//  */
1002
1003
1004    /* merge the main (r1) and run-time (r2) code blocks into one */
1005    {
1006        RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
1007        struct reg_code_block *new_block, *dst;
1008        RExC_state_t * const r1 = pRExC_state; /* convenient alias */
1009        int i1 = 0, i2 = 0;
1010        int r1c, r2c;
1011
1012        if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
1013        {
1014            SvREFCNT_dec_NN(qr);
1015            return 1;
1016        }
1017
1018        if (!r1->code_blocks)
1019            r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
1020
1021        r1c = r1->code_blocks->count;
1022        r2c = r2->code_blocks->count;
1023
1024        Newx(new_block, r1c + r2c, struct reg_code_block);
1025
1026        dst = new_block;
1027
1028        while (i1 < r1c || i2 < r2c) {
1029            struct reg_code_block *src;
1030            bool is_qr = 0;
1031
1032            if (i1 == r1c) {
1033                src = &r2->code_blocks->cb[i2++];
1034                is_qr = 1;
1035            }
1036            else if (i2 == r2c)
1037                src = &r1->code_blocks->cb[i1++];
1038            else if (  r1->code_blocks->cb[i1].start
1039                     < r2->code_blocks->cb[i2].start)
1040            {
1041                src = &r1->code_blocks->cb[i1++];
1042                assert(src->end < r2->code_blocks->cb[i2].start);
1043            }
1044            else {
1045                assert(  r1->code_blocks->cb[i1].start
1046                       > r2->code_blocks->cb[i2].start);
1047                src = &r2->code_blocks->cb[i2++];
1048                is_qr = 1;
1049                assert(src->end < r1->code_blocks->cb[i1].start);
1050            }
1051
1052            assert(pat[src->start] == '(');
1053            assert(pat[src->end]   == ')');
1054            dst->start	    = src->start;
1055            dst->end	    = src->end;
1056            dst->block	    = src->block;
1057            dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
1058                                    : src->src_regex;
1059            dst++;
1060        }
1061        r1->code_blocks->count += r2c;
1062        Safefree(r1->code_blocks->cb);
1063        r1->code_blocks->cb = new_block;
1064    }
1065
1066    SvREFCNT_dec_NN(qr);
1067    return 1;
1068}
1069
1070
1071STATIC bool
1072S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
1073                      struct reg_substr_datum  *rsd,
1074                      struct scan_data_substrs *sub,
1075                      STRLEN longest_length)
1076{
1077    /* This is the common code for setting up the floating and fixed length
1078     * string data extracted from Perl_re_op_compile() below.  Returns a boolean
1079     * as to whether succeeded or not */
1080
1081    I32 t;
1082    SSize_t ml;
1083    bool eol  = cBOOL(sub->flags & SF_BEFORE_EOL);
1084    bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
1085
1086    if (! (longest_length
1087           || (eol /* Can't have SEOL and MULTI */
1088               && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
1089          )
1090            /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
1091        || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
1092    {
1093        return FALSE;
1094    }
1095
1096    /* copy the information about the longest from the reg_scan_data
1097        over to the program. */
1098    if (SvUTF8(sub->str)) {
1099        rsd->substr      = NULL;
1100        rsd->utf8_substr = sub->str;
1101    } else {
1102        rsd->substr      = sub->str;
1103        rsd->utf8_substr = NULL;
1104    }
1105    /* end_shift is how many chars that must be matched that
1106        follow this item. We calculate it ahead of time as once the
1107        lookbehind offset is added in we lose the ability to correctly
1108        calculate it.*/
1109    ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
1110    rsd->end_shift = ml - sub->min_offset
1111        - longest_length
1112            /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
1113             * intead? - DAPM
1114            + (SvTAIL(sub->str) != 0)
1115            */
1116        + sub->lookbehind;
1117
1118    t = (eol/* Can't have SEOL and MULTI */
1119         && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
1120    fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
1121
1122    return TRUE;
1123}
1124
1125STATIC void
1126S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
1127{
1128    /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
1129     * properly wrapped with the right modifiers */
1130
1131    bool has_p     = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
1132    bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
1133                                                != REGEX_DEPENDS_CHARSET);
1134
1135    /* The caret is output if there are any defaults: if not all the STD
1136        * flags are set, or if no character set specifier is needed */
1137    bool has_default =
1138                (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
1139                || ! has_charset);
1140    bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
1141                                                == REG_RUN_ON_COMMENT_SEEN);
1142    U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
1143                        >> RXf_PMf_STD_PMMOD_SHIFT);
1144    const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
1145    char *p;
1146    STRLEN pat_len = RExC_precomp_end - RExC_precomp;
1147
1148    /* We output all the necessary flags; we never output a minus, as all
1149        * those are defaults, so are
1150        * covered by the caret */
1151    const STRLEN wraplen = pat_len + has_p + has_runon
1152        + has_default       /* If needs a caret */
1153        + PL_bitcount[reganch] /* 1 char for each set standard flag */
1154
1155            /* If needs a character set specifier */
1156        + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
1157        + (sizeof("(?:)") - 1);
1158
1159    PERL_ARGS_ASSERT_SET_REGEX_PV;
1160
1161    /* make sure PL_bitcount bounds not exceeded */
1162    STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
1163
1164    p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
1165    SvPOK_on(Rx);
1166    if (RExC_utf8)
1167        SvFLAGS(Rx) |= SVf_UTF8;
1168    *p++='('; *p++='?';
1169
1170    /* If a default, cover it using the caret */
1171    if (has_default) {
1172        *p++= DEFAULT_PAT_MOD;
1173    }
1174    if (has_charset) {
1175        STRLEN len;
1176        const char* name;
1177
1178        name = get_regex_charset_name(RExC_rx->extflags, &len);
1179        if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
1180            assert(RExC_utf8);
1181            name = UNICODE_PAT_MODS;
1182            len = sizeof(UNICODE_PAT_MODS) - 1;
1183        }
1184        Copy(name, p, len, char);
1185        p += len;
1186    }
1187    if (has_p)
1188        *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
1189    {
1190        char ch;
1191        while((ch = *fptr++)) {
1192            if(reganch & 1)
1193                *p++ = ch;
1194            reganch >>= 1;
1195        }
1196    }
1197
1198    *p++ = ':';
1199    Copy(RExC_precomp, p, pat_len, char);
1200    assert ((RX_WRAPPED(Rx) - p) < 16);
1201    RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
1202    p += pat_len;
1203
1204    /* Adding a trailing \n causes this to compile properly:
1205            my $R = qr / A B C # D E/x; /($R)/
1206        Otherwise the parens are considered part of the comment */
1207    if (has_runon)
1208        *p++ = '\n';
1209    *p++ = ')';
1210    *p = 0;
1211    SvCUR_set(Rx, p - RX_WRAPPED(Rx));
1212}
1213
1214/*
1215 * Perl_re_op_compile - the perl internal RE engine's function to compile a
1216 * regular expression into internal code.
1217 * The pattern may be passed either as:
1218 *    a list of SVs (patternp plus pat_count)
1219 *    a list of OPs (expr)
1220 * If both are passed, the SV list is used, but the OP list indicates
1221 * which SVs are actually pre-compiled code blocks
1222 *
1223 * The SVs in the list have magic and qr overloading applied to them (and
1224 * the list may be modified in-place with replacement SVs in the latter
1225 * case).
1226 *
1227 * If the pattern hasn't changed from old_re, then old_re will be
1228 * returned.
1229 *
1230 * eng is the current engine. If that engine has an op_comp method, then
1231 * handle directly (i.e. we assume that op_comp was us); otherwise, just
1232 * do the initial concatenation of arguments and pass on to the external
1233 * engine.
1234 *
1235 * If is_bare_re is not null, set it to a boolean indicating whether the
1236 * arg list reduced (after overloading) to a single bare regex which has
1237 * been returned (i.e. /$qr/).
1238 *
1239 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
1240 *
1241 * pm_flags contains the PMf_* flags, typically based on those from the
1242 * pm_flags field of the related PMOP. Currently we're only interested in
1243 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
1244 *
1245 * For many years this code had an initial sizing pass that calculated
1246 * (sometimes incorrectly, leading to security holes) the size needed for the
1247 * compiled pattern.  That was changed by commit
1248 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
1249 * node at a time, as parsing goes along.  Patches welcome to fix any obsolete
1250 * references to this sizing pass.
1251 *
1252 * Now, an initial crude guess as to the size needed is made, based on the
1253 * length of the pattern.  Patches welcome to improve that guess.  That amount
1254 * of space is malloc'd and then immediately freed, and then clawed back node
1255 * by node.  This design is to minimize, to the extent possible, memory churn
1256 * when doing the reallocs.
1257 *
1258 * A separate parentheses counting pass may be needed in some cases.
1259 * (Previously the sizing pass did this.)  Patches welcome to reduce the number
1260 * of these cases.
1261 *
1262 * The existence of a sizing pass necessitated design decisions that are no
1263 * longer needed.  There are potential areas of simplification.
1264 *
1265 * Beware that the optimization-preparation code in here knows about some
1266 * of the structure of the compiled regexp.  [I'll say.]
1267 */
1268
1269REGEXP *
1270Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
1271                    OP *expr, const regexp_engine* eng, REGEXP *old_re,
1272                     bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
1273{
1274    REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
1275    STRLEN plen;
1276    char *exp;
1277    regnode *scan;
1278    I32 flags;
1279    SSize_t minlen = 0;
1280    U32 rx_flags;
1281    SV *pat;
1282    SV** new_patternp = patternp;
1283
1284    /* these are all flags - maybe they should be turned
1285     * into a single int with different bit masks */
1286    I32 sawlookahead = 0;
1287    I32 sawplus = 0;
1288    I32 sawopen = 0;
1289    I32 sawminmod = 0;
1290
1291    regex_charset initial_charset = get_regex_charset(orig_rx_flags);
1292    bool recompile = 0;
1293    bool runtime_code = 0;
1294    scan_data_t data;
1295    RExC_state_t RExC_state;
1296    RExC_state_t * const pRExC_state = &RExC_state;
1297#ifdef TRIE_STUDY_OPT
1298    /* search for "restudy" in this file for a detailed explanation */
1299    int restudied = 0;
1300    RExC_state_t copyRExC_state;
1301#endif
1302    DECLARE_AND_GET_RE_DEBUG_FLAGS;
1303
1304    PERL_ARGS_ASSERT_RE_OP_COMPILE;
1305
1306    DEBUG_r(if (!PL_colorset) reginitcolors());
1307
1308
1309    pRExC_state->warn_text = NULL;
1310    pRExC_state->unlexed_names = NULL;
1311    pRExC_state->code_blocks = NULL;
1312
1313    if (is_bare_re)
1314        *is_bare_re = FALSE;
1315
1316    if (expr && (expr->op_type == OP_LIST ||
1317                (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
1318        /* allocate code_blocks if needed */
1319        OP *o;
1320        int ncode = 0;
1321
1322        for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
1323            if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
1324                ncode++; /* count of DO blocks */
1325
1326        if (ncode)
1327            pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
1328    }
1329
1330    if (!pat_count) {
1331        /* compile-time pattern with just OP_CONSTs and DO blocks */
1332
1333        int n;
1334        OP *o;
1335
1336        /* find how many CONSTs there are */
1337        assert(expr);
1338        n = 0;
1339        if (expr->op_type == OP_CONST)
1340            n = 1;
1341        else
1342            for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1343                if (o->op_type == OP_CONST)
1344                    n++;
1345            }
1346
1347        /* fake up an SV array */
1348
1349        assert(!new_patternp);
1350        Newx(new_patternp, n, SV*);
1351        SAVEFREEPV(new_patternp);
1352        pat_count = n;
1353
1354        n = 0;
1355        if (expr->op_type == OP_CONST)
1356            new_patternp[n] = cSVOPx_sv(expr);
1357        else
1358            for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1359                if (o->op_type == OP_CONST)
1360                    new_patternp[n++] = cSVOPo_sv;
1361            }
1362
1363    }
1364
1365    DEBUG_PARSE_r(Perl_re_printf( aTHX_
1366        "Assembling pattern from %d elements%s\n", pat_count,
1367            orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1368
1369    /* set expr to the first arg op */
1370
1371    if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
1372         && expr->op_type != OP_CONST)
1373    {
1374            expr = cLISTOPx(expr)->op_first;
1375            assert(   expr->op_type == OP_PUSHMARK
1376                   || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
1377                   || expr->op_type == OP_PADRANGE);
1378            expr = OpSIBLING(expr);
1379    }
1380
1381    pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
1382                        expr, &recompile, NULL);
1383
1384    /* handle bare (possibly after overloading) regex: foo =~ $re */
1385    {
1386        SV *re = pat;
1387        if (SvROK(re))
1388            re = SvRV(re);
1389        if (SvTYPE(re) == SVt_REGEXP) {
1390            if (is_bare_re)
1391                *is_bare_re = TRUE;
1392            SvREFCNT_inc(re);
1393            DEBUG_PARSE_r(Perl_re_printf( aTHX_
1394                "Precompiled pattern%s\n",
1395                    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1396
1397            return (REGEXP*)re;
1398        }
1399    }
1400
1401    exp = SvPV_nomg(pat, plen);
1402
1403    if (!eng->op_comp) {
1404        if ((SvUTF8(pat) && IN_BYTES)
1405                || SvGMAGICAL(pat) || SvAMAGIC(pat))
1406        {
1407            /* make a temporary copy; either to convert to bytes,
1408             * or to avoid repeating get-magic / overloaded stringify */
1409            pat = newSVpvn_flags(exp, plen, SVs_TEMP |
1410                                        (IN_BYTES ? 0 : SvUTF8(pat)));
1411        }
1412        return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1413    }
1414
1415    /* ignore the utf8ness if the pattern is 0 length */
1416    RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
1417    RExC_uni_semantics = 0;
1418    RExC_contains_locale = 0;
1419    RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
1420    RExC_in_script_run = 0;
1421    RExC_study_started = 0;
1422    pRExC_state->runtime_code_qr = NULL;
1423    RExC_frame_head= NULL;
1424    RExC_frame_last= NULL;
1425    RExC_frame_count= 0;
1426    RExC_latest_warn_offset = 0;
1427    RExC_use_BRANCHJ = 0;
1428    RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
1429    RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
1430    RExC_logical_total_parens = 0;
1431    RExC_total_parens = 0;
1432    RExC_logical_to_parno = NULL;
1433    RExC_parno_to_logical = NULL;
1434    RExC_open_parens = NULL;
1435    RExC_close_parens = NULL;
1436    RExC_paren_names = NULL;
1437    RExC_size = 0;
1438    RExC_seen_d_op = FALSE;
1439#ifdef DEBUGGING
1440    RExC_paren_name_list = NULL;
1441#endif
1442
1443    DEBUG_r({
1444        RExC_mysv1= sv_newmortal();
1445        RExC_mysv2= sv_newmortal();
1446    });
1447
1448    DEBUG_COMPILE_r({
1449            SV *dsv= sv_newmortal();
1450            RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1451            Perl_re_printf( aTHX_  "%sCompiling REx%s %s\n",
1452                          PL_colors[4], PL_colors[5], s);
1453        });
1454
1455    /* we jump here if we have to recompile, e.g., from upgrading the pattern
1456     * to utf8 */
1457
1458    if ((pm_flags & PMf_USE_RE_EVAL)
1459                /* this second condition covers the non-regex literal case,
1460                 * i.e.  $foo =~ '(?{})'. */
1461                || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
1462    )
1463        runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
1464
1465  redo_parse:
1466    /* return old regex if pattern hasn't changed */
1467    /* XXX: note in the below we have to check the flags as well as the
1468     * pattern.
1469     *
1470     * Things get a touch tricky as we have to compare the utf8 flag
1471     * independently from the compile flags.  */
1472
1473    if (   old_re
1474        && !recompile
1475        && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
1476        && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
1477        && RX_PRELEN(old_re) == plen
1478        && memEQ(RX_PRECOMP(old_re), exp, plen)
1479        && !runtime_code /* with runtime code, always recompile */ )
1480    {
1481        DEBUG_COMPILE_r({
1482            SV *dsv= sv_newmortal();
1483            RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1484            Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
1485                          PL_colors[4], PL_colors[5], s);
1486        });
1487        return old_re;
1488    }
1489
1490    /* Allocate the pattern's SV */
1491    RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
1492    RExC_rx = ReANY(Rx);
1493    if ( RExC_rx == NULL )
1494        FAIL("Regexp out of space");
1495
1496    rx_flags = orig_rx_flags;
1497
1498    if (   toUSE_UNI_CHARSET_NOT_DEPENDS
1499        && initial_charset == REGEX_DEPENDS_CHARSET)
1500    {
1501
1502        /* Set to use unicode semantics if the pattern is in utf8 and has the
1503         * 'depends' charset specified, as it means unicode when utf8  */
1504        set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
1505        RExC_uni_semantics = 1;
1506    }
1507
1508    RExC_pm_flags = pm_flags;
1509
1510    if (runtime_code) {
1511        assert(TAINTING_get || !TAINT_get);
1512        if (TAINT_get)
1513            Perl_croak(aTHX_ "Eval-group in insecure regular expression");
1514
1515        if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
1516            /* whoops, we have a non-utf8 pattern, whilst run-time code
1517             * got compiled as utf8. Try again with a utf8 pattern */
1518            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1519                pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1520            goto redo_parse;
1521        }
1522    }
1523    assert(!pRExC_state->runtime_code_qr);
1524
1525    RExC_sawback = 0;
1526
1527    RExC_seen = 0;
1528    RExC_maxlen = 0;
1529    RExC_in_lookaround = 0;
1530    RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1531    RExC_recode_x_to_native = 0;
1532    RExC_in_multi_char_class = 0;
1533
1534    RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
1535    RExC_precomp_end = RExC_end = exp + plen;
1536    RExC_nestroot = 0;
1537    RExC_whilem_seen = 0;
1538    RExC_end_op = NULL;
1539    RExC_recurse = NULL;
1540    RExC_study_chunk_recursed = NULL;
1541    RExC_study_chunk_recursed_bytes= 0;
1542    RExC_recurse_count = 0;
1543    RExC_sets_depth = 0;
1544    pRExC_state->code_index = 0;
1545
1546    /* Initialize the string in the compiled pattern.  This is so that there is
1547     * something to output if necessary */
1548    set_regex_pv(pRExC_state, Rx);
1549
1550    DEBUG_PARSE_r({
1551        Perl_re_printf( aTHX_
1552            "Starting parse and generation\n");
1553        RExC_lastnum=0;
1554        RExC_lastparse=NULL;
1555    });
1556
1557    /* Allocate space and zero-initialize. Note, the two step process
1558       of zeroing when in debug mode, thus anything assigned has to
1559       happen after that */
1560    if (!  RExC_size) {
1561
1562        /* On the first pass of the parse, we guess how big this will be.  Then
1563         * we grow in one operation to that amount and then give it back.  As
1564         * we go along, we re-allocate what we need.
1565         *
1566         * XXX Currently the guess is essentially that the pattern will be an
1567         * EXACT node with one byte input, one byte output.  This is crude, and
1568         * better heuristics are welcome.
1569         *
1570         * On any subsequent passes, we guess what we actually computed in the
1571         * latest earlier pass.  Such a pass probably didn't complete so is
1572         * missing stuff.  We could improve those guesses by knowing where the
1573         * parse stopped, and use the length so far plus apply the above
1574         * assumption to what's left. */
1575        RExC_size = STR_SZ(RExC_end - RExC_start);
1576    }
1577
1578    Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
1579    if ( RExC_rxi == NULL )
1580        FAIL("Regexp out of space");
1581
1582    Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
1583    RXi_SET( RExC_rx, RExC_rxi );
1584
1585    /* We start from 0 (over from 0 in the case this is a reparse.  The first
1586     * node parsed will give back any excess memory we have allocated so far).
1587     * */
1588    RExC_size = 0;
1589
1590    /* non-zero initialization begins here */
1591    RExC_rx->engine= eng;
1592    RExC_rx->extflags = rx_flags;
1593    RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
1594
1595    if (pm_flags & PMf_IS_QR) {
1596        RExC_rxi->code_blocks = pRExC_state->code_blocks;
1597        if (RExC_rxi->code_blocks) {
1598            RExC_rxi->code_blocks->refcnt++;
1599        }
1600    }
1601
1602    RExC_rx->intflags = 0;
1603
1604    RExC_flags = rx_flags;	/* don't let top level (?i) bleed */
1605    RExC_parse_set(exp);
1606
1607    /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
1608     * code makes sure the final byte is an uncounted NUL.  But should this
1609     * ever not be the case, lots of things could read beyond the end of the
1610     * buffer: loops like
1611     *      while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
1612     *      strchr(RExC_parse, "foo");
1613     * etc.  So it is worth noting. */
1614    assert(*RExC_end == '\0');
1615
1616    RExC_naughty = 0;
1617    RExC_npar = 1;
1618    RExC_logical_npar = 1;
1619    RExC_parens_buf_size = 0;
1620    RExC_emit_start = RExC_rxi->program;
1621    pRExC_state->code_index = 0;
1622
1623    *((char*) RExC_emit_start) = (char) REG_MAGIC;
1624    RExC_emit = NODE_STEP_REGNODE;
1625
1626    /* Do the parse */
1627    if (reg(pRExC_state, 0, &flags, 1)) {
1628
1629        /* Success!, But we may need to redo the parse knowing how many parens
1630         * there actually are */
1631        if (IN_PARENS_PASS) {
1632            flags |= RESTART_PARSE;
1633        }
1634
1635        /* We have that number in RExC_npar */
1636        RExC_total_parens = RExC_npar;
1637        RExC_logical_total_parens = RExC_logical_npar;
1638    }
1639    else if (! MUST_RESTART(flags)) {
1640        ReREFCNT_dec(Rx);
1641        Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
1642    }
1643
1644    /* Here, we either have success, or we have to redo the parse for some reason */
1645    if (MUST_RESTART(flags)) {
1646
1647        /* It's possible to write a regexp in ascii that represents Unicode
1648        codepoints outside of the byte range, such as via \x{100}. If we
1649        detect such a sequence we have to convert the entire pattern to utf8
1650        and then recompile, as our sizing calculation will have been based
1651        on 1 byte == 1 character, but we will need to use utf8 to encode
1652        at least some part of the pattern, and therefore must convert the whole
1653        thing.
1654        -- dmq */
1655        if (flags & NEED_UTF8) {
1656
1657            /* We have stored the offset of the final warning output so far.
1658             * That must be adjusted.  Any variant characters between the start
1659             * of the pattern and this warning count for 2 bytes in the final,
1660             * so just add them again */
1661            if (UNLIKELY(RExC_latest_warn_offset > 0)) {
1662                RExC_latest_warn_offset +=
1663                            variant_under_utf8_count((U8 *) exp, (U8 *) exp
1664                                                + RExC_latest_warn_offset);
1665            }
1666            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1667            pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1668            DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
1669        }
1670        else {
1671            DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
1672        }
1673
1674        if (ALL_PARENS_COUNTED) {
1675            /* Make enough room for all the known parens, and zero it */
1676            Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
1677            Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
1678            RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
1679
1680            Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
1681            Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
1682            /* we do NOT reinitialize  RExC_logical_to_parno and
1683             * RExC_parno_to_logical here. We need their data on the second
1684             * pass */
1685        }
1686        else { /* Parse did not complete.  Reinitialize the parentheses
1687                  structures */
1688            RExC_total_parens = 0;
1689            if (RExC_open_parens) {
1690                Safefree(RExC_open_parens);
1691                RExC_open_parens = NULL;
1692            }
1693            if (RExC_close_parens) {
1694                Safefree(RExC_close_parens);
1695                RExC_close_parens = NULL;
1696            }
1697            if (RExC_logical_to_parno) {
1698                Safefree(RExC_logical_to_parno);
1699                RExC_logical_to_parno = NULL;
1700            }
1701            if (RExC_parno_to_logical) {
1702                Safefree(RExC_parno_to_logical);
1703                RExC_parno_to_logical = NULL;
1704            }
1705        }
1706
1707        /* Clean up what we did in this parse */
1708        SvREFCNT_dec_NN(RExC_rx_sv);
1709
1710        goto redo_parse;
1711    }
1712
1713    /* Here, we have successfully parsed and generated the pattern's program
1714     * for the regex engine.  We are ready to finish things up and look for
1715     * optimizations. */
1716
1717    /* Update the string to compile, with correct modifiers, etc */
1718    set_regex_pv(pRExC_state, Rx);
1719
1720    RExC_rx->nparens = RExC_total_parens - 1;
1721    RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
1722
1723    /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
1724    if (RExC_whilem_seen > 15)
1725        RExC_whilem_seen = 15;
1726
1727    DEBUG_PARSE_r({
1728        Perl_re_printf( aTHX_
1729            "Required size %" IVdf " nodes\n", (IV)RExC_size);
1730        RExC_lastnum=0;
1731        RExC_lastparse=NULL;
1732    });
1733
1734    SetProgLen(RExC_rxi,RExC_size);
1735
1736    DEBUG_DUMP_PRE_OPTIMIZE_r({
1737        SV * const sv = sv_newmortal();
1738        RXi_GET_DECL(RExC_rx, ri);
1739        DEBUG_RExC_seen();
1740        Perl_re_printf( aTHX_ "Program before optimization:\n");
1741
1742        (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
1743                        sv, 0, 0);
1744    });
1745
1746    DEBUG_OPTIMISE_r(
1747        Perl_re_printf( aTHX_  "Starting post parse optimization\n");
1748    );
1749
1750    /* XXXX To minimize changes to RE engine we always allocate
1751       3-units-long substrs field. */
1752    Newx(RExC_rx->substrs, 1, struct reg_substr_data);
1753    if (RExC_recurse_count) {
1754        Newx(RExC_recurse, RExC_recurse_count, regnode *);
1755        SAVEFREEPV(RExC_recurse);
1756    }
1757
1758    if (RExC_seen & REG_RECURSE_SEEN) {
1759        /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
1760         * So its 1 if there are no parens. */
1761        RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
1762                                         ((RExC_total_parens & 0x07) != 0);
1763        Newx(RExC_study_chunk_recursed,
1764             RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1765        SAVEFREEPV(RExC_study_chunk_recursed);
1766    }
1767
1768  reStudy:
1769    RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
1770    DEBUG_r(
1771        RExC_study_chunk_recursed_count= 0;
1772    );
1773    Zero(RExC_rx->substrs, 1, struct reg_substr_data);
1774    if (RExC_study_chunk_recursed) {
1775        Zero(RExC_study_chunk_recursed,
1776             RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1777    }
1778
1779
1780#ifdef TRIE_STUDY_OPT
1781    /* search for "restudy" in this file for a detailed explanation */
1782    if (!restudied) {
1783        StructCopy(&zero_scan_data, &data, scan_data_t);
1784        copyRExC_state = RExC_state;
1785    } else {
1786        U32 seen=RExC_seen;
1787        DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
1788
1789        RExC_state = copyRExC_state;
1790        if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
1791            RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
1792        else
1793            RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
1794        StructCopy(&zero_scan_data, &data, scan_data_t);
1795    }
1796#else
1797    StructCopy(&zero_scan_data, &data, scan_data_t);
1798#endif
1799
1800    /* Dig out information for optimizations. */
1801    RExC_rx->extflags = RExC_flags; /* was pm_op */
1802    /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
1803
1804    if (UTF)
1805        SvUTF8_on(Rx);	/* Unicode in it? */
1806    RExC_rxi->regstclass = NULL;
1807    if (RExC_naughty >= TOO_NAUGHTY)	/* Probably an expensive pattern. */
1808        RExC_rx->intflags |= PREGf_NAUGHTY;
1809    scan = RExC_rxi->program + 1;		/* First BRANCH. */
1810
1811    /* testing for BRANCH here tells us whether there is "must appear"
1812       data in the pattern. If there is then we can use it for optimisations */
1813    if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
1814                                                  */
1815        SSize_t fake_deltap;
1816        STRLEN longest_length[2];
1817        regnode_ssc ch_class; /* pointed to by data */
1818        int stclass_flag;
1819        SSize_t last_close = 0; /* pointed to by data */
1820        regnode *first= scan;
1821        regnode *first_next= regnext(first);
1822        regnode *last_close_op= NULL;
1823        int i;
1824
1825        /*
1826         * Skip introductions and multiplicators >= 1
1827         * so that we can extract the 'meat' of the pattern that must
1828         * match in the large if() sequence following.
1829         * NOTE that EXACT is NOT covered here, as it is normally
1830         * picked up by the optimiser separately.
1831         *
1832         * This is unfortunate as the optimiser isnt handling lookahead
1833         * properly currently.
1834         *
1835         */
1836        while (1)
1837        {
1838            if (OP(first) == OPEN)
1839                sawopen = 1;
1840            else
1841            if (OP(first) == IFMATCH && !FLAGS(first))
1842                /* for now we can't handle lookbehind IFMATCH */
1843                sawlookahead = 1;
1844            else
1845            if (OP(first) == PLUS)
1846                sawplus = 1;
1847            else
1848            if (OP(first) == MINMOD)
1849                sawminmod = 1;
1850            else
1851            if (!(
1852                /* An OR of *one* alternative - should not happen now. */
1853                (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
1854                /* An {n,m} with n>0 */
1855                (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) ||
1856                (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
1857            )){
1858                break;
1859            }
1860
1861            first = REGNODE_AFTER(first);
1862            first_next= regnext(first);
1863        }
1864
1865        /* Starting-point info. */
1866      again:
1867        DEBUG_PEEP("first:", first, 0, 0);
1868        /* Ignore EXACT as we deal with it later. */
1869        if (REGNODE_TYPE(OP(first)) == EXACT) {
1870            if (! isEXACTFish(OP(first))) {
1871                NOOP;	/* Empty, get anchored substr later. */
1872            }
1873            else
1874                RExC_rxi->regstclass = first;
1875        }
1876#ifdef TRIE_STCLASS
1877        else if (REGNODE_TYPE(OP(first)) == TRIE &&
1878                ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
1879        {
1880            /* this can happen only on restudy
1881             * Search for "restudy" in this file to find
1882             * a comment with details. */
1883            RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
1884        }
1885#endif
1886        else if (REGNODE_SIMPLE(OP(first)))
1887            RExC_rxi->regstclass = first;
1888        else if (REGNODE_TYPE(OP(first)) == BOUND ||
1889                 REGNODE_TYPE(OP(first)) == NBOUND)
1890            RExC_rxi->regstclass = first;
1891        else if (REGNODE_TYPE(OP(first)) == BOL) {
1892            RExC_rx->intflags |= (OP(first) == MBOL
1893                           ? PREGf_ANCH_MBOL
1894                           : PREGf_ANCH_SBOL);
1895            first = REGNODE_AFTER(first);
1896            goto again;
1897        }
1898        else if (OP(first) == GPOS) {
1899            RExC_rx->intflags |= PREGf_ANCH_GPOS;
1900            first = REGNODE_AFTER_type(first,tregnode_GPOS);
1901            goto again;
1902        }
1903        else if ((!sawopen || !RExC_sawback) &&
1904            !sawlookahead &&
1905            (OP(first) == STAR &&
1906            REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
1907            !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
1908        {
1909            /* turn .* into ^.* with an implied $*=1 */
1910            const int type =
1911                (OP(REGNODE_AFTER(first)) == REG_ANY)
1912                    ? PREGf_ANCH_MBOL
1913                    : PREGf_ANCH_SBOL;
1914            RExC_rx->intflags |= (type | PREGf_IMPLICIT);
1915            first = REGNODE_AFTER(first);
1916            goto again;
1917        }
1918        if (sawplus && !sawminmod && !sawlookahead
1919            && (!sawopen || !RExC_sawback)
1920            && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
1921            /* x+ must match at the 1st pos of run of x's */
1922            RExC_rx->intflags |= PREGf_SKIP;
1923
1924        /* Scan is after the zeroth branch, first is atomic matcher. */
1925#ifdef TRIE_STUDY_OPT
1926        /* search for "restudy" in this file for a detailed explanation */
1927        DEBUG_PARSE_r(
1928            if (!restudied)
1929                Perl_re_printf( aTHX_  "first at %" IVdf "\n",
1930                              (IV)(first - scan + 1))
1931        );
1932#else
1933        DEBUG_PARSE_r(
1934            Perl_re_printf( aTHX_  "first at %" IVdf "\n",
1935                (IV)(first - scan + 1))
1936        );
1937#endif
1938
1939
1940        /*
1941        * If there's something expensive in the r.e., find the
1942        * longest literal string that must appear and make it the
1943        * regmust.  Resolve ties in favor of later strings, since
1944        * the regstart check works with the beginning of the r.e.
1945        * and avoiding duplication strengthens checking.  Not a
1946        * strong reason, but sufficient in the absence of others.
1947        * [Now we resolve ties in favor of the earlier string if
1948        * it happens that c_offset_min has been invalidated, since the
1949        * earlier string may buy us something the later one won't.]
1950        */
1951
1952        data.substrs[0].str = newSVpvs("");
1953        data.substrs[1].str = newSVpvs("");
1954        data.last_found = newSVpvs("");
1955        data.cur_is_floating = 0; /* initially any found substring is fixed */
1956        ENTER_with_name("study_chunk");
1957        SAVEFREESV(data.substrs[0].str);
1958        SAVEFREESV(data.substrs[1].str);
1959        SAVEFREESV(data.last_found);
1960        first = scan;
1961        if (!RExC_rxi->regstclass) {
1962            ssc_init(pRExC_state, &ch_class);
1963            data.start_class = &ch_class;
1964            stclass_flag = SCF_DO_STCLASS_AND;
1965        } else				/* XXXX Check for BOUND? */
1966            stclass_flag = 0;
1967        data.last_closep = &last_close;
1968        data.last_close_opp = &last_close_op;
1969
1970        DEBUG_RExC_seen();
1971        /*
1972         * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
1973         * (NO top level branches)
1974         */
1975        minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
1976                             scan + RExC_size, /* Up to end */
1977            &data, -1, 0, NULL,
1978            SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
1979                          | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
1980            0, TRUE);
1981        /* search for "restudy" in this file for a detailed explanation
1982         * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
1983
1984
1985        CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
1986
1987
1988        if ( RExC_total_parens == 1 && !data.cur_is_floating
1989             && data.last_start_min == 0 && data.last_end > 0
1990             && !RExC_seen_zerolen
1991             && !(RExC_seen & REG_VERBARG_SEEN)
1992             && !(RExC_seen & REG_GPOS_SEEN)
1993        ){
1994            RExC_rx->extflags |= RXf_CHECK_ALL;
1995        }
1996        scan_commit(pRExC_state, &data,&minlen, 0);
1997
1998
1999        /* XXX this is done in reverse order because that's the way the
2000         * code was before it was parameterised. Don't know whether it
2001         * actually needs doing in reverse order. DAPM */
2002        for (i = 1; i >= 0; i--) {
2003            longest_length[i] = CHR_SVLEN(data.substrs[i].str);
2004
2005            if (   !(   i
2006                     && SvCUR(data.substrs[0].str)  /* ok to leave SvCUR */
2007                     &&    data.substrs[0].min_offset
2008                        == data.substrs[1].min_offset
2009                     &&    SvCUR(data.substrs[0].str)
2010                        == SvCUR(data.substrs[1].str)
2011                    )
2012                && S_setup_longest (aTHX_ pRExC_state,
2013                                        &(RExC_rx->substrs->data[i]),
2014                                        &(data.substrs[i]),
2015                                        longest_length[i]))
2016            {
2017                RExC_rx->substrs->data[i].min_offset =
2018                        data.substrs[i].min_offset - data.substrs[i].lookbehind;
2019
2020                RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
2021                /* Don't offset infinity */
2022                if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
2023                    RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
2024                SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
2025            }
2026            else {
2027                RExC_rx->substrs->data[i].substr      = NULL;
2028                RExC_rx->substrs->data[i].utf8_substr = NULL;
2029                longest_length[i] = 0;
2030            }
2031        }
2032
2033        LEAVE_with_name("study_chunk");
2034
2035        if (RExC_rxi->regstclass
2036            && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
2037            RExC_rxi->regstclass = NULL;
2038
2039        if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
2040              || RExC_rx->substrs->data[0].min_offset)
2041            && stclass_flag
2042            && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2043            && is_ssc_worth_it(pRExC_state, data.start_class))
2044        {
2045            const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2046
2047            ssc_finalize(pRExC_state, data.start_class);
2048
2049            Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2050            StructCopy(data.start_class,
2051                       (regnode_ssc*)RExC_rxi->data->data[n],
2052                       regnode_ssc);
2053            RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2054            RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
2055            DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
2056                      regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2057                      Perl_re_printf( aTHX_
2058                                    "synthetic stclass \"%s\".\n",
2059                                    SvPVX_const(sv));});
2060            data.start_class = NULL;
2061        }
2062
2063        /* A temporary algorithm prefers floated substr to fixed one of
2064         * same length to dig more info. */
2065        i = (longest_length[0] <= longest_length[1]);
2066        RExC_rx->substrs->check_ix = i;
2067        RExC_rx->check_end_shift  = RExC_rx->substrs->data[i].end_shift;
2068        RExC_rx->check_substr     = RExC_rx->substrs->data[i].substr;
2069        RExC_rx->check_utf8       = RExC_rx->substrs->data[i].utf8_substr;
2070        RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
2071        RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
2072        if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
2073            RExC_rx->intflags |= PREGf_NOSCAN;
2074
2075        if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
2076            RExC_rx->extflags |= RXf_USE_INTUIT;
2077            if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
2078                RExC_rx->extflags |= RXf_INTUIT_TAIL;
2079        }
2080
2081        /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
2082        if ( (STRLEN)minlen < longest_length[1] )
2083            minlen= longest_length[1];
2084        if ( (STRLEN)minlen < longest_length[0] )
2085            minlen= longest_length[0];
2086        */
2087    }
2088    else {
2089        /* Several toplevels. Best we can is to set minlen. */
2090        SSize_t fake_deltap;
2091        regnode_ssc ch_class;
2092        SSize_t last_close = 0;
2093        regnode *last_close_op = NULL;
2094
2095        DEBUG_PARSE_r(Perl_re_printf( aTHX_  "\nMulti Top Level\n"));
2096
2097        scan = RExC_rxi->program + 1;
2098        ssc_init(pRExC_state, &ch_class);
2099        data.start_class = &ch_class;
2100        data.last_closep = &last_close;
2101        data.last_close_opp = &last_close_op;
2102
2103        DEBUG_RExC_seen();
2104        /*
2105         * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
2106         * (patterns WITH top level branches)
2107         */
2108        minlen = study_chunk(pRExC_state,
2109            &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
2110            SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
2111                                                      ? SCF_TRIE_DOING_RESTUDY
2112                                                      : 0),
2113            0, TRUE);
2114        /* search for "restudy" in this file for a detailed explanation
2115         * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2116
2117        CHECK_RESTUDY_GOTO_butfirst(NOOP);
2118
2119        RExC_rx->check_substr = NULL;
2120        RExC_rx->check_utf8 = NULL;
2121        RExC_rx->substrs->data[0].substr      = NULL;
2122        RExC_rx->substrs->data[0].utf8_substr = NULL;
2123        RExC_rx->substrs->data[1].substr      = NULL;
2124        RExC_rx->substrs->data[1].utf8_substr = NULL;
2125
2126        if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2127            && is_ssc_worth_it(pRExC_state, data.start_class))
2128        {
2129            const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2130
2131            ssc_finalize(pRExC_state, data.start_class);
2132
2133            Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2134            StructCopy(data.start_class,
2135                       (regnode_ssc*)RExC_rxi->data->data[n],
2136                       regnode_ssc);
2137            RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2138            RExC_rx->intflags &= ~PREGf_SKIP;	/* Used in find_byclass(). */
2139            DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
2140                      regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2141                      Perl_re_printf( aTHX_
2142                                    "synthetic stclass \"%s\".\n",
2143                                    SvPVX_const(sv));});
2144            data.start_class = NULL;
2145        }
2146    }
2147
2148    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
2149        RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
2150        RExC_rx->maxlen = REG_INFTY;
2151    }
2152    else {
2153        RExC_rx->maxlen = RExC_maxlen;
2154    }
2155
2156    /* Guard against an embedded (?=) or (?<=) with a longer minlen than
2157       the "real" pattern. */
2158    DEBUG_OPTIMISE_r({
2159        Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
2160                      (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
2161    });
2162    RExC_rx->minlenret = minlen;
2163    if (RExC_rx->minlen < minlen)
2164        RExC_rx->minlen = minlen;
2165
2166    if (RExC_seen & REG_RECURSE_SEEN ) {
2167        RExC_rx->intflags |= PREGf_RECURSE_SEEN;
2168        Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
2169    }
2170    if (RExC_seen & REG_GPOS_SEEN)
2171        RExC_rx->intflags |= PREGf_GPOS_SEEN;
2172
2173    if (RExC_seen & REG_PESSIMIZE_SEEN)
2174        RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN;
2175
2176    if (RExC_seen & REG_LOOKBEHIND_SEEN)
2177        RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
2178                                                lookbehind */
2179    if (pRExC_state->code_blocks)
2180        RExC_rx->extflags |= RXf_EVAL_SEEN;
2181
2182    if (RExC_seen & REG_VERBARG_SEEN) {
2183        RExC_rx->intflags |= PREGf_VERBARG_SEEN;
2184        RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
2185    }
2186
2187    if (RExC_seen & REG_CUTGROUP_SEEN)
2188        RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
2189
2190    if (pm_flags & PMf_USE_RE_EVAL)
2191        RExC_rx->intflags |= PREGf_USE_RE_EVAL;
2192
2193    if (RExC_paren_names)
2194        RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
2195    else
2196        RXp_PAREN_NAMES(RExC_rx) = NULL;
2197
2198    /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
2199     * so it can be used in pp.c */
2200    if (RExC_rx->intflags & PREGf_ANCH)
2201        RExC_rx->extflags |= RXf_IS_ANCHORED;
2202
2203
2204    {
2205        /* this is used to identify "special" patterns that might result
2206         * in Perl NOT calling the regex engine and instead doing the match "itself",
2207         * particularly special cases in split//. By having the regex compiler
2208         * do this pattern matching at a regop level (instead of by inspecting the pattern)
2209         * we avoid weird issues with equivalent patterns resulting in different behavior,
2210         * AND we allow non Perl engines to get the same optimizations by the setting the
2211         * flags appropriately - Yves */
2212        regnode *first = RExC_rxi->program + 1;
2213        U8 fop = OP(first);
2214        regnode *next = NULL;
2215        U8 nop = 0;
2216        if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
2217            next = REGNODE_AFTER(first);
2218            nop = OP(next);
2219        }
2220        /* It's safe to read through *next only if OP(first) is a regop of
2221         * the right type (not EXACT, for example).
2222         */
2223        if (REGNODE_TYPE(fop) == NOTHING && nop == END)
2224            RExC_rx->extflags |= RXf_NULL;
2225        else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END)
2226            /* when fop is SBOL first->flags will be true only when it was
2227             * produced by parsing /\A/, and not when parsing /^/. This is
2228             * very important for the split code as there we want to
2229             * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
2230             * See rt #122761 for more details. -- Yves */
2231            RExC_rx->extflags |= RXf_START_ONLY;
2232        else if (fop == PLUS
2233                 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
2234                 && OP(regnext(first)) == END)
2235            RExC_rx->extflags |= RXf_WHITE;
2236        else if ( RExC_rx->extflags & RXf_SPLIT
2237                  && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
2238                  && STR_LEN(first) == 1
2239                  && *(STRING(first)) == ' '
2240                  && OP(regnext(first)) == END )
2241            RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
2242
2243    }
2244
2245    if (RExC_contains_locale) {
2246        RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
2247    }
2248
2249#ifdef DEBUGGING
2250    if (RExC_paren_names) {
2251        RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
2252        RExC_rxi->data->data[RExC_rxi->name_list_idx]
2253                                   = (void*)SvREFCNT_inc(RExC_paren_name_list);
2254    } else
2255#endif
2256    RExC_rxi->name_list_idx = 0;
2257
2258    while ( RExC_recurse_count > 0 ) {
2259        const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
2260        /*
2261         * This data structure is set up in study_chunk() and is used
2262         * to calculate the distance between a GOSUB regopcode and
2263         * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
2264         * it refers to.
2265         *
2266         * If for some reason someone writes code that optimises
2267         * away a GOSUB opcode then the assert should be changed to
2268         * an if(scan) to guard the ARG2i_SET() - Yves
2269         *
2270         */
2271        assert(scan && OP(scan) == GOSUB);
2272        ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
2273    }
2274    if (RExC_logical_total_parens != RExC_total_parens) {
2275        Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
2276        /* we rebuild this below */
2277        Zero(RExC_logical_to_parno, RExC_total_parens, I32);
2278        for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
2279            int logical_parno= RExC_parno_to_logical[parno];
2280            assert(logical_parno);
2281            RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
2282            RExC_logical_to_parno[logical_parno] = parno;
2283        }
2284        RExC_rx->logical_to_parno = RExC_logical_to_parno;
2285        RExC_rx->parno_to_logical = RExC_parno_to_logical;
2286        RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
2287        RExC_logical_to_parno = NULL;
2288        RExC_parno_to_logical = NULL;
2289        RExC_parno_to_logical_next = NULL;
2290    } else {
2291        RExC_rx->logical_to_parno = NULL;
2292        RExC_rx->parno_to_logical = NULL;
2293        RExC_rx->parno_to_logical_next = NULL;
2294    }
2295
2296    Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
2297    /* assume we don't need to swap parens around before we match */
2298    DEBUG_TEST_r({
2299        Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
2300            (unsigned long)RExC_study_chunk_recursed_count);
2301    });
2302    DEBUG_DUMP_r({
2303        DEBUG_RExC_seen();
2304        Perl_re_printf( aTHX_ "Final program:\n");
2305        regdump(RExC_rx);
2306    });
2307
2308    if (RExC_open_parens) {
2309        Safefree(RExC_open_parens);
2310        RExC_open_parens = NULL;
2311    }
2312    if (RExC_close_parens) {
2313        Safefree(RExC_close_parens);
2314        RExC_close_parens = NULL;
2315    }
2316    if (RExC_logical_to_parno) {
2317        Safefree(RExC_logical_to_parno);
2318        RExC_logical_to_parno = NULL;
2319    }
2320    if (RExC_parno_to_logical) {
2321        Safefree(RExC_parno_to_logical);
2322        RExC_parno_to_logical = NULL;
2323    }
2324
2325#ifdef USE_ITHREADS
2326    /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
2327     * by setting the regexp SV to readonly-only instead. If the
2328     * pattern's been recompiled, the USEDness should remain. */
2329    if (old_re && SvREADONLY(old_re))
2330        SvREADONLY_on(Rx);
2331#endif
2332    return Rx;
2333}
2334
2335
2336
2337SV*
2338Perl_reg_qr_package(pTHX_ REGEXP * const rx)
2339{
2340    PERL_ARGS_ASSERT_REG_QR_PACKAGE;
2341        PERL_UNUSED_ARG(rx);
2342        if (0)
2343            return NULL;
2344        else
2345            return newSVpvs("Regexp");
2346}
2347
2348/* Scans the name of a named buffer from the pattern.
2349 * If flags is REG_RSN_RETURN_NULL returns null.
2350 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
2351 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
2352 * to the parsed name as looked up in the RExC_paren_names hash.
2353 * If there is an error throws a vFAIL().. type exception.
2354 */
2355
2356#define REG_RSN_RETURN_NULL    0
2357#define REG_RSN_RETURN_NAME    1
2358#define REG_RSN_RETURN_DATA    2
2359
2360STATIC SV*
2361S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
2362{
2363    char *name_start = RExC_parse;
2364    SV* sv_name;
2365
2366    PERL_ARGS_ASSERT_REG_SCAN_NAME;
2367
2368    assert (RExC_parse <= RExC_end);
2369    if (RExC_parse == RExC_end) NOOP;
2370    else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
2371         /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
2372          * using do...while */
2373        if (UTF)
2374            do {
2375                RExC_parse_inc_utf8();
2376            } while (   RExC_parse < RExC_end
2377                     && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
2378        else
2379            do {
2380                RExC_parse_inc_by(1);
2381            } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
2382    } else {
2383        RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
2384                         character */
2385        vFAIL("Group name must start with a non-digit word character");
2386    }
2387    sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
2388                             SVs_TEMP | (UTF ? SVf_UTF8 : 0));
2389    if ( flags == REG_RSN_RETURN_NAME)
2390        return sv_name;
2391    else if (flags==REG_RSN_RETURN_DATA) {
2392        HE *he_str = NULL;
2393        SV *sv_dat = NULL;
2394        if ( ! sv_name )      /* should not happen*/
2395            Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
2396        if (RExC_paren_names)
2397            he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
2398        if ( he_str )
2399            sv_dat = HeVAL(he_str);
2400        if ( ! sv_dat ) {   /* Didn't find group */
2401
2402            /* It might be a forward reference; we can't fail until we
2403                * know, by completing the parse to get all the groups, and
2404                * then reparsing */
2405            if (ALL_PARENS_COUNTED)  {
2406                vFAIL("Reference to nonexistent named group");
2407            }
2408            else {
2409                REQUIRE_PARENS_PASS;
2410            }
2411        }
2412        return sv_dat;
2413    }
2414
2415    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
2416                     (unsigned long) flags);
2417}
2418
2419#define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
2420    if (RExC_lastparse!=RExC_parse) {                           \
2421        Perl_re_printf( aTHX_  "%s",                            \
2422            Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
2423                RExC_end - RExC_parse, 16,                      \
2424                "", "",                                         \
2425                PERL_PV_ESCAPE_UNI_DETECT |                     \
2426                PERL_PV_PRETTY_ELLIPSES   |                     \
2427                PERL_PV_PRETTY_LTGT       |                     \
2428                PERL_PV_ESCAPE_RE         |                     \
2429                PERL_PV_PRETTY_EXACTSIZE                        \
2430            )                                                   \
2431        );                                                      \
2432    } else                                                      \
2433        Perl_re_printf( aTHX_ "%16s","");                       \
2434                                                                \
2435    if (RExC_lastnum!=RExC_emit)                                \
2436       Perl_re_printf( aTHX_ "|%4zu", RExC_emit);                \
2437    else                                                        \
2438       Perl_re_printf( aTHX_ "|%4s","");                        \
2439    Perl_re_printf( aTHX_ "|%*s%-4s",                           \
2440        (int)((depth*2)), "",                                   \
2441        (funcname)                                              \
2442    );                                                          \
2443    RExC_lastnum=RExC_emit;                                     \
2444    RExC_lastparse=RExC_parse;                                  \
2445})
2446
2447
2448
2449#define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
2450    DEBUG_PARSE_MSG((funcname));                            \
2451    Perl_re_printf( aTHX_ "%4s","\n");                                  \
2452})
2453#define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({\
2454    DEBUG_PARSE_MSG((funcname));                            \
2455    Perl_re_printf( aTHX_ fmt "\n",args);                               \
2456})
2457
2458
2459STATIC void
2460S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
2461{
2462    /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
2463     * constructs, and updates RExC_flags with them.  On input, RExC_parse
2464     * should point to the first flag; it is updated on output to point to the
2465     * final ')' or ':'.  There needs to be at least one flag, or this will
2466     * abort */
2467
2468    /* for (?g), (?gc), and (?o) warnings; warning
2469       about (?c) will warn about (?g) -- japhy    */
2470
2471#define WASTED_O  0x01
2472#define WASTED_G  0x02
2473#define WASTED_C  0x04
2474#define WASTED_GC (WASTED_G|WASTED_C)
2475    I32 wastedflags = 0x00;
2476    U32 posflags = 0, negflags = 0;
2477    U32 *flagsp = &posflags;
2478    char has_charset_modifier = '\0';
2479    regex_charset cs;
2480    bool has_use_defaults = FALSE;
2481    const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
2482    int x_mod_count = 0;
2483
2484    PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
2485
2486    /* '^' as an initial flag sets certain defaults */
2487    if (UCHARAT(RExC_parse) == '^') {
2488        RExC_parse_inc_by(1);
2489        has_use_defaults = TRUE;
2490        STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
2491        cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2492             ? REGEX_UNICODE_CHARSET
2493             : REGEX_DEPENDS_CHARSET;
2494        set_regex_charset(&RExC_flags, cs);
2495    }
2496    else {
2497        cs = get_regex_charset(RExC_flags);
2498        if (   cs == REGEX_DEPENDS_CHARSET
2499            && (toUSE_UNI_CHARSET_NOT_DEPENDS))
2500        {
2501            cs = REGEX_UNICODE_CHARSET;
2502        }
2503    }
2504
2505    while (RExC_parse < RExC_end) {
2506        /* && memCHRs("iogcmsx", *RExC_parse) */
2507        /* (?g), (?gc) and (?o) are useless here
2508           and must be globally applied -- japhy */
2509        if ((RExC_pm_flags & PMf_WILDCARD)) {
2510            if (flagsp == & negflags) {
2511                if (*RExC_parse == 'm') {
2512                    RExC_parse_inc_by(1);
2513                    /* diag_listed_as: Use of %s is not allowed in Unicode
2514                       property wildcard subpatterns in regex; marked by <--
2515                       HERE in m/%s/ */
2516                    vFAIL("Use of modifier '-m' is not allowed in Unicode"
2517                          " property wildcard subpatterns");
2518                }
2519            }
2520            else {
2521                if (*RExC_parse == 's') {
2522                    goto modifier_illegal_in_wildcard;
2523                }
2524            }
2525        }
2526
2527        switch (*RExC_parse) {
2528
2529            /* Code for the imsxn flags */
2530            CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
2531
2532            case LOCALE_PAT_MOD:
2533                if (has_charset_modifier) {
2534                    goto excess_modifier;
2535                }
2536                else if (flagsp == &negflags) {
2537                    goto neg_modifier;
2538                }
2539                cs = REGEX_LOCALE_CHARSET;
2540                has_charset_modifier = LOCALE_PAT_MOD;
2541                break;
2542            case UNICODE_PAT_MOD:
2543                if (has_charset_modifier) {
2544                    goto excess_modifier;
2545                }
2546                else if (flagsp == &negflags) {
2547                    goto neg_modifier;
2548                }
2549                cs = REGEX_UNICODE_CHARSET;
2550                has_charset_modifier = UNICODE_PAT_MOD;
2551                break;
2552            case ASCII_RESTRICT_PAT_MOD:
2553                if (flagsp == &negflags) {
2554                    goto neg_modifier;
2555                }
2556                if (has_charset_modifier) {
2557                    if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
2558                        goto excess_modifier;
2559                    }
2560                    /* Doubled modifier implies more restricted */
2561                    cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
2562                }
2563                else {
2564                    cs = REGEX_ASCII_RESTRICTED_CHARSET;
2565                }
2566                has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
2567                break;
2568            case DEPENDS_PAT_MOD:
2569                if (has_use_defaults) {
2570                    goto fail_modifiers;
2571                }
2572                else if (flagsp == &negflags) {
2573                    goto neg_modifier;
2574                }
2575                else if (has_charset_modifier) {
2576                    goto excess_modifier;
2577                }
2578
2579                /* The dual charset means unicode semantics if the
2580                 * pattern (or target, not known until runtime) are
2581                 * utf8, or something in the pattern indicates unicode
2582                 * semantics */
2583                cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2584                     ? REGEX_UNICODE_CHARSET
2585                     : REGEX_DEPENDS_CHARSET;
2586                has_charset_modifier = DEPENDS_PAT_MOD;
2587                break;
2588              excess_modifier:
2589                RExC_parse_inc_by(1);
2590                if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
2591                    vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
2592                }
2593                else if (has_charset_modifier == *(RExC_parse - 1)) {
2594                    vFAIL2("Regexp modifier \"%c\" may not appear twice",
2595                                        *(RExC_parse - 1));
2596                }
2597                else {
2598                    vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
2599                }
2600                NOT_REACHED; /*NOTREACHED*/
2601              neg_modifier:
2602                RExC_parse_inc_by(1);
2603                vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
2604                                    *(RExC_parse - 1));
2605                NOT_REACHED; /*NOTREACHED*/
2606            case GLOBAL_PAT_MOD: /* 'g' */
2607                if (RExC_pm_flags & PMf_WILDCARD) {
2608                    goto modifier_illegal_in_wildcard;
2609                }
2610                /*FALLTHROUGH*/
2611            case ONCE_PAT_MOD: /* 'o' */
2612                if (ckWARN(WARN_REGEXP)) {
2613                    const I32 wflagbit = *RExC_parse == 'o'
2614                                         ? WASTED_O
2615                                         : WASTED_G;
2616                    if (! (wastedflags & wflagbit) ) {
2617                        wastedflags |= wflagbit;
2618                        /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2619                        vWARN5(
2620                            RExC_parse + 1,
2621                            "Useless (%s%c) - %suse /%c modifier",
2622                            flagsp == &negflags ? "?-" : "?",
2623                            *RExC_parse,
2624                            flagsp == &negflags ? "don't " : "",
2625                            *RExC_parse
2626                        );
2627                    }
2628                }
2629                break;
2630
2631            case CONTINUE_PAT_MOD: /* 'c' */
2632                if (RExC_pm_flags & PMf_WILDCARD) {
2633                    goto modifier_illegal_in_wildcard;
2634                }
2635                if (ckWARN(WARN_REGEXP)) {
2636                    if (! (wastedflags & WASTED_C) ) {
2637                        wastedflags |= WASTED_GC;
2638                        /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2639                        vWARN3(
2640                            RExC_parse + 1,
2641                            "Useless (%sc) - %suse /gc modifier",
2642                            flagsp == &negflags ? "?-" : "?",
2643                            flagsp == &negflags ? "don't " : ""
2644                        );
2645                    }
2646                }
2647                break;
2648            case KEEPCOPY_PAT_MOD: /* 'p' */
2649                if (RExC_pm_flags & PMf_WILDCARD) {
2650                    goto modifier_illegal_in_wildcard;
2651                }
2652                if (flagsp == &negflags) {
2653                    ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
2654                } else {
2655                    *flagsp |= RXf_PMf_KEEPCOPY;
2656                }
2657                break;
2658            case '-':
2659                /* A flag is a default iff it is following a minus, so
2660                 * if there is a minus, it means will be trying to
2661                 * re-specify a default which is an error */
2662                if (has_use_defaults || flagsp == &negflags) {
2663                    goto fail_modifiers;
2664                }
2665                flagsp = &negflags;
2666                wastedflags = 0;  /* reset so (?g-c) warns twice */
2667                x_mod_count = 0;
2668                break;
2669            case ':':
2670            case ')':
2671
2672                if (  (RExC_pm_flags & PMf_WILDCARD)
2673                    && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
2674                {
2675                    RExC_parse_inc_by(1);
2676                    /* diag_listed_as: Use of %s is not allowed in Unicode
2677                       property wildcard subpatterns in regex; marked by <--
2678                       HERE in m/%s/ */
2679                    vFAIL2("Use of modifier '%c' is not allowed in Unicode"
2680                           " property wildcard subpatterns",
2681                           has_charset_modifier);
2682                }
2683
2684                if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
2685                    negflags |= RXf_PMf_EXTENDED_MORE;
2686                }
2687                RExC_flags |= posflags;
2688
2689                if (negflags & RXf_PMf_EXTENDED) {
2690                    negflags |= RXf_PMf_EXTENDED_MORE;
2691                }
2692                RExC_flags &= ~negflags;
2693                set_regex_charset(&RExC_flags, cs);
2694
2695                return;
2696            default:
2697              fail_modifiers:
2698                RExC_parse_inc_if_char();
2699                /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
2700                vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
2701                      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
2702                NOT_REACHED; /*NOTREACHED*/
2703        }
2704
2705        RExC_parse_inc();
2706    }
2707
2708    vFAIL("Sequence (?... not terminated");
2709
2710  modifier_illegal_in_wildcard:
2711    RExC_parse_inc_by(1);
2712    /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
2713       subpatterns in regex; marked by <-- HERE in m/%s/ */
2714    vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
2715           " subpatterns", *(RExC_parse - 1));
2716}
2717
2718/*
2719 - reg - regular expression, i.e. main body or parenthesized thing
2720 *
2721 * Caller must absorb opening parenthesis.
2722 *
2723 * Combining parenthesis handling with the base level of regular expression
2724 * is a trifle forced, but the need to tie the tails of the branches to what
2725 * follows makes it hard to avoid.
2726 */
2727
2728STATIC regnode_offset
2729S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
2730                             I32 *flagp,
2731                             char * backref_parse_start,
2732                             char ch
2733                      )
2734{
2735    regnode_offset ret;
2736    char* name_start = RExC_parse;
2737    U32 num = 0;
2738    SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
2739    DECLARE_AND_GET_RE_DEBUG_FLAGS;
2740
2741    PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
2742
2743    if (RExC_parse != name_start && ch == '}') {
2744        while (isBLANK(*RExC_parse)) {
2745            RExC_parse_inc_by(1);
2746        }
2747    }
2748    if (RExC_parse == name_start || *RExC_parse != ch) {
2749        /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
2750        vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
2751    }
2752
2753    if (sv_dat) {
2754        num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
2755        RExC_rxi->data->data[num]=(void*)sv_dat;
2756        SvREFCNT_inc_simple_void_NN(sv_dat);
2757    }
2758    RExC_sawback = 1;
2759    ret = reg2node(pRExC_state,
2760                   ((! FOLD)
2761                     ? REFN
2762                     : (ASCII_FOLD_RESTRICTED)
2763                       ? REFFAN
2764                       : (AT_LEAST_UNI_SEMANTICS)
2765                         ? REFFUN
2766                         : (LOC)
2767                           ? REFFLN
2768                           : REFFN),
2769                    num, RExC_nestroot);
2770    if (RExC_nestroot && num >= (U32)RExC_nestroot)
2771        FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
2772    *flagp |= HASWIDTH;
2773
2774    nextchar(pRExC_state);
2775    return ret;
2776}
2777
2778/* reg_la_NOTHING()
2779 *
2780 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2781 * NOTHING regop when the construct is empty.
2782 *
2783 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2784 *
2785 * Checks for unterminated constructs and throws a "not terminated" error
2786 * with the appropriate type if necessary
2787 *
2788 * Assuming it does not throw an exception increments RExC_seen_zerolen.
2789 *
2790 * If the construct is empty generates a NOTHING op and returns its
2791 * regnode_offset, which the caller would then return to its caller.
2792 *
2793 * If the construct is not empty increments RExC_in_lookaround, and turns
2794 * on any flags provided in RExC_seen, and then returns 0 to signify
2795 * that parsing should continue.
2796 *
2797 * PS: I would have called this reg_parse_lookaround_NOTHING() but then
2798 * any use of it would have had to be broken onto multiple lines, hence
2799 * the abbreviation.
2800 */
2801STATIC regnode_offset
2802S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2803    const char *type)
2804{
2805
2806    PERL_ARGS_ASSERT_REG_LA_NOTHING;
2807
2808    /* false below so we do not force /x */
2809    skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2810
2811    if (RExC_parse >= RExC_end)
2812        vFAIL2("Sequence (%s... not terminated", type);
2813
2814    /* Always increment as NOTHING regops are zerolen */
2815    RExC_seen_zerolen++;
2816
2817    if (*RExC_parse == ')') {
2818        regnode_offset ret= reg_node(pRExC_state, NOTHING);
2819        nextchar(pRExC_state);
2820        return ret;
2821    }
2822
2823    RExC_seen |= flags;
2824    RExC_in_lookaround++;
2825    return 0; /* keep parsing! */
2826}
2827
2828/* reg_la_OPFAIL()
2829 *
2830 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2831 * OPFAIL regop when the construct is empty.
2832 *
2833 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2834 *
2835 * Checks for unterminated constructs and throws a "not terminated" error
2836 * if necessary.
2837 *
2838 * If the construct is empty generates an OPFAIL op and returns its
2839 * regnode_offset which the caller should then return to its caller.
2840 *
2841 * If the construct is not empty increments RExC_in_lookaround, and also
2842 * increments RExC_seen_zerolen, and turns on the flags provided in
2843 * RExC_seen, and then returns 0 to signify that parsing should continue.
2844 *
2845 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
2846 * any use of it would have had to be broken onto multiple lines, hence
2847 * the abbreviation.
2848 */
2849
2850STATIC regnode_offset
2851S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2852    const char *type)
2853{
2854
2855    PERL_ARGS_ASSERT_REG_LA_OPFAIL;
2856
2857    /* FALSE so we don't force to /x below */;
2858    skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2859
2860    if (RExC_parse >= RExC_end)
2861        vFAIL2("Sequence (%s... not terminated", type);
2862
2863    if (*RExC_parse == ')') {
2864        regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
2865        nextchar(pRExC_state);
2866        return ret; /* return produced regop */
2867    }
2868
2869    /* only increment zerolen *after* we check if we produce an OPFAIL
2870     * as an OPFAIL does not match a zero length construct, as it
2871     * does not match ever. */
2872    RExC_seen_zerolen++;
2873    RExC_seen |= flags;
2874    RExC_in_lookaround++;
2875    return 0; /* keep parsing! */
2876}
2877
2878/* Below are the main parsing routines.
2879 *
2880 * S_reg()      parses a whole pattern or subpattern.  It itself handles things
2881 *              like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
2882 *              alternation '|' in the '...' pattern.
2883 * S_regbranch() effectively implements the concatenation operator, handling
2884 *              one alternative of '|', repeatedly calling S_regpiece on each
2885 *              segment of the input.
2886 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
2887 *              and then adds any quantifier for that chunk.
2888 * S_regatom()  parses the next chunk of the input, returning when it
2889 *              determines it has found a complete atomic chunk.  The chunk may
2890 *              be a nested subpattern, in which case S_reg is called
2891 *              recursively
2892 *
2893 * The functions generate regnodes as they go along, appending each to the
2894 * pattern data structure so far.  They return the offset of the current final
2895 * node into that structure, or 0 on failure.
2896 *
2897 * There are three parameters common to all of them:
2898 *   pRExC_state    is a structure with much information about the current
2899 *                  state of the parse.  It's easy to add new elements to
2900 *                  convey new information, but beware that an error return may
2901 *                  require clearing the element.
2902 *   flagp          is a pointer to bit flags set in a lower level to pass up
2903 *                  to higher levels information, such as the cause of a
2904 *                  failure, or some characteristic about the generated node
2905 *   depth          is roughly the recursion depth, mostly unused except for
2906 *                  pretty printing debugging info.
2907 *
2908 * There are ancillary functions that these may farm work out to, using the
2909 * same parameters.
2910 *
2911 * The protocol for handling flags is that each function will, before
2912 * returning, add into *flagp the flags it needs to pass up.  Each function has
2913 * a second flags variable, typically named 'flags', which it sets and clears
2914 * at will.  Flag bits in it are used in that function, and it calls the next
2915 * layer down with its 'flagp' parameter set to '&flags'.  Thus, upon return,
2916 * 'flags' will contain whatever it had before the call, plus whatever that
2917 * function passed up.  If it wants to pass any of these up to its caller, it
2918 * has to add them to its *flagp.  This means that it takes extra steps to keep
2919 * passing a flag upwards, and otherwise the flag bit is cleared for higher
2920 * functions.
2921 */
2922
2923/* On success, returns the offset at which any next node should be placed into
2924 * the regex engine program being compiled.
2925 *
2926 * Returns 0 otherwise, with *flagp set to indicate why:
2927 *  TRYAGAIN        at the end of (?) that only sets flags.
2928 *  RESTART_PARSE   if the parse needs to be restarted, or'd with
2929 *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
2930 *  Otherwise would only return 0 if regbranch() returns 0, which cannot
2931 *  happen.  */
2932STATIC regnode_offset
2933S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
2934    /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
2935     * 2 is like 1, but indicates that nextchar() has been called to advance
2936     * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
2937     * this flag alerts us to the need to check for that */
2938{
2939    regnode_offset ret = 0;    /* Will be the head of the group. */
2940    regnode_offset br;
2941    regnode_offset lastbr;
2942    regnode_offset ender = 0;
2943    I32 logical_parno = 0;
2944    I32 parno = 0;
2945    I32 flags;
2946    U32 oregflags = RExC_flags;
2947    bool have_branch = 0;
2948    bool is_open = 0;
2949    I32 freeze_paren = 0;
2950    I32 after_freeze = 0;
2951    I32 num; /* numeric backreferences */
2952    SV * max_open;  /* Max number of unclosed parens */
2953    I32 was_in_lookaround = RExC_in_lookaround;
2954    I32 fake_eval = 0; /* matches paren */
2955
2956    /* The difference between the following variables can be seen with  *
2957     * the broken pattern /(?:foo/ where segment_parse_start will point *
2958     * at the 'f', and reg_parse_start will point at the '('            */
2959
2960    /* the following is used for unmatched '(' errors */
2961    char * const reg_parse_start = RExC_parse;
2962
2963    /* the following is used to track where various segments of
2964     * the pattern that we parse out started. */
2965    char * segment_parse_start = RExC_parse;
2966
2967    DECLARE_AND_GET_RE_DEBUG_FLAGS;
2968
2969    PERL_ARGS_ASSERT_REG;
2970    DEBUG_PARSE("reg ");
2971
2972    max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
2973    assert(max_open);
2974    if (!SvIOK(max_open)) {
2975        sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
2976    }
2977    if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
2978                                              open paren */
2979        vFAIL("Too many nested open parens");
2980    }
2981
2982    *flagp = 0;				/* Initialize. */
2983
2984    /* Having this true makes it feasible to have a lot fewer tests for the
2985     * parse pointer being in scope.  For example, we can write
2986     *      while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2987     * instead of
2988     *      while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2989     */
2990    assert(*RExC_end == '\0');
2991
2992    /* Make an OPEN node, if parenthesized. */
2993    if (paren) {
2994
2995        /* Under /x, space and comments can be gobbled up between the '(' and
2996         * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
2997         * intervening space, as the sequence is a token, and a token should be
2998         * indivisible */
2999        bool has_intervening_patws = (paren == 2)
3000                                  && *(RExC_parse - 1) != '(';
3001
3002        if (RExC_parse >= RExC_end) {
3003            vFAIL("Unmatched (");
3004        }
3005
3006        if (paren == 'r') {     /* Atomic script run */
3007            paren = '>';
3008            goto parse_rest;
3009        }
3010        else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
3011            if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */
3012                fake_eval = '{';
3013                goto handle_qmark;
3014            }
3015
3016            char *start_verb = RExC_parse + 1;
3017            STRLEN verb_len;
3018            char *start_arg = NULL;
3019            unsigned char op = 0;
3020            int arg_required = 0;
3021            int internal_argval = -1; /* if > -1 no argument allowed */
3022            bool has_upper = FALSE;
3023            U32 seen_flag_set = 0; /* RExC_seen flags we must set */
3024
3025            if (has_intervening_patws) {
3026                RExC_parse_inc_by(1);   /* past the '*' */
3027
3028                /* For strict backwards compatibility, don't change the message
3029                 * now that we also have lowercase operands */
3030                if (isUPPER(*RExC_parse)) {
3031                    vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
3032                }
3033                else {
3034                    vFAIL("In '(*...)', the '(' and '*' must be adjacent");
3035                }
3036            }
3037            while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
3038                if ( *RExC_parse == ':' ) {
3039                    start_arg = RExC_parse + 1;
3040                    break;
3041                }
3042                else if (! UTF) {
3043                    if (isUPPER(*RExC_parse)) {
3044                        has_upper = TRUE;
3045                    }
3046                    RExC_parse_inc_by(1);
3047                }
3048                else {
3049                    RExC_parse_inc_utf8();
3050                }
3051            }
3052            verb_len = RExC_parse - start_verb;
3053            if ( start_arg ) {
3054                if (RExC_parse >= RExC_end) {
3055                    goto unterminated_verb_pattern;
3056                }
3057
3058                RExC_parse_inc();
3059                while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
3060                    RExC_parse_inc();
3061                }
3062                if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3063                  unterminated_verb_pattern:
3064                    if (has_upper) {
3065                        vFAIL("Unterminated verb pattern argument");
3066                    }
3067                    else {
3068                        vFAIL("Unterminated '(*...' argument");
3069                    }
3070                }
3071            } else {
3072                if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3073                    if (has_upper) {
3074                        vFAIL("Unterminated verb pattern");
3075                    }
3076                    else {
3077                        vFAIL("Unterminated '(*...' construct");
3078                    }
3079                }
3080            }
3081
3082            /* Here, we know that RExC_parse < RExC_end */
3083
3084            switch ( *start_verb ) {
3085            case 'A':  /* (*ACCEPT) */
3086                if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
3087                    op = ACCEPT;
3088                    internal_argval = RExC_nestroot;
3089                }
3090                break;
3091            case 'C':  /* (*COMMIT) */
3092                if ( memEQs(start_verb, verb_len,"COMMIT") )
3093                    op = COMMIT;
3094                break;
3095            case 'F':  /* (*FAIL) */
3096                if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
3097                    op = OPFAIL;
3098                }
3099                break;
3100            case ':':  /* (*:NAME) */
3101            case 'M':  /* (*MARK:NAME) */
3102                if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
3103                    op = MARKPOINT;
3104                    arg_required = 1;
3105                }
3106                break;
3107            case 'P':  /* (*PRUNE) */
3108                if ( memEQs(start_verb, verb_len,"PRUNE") )
3109                    op = PRUNE;
3110                break;
3111            case 'S':   /* (*SKIP) */
3112                if ( memEQs(start_verb, verb_len,"SKIP") )
3113                    op = SKIP;
3114                break;
3115            case 'T':  /* (*THEN) */
3116                /* [19:06] <TimToady> :: is then */
3117                if ( memEQs(start_verb, verb_len,"THEN") ) {
3118                    op = CUTGROUP;
3119                    RExC_seen |= REG_CUTGROUP_SEEN;
3120                }
3121                break;
3122            case 'a':
3123                if (   memEQs(start_verb, verb_len, "asr")
3124                    || memEQs(start_verb, verb_len, "atomic_script_run"))
3125                {
3126                    paren = 'r';        /* Mnemonic: recursed run */
3127                    goto script_run;
3128                }
3129                else if (memEQs(start_verb, verb_len, "atomic")) {
3130                    paren = 't';    /* AtOMIC */
3131                    goto alpha_assertions;
3132                }
3133                break;
3134            case 'p':
3135                if (   memEQs(start_verb, verb_len, "plb")
3136                    || memEQs(start_verb, verb_len, "positive_lookbehind"))
3137                {
3138                    paren = 'b';
3139                    goto lookbehind_alpha_assertions;
3140                }
3141                else if (   memEQs(start_verb, verb_len, "pla")
3142                         || memEQs(start_verb, verb_len, "positive_lookahead"))
3143                {
3144                    paren = 'a';
3145                    goto alpha_assertions;
3146                }
3147                break;
3148            case 'n':
3149                if (   memEQs(start_verb, verb_len, "nlb")
3150                    || memEQs(start_verb, verb_len, "negative_lookbehind"))
3151                {
3152                    paren = 'B';
3153                    goto lookbehind_alpha_assertions;
3154                }
3155                else if (   memEQs(start_verb, verb_len, "nla")
3156                         || memEQs(start_verb, verb_len, "negative_lookahead"))
3157                {
3158                    paren = 'A';
3159                    goto alpha_assertions;
3160                }
3161                break;
3162            case 's':
3163                if (   memEQs(start_verb, verb_len, "sr")
3164                    || memEQs(start_verb, verb_len, "script_run"))
3165                {
3166                    regnode_offset atomic;
3167
3168                    paren = 's';
3169
3170                   script_run:
3171
3172                    /* This indicates Unicode rules. */
3173                    REQUIRE_UNI_RULES(flagp, 0);
3174
3175                    if (! start_arg) {
3176                        goto no_colon;
3177                    }
3178
3179                    RExC_parse_set(start_arg);
3180
3181                    if (RExC_in_script_run) {
3182
3183                        /*  Nested script runs are treated as no-ops, because
3184                         *  if the nested one fails, the outer one must as
3185                         *  well.  It could fail sooner, and avoid (??{} with
3186                         *  side effects, but that is explicitly documented as
3187                         *  undefined behavior. */
3188
3189                        ret = 0;
3190
3191                        if (paren == 's') {
3192                            paren = ':';
3193                            goto parse_rest;
3194                        }
3195
3196                        /* But, the atomic part of a nested atomic script run
3197                         * isn't a no-op, but can be treated just like a '(?>'
3198                         * */
3199                        paren = '>';
3200                        goto parse_rest;
3201                    }
3202
3203                    if (paren == 's') {
3204                        /* Here, we're starting a new regular script run */
3205                        ret = reg_node(pRExC_state, SROPEN);
3206                        RExC_in_script_run = 1;
3207                        is_open = 1;
3208                        goto parse_rest;
3209                    }
3210
3211                    /* Here, we are starting an atomic script run.  This is
3212                     * handled by recursing to deal with the atomic portion
3213                     * separately, enclosed in SROPEN ... SRCLOSE nodes */
3214
3215                    ret = reg_node(pRExC_state, SROPEN);
3216
3217                    RExC_in_script_run = 1;
3218
3219                    atomic = reg(pRExC_state, 'r', &flags, depth);
3220                    if (flags & (RESTART_PARSE|NEED_UTF8)) {
3221                        *flagp = flags & (RESTART_PARSE|NEED_UTF8);
3222                        return 0;
3223                    }
3224
3225                    if (! REGTAIL(pRExC_state, ret, atomic)) {
3226                        REQUIRE_BRANCHJ(flagp, 0);
3227                    }
3228
3229                    if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
3230                                                                SRCLOSE)))
3231                    {
3232                        REQUIRE_BRANCHJ(flagp, 0);
3233                    }
3234
3235                    RExC_in_script_run = 0;
3236                    return ret;
3237                }
3238
3239                break;
3240
3241            lookbehind_alpha_assertions:
3242                seen_flag_set = REG_LOOKBEHIND_SEEN;
3243                /*FALLTHROUGH*/
3244
3245            alpha_assertions:
3246
3247                if ( !start_arg ) {
3248                    goto no_colon;
3249                }
3250
3251                if ( RExC_parse == start_arg ) {
3252                    if ( paren == 'A' || paren == 'B' ) {
3253                        /* An empty negative lookaround assertion is failure.
3254                         * See also: S_reg_la_OPFAIL() */
3255
3256                        /* Note: OPFAIL is *not* zerolen. */
3257                        ret = reg1node(pRExC_state, OPFAIL, 0);
3258                        nextchar(pRExC_state);
3259                        return ret;
3260                    }
3261                    else
3262                    if ( paren == 'a' || paren == 'b' ) {
3263                        /* An empty positive lookaround assertion is success.
3264                         * See also: S_reg_la_NOTHING() */
3265
3266                        /* Note: NOTHING is zerolen, so increment here */
3267                        RExC_seen_zerolen++;
3268                        ret = reg_node(pRExC_state, NOTHING);
3269                        nextchar(pRExC_state);
3270                        return ret;
3271                    }
3272                }
3273
3274                RExC_seen_zerolen++;
3275                RExC_in_lookaround++;
3276                RExC_seen |= seen_flag_set;
3277
3278                RExC_parse_set(start_arg);
3279                goto parse_rest;
3280
3281              no_colon:
3282                vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
3283                    UTF8fARG(UTF, verb_len, start_verb));
3284                NOT_REACHED; /*NOTREACHED*/
3285
3286            } /* End of switch */
3287            if ( ! op ) {
3288                RExC_parse_inc_safe();
3289                if (has_upper || verb_len == 0) {
3290                    vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
3291                        UTF8fARG(UTF, verb_len, start_verb));
3292                }
3293                else {
3294                    vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
3295                        UTF8fARG(UTF, verb_len, start_verb));
3296                }
3297            }
3298            if ( RExC_parse == start_arg ) {
3299                start_arg = NULL;
3300            }
3301            if ( arg_required && !start_arg ) {
3302                vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
3303                    (int) verb_len, start_verb);
3304            }
3305            if (internal_argval == -1) {
3306                ret = reg1node(pRExC_state, op, 0);
3307            } else {
3308                ret = reg2node(pRExC_state, op, 0, internal_argval);
3309            }
3310            RExC_seen |= REG_VERBARG_SEEN;
3311            if (start_arg) {
3312                SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
3313                ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
3314                                        STR_WITH_LEN("S"));
3315                RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
3316                FLAGS(REGNODE_p(ret)) = 1;
3317            } else {
3318                FLAGS(REGNODE_p(ret)) = 0;
3319            }
3320            if ( internal_argval != -1 )
3321                ARG2i_SET(REGNODE_p(ret), internal_argval);
3322            nextchar(pRExC_state);
3323            return ret;
3324        }
3325        else if (*RExC_parse == '?') { /* (?...) */
3326          handle_qmark:
3327            ; /* make sure the label has a statement associated with it*/
3328            bool is_logical = 0, is_optimistic = 0;
3329            const char * const seqstart = RExC_parse;
3330            const char * endptr;
3331            const char non_existent_group_msg[]
3332                                            = "Reference to nonexistent group";
3333            const char impossible_group[] = "Invalid reference to group";
3334
3335            if (has_intervening_patws) {
3336                RExC_parse_inc_by(1);
3337                vFAIL("In '(?...)', the '(' and '?' must be adjacent");
3338            }
3339
3340            RExC_parse_inc_by(1);   /* past the '?' */
3341            if (!fake_eval) {
3342                paren = *RExC_parse;    /* might be a trailing NUL, if not
3343                                           well-formed */
3344                is_optimistic = 0;
3345            } else {
3346                is_optimistic = 1;
3347                paren = fake_eval;
3348            }
3349            RExC_parse_inc();
3350            if (RExC_parse > RExC_end) {
3351                paren = '\0';
3352            }
3353            ret = 0;			/* For look-ahead/behind. */
3354            switch (paren) {
3355
3356            case 'P':	/* (?P...) variants for those used to PCRE/Python */
3357                paren = *RExC_parse;
3358                if ( paren == '<') {    /* (?P<...>) named capture */
3359                    RExC_parse_inc_by(1);
3360                    if (RExC_parse >= RExC_end) {
3361                        vFAIL("Sequence (?P<... not terminated");
3362                    }
3363                    goto named_capture;
3364                }
3365                else if (paren == '>') {   /* (?P>name) named recursion */
3366                    RExC_parse_inc_by(1);
3367                    if (RExC_parse >= RExC_end) {
3368                        vFAIL("Sequence (?P>... not terminated");
3369                    }
3370                    goto named_recursion;
3371                }
3372                else if (paren == '=') {   /* (?P=...)  named backref */
3373                    RExC_parse_inc_by(1);
3374                    return handle_named_backref(pRExC_state, flagp,
3375                                                segment_parse_start, ')');
3376                }
3377                RExC_parse_inc_if_char();
3378                /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3379                vFAIL3("Sequence (%.*s...) not recognized",
3380                                (int) (RExC_parse - seqstart), seqstart);
3381                NOT_REACHED; /*NOTREACHED*/
3382            case '<':           /* (?<...) */
3383                /* If you want to support (?<*...), first reconcile with GH #17363 */
3384                if (*RExC_parse == '!') {
3385                    paren = ','; /* negative lookbehind (?<! ... ) */
3386                    RExC_parse_inc_by(1);
3387                    if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
3388                        return ret;
3389                    break;
3390                }
3391                else
3392                if (*RExC_parse == '=') {
3393                    /* paren = '<' - negative lookahead (?<= ... ) */
3394                    RExC_parse_inc_by(1);
3395                    if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
3396                        return ret;
3397                    break;
3398                }
3399                else
3400              named_capture:
3401                {               /* (?<...>) */
3402                    char *name_start;
3403                    SV *svname;
3404                    paren= '>';
3405                /* FALLTHROUGH */
3406            case '\'':          /* (?'...') */
3407                    name_start = RExC_parse;
3408                    svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
3409                    if (   RExC_parse == name_start
3410                        || RExC_parse >= RExC_end
3411                        || *RExC_parse != paren)
3412                    {
3413                        vFAIL2("Sequence (?%c... not terminated",
3414                            paren=='>' ? '<' : (char) paren);
3415                    }
3416                    {
3417                        HE *he_str;
3418                        SV *sv_dat = NULL;
3419                        if (!svname) /* shouldn't happen */
3420                            Perl_croak(aTHX_
3421                                "panic: reg_scan_name returned NULL");
3422                        if (!RExC_paren_names) {
3423                            RExC_paren_names= newHV();
3424                            sv_2mortal(MUTABLE_SV(RExC_paren_names));
3425#ifdef DEBUGGING
3426                            RExC_paren_name_list= newAV();
3427                            sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
3428#endif
3429                        }
3430                        he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
3431                        if ( he_str )
3432                            sv_dat = HeVAL(he_str);
3433                        if ( ! sv_dat ) {
3434                            /* croak baby croak */
3435                            Perl_croak(aTHX_
3436                                "panic: paren_name hash element allocation failed");
3437                        } else if ( SvPOK(sv_dat) ) {
3438                            /* (?|...) can mean we have dupes so scan to check
3439                               its already been stored. Maybe a flag indicating
3440                               we are inside such a construct would be useful,
3441                               but the arrays are likely to be quite small, so
3442                               for now we punt -- dmq */
3443                            IV count = SvIV(sv_dat);
3444                            I32 *pv = (I32*)SvPVX(sv_dat);
3445                            IV i;
3446                            for ( i = 0 ; i < count ; i++ ) {
3447                                if ( pv[i] == RExC_npar ) {
3448                                    count = 0;
3449                                    break;
3450                                }
3451                            }
3452                            if ( count ) {
3453                                pv = (I32*)SvGROW(sv_dat,
3454                                                SvCUR(sv_dat) + sizeof(I32)+1);
3455                                SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
3456                                pv[count] = RExC_npar;
3457                                SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
3458                            }
3459                        } else {
3460                            (void)SvUPGRADE(sv_dat, SVt_PVNV);
3461                            sv_setpvn(sv_dat, (char *)&(RExC_npar),
3462                                                                sizeof(I32));
3463                            SvIOK_on(sv_dat);
3464                            SvIV_set(sv_dat, 1);
3465                        }
3466#ifdef DEBUGGING
3467                        /* No, this does not cause a memory leak under
3468                         * debugging. RExC_paren_name_list is freed later
3469                         * on in the dump process. - Yves
3470                         */
3471                        if (!av_store(RExC_paren_name_list,
3472                                      RExC_npar, SvREFCNT_inc_NN(svname)))
3473                            SvREFCNT_dec_NN(svname);
3474#endif
3475
3476                    }
3477                    nextchar(pRExC_state);
3478                    paren = 1;
3479                    goto capturing_parens;
3480                }
3481                NOT_REACHED; /*NOTREACHED*/
3482            case '=':           /* (?=...) */
3483                if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
3484                    return ret;
3485                break;
3486            case '!':           /* (?!...) */
3487                if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
3488                    return ret;
3489                break;
3490            case '|':           /* (?|...) */
3491                /* branch reset, behave like a (?:...) except that
3492                   buffers in alternations share the same numbers */
3493                paren = ':';
3494                after_freeze = freeze_paren = RExC_logical_npar;
3495
3496                /* XXX This construct currently requires an extra pass.
3497                 * Investigation would be required to see if that could be
3498                 * changed */
3499                REQUIRE_PARENS_PASS;
3500                break;
3501            case ':':           /* (?:...) */
3502            case '>':           /* (?>...) */
3503                break;
3504            case '$':           /* (?$...) */
3505            case '@':           /* (?@...) */
3506                vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3507                break;
3508            case '0' :           /* (?0) */
3509            case 'R' :           /* (?R) */
3510                if (RExC_parse == RExC_end || *RExC_parse != ')')
3511                    FAIL("Sequence (?R) not terminated");
3512                num = 0;
3513                RExC_seen |= REG_RECURSE_SEEN;
3514
3515                /* XXX These constructs currently require an extra pass.
3516                 * It probably could be changed */
3517                REQUIRE_PARENS_PASS;
3518
3519                *flagp |= POSTPONED;
3520                goto gen_recurse_regop;
3521                /*notreached*/
3522            /* named and numeric backreferences */
3523            case '&':            /* (?&NAME) */
3524                segment_parse_start = RExC_parse - 1;
3525              named_recursion:
3526                {
3527                    SV *sv_dat = reg_scan_name(pRExC_state,
3528                                               REG_RSN_RETURN_DATA);
3529                   num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
3530                }
3531                if (RExC_parse >= RExC_end || *RExC_parse != ')')
3532                    vFAIL("Sequence (?&... not terminated");
3533                goto gen_recurse_regop;
3534                /* NOTREACHED */
3535            case '+':
3536                if (! inRANGE(RExC_parse[0], '1', '9')) {
3537                    RExC_parse_inc_by(1);
3538                    vFAIL("Illegal pattern");
3539                }
3540                goto parse_recursion;
3541                /* NOTREACHED*/
3542            case '-': /* (?-1) */
3543                if (! inRANGE(RExC_parse[0], '1', '9')) {
3544                    RExC_parse--; /* rewind to let it be handled later */
3545                    goto parse_flags;
3546                }
3547                /* FALLTHROUGH */
3548            case '1': case '2': case '3': case '4': /* (?1) */
3549            case '5': case '6': case '7': case '8': case '9':
3550                RExC_parse_set((char *) seqstart + 1);  /* Point to the digit */
3551              parse_recursion:
3552                {
3553                    bool is_neg = FALSE;
3554                    UV unum;
3555                    segment_parse_start = RExC_parse - 1;
3556                    if (*RExC_parse == '-') {
3557                        RExC_parse_inc_by(1);
3558                        is_neg = TRUE;
3559                    }
3560                    endptr = RExC_end;
3561                    if (grok_atoUV(RExC_parse, &unum, &endptr)
3562                        && unum <= I32_MAX
3563                    ) {
3564                        num = (I32)unum;
3565                        RExC_parse_set((char*)endptr);
3566                    }
3567                    else {  /* Overflow, or something like that.  Position
3568                               beyond all digits for the message */
3569                        while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
3570                            RExC_parse_inc_by(1);
3571                        }
3572                        vFAIL(impossible_group);
3573                    }
3574                    if (is_neg) {
3575                        /* -num is always representable on 1 and 2's complement
3576                         * machines */
3577                        num = -num;
3578                    }
3579                }
3580                if (*RExC_parse!=')')
3581                    vFAIL("Expecting close bracket");
3582
3583                if (paren == '-' || paren == '+') {
3584
3585                    /* Don't overflow */
3586                    if (UNLIKELY(I32_MAX - RExC_npar < num)) {
3587                        RExC_parse_inc_by(1);
3588                        vFAIL(impossible_group);
3589                    }
3590
3591                    /*
3592                    Diagram of capture buffer numbering.
3593                    Top line is the normal capture buffer numbers
3594                    Bottom line is the negative indexing as from
3595                    the X (the (?-2))
3596
3597                        1 2    3 4 5 X   Y      6 7
3598                       /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
3599                       /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
3600                    -   5 4    3 2 1 X   Y      x x
3601
3602                    Resolve to absolute group.  Recall that RExC_npar is +1 of
3603                    the actual parenthesis group number.  For lookahead, we
3604                    have to compensate for that.  Using the above example, when
3605                    we get to Y in the parse, num is 2 and RExC_npar is 6.  We
3606                    want 7 for +2, and 4 for -2.
3607                    */
3608                    if ( paren == '+' ) {
3609                        num--;
3610                    }
3611
3612                    num += RExC_npar;
3613
3614                    if (paren == '-' && num < 1) {
3615                        RExC_parse_inc_by(1);
3616                        vFAIL(non_existent_group_msg);
3617                    }
3618                }
3619                else
3620                if (num && num < RExC_logical_npar) {
3621                    num = RExC_logical_to_parno[num];
3622                }
3623                else
3624                if (ALL_PARENS_COUNTED) {
3625                    if (num < RExC_logical_total_parens) {
3626                        num = RExC_logical_to_parno[num];
3627                    }
3628                    else {
3629                        RExC_parse_inc_by(1);
3630                        vFAIL(non_existent_group_msg);
3631                    }
3632                }
3633                else {
3634                    REQUIRE_PARENS_PASS;
3635                }
3636
3637
3638              gen_recurse_regop:
3639                if (num >= RExC_npar) {
3640
3641                    /* It might be a forward reference; we can't fail until we
3642                     * know, by completing the parse to get all the groups, and
3643                     * then reparsing */
3644                    if (ALL_PARENS_COUNTED)  {
3645                        if (num >= RExC_total_parens) {
3646                            RExC_parse_inc_by(1);
3647                            vFAIL(non_existent_group_msg);
3648                        }
3649                    }
3650                    else {
3651                        REQUIRE_PARENS_PASS;
3652                    }
3653                }
3654
3655                /* We keep track how many GOSUB items we have produced.
3656                   To start off the ARG2i() of the GOSUB holds its "id",
3657                   which is used later in conjunction with RExC_recurse
3658                   to calculate the offset we need to jump for the GOSUB,
3659                   which it will store in the final representation.
3660                   We have to defer the actual calculation until much later
3661                   as the regop may move.
3662                 */
3663                ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
3664                RExC_recurse_count++;
3665                DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
3666                    "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
3667                            22, "|    |", (int)(depth * 2 + 1), "",
3668                            (UV)ARG1u(REGNODE_p(ret)),
3669                            (IV)ARG2i(REGNODE_p(ret))));
3670                RExC_seen |= REG_RECURSE_SEEN;
3671
3672                *flagp |= POSTPONED;
3673                assert(*RExC_parse == ')');
3674                nextchar(pRExC_state);
3675                return ret;
3676
3677            /* NOTREACHED */
3678
3679            case '?':           /* (??...) */
3680                is_logical = 1;
3681                if (*RExC_parse != '{') {
3682                    RExC_parse_inc_if_char();
3683                    /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3684                    vFAIL2utf8f(
3685                        "Sequence (%" UTF8f "...) not recognized",
3686                        UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
3687                    NOT_REACHED; /*NOTREACHED*/
3688                }
3689                *flagp |= POSTPONED;
3690                paren = '{';
3691                RExC_parse_inc_by(1);
3692                /* FALLTHROUGH */
3693            case '{':           /* (?{...}) */
3694            {
3695                U32 n = 0;
3696                struct reg_code_block *cb;
3697                OP * o;
3698
3699                RExC_seen_zerolen++;
3700
3701                if (   !pRExC_state->code_blocks
3702                    || pRExC_state->code_index
3703                                        >= pRExC_state->code_blocks->count
3704                    || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
3705                        != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
3706                            - RExC_start)
3707                ) {
3708                    if (RExC_pm_flags & PMf_USE_RE_EVAL)
3709                        FAIL("panic: Sequence (?{...}): no code block found\n");
3710                    FAIL("Eval-group not allowed at runtime, use re 'eval'");
3711                }
3712                /* this is a pre-compiled code block (?{...}) */
3713                cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
3714                RExC_parse_set(RExC_start + cb->end);
3715                o = cb->block;
3716                if (cb->src_regex) {
3717                    n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
3718                    RExC_rxi->data->data[n] =
3719                        (void*)SvREFCNT_inc((SV*)cb->src_regex);
3720                    RExC_rxi->data->data[n+1] = (void*)o;
3721                }
3722                else {
3723                    n = reg_add_data(pRExC_state,
3724                            (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
3725                    RExC_rxi->data->data[n] = (void*)o;
3726                }
3727                pRExC_state->code_index++;
3728                nextchar(pRExC_state);
3729                if (!is_optimistic)
3730                    RExC_seen |= REG_PESSIMIZE_SEEN;
3731
3732                if (is_logical) {
3733                    regnode_offset eval;
3734                    ret = reg_node(pRExC_state, LOGICAL);
3735                    FLAGS(REGNODE_p(ret)) = 2;
3736
3737                    eval = reg2node(pRExC_state, EVAL,
3738                                       n,
3739
3740                                       /* for later propagation into (??{})
3741                                        * return value */
3742                                       RExC_flags & RXf_PMf_COMPILETIME
3743                                      );
3744                    FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3745                    if (! REGTAIL(pRExC_state, ret, eval)) {
3746                        REQUIRE_BRANCHJ(flagp, 0);
3747                    }
3748                    return ret;
3749                }
3750                ret = reg2node(pRExC_state, EVAL, n, 0);
3751                FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3752
3753                return ret;
3754            }
3755            case '(':           /* (?(?{...})...) and (?(?=...)...) */
3756            {
3757                int is_define= 0;
3758                const int DEFINE_len = sizeof("DEFINE") - 1;
3759                if (    RExC_parse < RExC_end - 1
3760                    && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
3761                            && (   RExC_parse[1] == '='
3762                                || RExC_parse[1] == '!'
3763                                || RExC_parse[1] == '<'
3764                                || RExC_parse[1] == '{'))
3765                        || (       RExC_parse[0] == '*'        /* (?(*...)) */
3766                            && (   RExC_parse[1] == '{'
3767                            || (   memBEGINs(RExC_parse + 1,
3768                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3769                                         "pla:")
3770                                || memBEGINs(RExC_parse + 1,
3771                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3772                                         "plb:")
3773                                || memBEGINs(RExC_parse + 1,
3774                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3775                                         "nla:")
3776                                || memBEGINs(RExC_parse + 1,
3777                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3778                                         "nlb:")
3779                                || memBEGINs(RExC_parse + 1,
3780                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3781                                         "positive_lookahead:")
3782                                || memBEGINs(RExC_parse + 1,
3783                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3784                                         "positive_lookbehind:")
3785                                || memBEGINs(RExC_parse + 1,
3786                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3787                                         "negative_lookahead:")
3788                                || memBEGINs(RExC_parse + 1,
3789                                         (Size_t) (RExC_end - (RExC_parse + 1)),
3790                                         "negative_lookbehind:")))))
3791                ) { /* Lookahead or eval. */
3792                    I32 flag;
3793                    regnode_offset tail;
3794
3795                    ret = reg_node(pRExC_state, LOGICAL);
3796                    FLAGS(REGNODE_p(ret)) = 1;
3797
3798                    tail = reg(pRExC_state, 1, &flag, depth+1);
3799                    RETURN_FAIL_ON_RESTART(flag, flagp);
3800                    if (! REGTAIL(pRExC_state, ret, tail)) {
3801                        REQUIRE_BRANCHJ(flagp, 0);
3802                    }
3803                    goto insert_if;
3804                }
3805                else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
3806                         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
3807                {
3808                    char ch = RExC_parse[0] == '<' ? '>' : '\'';
3809                    char *name_start= RExC_parse;
3810                    RExC_parse_inc_by(1);
3811                    U32 num = 0;
3812                    SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
3813                    if (   RExC_parse == name_start
3814                        || RExC_parse >= RExC_end
3815                        || *RExC_parse != ch)
3816                    {
3817                        vFAIL2("Sequence (?(%c... not terminated",
3818                            (ch == '>' ? '<' : ch));
3819                    }
3820                    RExC_parse_inc_by(1);
3821                    if (sv_dat) {
3822                        num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
3823                        RExC_rxi->data->data[num]=(void*)sv_dat;
3824                        SvREFCNT_inc_simple_void_NN(sv_dat);
3825                    }
3826                    ret = reg1node(pRExC_state, GROUPPN, num);
3827                    goto insert_if_check_paren;
3828                }
3829                else if (memBEGINs(RExC_parse,
3830                                   (STRLEN) (RExC_end - RExC_parse),
3831                                   "DEFINE"))
3832                {
3833                    ret = reg1node(pRExC_state, DEFINEP, 0);
3834                    RExC_parse_inc_by(DEFINE_len);
3835                    is_define = 1;
3836                    goto insert_if_check_paren;
3837                }
3838                else if (RExC_parse[0] == 'R') {
3839                    RExC_parse_inc_by(1);
3840                    /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
3841                     * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
3842                     * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
3843                     */
3844                    parno = 0;
3845                    if (RExC_parse[0] == '0') {
3846                        parno = 1;
3847                        RExC_parse_inc_by(1);
3848                    }
3849                    else if (inRANGE(RExC_parse[0], '1', '9')) {
3850                        UV uv;
3851                        endptr = RExC_end;
3852                        if (grok_atoUV(RExC_parse, &uv, &endptr)
3853                            && uv <= I32_MAX
3854                        ) {
3855                            parno = (I32)uv + 1;
3856                            RExC_parse_set((char*)endptr);
3857                        }
3858                        /* else "Switch condition not recognized" below */
3859                    } else if (RExC_parse[0] == '&') {
3860                        SV *sv_dat;
3861                        RExC_parse_inc_by(1);
3862                        sv_dat = reg_scan_name(pRExC_state,
3863                                               REG_RSN_RETURN_DATA);
3864                        if (sv_dat)
3865                            parno = 1 + *((I32 *)SvPVX(sv_dat));
3866                    }
3867                    ret = reg1node(pRExC_state, INSUBP, parno);
3868                    goto insert_if_check_paren;
3869                }
3870                else if (inRANGE(RExC_parse[0], '1', '9')) {
3871                    /* (?(1)...) */
3872                    char c;
3873                    UV uv;
3874                    endptr = RExC_end;
3875                    if (grok_atoUV(RExC_parse, &uv, &endptr)
3876                        && uv <= I32_MAX
3877                    ) {
3878                        parno = (I32)uv;
3879                        RExC_parse_set((char*)endptr);
3880                    }
3881                    else {
3882                        vFAIL("panic: grok_atoUV returned FALSE");
3883                    }
3884                    ret = reg1node(pRExC_state, GROUPP, parno);
3885
3886                 insert_if_check_paren:
3887                    if (UCHARAT(RExC_parse) != ')') {
3888                        RExC_parse_inc_safe();
3889                        vFAIL("Switch condition not recognized");
3890                    }
3891                    nextchar(pRExC_state);
3892                  insert_if:
3893                    if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
3894                                                             IFTHEN, 0)))
3895                    {
3896                        REQUIRE_BRANCHJ(flagp, 0);
3897                    }
3898                    br = regbranch(pRExC_state, &flags, 1, depth+1);
3899                    if (br == 0) {
3900                        RETURN_FAIL_ON_RESTART(flags,flagp);
3901                        FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3902                              (UV) flags);
3903                    } else
3904                    if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
3905                                                             LONGJMP, 0)))
3906                    {
3907                        REQUIRE_BRANCHJ(flagp, 0);
3908                    }
3909                    c = UCHARAT(RExC_parse);
3910                    nextchar(pRExC_state);
3911                    if (flags&HASWIDTH)
3912                        *flagp |= HASWIDTH;
3913                    if (c == '|') {
3914                        if (is_define)
3915                            vFAIL("(?(DEFINE)....) does not allow branches");
3916
3917                        /* Fake one for optimizer.  */
3918                        lastbr = reg1node(pRExC_state, IFTHEN, 0);
3919
3920                        if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
3921                            RETURN_FAIL_ON_RESTART(flags, flagp);
3922                            FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3923                                  (UV) flags);
3924                        }
3925                        if (! REGTAIL(pRExC_state, ret, lastbr)) {
3926                            REQUIRE_BRANCHJ(flagp, 0);
3927                        }
3928                        if (flags&HASWIDTH)
3929                            *flagp |= HASWIDTH;
3930                        c = UCHARAT(RExC_parse);
3931                        nextchar(pRExC_state);
3932                    }
3933                    else
3934                        lastbr = 0;
3935                    if (c != ')') {
3936                        if (RExC_parse >= RExC_end)
3937                            vFAIL("Switch (?(condition)... not terminated");
3938                        else
3939                            vFAIL("Switch (?(condition)... contains too many branches");
3940                    }
3941                    ender = reg_node(pRExC_state, TAIL);
3942                    if (! REGTAIL(pRExC_state, br, ender)) {
3943                        REQUIRE_BRANCHJ(flagp, 0);
3944                    }
3945                    if (lastbr) {
3946                        if (! REGTAIL(pRExC_state, lastbr, ender)) {
3947                            REQUIRE_BRANCHJ(flagp, 0);
3948                        }
3949                        if (! REGTAIL(pRExC_state,
3950                                      REGNODE_OFFSET(
3951                                        REGNODE_AFTER(REGNODE_p(lastbr))),
3952                                      ender))
3953                        {
3954                            REQUIRE_BRANCHJ(flagp, 0);
3955                        }
3956                    }
3957                    else
3958                        if (! REGTAIL(pRExC_state, ret, ender)) {
3959                            REQUIRE_BRANCHJ(flagp, 0);
3960                        }
3961#if 0  /* Removing this doesn't cause failures in the test suite -- khw */
3962                    RExC_size++; /* XXX WHY do we need this?!!
3963                                    For large programs it seems to be required
3964                                    but I can't figure out why. -- dmq*/
3965#endif
3966                    return ret;
3967                }
3968                RExC_parse_inc_safe();
3969                vFAIL("Unknown switch condition (?(...))");
3970            }
3971            case '[':           /* (?[ ... ]) */
3972                return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
3973            case 0: /* A NUL */
3974                RExC_parse--; /* for vFAIL to print correctly */
3975                vFAIL("Sequence (? incomplete");
3976                break;
3977
3978            case ')':
3979                if (RExC_strict) {  /* [perl #132851] */
3980                    ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
3981                }
3982                /* FALLTHROUGH */
3983            case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
3984            /* FALLTHROUGH */
3985            default: /* e.g., (?i) */
3986                RExC_parse_set((char *) seqstart + 1);
3987              parse_flags:
3988                parse_lparen_question_flags(pRExC_state);
3989                if (UCHARAT(RExC_parse) != ':') {
3990                    if (RExC_parse < RExC_end)
3991                        nextchar(pRExC_state);
3992                    *flagp = TRYAGAIN;
3993                    return 0;
3994                }
3995                paren = ':';
3996                nextchar(pRExC_state);
3997                ret = 0;
3998                goto parse_rest;
3999            } /* end switch */
4000        }
4001        else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
4002          capturing_parens:
4003            parno = RExC_npar;
4004            RExC_npar++;
4005            if (RExC_npar >= U16_MAX)
4006                FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
4007
4008            logical_parno = RExC_logical_npar;
4009            RExC_logical_npar++;
4010            if (! ALL_PARENS_COUNTED) {
4011                /* If we are in our first pass through (and maybe only pass),
4012                 * we  need to allocate memory for the capturing parentheses
4013                 * data structures.
4014                 */
4015
4016                if (!RExC_parens_buf_size) {
4017                    /* first guess at number of parens we might encounter */
4018                    RExC_parens_buf_size = 10;
4019
4020                    /* setup RExC_open_parens, which holds the address of each
4021                     * OPEN tag, and to make things simpler for the 0 index the
4022                     * start of the program - this is used later for offsets */
4023                    Newxz(RExC_open_parens, RExC_parens_buf_size,
4024                            regnode_offset);
4025                    RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
4026
4027                    /* setup RExC_close_parens, which holds the address of each
4028                     * CLOSE tag, and to make things simpler for the 0 index
4029                     * the end of the program - this is used later for offsets
4030                     * */
4031                    Newxz(RExC_close_parens, RExC_parens_buf_size,
4032                            regnode_offset);
4033                    /* we don't know where end op starts yet, so we don't need to
4034                     * set RExC_close_parens[0] like we do RExC_open_parens[0]
4035                     * above */
4036
4037                    Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4038                    Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4039                }
4040                else if (RExC_npar > RExC_parens_buf_size) {
4041                    I32 old_size = RExC_parens_buf_size;
4042
4043                    RExC_parens_buf_size *= 2;
4044
4045                    Renew(RExC_open_parens, RExC_parens_buf_size,
4046                            regnode_offset);
4047                    Zero(RExC_open_parens + old_size,
4048                            RExC_parens_buf_size - old_size, regnode_offset);
4049
4050                    Renew(RExC_close_parens, RExC_parens_buf_size,
4051                            regnode_offset);
4052                    Zero(RExC_close_parens + old_size,
4053                            RExC_parens_buf_size - old_size, regnode_offset);
4054
4055                    Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4056                    Zero(RExC_logical_to_parno + old_size,
4057                         RExC_parens_buf_size - old_size, I32);
4058
4059                    Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4060                    Zero(RExC_parno_to_logical + old_size,
4061                         RExC_parens_buf_size - old_size, I32);
4062                }
4063            }
4064
4065            ret = reg1node(pRExC_state, OPEN, parno);
4066            if (!RExC_nestroot)
4067                RExC_nestroot = parno;
4068            if (RExC_open_parens && !RExC_open_parens[parno])
4069            {
4070                DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4071                    "%*s%*s Setting open paren #%" IVdf " to %zu\n",
4072                    22, "|    |", (int)(depth * 2 + 1), "",
4073                    (IV)parno, ret));
4074                RExC_open_parens[parno]= ret;
4075            }
4076            if (RExC_parno_to_logical) {
4077                RExC_parno_to_logical[parno] = logical_parno;
4078                if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
4079                    RExC_logical_to_parno[logical_parno] = parno;
4080            }
4081            is_open = 1;
4082        } else {
4083            /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
4084            paren = ':';
4085            ret = 0;
4086        }
4087    }
4088    else                        /* ! paren */
4089        ret = 0;
4090
4091   parse_rest:
4092    /* Pick up the branches, linking them together. */
4093    segment_parse_start = RExC_parse;
4094    I32 npar_before_regbranch = RExC_npar - 1;
4095    br = regbranch(pRExC_state, &flags, 1, depth+1);
4096
4097    /*     branch_len = (paren != 0); */
4098
4099    if (br == 0) {
4100        RETURN_FAIL_ON_RESTART(flags, flagp);
4101        FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4102    }
4103    if (*RExC_parse == '|') {
4104        if (RExC_use_BRANCHJ) {
4105            reginsert(pRExC_state, BRANCHJ, br, depth+1);
4106            ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
4107            ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4108        }
4109        else {
4110            reginsert(pRExC_state, BRANCH, br, depth+1);
4111            ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
4112            ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4113        }
4114        have_branch = 1;
4115    }
4116    else if (paren == ':') {
4117        *flagp |= flags&SIMPLE;
4118    }
4119    if (is_open) {				/* Starts with OPEN. */
4120        if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
4121            REQUIRE_BRANCHJ(flagp, 0);
4122        }
4123    }
4124    else if (paren != '?')		/* Not Conditional */
4125        ret = br;
4126    *flagp |= flags & (HASWIDTH | POSTPONED);
4127    lastbr = br;
4128    while (*RExC_parse == '|') {
4129        if (RExC_use_BRANCHJ) {
4130            bool shut_gcc_up;
4131
4132            ender = reg1node(pRExC_state, LONGJMP, 0);
4133
4134            /* Append to the previous. */
4135            shut_gcc_up = REGTAIL(pRExC_state,
4136                         REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
4137                         ender);
4138            PERL_UNUSED_VAR(shut_gcc_up);
4139        }
4140        nextchar(pRExC_state);
4141        if (freeze_paren) {
4142            if (RExC_logical_npar > after_freeze)
4143                after_freeze = RExC_logical_npar;
4144            RExC_logical_npar = freeze_paren;
4145        }
4146        br = regbranch(pRExC_state, &flags, 0, depth+1);
4147
4148        if (br == 0) {
4149            RETURN_FAIL_ON_RESTART(flags, flagp);
4150            FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4151        }
4152        if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
4153            REQUIRE_BRANCHJ(flagp, 0);
4154        }
4155        assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4156        assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4157        if (OP(REGNODE_p(br)) == BRANCH) {
4158            if (OP(REGNODE_p(lastbr)) == BRANCH)
4159                ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4160            else
4161                ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4162        }
4163        else
4164        if (OP(REGNODE_p(br)) == BRANCHJ) {
4165            if (OP(REGNODE_p(lastbr)) == BRANCH)
4166                ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4167            else
4168                ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4169        }
4170
4171        lastbr = br;
4172        *flagp |= flags & (HASWIDTH | POSTPONED);
4173    }
4174
4175    if (have_branch || paren != ':') {
4176        regnode * br;
4177
4178        /* Make a closing node, and hook it on the end. */
4179        switch (paren) {
4180        case ':':
4181            ender = reg_node(pRExC_state, TAIL);
4182            break;
4183        case 1: case 2:
4184            ender = reg1node(pRExC_state, CLOSE, parno);
4185            if ( RExC_close_parens ) {
4186                DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4187                        "%*s%*s Setting close paren #%" IVdf " to %zu\n",
4188                        22, "|    |", (int)(depth * 2 + 1), "",
4189                        (IV)parno, ender));
4190                RExC_close_parens[parno]= ender;
4191                if (RExC_nestroot == parno)
4192                    RExC_nestroot = 0;
4193            }
4194            break;
4195        case 's':
4196            ender = reg_node(pRExC_state, SRCLOSE);
4197            RExC_in_script_run = 0;
4198            break;
4199        /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
4200        case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
4201        case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
4202        case '<': /* (?<= ... ) */
4203        case ',': /* (?<! ... ) */
4204            *flagp &= ~HASWIDTH;
4205            ender = reg_node(pRExC_state, LOOKBEHIND_END);
4206            break;
4207        /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
4208        case 'a':
4209        case 'A':
4210        case '=':
4211        case '!':
4212            *flagp &= ~HASWIDTH;
4213            /* FALLTHROUGH */
4214        case 't':   /* aTomic */
4215        case '>':
4216            ender = reg_node(pRExC_state, SUCCEED);
4217            break;
4218        case 0:
4219            ender = reg_node(pRExC_state, END);
4220            assert(!RExC_end_op); /* there can only be one! */
4221            RExC_end_op = REGNODE_p(ender);
4222            if (RExC_close_parens) {
4223                DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4224                    "%*s%*s Setting close paren #0 (END) to %zu\n",
4225                    22, "|    |", (int)(depth * 2 + 1), "",
4226                    ender));
4227
4228                RExC_close_parens[0]= ender;
4229            }
4230            break;
4231        }
4232        DEBUG_PARSE_r({
4233            DEBUG_PARSE_MSG("lsbr");
4234            regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
4235            regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
4236            Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4237                          SvPV_nolen_const(RExC_mysv1),
4238                          (IV)lastbr,
4239                          SvPV_nolen_const(RExC_mysv2),
4240                          (IV)ender,
4241                          (IV)(ender - lastbr)
4242            );
4243        });
4244        if (OP(REGNODE_p(lastbr)) == BRANCH) {
4245            ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4246        }
4247        else
4248        if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4249            ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4250        }
4251
4252        if (! REGTAIL(pRExC_state, lastbr, ender)) {
4253            REQUIRE_BRANCHJ(flagp, 0);
4254        }
4255
4256        if (have_branch) {
4257            char is_nothing= 1;
4258            if (depth==1)
4259                RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
4260
4261            /* Hook the tails of the branches to the closing node. */
4262            for (br = REGNODE_p(ret); br; br = regnext(br)) {
4263                const U8 op = REGNODE_TYPE(OP(br));
4264                regnode *nextoper = REGNODE_AFTER(br);
4265                if (op == BRANCH) {
4266                    if (! REGTAIL_STUDY(pRExC_state,
4267                                        REGNODE_OFFSET(nextoper),
4268                                        ender))
4269                    {
4270                        REQUIRE_BRANCHJ(flagp, 0);
4271                    }
4272                    if ( OP(nextoper) != NOTHING
4273                         || regnext(nextoper) != REGNODE_p(ender))
4274                        is_nothing= 0;
4275                }
4276                else if (op == BRANCHJ) {
4277                    bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
4278                                        REGNODE_OFFSET(nextoper),
4279                                        ender);
4280                    PERL_UNUSED_VAR(shut_gcc_up);
4281                    /* for now we always disable this optimisation * /
4282                    regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
4283                    if ( OP(nopr) != NOTHING
4284                         || regnext(nopr) != REGNODE_p(ender))
4285                    */
4286                        is_nothing= 0;
4287                }
4288            }
4289            if (is_nothing) {
4290                regnode * ret_as_regnode = REGNODE_p(ret);
4291                br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
4292                               ? regnext(ret_as_regnode)
4293                               : ret_as_regnode;
4294                DEBUG_PARSE_r({
4295                    DEBUG_PARSE_MSG("NADA");
4296                    regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
4297                                     NULL, pRExC_state);
4298                    regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
4299                                     NULL, pRExC_state);
4300                    Perl_re_printf( aTHX_  "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4301                                  SvPV_nolen_const(RExC_mysv1),
4302                                  (IV)REG_NODE_NUM(ret_as_regnode),
4303                                  SvPV_nolen_const(RExC_mysv2),
4304                                  (IV)ender,
4305                                  (IV)(ender - ret)
4306                    );
4307                });
4308                OP(br)= NOTHING;
4309                if (OP(REGNODE_p(ender)) == TAIL) {
4310                    NEXT_OFF(br)= 0;
4311                    RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
4312                } else {
4313                    regnode *opt;
4314                    for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
4315                        OP(opt)= OPTIMIZED;
4316                    NEXT_OFF(br)= REGNODE_p(ender) - br;
4317                }
4318            }
4319        }
4320    }
4321
4322    {
4323        const char *p;
4324         /* Even/odd or x=don't care: 010101x10x */
4325        static const char parens[] = "=!aA<,>Bbt";
4326         /* flag below is set to 0 up through 'A'; 1 for larger */
4327
4328        if (paren && (p = strchr(parens, paren))) {
4329            U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4330            int flag = (p - parens) > 3;
4331
4332            if (paren == '>' || paren == 't') {
4333                node = SUSPEND, flag = 0;
4334            }
4335
4336            reginsert(pRExC_state, node, ret, depth+1);
4337            FLAGS(REGNODE_p(ret)) = flag;
4338            if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
4339            {
4340                REQUIRE_BRANCHJ(flagp, 0);
4341            }
4342        }
4343    }
4344
4345    /* Check for proper termination. */
4346    if (paren) {
4347        /* restore original flags, but keep (?p) and, if we've encountered
4348         * something in the parse that changes /d rules into /u, keep the /u */
4349        RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
4350        if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
4351            set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
4352        }
4353        if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
4354            RExC_parse_set(reg_parse_start);
4355            vFAIL("Unmatched (");
4356        }
4357        nextchar(pRExC_state);
4358    }
4359    else if (!paren && RExC_parse < RExC_end) {
4360        if (*RExC_parse == ')') {
4361            RExC_parse_inc_by(1);
4362            vFAIL("Unmatched )");
4363        }
4364        else
4365            FAIL("Junk on end of regexp");	/* "Can't happen". */
4366        NOT_REACHED; /* NOTREACHED */
4367    }
4368
4369    if (after_freeze > RExC_logical_npar)
4370        RExC_logical_npar = after_freeze;
4371
4372    RExC_in_lookaround = was_in_lookaround;
4373
4374    return(ret);
4375}
4376
4377/*
4378 - regbranch - one alternative of an | operator
4379 *
4380 * Implements the concatenation operator.
4381 *
4382 * On success, returns the offset at which any next node should be placed into
4383 * the regex engine program being compiled.
4384 *
4385 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
4386 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
4387 * UTF-8
4388 */
4389STATIC regnode_offset
4390S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4391{
4392    regnode_offset ret;
4393    regnode_offset chain = 0;
4394    regnode_offset latest;
4395    regnode *branch_node = NULL;
4396    I32 flags = 0, c = 0;
4397    DECLARE_AND_GET_RE_DEBUG_FLAGS;
4398
4399    PERL_ARGS_ASSERT_REGBRANCH;
4400
4401    DEBUG_PARSE("brnc");
4402
4403    if (first)
4404        ret = 0;
4405    else {
4406        if (RExC_use_BRANCHJ) {
4407            ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
4408            branch_node = REGNODE_p(ret);
4409            ARG2a_SET(branch_node, (U16)RExC_npar-1);
4410        } else {
4411            ret = reg1node(pRExC_state, BRANCH, 0);
4412            branch_node = REGNODE_p(ret);
4413            ARG1a_SET(branch_node, (U16)RExC_npar-1);
4414        }
4415    }
4416
4417    *flagp = 0;			/* Initialize. */
4418
4419    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
4420                            FALSE /* Don't force to /x */ );
4421    while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4422        flags &= ~TRYAGAIN;
4423        latest = regpiece(pRExC_state, &flags, depth+1);
4424        if (latest == 0) {
4425            if (flags & TRYAGAIN)
4426                continue;
4427            RETURN_FAIL_ON_RESTART(flags, flagp);
4428            FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
4429        }
4430        else if (ret == 0)
4431            ret = latest;
4432        *flagp |= flags&(HASWIDTH|POSTPONED);
4433        if (chain != 0) {
4434            /* FIXME adding one for every branch after the first is probably
4435             * excessive now we have TRIE support. (hv) */
4436            MARK_NAUGHTY(1);
4437            if (! REGTAIL(pRExC_state, chain, latest)) {
4438                /* XXX We could just redo this branch, but figuring out what
4439                 * bookkeeping needs to be reset is a pain, and it's likely
4440                 * that other branches that goto END will also be too large */
4441                REQUIRE_BRANCHJ(flagp, 0);
4442            }
4443        }
4444        chain = latest;
4445        c++;
4446    }
4447    if (chain == 0) {	/* Loop ran zero times. */
4448        chain = reg_node(pRExC_state, NOTHING);
4449        if (ret == 0)
4450            ret = chain;
4451    }
4452    if (c == 1) {
4453        *flagp |= flags & SIMPLE;
4454    }
4455    return ret;
4456}
4457
4458#define RBRACE  0
4459#define MIN_S   1
4460#define MIN_E   2
4461#define MAX_S   3
4462#define MAX_E   4
4463
4464#ifndef PERL_IN_XSUB_RE
4465bool
4466Perl_regcurly(const char *s, const char *e, const char * result[5])
4467{
4468    /* This function matches a {m,n} quantifier.  When called with a NULL final
4469     * argument, it simply parses the input from 's' up through 'e-1', and
4470     * returns a boolean as to whether or not this input is syntactically a
4471     * {m,n} quantifier.
4472     *
4473     * When called with a non-NULL final parameter, and when the function
4474     * returns TRUE, it additionally stores information into the array
4475     * specified by that parameter about what it found in the parse.  The
4476     * parameter must be a pointer into a 5 element array of 'const char *'
4477     * elements.  The returned information is as follows:
4478     *   result[RBRACE]  points to the closing brace
4479     *   result[MIN_S]   points to the first byte of the lower bound
4480     *   result[MIN_E]   points to one beyond the final byte of the lower bound
4481     *   result[MAX_S]   points to the first byte of the upper bound
4482     *   result[MAX_E]   points to one beyond the final byte of the upper bound
4483     *
4484     * If the quantifier is of the form {m,} (meaning an infinite upper
4485     * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
4486     * to is irrelevant, just that it's the same place
4487     *
4488     * If instead the quantifier is of the form {m} there is actually only
4489     * one bound, and both the upper and lower result[] elements are set to
4490     * point to it.
4491     *
4492     * This function checks only for syntactic validity; it leaves checking for
4493     * semantic validity and raising any diagnostics to the caller.  This
4494     * function is called in multiple places to check for syntax, but only from
4495     * one for semantics.  It makes it as simple as possible for the
4496     * syntax-only callers, while furnishing just enough information for the
4497     * semantic caller.
4498     */
4499
4500    const char * min_start = NULL;
4501    const char * max_start = NULL;
4502    const char * min_end = NULL;
4503    const char * max_end = NULL;
4504
4505    bool has_comma = FALSE;
4506
4507    PERL_ARGS_ASSERT_REGCURLY;
4508
4509    if (s >= e || *s++ != '{')
4510        return FALSE;
4511
4512    while (s < e && isBLANK(*s)) {
4513        s++;
4514    }
4515
4516    if isDIGIT(*s) {
4517        min_start = s;
4518        do {
4519            s++;
4520        } while (s < e && isDIGIT(*s));
4521        min_end = s;
4522    }
4523
4524    while (s < e && isBLANK(*s)) {
4525        s++;
4526    }
4527
4528    if (*s == ',') {
4529        has_comma = TRUE;
4530        s++;
4531
4532        while (s < e && isBLANK(*s)) {
4533            s++;
4534        }
4535
4536        if isDIGIT(*s) {
4537            max_start = s;
4538            do {
4539                s++;
4540            } while (s < e && isDIGIT(*s));
4541            max_end = s;
4542        }
4543    }
4544
4545    while (s < e && isBLANK(*s)) {
4546        s++;
4547    }
4548                               /* Need at least one number */
4549    if (s >= e || *s != '}' || (! min_start && ! max_end)) {
4550        return FALSE;
4551    }
4552
4553    if (result) {
4554
4555        result[RBRACE] = s;
4556
4557        result[MIN_S] = min_start;
4558        result[MIN_E] = min_end;
4559        if (has_comma) {
4560            if (max_start) {
4561                result[MAX_S] = max_start;
4562                result[MAX_E] = max_end;
4563            }
4564            else {
4565                /* Having no value after the comma is signalled by setting
4566                 * start and end to the same value.  What that value is isn't
4567                 * relevant; NULL is chosen simply because it will fail if the
4568                 * caller mistakenly uses it */
4569                result[MAX_S] = result[MAX_E] = NULL;
4570            }
4571        }
4572        else {  /* No comma means lower and upper bounds are the same */
4573            result[MAX_S] = min_start;
4574            result[MAX_E] = min_end;
4575        }
4576    }
4577
4578    return TRUE;
4579}
4580#endif
4581
4582U32
4583S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
4584                       const char * start, const char * end)
4585{
4586    /* This is a helper function for regpiece() to compute, given the
4587     * quantifier {m,n}, the value of either m or n, based on the starting
4588     * position 'start' in the string, through the byte 'end-1', returning it
4589     * if valid, and failing appropriately if not.  It knows the restrictions
4590     * imposed on quantifier values */
4591
4592    UV uv;
4593    STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
4594
4595    PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
4596
4597    if (grok_atoUV(start, &uv, &end)) {
4598        if (uv < REG_INFTY) {   /* A valid, small-enough number */
4599            return (U32) uv;
4600        }
4601    }
4602    else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
4603                                 leading zeros or overflow */
4604        RExC_parse_set((char * ) end);
4605
4606        /* Perhaps too generic a msg for what is only failure from having
4607         * leading zeros, but this is how it's always behaved. */
4608        vFAIL("Invalid quantifier in {,}");
4609        NOT_REACHED; /*NOTREACHED*/
4610    }
4611
4612    /* Here, found a quantifier, but was too large; either it overflowed or was
4613     * too big a legal number */
4614    RExC_parse_set((char * ) end);
4615    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4616
4617    NOT_REACHED; /*NOTREACHED*/
4618    return U32_MAX; /* Perhaps some compilers will be expecting a return */
4619}
4620
4621/*
4622 - regpiece - something followed by possible quantifier * + ? {n,m}
4623 *
4624 * Note that the branching code sequences used for ? and the general cases
4625 * of * and + are somewhat optimized:  they use the same NOTHING node as
4626 * both the endmarker for their branch list and the body of the last branch.
4627 * It might seem that this node could be dispensed with entirely, but the
4628 * endmarker role is not redundant.
4629 *
4630 * On success, returns the offset at which any next node should be placed into
4631 * the regex engine program being compiled.
4632 *
4633 * Returns 0 otherwise, with *flagp set to indicate why:
4634 *  TRYAGAIN        if regatom() returns 0 with TRYAGAIN.
4635 *  RESTART_PARSE   if the parse needs to be restarted, or'd with
4636 *                  NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
4637 */
4638STATIC regnode_offset
4639S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4640{
4641    regnode_offset ret;
4642    char op;
4643    I32 flags;
4644    const char * const origparse = RExC_parse;
4645    I32 min;
4646    I32 max = REG_INFTY;
4647    I32 npar_before = RExC_npar-1;
4648
4649    /* Save the original in case we change the emitted regop to a FAIL. */
4650    const regnode_offset orig_emit = RExC_emit;
4651
4652    DECLARE_AND_GET_RE_DEBUG_FLAGS;
4653
4654    PERL_ARGS_ASSERT_REGPIECE;
4655
4656    DEBUG_PARSE("piec");
4657
4658    ret = regatom(pRExC_state, &flags, depth+1);
4659    if (ret == 0) {
4660        RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
4661        FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
4662    }
4663    I32 npar_after = RExC_npar-1;
4664
4665    op = *RExC_parse;
4666    switch (op) {
4667        const char * regcurly_return[5];
4668
4669      case '*':
4670        nextchar(pRExC_state);
4671        min = 0;
4672        break;
4673
4674      case '+':
4675        nextchar(pRExC_state);
4676        min = 1;
4677        break;
4678
4679      case '?':
4680        nextchar(pRExC_state);
4681        min = 0; max = 1;
4682        break;
4683
4684      case '{':  /* A '{' may or may not indicate a quantifier; call regcurly()
4685                    to determine which */
4686        if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
4687            const char * min_start = regcurly_return[MIN_S];
4688            const char * min_end   = regcurly_return[MIN_E];
4689            const char * max_start = regcurly_return[MAX_S];
4690            const char * max_end   = regcurly_return[MAX_E];
4691
4692            if (min_start) {
4693                min = get_quantifier_value(pRExC_state, min_start, min_end);
4694            }
4695            else {
4696                min = 0;
4697            }
4698
4699            if (max_start == max_end) {     /* Was of the form {m,} */
4700                max = REG_INFTY;
4701            }
4702            else if (max_start == min_start) {  /* Was of the form {m} */
4703                max = min;
4704            }
4705            else {  /* Was of the form {m,n} */
4706                assert(max_end >= max_start);
4707
4708                max = get_quantifier_value(pRExC_state, max_start, max_end);
4709            }
4710
4711            RExC_parse_set((char *) regcurly_return[RBRACE]);
4712            nextchar(pRExC_state);
4713
4714            if (max < min) {    /* If can't match, warn and optimize to fail
4715                                   unconditionally */
4716                reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
4717                ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
4718                NEXT_OFF(REGNODE_p(orig_emit)) =
4719                                    REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
4720                return ret;
4721            }
4722            else if (min == max && *RExC_parse == '?') {
4723                ckWARN2reg(RExC_parse + 1,
4724                           "Useless use of greediness modifier '%c'",
4725                           *RExC_parse);
4726            }
4727
4728            break;
4729        } /* End of is {m,n} */
4730
4731        /* Here was a '{', but what followed it didn't form a quantifier. */
4732        /* FALLTHROUGH */
4733
4734      default:
4735        *flagp = flags;
4736        return(ret);
4737        NOT_REACHED; /*NOTREACHED*/
4738    }
4739
4740    /* Here we have a quantifier, and have calculated 'min' and 'max'.
4741     *
4742     * Check and possibly adjust a zero width operand */
4743    if (! (flags & (HASWIDTH|POSTPONED))) {
4744        if (max > REG_INFTY/3) {
4745            ckWARN2reg(RExC_parse,
4746                       "%" UTF8f " matches null string many times",
4747                       UTF8fARG(UTF, (RExC_parse >= origparse
4748                                     ? RExC_parse - origparse
4749                                     : 0),
4750                       origparse));
4751        }
4752
4753        /* There's no point in trying to match something 0 length more than
4754         * once except for extra side effects, which we don't have here since
4755         * not POSTPONED */
4756        if (max > 1) {
4757            max = 1;
4758            if (min > max) {
4759                min = max;
4760            }
4761        }
4762    }
4763
4764    /* If this is a code block pass it up */
4765    *flagp |= (flags & POSTPONED);
4766
4767    if (max > 0) {
4768        *flagp |= (flags & HASWIDTH);
4769        if (max == REG_INFTY)
4770            RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
4771    }
4772
4773    /* 'SIMPLE' operands don't require full generality */
4774    if ((flags&SIMPLE)) {
4775        if (max == REG_INFTY) {
4776            if (min == 0) {
4777                if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
4778                    goto min0_maxINF_wildcard_forbidden;
4779                }
4780
4781                reginsert(pRExC_state, STAR, ret, depth+1);
4782                MARK_NAUGHTY(4);
4783                goto done_main_op;
4784            }
4785            else if (min == 1) {
4786                reginsert(pRExC_state, PLUS, ret, depth+1);
4787                MARK_NAUGHTY(3);
4788                goto done_main_op;
4789            }
4790        }
4791
4792        /* Here, SIMPLE, but not the '*' and '+' special cases */
4793
4794        MARK_NAUGHTY_EXP(2, 2);
4795        reginsert(pRExC_state, CURLY, ret, depth+1);
4796    }
4797    else {  /* not SIMPLE */
4798        const regnode_offset w = reg_node(pRExC_state, WHILEM);
4799
4800        FLAGS(REGNODE_p(w)) = 0;
4801        if (!  REGTAIL(pRExC_state, ret, w)) {
4802            REQUIRE_BRANCHJ(flagp, 0);
4803        }
4804        if (RExC_use_BRANCHJ) {
4805            reginsert(pRExC_state, LONGJMP, ret, depth+1);
4806            reginsert(pRExC_state, NOTHING, ret, depth+1);
4807            REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
4808        }
4809        reginsert(pRExC_state, CURLYX, ret, depth+1);
4810        if (RExC_use_BRANCHJ)
4811            /* Go over NOTHING to LONGJMP. */
4812            REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
4813
4814        if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
4815                                                  NOTHING)))
4816        {
4817            REQUIRE_BRANCHJ(flagp, 0);
4818        }
4819        RExC_whilem_seen++;
4820        MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
4821    }
4822
4823    /* Finish up the CURLY/CURLYX case */
4824    FLAGS(REGNODE_p(ret)) = 0;
4825
4826    ARG1i_SET(REGNODE_p(ret), min);
4827    ARG2i_SET(REGNODE_p(ret), max);
4828
4829    /* if we had a npar_after then we need to increment npar_before,
4830     * we want to track the range of parens we need to reset each iteration
4831     */
4832    if (npar_after!=npar_before) {
4833        ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
4834        ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
4835    } else {
4836        ARG3a_SET(REGNODE_p(ret), 0);
4837        ARG3b_SET(REGNODE_p(ret), 0);
4838    }
4839
4840  done_main_op:
4841
4842    /* Process any greediness modifiers */
4843    if (*RExC_parse == '?') {
4844        nextchar(pRExC_state);
4845        reginsert(pRExC_state, MINMOD, ret, depth+1);
4846        if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
4847            REQUIRE_BRANCHJ(flagp, 0);
4848        }
4849    }
4850    else if (*RExC_parse == '+') {
4851        regnode_offset ender;
4852        nextchar(pRExC_state);
4853        ender = reg_node(pRExC_state, SUCCEED);
4854        if (! REGTAIL(pRExC_state, ret, ender)) {
4855            REQUIRE_BRANCHJ(flagp, 0);
4856        }
4857        reginsert(pRExC_state, SUSPEND, ret, depth+1);
4858        ender = reg_node(pRExC_state, TAIL);
4859        if (! REGTAIL(pRExC_state, ret, ender)) {
4860            REQUIRE_BRANCHJ(flagp, 0);
4861        }
4862    }
4863
4864    /* Forbid extra quantifiers */
4865    if (isQUANTIFIER(RExC_parse, RExC_end)) {
4866        RExC_parse_inc_by(1);
4867        vFAIL("Nested quantifiers");
4868    }
4869
4870    return(ret);
4871
4872  min0_maxINF_wildcard_forbidden:
4873
4874    /* Here we are in a wildcard match, and the minimum match length is 0, and
4875     * the max could be infinity.  This is currently forbidden.  The only
4876     * reason is to make it harder to write patterns that take a long long time
4877     * to halt, and because the use of this construct isn't necessary in
4878     * matching Unicode property values */
4879    RExC_parse_inc_by(1);
4880    /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
4881       subpatterns in regex; marked by <-- HERE in m/%s/
4882     */
4883    vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
4884          " subpatterns");
4885
4886    /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
4887     * legal at all in wildcards, so can't get this far */
4888
4889    NOT_REACHED; /*NOTREACHED*/
4890}
4891
4892STATIC bool
4893S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
4894                regnode_offset * node_p,
4895                UV * code_point_p,
4896                int * cp_count,
4897                I32 * flagp,
4898                const bool strict,
4899                const U32 depth
4900    )
4901{
4902 /* This routine teases apart the various meanings of \N and returns
4903  * accordingly.  The input parameters constrain which meaning(s) is/are valid
4904  * in the current context.
4905  *
4906  * Exactly one of <node_p> and <code_point_p> must be non-NULL.
4907  *
4908  * If <code_point_p> is not NULL, the context is expecting the result to be a
4909  * single code point.  If this \N instance turns out to a single code point,
4910  * the function returns TRUE and sets *code_point_p to that code point.
4911  *
4912  * If <node_p> is not NULL, the context is expecting the result to be one of
4913  * the things representable by a regnode.  If this \N instance turns out to be
4914  * one such, the function generates the regnode, returns TRUE and sets *node_p
4915  * to point to the offset of that regnode into the regex engine program being
4916  * compiled.
4917  *
4918  * If this instance of \N isn't legal in any context, this function will
4919  * generate a fatal error and not return.
4920  *
4921  * On input, RExC_parse should point to the first char following the \N at the
4922  * time of the call.  On successful return, RExC_parse will have been updated
4923  * to point to just after the sequence identified by this routine.  Also
4924  * *flagp has been updated as needed.
4925  *
4926  * When there is some problem with the current context and this \N instance,
4927  * the function returns FALSE, without advancing RExC_parse, nor setting
4928  * *node_p, nor *code_point_p, nor *flagp.
4929  *
4930  * If <cp_count> is not NULL, the caller wants to know the length (in code
4931  * points) that this \N sequence matches.  This is set, and the input is
4932  * parsed for errors, even if the function returns FALSE, as detailed below.
4933  *
4934  * There are 6 possibilities here, as detailed in the next 6 paragraphs.
4935  *
4936  * Probably the most common case is for the \N to specify a single code point.
4937  * *cp_count will be set to 1, and *code_point_p will be set to that code
4938  * point.
4939  *
4940  * Another possibility is for the input to be an empty \N{}.  This is no
4941  * longer accepted, and will generate a fatal error.
4942  *
4943  * Another possibility is for a custom charnames handler to be in effect which
4944  * translates the input name to an empty string.  *cp_count will be set to 0.
4945  * *node_p will be set to a generated NOTHING node.
4946  *
4947  * Still another possibility is for the \N to mean [^\n]. *cp_count will be
4948  * set to 0. *node_p will be set to a generated REG_ANY node.
4949  *
4950  * The fifth possibility is that \N resolves to a sequence of more than one
4951  * code points.  *cp_count will be set to the number of code points in the
4952  * sequence. *node_p will be set to a generated node returned by this
4953  * function calling S_reg().
4954  *
4955  * The sixth and final possibility is that it is premature to be calling this
4956  * function; the parse needs to be restarted.  This can happen when this
4957  * changes from /d to /u rules, or when the pattern needs to be upgraded to
4958  * UTF-8.  The latter occurs only when the fifth possibility would otherwise
4959  * be in effect, and is because one of those code points requires the pattern
4960  * to be recompiled as UTF-8.  The function returns FALSE, and sets the
4961  * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate.  When this
4962  * happens, the caller needs to desist from continuing parsing, and return
4963  * this information to its caller.  This is not set for when there is only one
4964  * code point, as this can be called as part of an ANYOF node, and they can
4965  * store above-Latin1 code points without the pattern having to be in UTF-8.
4966  *
4967  * For non-single-quoted regexes, the tokenizer has resolved character and
4968  * sequence names inside \N{...} into their Unicode values, normalizing the
4969  * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
4970  * hex-represented code points in the sequence.  This is done there because
4971  * the names can vary based on what charnames pragma is in scope at the time,
4972  * so we need a way to take a snapshot of what they resolve to at the time of
4973  * the original parse. [perl #56444].
4974  *
4975  * That parsing is skipped for single-quoted regexes, so here we may get
4976  * '\N{NAME}', which is parsed now.  If the single-quoted regex is something
4977  * like '\N{U+41}', that code point is Unicode, and has to be translated into
4978  * the native character set for non-ASCII platforms.  The other possibilities
4979  * are already native, so no translation is done. */
4980
4981    char * endbrace;    /* points to '}' following the name */
4982    char * e;           /* points to final non-blank before endbrace */
4983    char* p = RExC_parse; /* Temporary */
4984
4985    SV * substitute_parse = NULL;
4986    char *orig_end;
4987    char *save_start;
4988    I32 flags;
4989
4990    DECLARE_AND_GET_RE_DEBUG_FLAGS;
4991
4992    PERL_ARGS_ASSERT_GROK_BSLASH_N;
4993
4994    assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
4995    assert(! (node_p && cp_count));               /* At most 1 should be set */
4996
4997    if (cp_count) {     /* Initialize return for the most common case */
4998        *cp_count = 1;
4999    }
5000
5001    /* The [^\n] meaning of \N ignores spaces and comments under the /x
5002     * modifier.  The other meanings do not (except blanks adjacent to and
5003     * within the braces), so use a temporary until we find out which we are
5004     * being called with */
5005    skip_to_be_ignored_text(pRExC_state, &p,
5006                            FALSE /* Don't force to /x */ );
5007
5008    /* Disambiguate between \N meaning a named character versus \N meaning
5009     * [^\n].  The latter is assumed when the {...} following the \N is a legal
5010     * quantifier, or if there is no '{' at all */
5011    if (*p != '{' || regcurly(p, RExC_end, NULL)) {
5012        RExC_parse_set(p);
5013        if (cp_count) {
5014            *cp_count = -1;
5015        }
5016
5017        if (! node_p) {
5018            return FALSE;
5019        }
5020
5021        *node_p = reg_node(pRExC_state, REG_ANY);
5022        *flagp |= HASWIDTH|SIMPLE;
5023        MARK_NAUGHTY(1);
5024        return TRUE;
5025    }
5026
5027    /* The test above made sure that the next real character is a '{', but
5028     * under the /x modifier, it could be separated by space (or a comment and
5029     * \n) and this is not allowed (for consistency with \x{...} and the
5030     * tokenizer handling of \N{NAME}). */
5031    if (*RExC_parse != '{') {
5032        vFAIL("Missing braces on \\N{}");
5033    }
5034
5035    RExC_parse_inc_by(1);       /* Skip past the '{' */
5036
5037    endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
5038    if (! endbrace) { /* no trailing brace */
5039        vFAIL2("Missing right brace on \\%c{}", 'N');
5040    }
5041
5042    /* Here, we have decided it should be a named character or sequence.  These
5043     * imply Unicode semantics */
5044    REQUIRE_UNI_RULES(flagp, FALSE);
5045
5046    /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
5047     * nothing at all (not allowed under strict) */
5048    if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
5049        RExC_parse_set(endbrace);
5050        if (strict) {
5051            RExC_parse_inc_by(1);   /* Position after the "}" */
5052            vFAIL("Zero length \\N{}");
5053        }
5054
5055        if (cp_count) {
5056            *cp_count = 0;
5057        }
5058        nextchar(pRExC_state);
5059        if (! node_p) {
5060            return FALSE;
5061        }
5062
5063        *node_p = reg_node(pRExC_state, NOTHING);
5064        return TRUE;
5065    }
5066
5067    while (isBLANK(*RExC_parse)) {
5068        RExC_parse_inc_by(1);
5069    }
5070
5071    e = endbrace;
5072    while (RExC_parse < e && isBLANK(*(e-1))) {
5073        e--;
5074    }
5075
5076    if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
5077
5078        /* Here, the name isn't of the form  U+....  This can happen if the
5079         * pattern is single-quoted, so didn't get evaluated in toke.c.  Now
5080         * is the time to find out what the name means */
5081
5082        const STRLEN name_len = e - RExC_parse;
5083        SV *  value_sv;     /* What does this name evaluate to */
5084        SV ** value_svp;
5085        const U8 * value;   /* string of name's value */
5086        STRLEN value_len;   /* and its length */
5087
5088        /*  RExC_unlexed_names is a hash of names that weren't evaluated by
5089         *  toke.c, and their values. Make sure is initialized */
5090        if (! RExC_unlexed_names) {
5091            RExC_unlexed_names = newHV();
5092        }
5093
5094        /* If we have already seen this name in this pattern, use that.  This
5095         * allows us to only call the charnames handler once per name per
5096         * pattern.  A broken or malicious handler could return something
5097         * different each time, which could cause the results to vary depending
5098         * on if something gets added or subtracted from the pattern that
5099         * causes the number of passes to change, for example */
5100        if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
5101                                                      name_len, 0)))
5102        {
5103            value_sv = *value_svp;
5104        }
5105        else { /* Otherwise we have to go out and get the name */
5106            const char * error_msg = NULL;
5107            value_sv = get_and_check_backslash_N_name(RExC_parse, e,
5108                                                      UTF,
5109                                                      &error_msg);
5110            if (error_msg) {
5111                RExC_parse_set(endbrace);
5112                vFAIL(error_msg);
5113            }
5114
5115            /* If no error message, should have gotten a valid return */
5116            assert (value_sv);
5117
5118            /* Save the name's meaning for later use */
5119            if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
5120                           value_sv, 0))
5121            {
5122                Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
5123            }
5124        }
5125
5126        /* Here, we have the value the name evaluates to in 'value_sv' */
5127        value = (U8 *) SvPV(value_sv, value_len);
5128
5129        /* See if the result is one code point vs 0 or multiple */
5130        if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
5131                                  ? UTF8SKIP(value)
5132                                  : 1)))
5133        {
5134            /* Here, exactly one code point.  If that isn't what is wanted,
5135             * fail */
5136            if (! code_point_p) {
5137                RExC_parse_set(p);
5138                return FALSE;
5139            }
5140
5141            /* Convert from string to numeric code point */
5142            *code_point_p = (SvUTF8(value_sv))
5143                            ? valid_utf8_to_uvchr(value, NULL)
5144                            : *value;
5145
5146            /* Have parsed this entire single code point \N{...}.  *cp_count
5147             * has already been set to 1, so don't do it again. */
5148            RExC_parse_set(endbrace);
5149            nextchar(pRExC_state);
5150            return TRUE;
5151        } /* End of is a single code point */
5152
5153        /* Count the code points, if caller desires.  The API says to do this
5154         * even if we will later return FALSE */
5155        if (cp_count) {
5156            *cp_count = 0;
5157
5158            *cp_count = (SvUTF8(value_sv))
5159                        ? utf8_length(value, value + value_len)
5160                        : value_len;
5161        }
5162
5163        /* Fail if caller doesn't want to handle a multi-code-point sequence.
5164         * But don't back the pointer up if the caller wants to know how many
5165         * code points there are (they need to handle it themselves in this
5166         * case).  */
5167        if (! node_p) {
5168            if (! cp_count) {
5169                RExC_parse_set(p);
5170            }
5171            return FALSE;
5172        }
5173
5174        /* Convert this to a sub-pattern of the form "(?: ... )", and then call
5175         * reg recursively to parse it.  That way, it retains its atomicness,
5176         * while not having to worry about any special handling that some code
5177         * points may have. */
5178
5179        substitute_parse = newSVpvs("?:");
5180        sv_catsv(substitute_parse, value_sv);
5181        sv_catpv(substitute_parse, ")");
5182
5183        /* The value should already be native, so no need to convert on EBCDIC
5184         * platforms.*/
5185        assert(! RExC_recode_x_to_native);
5186
5187    }
5188    else {   /* \N{U+...} */
5189        Size_t count = 0;   /* code point count kept internally */
5190
5191        /* We can get to here when the input is \N{U+...} or when toke.c has
5192         * converted a name to the \N{U+...} form.  This include changing a
5193         * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
5194
5195        RExC_parse_inc_by(2);    /* Skip past the 'U+' */
5196
5197        /* Code points are separated by dots.  The '}' terminates the whole
5198         * thing. */
5199
5200        do {    /* Loop until the ending brace */
5201            I32 flags = PERL_SCAN_SILENT_OVERFLOW
5202                      | PERL_SCAN_SILENT_ILLDIGIT
5203                      | PERL_SCAN_NOTIFY_ILLDIGIT
5204                      | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
5205                      | PERL_SCAN_DISALLOW_PREFIX;
5206            STRLEN len = e - RExC_parse;
5207            NV overflow_value;
5208            char * start_digit = RExC_parse;
5209            UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
5210
5211            if (len == 0) {
5212                RExC_parse_inc_by(1);
5213              bad_NU:
5214                vFAIL("Invalid hexadecimal number in \\N{U+...}");
5215            }
5216
5217            RExC_parse_inc_by(len);
5218
5219            if (cp > MAX_LEGAL_CP) {
5220                vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
5221            }
5222
5223            if (RExC_parse >= e) { /* Got to the closing '}' */
5224                if (count) {
5225                    goto do_concat;
5226                }
5227
5228                /* Here, is a single code point; fail if doesn't want that */
5229                if (! code_point_p) {
5230                    RExC_parse_set(p);
5231                    return FALSE;
5232                }
5233
5234                /* A single code point is easy to handle; just return it */
5235                *code_point_p = UNI_TO_NATIVE(cp);
5236                RExC_parse_set(endbrace);
5237                nextchar(pRExC_state);
5238                return TRUE;
5239            }
5240
5241            /* Here, the parse stopped bfore the ending brace.  This is legal
5242             * only if that character is a dot separating code points, like a
5243             * multiple character sequence (of the form "\N{U+c1.c2. ... }".
5244             * So the next character must be a dot (and the one after that
5245             * can't be the ending brace, or we'd have something like
5246             * \N{U+100.} )
5247             * */
5248            if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
5249                /*point to after 1st invalid */
5250                RExC_parse_incf(RExC_orig_utf8);
5251                /*Guard against malformed utf8*/
5252                RExC_parse_set(MIN(e, RExC_parse));
5253                goto bad_NU;
5254            }
5255
5256            /* Here, looks like its really a multiple character sequence.  Fail
5257             * if that's not what the caller wants.  But continue with counting
5258             * and error checking if they still want a count */
5259            if (! node_p && ! cp_count) {
5260                return FALSE;
5261            }
5262
5263            /* What is done here is to convert this to a sub-pattern of the
5264             * form \x{char1}\x{char2}...  and then call reg recursively to
5265             * parse it (enclosing in "(?: ... )" ).  That way, it retains its
5266             * atomicness, while not having to worry about special handling
5267             * that some code points may have.  We don't create a subpattern,
5268             * but go through the motions of code point counting and error
5269             * checking, if the caller doesn't want a node returned. */
5270
5271            if (node_p && ! substitute_parse) {
5272                substitute_parse = newSVpvs("?:");
5273            }
5274
5275          do_concat:
5276
5277            if (node_p) {
5278                /* Convert to notation the rest of the code understands */
5279                sv_catpvs(substitute_parse, "\\x{");
5280                sv_catpvn(substitute_parse, start_digit,
5281                                            RExC_parse - start_digit);
5282                sv_catpvs(substitute_parse, "}");
5283            }
5284
5285            /* Move to after the dot (or ending brace the final time through.)
5286             * */
5287            RExC_parse_inc_by(1);
5288            count++;
5289
5290        } while (RExC_parse < e);
5291
5292        if (! node_p) { /* Doesn't want the node */
5293            assert (cp_count);
5294
5295            *cp_count = count;
5296            return FALSE;
5297        }
5298
5299        sv_catpvs(substitute_parse, ")");
5300
5301        /* The values are Unicode, and therefore have to be converted to native
5302         * on a non-Unicode (meaning non-ASCII) platform. */
5303        SET_recode_x_to_native(1);
5304    }
5305
5306    /* Here, we have the string the name evaluates to, ready to be parsed,
5307     * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
5308     * constructs.  This can be called from within a substitute parse already.
5309     * The error reporting mechanism doesn't work for 2 levels of this, but the
5310     * code above has validated this new construct, so there should be no
5311     * errors generated by the below.  And this isn't an exact copy, so the
5312     * mechanism to seamlessly deal with this won't work, so turn off warnings
5313     * during it */
5314    save_start = RExC_start;
5315    orig_end = RExC_end;
5316
5317    RExC_start = SvPVX(substitute_parse);
5318    RExC_parse_set(RExC_start);
5319    RExC_end = RExC_parse + SvCUR(substitute_parse);
5320    TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
5321
5322    *node_p = reg(pRExC_state, 1, &flags, depth+1);
5323
5324    /* Restore the saved values */
5325    RESTORE_WARNINGS;
5326    RExC_start = save_start;
5327    RExC_parse_set(endbrace);
5328    RExC_end = orig_end;
5329    SET_recode_x_to_native(0);
5330
5331    SvREFCNT_dec_NN(substitute_parse);
5332
5333    if (! *node_p) {
5334        RETURN_FAIL_ON_RESTART(flags, flagp);
5335        FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
5336            (UV) flags);
5337    }
5338    *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5339
5340    nextchar(pRExC_state);
5341
5342    return TRUE;
5343}
5344
5345
5346STATIC U8
5347S_compute_EXACTish(RExC_state_t *pRExC_state)
5348{
5349    U8 op;
5350
5351    PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
5352
5353    if (! FOLD) {
5354        return (LOC)
5355                ? EXACTL
5356                : EXACT;
5357    }
5358
5359    op = get_regex_charset(RExC_flags);
5360    if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
5361        op--; /* /a is same as /u, and map /aa's offset to what /a's would have
5362                 been, so there is no hole */
5363    }
5364
5365    return op + EXACTF;
5366}
5367
5368/* Parse backref decimal value, unless it's too big to sensibly be a backref,
5369 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
5370
5371static I32
5372S_backref_value(char *p, char *e)
5373{
5374    const char* endptr = e;
5375    UV val;
5376    if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
5377        return (I32)val;
5378    return I32_MAX;
5379}
5380
5381
5382/*
5383 - regatom - the lowest level
5384
5385   Try to identify anything special at the start of the current parse position.
5386   If there is, then handle it as required. This may involve generating a
5387   single regop, such as for an assertion; or it may involve recursing, such as
5388   to handle a () structure.
5389
5390   If the string doesn't start with something special then we gobble up
5391   as much literal text as we can.  If we encounter a quantifier, we have to
5392   back off the final literal character, as that quantifier applies to just it
5393   and not to the whole string of literals.
5394
5395   Once we have been able to handle whatever type of thing started the
5396   sequence, we return the offset into the regex engine program being compiled
5397   at which any  next regnode should be placed.
5398
5399   Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
5400   Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
5401   restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
5402   Otherwise does not return 0.
5403
5404   Note: we have to be careful with escapes, as they can be both literal
5405   and special, and in the case of \10 and friends, context determines which.
5406
5407   A summary of the code structure is:
5408
5409   switch (first_byte) {
5410        cases for each special:
5411            handle this special;
5412            break;
5413        case '\\':
5414            switch (2nd byte) {
5415                cases for each unambiguous special:
5416                    handle this special;
5417                    break;
5418                cases for each ambiguous special/literal:
5419                    disambiguate;
5420                    if (special)  handle here
5421                    else goto defchar;
5422                default: // unambiguously literal:
5423                    goto defchar;
5424            }
5425        default:  // is a literal char
5426            // FALL THROUGH
5427        defchar:
5428            create EXACTish node for literal;
5429            while (more input and node isn't full) {
5430                switch (input_byte) {
5431                   cases for each special;
5432                       make sure parse pointer is set so that the next call to
5433                           regatom will see this special first
5434                       goto loopdone; // EXACTish node terminated by prev. char
5435                   default:
5436                       append char to EXACTISH node;
5437                }
5438                get next input byte;
5439            }
5440        loopdone:
5441   }
5442   return the generated node;
5443
5444   Specifically there are two separate switches for handling
5445   escape sequences, with the one for handling literal escapes requiring
5446   a dummy entry for all of the special escapes that are actually handled
5447   by the other.
5448
5449*/
5450
5451STATIC regnode_offset
5452S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5453{
5454    regnode_offset ret = 0;
5455    I32 flags = 0;
5456    char *atom_parse_start;
5457    U8 op;
5458    int invert = 0;
5459
5460    DECLARE_AND_GET_RE_DEBUG_FLAGS;
5461
5462    *flagp = 0;		/* Initialize. */
5463
5464    DEBUG_PARSE("atom");
5465
5466    PERL_ARGS_ASSERT_REGATOM;
5467
5468  tryagain:
5469    atom_parse_start = RExC_parse;
5470    assert(RExC_parse < RExC_end);
5471    switch ((U8)*RExC_parse) {
5472    case '^':
5473        RExC_seen_zerolen++;
5474        nextchar(pRExC_state);
5475        if (RExC_flags & RXf_PMf_MULTILINE)
5476            ret = reg_node(pRExC_state, MBOL);
5477        else
5478            ret = reg_node(pRExC_state, SBOL);
5479        break;
5480    case '$':
5481        nextchar(pRExC_state);
5482        if (*RExC_parse)
5483            RExC_seen_zerolen++;
5484        if (RExC_flags & RXf_PMf_MULTILINE)
5485            ret = reg_node(pRExC_state, MEOL);
5486        else
5487            ret = reg_node(pRExC_state, SEOL);
5488        break;
5489    case '.':
5490        nextchar(pRExC_state);
5491        if (RExC_flags & RXf_PMf_SINGLELINE)
5492            ret = reg_node(pRExC_state, SANY);
5493        else
5494            ret = reg_node(pRExC_state, REG_ANY);
5495        *flagp |= HASWIDTH|SIMPLE;
5496        MARK_NAUGHTY(1);
5497        break;
5498    case '[':
5499    {
5500        char * const cc_parse_start = ++RExC_parse;
5501        ret = regclass(pRExC_state, flagp, depth+1,
5502                       FALSE, /* means parse the whole char class */
5503                       TRUE, /* allow multi-char folds */
5504                       FALSE, /* don't silence non-portable warnings. */
5505                       (bool) RExC_strict,
5506                       TRUE, /* Allow an optimized regnode result */
5507                       NULL);
5508        if (ret == 0) {
5509            RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5510            FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5511                  (UV) *flagp);
5512        }
5513        if (*RExC_parse != ']') {
5514            RExC_parse_set(cc_parse_start);
5515            vFAIL("Unmatched [");
5516        }
5517        nextchar(pRExC_state);
5518        break;
5519    }
5520    case '(':
5521        nextchar(pRExC_state);
5522        ret = reg(pRExC_state, 2, &flags, depth+1);
5523        if (ret == 0) {
5524                if (flags & TRYAGAIN) {
5525                    if (RExC_parse >= RExC_end) {
5526                         /* Make parent create an empty node if needed. */
5527                        *flagp |= TRYAGAIN;
5528                        return(0);
5529                    }
5530                    goto tryagain;
5531                }
5532                RETURN_FAIL_ON_RESTART(flags, flagp);
5533                FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
5534                                                                 (UV) flags);
5535        }
5536        *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5537        break;
5538    case '|':
5539    case ')':
5540        if (flags & TRYAGAIN) {
5541            *flagp |= TRYAGAIN;
5542            return 0;
5543        }
5544        vFAIL("Internal urp");
5545                                /* Supposed to be caught earlier. */
5546        break;
5547    case '?':
5548    case '+':
5549    case '*':
5550        RExC_parse_inc_by(1);
5551        vFAIL("Quantifier follows nothing");
5552        break;
5553    case '\\':
5554        /* Special Escapes
5555
5556           This switch handles escape sequences that resolve to some kind
5557           of special regop and not to literal text. Escape sequences that
5558           resolve to literal text are handled below in the switch marked
5559           "Literal Escapes".
5560
5561           Every entry in this switch *must* have a corresponding entry
5562           in the literal escape switch. However, the opposite is not
5563           required, as the default for this switch is to jump to the
5564           literal text handling code.
5565        */
5566        RExC_parse_inc_by(1);
5567        switch ((U8)*RExC_parse) {
5568        /* Special Escapes */
5569        case 'A':
5570            RExC_seen_zerolen++;
5571            /* Under wildcards, this is changed to match \n; should be
5572             * invisible to the user, as they have to compile under /m */
5573            if (RExC_pm_flags & PMf_WILDCARD) {
5574                ret = reg_node(pRExC_state, MBOL);
5575            }
5576            else {
5577                ret = reg_node(pRExC_state, SBOL);
5578                /* SBOL is shared with /^/ so we set the flags so we can tell
5579                 * /\A/ from /^/ in split. */
5580                FLAGS(REGNODE_p(ret)) = 1;
5581            }
5582            goto finish_meta_pat;
5583        case 'G':
5584            if (RExC_pm_flags & PMf_WILDCARD) {
5585                RExC_parse_inc_by(1);
5586                /* diag_listed_as: Use of %s is not allowed in Unicode property
5587                   wildcard subpatterns in regex; marked by <-- HERE in m/%s/
5588                 */
5589                vFAIL("Use of '\\G' is not allowed in Unicode property"
5590                      " wildcard subpatterns");
5591            }
5592            ret = reg_node(pRExC_state, GPOS);
5593            RExC_seen |= REG_GPOS_SEEN;
5594            goto finish_meta_pat;
5595        case 'K':
5596            if (!RExC_in_lookaround) {
5597                RExC_seen_zerolen++;
5598                ret = reg_node(pRExC_state, KEEPS);
5599                /* XXX:dmq : disabling in-place substitution seems to
5600                 * be necessary here to avoid cases of memory corruption, as
5601                 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
5602                 */
5603                RExC_seen |= REG_LOOKBEHIND_SEEN;
5604                goto finish_meta_pat;
5605            }
5606            else {
5607                ++RExC_parse; /* advance past the 'K' */
5608                vFAIL("\\K not permitted in lookahead/lookbehind");
5609            }
5610        case 'Z':
5611            if (RExC_pm_flags & PMf_WILDCARD) {
5612                /* See comment under \A above */
5613                ret = reg_node(pRExC_state, MEOL);
5614            }
5615            else {
5616                ret = reg_node(pRExC_state, SEOL);
5617            }
5618            RExC_seen_zerolen++;		/* Do not optimize RE away */
5619            goto finish_meta_pat;
5620        case 'z':
5621            if (RExC_pm_flags & PMf_WILDCARD) {
5622                /* See comment under \A above */
5623                ret = reg_node(pRExC_state, MEOL);
5624            }
5625            else {
5626                ret = reg_node(pRExC_state, EOS);
5627            }
5628            RExC_seen_zerolen++;		/* Do not optimize RE away */
5629            goto finish_meta_pat;
5630        case 'C':
5631            vFAIL("\\C no longer supported");
5632        case 'X':
5633            ret = reg_node(pRExC_state, CLUMP);
5634            *flagp |= HASWIDTH;
5635            goto finish_meta_pat;
5636
5637        case 'B':
5638            invert = 1;
5639            /* FALLTHROUGH */
5640        case 'b':
5641          {
5642            U8 flags = 0;
5643            regex_charset charset = get_regex_charset(RExC_flags);
5644
5645            RExC_seen_zerolen++;
5646            RExC_seen |= REG_LOOKBEHIND_SEEN;
5647            op = BOUND + charset;
5648
5649            if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
5650                flags = TRADITIONAL_BOUND;
5651                if (op > BOUNDA) {  /* /aa is same as /a */
5652                    op = BOUNDA;
5653                }
5654            }
5655            else {
5656                STRLEN length;
5657                char name = *RExC_parse;
5658                char * endbrace =  (char *) memchr(RExC_parse, '}',
5659                                                   RExC_end - RExC_parse);
5660                char * e = endbrace;
5661
5662                RExC_parse_inc_by(2);
5663
5664                if (! endbrace) {
5665                    vFAIL2("Missing right brace on \\%c{}", name);
5666                }
5667
5668                while (isBLANK(*RExC_parse)) {
5669                    RExC_parse_inc_by(1);
5670                }
5671
5672                while (RExC_parse < e && isBLANK(*(e - 1))) {
5673                    e--;
5674                }
5675
5676                if (e == RExC_parse) {
5677                    RExC_parse_set(endbrace + 1);  /* After the '}' */
5678                    vFAIL2("Empty \\%c{}", name);
5679                }
5680
5681                length = e - RExC_parse;
5682
5683                switch (*RExC_parse) {
5684                    case 'g':
5685                        if (    length != 1
5686                            && (memNEs(RExC_parse + 1, length - 1, "cb")))
5687                        {
5688                            goto bad_bound_type;
5689                        }
5690                        flags = GCB_BOUND;
5691                        break;
5692                    case 'l':
5693                        if (length != 2 || *(RExC_parse + 1) != 'b') {
5694                            goto bad_bound_type;
5695                        }
5696                        flags = LB_BOUND;
5697                        break;
5698                    case 's':
5699                        if (length != 2 || *(RExC_parse + 1) != 'b') {
5700                            goto bad_bound_type;
5701                        }
5702                        flags = SB_BOUND;
5703                        break;
5704                    case 'w':
5705                        if (length != 2 || *(RExC_parse + 1) != 'b') {
5706                            goto bad_bound_type;
5707                        }
5708                        flags = WB_BOUND;
5709                        break;
5710                    default:
5711                      bad_bound_type:
5712                        RExC_parse_set(e);
5713                        vFAIL2utf8f(
5714                            "'%" UTF8f "' is an unknown bound type",
5715                            UTF8fARG(UTF, length, e - length));
5716                        NOT_REACHED; /*NOTREACHED*/
5717                }
5718                RExC_parse_set(endbrace);
5719                REQUIRE_UNI_RULES(flagp, 0);
5720
5721                if (op == BOUND) {
5722                    op = BOUNDU;
5723                }
5724                else if (op >= BOUNDA) {  /* /aa is same as /a */
5725                    op = BOUNDU;
5726                    length += 4;
5727
5728                    /* Don't have to worry about UTF-8, in this message because
5729                     * to get here the contents of the \b must be ASCII */
5730                    ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
5731                              "Using /u for '%.*s' instead of /%s",
5732                              (unsigned) length,
5733                              endbrace - length + 1,
5734                              (charset == REGEX_ASCII_RESTRICTED_CHARSET)
5735                              ? ASCII_RESTRICT_PAT_MODS
5736                              : ASCII_MORE_RESTRICT_PAT_MODS);
5737                }
5738            }
5739
5740            if (op == BOUND) {
5741                RExC_seen_d_op = TRUE;
5742            }
5743            else if (op == BOUNDL) {
5744                RExC_contains_locale = 1;
5745            }
5746
5747            if (invert) {
5748                op += NBOUND - BOUND;
5749            }
5750
5751            ret = reg_node(pRExC_state, op);
5752            FLAGS(REGNODE_p(ret)) = flags;
5753
5754            goto finish_meta_pat;
5755          }
5756
5757        case 'R':
5758            ret = reg_node(pRExC_state, LNBREAK);
5759            *flagp |= HASWIDTH|SIMPLE;
5760            goto finish_meta_pat;
5761
5762        case 'd':
5763        case 'D':
5764        case 'h':
5765        case 'H':
5766        case 'p':
5767        case 'P':
5768        case 's':
5769        case 'S':
5770        case 'v':
5771        case 'V':
5772        case 'w':
5773        case 'W':
5774            /* These all have the same meaning inside [brackets], and it knows
5775             * how to do the best optimizations for them.  So, pretend we found
5776             * these within brackets, and let it do the work */
5777            RExC_parse--;
5778
5779            ret = regclass(pRExC_state, flagp, depth+1,
5780                           TRUE, /* means just parse this element */
5781                           FALSE, /* don't allow multi-char folds */
5782                           FALSE, /* don't silence non-portable warnings.  It
5783                                     would be a bug if these returned
5784                                     non-portables */
5785                           (bool) RExC_strict,
5786                           TRUE, /* Allow an optimized regnode result */
5787                           NULL);
5788            RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5789            /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
5790             * multi-char folds are allowed.  */
5791            if (!ret)
5792                FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5793                      (UV) *flagp);
5794
5795            RExC_parse--;   /* regclass() leaves this one too far ahead */
5796
5797          finish_meta_pat:
5798                   /* The escapes above that don't take a parameter can't be
5799                    * followed by a '{'.  But 'pX', 'p{foo}' and
5800                    * correspondingly 'P' can be */
5801            if (   RExC_parse - atom_parse_start == 1
5802                && UCHARAT(RExC_parse + 1) == '{'
5803                && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
5804            {
5805                RExC_parse_inc_by(2);
5806                vFAIL("Unescaped left brace in regex is illegal here");
5807            }
5808            nextchar(pRExC_state);
5809            break;
5810        case 'N':
5811            /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
5812             * \N{...} evaluates to a sequence of more than one code points).
5813             * The function call below returns a regnode, which is our result.
5814             * The parameters cause it to fail if the \N{} evaluates to a
5815             * single code point; we handle those like any other literal.  The
5816             * reason that the multicharacter case is handled here and not as
5817             * part of the EXACtish code is because of quantifiers.  In
5818             * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
5819             * this way makes that Just Happen. dmq.
5820             * join_exact() will join this up with adjacent EXACTish nodes
5821             * later on, if appropriate. */
5822            ++RExC_parse;
5823            if (grok_bslash_N(pRExC_state,
5824                              &ret,     /* Want a regnode returned */
5825                              NULL,     /* Fail if evaluates to a single code
5826                                           point */
5827                              NULL,     /* Don't need a count of how many code
5828                                           points */
5829                              flagp,
5830                              RExC_strict,
5831                              depth)
5832            ) {
5833                break;
5834            }
5835
5836            RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5837
5838            /* Here, evaluates to a single code point.  Go get that */
5839            RExC_parse_set(atom_parse_start);
5840            goto defchar;
5841
5842        case 'k':    /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
5843      parse_named_seq:  /* Also handle non-numeric \g{...} */
5844        {
5845            char ch;
5846            if (   RExC_parse >= RExC_end - 1
5847                || ((   ch = RExC_parse[1]) != '<'
5848                                      && ch != '\''
5849                                      && ch != '{'))
5850            {
5851                RExC_parse_inc_by(1);
5852                /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
5853                vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
5854            } else {
5855                RExC_parse_inc_by(2);
5856                if (ch == '{') {
5857                    while (isBLANK(*RExC_parse)) {
5858                        RExC_parse_inc_by(1);
5859                    }
5860                }
5861                ret = handle_named_backref(pRExC_state,
5862                                           flagp,
5863                                           atom_parse_start,
5864                                           (ch == '<')
5865                                           ? '>'
5866                                           : (ch == '{')
5867                                             ? '}'
5868                                             : '\'');
5869            }
5870            break;
5871        }
5872        case 'g':
5873        case '1': case '2': case '3': case '4':
5874        case '5': case '6': case '7': case '8': case '9':
5875            {
5876                I32 num;
5877                char * endbrace = NULL;
5878                char * s = RExC_parse;
5879                char * e = RExC_end;
5880
5881                if (*s == 'g') {
5882                    bool isrel = 0;
5883
5884                    s++;
5885                    if (*s == '{') {
5886                        endbrace = (char *) memchr(s, '}', RExC_end - s);
5887                        if (! endbrace ) {
5888
5889                            /* Missing '}'.  Position after the number to give
5890                             * a better indication to the user of where the
5891                             * problem is. */
5892                            s++;
5893                            if (*s == '-') {
5894                                s++;
5895                            }
5896
5897                            /* If it looks to be a name and not a number, go
5898                             * handle it there */
5899                            if (! isDIGIT(*s)) {
5900                                goto parse_named_seq;
5901                            }
5902
5903                            do {
5904                                s++;
5905                            } while isDIGIT(*s);
5906
5907                            RExC_parse_set(s);
5908                            vFAIL("Unterminated \\g{...} pattern");
5909                        }
5910
5911                        s++;    /* Past the '{' */
5912
5913                        while (isBLANK(*s)) {
5914                            s++;
5915                        }
5916
5917                        /* Ignore trailing blanks */
5918                        e = endbrace;
5919                        while (s < e && isBLANK(*(e - 1))) {
5920                            e--;
5921                        }
5922                    }
5923
5924                    /* Here, have isolated the meat of the construct from any
5925                     * surrounding braces */
5926
5927                    if (*s == '-') {
5928                        isrel = 1;
5929                        s++;
5930                    }
5931
5932                    if (endbrace && !isDIGIT(*s)) {
5933                        goto parse_named_seq;
5934                    }
5935
5936                    RExC_parse_set(s);
5937                    num = S_backref_value(RExC_parse, RExC_end);
5938                    if (num == 0)
5939                        vFAIL("Reference to invalid group 0");
5940                    else if (num == I32_MAX) {
5941                         if (isDIGIT(*RExC_parse))
5942                            vFAIL("Reference to nonexistent group");
5943                        else
5944                            vFAIL("Unterminated \\g... pattern");
5945                    }
5946
5947                    if (isrel) {
5948                        num = RExC_npar - num;
5949                        if (num < 1)
5950                            vFAIL("Reference to nonexistent or unclosed group");
5951                    }
5952                    else
5953                    if (num < RExC_logical_npar) {
5954                        num = RExC_logical_to_parno[num];
5955                    }
5956                    else
5957                    if (ALL_PARENS_COUNTED)  {
5958                        if (num < RExC_logical_total_parens)
5959                            num = RExC_logical_to_parno[num];
5960                        else {
5961                            num = -1;
5962                        }
5963                    }
5964                    else{
5965                        REQUIRE_PARENS_PASS;
5966                    }
5967                }
5968                else {
5969                    num = S_backref_value(RExC_parse, RExC_end);
5970                    /* bare \NNN might be backref or octal - if it is larger
5971                     * than or equal RExC_npar then it is assumed to be an
5972                     * octal escape. Note RExC_npar is +1 from the actual
5973                     * number of parens. */
5974                    /* Note we do NOT check if num == I32_MAX here, as that is
5975                     * handled by the RExC_npar check */
5976
5977                    if (    /* any numeric escape < 10 is always a backref */
5978                           num > 9
5979                            /* any numeric escape < RExC_npar is a backref */
5980                        && num >= RExC_logical_npar
5981                            /* cannot be an octal escape if it starts with [89]
5982                             * */
5983                        && ! inRANGE(*RExC_parse, '8', '9')
5984                    ) {
5985                        /* Probably not meant to be a backref, instead likely
5986                         * to be an octal character escape, e.g. \35 or \777.
5987                         * The above logic should make it obvious why using
5988                         * octal escapes in patterns is problematic. - Yves */
5989                        RExC_parse_set(atom_parse_start);
5990                        goto defchar;
5991                    }
5992                    if (num < RExC_logical_npar) {
5993                        num = RExC_logical_to_parno[num];
5994                    }
5995                    else
5996                    if (ALL_PARENS_COUNTED) {
5997                        if (num < RExC_logical_total_parens) {
5998                            num = RExC_logical_to_parno[num];
5999                        } else {
6000                            num = -1;
6001                        }
6002                    } else {
6003                        REQUIRE_PARENS_PASS;
6004                    }
6005                }
6006
6007                /* At this point RExC_parse points at a numeric escape like
6008                 * \12 or \88 or the digits in \g{34} or \g34 or something
6009                 * similar, which we should NOT treat as an octal escape. It
6010                 * may or may not be a valid backref escape. For instance
6011                 * \88888888 is unlikely to be a valid backref.
6012                 *
6013                 * We've already figured out what value the digits represent.
6014                 * Now, move the parse to beyond them. */
6015                if (endbrace) {
6016                    RExC_parse_set(endbrace + 1);
6017                }
6018                else while (isDIGIT(*RExC_parse)) {
6019                    RExC_parse_inc_by(1);
6020                }
6021                if (num < 0)
6022                    vFAIL("Reference to nonexistent group");
6023
6024                if (num >= (I32)RExC_npar) {
6025                    /* It might be a forward reference; we can't fail until we
6026                     * know, by completing the parse to get all the groups, and
6027                     * then reparsing */
6028                    if (ALL_PARENS_COUNTED)  {
6029                        if (num >= RExC_total_parens)  {
6030                            vFAIL("Reference to nonexistent group");
6031                        }
6032                    }
6033                    else {
6034                        REQUIRE_PARENS_PASS;
6035                    }
6036                }
6037                RExC_sawback = 1;
6038                ret = reg2node(pRExC_state,
6039                               ((! FOLD)
6040                                 ? REF
6041                                 : (ASCII_FOLD_RESTRICTED)
6042                                   ? REFFA
6043                                   : (AT_LEAST_UNI_SEMANTICS)
6044                                     ? REFFU
6045                                     : (LOC)
6046                                       ? REFFL
6047                                       : REFF),
6048                                num, RExC_nestroot);
6049                if (RExC_nestroot && num >= RExC_nestroot)
6050                    FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
6051                if (OP(REGNODE_p(ret)) == REFF) {
6052                    RExC_seen_d_op = TRUE;
6053                }
6054                *flagp |= HASWIDTH;
6055
6056                skip_to_be_ignored_text(pRExC_state, &RExC_parse,
6057                                        FALSE /* Don't force to /x */ );
6058            }
6059            break;
6060        case '\0':
6061            if (RExC_parse >= RExC_end)
6062                FAIL("Trailing \\");
6063            /* FALLTHROUGH */
6064        default:
6065            /* Do not generate "unrecognized" warnings here, we fall
6066               back into the quick-grab loop below */
6067            RExC_parse_set(atom_parse_start);
6068            goto defchar;
6069        } /* end of switch on a \foo sequence */
6070        break;
6071
6072    case '#':
6073
6074        /* '#' comments should have been spaced over before this function was
6075         * called */
6076        assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
6077        /*
6078        if (RExC_flags & RXf_PMf_EXTENDED) {
6079            RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
6080            if (RExC_parse < RExC_end)
6081                goto tryagain;
6082        }
6083        */
6084
6085        /* FALLTHROUGH */
6086
6087    default:
6088          defchar: {
6089
6090            /* Here, we have determined that the next thing is probably a
6091             * literal character.  RExC_parse points to the first byte of its
6092             * definition.  (It still may be an escape sequence that evaluates
6093             * to a single character) */
6094
6095            STRLEN len = 0;
6096            UV ender = 0;
6097            char *p;
6098            char *s, *old_s = NULL, *old_old_s = NULL;
6099            char *s0;
6100            U32 max_string_len = 255;
6101
6102            /* We may have to reparse the node, artificially stopping filling
6103             * it early, based on info gleaned in the first parse.  This
6104             * variable gives where we stop.  Make it above the normal stopping
6105             * place first time through; otherwise it would stop too early */
6106            U32 upper_fill = max_string_len + 1;
6107
6108            /* We start out as an EXACT node, even if under /i, until we find a
6109             * character which is in a fold.  The algorithm now segregates into
6110             * separate nodes, characters that fold from those that don't under
6111             * /i.  (This hopefully will create nodes that are fixed strings
6112             * even under /i, giving the optimizer something to grab on to.)
6113             * So, if a node has something in it and the next character is in
6114             * the opposite category, that node is closed up, and the function
6115             * returns.  Then regatom is called again, and a new node is
6116             * created for the new category. */
6117            U8 node_type = EXACT;
6118
6119            /* Assume the node will be fully used; the excess is given back at
6120             * the end.  Under /i, we may need to temporarily add the fold of
6121             * an extra character or two at the end to check for splitting
6122             * multi-char folds, so allocate extra space for that.   We can't
6123             * make any other length assumptions, as a byte input sequence
6124             * could shrink down. */
6125            Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
6126                                                 + ((! FOLD)
6127                                                    ? 0
6128                                                    : 2 * ((UTF)
6129                                                           ? UTF8_MAXBYTES_CASE
6130                        /* Max non-UTF-8 expansion is 2 */ : 2)));
6131
6132            bool next_is_quantifier;
6133            char * oldp = NULL;
6134
6135            /* We can convert EXACTF nodes to EXACTFU if they contain only
6136             * characters that match identically regardless of the target
6137             * string's UTF8ness.  The reason to do this is that EXACTF is not
6138             * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
6139             * runtime.
6140             *
6141             * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
6142             * contain only above-Latin1 characters (hence must be in UTF8),
6143             * which don't participate in folds with Latin1-range characters,
6144             * as the latter's folds aren't known until runtime. */
6145            bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6146
6147            /* Single-character EXACTish nodes are almost always SIMPLE.  This
6148             * allows us to override this as encountered */
6149            U8 maybe_SIMPLE = SIMPLE;
6150
6151            /* Does this node contain something that can't match unless the
6152             * target string is (also) in UTF-8 */
6153            bool requires_utf8_target = FALSE;
6154
6155            /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
6156            bool has_ss = FALSE;
6157
6158            /* So is the MICRO SIGN */
6159            bool has_micro_sign = FALSE;
6160
6161            /* Set when we fill up the current node and there is still more
6162             * text to process */
6163            bool overflowed;
6164
6165            /* Allocate an EXACT node.  The node_type may change below to
6166             * another EXACTish node, but since the size of the node doesn't
6167             * change, it works */
6168            ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
6169            FILL_NODE(ret, node_type);
6170            RExC_emit += NODE_STEP_REGNODE;
6171
6172            s = STRING(REGNODE_p(ret));
6173
6174            s0 = s;
6175
6176          reparse:
6177
6178            p = RExC_parse;
6179            len = 0;
6180            s = s0;
6181            node_type = EXACT;
6182            oldp = NULL;
6183            maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6184            maybe_SIMPLE = SIMPLE;
6185            requires_utf8_target = FALSE;
6186            has_ss = FALSE;
6187            has_micro_sign = FALSE;
6188
6189          continue_parse:
6190
6191            /* This breaks under rare circumstances.  If folding, we do not
6192             * want to split a node at a character that is a non-final in a
6193             * multi-char fold, as an input string could just happen to want to
6194             * match across the node boundary.  The code at the end of the loop
6195             * looks for this, and backs off until it finds not such a
6196             * character, but it is possible (though extremely, extremely
6197             * unlikely) for all characters in the node to be non-final fold
6198             * ones, in which case we just leave the node fully filled, and
6199             * hope that it doesn't match the string in just the wrong place */
6200
6201            assert( ! UTF     /* Is at the beginning of a character */
6202                   || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
6203                   || UTF8_IS_START(UCHARAT(RExC_parse)));
6204
6205            overflowed = FALSE;
6206
6207            /* Here, we have a literal character.  Find the maximal string of
6208             * them in the input that we can fit into a single EXACTish node.
6209             * We quit at the first non-literal or when the node gets full, or
6210             * under /i the categorization of folding/non-folding character
6211             * changes */
6212            while (p < RExC_end && len < upper_fill) {
6213
6214                /* In most cases each iteration adds one byte to the output.
6215                 * The exceptions override this */
6216                Size_t added_len = 1;
6217
6218                oldp = p;
6219                old_old_s = old_s;
6220                old_s = s;
6221
6222                /* White space has already been ignored */
6223                assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
6224                       || ! is_PATWS_safe((p), RExC_end, UTF));
6225
6226                switch ((U8)*p) {
6227                  const char* message;
6228                  U32 packed_warn;
6229                  U8 grok_c_char;
6230
6231                case '^':
6232                case '$':
6233                case '.':
6234                case '[':
6235                case '(':
6236                case ')':
6237                case '|':
6238                    goto loopdone;
6239                case '\\':
6240                    /* Literal Escapes Switch
6241
6242                       This switch is meant to handle escape sequences that
6243                       resolve to a literal character.
6244
6245                       Every escape sequence that represents something
6246                       else, like an assertion or a char class, is handled
6247                       in the switch marked 'Special Escapes' above in this
6248                       routine, but also has an entry here as anything that
6249                       isn't explicitly mentioned here will be treated as
6250                       an unescaped equivalent literal.
6251                    */
6252
6253                    switch ((U8)*++p) {
6254
6255                    /* These are all the special escapes. */
6256                    case 'A':             /* Start assertion */
6257                    case 'b': case 'B':   /* Word-boundary assertion*/
6258                    case 'C':             /* Single char !DANGEROUS! */
6259                    case 'd': case 'D':   /* digit class */
6260                    case 'g': case 'G':   /* generic-backref, pos assertion */
6261                    case 'h': case 'H':   /* HORIZWS */
6262                    case 'k': case 'K':   /* named backref, keep marker */
6263                    case 'p': case 'P':   /* Unicode property */
6264                              case 'R':   /* LNBREAK */
6265                    case 's': case 'S':   /* space class */
6266                    case 'v': case 'V':   /* VERTWS */
6267                    case 'w': case 'W':   /* word class */
6268                    case 'X':             /* eXtended Unicode "combining
6269                                             character sequence" */
6270                    case 'z': case 'Z':   /* End of line/string assertion */
6271                        --p;
6272                        goto loopdone;
6273
6274                    /* Anything after here is an escape that resolves to a
6275                       literal. (Except digits, which may or may not)
6276                     */
6277                    case 'n':
6278                        ender = '\n';
6279                        p++;
6280                        break;
6281                    case 'N': /* Handle a single-code point named character. */
6282                        RExC_parse_set( p + 1 );
6283                        if (! grok_bslash_N(pRExC_state,
6284                                            NULL,   /* Fail if evaluates to
6285                                                       anything other than a
6286                                                       single code point */
6287                                            &ender, /* The returned single code
6288                                                       point */
6289                                            NULL,   /* Don't need a count of
6290                                                       how many code points */
6291                                            flagp,
6292                                            RExC_strict,
6293                                            depth)
6294                        ) {
6295                            if (*flagp & NEED_UTF8)
6296                                FAIL("panic: grok_bslash_N set NEED_UTF8");
6297                            RETURN_FAIL_ON_RESTART_FLAGP(flagp);
6298
6299                            /* Here, it wasn't a single code point.  Go close
6300                             * up this EXACTish node.  The switch() prior to
6301                             * this switch handles the other cases */
6302                            p = oldp;
6303                            RExC_parse_set(p);
6304                            goto loopdone;
6305                        }
6306                        p = RExC_parse;
6307                        RExC_parse_set(atom_parse_start);
6308
6309                        /* The \N{} means the pattern, if previously /d,
6310                         * becomes /u.  That means it can't be an EXACTF node,
6311                         * but an EXACTFU */
6312                        if (node_type == EXACTF) {
6313                            node_type = EXACTFU;
6314
6315                            /* If the node already contains something that
6316                             * differs between EXACTF and EXACTFU, reparse it
6317                             * as EXACTFU */
6318                            if (! maybe_exactfu) {
6319                                len = 0;
6320                                s = s0;
6321                                goto reparse;
6322                            }
6323                        }
6324
6325                        break;
6326                    case 'r':
6327                        ender = '\r';
6328                        p++;
6329                        break;
6330                    case 't':
6331                        ender = '\t';
6332                        p++;
6333                        break;
6334                    case 'f':
6335                        ender = '\f';
6336                        p++;
6337                        break;
6338                    case 'e':
6339                        ender = ESC_NATIVE;
6340                        p++;
6341                        break;
6342                    case 'a':
6343                        ender = '\a';
6344                        p++;
6345                        break;
6346                    case 'o':
6347                        if (! grok_bslash_o(&p,
6348                                            RExC_end,
6349                                            &ender,
6350                                            &message,
6351                                            &packed_warn,
6352                                            (bool) RExC_strict,
6353                                            FALSE, /* No illegal cp's */
6354                                            UTF))
6355                        {
6356                            RExC_parse_set(p); /* going to die anyway; point to
6357                                               exact spot of failure */
6358                            vFAIL(message);
6359                        }
6360
6361                        if (message && TO_OUTPUT_WARNINGS(p)) {
6362                            warn_non_literal_string(p, packed_warn, message);
6363                        }
6364                        break;
6365                    case 'x':
6366                        if (! grok_bslash_x(&p,
6367                                            RExC_end,
6368                                            &ender,
6369                                            &message,
6370                                            &packed_warn,
6371                                            (bool) RExC_strict,
6372                                            FALSE, /* No illegal cp's */
6373                                            UTF))
6374                        {
6375                            RExC_parse_set(p);        /* going to die anyway; point
6376                                                   to exact spot of failure */
6377                            vFAIL(message);
6378                        }
6379
6380                        if (message && TO_OUTPUT_WARNINGS(p)) {
6381                            warn_non_literal_string(p, packed_warn, message);
6382                        }
6383
6384#ifdef EBCDIC
6385                        if (ender < 0x100) {
6386                            if (RExC_recode_x_to_native) {
6387                                ender = LATIN1_TO_NATIVE(ender);
6388                            }
6389                        }
6390#endif
6391                        break;
6392                    case 'c':
6393                        p++;
6394                        if (! grok_bslash_c(*p, &grok_c_char,
6395                                            &message, &packed_warn))
6396                        {
6397                            /* going to die anyway; point to exact spot of
6398                             * failure */
6399                            char *new_p= p + ((UTF)
6400                                              ? UTF8_SAFE_SKIP(p, RExC_end)
6401                                              : 1);
6402                            RExC_parse_set(new_p);
6403                            vFAIL(message);
6404                        }
6405
6406                        ender = grok_c_char;
6407                        p++;
6408                        if (message && TO_OUTPUT_WARNINGS(p)) {
6409                            warn_non_literal_string(p, packed_warn, message);
6410                        }
6411
6412                        break;
6413                    case '8': case '9': /* must be a backreference */
6414                        --p;
6415                        /* we have an escape like \8 which cannot be an octal escape
6416                         * so we exit the loop, and let the outer loop handle this
6417                         * escape which may or may not be a legitimate backref. */
6418                        goto loopdone;
6419                    case '1': case '2': case '3':case '4':
6420                    case '5': case '6': case '7':
6421
6422                        /* When we parse backslash escapes there is ambiguity
6423                         * between backreferences and octal escapes. Any escape
6424                         * from \1 - \9 is a backreference, any multi-digit
6425                         * escape which does not start with 0 and which when
6426                         * evaluated as decimal could refer to an already
6427                         * parsed capture buffer is a back reference. Anything
6428                         * else is octal.
6429                         *
6430                         * Note this implies that \118 could be interpreted as
6431                         * 118 OR as "\11" . "8" depending on whether there
6432                         * were 118 capture buffers defined already in the
6433                         * pattern.  */
6434
6435                        /* NOTE, RExC_npar is 1 more than the actual number of
6436                         * parens we have seen so far, hence the "<" as opposed
6437                         * to "<=" */
6438                        if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
6439                        {  /* Not to be treated as an octal constant, go
6440                                   find backref */
6441                            p = oldp;
6442                            goto loopdone;
6443                        }
6444                        /* FALLTHROUGH */
6445                    case '0':
6446                        {
6447                            I32 flags = PERL_SCAN_SILENT_ILLDIGIT
6448                                      | PERL_SCAN_NOTIFY_ILLDIGIT;
6449                            STRLEN numlen = 3;
6450                            ender = grok_oct(p, &numlen, &flags, NULL);
6451                            p += numlen;
6452                            if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
6453                                && isDIGIT(*p)  /* like \08, \178 */
6454                                && ckWARN(WARN_REGEXP))
6455                            {
6456                                reg_warn_non_literal_string(
6457                                     p + 1,
6458                                     form_alien_digit_msg(8, numlen, p,
6459                                                        RExC_end, UTF, FALSE));
6460                            }
6461                        }
6462                        break;
6463                    case '\0':
6464                        if (p >= RExC_end)
6465                            FAIL("Trailing \\");
6466                        /* FALLTHROUGH */
6467                    default:
6468                        if (isALPHANUMERIC(*p)) {
6469                            /* An alpha followed by '{' is going to fail next
6470                             * iteration, so don't output this warning in that
6471                             * case */
6472                            if (! isALPHA(*p) || *(p + 1) != '{') {
6473                                ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
6474                                                  " passed through", p);
6475                            }
6476                        }
6477                        goto normal_default;
6478                    } /* End of switch on '\' */
6479                    break;
6480                case '{':
6481                    /* Trying to gain new uses for '{' without breaking too
6482                     * much existing code is hard.  The solution currently
6483                     * adopted is:
6484                     *  1)  If there is no ambiguity that a '{' should always
6485                     *      be taken literally, at the start of a construct, we
6486                     *      just do so.
6487                     *  2)  If the literal '{' conflicts with our desired use
6488                     *      of it as a metacharacter, we die.  The deprecation
6489                     *      cycles for this have come and gone.
6490                     *  3)  If there is ambiguity, we raise a simple warning.
6491                     *      This could happen, for example, if the user
6492                     *      intended it to introduce a quantifier, but slightly
6493                     *      misspelled the quantifier.  Without this warning,
6494                     *      the quantifier would silently be taken as a literal
6495                     *      string of characters instead of a meta construct */
6496                    if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
6497                        if (      RExC_strict
6498                            || (  p > atom_parse_start + 1
6499                                && isALPHA_A(*(p - 1))
6500                                && *(p - 2) == '\\'))
6501                        {
6502                            RExC_parse_set(p + 1);
6503                            vFAIL("Unescaped left brace in regex is "
6504                                  "illegal here");
6505                        }
6506                        ckWARNreg(p + 1, "Unescaped left brace in regex is"
6507                                         " passed through");
6508                    }
6509                    goto normal_default;
6510                case '}':
6511                case ']':
6512                    if (p > RExC_parse && RExC_strict) {
6513                        ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
6514                    }
6515                    /*FALLTHROUGH*/
6516                default:    /* A literal character */
6517                  normal_default:
6518                    if (! UTF8_IS_INVARIANT(*p) && UTF) {
6519                        STRLEN numlen;
6520                        ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6521                                               &numlen, UTF8_ALLOW_DEFAULT);
6522                        p += numlen;
6523                    }
6524                    else
6525                        ender = (U8) *p++;
6526                    break;
6527                } /* End of switch on the literal */
6528
6529                /* Here, have looked at the literal character, and <ender>
6530                 * contains its ordinal; <p> points to the character after it.
6531                 * */
6532
6533                if (ender > 255) {
6534                    REQUIRE_UTF8(flagp);
6535                    if (   UNICODE_IS_PERL_EXTENDED(ender)
6536                        && TO_OUTPUT_WARNINGS(p))
6537                    {
6538                        ckWARN2_non_literal_string(p,
6539                                                   packWARN(WARN_PORTABLE),
6540                                                   PL_extended_cp_format,
6541                                                   ender);
6542                    }
6543                }
6544
6545                /* We need to check if the next non-ignored thing is a
6546                 * quantifier.  Move <p> to after anything that should be
6547                 * ignored, which, as a side effect, positions <p> for the next
6548                 * loop iteration */
6549                skip_to_be_ignored_text(pRExC_state, &p,
6550                                        FALSE /* Don't force to /x */ );
6551
6552                /* If the next thing is a quantifier, it applies to this
6553                 * character only, which means that this character has to be in
6554                 * its own node and can't just be appended to the string in an
6555                 * existing node, so if there are already other characters in
6556                 * the node, close the node with just them, and set up to do
6557                 * this character again next time through, when it will be the
6558                 * only thing in its new node */
6559
6560                next_is_quantifier =    LIKELY(p < RExC_end)
6561                                     && UNLIKELY(isQUANTIFIER(p, RExC_end));
6562
6563                if (next_is_quantifier && LIKELY(len)) {
6564                    p = oldp;
6565                    goto loopdone;
6566                }
6567
6568                /* Ready to add 'ender' to the node */
6569
6570                if (! FOLD) {  /* The simple case, just append the literal */
6571                  not_fold_common:
6572
6573                    /* Don't output if it would overflow */
6574                    if (UNLIKELY(len > max_string_len - ((UTF)
6575                                                      ? UVCHR_SKIP(ender)
6576                                                      : 1)))
6577                    {
6578                        overflowed = TRUE;
6579                        break;
6580                    }
6581
6582                    if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
6583                        *(s++) = (char) ender;
6584                    }
6585                    else {
6586                        U8 * new_s = uvchr_to_utf8((U8*)s, ender);
6587                        added_len = (char *) new_s - s;
6588                        s = (char *) new_s;
6589
6590                        if (ender > 255)  {
6591                            requires_utf8_target = TRUE;
6592                        }
6593                    }
6594                }
6595                else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
6596
6597                    /* Here are folding under /l, and the code point is
6598                     * problematic.  If this is the first character in the
6599                     * node, change the node type to folding.   Otherwise, if
6600                     * this is the first problematic character, close up the
6601                     * existing node, so can start a new node with this one */
6602                    if (! len) {
6603                        node_type = EXACTFL;
6604                        RExC_contains_locale = 1;
6605                    }
6606                    else if (node_type == EXACT) {
6607                        p = oldp;
6608                        goto loopdone;
6609                    }
6610
6611                    /* This problematic code point means we can't simplify
6612                     * things */
6613                    maybe_exactfu = FALSE;
6614
6615                    /* Although these two characters have folds that are
6616                     * locale-problematic, they also have folds to above Latin1
6617                     * that aren't a problem.  Doing these now helps at
6618                     * runtime. */
6619                    if (UNLIKELY(   ender == GREEK_CAPITAL_LETTER_MU
6620                                 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
6621                    {
6622                        goto fold_anyway;
6623                    }
6624
6625                    /* Here, we are adding a problematic fold character.
6626                     * "Problematic" in this context means that its fold isn't
6627                     * known until runtime.  (The non-problematic code points
6628                     * are the above-Latin1 ones that fold to also all
6629                     * above-Latin1.  Their folds don't vary no matter what the
6630                     * locale is.) But here we have characters whose fold
6631                     * depends on the locale.  We just add in the unfolded
6632                     * character, and wait until runtime to fold it */
6633                    goto not_fold_common;
6634                }
6635                else /* regular fold; see if actually is in a fold */
6636                     if (   (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
6637                         || (ender > 255
6638                            && ! _invlist_contains_cp(PL_in_some_fold, ender)))
6639                {
6640                    /* Here, folding, but the character isn't in a fold.
6641                     *
6642                     * Start a new node if previous characters in the node were
6643                     * folded */
6644                    if (len && node_type != EXACT) {
6645                        p = oldp;
6646                        goto loopdone;
6647                    }
6648
6649                    /* Here, continuing a node with non-folded characters.  Add
6650                     * this one */
6651                    goto not_fold_common;
6652                }
6653                else {  /* Here, does participate in some fold */
6654
6655                    /* If this is the first character in the node, change its
6656                     * type to folding.  Otherwise, if this is the first
6657                     * folding character in the node, close up the existing
6658                     * node, so can start a new node with this one.  */
6659                    if (! len) {
6660                        node_type = compute_EXACTish(pRExC_state);
6661                    }
6662                    else if (node_type == EXACT) {
6663                        p = oldp;
6664                        goto loopdone;
6665                    }
6666
6667                    if (UTF) {  /* Alway use the folded value for UTF-8
6668                                   patterns */
6669                        if (UVCHR_IS_INVARIANT(ender)) {
6670                            if (UNLIKELY(len + 1 > max_string_len)) {
6671                                overflowed = TRUE;
6672                                break;
6673                            }
6674
6675                            *(s)++ = (U8) toFOLD(ender);
6676                        }
6677                        else {
6678                            UV folded;
6679
6680                          fold_anyway:
6681                            folded = _to_uni_fold_flags(
6682                                    ender,
6683                                    (U8 *) s,  /* We have allocated extra space
6684                                                  in 's' so can't run off the
6685                                                  end */
6686                                    &added_len,
6687                                    FOLD_FLAGS_FULL
6688                                  | ((   ASCII_FOLD_RESTRICTED
6689                                      || node_type == EXACTFL)
6690                                    ? FOLD_FLAGS_NOMIX_ASCII
6691                                    : 0));
6692                            if (UNLIKELY(len + added_len > max_string_len)) {
6693                                overflowed = TRUE;
6694                                break;
6695                            }
6696
6697                            s += added_len;
6698
6699                            if (   folded > 255
6700                                && LIKELY(folded != GREEK_SMALL_LETTER_MU))
6701                            {
6702                                /* U+B5 folds to the MU, so its possible for a
6703                                 * non-UTF-8 target to match it */
6704                                requires_utf8_target = TRUE;
6705                            }
6706                        }
6707                    }
6708                    else { /* Here is non-UTF8. */
6709
6710                        /* The fold will be one or (rarely) two characters.
6711                         * Check that there's room for at least a single one
6712                         * before setting any flags, etc.  Because otherwise an
6713                         * overflowing character could cause a flag to be set
6714                         * even though it doesn't end up in this node.  (For
6715                         * the two character fold, we check again, before
6716                         * setting any flags) */
6717                        if (UNLIKELY(len + 1 > max_string_len)) {
6718                            overflowed = TRUE;
6719                            break;
6720                        }
6721
6722#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
6723   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
6724                                      || UNICODE_DOT_DOT_VERSION > 0)
6725
6726                        /* On non-ancient Unicodes, check for the only possible
6727                         * multi-char fold  */
6728                        if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
6729
6730                            /* This potential multi-char fold means the node
6731                             * can't be simple (because it could match more
6732                             * than a single char).  And in some cases it will
6733                             * match 'ss', so set that flag */
6734                            maybe_SIMPLE = 0;
6735                            has_ss = TRUE;
6736
6737                            /* It can't change to be an EXACTFU (unless already
6738                             * is one).  We fold it iff under /u rules. */
6739                            if (node_type != EXACTFU) {
6740                                maybe_exactfu = FALSE;
6741                            }
6742                            else {
6743                                if (UNLIKELY(len + 2 > max_string_len)) {
6744                                    overflowed = TRUE;
6745                                    break;
6746                                }
6747
6748                                *(s++) = 's';
6749                                *(s++) = 's';
6750                                added_len = 2;
6751
6752                                goto done_with_this_char;
6753                            }
6754                        }
6755                        else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
6756                                 && LIKELY(len > 0)
6757                                 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
6758                        {
6759                            /* Also, the sequence 'ss' is special when not
6760                             * under /u.  If the target string is UTF-8, it
6761                             * should match SHARP S; otherwise it won't.  So,
6762                             * here we have to exclude the possibility of this
6763                             * node moving to /u.*/
6764                            has_ss = TRUE;
6765                            maybe_exactfu = FALSE;
6766                        }
6767#endif
6768                        /* Here, the fold will be a single character */
6769
6770                        if (UNLIKELY(ender == MICRO_SIGN)) {
6771                            has_micro_sign = TRUE;
6772                        }
6773                        else if (PL_fold[ender] != PL_fold_latin1[ender]) {
6774
6775                            /* If the character's fold differs between /d and
6776                             * /u, this can't change to be an EXACTFU node */
6777                            maybe_exactfu = FALSE;
6778                        }
6779
6780                        *(s++) = (DEPENDS_SEMANTICS)
6781                                 ? (char) toFOLD(ender)
6782
6783                                   /* Under /u, the fold of any character in
6784                                    * the 0-255 range happens to be its
6785                                    * lowercase equivalent, except for LATIN
6786                                    * SMALL LETTER SHARP S, which was handled
6787                                    * above, and the MICRO SIGN, whose fold
6788                                    * requires UTF-8 to represent.  */
6789                                 : (char) toLOWER_L1(ender);
6790                    }
6791                } /* End of adding current character to the node */
6792
6793              done_with_this_char:
6794
6795                len += added_len;
6796
6797                if (next_is_quantifier) {
6798
6799                    /* Here, the next input is a quantifier, and to get here,
6800                     * the current character is the only one in the node. */
6801                    goto loopdone;
6802                }
6803
6804            } /* End of loop through literal characters */
6805
6806            /* Here we have either exhausted the input or run out of room in
6807             * the node.  If the former, we are done.  (If we encountered a
6808             * character that can't be in the node, transfer is made directly
6809             * to <loopdone>, and so we wouldn't have fallen off the end of the
6810             * loop.)  */
6811            if (LIKELY(! overflowed)) {
6812                goto loopdone;
6813            }
6814
6815            /* Here we have run out of room.  We can grow plain EXACT and
6816             * LEXACT nodes.  If the pattern is gigantic enough, though,
6817             * eventually we'll have to artificially chunk the pattern into
6818             * multiple nodes. */
6819            if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
6820                Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
6821                Size_t overhead_expansion = 0;
6822                char temp[256];
6823                Size_t max_nodes_for_string;
6824                Size_t achievable;
6825                SSize_t delta;
6826
6827                /* Here we couldn't fit the final character in the current
6828                 * node, so it will have to be reparsed, no matter what else we
6829                 * do */
6830                p = oldp;
6831
6832                /* If would have overflowed a regular EXACT node, switch
6833                 * instead to an LEXACT.  The code below is structured so that
6834                 * the actual growing code is common to changing from an EXACT
6835                 * or just increasing the LEXACT size.  This means that we have
6836                 * to save the string in the EXACT case before growing, and
6837                 * then copy it afterwards to its new location */
6838                if (node_type == EXACT) {
6839                    overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
6840                    RExC_emit += overhead_expansion;
6841                    Copy(s0, temp, len, char);
6842                }
6843
6844                /* Ready to grow.  If it was a plain EXACT, the string was
6845                 * saved, and the first few bytes of it overwritten by adding
6846                 * an argument field.  We assume, as we do elsewhere in this
6847                 * file, that one byte of remaining input will translate into
6848                 * one byte of output, and if that's too small, we grow again,
6849                 * if too large the excess memory is freed at the end */
6850
6851                max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
6852                achievable = MIN(max_nodes_for_string,
6853                                 current_string_nodes + STR_SZ(RExC_end - p));
6854                delta = achievable - current_string_nodes;
6855
6856                /* If there is just no more room, go finish up this chunk of
6857                 * the pattern. */
6858                if (delta <= 0) {
6859                    goto loopdone;
6860                }
6861
6862                change_engine_size(pRExC_state, delta + overhead_expansion);
6863                current_string_nodes += delta;
6864                max_string_len
6865                           = sizeof(struct regnode) * current_string_nodes;
6866                upper_fill = max_string_len + 1;
6867
6868                /* If the length was small, we know this was originally an
6869                 * EXACT node now converted to LEXACT, and the string has to be
6870                 * restored.  Otherwise the string was untouched.  260 is just
6871                 * a number safely above 255 so don't have to worry about
6872                 * getting it precise */
6873                if (len < 260) {
6874                    node_type = LEXACT;
6875                    FILL_NODE(ret, node_type);
6876                    s0 = STRING(REGNODE_p(ret));
6877                    Copy(temp, s0, len, char);
6878                    s = s0 + len;
6879                }
6880
6881                goto continue_parse;
6882            }
6883            else if (FOLD) {
6884                bool splittable = FALSE;
6885                bool backed_up = FALSE;
6886                char * e;       /* should this be U8? */
6887                char * s_start; /* should this be U8? */
6888
6889                /* Here is /i.  Running out of room creates a problem if we are
6890                 * folding, and the split happens in the middle of a
6891                 * multi-character fold, as a match that should have occurred,
6892                 * won't, due to the way nodes are matched, and our artificial
6893                 * boundary.  So back off until we aren't splitting such a
6894                 * fold.  If there is no such place to back off to, we end up
6895                 * taking the entire node as-is.  This can happen if the node
6896                 * consists entirely of 'f' or entirely of 's' characters (or
6897                 * things that fold to them) as 'ff' and 'ss' are
6898                 * multi-character folds.
6899                 *
6900                 * The Unicode standard says that multi character folds consist
6901                 * of either two or three characters.  That means we would be
6902                 * splitting one if the final character in the node is at the
6903                 * beginning of either type, or is the second of a three
6904                 * character fold.
6905                 *
6906                 * At this point:
6907                 *  ender     is the code point of the character that won't fit
6908                 *            in the node
6909                 *  s         points to just beyond the final byte in the node.
6910                 *            It's where we would place ender if there were
6911                 *            room, and where in fact we do place ender's fold
6912                 *            in the code below, as we've over-allocated space
6913                 *            for s0 (hence s) to allow for this
6914                 *  e         starts at 's' and advances as we append things.
6915                 *  old_s     is the same as 's'.  (If ender had fit, 's' would
6916                 *            have been advanced to beyond it).
6917                 *  old_old_s points to the beginning byte of the final
6918                 *            character in the node
6919                 *  p         points to the beginning byte in the input of the
6920                 *            character beyond 'ender'.
6921                 *  oldp      points to the beginning byte in the input of
6922                 *            'ender'.
6923                 *
6924                 * In the case of /il, we haven't folded anything that could be
6925                 * affected by the locale.  That means only above-Latin1
6926                 * characters that fold to other above-latin1 characters get
6927                 * folded at compile time.  To check where a good place to
6928                 * split nodes is, everything in it will have to be folded.
6929                 * The boolean 'maybe_exactfu' keeps track in /il if there are
6930                 * any unfolded characters in the node. */
6931                bool need_to_fold_loc = LOC && ! maybe_exactfu;
6932
6933                /* If we do need to fold the node, we need a place to store the
6934                 * folded copy, and a way to map back to the unfolded original
6935                 * */
6936                char * locfold_buf = NULL;
6937                Size_t * loc_correspondence = NULL;
6938
6939                if (! need_to_fold_loc) {   /* The normal case.  Just
6940                                               initialize to the actual node */
6941                    e = s;
6942                    s_start = s0;
6943                    s = old_old_s;  /* Point to the beginning of the final char
6944                                       that fits in the node */
6945                }
6946                else {
6947
6948                    /* Here, we have filled a /il node, and there are unfolded
6949                     * characters in it.  If the runtime locale turns out to be
6950                     * UTF-8, there are possible multi-character folds, just
6951                     * like when not under /l.  The node hence can't terminate
6952                     * in the middle of such a fold.  To determine this, we
6953                     * have to create a folded copy of this node.  That means
6954                     * reparsing the node, folding everything assuming a UTF-8
6955                     * locale.  (If at runtime it isn't such a locale, the
6956                     * actions here wouldn't have been necessary, but we have
6957                     * to assume the worst case.)  If we find we need to back
6958                     * off the folded string, we do so, and then map that
6959                     * position back to the original unfolded node, which then
6960                     * gets output, truncated at that spot */
6961
6962                    char * redo_p = RExC_parse;
6963                    char * redo_e;
6964                    char * old_redo_e;
6965
6966                    /* Allow enough space assuming a single byte input folds to
6967                     * a single byte output, plus assume that the two unparsed
6968                     * characters (that we may need) fold to the largest number
6969                     * of bytes possible, plus extra for one more worst case
6970                     * scenario.  In the loop below, if we start eating into
6971                     * that final spare space, we enlarge this initial space */
6972                    Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
6973
6974                    Newxz(locfold_buf, size, char);
6975                    Newxz(loc_correspondence, size, Size_t);
6976
6977                    /* Redo this node's parse, folding into 'locfold_buf' */
6978                    redo_p = RExC_parse;
6979                    old_redo_e = redo_e = locfold_buf;
6980                    while (redo_p <= oldp) {
6981
6982                        old_redo_e = redo_e;
6983                        loc_correspondence[redo_e - locfold_buf]
6984                                                        = redo_p - RExC_parse;
6985
6986                        if (UTF) {
6987                            Size_t added_len;
6988
6989                            (void) _to_utf8_fold_flags((U8 *) redo_p,
6990                                                       (U8 *) RExC_end,
6991                                                       (U8 *) redo_e,
6992                                                       &added_len,
6993                                                       FOLD_FLAGS_FULL);
6994                            redo_e += added_len;
6995                            redo_p += UTF8SKIP(redo_p);
6996                        }
6997                        else {
6998
6999                            /* Note that if this code is run on some ancient
7000                             * Unicode versions, SHARP S doesn't fold to 'ss',
7001                             * but rather than clutter the code with #ifdef's,
7002                             * as is done above, we ignore that possibility.
7003                             * This is ok because this code doesn't affect what
7004                             * gets matched, but merely where the node gets
7005                             * split */
7006                            if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
7007                                *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
7008                            }
7009                            else {
7010                                *redo_e++ = 's';
7011                                *redo_e++ = 's';
7012                            }
7013                            redo_p++;
7014                        }
7015
7016
7017                        /* If we're getting so close to the end that a
7018                         * worst-case fold in the next character would cause us
7019                         * to overflow, increase, assuming one byte output byte
7020                         * per one byte input one, plus room for another worst
7021                         * case fold */
7022                        if (   redo_p <= oldp
7023                            && redo_e > locfold_buf + size
7024                                                    - (UTF8_MAXBYTES_CASE + 1))
7025                        {
7026                            Size_t new_size = size
7027                                            + (oldp - redo_p)
7028                                            + UTF8_MAXBYTES_CASE + 1;
7029                            Ptrdiff_t e_offset = redo_e - locfold_buf;
7030
7031                            Renew(locfold_buf, new_size, char);
7032                            Renew(loc_correspondence, new_size, Size_t);
7033                            size = new_size;
7034
7035                            redo_e = locfold_buf + e_offset;
7036                        }
7037                    }
7038
7039                    /* Set so that things are in terms of the folded, temporary
7040                     * string */
7041                    s = old_redo_e;
7042                    s_start = locfold_buf;
7043                    e = redo_e;
7044
7045                }
7046
7047                /* Here, we have 's', 's_start' and 'e' set up to point to the
7048                 * input that goes into the node, folded.
7049                 *
7050                 * If the final character of the node and the fold of ender
7051                 * form the first two characters of a three character fold, we
7052                 * need to peek ahead at the next (unparsed) character in the
7053                 * input to determine if the three actually do form such a
7054                 * fold.  Just looking at that character is not generally
7055                 * sufficient, as it could be, for example, an escape sequence
7056                 * that evaluates to something else, and it needs to be folded.
7057                 *
7058                 * khw originally thought to just go through the parse loop one
7059                 * extra time, but that doesn't work easily as that iteration
7060                 * could cause things to think that the parse is over and to
7061                 * goto loopdone.  The character could be a '$' for example, or
7062                 * the character beyond could be a quantifier, and other
7063                 * glitches as well.
7064                 *
7065                 * The solution used here for peeking ahead is to look at that
7066                 * next character.  If it isn't ASCII punctuation, then it will
7067                 * be something that would continue on in an EXACTish node if
7068                 * there were space.  We append the fold of it to s, having
7069                 * reserved enough room in s0 for the purpose.  If we can't
7070                 * reasonably peek ahead, we instead assume the worst case:
7071                 * that it is something that would form the completion of a
7072                 * multi-char fold.
7073                 *
7074                 * If we can't split between s and ender, we work backwards
7075                 * character-by-character down to s0.  At each current point
7076                 * see if we are at the beginning of a multi-char fold.  If so,
7077                 * that means we would be splitting the fold across nodes, and
7078                 * so we back up one and try again.
7079                 *
7080                 * If we're not at the beginning, we still could be at the
7081                 * final two characters of a (rare) three character fold.  We
7082                 * check if the sequence starting at the character before the
7083                 * current position (and including the current and next
7084                 * characters) is a three character fold.  If not, the node can
7085                 * be split here.  If it is, we have to backup two characters
7086                 * and try again.
7087                 *
7088                 * Otherwise, the node can be split at the current position.
7089                 *
7090                 * The same logic is used for UTF-8 patterns and not */
7091                if (UTF) {
7092                    Size_t added_len;
7093
7094                    /* Append the fold of ender */
7095                    (void) _to_uni_fold_flags(
7096                        ender,
7097                        (U8 *) e,
7098                        &added_len,
7099                        FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7100                                        ? FOLD_FLAGS_NOMIX_ASCII
7101                                        : 0));
7102                    e += added_len;
7103
7104                    /* 's' and the character folded to by ender may be the
7105                     * first two of a three-character fold, in which case the
7106                     * node should not be split here.  That may mean examining
7107                     * the so-far unparsed character starting at 'p'.  But if
7108                     * ender folded to more than one character, we already have
7109                     * three characters to look at.  Also, we first check if
7110                     * the sequence consisting of s and the next character form
7111                     * the first two of some three character fold.  If not,
7112                     * there's no need to peek ahead. */
7113                    if (   added_len <= UTF8SKIP(e - added_len)
7114                        && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
7115                    {
7116                        /* Here, the two do form the beginning of a potential
7117                         * three character fold.  The unexamined character may
7118                         * or may not complete it.  Peek at it.  It might be
7119                         * something that ends the node or an escape sequence,
7120                         * in which case we don't know without a lot of work
7121                         * what it evaluates to, so we have to assume the worst
7122                         * case: that it does complete the fold, and so we
7123                         * can't split here.  All such instances  will have
7124                         * that character be an ASCII punctuation character,
7125                         * like a backslash.  So, for that case, backup one and
7126                         * drop down to try at that position */
7127                        if (isPUNCT(*p)) {
7128                            s = (char *) utf8_hop_back((U8 *) s, -1,
7129                                       (U8 *) s_start);
7130                            backed_up = TRUE;
7131                        }
7132                        else {
7133                            /* Here, since it's not punctuation, it must be a
7134                             * real character, and we can append its fold to
7135                             * 'e' (having deliberately reserved enough space
7136                             * for this eventuality) and drop down to check if
7137                             * the three actually do form a folded sequence */
7138                            (void) _to_utf8_fold_flags(
7139                                (U8 *) p, (U8 *) RExC_end,
7140                                (U8 *) e,
7141                                &added_len,
7142                                FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7143                                                ? FOLD_FLAGS_NOMIX_ASCII
7144                                                : 0));
7145                            e += added_len;
7146                        }
7147                    }
7148
7149                    /* Here, we either have three characters available in
7150                     * sequence starting at 's', or we have two characters and
7151                     * know that the following one can't possibly be part of a
7152                     * three character fold.  We go through the node backwards
7153                     * until we find a place where we can split it without
7154                     * breaking apart a multi-character fold.  At any given
7155                     * point we have to worry about if such a fold begins at
7156                     * the current 's', and also if a three-character fold
7157                     * begins at s-1, (containing s and s+1).  Splitting in
7158                     * either case would break apart a fold */
7159                    do {
7160                        char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
7161                                                            (U8 *) s_start);
7162
7163                        /* If is a multi-char fold, can't split here.  Backup
7164                         * one char and try again */
7165                        if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
7166                            s = prev_s;
7167                            backed_up = TRUE;
7168                            continue;
7169                        }
7170
7171                        /* If the two characters beginning at 's' are part of a
7172                         * three character fold starting at the character
7173                         * before s, we can't split either before or after s.
7174                         * Backup two chars and try again */
7175                        if (   LIKELY(s > s_start)
7176                            && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
7177                        {
7178                            s = prev_s;
7179                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
7180                            backed_up = TRUE;
7181                            continue;
7182                        }
7183
7184                        /* Here there's no multi-char fold between s and the
7185                         * next character following it.  We can split */
7186                        splittable = TRUE;
7187                        break;
7188
7189                    } while (s > s_start); /* End of loops backing up through the node */
7190
7191                    /* Here we either couldn't find a place to split the node,
7192                     * or else we broke out of the loop setting 'splittable' to
7193                     * true.  In the latter case, the place to split is between
7194                     * the first and second characters in the sequence starting
7195                     * at 's' */
7196                    if (splittable) {
7197                        s += UTF8SKIP(s);
7198                    }
7199                }
7200                else {  /* Pattern not UTF-8 */
7201                    if (   ender != LATIN_SMALL_LETTER_SHARP_S
7202                        || ASCII_FOLD_RESTRICTED)
7203                    {
7204                        assert( toLOWER_L1(ender) < 256 );
7205                        *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7206                    }
7207                    else {
7208                        *e++ = 's';
7209                        *e++ = 's';
7210                    }
7211
7212                    if (   e - s  <= 1
7213                        && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
7214                    {
7215                        if (isPUNCT(*p)) {
7216                            s--;
7217                            backed_up = TRUE;
7218                        }
7219                        else {
7220                            if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
7221                                || ASCII_FOLD_RESTRICTED)
7222                            {
7223                                assert( toLOWER_L1(ender) < 256 );
7224                                *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7225                            }
7226                            else {
7227                                *e++ = 's';
7228                                *e++ = 's';
7229                            }
7230                        }
7231                    }
7232
7233                    do {
7234                        if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
7235                            s--;
7236                            backed_up = TRUE;
7237                            continue;
7238                        }
7239
7240                        if (   LIKELY(s > s_start)
7241                            && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
7242                        {
7243                            s -= 2;
7244                            backed_up = TRUE;
7245                            continue;
7246                        }
7247
7248                        splittable = TRUE;
7249                        break;
7250
7251                    } while (s > s_start);
7252
7253                    if (splittable) {
7254                        s++;
7255                    }
7256                }
7257
7258                /* Here, we are done backing up.  If we didn't backup at all
7259                 * (the likely case), just proceed */
7260                if (backed_up) {
7261
7262                   /* If we did find a place to split, reparse the entire node
7263                    * stopping where we have calculated. */
7264                    if (splittable) {
7265
7266                       /* If we created a temporary folded string under /l, we
7267                        * have to map that back to the original */
7268                        if (need_to_fold_loc) {
7269                            upper_fill = loc_correspondence[s - s_start];
7270                            if (upper_fill == 0) {
7271                                FAIL2("panic: loc_correspondence[%d] is 0",
7272                                      (int) (s - s_start));
7273                            }
7274                            Safefree(locfold_buf);
7275                            Safefree(loc_correspondence);
7276                        }
7277                        else {
7278                            upper_fill = s - s0;
7279                        }
7280                        goto reparse;
7281                    }
7282
7283                    /* Here the node consists entirely of non-final multi-char
7284                     * folds.  (Likely it is all 'f's or all 's's.)  There's no
7285                     * decent place to split it, so give up and just take the
7286                     * whole thing */
7287                    len = old_s - s0;
7288                }
7289
7290                if (need_to_fold_loc) {
7291                    Safefree(locfold_buf);
7292                    Safefree(loc_correspondence);
7293                }
7294            }   /* End of verifying node ends with an appropriate char */
7295
7296            /* We need to start the next node at the character that didn't fit
7297             * in this one */
7298            p = oldp;
7299
7300          loopdone:   /* Jumped to when encounters something that shouldn't be
7301                         in the node */
7302
7303            /* Free up any over-allocated space; cast is to silence bogus
7304             * warning in MS VC */
7305            change_engine_size(pRExC_state,
7306                        - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
7307
7308            /* I (khw) don't know if you can get here with zero length, but the
7309             * old code handled this situation by creating a zero-length EXACT
7310             * node.  Might as well be NOTHING instead */
7311            if (len == 0) {
7312                OP(REGNODE_p(ret)) = NOTHING;
7313            }
7314            else {
7315
7316                /* If the node type is EXACT here, check to see if it
7317                 * should be EXACTL, or EXACT_REQ8. */
7318                if (node_type == EXACT) {
7319                    if (LOC) {
7320                        node_type = EXACTL;
7321                    }
7322                    else if (requires_utf8_target) {
7323                        node_type = EXACT_REQ8;
7324                    }
7325                }
7326                else if (node_type == LEXACT) {
7327                    if (requires_utf8_target) {
7328                        node_type = LEXACT_REQ8;
7329                    }
7330                }
7331                else if (FOLD) {
7332                    if (    UNLIKELY(has_micro_sign || has_ss)
7333                        && (node_type == EXACTFU || (   node_type == EXACTF
7334                                                     && maybe_exactfu)))
7335                    {   /* These two conditions are problematic in non-UTF-8
7336                           EXACTFU nodes. */
7337                        assert(! UTF);
7338                        node_type = EXACTFUP;
7339                    }
7340                    else if (node_type == EXACTFL) {
7341
7342                        /* 'maybe_exactfu' is deliberately set above to
7343                         * indicate this node type, where all code points in it
7344                         * are above 255 */
7345                        if (maybe_exactfu) {
7346                            node_type = EXACTFLU8;
7347                        }
7348                        else if (UNLIKELY(
7349                             _invlist_contains_cp(PL_HasMultiCharFold, ender)))
7350                        {
7351                            /* A character that folds to more than one will
7352                             * match multiple characters, so can't be SIMPLE.
7353                             * We don't have to worry about this with EXACTFLU8
7354                             * nodes just above, as they have already been
7355                             * folded (since the fold doesn't vary at run
7356                             * time).  Here, if the final character in the node
7357                             * folds to multiple, it can't be simple.  (This
7358                             * only has an effect if the node has only a single
7359                             * character, hence the final one, as elsewhere we
7360                             * turn off simple for nodes whose length > 1 */
7361                            maybe_SIMPLE = 0;
7362                        }
7363                    }
7364                    else if (node_type == EXACTF) {  /* Means is /di */
7365
7366                        /* This intermediate variable is needed solely because
7367                         * the asserts in the macro where used exceed Win32's
7368                         * literal string capacity */
7369                        char first_char = * STRING(REGNODE_p(ret));
7370
7371                        /* If 'maybe_exactfu' is clear, then we need to stay
7372                         * /di.  If it is set, it means there are no code
7373                         * points that match differently depending on UTF8ness
7374                         * of the target string, so it can become an EXACTFU
7375                         * node */
7376                        if (! maybe_exactfu) {
7377                            RExC_seen_d_op = TRUE;
7378                        }
7379                        else if (   isALPHA_FOLD_EQ(first_char, 's')
7380                                 || isALPHA_FOLD_EQ(ender, 's'))
7381                        {
7382                            /* But, if the node begins or ends in an 's' we
7383                             * have to defer changing it into an EXACTFU, as
7384                             * the node could later get joined with another one
7385                             * that ends or begins with 's' creating an 'ss'
7386                             * sequence which would then wrongly match the
7387                             * sharp s without the target being UTF-8.  We
7388                             * create a special node that we resolve later when
7389                             * we join nodes together */
7390
7391                            node_type = EXACTFU_S_EDGE;
7392                        }
7393                        else {
7394                            node_type = EXACTFU;
7395                        }
7396                    }
7397
7398                    if (requires_utf8_target && node_type == EXACTFU) {
7399                        node_type = EXACTFU_REQ8;
7400                    }
7401                }
7402
7403                OP(REGNODE_p(ret)) = node_type;
7404                setSTR_LEN(REGNODE_p(ret), len);
7405                RExC_emit += STR_SZ(len);
7406
7407                /* If the node isn't a single character, it can't be SIMPLE */
7408                if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
7409                    maybe_SIMPLE = 0;
7410                }
7411
7412                *flagp |= HASWIDTH | maybe_SIMPLE;
7413            }
7414
7415            RExC_parse_set(p);
7416
7417            {
7418                /* len is STRLEN which is unsigned, need to copy to signed */
7419                IV iv = len;
7420                if (iv < 0)
7421                    vFAIL("Internal disaster");
7422            }
7423
7424        } /* End of label 'defchar:' */
7425        break;
7426    } /* End of giant switch on input character */
7427
7428    /* Position parse to next real character */
7429    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
7430                                            FALSE /* Don't force to /x */ );
7431    if (   *RExC_parse == '{'
7432        && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
7433    {
7434        if (RExC_strict) {
7435            RExC_parse_inc_by(1);
7436            vFAIL("Unescaped left brace in regex is illegal here");
7437        }
7438        ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
7439                                  " passed through");
7440    }
7441
7442    return(ret);
7443}
7444
7445
7446void
7447Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
7448{
7449    /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
7450     * sets up the bitmap and any flags, removing those code points from the
7451     * inversion list, setting it to NULL should it become completely empty */
7452
7453
7454    PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
7455
7456    /* There is no bitmap for this node type */
7457    if (REGNODE_TYPE(OP(node))  != ANYOF) {
7458        return;
7459    }
7460
7461    ANYOF_BITMAP_ZERO(node);
7462    if (*invlist_ptr) {
7463
7464        /* This gets set if we actually need to modify things */
7465        bool change_invlist = FALSE;
7466
7467        UV start, end;
7468
7469        /* Start looking through *invlist_ptr */
7470        invlist_iterinit(*invlist_ptr);
7471        while (invlist_iternext(*invlist_ptr, &start, &end)) {
7472            UV high;
7473            int i;
7474
7475            /* Quit if are above what we should change */
7476            if (start >= NUM_ANYOF_CODE_POINTS) {
7477                break;
7478            }
7479
7480            change_invlist = TRUE;
7481
7482            /* Set all the bits in the range, up to the max that we are doing */
7483            high = (end < NUM_ANYOF_CODE_POINTS - 1)
7484                   ? end
7485                   : NUM_ANYOF_CODE_POINTS - 1;
7486            for (i = start; i <= (int) high; i++) {
7487                ANYOF_BITMAP_SET(node, i);
7488            }
7489        }
7490        invlist_iterfinish(*invlist_ptr);
7491
7492        /* Done with loop; remove any code points that are in the bitmap from
7493         * *invlist_ptr */
7494        if (change_invlist) {
7495            _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
7496        }
7497
7498        /* If have completely emptied it, remove it completely */
7499        if (_invlist_len(*invlist_ptr) == 0) {
7500            SvREFCNT_dec_NN(*invlist_ptr);
7501            *invlist_ptr = NULL;
7502        }
7503    }
7504}
7505
7506/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7507   Character classes ([:foo:]) can also be negated ([:^foo:]).
7508   Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7509   Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7510   but trigger failures because they are currently unimplemented. */
7511
7512#define POSIXCC_DONE(c)   ((c) == ':')
7513#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7514#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7515#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
7516
7517#define WARNING_PREFIX              "Assuming NOT a POSIX class since "
7518#define NO_BLANKS_POSIX_WARNING     "no blanks are allowed in one"
7519#define SEMI_COLON_POSIX_WARNING    "a semi-colon was found instead of a colon"
7520
7521#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
7522
7523/* 'posix_warnings' and 'warn_text' are names of variables in the following
7524 * routine. q.v. */
7525#define ADD_POSIX_WARNING(p, text)  STMT_START {                            \
7526        if (posix_warnings) {                                               \
7527            if (! RExC_warn_text ) RExC_warn_text =                         \
7528                                         (AV *) sv_2mortal((SV *) newAV()); \
7529            av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_                     \
7530                                             WARNING_PREFIX                 \
7531                                             text                           \
7532                                             REPORT_LOCATION,               \
7533                                             REPORT_LOCATION_ARGS(p)));     \
7534        }                                                                   \
7535    } STMT_END
7536#define CLEAR_POSIX_WARNINGS()                                              \
7537    STMT_START {                                                            \
7538        if (posix_warnings && RExC_warn_text)                               \
7539            av_clear(RExC_warn_text);                                       \
7540    } STMT_END
7541
7542#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret)                                \
7543    STMT_START {                                                            \
7544        CLEAR_POSIX_WARNINGS();                                             \
7545        return ret;                                                         \
7546    } STMT_END
7547
7548STATIC int
7549S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
7550
7551    const char * const s,      /* Where the putative posix class begins.
7552                                  Normally, this is one past the '['.  This
7553                                  parameter exists so it can be somewhere
7554                                  besides RExC_parse. */
7555    char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
7556                                  NULL */
7557    AV ** posix_warnings,      /* Where to place any generated warnings, or
7558                                  NULL */
7559    const bool check_only      /* Don't die if error */
7560)
7561{
7562    /* This parses what the caller thinks may be one of the three POSIX
7563     * constructs:
7564     *  1) a character class, like [:blank:]
7565     *  2) a collating symbol, like [. .]
7566     *  3) an equivalence class, like [= =]
7567     * In the latter two cases, it croaks if it finds a syntactically legal
7568     * one, as these are not handled by Perl.
7569     *
7570     * The main purpose is to look for a POSIX character class.  It returns:
7571     *  a) the class number
7572     *      if it is a completely syntactically and semantically legal class.
7573     *      'updated_parse_ptr', if not NULL, is set to point to just after the
7574     *      closing ']' of the class
7575     *  b) OOB_NAMEDCLASS
7576     *      if it appears that one of the three POSIX constructs was meant, but
7577     *      its specification was somehow defective.  'updated_parse_ptr', if
7578     *      not NULL, is set to point to the character just after the end
7579     *      character of the class.  See below for handling of warnings.
7580     *  c) NOT_MEANT_TO_BE_A_POSIX_CLASS
7581     *      if it  doesn't appear that a POSIX construct was intended.
7582     *      'updated_parse_ptr' is not changed.  No warnings nor errors are
7583     *      raised.
7584     *
7585     * In b) there may be errors or warnings generated.  If 'check_only' is
7586     * TRUE, then any errors are discarded.  Warnings are returned to the
7587     * caller via an AV* created into '*posix_warnings' if it is not NULL.  If
7588     * instead it is NULL, warnings are suppressed.
7589     *
7590     * The reason for this function, and its complexity is that a bracketed
7591     * character class can contain just about anything.  But it's easy to
7592     * mistype the very specific posix class syntax but yielding a valid
7593     * regular bracketed class, so it silently gets compiled into something
7594     * quite unintended.
7595     *
7596     * The solution adopted here maintains backward compatibility except that
7597     * it adds a warning if it looks like a posix class was intended but
7598     * improperly specified.  The warning is not raised unless what is input
7599     * very closely resembles one of the 14 legal posix classes.  To do this,
7600     * it uses fuzzy parsing.  It calculates how many single-character edits it
7601     * would take to transform what was input into a legal posix class.  Only
7602     * if that number is quite small does it think that the intention was a
7603     * posix class.  Obviously these are heuristics, and there will be cases
7604     * where it errs on one side or another, and they can be tweaked as
7605     * experience informs.
7606     *
7607     * The syntax for a legal posix class is:
7608     *
7609     * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
7610     *
7611     * What this routine considers syntactically to be an intended posix class
7612     * is this (the comments indicate some restrictions that the pattern
7613     * doesn't show):
7614     *
7615     *  qr/(?x: \[?                         # The left bracket, possibly
7616     *                                      # omitted
7617     *          \h*                         # possibly followed by blanks
7618     *          (?: \^ \h* )?               # possibly a misplaced caret
7619     *          [:;]?                       # The opening class character,
7620     *                                      # possibly omitted.  A typo
7621     *                                      # semi-colon can also be used.
7622     *          \h*
7623     *          \^?                         # possibly a correctly placed
7624     *                                      # caret, but not if there was also
7625     *                                      # a misplaced one
7626     *          \h*
7627     *          .{3,15}                     # The class name.  If there are
7628     *                                      # deviations from the legal syntax,
7629     *                                      # its edit distance must be close
7630     *                                      # to a real class name in order
7631     *                                      # for it to be considered to be
7632     *                                      # an intended posix class.
7633     *          \h*
7634     *          [[:punct:]]?                # The closing class character,
7635     *                                      # possibly omitted.  If not a colon
7636     *                                      # nor semi colon, the class name
7637     *                                      # must be even closer to a valid
7638     *                                      # one
7639     *          \h*
7640     *          \]?                         # The right bracket, possibly
7641     *                                      # omitted.
7642     *     )/
7643     *
7644     * In the above, \h must be ASCII-only.
7645     *
7646     * These are heuristics, and can be tweaked as field experience dictates.
7647     * There will be cases when someone didn't intend to specify a posix class
7648     * that this warns as being so.  The goal is to minimize these, while
7649     * maximizing the catching of things intended to be a posix class that
7650     * aren't parsed as such.
7651     */
7652
7653    const char* p             = s;
7654    const char * const e      = RExC_end;
7655    unsigned complement       = 0;      /* If to complement the class */
7656    bool found_problem        = FALSE;  /* Assume OK until proven otherwise */
7657    bool has_opening_bracket  = FALSE;
7658    bool has_opening_colon    = FALSE;
7659    int class_number          = OOB_NAMEDCLASS; /* Out-of-bounds until find
7660                                                   valid class */
7661    const char * possible_end = NULL;   /* used for a 2nd parse pass */
7662    const char* name_start;             /* ptr to class name first char */
7663
7664    /* If the number of single-character typos the input name is away from a
7665     * legal name is no more than this number, it is considered to have meant
7666     * the legal name */
7667    int max_distance          = 2;
7668
7669    /* to store the name.  The size determines the maximum length before we
7670     * decide that no posix class was intended.  Should be at least
7671     * sizeof("alphanumeric") */
7672    UV input_text[15];
7673    STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
7674
7675    PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
7676
7677    CLEAR_POSIX_WARNINGS();
7678
7679    if (p >= e) {
7680        return NOT_MEANT_TO_BE_A_POSIX_CLASS;
7681    }
7682
7683    if (*(p - 1) != '[') {
7684        ADD_POSIX_WARNING(p, "it doesn't start with a '['");
7685        found_problem = TRUE;
7686    }
7687    else {
7688        has_opening_bracket = TRUE;
7689    }
7690
7691    /* They could be confused and think you can put spaces between the
7692     * components */
7693    if (isBLANK(*p)) {
7694        found_problem = TRUE;
7695
7696        do {
7697            p++;
7698        } while (p < e && isBLANK(*p));
7699
7700        ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7701    }
7702
7703    /* For [. .] and [= =].  These are quite different internally from [: :],
7704     * so they are handled separately.  */
7705    if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
7706                                            and 1 for at least one char in it
7707                                          */
7708    {
7709        const char open_char  = *p;
7710        const char * temp_ptr = p + 1;
7711
7712        /* These two constructs are not handled by perl, and if we find a
7713         * syntactically valid one, we croak.  khw, who wrote this code, finds
7714         * this explanation of them very unclear:
7715         * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
7716         * And searching the rest of the internet wasn't very helpful either.
7717         * It looks like just about any byte can be in these constructs,
7718         * depending on the locale.  But unless the pattern is being compiled
7719         * under /l, which is very rare, Perl runs under the C or POSIX locale.
7720         * In that case, it looks like [= =] isn't allowed at all, and that
7721         * [. .] could be any single code point, but for longer strings the
7722         * constituent characters would have to be the ASCII alphabetics plus
7723         * the minus-hyphen.  Any sensible locale definition would limit itself
7724         * to these.  And any portable one definitely should.  Trying to parse
7725         * the general case is a nightmare (see [perl #127604]).  So, this code
7726         * looks only for interiors of these constructs that match:
7727         *      qr/.|[-\w]{2,}/
7728         * Using \w relaxes the apparent rules a little, without adding much
7729         * danger of mistaking something else for one of these constructs.
7730         *
7731         * [. .] in some implementations described on the internet is usable to
7732         * escape a character that otherwise is special in bracketed character
7733         * classes.  For example [.].] means a literal right bracket instead of
7734         * the ending of the class
7735         *
7736         * [= =] can legitimately contain a [. .] construct, but we don't
7737         * handle this case, as that [. .] construct will later get parsed
7738         * itself and croak then.  And [= =] is checked for even when not under
7739         * /l, as Perl has long done so.
7740         *
7741         * The code below relies on there being a trailing NUL, so it doesn't
7742         * have to keep checking if the parse ptr < e.
7743         */
7744        if (temp_ptr[1] == open_char) {
7745            temp_ptr++;
7746        }
7747        else while (    temp_ptr < e
7748                    && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
7749        {
7750            temp_ptr++;
7751        }
7752
7753        if (*temp_ptr == open_char) {
7754            temp_ptr++;
7755            if (*temp_ptr == ']') {
7756                temp_ptr++;
7757                if (! found_problem && ! check_only) {
7758                    RExC_parse_set((char *) temp_ptr);
7759                    vFAIL3("POSIX syntax [%c %c] is reserved for future "
7760                            "extensions", open_char, open_char);
7761                }
7762
7763                /* Here, the syntax wasn't completely valid, or else the call
7764                 * is to check-only */
7765                if (updated_parse_ptr) {
7766                    *updated_parse_ptr = (char *) temp_ptr;
7767                }
7768
7769                CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
7770            }
7771        }
7772
7773        /* If we find something that started out to look like one of these
7774         * constructs, but isn't, we continue below so that it can be checked
7775         * for being a class name with a typo of '.' or '=' instead of a colon.
7776         * */
7777    }
7778
7779    /* Here, we think there is a possibility that a [: :] class was meant, and
7780     * we have the first real character.  It could be they think the '^' comes
7781     * first */
7782    if (*p == '^') {
7783        found_problem = TRUE;
7784        ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
7785        complement = 1;
7786        p++;
7787
7788        if (isBLANK(*p)) {
7789            found_problem = TRUE;
7790
7791            do {
7792                p++;
7793            } while (p < e && isBLANK(*p));
7794
7795            ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7796        }
7797    }
7798
7799    /* But the first character should be a colon, which they could have easily
7800     * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
7801     * distinguish from a colon, so treat that as a colon).  */
7802    if (*p == ':') {
7803        p++;
7804        has_opening_colon = TRUE;
7805    }
7806    else if (*p == ';') {
7807        found_problem = TRUE;
7808        p++;
7809        ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7810        has_opening_colon = TRUE;
7811    }
7812    else {
7813        found_problem = TRUE;
7814        ADD_POSIX_WARNING(p, "there must be a starting ':'");
7815
7816        /* Consider an initial punctuation (not one of the recognized ones) to
7817         * be a left terminator */
7818        if (*p != '^' && *p != ']' && isPUNCT(*p)) {
7819            p++;
7820        }
7821    }
7822
7823    /* They may think that you can put spaces between the components */
7824    if (isBLANK(*p)) {
7825        found_problem = TRUE;
7826
7827        do {
7828            p++;
7829        } while (p < e && isBLANK(*p));
7830
7831        ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7832    }
7833
7834    if (*p == '^') {
7835
7836        /* We consider something like [^:^alnum:]] to not have been intended to
7837         * be a posix class, but XXX maybe we should */
7838        if (complement) {
7839            CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7840        }
7841
7842        complement = 1;
7843        p++;
7844    }
7845
7846    /* Again, they may think that you can put spaces between the components */
7847    if (isBLANK(*p)) {
7848        found_problem = TRUE;
7849
7850        do {
7851            p++;
7852        } while (p < e && isBLANK(*p));
7853
7854        ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7855    }
7856
7857    if (*p == ']') {
7858
7859        /* XXX This ']' may be a typo, and something else was meant.  But
7860         * treating it as such creates enough complications, that that
7861         * possibility isn't currently considered here.  So we assume that the
7862         * ']' is what is intended, and if we've already found an initial '[',
7863         * this leaves this construct looking like [:] or [:^], which almost
7864         * certainly weren't intended to be posix classes */
7865        if (has_opening_bracket) {
7866            CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7867        }
7868
7869        /* But this function can be called when we parse the colon for
7870         * something like qr/[alpha:]]/, so we back up to look for the
7871         * beginning */
7872        p--;
7873
7874        if (*p == ';') {
7875            found_problem = TRUE;
7876            ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7877        }
7878        else if (*p != ':') {
7879
7880            /* XXX We are currently very restrictive here, so this code doesn't
7881             * consider the possibility that, say, /[alpha.]]/ was intended to
7882             * be a posix class. */
7883            CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7884        }
7885
7886        /* Here we have something like 'foo:]'.  There was no initial colon,
7887         * and we back up over 'foo.  XXX Unlike the going forward case, we
7888         * don't handle typos of non-word chars in the middle */
7889        has_opening_colon = FALSE;
7890        p--;
7891
7892        while (p > RExC_start && isWORDCHAR(*p)) {
7893            p--;
7894        }
7895        p++;
7896
7897        /* Here, we have positioned ourselves to where we think the first
7898         * character in the potential class is */
7899    }
7900
7901    /* Now the interior really starts.  There are certain key characters that
7902     * can end the interior, or these could just be typos.  To catch both
7903     * cases, we may have to do two passes.  In the first pass, we keep on
7904     * going unless we come to a sequence that matches
7905     *      qr/ [[:punct:]] [[:blank:]]* \] /xa
7906     * This means it takes a sequence to end the pass, so two typos in a row if
7907     * that wasn't what was intended.  If the class is perfectly formed, just
7908     * this one pass is needed.  We also stop if there are too many characters
7909     * being accumulated, but this number is deliberately set higher than any
7910     * real class.  It is set high enough so that someone who thinks that
7911     * 'alphanumeric' is a correct name would get warned that it wasn't.
7912     * While doing the pass, we keep track of where the key characters were in
7913     * it.  If we don't find an end to the class, and one of the key characters
7914     * was found, we redo the pass, but stop when we get to that character.
7915     * Thus the key character was considered a typo in the first pass, but a
7916     * terminator in the second.  If two key characters are found, we stop at
7917     * the second one in the first pass.  Again this can miss two typos, but
7918     * catches a single one
7919     *
7920     * In the first pass, 'possible_end' starts as NULL, and then gets set to
7921     * point to the first key character.  For the second pass, it starts as -1.
7922     * */
7923
7924    name_start = p;
7925  parse_name:
7926    {
7927        bool has_blank               = FALSE;
7928        bool has_upper               = FALSE;
7929        bool has_terminating_colon   = FALSE;
7930        bool has_terminating_bracket = FALSE;
7931        bool has_semi_colon          = FALSE;
7932        unsigned int name_len        = 0;
7933        int punct_count              = 0;
7934
7935        while (p < e) {
7936
7937            /* Squeeze out blanks when looking up the class name below */
7938            if (isBLANK(*p) ) {
7939                has_blank = TRUE;
7940                found_problem = TRUE;
7941                p++;
7942                continue;
7943            }
7944
7945            /* The name will end with a punctuation */
7946            if (isPUNCT(*p)) {
7947                const char * peek = p + 1;
7948
7949                /* Treat any non-']' punctuation followed by a ']' (possibly
7950                 * with intervening blanks) as trying to terminate the class.
7951                 * ']]' is very likely to mean a class was intended (but
7952                 * missing the colon), but the warning message that gets
7953                 * generated shows the error position better if we exit the
7954                 * loop at the bottom (eventually), so skip it here. */
7955                if (*p != ']') {
7956                    if (peek < e && isBLANK(*peek)) {
7957                        has_blank = TRUE;
7958                        found_problem = TRUE;
7959                        do {
7960                            peek++;
7961                        } while (peek < e && isBLANK(*peek));
7962                    }
7963
7964                    if (peek < e && *peek == ']') {
7965                        has_terminating_bracket = TRUE;
7966                        if (*p == ':') {
7967                            has_terminating_colon = TRUE;
7968                        }
7969                        else if (*p == ';') {
7970                            has_semi_colon = TRUE;
7971                            has_terminating_colon = TRUE;
7972                        }
7973                        else {
7974                            found_problem = TRUE;
7975                        }
7976                        p = peek + 1;
7977                        goto try_posix;
7978                    }
7979                }
7980
7981                /* Here we have punctuation we thought didn't end the class.
7982                 * Keep track of the position of the key characters that are
7983                 * more likely to have been class-enders */
7984                if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
7985
7986                    /* Allow just one such possible class-ender not actually
7987                     * ending the class. */
7988                    if (possible_end) {
7989                        break;
7990                    }
7991                    possible_end = p;
7992                }
7993
7994                /* If we have too many punctuation characters, no use in
7995                 * keeping going */
7996                if (++punct_count > max_distance) {
7997                    break;
7998                }
7999
8000                /* Treat the punctuation as a typo. */
8001                input_text[name_len++] = *p;
8002                p++;
8003            }
8004            else if (isUPPER(*p)) { /* Use lowercase for lookup */
8005                input_text[name_len++] = toLOWER(*p);
8006                has_upper = TRUE;
8007                found_problem = TRUE;
8008                p++;
8009            } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
8010                input_text[name_len++] = *p;
8011                p++;
8012            }
8013            else {
8014                input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
8015                p+= UTF8SKIP(p);
8016            }
8017
8018            /* The declaration of 'input_text' is how long we allow a potential
8019             * class name to be, before saying they didn't mean a class name at
8020             * all */
8021            if (name_len >= C_ARRAY_LENGTH(input_text)) {
8022                break;
8023            }
8024        }
8025
8026        /* We get to here when the possible class name hasn't been properly
8027         * terminated before:
8028         *   1) we ran off the end of the pattern; or
8029         *   2) found two characters, each of which might have been intended to
8030         *      be the name's terminator
8031         *   3) found so many punctuation characters in the purported name,
8032         *      that the edit distance to a valid one is exceeded
8033         *   4) we decided it was more characters than anyone could have
8034         *      intended to be one. */
8035
8036        found_problem = TRUE;
8037
8038        /* In the final two cases, we know that looking up what we've
8039         * accumulated won't lead to a match, even a fuzzy one. */
8040        if (   name_len >= C_ARRAY_LENGTH(input_text)
8041            || punct_count > max_distance)
8042        {
8043            /* If there was an intermediate key character that could have been
8044             * an intended end, redo the parse, but stop there */
8045            if (possible_end && possible_end != (char *) -1) {
8046                possible_end = (char *) -1; /* Special signal value to say
8047                                               we've done a first pass */
8048                p = name_start;
8049                goto parse_name;
8050            }
8051
8052            /* Otherwise, it can't have meant to have been a class */
8053            CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8054        }
8055
8056        /* If we ran off the end, and the final character was a punctuation
8057         * one, back up one, to look at that final one just below.  Later, we
8058         * will restore the parse pointer if appropriate */
8059        if (name_len && p == e && isPUNCT(*(p-1))) {
8060            p--;
8061            name_len--;
8062        }
8063
8064        if (p < e && isPUNCT(*p)) {
8065            if (*p == ']') {
8066                has_terminating_bracket = TRUE;
8067
8068                /* If this is a 2nd ']', and the first one is just below this
8069                 * one, consider that to be the real terminator.  This gives a
8070                 * uniform and better positioning for the warning message  */
8071                if (   possible_end
8072                    && possible_end != (char *) -1
8073                    && *possible_end == ']'
8074                    && name_len && input_text[name_len - 1] == ']')
8075                {
8076                    name_len--;
8077                    p = possible_end;
8078
8079                    /* And this is actually equivalent to having done the 2nd
8080                     * pass now, so set it to not try again */
8081                    possible_end = (char *) -1;
8082                }
8083            }
8084            else {
8085                if (*p == ':') {
8086                    has_terminating_colon = TRUE;
8087                }
8088                else if (*p == ';') {
8089                    has_semi_colon = TRUE;
8090                    has_terminating_colon = TRUE;
8091                }
8092                p++;
8093            }
8094        }
8095
8096    try_posix:
8097
8098        /* Here, we have a class name to look up.  We can short circuit the
8099         * stuff below for short names that can't possibly be meant to be a
8100         * class name.  (We can do this on the first pass, as any second pass
8101         * will yield an even shorter name) */
8102        if (name_len < 3) {
8103            CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8104        }
8105
8106        /* Find which class it is.  Initially switch on the length of the name.
8107         * */
8108        switch (name_len) {
8109            case 4:
8110                if (memEQs(name_start, 4, "word")) {
8111                    /* this is not POSIX, this is the Perl \w */
8112                    class_number = ANYOF_WORDCHAR;
8113                }
8114                break;
8115            case 5:
8116                /* Names all of length 5: alnum alpha ascii blank cntrl digit
8117                 *                        graph lower print punct space upper
8118                 * Offset 4 gives the best switch position.  */
8119                switch (name_start[4]) {
8120                    case 'a':
8121                        if (memBEGINs(name_start, 5, "alph")) /* alpha */
8122                            class_number = ANYOF_ALPHA;
8123                        break;
8124                    case 'e':
8125                        if (memBEGINs(name_start, 5, "spac")) /* space */
8126                            class_number = ANYOF_SPACE;
8127                        break;
8128                    case 'h':
8129                        if (memBEGINs(name_start, 5, "grap")) /* graph */
8130                            class_number = ANYOF_GRAPH;
8131                        break;
8132                    case 'i':
8133                        if (memBEGINs(name_start, 5, "asci")) /* ascii */
8134                            class_number = ANYOF_ASCII;
8135                        break;
8136                    case 'k':
8137                        if (memBEGINs(name_start, 5, "blan")) /* blank */
8138                            class_number = ANYOF_BLANK;
8139                        break;
8140                    case 'l':
8141                        if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
8142                            class_number = ANYOF_CNTRL;
8143                        break;
8144                    case 'm':
8145                        if (memBEGINs(name_start, 5, "alnu")) /* alnum */
8146                            class_number = ANYOF_ALPHANUMERIC;
8147                        break;
8148                    case 'r':
8149                        if (memBEGINs(name_start, 5, "lowe")) /* lower */
8150                            class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
8151                        else if (memBEGINs(name_start, 5, "uppe")) /* upper */
8152                            class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
8153                        break;
8154                    case 't':
8155                        if (memBEGINs(name_start, 5, "digi")) /* digit */
8156                            class_number = ANYOF_DIGIT;
8157                        else if (memBEGINs(name_start, 5, "prin")) /* print */
8158                            class_number = ANYOF_PRINT;
8159                        else if (memBEGINs(name_start, 5, "punc")) /* punct */
8160                            class_number = ANYOF_PUNCT;
8161                        break;
8162                }
8163                break;
8164            case 6:
8165                if (memEQs(name_start, 6, "xdigit"))
8166                    class_number = ANYOF_XDIGIT;
8167                break;
8168        }
8169
8170        /* If the name exactly matches a posix class name the class number will
8171         * here be set to it, and the input almost certainly was meant to be a
8172         * posix class, so we can skip further checking.  If instead the syntax
8173         * is exactly correct, but the name isn't one of the legal ones, we
8174         * will return that as an error below.  But if neither of these apply,
8175         * it could be that no posix class was intended at all, or that one
8176         * was, but there was a typo.  We tease these apart by doing fuzzy
8177         * matching on the name */
8178        if (class_number == OOB_NAMEDCLASS && found_problem) {
8179            const UV posix_names[][6] = {
8180                                                { 'a', 'l', 'n', 'u', 'm' },
8181                                                { 'a', 'l', 'p', 'h', 'a' },
8182                                                { 'a', 's', 'c', 'i', 'i' },
8183                                                { 'b', 'l', 'a', 'n', 'k' },
8184                                                { 'c', 'n', 't', 'r', 'l' },
8185                                                { 'd', 'i', 'g', 'i', 't' },
8186                                                { 'g', 'r', 'a', 'p', 'h' },
8187                                                { 'l', 'o', 'w', 'e', 'r' },
8188                                                { 'p', 'r', 'i', 'n', 't' },
8189                                                { 'p', 'u', 'n', 'c', 't' },
8190                                                { 's', 'p', 'a', 'c', 'e' },
8191                                                { 'u', 'p', 'p', 'e', 'r' },
8192                                                { 'w', 'o', 'r', 'd' },
8193                                                { 'x', 'd', 'i', 'g', 'i', 't' }
8194                                            };
8195            /* The names of the above all have added NULs to make them the same
8196             * size, so we need to also have the real lengths */
8197            const UV posix_name_lengths[] = {
8198                                                sizeof("alnum") - 1,
8199                                                sizeof("alpha") - 1,
8200                                                sizeof("ascii") - 1,
8201                                                sizeof("blank") - 1,
8202                                                sizeof("cntrl") - 1,
8203                                                sizeof("digit") - 1,
8204                                                sizeof("graph") - 1,
8205                                                sizeof("lower") - 1,
8206                                                sizeof("print") - 1,
8207                                                sizeof("punct") - 1,
8208                                                sizeof("space") - 1,
8209                                                sizeof("upper") - 1,
8210                                                sizeof("word")  - 1,
8211                                                sizeof("xdigit")- 1
8212                                            };
8213            unsigned int i;
8214            int temp_max = max_distance;    /* Use a temporary, so if we
8215                                               reparse, we haven't changed the
8216                                               outer one */
8217
8218            /* Use a smaller max edit distance if we are missing one of the
8219             * delimiters */
8220            if (   has_opening_bracket + has_opening_colon < 2
8221                || has_terminating_bracket + has_terminating_colon < 2)
8222            {
8223                temp_max--;
8224            }
8225
8226            /* See if the input name is close to a legal one */
8227            for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
8228
8229                /* Short circuit call if the lengths are too far apart to be
8230                 * able to match */
8231                if (abs( (int) (name_len - posix_name_lengths[i]))
8232                    > temp_max)
8233                {
8234                    continue;
8235                }
8236
8237                if (edit_distance(input_text,
8238                                  posix_names[i],
8239                                  name_len,
8240                                  posix_name_lengths[i],
8241                                  temp_max
8242                                 )
8243                    > -1)
8244                { /* If it is close, it probably was intended to be a class */
8245                    goto probably_meant_to_be;
8246                }
8247            }
8248
8249            /* Here the input name is not close enough to a valid class name
8250             * for us to consider it to be intended to be a posix class.  If
8251             * we haven't already done so, and the parse found a character that
8252             * could have been terminators for the name, but which we absorbed
8253             * as typos during the first pass, repeat the parse, signalling it
8254             * to stop at that character */
8255            if (possible_end && possible_end != (char *) -1) {
8256                possible_end = (char *) -1;
8257                p = name_start;
8258                goto parse_name;
8259            }
8260
8261            /* Here neither pass found a close-enough class name */
8262            CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8263        }
8264
8265    probably_meant_to_be:
8266
8267        /* Here we think that a posix specification was intended.  Update any
8268         * parse pointer */
8269        if (updated_parse_ptr) {
8270            *updated_parse_ptr = (char *) p;
8271        }
8272
8273        /* If a posix class name was intended but incorrectly specified, we
8274         * output or return the warnings */
8275        if (found_problem) {
8276
8277            /* We set flags for these issues in the parse loop above instead of
8278             * adding them to the list of warnings, because we can parse it
8279             * twice, and we only want one warning instance */
8280            if (has_upper) {
8281                ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
8282            }
8283            if (has_blank) {
8284                ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
8285            }
8286            if (has_semi_colon) {
8287                ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
8288            }
8289            else if (! has_terminating_colon) {
8290                ADD_POSIX_WARNING(p, "there is no terminating ':'");
8291            }
8292            if (! has_terminating_bracket) {
8293                ADD_POSIX_WARNING(p, "there is no terminating ']'");
8294            }
8295
8296            if (   posix_warnings
8297                && RExC_warn_text
8298                && av_count(RExC_warn_text) > 0)
8299            {
8300                *posix_warnings = RExC_warn_text;
8301            }
8302        }
8303        else if (class_number != OOB_NAMEDCLASS) {
8304            /* If it is a known class, return the class.  The class number
8305             * #defines are structured so each complement is +1 to the normal
8306             * one */
8307            CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
8308        }
8309        else if (! check_only) {
8310
8311            /* Here, it is an unrecognized class.  This is an error (unless the
8312            * call is to check only, which we've already handled above) */
8313            const char * const complement_string = (complement)
8314                                                   ? "^"
8315                                                   : "";
8316            RExC_parse_set((char *) p);
8317            vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
8318                        complement_string,
8319                        UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
8320        }
8321    }
8322
8323    return OOB_NAMEDCLASS;
8324}
8325#undef ADD_POSIX_WARNING
8326
8327STATIC unsigned  int
8328S_regex_set_precedence(const U8 my_operator) {
8329
8330    /* Returns the precedence in the (?[...]) construct of the input operator,
8331     * specified by its character representation.  The precedence follows
8332     * general Perl rules, but it extends this so that ')' and ']' have (low)
8333     * precedence even though they aren't really operators */
8334
8335    switch (my_operator) {
8336        case '!':
8337            return 5;
8338        case '&':
8339            return 4;
8340        case '^':
8341        case '|':
8342        case '+':
8343        case '-':
8344            return 3;
8345        case ')':
8346            return 2;
8347        case ']':
8348            return 1;
8349    }
8350
8351    NOT_REACHED; /* NOTREACHED */
8352    return 0;   /* Silence compiler warning */
8353}
8354
8355STATIC regnode_offset
8356S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
8357                    I32 *flagp, U32 depth)
8358{
8359    /* Handle the (?[...]) construct to do set operations */
8360
8361    U8 curchar;                     /* Current character being parsed */
8362    UV start, end;	            /* End points of code point ranges */
8363    SV* final = NULL;               /* The end result inversion list */
8364    SV* result_string;              /* 'final' stringified */
8365    AV* stack;                      /* stack of operators and operands not yet
8366                                       resolved */
8367    AV* fence_stack = NULL;         /* A stack containing the positions in
8368                                       'stack' of where the undealt-with left
8369                                       parens would be if they were actually
8370                                       put there */
8371    /* The 'volatile' is a workaround for an optimiser bug
8372     * in Solaris Studio 12.3. See RT #127455 */
8373    volatile IV fence = 0;          /* Position of where most recent undealt-
8374                                       with left paren in stack is; -1 if none.
8375                                     */
8376    STRLEN len;                     /* Temporary */
8377    regnode_offset node;            /* Temporary, and final regnode returned by
8378                                       this function */
8379    const bool save_fold = FOLD;    /* Temporary */
8380    char *save_end, *save_parse;    /* Temporaries */
8381    const bool in_locale = LOC;     /* we turn off /l during processing */
8382
8383    DECLARE_AND_GET_RE_DEBUG_FLAGS;
8384
8385    PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
8386
8387    DEBUG_PARSE("xcls");
8388
8389    if (in_locale) {
8390        set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
8391    }
8392
8393    /* The use of this operator implies /u.  This is required so that the
8394     * compile time values are valid in all runtime cases */
8395    REQUIRE_UNI_RULES(flagp, 0);
8396
8397    /* Everything in this construct is a metacharacter.  Operands begin with
8398     * either a '\' (for an escape sequence), or a '[' for a bracketed
8399     * character class.  Any other character should be an operator, or
8400     * parenthesis for grouping.  Both types of operands are handled by calling
8401     * regclass() to parse them.  It is called with a parameter to indicate to
8402     * return the computed inversion list.  The parsing here is implemented via
8403     * a stack.  Each entry on the stack is a single character representing one
8404     * of the operators; or else a pointer to an operand inversion list. */
8405
8406#define IS_OPERATOR(a) SvIOK(a)
8407#define IS_OPERAND(a)  (! IS_OPERATOR(a))
8408
8409    /* The stack is kept in ��ukasiewicz order.  (That's pronounced similar
8410     * to luke-a-shave-itch (or -itz), but people who didn't want to bother
8411     * with pronouncing it called it Reverse Polish instead, but now that YOU
8412     * know how to pronounce it you can use the correct term, thus giving due
8413     * credit to the person who invented it, and impressing your geek friends.
8414     * Wikipedia says that the pronunciation of "��" has been changing so that
8415     * it is now more like an English initial W (as in wonk) than an L.)
8416     *
8417     * This means that, for example, 'a | b & c' is stored on the stack as
8418     *
8419     * c  [4]
8420     * b  [3]
8421     * &  [2]
8422     * a  [1]
8423     * |  [0]
8424     *
8425     * where the numbers in brackets give the stack [array] element number.
8426     * In this implementation, parentheses are not stored on the stack.
8427     * Instead a '(' creates a "fence" so that the part of the stack below the
8428     * fence is invisible except to the corresponding ')' (this allows us to
8429     * replace testing for parens, by using instead subtraction of the fence
8430     * position).  As new operands are processed they are pushed onto the stack
8431     * (except as noted in the next paragraph).  New operators of higher
8432     * precedence than the current final one are inserted on the stack before
8433     * the lhs operand (so that when the rhs is pushed next, everything will be
8434     * in the correct positions shown above.  When an operator of equal or
8435     * lower precedence is encountered in parsing, all the stacked operations
8436     * of equal or higher precedence are evaluated, leaving the result as the
8437     * top entry on the stack.  This makes higher precedence operations
8438     * evaluate before lower precedence ones, and causes operations of equal
8439     * precedence to left associate.
8440     *
8441     * The only unary operator '!' is immediately pushed onto the stack when
8442     * encountered.  When an operand is encountered, if the top of the stack is
8443     * a '!", the complement is immediately performed, and the '!' popped.  The
8444     * resulting value is treated as a new operand, and the logic in the
8445     * previous paragraph is executed.  Thus in the expression
8446     *      [a] + ! [b]
8447     * the stack looks like
8448     *
8449     * !
8450     * a
8451     * +
8452     *
8453     * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
8454     * becomes
8455     *
8456     * !b
8457     * a
8458     * +
8459     *
8460     * A ')' is treated as an operator with lower precedence than all the
8461     * aforementioned ones, which causes all operations on the stack above the
8462     * corresponding '(' to be evaluated down to a single resultant operand.
8463     * Then the fence for the '(' is removed, and the operand goes through the
8464     * algorithm above, without the fence.
8465     *
8466     * A separate stack is kept of the fence positions, so that the position of
8467     * the latest so-far unbalanced '(' is at the top of it.
8468     *
8469     * The ']' ending the construct is treated as the lowest operator of all,
8470     * so that everything gets evaluated down to a single operand, which is the
8471     * result */
8472
8473    stack = (AV*)newSV_type_mortal(SVt_PVAV);
8474    fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
8475
8476    while (RExC_parse < RExC_end) {
8477        I32 top_index;              /* Index of top-most element in 'stack' */
8478        SV** top_ptr;               /* Pointer to top 'stack' element */
8479        SV* current = NULL;         /* To contain the current inversion list
8480                                       operand */
8481        SV* only_to_avoid_leaks;
8482
8483        skip_to_be_ignored_text(pRExC_state, &RExC_parse,
8484                                TRUE /* Force /x */ );
8485        if (RExC_parse >= RExC_end) {   /* Fail */
8486            break;
8487        }
8488
8489        curchar = UCHARAT(RExC_parse);
8490
8491redo_curchar:
8492
8493#ifdef ENABLE_REGEX_SETS_DEBUGGING
8494                    /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
8495        DEBUG_U(dump_regex_sets_structures(pRExC_state,
8496                                           stack, fence, fence_stack));
8497#endif
8498
8499        top_index = av_tindex_skip_len_mg(stack);
8500
8501        switch (curchar) {
8502            SV** stacked_ptr;       /* Ptr to something already on 'stack' */
8503            char stacked_operator;  /* The topmost operator on the 'stack'. */
8504            SV* lhs;                /* Operand to the left of the operator */
8505            SV* rhs;                /* Operand to the right of the operator */
8506            SV* fence_ptr;          /* Pointer to top element of the fence
8507                                       stack */
8508            case '(':
8509
8510                if (   RExC_parse < RExC_end - 2
8511                    && UCHARAT(RExC_parse + 1) == '?'
8512                    && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
8513                {
8514                    const regnode_offset orig_emit = RExC_emit;
8515                    SV * resultant_invlist;
8516
8517                    /* Here it could be an embedded '(?flags:(?[...])'.
8518                     * This happens when we have some thing like
8519                     *
8520                     *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
8521                     *   ...
8522                     *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
8523                     *
8524                     * Here we would be handling the interpolated
8525                     * '$thai_or_lao'.  We handle this by a recursive call to
8526                     * reg which returns the inversion list the
8527                     * interpolated expression evaluates to.  Actually, the
8528                     * return is a special regnode containing a pointer to that
8529                     * inversion list.  If the return isn't that regnode alone,
8530                     * we know that this wasn't such an interpolation, which is
8531                     * an error: we need to get a single inversion list back
8532                     * from the recursion */
8533
8534                    RExC_parse_inc_by(1);
8535                    RExC_sets_depth++;
8536
8537                    node = reg(pRExC_state, 2, flagp, depth+1);
8538                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
8539
8540                    if (   OP(REGNODE_p(node)) != REGEX_SET
8541                           /* If more than a single node returned, the nested
8542                            * parens evaluated to more than just a (?[...]),
8543                            * which isn't legal */
8544                        || RExC_emit != orig_emit
8545                                      + NODE_STEP_REGNODE
8546                                      + REGNODE_ARG_LEN(REGEX_SET))
8547                    {
8548                        vFAIL("Expecting interpolated extended charclass");
8549                    }
8550                    resultant_invlist = (SV *) ARGp(REGNODE_p(node));
8551                    current = invlist_clone(resultant_invlist, NULL);
8552                    SvREFCNT_dec(resultant_invlist);
8553
8554                    RExC_sets_depth--;
8555                    RExC_emit = orig_emit;
8556                    goto handle_operand;
8557                }
8558
8559                /* A regular '('.  Look behind for illegal syntax */
8560                if (top_index - fence >= 0) {
8561                    /* If the top entry on the stack is an operator, it had
8562                     * better be a '!', otherwise the entry below the top
8563                     * operand should be an operator */
8564                    if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
8565                        || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
8566                        || (   IS_OPERAND(*top_ptr)
8567                            && (   top_index - fence < 1
8568                                || ! (stacked_ptr = av_fetch(stack,
8569                                                             top_index - 1,
8570                                                             FALSE))
8571                                || ! IS_OPERATOR(*stacked_ptr))))
8572                    {
8573                        RExC_parse_inc_by(1);
8574                        vFAIL("Unexpected '(' with no preceding operator");
8575                    }
8576                }
8577
8578                /* Stack the position of this undealt-with left paren */
8579                av_push_simple(fence_stack, newSViv(fence));
8580                fence = top_index + 1;
8581                break;
8582
8583            case '\\':
8584                /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8585                 * multi-char folds are allowed.  */
8586                if (!regclass(pRExC_state, flagp, depth+1,
8587                              TRUE, /* means parse just the next thing */
8588                              FALSE, /* don't allow multi-char folds */
8589                              FALSE, /* don't silence non-portable warnings.  */
8590                              TRUE,  /* strict */
8591                              FALSE, /* Require return to be an ANYOF */
8592                              &current))
8593                {
8594                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
8595                    goto regclass_failed;
8596                }
8597
8598                assert(current);
8599
8600                /* regclass() will return with parsing just the \ sequence,
8601                 * leaving the parse pointer at the next thing to parse */
8602                RExC_parse--;
8603                goto handle_operand;
8604
8605            case '[':   /* Is a bracketed character class */
8606            {
8607                /* See if this is a [:posix:] class. */
8608                bool is_posix_class = (OOB_NAMEDCLASS
8609                            < handle_possible_posix(pRExC_state,
8610                                                RExC_parse + 1,
8611                                                NULL,
8612                                                NULL,
8613                                                TRUE /* checking only */));
8614                /* If it is a posix class, leave the parse pointer at the '['
8615                 * to fool regclass() into thinking it is part of a
8616                 * '[[:posix:]]'. */
8617                if (! is_posix_class) {
8618                    RExC_parse_inc_by(1);
8619                }
8620
8621                /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8622                 * multi-char folds are allowed.  */
8623                if (!regclass(pRExC_state, flagp, depth+1,
8624                                is_posix_class, /* parse the whole char
8625                                                    class only if not a
8626                                                    posix class */
8627                                FALSE, /* don't allow multi-char folds */
8628                                TRUE, /* silence non-portable warnings. */
8629                                TRUE, /* strict */
8630                                FALSE, /* Require return to be an ANYOF */
8631                                &current))
8632                {
8633                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
8634                    goto regclass_failed;
8635                }
8636
8637                assert(current);
8638
8639                /* function call leaves parse pointing to the ']', except if we
8640                 * faked it */
8641                if (is_posix_class) {
8642                    RExC_parse--;
8643                }
8644
8645                goto handle_operand;
8646            }
8647
8648            case ']':
8649                if (top_index >= 1) {
8650                    goto join_operators;
8651                }
8652
8653                /* Only a single operand on the stack: are done */
8654                goto done;
8655
8656            case ')':
8657                if (av_tindex_skip_len_mg(fence_stack) < 0) {
8658                    if (UCHARAT(RExC_parse - 1) == ']')  {
8659                        break;
8660                    }
8661                    RExC_parse_inc_by(1);
8662                    vFAIL("Unexpected ')'");
8663                }
8664
8665                /* If nothing after the fence, is missing an operand */
8666                if (top_index - fence < 0) {
8667                    RExC_parse_inc_by(1);
8668                    goto bad_syntax;
8669                }
8670                /* If at least two things on the stack, treat this as an
8671                  * operator */
8672                if (top_index - fence >= 1) {
8673                    goto join_operators;
8674                }
8675
8676                /* Here only a single thing on the fenced stack, and there is a
8677                 * fence.  Get rid of it */
8678                fence_ptr = av_pop(fence_stack);
8679                assert(fence_ptr);
8680                fence = SvIV(fence_ptr);
8681                SvREFCNT_dec_NN(fence_ptr);
8682                fence_ptr = NULL;
8683
8684                if (fence < 0) {
8685                    fence = 0;
8686                }
8687
8688                /* Having gotten rid of the fence, we pop the operand at the
8689                 * stack top and process it as a newly encountered operand */
8690                current = av_pop(stack);
8691                if (IS_OPERAND(current)) {
8692                    goto handle_operand;
8693                }
8694
8695                RExC_parse_inc_by(1);
8696                goto bad_syntax;
8697
8698            case '&':
8699            case '|':
8700            case '+':
8701            case '-':
8702            case '^':
8703
8704                /* These binary operators should have a left operand already
8705                 * parsed */
8706                if (   top_index - fence < 0
8707                    || top_index - fence == 1
8708                    || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
8709                    || ! IS_OPERAND(*top_ptr))
8710                {
8711                    goto unexpected_binary;
8712                }
8713
8714                /* If only the one operand is on the part of the stack visible
8715                 * to us, we just place this operator in the proper position */
8716                if (top_index - fence < 2) {
8717
8718                    /* Place the operator before the operand */
8719
8720                    SV* lhs = av_pop(stack);
8721                    av_push_simple(stack, newSVuv(curchar));
8722                    av_push_simple(stack, lhs);
8723                    break;
8724                }
8725
8726                /* But if there is something else on the stack, we need to
8727                 * process it before this new operator if and only if the
8728                 * stacked operation has equal or higher precedence than the
8729                 * new one */
8730
8731             join_operators:
8732
8733                /* The operator on the stack is supposed to be below both its
8734                 * operands */
8735                if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
8736                    || IS_OPERAND(*stacked_ptr))
8737                {
8738                    /* But if not, it's legal and indicates we are completely
8739                     * done if and only if we're currently processing a ']',
8740                     * which should be the final thing in the expression */
8741                    if (curchar == ']') {
8742                        goto done;
8743                    }
8744
8745                  unexpected_binary:
8746                    RExC_parse_inc_by(1);
8747                    vFAIL2("Unexpected binary operator '%c' with no "
8748                           "preceding operand", curchar);
8749                }
8750                stacked_operator = (char) SvUV(*stacked_ptr);
8751
8752                if (regex_set_precedence(curchar)
8753                    > regex_set_precedence(stacked_operator))
8754                {
8755                    /* Here, the new operator has higher precedence than the
8756                     * stacked one.  This means we need to add the new one to
8757                     * the stack to await its rhs operand (and maybe more
8758                     * stuff).  We put it before the lhs operand, leaving
8759                     * untouched the stacked operator and everything below it
8760                     * */
8761                    lhs = av_pop(stack);
8762                    assert(IS_OPERAND(lhs));
8763                    av_push_simple(stack, newSVuv(curchar));
8764                    av_push_simple(stack, lhs);
8765                    break;
8766                }
8767
8768                /* Here, the new operator has equal or lower precedence than
8769                 * what's already there.  This means the operation already
8770                 * there should be performed now, before the new one. */
8771
8772                rhs = av_pop(stack);
8773                if (! IS_OPERAND(rhs)) {
8774
8775                    /* This can happen when a ! is not followed by an operand,
8776                     * like in /(?[\t &!])/ */
8777                    goto bad_syntax;
8778                }
8779
8780                lhs = av_pop(stack);
8781
8782                if (! IS_OPERAND(lhs)) {
8783
8784                    /* This can happen when there is an empty (), like in
8785                     * /(?[[0]+()+])/ */
8786                    goto bad_syntax;
8787                }
8788
8789                switch (stacked_operator) {
8790                    case '&':
8791                        _invlist_intersection(lhs, rhs, &rhs);
8792                        break;
8793
8794                    case '|':
8795                    case '+':
8796                        _invlist_union(lhs, rhs, &rhs);
8797                        break;
8798
8799                    case '-':
8800                        _invlist_subtract(lhs, rhs, &rhs);
8801                        break;
8802
8803                    case '^':   /* The union minus the intersection */
8804                    {
8805                        SV* i = NULL;
8806                        SV* u = NULL;
8807
8808                        _invlist_union(lhs, rhs, &u);
8809                        _invlist_intersection(lhs, rhs, &i);
8810                        _invlist_subtract(u, i, &rhs);
8811                        SvREFCNT_dec_NN(i);
8812                        SvREFCNT_dec_NN(u);
8813                        break;
8814                    }
8815                }
8816                SvREFCNT_dec(lhs);
8817
8818                /* Here, the higher precedence operation has been done, and the
8819                 * result is in 'rhs'.  We overwrite the stacked operator with
8820                 * the result.  Then we redo this code to either push the new
8821                 * operator onto the stack or perform any higher precedence
8822                 * stacked operation */
8823                only_to_avoid_leaks = av_pop(stack);
8824                SvREFCNT_dec(only_to_avoid_leaks);
8825                av_push_simple(stack, rhs);
8826                goto redo_curchar;
8827
8828            case '!':   /* Highest priority, right associative */
8829
8830                /* If what's already at the top of the stack is another '!",
8831                 * they just cancel each other out */
8832                if (   (top_ptr = av_fetch(stack, top_index, FALSE))
8833                    && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
8834                {
8835                    only_to_avoid_leaks = av_pop(stack);
8836                    SvREFCNT_dec(only_to_avoid_leaks);
8837                }
8838                else { /* Otherwise, since it's right associative, just push
8839                          onto the stack */
8840                    av_push_simple(stack, newSVuv(curchar));
8841                }
8842                break;
8843
8844            default:
8845                RExC_parse_inc();
8846                if (RExC_parse >= RExC_end) {
8847                    break;
8848                }
8849                vFAIL("Unexpected character");
8850
8851          handle_operand:
8852
8853            /* Here 'current' is the operand.  If something is already on the
8854             * stack, we have to check if it is a !.  But first, the code above
8855             * may have altered the stack in the time since we earlier set
8856             * 'top_index'.  */
8857
8858            top_index = av_tindex_skip_len_mg(stack);
8859            if (top_index - fence >= 0) {
8860                /* If the top entry on the stack is an operator, it had better
8861                 * be a '!', otherwise the entry below the top operand should
8862                 * be an operator */
8863                top_ptr = av_fetch(stack, top_index, FALSE);
8864                assert(top_ptr);
8865                if (IS_OPERATOR(*top_ptr)) {
8866
8867                    /* The only permissible operator at the top of the stack is
8868                     * '!', which is applied immediately to this operand. */
8869                    curchar = (char) SvUV(*top_ptr);
8870                    if (curchar != '!') {
8871                        SvREFCNT_dec(current);
8872                        vFAIL2("Unexpected binary operator '%c' with no "
8873                                "preceding operand", curchar);
8874                    }
8875
8876                    _invlist_invert(current);
8877
8878                    only_to_avoid_leaks = av_pop(stack);
8879                    SvREFCNT_dec(only_to_avoid_leaks);
8880
8881                    /* And we redo with the inverted operand.  This allows
8882                     * handling multiple ! in a row */
8883                    goto handle_operand;
8884                }
8885                          /* Single operand is ok only for the non-binary ')'
8886                           * operator */
8887                else if ((top_index - fence == 0 && curchar != ')')
8888                         || (top_index - fence > 0
8889                             && (! (stacked_ptr = av_fetch(stack,
8890                                                           top_index - 1,
8891                                                           FALSE))
8892                                 || IS_OPERAND(*stacked_ptr))))
8893                {
8894                    SvREFCNT_dec(current);
8895                    vFAIL("Operand with no preceding operator");
8896                }
8897            }
8898
8899            /* Here there was nothing on the stack or the top element was
8900             * another operand.  Just add this new one */
8901            av_push_simple(stack, current);
8902
8903        } /* End of switch on next parse token */
8904
8905        RExC_parse_inc();
8906    } /* End of loop parsing through the construct */
8907
8908    vFAIL("Syntax error in (?[...])");
8909
8910  done:
8911
8912    if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
8913        if (RExC_parse < RExC_end) {
8914            RExC_parse_inc_by(1);
8915        }
8916
8917        vFAIL("Unexpected ']' with no following ')' in (?[...");
8918    }
8919
8920    if (av_tindex_skip_len_mg(fence_stack) >= 0) {
8921        vFAIL("Unmatched (");
8922    }
8923
8924    if (av_tindex_skip_len_mg(stack) < 0   /* Was empty */
8925        || ((final = av_pop(stack)) == NULL)
8926        || ! IS_OPERAND(final)
8927        || ! is_invlist(final)
8928        || av_tindex_skip_len_mg(stack) >= 0)  /* More left on stack */
8929    {
8930      bad_syntax:
8931        SvREFCNT_dec(final);
8932        vFAIL("Incomplete expression within '(?[ ])'");
8933    }
8934
8935    /* Here, 'final' is the resultant inversion list from evaluating the
8936     * expression.  Return it if so requested */
8937    if (return_invlist) {
8938        *return_invlist = final;
8939        return END;
8940    }
8941
8942    if (RExC_sets_depth) {  /* If within a recursive call, return in a special
8943                               regnode */
8944        RExC_parse_inc_by(1);
8945        node = regpnode(pRExC_state, REGEX_SET, final);
8946    }
8947    else {
8948
8949        /* Otherwise generate a resultant node, based on 'final'.  regclass()
8950         * is expecting a string of ranges and individual code points */
8951        invlist_iterinit(final);
8952        result_string = newSVpvs("");
8953        while (invlist_iternext(final, &start, &end)) {
8954            if (start == end) {
8955                Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
8956            }
8957            else {
8958                Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
8959                                                        UVXf "}", start, end);
8960            }
8961        }
8962
8963        /* About to generate an ANYOF (or similar) node from the inversion list
8964         * we have calculated */
8965        save_parse = RExC_parse;
8966        RExC_parse_set(SvPV(result_string, len));
8967        save_end = RExC_end;
8968        RExC_end = RExC_parse + len;
8969        TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
8970
8971        /* We turn off folding around the call, as the class we have
8972         * constructed already has all folding taken into consideration, and we
8973         * don't want regclass() to add to that */
8974        RExC_flags &= ~RXf_PMf_FOLD;
8975        /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
8976         * folds are allowed.  */
8977        node = regclass(pRExC_state, flagp, depth+1,
8978                        FALSE, /* means parse the whole char class */
8979                        FALSE, /* don't allow multi-char folds */
8980                        TRUE, /* silence non-portable warnings.  The above may
8981                                 very well have generated non-portable code
8982                                 points, but they're valid on this machine */
8983                        FALSE, /* similarly, no need for strict */
8984
8985                        /* We can optimize into something besides an ANYOF,
8986                         * except under /l, which needs to be ANYOF because of
8987                         * runtime checks for locale sanity, etc */
8988                    ! in_locale,
8989                        NULL
8990                    );
8991
8992        RESTORE_WARNINGS;
8993        RExC_parse_set(save_parse + 1);
8994        RExC_end = save_end;
8995        SvREFCNT_dec_NN(final);
8996        SvREFCNT_dec_NN(result_string);
8997
8998        if (save_fold) {
8999            RExC_flags |= RXf_PMf_FOLD;
9000        }
9001
9002        if (!node) {
9003            RETURN_FAIL_ON_RESTART(*flagp, flagp);
9004            goto regclass_failed;
9005        }
9006
9007        /* Fix up the node type if we are in locale.  (We have pretended we are
9008         * under /u for the purposes of regclass(), as this construct will only
9009         * work under UTF-8 locales.  But now we change the opcode to be ANYOFL
9010         * (so as to cause any warnings about bad locales to be output in
9011         * regexec.c), and add the flag that indicates to check if not in a
9012         * UTF-8 locale.  The reason we above forbid optimization into
9013         * something other than an ANYOF node is simply to minimize the number
9014         * of code changes in regexec.c.  Otherwise we would have to create new
9015         * EXACTish node types and deal with them.  This decision could be
9016         * revisited should this construct become popular.
9017         *
9018         * (One might think we could look at the resulting ANYOF node and
9019         * suppress the flag if everything is above 255, as those would be
9020         * UTF-8 only, but this isn't true, as the components that led to that
9021         * result could have been locale-affected, and just happen to cancel
9022         * each other out under UTF-8 locales.) */
9023        if (in_locale) {
9024            set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
9025
9026            assert(OP(REGNODE_p(node)) == ANYOF);
9027
9028            OP(REGNODE_p(node)) = ANYOFL;
9029            ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
9030        }
9031    }
9032
9033    nextchar(pRExC_state);
9034    return node;
9035
9036  regclass_failed:
9037    FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
9038                                                                (UV) *flagp);
9039}
9040
9041#ifdef ENABLE_REGEX_SETS_DEBUGGING
9042
9043STATIC void
9044S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
9045                             AV * stack, const IV fence, AV * fence_stack)
9046{   /* Dumps the stacks in handle_regex_sets() */
9047
9048    const SSize_t stack_top = av_tindex_skip_len_mg(stack);
9049    const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
9050    SSize_t i;
9051
9052    PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
9053
9054    PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
9055
9056    if (stack_top < 0) {
9057        PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
9058    }
9059    else {
9060        PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
9061        for (i = stack_top; i >= 0; i--) {
9062            SV ** element_ptr = av_fetch(stack, i, FALSE);
9063            if (! element_ptr) {
9064            }
9065
9066            if (IS_OPERATOR(*element_ptr)) {
9067                PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
9068                                            (int) i, (int) SvIV(*element_ptr));
9069            }
9070            else {
9071                PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
9072                sv_dump(*element_ptr);
9073            }
9074        }
9075    }
9076
9077    if (fence_stack_top < 0) {
9078        PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
9079    }
9080    else {
9081        PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
9082        for (i = fence_stack_top; i >= 0; i--) {
9083            SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
9084            if (! element_ptr) {
9085            }
9086
9087            PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
9088                                            (int) i, (int) SvIV(*element_ptr));
9089        }
9090    }
9091}
9092
9093#endif
9094
9095#undef IS_OPERATOR
9096#undef IS_OPERAND
9097
9098void
9099Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
9100{
9101    /* This adds the Latin1/above-Latin1 folding rules.
9102     *
9103     * This should be called only for a Latin1-range code points, cp, which is
9104     * known to be involved in a simple fold with other code points above
9105     * Latin1.  It would give false results if /aa has been specified.
9106     * Multi-char folds are outside the scope of this, and must be handled
9107     * specially. */
9108
9109    PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
9110
9111    assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
9112
9113    /* The rules that are valid for all Unicode versions are hard-coded in */
9114    switch (cp) {
9115        case 'k':
9116        case 'K':
9117          *invlist =
9118             add_cp_to_invlist(*invlist, KELVIN_SIGN);
9119            break;
9120        case 's':
9121        case 'S':
9122          *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
9123            break;
9124        case MICRO_SIGN:
9125          *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
9126          *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
9127            break;
9128        case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9129        case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9130          *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
9131            break;
9132        case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9133          *invlist = add_cp_to_invlist(*invlist,
9134                                        LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9135            break;
9136
9137        default:    /* Other code points are checked against the data for the
9138                       current Unicode version */
9139          {
9140            Size_t folds_count;
9141            U32 first_fold;
9142            const U32 * remaining_folds;
9143            UV folded_cp;
9144
9145            if (isASCII(cp)) {
9146                folded_cp = toFOLD(cp);
9147            }
9148            else {
9149                U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
9150                Size_t dummy_len;
9151                folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
9152            }
9153
9154            if (folded_cp > 255) {
9155                *invlist = add_cp_to_invlist(*invlist, folded_cp);
9156            }
9157
9158            folds_count = _inverse_folds(folded_cp, &first_fold,
9159                                                    &remaining_folds);
9160            if (folds_count == 0) {
9161
9162                /* Use deprecated warning to increase the chances of this being
9163                 * output */
9164                ckWARN2reg_d(RExC_parse,
9165                        "Perl folding rules are not up-to-date for 0x%02X;"
9166                        " please use the perlbug utility to report;", cp);
9167            }
9168            else {
9169                unsigned int i;
9170
9171                if (first_fold > 255) {
9172                    *invlist = add_cp_to_invlist(*invlist, first_fold);
9173                }
9174                for (i = 0; i < folds_count - 1; i++) {
9175                    if (remaining_folds[i] > 255) {
9176                        *invlist = add_cp_to_invlist(*invlist,
9177                                                    remaining_folds[i]);
9178                    }
9179                }
9180            }
9181            break;
9182         }
9183    }
9184}
9185
9186STATIC void
9187S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
9188{
9189    /* Output the elements of the array given by '*posix_warnings' as REGEXP
9190     * warnings. */
9191
9192    SV * msg;
9193    const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
9194
9195    PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
9196
9197    if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
9198        CLEAR_POSIX_WARNINGS();
9199        return;
9200    }
9201
9202    while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
9203        if (first_is_fatal) {           /* Avoid leaking this */
9204            av_undef(posix_warnings);   /* This isn't necessary if the
9205                                            array is mortal, but is a
9206                                            fail-safe */
9207            (void) sv_2mortal(msg);
9208            PREPARE_TO_DIE;
9209        }
9210        Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
9211        SvREFCNT_dec_NN(msg);
9212    }
9213
9214    UPDATE_WARNINGS_LOC(RExC_parse);
9215}
9216
9217PERL_STATIC_INLINE Size_t
9218S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
9219{
9220    const U8 * const start = s1;
9221    const U8 * const send = start + max;
9222
9223    PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
9224
9225    while (s1 < send && *s1  == *s2) {
9226        s1++; s2++;
9227    }
9228
9229    return s1 - start;
9230}
9231
9232STATIC AV *
9233S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
9234{
9235    /* This adds the string scalar <multi_string> to the array
9236     * <multi_char_matches>.  <multi_string> is known to have exactly
9237     * <cp_count> code points in it.  This is used when constructing a
9238     * bracketed character class and we find something that needs to match more
9239     * than a single character.
9240     *
9241     * <multi_char_matches> is actually an array of arrays.  Each top-level
9242     * element is an array that contains all the strings known so far that are
9243     * the same length.  And that length (in number of code points) is the same
9244     * as the index of the top-level array.  Hence, the [2] element is an
9245     * array, each element thereof is a string containing TWO code points;
9246     * while element [3] is for strings of THREE characters, and so on.  Since
9247     * this is for multi-char strings there can never be a [0] nor [1] element.
9248     *
9249     * When we rewrite the character class below, we will do so such that the
9250     * longest strings are written first, so that it prefers the longest
9251     * matching strings first.  This is done even if it turns out that any
9252     * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
9253     * Christiansen has agreed that this is ok.  This makes the test for the
9254     * ligature 'ffi' come before the test for 'ff', for example */
9255
9256    AV* this_array;
9257    AV** this_array_ptr;
9258
9259    PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
9260
9261    if (! multi_char_matches) {
9262        multi_char_matches = newAV();
9263    }
9264
9265    if (av_exists(multi_char_matches, cp_count)) {
9266        this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
9267        this_array = *this_array_ptr;
9268    }
9269    else {
9270        this_array = newAV();
9271        av_store_simple(multi_char_matches, cp_count,
9272                 (SV*) this_array);
9273    }
9274    av_push_simple(this_array, multi_string);
9275
9276    return multi_char_matches;
9277}
9278
9279/* The names of properties whose definitions are not known at compile time are
9280 * stored in this SV, after a constant heading.  So if the length has been
9281 * changed since initialization, then there is a run-time definition. */
9282#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
9283                                        (SvCUR(listsv) != initial_listsv_len)
9284
9285/* There is a restricted set of white space characters that are legal when
9286 * ignoring white space in a bracketed character class.  This generates the
9287 * code to skip them.
9288 *
9289 * There is a line below that uses the same white space criteria but is outside
9290 * this macro.  Both here and there must use the same definition */
9291#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
9292    STMT_START {                                                        \
9293        if (do_skip) {                                                  \
9294            while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
9295            {                                                           \
9296                p++;                                                    \
9297            }                                                           \
9298        }                                                               \
9299    } STMT_END
9300
9301STATIC regnode_offset
9302S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
9303                 const bool stop_at_1,  /* Just parse the next thing, don't
9304                                           look for a full character class */
9305                 bool allow_mutiple_chars,
9306                 const bool silence_non_portable,   /* Don't output warnings
9307                                                       about too large
9308                                                       characters */
9309                 const bool strict,
9310                 bool optimizable,                  /* ? Allow a non-ANYOF return
9311                                                       node */
9312                 SV** ret_invlist  /* Return an inversion list, not a node */
9313          )
9314{
9315    /* parse a bracketed class specification.  Most of these will produce an
9316     * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
9317     * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
9318     * under /i with multi-character folds: it will be rewritten following the
9319     * paradigm of this example, where the <multi-fold>s are characters which
9320     * fold to multiple character sequences:
9321     *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
9322     * gets effectively rewritten as:
9323     *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
9324     * reg() gets called (recursively) on the rewritten version, and this
9325     * function will return what it constructs.  (Actually the <multi-fold>s
9326     * aren't physically removed from the [abcdefghi], it's just that they are
9327     * ignored in the recursion by means of a flag:
9328     * <RExC_in_multi_char_class>.)
9329     *
9330     * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
9331     * characters, with the corresponding bit set if that character is in the
9332     * list.  For characters above this, an inversion list is used.  There
9333     * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
9334     * determinable at compile time
9335     *
9336     * On success, returns the offset at which any next node should be placed
9337     * into the regex engine program being compiled.
9338     *
9339     * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
9340     * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
9341     * UTF-8
9342     */
9343
9344    UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
9345    IV range = 0;
9346    UV value = OOB_UNICODE, save_value = OOB_UNICODE;
9347    regnode_offset ret = -1;    /* Initialized to an illegal value */
9348    STRLEN numlen;
9349    int namedclass = OOB_NAMEDCLASS;
9350    char *rangebegin = NULL;
9351    SV *listsv = NULL;      /* List of \p{user-defined} whose definitions
9352                               aren't available at the time this was called */
9353    STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9354                                      than just initialized.  */
9355    SV* properties = NULL;    /* Code points that match \p{} \P{} */
9356    SV* posixes = NULL;     /* Code points that match classes like [:word:],
9357                               extended beyond the Latin1 range.  These have to
9358                               be kept separate from other code points for much
9359                               of this function because their handling  is
9360                               different under /i, and for most classes under
9361                               /d as well */
9362    SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
9363                               separate for a while from the non-complemented
9364                               versions because of complications with /d
9365                               matching */
9366    SV* simple_posixes = NULL; /* But under some conditions, the classes can be
9367                                  treated more simply than the general case,
9368                                  leading to less compilation and execution
9369                                  work */
9370    UV element_count = 0;   /* Number of distinct elements in the class.
9371                               Optimizations may be possible if this is tiny */
9372    AV * multi_char_matches = NULL; /* Code points that fold to more than one
9373                                       character; used under /i */
9374    UV n;
9375    char * stop_ptr = RExC_end;    /* where to stop parsing */
9376
9377    /* ignore unescaped whitespace? */
9378    const bool skip_white = cBOOL(   ret_invlist
9379                                  || (RExC_flags & RXf_PMf_EXTENDED_MORE));
9380
9381    /* inversion list of code points this node matches only when the target
9382     * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
9383     * /d) */
9384    SV* upper_latin1_only_utf8_matches = NULL;
9385
9386    /* Inversion list of code points this node matches regardless of things
9387     * like locale, folding, utf8ness of the target string */
9388    SV* cp_list = NULL;
9389
9390    /* Like cp_list, but code points on this list need to be checked for things
9391     * that fold to/from them under /i */
9392    SV* cp_foldable_list = NULL;
9393
9394    /* Like cp_list, but code points on this list are valid only when the
9395     * runtime locale is UTF-8 */
9396    SV* only_utf8_locale_list = NULL;
9397
9398    /* In a range, if one of the endpoints is non-character-set portable,
9399     * meaning that it hard-codes a code point that may mean a different
9400     * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
9401     * mnemonic '\t' which each mean the same character no matter which
9402     * character set the platform is on. */
9403    unsigned int non_portable_endpoint = 0;
9404
9405    /* Is the range unicode? which means on a platform that isn't 1-1 native
9406     * to Unicode (i.e. non-ASCII), each code point in it should be considered
9407     * to be a Unicode value.  */
9408    bool unicode_range = FALSE;
9409    bool invert = FALSE;    /* Is this class to be complemented */
9410
9411    bool warn_super = ALWAYS_WARN_SUPER;
9412
9413    const char * orig_parse = RExC_parse;
9414
9415    /* This variable is used to mark where the end in the input is of something
9416     * that looks like a POSIX construct but isn't.  During the parse, when
9417     * something looks like it could be such a construct is encountered, it is
9418     * checked for being one, but not if we've already checked this area of the
9419     * input.  Only after this position is reached do we check again */
9420    char *not_posix_region_end = RExC_parse - 1;
9421
9422    AV* posix_warnings = NULL;
9423    const bool do_posix_warnings = ckWARN(WARN_REGEXP);
9424    U8 op = ANYOF;    /* The returned node-type, initialized to the expected
9425                         type. */
9426    U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
9427    U32 posixl = 0;       /* bit field of posix classes matched under /l */
9428
9429
9430/* Flags as to what things aren't knowable until runtime.  (Note that these are
9431 * mutually exclusive.) */
9432#define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
9433                                            haven't been defined as of yet */
9434#define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
9435                                            UTF-8 or not */
9436#define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
9437                                            what gets folded */
9438    U32 has_runtime_dependency = 0;     /* OR of the above flags */
9439
9440    DECLARE_AND_GET_RE_DEBUG_FLAGS;
9441
9442    PERL_ARGS_ASSERT_REGCLASS;
9443#ifndef DEBUGGING
9444    PERL_UNUSED_ARG(depth);
9445#endif
9446
9447    assert(! (ret_invlist && allow_mutiple_chars));
9448
9449    /* If wants an inversion list returned, we can't optimize to something
9450     * else. */
9451    if (ret_invlist) {
9452        optimizable = FALSE;
9453    }
9454
9455    DEBUG_PARSE("clas");
9456
9457#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
9458    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
9459                                   && UNICODE_DOT_DOT_VERSION == 0)
9460    allow_mutiple_chars = FALSE;
9461#endif
9462
9463    /* We include the /i status at the beginning of this so that we can
9464     * know it at runtime */
9465    listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
9466    initial_listsv_len = SvCUR(listsv);
9467    SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
9468
9469    SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9470
9471    assert(RExC_parse <= RExC_end);
9472
9473    if (UCHARAT(RExC_parse) == '^') {	/* Complement the class */
9474        RExC_parse_inc_by(1);
9475        invert = TRUE;
9476        allow_mutiple_chars = FALSE;
9477        MARK_NAUGHTY(1);
9478        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9479    }
9480
9481    /* Check that they didn't say [:posix:] instead of [[:posix:]] */
9482    if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
9483        int maybe_class = handle_possible_posix(pRExC_state,
9484                                                RExC_parse,
9485                                                &not_posix_region_end,
9486                                                NULL,
9487                                                TRUE /* checking only */);
9488        if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
9489            ckWARN4reg(not_posix_region_end,
9490                    "POSIX syntax [%c %c] belongs inside character classes%s",
9491                    *RExC_parse, *RExC_parse,
9492                    (maybe_class == OOB_NAMEDCLASS)
9493                    ? ((POSIXCC_NOTYET(*RExC_parse))
9494                        ? " (but this one isn't implemented)"
9495                        : " (but this one isn't fully valid)")
9496                    : ""
9497                    );
9498        }
9499    }
9500
9501    /* If the caller wants us to just parse a single element, accomplish this
9502     * by faking the loop ending condition */
9503    if (stop_at_1 && RExC_end > RExC_parse) {
9504        stop_ptr = RExC_parse + 1;
9505    }
9506
9507    /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
9508    if (UCHARAT(RExC_parse) == ']')
9509        goto charclassloop;
9510
9511    while (1) {
9512
9513        if (   posix_warnings
9514            && av_tindex_skip_len_mg(posix_warnings) >= 0
9515            && RExC_parse > not_posix_region_end)
9516        {
9517            /* Warnings about posix class issues are considered tentative until
9518             * we are far enough along in the parse that we can no longer
9519             * change our mind, at which point we output them.  This is done
9520             * each time through the loop so that a later class won't zap them
9521             * before they have been dealt with. */
9522            output_posix_warnings(pRExC_state, posix_warnings);
9523        }
9524
9525        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9526
9527        if  (RExC_parse >= stop_ptr) {
9528            break;
9529        }
9530
9531        if  (UCHARAT(RExC_parse) == ']') {
9532            break;
9533        }
9534
9535      charclassloop:
9536
9537        namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9538        save_value = value;
9539        save_prevvalue = prevvalue;
9540
9541        if (!range) {
9542            rangebegin = RExC_parse;
9543            element_count++;
9544            non_portable_endpoint = 0;
9545        }
9546        if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
9547            value = utf8n_to_uvchr((U8*)RExC_parse,
9548                                   RExC_end - RExC_parse,
9549                                   &numlen, UTF8_ALLOW_DEFAULT);
9550            RExC_parse_inc_by(numlen);
9551        }
9552        else {
9553            value = UCHARAT(RExC_parse);
9554            RExC_parse_inc_by(1);
9555        }
9556
9557        if (value == '[') {
9558            char * posix_class_end;
9559            namedclass = handle_possible_posix(pRExC_state,
9560                                               RExC_parse,
9561                                               &posix_class_end,
9562                                               do_posix_warnings ? &posix_warnings : NULL,
9563                                               FALSE    /* die if error */);
9564            if (namedclass > OOB_NAMEDCLASS) {
9565
9566                /* If there was an earlier attempt to parse this particular
9567                 * posix class, and it failed, it was a false alarm, as this
9568                 * successful one proves */
9569                if (   posix_warnings
9570                    && av_tindex_skip_len_mg(posix_warnings) >= 0
9571                    && not_posix_region_end >= RExC_parse
9572                    && not_posix_region_end <= posix_class_end)
9573                {
9574                    av_undef(posix_warnings);
9575                }
9576
9577                RExC_parse_set(posix_class_end);
9578            }
9579            else if (namedclass == OOB_NAMEDCLASS) {
9580                not_posix_region_end = posix_class_end;
9581            }
9582            else {
9583                namedclass = OOB_NAMEDCLASS;
9584            }
9585        }
9586        else if (   RExC_parse - 1 > not_posix_region_end
9587                 && MAYBE_POSIXCC(value))
9588        {
9589            (void) handle_possible_posix(
9590                        pRExC_state,
9591                        RExC_parse - 1,  /* -1 because parse has already been
9592                                            advanced */
9593                        &not_posix_region_end,
9594                        do_posix_warnings ? &posix_warnings : NULL,
9595                        TRUE /* checking only */);
9596        }
9597        else if (  strict && ! skip_white
9598                 && (   generic_isCC_(value, CC_VERTSPACE_)
9599                     || is_VERTWS_cp_high(value)))
9600        {
9601            vFAIL("Literal vertical space in [] is illegal except under /x");
9602        }
9603        else if (value == '\\') {
9604            /* Is a backslash; get the code point of the char after it */
9605
9606            if (RExC_parse >= RExC_end) {
9607                vFAIL("Unmatched [");
9608            }
9609
9610            if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
9611                value = utf8n_to_uvchr((U8*)RExC_parse,
9612                                   RExC_end - RExC_parse,
9613                                   &numlen, UTF8_ALLOW_DEFAULT);
9614                RExC_parse_inc_by(numlen);
9615            }
9616            else {
9617                value = UCHARAT(RExC_parse);
9618                RExC_parse_inc_by(1);
9619            }
9620
9621            /* Some compilers cannot handle switching on 64-bit integer
9622             * values, therefore value cannot be an UV.  Yes, this will
9623             * be a problem later if we want switch on Unicode.
9624             * A similar issue a little bit later when switching on
9625             * namedclass. --jhi */
9626
9627            /* If the \ is escaping white space when white space is being
9628             * skipped, it means that that white space is wanted literally, and
9629             * is already in 'value'.  Otherwise, need to translate the escape
9630             * into what it signifies. */
9631            if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
9632                const char * message;
9633                U32 packed_warn;
9634                U8 grok_c_char;
9635
9636            case 'w':	namedclass = ANYOF_WORDCHAR;	break;
9637            case 'W':	namedclass = ANYOF_NWORDCHAR;	break;
9638            case 's':	namedclass = ANYOF_SPACE;	break;
9639            case 'S':	namedclass = ANYOF_NSPACE;	break;
9640            case 'd':	namedclass = ANYOF_DIGIT;	break;
9641            case 'D':	namedclass = ANYOF_NDIGIT;	break;
9642            case 'v':	namedclass = ANYOF_VERTWS;	break;
9643            case 'V':	namedclass = ANYOF_NVERTWS;	break;
9644            case 'h':	namedclass = ANYOF_HORIZWS;	break;
9645            case 'H':	namedclass = ANYOF_NHORIZWS;	break;
9646            case 'N':  /* Handle \N{NAME} in class */
9647                {
9648                    const char * const backslash_N_beg = RExC_parse - 2;
9649                    int cp_count;
9650
9651                    if (! grok_bslash_N(pRExC_state,
9652                                        NULL,      /* No regnode */
9653                                        &value,    /* Yes single value */
9654                                        &cp_count, /* Multiple code pt count */
9655                                        flagp,
9656                                        strict,
9657                                        depth)
9658                    ) {
9659
9660                        if (*flagp & NEED_UTF8)
9661                            FAIL("panic: grok_bslash_N set NEED_UTF8");
9662
9663                        RETURN_FAIL_ON_RESTART_FLAGP(flagp);
9664
9665                        if (cp_count < 0) {
9666                            vFAIL("\\N in a character class must be a named character: \\N{...}");
9667                        }
9668                        else if (cp_count == 0) {
9669                            ckWARNreg(RExC_parse,
9670                              "Ignoring zero length \\N{} in character class");
9671                        }
9672                        else { /* cp_count > 1 */
9673                            assert(cp_count > 1);
9674                            if (! RExC_in_multi_char_class) {
9675                                if ( ! allow_mutiple_chars
9676                                    || invert
9677                                    || range
9678                                    || *RExC_parse == '-')
9679                                {
9680                                    if (strict) {
9681                                        RExC_parse--;
9682                                        vFAIL("\\N{} here is restricted to one character");
9683                                    }
9684                                    ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
9685                                    break; /* <value> contains the first code
9686                                              point. Drop out of the switch to
9687                                              process it */
9688                                }
9689                                else {
9690                                    SV * multi_char_N = newSVpvn(backslash_N_beg,
9691                                                 RExC_parse - backslash_N_beg);
9692                                    multi_char_matches
9693                                        = add_multi_match(multi_char_matches,
9694                                                          multi_char_N,
9695                                                          cp_count);
9696                                }
9697                            }
9698                        } /* End of cp_count != 1 */
9699
9700                        /* This element should not be processed further in this
9701                         * class */
9702                        element_count--;
9703                        value = save_value;
9704                        prevvalue = save_prevvalue;
9705                        continue;   /* Back to top of loop to get next char */
9706                    }
9707
9708                    /* Here, is a single code point, and <value> contains it */
9709                    unicode_range = TRUE;   /* \N{} are Unicode */
9710                }
9711                break;
9712            case 'p':
9713            case 'P':
9714                {
9715                char *e;
9716
9717                if (RExC_pm_flags & PMf_WILDCARD) {
9718                    RExC_parse_inc_by(1);
9719                    /* diag_listed_as: Use of %s is not allowed in Unicode
9720                       property wildcard subpatterns in regex; marked by <--
9721                       HERE in m/%s/ */
9722                    vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
9723                           " wildcard subpatterns", (char) value, *(RExC_parse - 1));
9724                }
9725
9726                /* \p means they want Unicode semantics */
9727                REQUIRE_UNI_RULES(flagp, 0);
9728
9729                if (RExC_parse >= RExC_end)
9730                    vFAIL2("Empty \\%c", (U8)value);
9731                if (*RExC_parse == '{') {
9732                    const U8 c = (U8)value;
9733                    e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
9734                    if (!e) {
9735                        RExC_parse_inc_by(1);
9736                        vFAIL2("Missing right brace on \\%c{}", c);
9737                    }
9738
9739                    RExC_parse_inc_by(1);
9740
9741                    /* White space is allowed adjacent to the braces and after
9742                     * any '^', even when not under /x */
9743                    while (isSPACE(*RExC_parse)) {
9744                         RExC_parse_inc_by(1);
9745                    }
9746
9747                    if (UCHARAT(RExC_parse) == '^') {
9748
9749                        /* toggle.  (The rhs xor gets the single bit that
9750                         * differs between P and p; the other xor inverts just
9751                         * that bit) */
9752                        value ^= 'P' ^ 'p';
9753
9754                        RExC_parse_inc_by(1);
9755                        while (isSPACE(*RExC_parse)) {
9756                            RExC_parse_inc_by(1);
9757                        }
9758                    }
9759
9760                    if (e == RExC_parse)
9761                        vFAIL2("Empty \\%c{}", c);
9762
9763                    n = e - RExC_parse;
9764                    while (isSPACE(*(RExC_parse + n - 1)))
9765                        n--;
9766
9767                }   /* The \p isn't immediately followed by a '{' */
9768                else if (! isALPHA(*RExC_parse)) {
9769                    RExC_parse_inc_safe();
9770                    vFAIL2("Character following \\%c must be '{' or a "
9771                           "single-character Unicode property name",
9772                           (U8) value);
9773                }
9774                else {
9775                    e = RExC_parse;
9776                    n = 1;
9777                }
9778                {
9779                    char* name = RExC_parse;
9780
9781                    /* Any message returned about expanding the definition */
9782                    SV* msg = newSVpvs_flags("", SVs_TEMP);
9783
9784                    /* If set TRUE, the property is user-defined as opposed to
9785                     * official Unicode */
9786                    bool user_defined = FALSE;
9787                    AV * strings = NULL;
9788
9789                    SV * prop_definition = parse_uniprop_string(
9790                                            name, n, UTF, FOLD,
9791                                            FALSE, /* This is compile-time */
9792
9793                                            /* We can't defer this defn when
9794                                             * the full result is required in
9795                                             * this call */
9796                                            ! cBOOL(ret_invlist),
9797
9798                                            &strings,
9799                                            &user_defined,
9800                                            msg,
9801                                            0 /* Base level */
9802                                           );
9803                    if (SvCUR(msg)) {   /* Assumes any error causes a msg */
9804                        assert(prop_definition == NULL);
9805                        RExC_parse_set(e + 1);
9806                        if (SvUTF8(msg)) {  /* msg being UTF-8 makes the whole
9807                                               thing so, or else the display is
9808                                               mojibake */
9809                            RExC_utf8 = TRUE;
9810                        }
9811                        /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
9812                        vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
9813                                    SvCUR(msg), SvPVX(msg)));
9814                    }
9815
9816                    assert(prop_definition || strings);
9817
9818                    if (strings) {
9819                        if (ret_invlist) {
9820                            if (! prop_definition) {
9821                                RExC_parse_set(e + 1);
9822                                vFAIL("Unicode string properties are not implemented in (?[...])");
9823                            }
9824                            else {
9825                                ckWARNreg(e + 1,
9826                                    "Using just the single character results"
9827                                    " returned by \\p{} in (?[...])");
9828                            }
9829                        }
9830                        else if (! RExC_in_multi_char_class) {
9831                            if (invert ^ (value == 'P')) {
9832                                RExC_parse_set(e + 1);
9833                                vFAIL("Inverting a character class which contains"
9834                                    " a multi-character sequence is illegal");
9835                            }
9836
9837                            /* For each multi-character string ... */
9838                            while (av_count(strings) > 0) {
9839                                /* ... Each entry is itself an array of code
9840                                * points. */
9841                                AV * this_string = (AV *) av_shift( strings);
9842                                STRLEN cp_count = av_count(this_string);
9843                                SV * final = newSV(cp_count ? cp_count * 4 : 1);
9844                                SvPVCLEAR_FRESH(final);
9845
9846                                /* Create another string of sequences of \x{...} */
9847                                while (av_count(this_string) > 0) {
9848                                    SV * character = av_shift(this_string);
9849                                    UV cp = SvUV(character);
9850
9851                                    if (cp > 255) {
9852                                        REQUIRE_UTF8(flagp);
9853                                    }
9854                                    Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
9855                                                                        cp);
9856                                    SvREFCNT_dec_NN(character);
9857                                }
9858                                SvREFCNT_dec_NN(this_string);
9859
9860                                /* And add that to the list of such things */
9861                                multi_char_matches
9862                                            = add_multi_match(multi_char_matches,
9863                                                            final,
9864                                                            cp_count);
9865                            }
9866                        }
9867                        SvREFCNT_dec_NN(strings);
9868                    }
9869
9870                    if (! prop_definition) {    /* If we got only a string,
9871                                                   this iteration didn't really
9872                                                   find a character */
9873                        element_count--;
9874                    }
9875                    else if (! is_invlist(prop_definition)) {
9876
9877                        /* Here, the definition isn't known, so we have gotten
9878                         * returned a string that will be evaluated if and when
9879                         * encountered at runtime.  We add it to the list of
9880                         * such properties, along with whether it should be
9881                         * complemented or not */
9882                        if (value == 'P') {
9883                            sv_catpvs(listsv, "!");
9884                        }
9885                        else {
9886                            sv_catpvs(listsv, "+");
9887                        }
9888                        sv_catsv(listsv, prop_definition);
9889
9890                        has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
9891
9892                        /* We don't know yet what this matches, so have to flag
9893                         * it */
9894                        anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
9895                    }
9896                    else {
9897                        assert (prop_definition && is_invlist(prop_definition));
9898
9899                        /* Here we do have the complete property definition
9900                         *
9901                         * Temporary workaround for [GH #16520].  For this
9902                         * precise input that is in the .t that is failing,
9903                         * load utf8.pm, which is what the test wants, so that
9904                         * that .t passes */
9905                        if (     memEQs(RExC_start, e + 1 - RExC_start,
9906                                        "foo\\p{Alnum}")
9907                            && ! hv_common(GvHVn(PL_incgv),
9908                                           NULL,
9909                                           "utf8.pm", sizeof("utf8.pm") - 1,
9910                                           0, HV_FETCH_ISEXISTS, NULL, 0))
9911                        {
9912                            require_pv("utf8.pm");
9913                        }
9914
9915                        if (! user_defined &&
9916                            /* We warn on matching an above-Unicode code point
9917                             * if the match would return true, except don't
9918                             * warn for \p{All}, which has exactly one element
9919                             * = 0 */
9920                            (_invlist_contains_cp(prop_definition, 0x110000)
9921                                && (! (_invlist_len(prop_definition) == 1
9922                                       && *invlist_array(prop_definition) == 0))))
9923                        {
9924                            warn_super = TRUE;
9925                        }
9926
9927                        /* Invert if asking for the complement */
9928                        if (value == 'P') {
9929                            _invlist_union_complement_2nd(properties,
9930                                                          prop_definition,
9931                                                          &properties);
9932                        }
9933                        else {
9934                            _invlist_union(properties, prop_definition, &properties);
9935                        }
9936                    }
9937                }
9938
9939                RExC_parse_set(e + 1);
9940                namedclass = ANYOF_UNIPROP;  /* no official name, but it's
9941                                                named */
9942                }
9943                break;
9944            case 'n':	value = '\n';			break;
9945            case 'r':	value = '\r';			break;
9946            case 't':	value = '\t';			break;
9947            case 'f':	value = '\f';			break;
9948            case 'b':	value = '\b';			break;
9949            case 'e':	value = ESC_NATIVE;             break;
9950            case 'a':	value = '\a';                   break;
9951            case 'o':
9952                RExC_parse--;	/* function expects to be pointed at the 'o' */
9953                if (! grok_bslash_o(&RExC_parse,
9954                                            RExC_end,
9955                                            &value,
9956                                            &message,
9957                                            &packed_warn,
9958                                            strict,
9959                                            cBOOL(range), /* MAX_UV allowed for range
9960                                                      upper limit */
9961                                            UTF))
9962                {
9963                    vFAIL(message);
9964                }
9965                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9966                    warn_non_literal_string(RExC_parse, packed_warn, message);
9967                }
9968
9969                if (value < 256) {
9970                    non_portable_endpoint++;
9971                }
9972                break;
9973            case 'x':
9974                RExC_parse--;	/* function expects to be pointed at the 'x' */
9975                if (!  grok_bslash_x(&RExC_parse,
9976                                            RExC_end,
9977                                            &value,
9978                                            &message,
9979                                            &packed_warn,
9980                                            strict,
9981                                            cBOOL(range), /* MAX_UV allowed for range
9982                                                      upper limit */
9983                                            UTF))
9984                {
9985                    vFAIL(message);
9986                }
9987                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9988                    warn_non_literal_string(RExC_parse, packed_warn, message);
9989                }
9990
9991                if (value < 256) {
9992                    non_portable_endpoint++;
9993                }
9994                break;
9995            case 'c':
9996                if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
9997                                                                &packed_warn))
9998                {
9999                    /* going to die anyway; point to exact spot of
10000                        * failure */
10001                    RExC_parse_inc_safe();
10002                    vFAIL(message);
10003                }
10004
10005                value = grok_c_char;
10006                RExC_parse_inc_by(1);
10007                if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10008                    warn_non_literal_string(RExC_parse, packed_warn, message);
10009                }
10010
10011                non_portable_endpoint++;
10012                break;
10013            case '0': case '1': case '2': case '3': case '4':
10014            case '5': case '6': case '7':
10015                {
10016                    /* Take 1-3 octal digits */
10017                    I32 flags = PERL_SCAN_SILENT_ILLDIGIT
10018                              | PERL_SCAN_NOTIFY_ILLDIGIT;
10019                    numlen = (strict) ? 4 : 3;
10020                    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10021                    RExC_parse_inc_by(numlen);
10022                    if (numlen != 3) {
10023                        if (strict) {
10024                            RExC_parse_inc_safe();
10025                            vFAIL("Need exactly 3 octal digits");
10026                        }
10027                        else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
10028                                 && RExC_parse < RExC_end
10029                                 && isDIGIT(*RExC_parse)
10030                                 && ckWARN(WARN_REGEXP))
10031                        {
10032                            reg_warn_non_literal_string(
10033                                 RExC_parse + 1,
10034                                 form_alien_digit_msg(8, numlen, RExC_parse,
10035                                                        RExC_end, UTF, FALSE));
10036                        }
10037                    }
10038                    if (value < 256) {
10039                        non_portable_endpoint++;
10040                    }
10041                    break;
10042                }
10043            default:
10044                /* Allow \_ to not give an error */
10045                if (isWORDCHAR(value) && value != '_') {
10046                    if (strict) {
10047                        vFAIL2("Unrecognized escape \\%c in character class",
10048                               (int)value);
10049                    }
10050                    else {
10051                        ckWARN2reg(RExC_parse,
10052                            "Unrecognized escape \\%c in character class passed through",
10053                            (int)value);
10054                    }
10055                }
10056                break;
10057            }   /* End of switch on char following backslash */
10058        } /* end of handling backslash escape sequences */
10059
10060        /* Here, we have the current token in 'value' */
10061
10062        if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10063            U8 classnum;
10064
10065            /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10066             * literal, as is the character that began the false range, i.e.
10067             * the 'a' in the examples */
10068            if (range) {
10069                const int w = (RExC_parse >= rangebegin)
10070                                ? RExC_parse - rangebegin
10071                                : 0;
10072                if (strict) {
10073                    vFAIL2utf8f(
10074                        "False [] range \"%" UTF8f "\"",
10075                        UTF8fARG(UTF, w, rangebegin));
10076                }
10077                else {
10078                    ckWARN2reg(RExC_parse,
10079                        "False [] range \"%" UTF8f "\"",
10080                        UTF8fARG(UTF, w, rangebegin));
10081                    cp_list = add_cp_to_invlist(cp_list, '-');
10082                    cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
10083                                                            prevvalue);
10084                }
10085
10086                range = 0; /* this was not a true range */
10087                element_count += 2; /* So counts for three values */
10088            }
10089
10090            classnum = namedclass_to_classnum(namedclass);
10091
10092            if (LOC && namedclass < ANYOF_POSIXL_MAX
10093#ifndef HAS_ISASCII
10094                && classnum != CC_ASCII_
10095#endif
10096            ) {
10097                SV* scratch_list = NULL;
10098
10099                /* What the Posix classes (like \w, [:space:]) match isn't
10100                 * generally knowable under locale until actual match time.  A
10101                 * special node is used for these which has extra space for a
10102                 * bitmap, with a bit reserved for each named class that is to
10103                 * be matched against.  (This isn't needed for \p{} and
10104                 * pseudo-classes, as they are not affected by locale, and
10105                 * hence are dealt with separately.)  However, if a named class
10106                 * and its complement are both present, then it matches
10107                 * everything, and there is no runtime dependency.  Odd numbers
10108                 * are the complements of the next lower number, so xor works.
10109                 * (Note that something like [\w\D] should match everything,
10110                 * because \d should be a proper subset of \w.  But rather than
10111                 * trust that the locale is well behaved, we leave this to
10112                 * runtime to sort out) */
10113                if (POSIXL_TEST(posixl, namedclass ^ 1)) {
10114                    cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
10115                    POSIXL_ZERO(posixl);
10116                    has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
10117                    anyof_flags &= ~ANYOF_MATCHES_POSIXL;
10118                    continue;   /* We could ignore the rest of the class, but
10119                                   best to parse it for any errors */
10120                }
10121                else { /* Here, isn't the complement of any already parsed
10122                          class */
10123                    POSIXL_SET(posixl, namedclass);
10124                    has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
10125                    anyof_flags |= ANYOF_MATCHES_POSIXL;
10126
10127                    /* The above-Latin1 characters are not subject to locale
10128                     * rules.  Just add them to the unconditionally-matched
10129                     * list */
10130
10131                    /* Get the list of the above-Latin1 code points this
10132                     * matches */
10133                    _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
10134                                            PL_XPosix_ptrs[classnum],
10135
10136                                            /* Odd numbers are complements,
10137                                             * like NDIGIT, NASCII, ... */
10138                                            namedclass % 2 != 0,
10139                                            &scratch_list);
10140                    /* Checking if 'cp_list' is NULL first saves an extra
10141                     * clone.  Its reference count will be decremented at the
10142                     * next union, etc, or if this is the only instance, at the
10143                     * end of the routine */
10144                    if (! cp_list) {
10145                        cp_list = scratch_list;
10146                    }
10147                    else {
10148                        _invlist_union(cp_list, scratch_list, &cp_list);
10149                        SvREFCNT_dec_NN(scratch_list);
10150                    }
10151                    continue;   /* Go get next character */
10152                }
10153            }
10154            else {
10155
10156                /* Here, is not /l, or is a POSIX class for which /l doesn't
10157                 * matter (or is a Unicode property, which is skipped here). */
10158                if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
10159                    if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
10160
10161                        /* Here, should be \h, \H, \v, or \V.  None of /d, /i
10162                         * nor /l make a difference in what these match,
10163                         * therefore we just add what they match to cp_list. */
10164                        if (classnum != CC_VERTSPACE_) {
10165                            assert(   namedclass == ANYOF_HORIZWS
10166                                   || namedclass == ANYOF_NHORIZWS);
10167
10168                            /* It turns out that \h is just a synonym for
10169                             * XPosixBlank */
10170                            classnum = CC_BLANK_;
10171                        }
10172
10173                        _invlist_union_maybe_complement_2nd(
10174                                cp_list,
10175                                PL_XPosix_ptrs[classnum],
10176                                namedclass % 2 != 0,    /* Complement if odd
10177                                                          (NHORIZWS, NVERTWS)
10178                                                        */
10179                                &cp_list);
10180                    }
10181                }
10182                else if (   AT_LEAST_UNI_SEMANTICS
10183                         || classnum == CC_ASCII_
10184                         || (DEPENDS_SEMANTICS && (   classnum == CC_DIGIT_
10185                                                   || classnum == CC_XDIGIT_)))
10186                {
10187                    /* We usually have to worry about /d affecting what POSIX
10188                     * classes match, with special code needed because we won't
10189                     * know until runtime what all matches.  But there is no
10190                     * extra work needed under /u and /a; and [:ascii:] is
10191                     * unaffected by /d; and :digit: and :xdigit: don't have
10192                     * runtime differences under /d.  So we can special case
10193                     * these, and avoid some extra work below, and at runtime.
10194                     * */
10195                    _invlist_union_maybe_complement_2nd(
10196                                                     simple_posixes,
10197                                                      ((AT_LEAST_ASCII_RESTRICTED)
10198                                                       ? PL_Posix_ptrs[classnum]
10199                                                       : PL_XPosix_ptrs[classnum]),
10200                                                     namedclass % 2 != 0,
10201                                                     &simple_posixes);
10202                }
10203                else {  /* Garden variety class.  If is NUPPER, NALPHA, ...
10204                           complement and use nposixes */
10205                    SV** posixes_ptr = namedclass % 2 == 0
10206                                       ? &posixes
10207                                       : &nposixes;
10208                    _invlist_union_maybe_complement_2nd(
10209                                                     *posixes_ptr,
10210                                                     PL_XPosix_ptrs[classnum],
10211                                                     namedclass % 2 != 0,
10212                                                     posixes_ptr);
10213                }
10214            }
10215        } /* end of namedclass \blah */
10216
10217        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
10218
10219        /* If 'range' is set, 'value' is the ending of a range--check its
10220         * validity.  (If value isn't a single code point in the case of a
10221         * range, we should have figured that out above in the code that
10222         * catches false ranges).  Later, we will handle each individual code
10223         * point in the range.  If 'range' isn't set, this could be the
10224         * beginning of a range, so check for that by looking ahead to see if
10225         * the next real character to be processed is the range indicator--the
10226         * minus sign */
10227
10228        if (range) {
10229#ifdef EBCDIC
10230            /* For unicode ranges, we have to test that the Unicode as opposed
10231             * to the native values are not decreasing.  (Above 255, there is
10232             * no difference between native and Unicode) */
10233            if (unicode_range && prevvalue < 255 && value < 255) {
10234                if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
10235                    goto backwards_range;
10236                }
10237            }
10238            else
10239#endif
10240            if (prevvalue > value) /* b-a */ {
10241                int w;
10242#ifdef EBCDIC
10243              backwards_range:
10244#endif
10245                w = RExC_parse - rangebegin;
10246                vFAIL2utf8f(
10247                    "Invalid [] range \"%" UTF8f "\"",
10248                    UTF8fARG(UTF, w, rangebegin));
10249                NOT_REACHED; /* NOTREACHED */
10250            }
10251        }
10252        else {
10253            prevvalue = value; /* save the beginning of the potential range */
10254            if (! stop_at_1     /* Can't be a range if parsing just one thing */
10255                && *RExC_parse == '-')
10256            {
10257                char* next_char_ptr = RExC_parse + 1;
10258
10259                /* Get the next real char after the '-' */
10260                SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
10261
10262                /* If the '-' is at the end of the class (just before the ']',
10263                 * it is a literal minus; otherwise it is a range */
10264                if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
10265                    RExC_parse_set(next_char_ptr);
10266
10267                    /* a bad range like \w-, [:word:]- ? */
10268                    if (namedclass > OOB_NAMEDCLASS) {
10269                        if (strict || ckWARN(WARN_REGEXP)) {
10270                            const int w = RExC_parse >= rangebegin
10271                                          ?  RExC_parse - rangebegin
10272                                          : 0;
10273                            if (strict) {
10274                                vFAIL4("False [] range \"%*.*s\"",
10275                                    w, w, rangebegin);
10276                            }
10277                            else {
10278                                vWARN4(RExC_parse,
10279                                    "False [] range \"%*.*s\"",
10280                                    w, w, rangebegin);
10281                            }
10282                        }
10283                        cp_list = add_cp_to_invlist(cp_list, '-');
10284                        element_count++;
10285                    } else
10286                        range = 1;	/* yeah, it's a range! */
10287                    continue;	/* but do it the next time */
10288                }
10289            }
10290        }
10291
10292        if (namedclass > OOB_NAMEDCLASS) {
10293            continue;
10294        }
10295
10296        /* Here, we have a single value this time through the loop, and
10297         * <prevvalue> is the beginning of the range, if any; or <value> if
10298         * not. */
10299
10300        /* non-Latin1 code point implies unicode semantics. */
10301        if (value > 255) {
10302            if (value > MAX_LEGAL_CP && (   value != UV_MAX
10303                                         || prevvalue > MAX_LEGAL_CP))
10304            {
10305                vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
10306            }
10307            REQUIRE_UNI_RULES(flagp, 0);
10308            if (  ! silence_non_portable
10309                &&  UNICODE_IS_PERL_EXTENDED(value)
10310                &&  TO_OUTPUT_WARNINGS(RExC_parse))
10311            {
10312                ckWARN2_non_literal_string(RExC_parse,
10313                                           packWARN(WARN_PORTABLE),
10314                                           PL_extended_cp_format,
10315                                           value);
10316            }
10317        }
10318
10319        /* Ready to process either the single value, or the completed range.
10320         * For single-valued non-inverted ranges, we consider the possibility
10321         * of multi-char folds.  (We made a conscious decision to not do this
10322         * for the other cases because it can often lead to non-intuitive
10323         * results.  For example, you have the peculiar case that:
10324         *  "s s" =~ /^[^\xDF]+$/i => Y
10325         *  "ss"  =~ /^[^\xDF]+$/i => N
10326         *
10327         * See [perl #89750] */
10328        if (FOLD && allow_mutiple_chars && value == prevvalue) {
10329            if (    value == LATIN_SMALL_LETTER_SHARP_S
10330                || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
10331                                                        value)))
10332            {
10333                /* Here <value> is indeed a multi-char fold.  Get what it is */
10334
10335                U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10336                STRLEN foldlen;
10337
10338                UV folded = _to_uni_fold_flags(
10339                                value,
10340                                foldbuf,
10341                                &foldlen,
10342                                FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
10343                                                   ? FOLD_FLAGS_NOMIX_ASCII
10344                                                   : 0)
10345                                );
10346
10347                /* Here, <folded> should be the first character of the
10348                 * multi-char fold of <value>, with <foldbuf> containing the
10349                 * whole thing.  But, if this fold is not allowed (because of
10350                 * the flags), <fold> will be the same as <value>, and should
10351                 * be processed like any other character, so skip the special
10352                 * handling */
10353                if (folded != value) {
10354
10355                    /* Skip if we are recursed, currently parsing the class
10356                     * again.  Otherwise add this character to the list of
10357                     * multi-char folds. */
10358                    if (! RExC_in_multi_char_class) {
10359                        STRLEN cp_count = utf8_length(foldbuf,
10360                                                      foldbuf + foldlen);
10361                        SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
10362
10363                        Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
10364
10365                        multi_char_matches
10366                                        = add_multi_match(multi_char_matches,
10367                                                          multi_fold,
10368                                                          cp_count);
10369
10370                    }
10371
10372                    /* This element should not be processed further in this
10373                     * class */
10374                    element_count--;
10375                    value = save_value;
10376                    prevvalue = save_prevvalue;
10377                    continue;
10378                }
10379            }
10380        }
10381
10382        if (strict && ckWARN(WARN_REGEXP)) {
10383            if (range) {
10384
10385                /* If the range starts above 255, everything is portable and
10386                 * likely to be so for any forseeable character set, so don't
10387                 * warn. */
10388                if (unicode_range && non_portable_endpoint && prevvalue < 256) {
10389                    vWARN(RExC_parse, "Both or neither range ends should be Unicode");
10390                }
10391                else if (prevvalue != value) {
10392
10393                    /* Under strict, ranges that stop and/or end in an ASCII
10394                     * printable should have each end point be a portable value
10395                     * for it (preferably like 'A', but we don't warn if it is
10396                     * a (portable) Unicode name or code point), and the range
10397                     * must be all digits or all letters of the same case.
10398                     * Otherwise, the range is non-portable and unclear as to
10399                     * what it contains */
10400                    if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
10401                        && (          non_portable_endpoint
10402                            || ! (   (isDIGIT_A(prevvalue) && isDIGIT_A(value))
10403                                  || (isLOWER_A(prevvalue) && isLOWER_A(value))
10404                                  || (isUPPER_A(prevvalue) && isUPPER_A(value))
10405                    ))) {
10406                        vWARN(RExC_parse, "Ranges of ASCII printables should"
10407                                          " be some subset of \"0-9\","
10408                                          " \"A-Z\", or \"a-z\"");
10409                    }
10410                    else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
10411                        SSize_t index_start;
10412                        SSize_t index_final;
10413
10414                        /* But the nature of Unicode and languages mean we
10415                         * can't do the same checks for above-ASCII ranges,
10416                         * except in the case of digit ones.  These should
10417                         * contain only digits from the same group of 10.  The
10418                         * ASCII case is handled just above.  Hence here, the
10419                         * range could be a range of digits.  First some
10420                         * unlikely special cases.  Grandfather in that a range
10421                         * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
10422                         * if its starting value is one of the 10 digits prior
10423                         * to it.  This is because it is an alternate way of
10424                         * writing 19D1, and some people may expect it to be in
10425                         * that group.  But it is bad, because it won't give
10426                         * the expected results.  In Unicode 5.2 it was
10427                         * considered to be in that group (of 11, hence), but
10428                         * this was fixed in the next version */
10429
10430                        if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
10431                            goto warn_bad_digit_range;
10432                        }
10433                        else if (UNLIKELY(   prevvalue >= 0x1D7CE
10434                                          &&     value <= 0x1D7FF))
10435                        {
10436                            /* This is the only other case currently in Unicode
10437                             * where the algorithm below fails.  The code
10438                             * points just above are the end points of a single
10439                             * range containing only decimal digits.  It is 5
10440                             * different series of 0-9.  All other ranges of
10441                             * digits currently in Unicode are just a single
10442                             * series.  (And mktables will notify us if a later
10443                             * Unicode version breaks this.)
10444                             *
10445                             * If the range being checked is at most 9 long,
10446                             * and the digit values represented are in
10447                             * numerical order, they are from the same series.
10448                             * */
10449                            if (         value - prevvalue > 9
10450                                ||    (((    value - 0x1D7CE) % 10)
10451                                     <= (prevvalue - 0x1D7CE) % 10))
10452                            {
10453                                goto warn_bad_digit_range;
10454                            }
10455                        }
10456                        else {
10457
10458                            /* For all other ranges of digits in Unicode, the
10459                             * algorithm is just to check if both end points
10460                             * are in the same series, which is the same range.
10461                             * */
10462                            index_start = _invlist_search(
10463                                                    PL_XPosix_ptrs[CC_DIGIT_],
10464                                                    prevvalue);
10465
10466                            /* Warn if the range starts and ends with a digit,
10467                             * and they are not in the same group of 10. */
10468                            if (   index_start >= 0
10469                                && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
10470                                && (index_final =
10471                                    _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
10472                                                    value)) != index_start
10473                                && index_final >= 0
10474                                && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
10475                            {
10476                              warn_bad_digit_range:
10477                                vWARN(RExC_parse, "Ranges of digits should be"
10478                                                  " from the same group of"
10479                                                  " 10");
10480                            }
10481                        }
10482                    }
10483                }
10484            }
10485            if ((! range || prevvalue == value) && non_portable_endpoint) {
10486                if (isPRINT_A(value)) {
10487                    char literal[3];
10488                    unsigned d = 0;
10489                    if (isBACKSLASHED_PUNCT(value)) {
10490                        literal[d++] = '\\';
10491                    }
10492                    literal[d++] = (char) value;
10493                    literal[d++] = '\0';
10494
10495                    vWARN4(RExC_parse,
10496                           "\"%.*s\" is more clearly written simply as \"%s\"",
10497                           (int) (RExC_parse - rangebegin),
10498                           rangebegin,
10499                           literal
10500                        );
10501                }
10502                else if (isMNEMONIC_CNTRL(value)) {
10503                    vWARN4(RExC_parse,
10504                           "\"%.*s\" is more clearly written simply as \"%s\"",
10505                           (int) (RExC_parse - rangebegin),
10506                           rangebegin,
10507                           cntrl_to_mnemonic((U8) value)
10508                        );
10509                }
10510            }
10511        }
10512
10513        /* Deal with this element of the class */
10514
10515#ifndef EBCDIC
10516        cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10517                                                    prevvalue, value);
10518#else
10519        /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
10520         * that don't require special handling, we can just add the range like
10521         * we do for ASCII platforms */
10522        if ((UNLIKELY(prevvalue == 0) && value >= 255)
10523            || ! (prevvalue < 256
10524                    && (unicode_range
10525                        || (! non_portable_endpoint
10526                            && ((isLOWER_A(prevvalue) && isLOWER_A(value))
10527                                || (isUPPER_A(prevvalue)
10528                                    && isUPPER_A(value)))))))
10529        {
10530            cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10531                                                        prevvalue, value);
10532        }
10533        else {
10534            /* Here, requires special handling.  This can be because it is a
10535             * range whose code points are considered to be Unicode, and so
10536             * must be individually translated into native, or because its a
10537             * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
10538             * EBCDIC, but we have defined them to include only the "expected"
10539             * upper or lower case ASCII alphabetics.  Subranges above 255 are
10540             * the same in native and Unicode, so can be added as a range */
10541            U8 start = NATIVE_TO_LATIN1(prevvalue);
10542            unsigned j;
10543            U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
10544            for (j = start; j <= end; j++) {
10545                cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
10546            }
10547            if (value > 255) {
10548                cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10549                                                            256, value);
10550            }
10551        }
10552#endif
10553
10554        range = 0; /* this range (if it was one) is done now */
10555    } /* End of loop through all the text within the brackets */
10556
10557    if (   posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
10558        output_posix_warnings(pRExC_state, posix_warnings);
10559    }
10560
10561    /* If anything in the class expands to more than one character, we have to
10562     * deal with them by building up a substitute parse string, and recursively
10563     * calling reg() on it, instead of proceeding */
10564    if (multi_char_matches) {
10565        SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
10566        I32 cp_count;
10567        STRLEN len;
10568        char *save_end = RExC_end;
10569        char *save_parse = RExC_parse;
10570        char *save_start = RExC_start;
10571        Size_t constructed_prefix_len = 0; /* This gives the length of the
10572                                              constructed portion of the
10573                                              substitute parse. */
10574        bool first_time = TRUE;     /* First multi-char occurrence doesn't get
10575                                       a "|" */
10576        I32 reg_flags;
10577
10578        assert(! invert);
10579        /* Only one level of recursion allowed */
10580        assert(RExC_copy_start_in_constructed == RExC_precomp);
10581
10582#if 0   /* Have decided not to deal with multi-char folds in inverted classes,
10583           because too confusing */
10584        if (invert) {
10585            sv_catpvs(substitute_parse, "(?:");
10586        }
10587#endif
10588
10589        /* Look at the longest strings first */
10590        for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
10591                        cp_count > 0;
10592                        cp_count--)
10593        {
10594
10595            if (av_exists(multi_char_matches, cp_count)) {
10596                AV** this_array_ptr;
10597                SV* this_sequence;
10598
10599                this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
10600                                                 cp_count, FALSE);
10601                while ((this_sequence = av_pop(*this_array_ptr)) !=
10602                                                                &PL_sv_undef)
10603                {
10604                    if (! first_time) {
10605                        sv_catpvs(substitute_parse, "|");
10606                    }
10607                    first_time = FALSE;
10608
10609                    sv_catpv(substitute_parse, SvPVX(this_sequence));
10610                }
10611            }
10612        }
10613
10614        /* If the character class contains anything else besides these
10615         * multi-character strings, have to include it in recursive parsing */
10616        if (element_count) {
10617            bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
10618
10619            sv_catpvs(substitute_parse, "|");
10620            if (has_l_bracket) {    /* Add an [ if the original had one */
10621                sv_catpvs(substitute_parse, "[");
10622            }
10623            constructed_prefix_len = SvCUR(substitute_parse);
10624            sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
10625
10626            /* Put in a closing ']' to match any opening one, but not if going
10627             * off the end, as otherwise we are adding something that really
10628             * isn't there */
10629            if (has_l_bracket && RExC_parse < RExC_end) {
10630                sv_catpvs(substitute_parse, "]");
10631            }
10632        }
10633
10634        sv_catpvs(substitute_parse, ")");
10635#if 0
10636        if (invert) {
10637            /* This is a way to get the parse to skip forward a whole named
10638             * sequence instead of matching the 2nd character when it fails the
10639             * first */
10640            sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
10641        }
10642#endif
10643
10644        /* Set up the data structure so that any errors will be properly
10645         * reported.  See the comments at the definition of
10646         * REPORT_LOCATION_ARGS for details */
10647        RExC_copy_start_in_input = (char *) orig_parse;
10648        RExC_start = SvPV(substitute_parse, len);
10649        RExC_parse_set( RExC_start );
10650        RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
10651        RExC_end = RExC_parse + len;
10652        RExC_in_multi_char_class = 1;
10653
10654        ret = reg(pRExC_state, 1, &reg_flags, depth+1);
10655
10656        *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
10657
10658        /* And restore so can parse the rest of the pattern */
10659        RExC_parse_set(save_parse);
10660        RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
10661        RExC_end = save_end;
10662        RExC_in_multi_char_class = 0;
10663        SvREFCNT_dec_NN(multi_char_matches);
10664        SvREFCNT_dec(properties);
10665        SvREFCNT_dec(cp_list);
10666        SvREFCNT_dec(simple_posixes);
10667        SvREFCNT_dec(posixes);
10668        SvREFCNT_dec(nposixes);
10669        SvREFCNT_dec(cp_foldable_list);
10670        return ret;
10671    }
10672
10673    /* If folding, we calculate all characters that could fold to or from the
10674     * ones already on the list */
10675    if (cp_foldable_list) {
10676        if (FOLD) {
10677            UV start, end;	/* End points of code point ranges */
10678
10679            SV* fold_intersection = NULL;
10680            SV** use_list;
10681
10682            /* Our calculated list will be for Unicode rules.  For locale
10683             * matching, we have to keep a separate list that is consulted at
10684             * runtime only when the locale indicates Unicode rules (and we
10685             * don't include potential matches in the ASCII/Latin1 range, as
10686             * any code point could fold to any other, based on the run-time
10687             * locale).   For non-locale, we just use the general list */
10688            if (LOC) {
10689                use_list = &only_utf8_locale_list;
10690            }
10691            else {
10692                use_list = &cp_list;
10693            }
10694
10695            /* Only the characters in this class that participate in folds need
10696             * be checked.  Get the intersection of this class and all the
10697             * possible characters that are foldable.  This can quickly narrow
10698             * down a large class */
10699            _invlist_intersection(PL_in_some_fold, cp_foldable_list,
10700                                  &fold_intersection);
10701
10702            /* Now look at the foldable characters in this class individually */
10703            invlist_iterinit(fold_intersection);
10704            while (invlist_iternext(fold_intersection, &start, &end)) {
10705                UV j;
10706                UV folded;
10707
10708                /* Look at every character in the range */
10709                for (j = start; j <= end; j++) {
10710                    U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10711                    STRLEN foldlen;
10712                    unsigned int k;
10713                    Size_t folds_count;
10714                    U32 first_fold;
10715                    const U32 * remaining_folds;
10716
10717                    if (j < 256) {
10718
10719                        /* Under /l, we don't know what code points below 256
10720                         * fold to, except we do know the MICRO SIGN folds to
10721                         * an above-255 character if the locale is UTF-8, so we
10722                         * add it to the special list (in *use_list)  Otherwise
10723                         * we know now what things can match, though some folds
10724                         * are valid under /d only if the target is UTF-8.
10725                         * Those go in a separate list */
10726                        if (      IS_IN_SOME_FOLD_L1(j)
10727                            && ! (LOC && j != MICRO_SIGN))
10728                        {
10729
10730                            /* ASCII is always matched; non-ASCII is matched
10731                             * only under Unicode rules (which could happen
10732                             * under /l if the locale is a UTF-8 one */
10733                            if (isASCII(j) || ! DEPENDS_SEMANTICS) {
10734                                *use_list = add_cp_to_invlist(*use_list,
10735                                                            PL_fold_latin1[j]);
10736                            }
10737                            else if (j != PL_fold_latin1[j]) {
10738                                upper_latin1_only_utf8_matches
10739                                        = add_cp_to_invlist(
10740                                                upper_latin1_only_utf8_matches,
10741                                                PL_fold_latin1[j]);
10742                            }
10743                        }
10744
10745                        if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
10746                            && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
10747                        {
10748                            add_above_Latin1_folds(pRExC_state,
10749                                                   (U8) j,
10750                                                   use_list);
10751                        }
10752                        continue;
10753                    }
10754
10755                    /* Here is an above Latin1 character.  We don't have the
10756                     * rules hard-coded for it.  First, get its fold.  This is
10757                     * the simple fold, as the multi-character folds have been
10758                     * handled earlier and separated out */
10759                    folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
10760                                                        (ASCII_FOLD_RESTRICTED)
10761                                                        ? FOLD_FLAGS_NOMIX_ASCII
10762                                                        : 0);
10763
10764                    /* Single character fold of above Latin1.  Add everything
10765                     * in its fold closure to the list that this node should
10766                     * match. */
10767                    folds_count = _inverse_folds(folded, &first_fold,
10768                                                    &remaining_folds);
10769                    for (k = 0; k <= folds_count; k++) {
10770                        UV c = (k == 0)     /* First time through use itself */
10771                                ? folded
10772                                : (k == 1)  /* 2nd time use, the first fold */
10773                                   ? first_fold
10774
10775                                     /* Then the remaining ones */
10776                                   : remaining_folds[k-2];
10777
10778                        /* /aa doesn't allow folds between ASCII and non- */
10779                        if ((   ASCII_FOLD_RESTRICTED
10780                            && (isASCII(c) != isASCII(j))))
10781                        {
10782                            continue;
10783                        }
10784
10785                        /* Folds under /l which cross the 255/256 boundary are
10786                         * added to a separate list.  (These are valid only
10787                         * when the locale is UTF-8.) */
10788                        if (c < 256 && LOC) {
10789                            *use_list = add_cp_to_invlist(*use_list, c);
10790                            continue;
10791                        }
10792
10793                        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
10794                        {
10795                            cp_list = add_cp_to_invlist(cp_list, c);
10796                        }
10797                        else {
10798                            /* Similarly folds involving non-ascii Latin1
10799                             * characters under /d are added to their list */
10800                            upper_latin1_only_utf8_matches
10801                                    = add_cp_to_invlist(
10802                                                upper_latin1_only_utf8_matches,
10803                                                c);
10804                        }
10805                    }
10806                }
10807            }
10808            SvREFCNT_dec_NN(fold_intersection);
10809        }
10810
10811        /* Now that we have finished adding all the folds, there is no reason
10812         * to keep the foldable list separate */
10813        _invlist_union(cp_list, cp_foldable_list, &cp_list);
10814        SvREFCNT_dec_NN(cp_foldable_list);
10815    }
10816
10817    /* And combine the result (if any) with any inversion lists from posix
10818     * classes.  The lists are kept separate up to now because we don't want to
10819     * fold the classes */
10820    if (simple_posixes) {   /* These are the classes known to be unaffected by
10821                               /a, /aa, and /d */
10822        if (cp_list) {
10823            _invlist_union(cp_list, simple_posixes, &cp_list);
10824            SvREFCNT_dec_NN(simple_posixes);
10825        }
10826        else {
10827            cp_list = simple_posixes;
10828        }
10829    }
10830    if (posixes || nposixes) {
10831        if (! DEPENDS_SEMANTICS) {
10832
10833            /* For everything but /d, we can just add the current 'posixes' and
10834             * 'nposixes' to the main list */
10835            if (posixes) {
10836                if (cp_list) {
10837                    _invlist_union(cp_list, posixes, &cp_list);
10838                    SvREFCNT_dec_NN(posixes);
10839                }
10840                else {
10841                    cp_list = posixes;
10842                }
10843            }
10844            if (nposixes) {
10845                if (cp_list) {
10846                    _invlist_union(cp_list, nposixes, &cp_list);
10847                    SvREFCNT_dec_NN(nposixes);
10848                }
10849                else {
10850                    cp_list = nposixes;
10851                }
10852            }
10853        }
10854        else {
10855            /* Under /d, things like \w match upper Latin1 characters only if
10856             * the target string is in UTF-8.  But things like \W match all the
10857             * upper Latin1 characters if the target string is not in UTF-8.
10858             *
10859             * Handle the case with something like \W separately */
10860            if (nposixes) {
10861                SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
10862
10863                /* A complemented posix class matches all upper Latin1
10864                 * characters if not in UTF-8.  And it matches just certain
10865                 * ones when in UTF-8.  That means those certain ones are
10866                 * matched regardless, so can just be added to the
10867                 * unconditional list */
10868                if (cp_list) {
10869                    _invlist_union(cp_list, nposixes, &cp_list);
10870                    SvREFCNT_dec_NN(nposixes);
10871                    nposixes = NULL;
10872                }
10873                else {
10874                    cp_list = nposixes;
10875                }
10876
10877                /* Likewise for 'posixes' */
10878                _invlist_union(posixes, cp_list, &cp_list);
10879                SvREFCNT_dec(posixes);
10880
10881                /* Likewise for anything else in the range that matched only
10882                 * under UTF-8 */
10883                if (upper_latin1_only_utf8_matches) {
10884                    _invlist_union(cp_list,
10885                                   upper_latin1_only_utf8_matches,
10886                                   &cp_list);
10887                    SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10888                    upper_latin1_only_utf8_matches = NULL;
10889                }
10890
10891                /* If we don't match all the upper Latin1 characters regardless
10892                 * of UTF-8ness, we have to set a flag to match the rest when
10893                 * not in UTF-8 */
10894                _invlist_subtract(only_non_utf8_list, cp_list,
10895                                  &only_non_utf8_list);
10896                if (_invlist_len(only_non_utf8_list) != 0) {
10897                    anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
10898                }
10899                SvREFCNT_dec_NN(only_non_utf8_list);
10900            }
10901            else {
10902                /* Here there were no complemented posix classes.  That means
10903                 * the upper Latin1 characters in 'posixes' match only when the
10904                 * target string is in UTF-8.  So we have to add them to the
10905                 * list of those types of code points, while adding the
10906                 * remainder to the unconditional list.
10907                 *
10908                 * First calculate what they are */
10909                SV* nonascii_but_latin1_properties = NULL;
10910                _invlist_intersection(posixes, PL_UpperLatin1,
10911                                      &nonascii_but_latin1_properties);
10912
10913                /* And add them to the final list of such characters. */
10914                _invlist_union(upper_latin1_only_utf8_matches,
10915                               nonascii_but_latin1_properties,
10916                               &upper_latin1_only_utf8_matches);
10917
10918                /* Remove them from what now becomes the unconditional list */
10919                _invlist_subtract(posixes, nonascii_but_latin1_properties,
10920                                  &posixes);
10921
10922                /* And add those unconditional ones to the final list */
10923                if (cp_list) {
10924                    _invlist_union(cp_list, posixes, &cp_list);
10925                    SvREFCNT_dec_NN(posixes);
10926                    posixes = NULL;
10927                }
10928                else {
10929                    cp_list = posixes;
10930                }
10931
10932                SvREFCNT_dec(nonascii_but_latin1_properties);
10933
10934                /* Get rid of any characters from the conditional list that we
10935                 * now know are matched unconditionally, which may make that
10936                 * list empty */
10937                _invlist_subtract(upper_latin1_only_utf8_matches,
10938                                  cp_list,
10939                                  &upper_latin1_only_utf8_matches);
10940                if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
10941                    SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10942                    upper_latin1_only_utf8_matches = NULL;
10943                }
10944            }
10945        }
10946    }
10947
10948    /* And combine the result (if any) with any inversion list from properties.
10949     * The lists are kept separate up to now so that we can distinguish the two
10950     * in regards to matching above-Unicode.  A run-time warning is generated
10951     * if a Unicode property is matched against a non-Unicode code point. But,
10952     * we allow user-defined properties to match anything, without any warning,
10953     * and we also suppress the warning if there is a portion of the character
10954     * class that isn't a Unicode property, and which matches above Unicode, \W
10955     * or [\x{110000}] for example.
10956     * (Note that in this case, unlike the Posix one above, there is no
10957     * <upper_latin1_only_utf8_matches>, because having a Unicode property
10958     * forces Unicode semantics */
10959    if (properties) {
10960        if (cp_list) {
10961
10962            /* If it matters to the final outcome, see if a non-property
10963             * component of the class matches above Unicode.  If so, the
10964             * warning gets suppressed.  This is true even if just a single
10965             * such code point is specified, as, though not strictly correct if
10966             * another such code point is matched against, the fact that they
10967             * are using above-Unicode code points indicates they should know
10968             * the issues involved */
10969            if (warn_super) {
10970                warn_super = ! (invert
10971                               ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
10972            }
10973
10974            _invlist_union(properties, cp_list, &cp_list);
10975            SvREFCNT_dec_NN(properties);
10976        }
10977        else {
10978            cp_list = properties;
10979        }
10980
10981        if (warn_super) {
10982            anyof_flags |= ANYOF_WARN_SUPER__shared;
10983
10984            /* Because an ANYOF node is the only one that warns, this node
10985             * can't be optimized into something else */
10986            optimizable = FALSE;
10987        }
10988    }
10989
10990    /* Here, we have calculated what code points should be in the character
10991     * class.
10992     *
10993     * Now we can see about various optimizations.  Fold calculation (which we
10994     * did above) needs to take place before inversion.  Otherwise /[^k]/i
10995     * would invert to include K, which under /i would match k, which it
10996     * shouldn't.  Therefore we can't invert folded locale now, as it won't be
10997     * folded until runtime */
10998
10999    /* If we didn't do folding, it's because some information isn't available
11000     * until runtime; set the run-time fold flag for these  We know to set the
11001     * flag if we have a non-NULL list for UTF-8 locales, or the class matches
11002     * at least one 0-255 range code point */
11003    if (LOC && FOLD) {
11004
11005        /* Some things on the list might be unconditionally included because of
11006         * other components.  Remove them, and clean up the list if it goes to
11007         * 0 elements */
11008        if (only_utf8_locale_list && cp_list) {
11009            _invlist_subtract(only_utf8_locale_list, cp_list,
11010                              &only_utf8_locale_list);
11011
11012            if (_invlist_len(only_utf8_locale_list) == 0) {
11013                SvREFCNT_dec_NN(only_utf8_locale_list);
11014                only_utf8_locale_list = NULL;
11015            }
11016        }
11017        if (    only_utf8_locale_list
11018            || (    cp_list
11019                && (   _invlist_contains_cp(cp_list,
11020                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
11021                    || _invlist_contains_cp(cp_list,
11022                                            LATIN_SMALL_LETTER_DOTLESS_I))))
11023        {
11024            has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11025            anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11026        }
11027        else if (cp_list && invlist_lowest(cp_list) < 256) {
11028            /* If nothing is below 256, has no locale dependency; otherwise it
11029             * does */
11030            anyof_flags |= ANYOFL_FOLD;
11031            has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11032
11033            /* In a Turkish locale these could match, notify the run-time code
11034             * to check for that */
11035            if (   _invlist_contains_cp(cp_list, 'I')
11036                || _invlist_contains_cp(cp_list, 'i'))
11037            {
11038                anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11039            }
11040        }
11041    }
11042    else if (   DEPENDS_SEMANTICS
11043             && (    upper_latin1_only_utf8_matches
11044                 || (  anyof_flags
11045                     & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
11046    {
11047        RExC_seen_d_op = TRUE;
11048        has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
11049    }
11050
11051    /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
11052     * compile time. */
11053    if (     cp_list
11054        &&   invert
11055        && ! has_runtime_dependency)
11056    {
11057        _invlist_invert(cp_list);
11058
11059        /* Clear the invert flag since have just done it here */
11060        invert = FALSE;
11061    }
11062
11063    /* All possible optimizations below still have these characteristics.
11064     * (Multi-char folds aren't SIMPLE, but they don't get this far in this
11065     * routine) */
11066    *flagp |= HASWIDTH|SIMPLE;
11067
11068    if (ret_invlist) {
11069        *ret_invlist = cp_list;
11070
11071        return (cp_list) ? RExC_emit : 0;
11072    }
11073
11074    if (anyof_flags & ANYOF_LOCALE_FLAGS) {
11075        RExC_contains_locale = 1;
11076    }
11077
11078    if (optimizable) {
11079
11080        /* Some character classes are equivalent to other nodes.  Such nodes
11081         * take up less room, and some nodes require fewer operations to
11082         * execute, than ANYOF nodes.  EXACTish nodes may be joinable with
11083         * adjacent nodes to improve efficiency. */
11084        op = optimize_regclass(pRExC_state, cp_list,
11085                                            only_utf8_locale_list,
11086                                            upper_latin1_only_utf8_matches,
11087                                            has_runtime_dependency,
11088                                            posixl,
11089                                            &anyof_flags, &invert, &ret, flagp);
11090        RETURN_FAIL_ON_RESTART_FLAGP(flagp);
11091
11092        /* If optimized to something else and emitted, clean up and return */
11093        if (ret >= 0) {
11094            SvREFCNT_dec(cp_list);;
11095            SvREFCNT_dec(only_utf8_locale_list);
11096            SvREFCNT_dec(upper_latin1_only_utf8_matches);
11097            return ret;
11098        }
11099
11100        /* If no optimization was found, an END was returned and we will now
11101         * emit an ANYOF */
11102        if (op == END) {
11103            op = ANYOF;
11104        }
11105    }
11106
11107    /* Here are going to emit an ANYOF; set the particular type */
11108    if (op == ANYOF) {
11109        if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
11110            op = ANYOFD;
11111        }
11112        else if (posixl) {
11113            op = ANYOFPOSIXL;
11114        }
11115        else if (LOC) {
11116            op = ANYOFL;
11117        }
11118    }
11119
11120    ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
11121    FILL_NODE(ret, op);        /* We set the argument later */
11122    RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
11123    ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
11124
11125    /* Here, <cp_list> contains all the code points we can determine at
11126     * compile time that match under all conditions.  Go through it, and
11127     * for things that belong in the bitmap, put them there, and delete from
11128     * <cp_list>.  While we are at it, see if everything above 255 is in the
11129     * list, and if so, set a flag to speed up execution */
11130
11131    populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
11132
11133    if (posixl) {
11134        ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
11135    }
11136
11137    if (invert) {
11138        ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
11139    }
11140
11141    /* Here, the bitmap has been populated with all the Latin1 code points that
11142     * always match.  Can now add to the overall list those that match only
11143     * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
11144     * */
11145    if (upper_latin1_only_utf8_matches) {
11146        if (cp_list) {
11147            _invlist_union(cp_list,
11148                           upper_latin1_only_utf8_matches,
11149                           &cp_list);
11150            SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11151        }
11152        else {
11153            cp_list = upper_latin1_only_utf8_matches;
11154        }
11155        ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11156    }
11157
11158    set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
11159                  (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
11160                   ? listsv
11161                   : NULL,
11162                  only_utf8_locale_list);
11163
11164    SvREFCNT_dec(cp_list);;
11165    SvREFCNT_dec(only_utf8_locale_list);
11166    return ret;
11167}
11168
11169STATIC U8
11170S_optimize_regclass(pTHX_
11171                    RExC_state_t *pRExC_state,
11172                    SV * cp_list,
11173                    SV* only_utf8_locale_list,
11174                    SV* upper_latin1_only_utf8_matches,
11175                    const U32 has_runtime_dependency,
11176                    const U32 posixl,
11177                    U8  * anyof_flags,
11178                    bool * invert,
11179                    regnode_offset * ret,
11180                    I32 *flagp
11181                  )
11182{
11183    /* This function exists just to make S_regclass() smaller.  It extracts out
11184     * the code that looks for potential optimizations away from a full generic
11185     * ANYOF node.  The parameter names are the same as the corresponding
11186     * variables in S_regclass.
11187     *
11188     * It returns the new op (the impossible END one if no optimization found)
11189     * and sets *ret to any created regnode.  If the new op is sufficiently
11190     * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
11191     *
11192     * Certain of the parameters may be updated as a result of the changes
11193     * herein */
11194
11195    U8 op = END;    /* The returned node-type, initialized to an impossible
11196                      one. */
11197    UV value = 0;
11198    PERL_UINT_FAST8_T i;
11199    UV partial_cp_count = 0;
11200    UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
11201    UV   end[MAX_FOLD_FROMS+1] = { 0 };
11202    bool single_range = FALSE;
11203    UV lowest_cp = 0, highest_cp = 0;
11204
11205    PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
11206
11207    if (cp_list) { /* Count the code points in enough ranges that we would see
11208                      all the ones possible in any fold in this version of
11209                      Unicode */
11210
11211        invlist_iterinit(cp_list);
11212        for (i = 0; i <= MAX_FOLD_FROMS; i++) {
11213            if (! invlist_iternext(cp_list, &start[i], &end[i])) {
11214                break;
11215            }
11216            partial_cp_count += end[i] - start[i] + 1;
11217        }
11218
11219        if (i == 1) {
11220            single_range = TRUE;
11221        }
11222        invlist_iterfinish(cp_list);
11223
11224        /* If we know at compile time that this matches every possible code
11225         * point, any run-time dependencies don't matter */
11226        if (start[0] == 0 && end[0] == UV_MAX) {
11227            if (*invert) {
11228                goto return_OPFAIL;
11229            }
11230            else {
11231                goto return_SANY;
11232            }
11233        }
11234
11235        /* Use a clearer mnemonic for below */
11236        lowest_cp = start[0];
11237
11238        highest_cp = invlist_highest(cp_list);
11239    }
11240
11241    /* Similarly, for /l posix classes, if both a class and its complement
11242     * match, any run-time dependencies don't matter */
11243    if (posixl) {
11244        int namedclass;
11245        for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
11246            if (   POSIXL_TEST(posixl, namedclass)      /* class */
11247                && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
11248            {
11249                if (*invert) {
11250                    goto return_OPFAIL;
11251                }
11252                goto return_SANY;
11253            }
11254        }
11255
11256        /* For well-behaved locales, some classes are subsets of others, so
11257         * complementing the subset and including the non-complemented superset
11258         * should match everything, like [\D[:alnum:]], and
11259         * [[:^alpha:][:alnum:]], but some implementations of locales are
11260         * buggy, and khw thinks its a bad idea to have optimization change
11261         * behavior, even if it avoids an OS bug in a given case */
11262
11263#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
11264
11265        /* If is a single posix /l class, can optimize to just that op.  Such a
11266         * node will not match anything in the Latin1 range, as that is not
11267         * determinable until runtime, but will match whatever the class does
11268         * outside that range.  (Note that some classes won't match anything
11269         * outside the range, like [:ascii:]) */
11270        if (   isSINGLE_BIT_SET(posixl)
11271            && (partial_cp_count == 0 || lowest_cp > 255))
11272        {
11273            U8 classnum;
11274            SV * class_above_latin1 = NULL;
11275            bool already_inverted;
11276            bool are_equivalent;
11277
11278
11279            namedclass = single_1bit_pos32(posixl);
11280            classnum = namedclass_to_classnum(namedclass);
11281
11282            /* The named classes are such that the inverted number is one
11283             * larger than the non-inverted one */
11284            already_inverted = namedclass - classnum_to_namedclass(classnum);
11285
11286            /* Create an inversion list of the official property, inverted if
11287             * the constructed node list is inverted, and restricted to only
11288             * the above latin1 code points, which are the only ones known at
11289             * compile time */
11290            _invlist_intersection_maybe_complement_2nd(
11291                                                PL_AboveLatin1,
11292                                                PL_XPosix_ptrs[classnum],
11293                                                already_inverted,
11294                                                &class_above_latin1);
11295            are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
11296            SvREFCNT_dec_NN(class_above_latin1);
11297
11298            if (are_equivalent) {
11299
11300                /* Resolve the run-time inversion flag with this possibly
11301                 * inverted class */
11302                *invert = *invert ^ already_inverted;
11303
11304                op = POSIXL + *invert * (NPOSIXL - POSIXL);
11305                *ret = reg_node(pRExC_state, op);
11306                FLAGS(REGNODE_p(*ret)) = classnum;
11307                return op;
11308            }
11309        }
11310    }
11311
11312    /* khw can't think of any other possible transformation involving these. */
11313    if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
11314        return END;
11315    }
11316
11317    if (! has_runtime_dependency) {
11318
11319        /* If the list is empty, nothing matches.  This happens, for example,
11320         * when a Unicode property that doesn't match anything is the only
11321         * element in the character class (perluniprops.pod notes such
11322         * properties). */
11323        if (partial_cp_count == 0) {
11324            if (*invert) {
11325                goto return_SANY;
11326            }
11327            else {
11328                goto return_OPFAIL;
11329            }
11330        }
11331
11332        /* If matches everything but \n */
11333        if (   start[0] == 0 && end[0] == '\n' - 1
11334            && start[1] == '\n' + 1 && end[1] == UV_MAX)
11335        {
11336            assert (! *invert);
11337            op = REG_ANY;
11338            *ret = reg_node(pRExC_state, op);
11339            MARK_NAUGHTY(1);
11340            return op;
11341        }
11342    }
11343
11344    /* Next see if can optimize classes that contain just a few code points
11345     * into an EXACTish node.  The reason to do this is to let the optimizer
11346     * join this node with adjacent EXACTish ones, and ANYOF nodes require
11347     * runtime conversion to code point from UTF-8, which we'd like to avoid.
11348     *
11349     * An EXACTFish node can be generated even if not under /i, and vice versa.
11350     * But care must be taken.  An EXACTFish node has to be such that it only
11351     * matches precisely the code points in the class, but we want to generate
11352     * the least restrictive one that does that, to increase the odds of being
11353     * able to join with an adjacent node.  For example, if the class contains
11354     * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
11355     * from matching.  Whether we are under /i or not is irrelevant in this
11356     * case.  Less obvious is the pattern qr/[\x{02BC}]n/i.  U+02BC is MODIFIER
11357     * LETTER APOSTROPHE. That is supposed to match the single character U+0149
11358     * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE.  And so even though there
11359     * is no simple fold that includes \X{02BC}, there is a multi-char fold
11360     * that does, and so the node generated for it must be an EXACTFish one.
11361     * On the other hand qr/:/i should generate a plain EXACT node since the
11362     * colon participates in no fold whatsoever, and having it be EXACT tells
11363     * the optimizer the target string cannot match unless it has a colon in
11364     * it. */
11365    if (   ! posixl
11366        && ! *invert
11367
11368            /* Only try if there are no more code points in the class than in
11369             * the max possible fold */
11370        &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
11371    {
11372        /* We can always make a single code point class into an EXACTish node.
11373         * */
11374        if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
11375            if (LOC) {
11376
11377                /* Here is /l:  Use EXACTL, except if there is a fold not known
11378                 * until runtime so shows as only a single code point here.
11379                 * For code points above 255, we know which can cause problems
11380                 * by having a potential fold to the Latin1 range. */
11381                if (  ! FOLD
11382                    || (     lowest_cp > 255
11383                        && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
11384                {
11385                    op = EXACTL;
11386                }
11387                else {
11388                    op = EXACTFL;
11389                }
11390            }
11391            else if (! FOLD) { /* Not /l and not /i */
11392                op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
11393            }
11394            else if (lowest_cp < 256) { /* /i, not /l, and the code point is
11395                                          small */
11396
11397                /* Under /i, it gets a little tricky.  A code point that
11398                 * doesn't participate in a fold should be an EXACT node.  We
11399                 * know this one isn't the result of a simple fold, or there'd
11400                 * be more than one code point in the list, but it could be
11401                 * part of a multi-character fold.  In that case we better not
11402                 * create an EXACT node, as we would wrongly be telling the
11403                 * optimizer that this code point must be in the target string,
11404                 * and that is wrong.  This is because if the sequence around
11405                 * this code point forms a multi-char fold, what needs to be in
11406                 * the string could be the code point that folds to the
11407                 * sequence.
11408                 *
11409                 * This handles the case of below-255 code points, as we have
11410                 * an easy look up for those.  The next clause handles the
11411                 * above-256 one */
11412                op = IS_IN_SOME_FOLD_L1(lowest_cp)
11413                     ? EXACTFU
11414                     : EXACT;
11415            }
11416            else {  /* /i, larger code point.  Since we are under /i, and have
11417                       just this code point, we know that it can't fold to
11418                       something else, so PL_InMultiCharFold applies to it */
11419                op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
11420                         ? EXACTFU_REQ8
11421                         : EXACT_REQ8;
11422                }
11423
11424                value = lowest_cp;
11425        }
11426        else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
11427                 && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
11428        {
11429            /* Here, the only runtime dependency, if any, is from /d, and the
11430             * class matches more than one code point, and the lowest code
11431             * point participates in some fold.  It might be that the other
11432             * code points are /i equivalent to this one, and hence they would
11433             * be representable by an EXACTFish node.  Above, we eliminated
11434             * classes that contain too many code points to be EXACTFish, with
11435             * the test for MAX_FOLD_FROMS
11436             *
11437             * First, special case the ASCII fold pairs, like 'B' and 'b'.  We
11438             * do this because we have EXACTFAA at our disposal for the ASCII
11439             * range */
11440            if (partial_cp_count == 2 && isASCII(lowest_cp)) {
11441
11442                /* The only ASCII characters that participate in folds are
11443                 * alphabetics */
11444                assert(isALPHA(lowest_cp));
11445                if (   end[0] == start[0]   /* First range is a single
11446                                               character, so 2nd exists */
11447                    && isALPHA_FOLD_EQ(start[0], start[1]))
11448                {
11449                    /* Here, is part of an ASCII fold pair */
11450
11451                    if (   ASCII_FOLD_RESTRICTED
11452                        || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
11453                    {
11454                        /* If the second clause just above was true, it means
11455                         * we can't be under /i, or else the list would have
11456                         * included more than this fold pair.  Therefore we
11457                         * have to exclude the possibility of whatever else it
11458                         * is that folds to these, by using EXACTFAA */
11459                        op = EXACTFAA;
11460                    }
11461                    else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
11462
11463                        /* Here, there's no simple fold that lowest_cp is part
11464                         * of, but there is a multi-character one.  If we are
11465                         * not under /i, we want to exclude that possibility;
11466                         * if under /i, we want to include it */
11467                        op = (FOLD) ? EXACTFU : EXACTFAA;
11468                    }
11469                    else {
11470
11471                        /* Here, the only possible fold lowest_cp participates in
11472                         * is with start[1].  /i or not isn't relevant */
11473                        op = EXACTFU;
11474                    }
11475
11476                    value = toFOLD(lowest_cp);
11477                }
11478            }
11479            else if (  ! upper_latin1_only_utf8_matches
11480                     || (   _invlist_len(upper_latin1_only_utf8_matches) == 2
11481                         && PL_fold_latin1[
11482                           invlist_highest(upper_latin1_only_utf8_matches)]
11483                         == lowest_cp))
11484            {
11485                /* Here, the smallest character is non-ascii or there are more
11486                 * than 2 code points matched by this node.  Also, we either
11487                 * don't have /d UTF-8 dependent matches, or if we do, they
11488                 * look like they could be a single character that is the fold
11489                 * of the lowest one is in the always-match list.  This test
11490                 * quickly excludes most of the false positives when there are
11491                 * /d UTF-8 depdendent matches.  These are like LATIN CAPITAL
11492                 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
11493                 * iff the target string is UTF-8.  (We don't have to worry
11494                 * above about exceeding the array bounds of PL_fold_latin1[]
11495                 * because any code point in 'upper_latin1_only_utf8_matches'
11496                 * is below 256.)
11497                 *
11498                 * EXACTFAA would apply only to pairs (hence exactly 2 code
11499                 * points) in the ASCII range, so we can't use it here to
11500                 * artificially restrict the fold domain, so we check if the
11501                 * class does or does not match some EXACTFish node.  Further,
11502                 * if we aren't under /i, and and the folded-to character is
11503                 * part of a multi-character fold, we can't do this
11504                 * optimization, as the sequence around it could be that
11505                 * multi-character fold, and we don't here know the context, so
11506                 * we have to assume it is that multi-char fold, to prevent
11507                 * potential bugs.
11508                 *
11509                 * To do the general case, we first find the fold of the lowest
11510                 * code point (which may be higher than that lowest unfolded
11511                 * one), then find everything that folds to it.  (The data
11512                 * structure we have only maps from the folded code points, so
11513                 * we have to do the earlier step.) */
11514
11515                Size_t foldlen;
11516                U8 foldbuf[UTF8_MAXBYTES_CASE];
11517                UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
11518                U32 first_fold;
11519                const U32 * remaining_folds;
11520                Size_t folds_to_this_cp_count = _inverse_folds(
11521                                                            folded,
11522                                                            &first_fold,
11523                                                            &remaining_folds);
11524                Size_t folds_count = folds_to_this_cp_count + 1;
11525                SV * fold_list = _new_invlist(folds_count);
11526                unsigned int i;
11527
11528                /* If there are UTF-8 dependent matches, create a temporary
11529                 * list of what this node matches, including them. */
11530                SV * all_cp_list = NULL;
11531                SV ** use_this_list = &cp_list;
11532
11533                if (upper_latin1_only_utf8_matches) {
11534                    all_cp_list = _new_invlist(0);
11535                    use_this_list = &all_cp_list;
11536                    _invlist_union(cp_list,
11537                                   upper_latin1_only_utf8_matches,
11538                                   use_this_list);
11539                }
11540
11541                /* Having gotten everything that participates in the fold
11542                 * containing the lowest code point, we turn that into an
11543                 * inversion list, making sure everything is included. */
11544                fold_list = add_cp_to_invlist(fold_list, lowest_cp);
11545                fold_list = add_cp_to_invlist(fold_list, folded);
11546                if (folds_to_this_cp_count > 0) {
11547                    fold_list = add_cp_to_invlist(fold_list, first_fold);
11548                    for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
11549                        fold_list = add_cp_to_invlist(fold_list,
11550                                                    remaining_folds[i]);
11551                    }
11552                }
11553
11554                /* If the fold list is identical to what's in this ANYOF node,
11555                 * the node can be represented by an EXACTFish one instead */
11556                if (_invlistEQ(*use_this_list, fold_list,
11557                               0 /* Don't complement */ )
11558                ) {
11559
11560                    /* But, we have to be careful, as mentioned above.  Just
11561                     * the right sequence of characters could match this if it
11562                     * is part of a multi-character fold.  That IS what we want
11563                     * if we are under /i.  But it ISN'T what we want if not
11564                     * under /i, as it could match when it shouldn't.  So, when
11565                     * we aren't under /i and this character participates in a
11566                     * multi-char fold, we don't optimize into an EXACTFish
11567                     * node.  So, for each case below we have to check if we
11568                     * are folding, and if not, if it is not part of a
11569                     * multi-char fold.  */
11570                    if (lowest_cp > 255) {    /* Highish code point */
11571                        if (FOLD || ! _invlist_contains_cp(
11572                                                   PL_InMultiCharFold, folded))
11573                        {
11574                            op = (LOC)
11575                                 ? EXACTFLU8
11576                                 : (ASCII_FOLD_RESTRICTED)
11577                                   ? EXACTFAA
11578                                   : EXACTFU_REQ8;
11579                            value = folded;
11580                        }
11581                    }   /* Below, the lowest code point < 256 */
11582                    else if (    FOLD
11583                             &&  folded == 's'
11584                             &&  DEPENDS_SEMANTICS)
11585                    {   /* An EXACTF node containing a single character 's',
11586                           can be an EXACTFU if it doesn't get joined with an
11587                           adjacent 's' */
11588                        op = EXACTFU_S_EDGE;
11589                        value = folded;
11590                    }
11591                    else if (     FOLD
11592                             || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
11593                    {
11594                        if (upper_latin1_only_utf8_matches) {
11595                            op = EXACTF;
11596
11597                            /* We can't use the fold, as that only matches
11598                             * under UTF-8 */
11599                            value = lowest_cp;
11600                        }
11601                        else if (     UNLIKELY(lowest_cp == MICRO_SIGN)
11602                                 && ! UTF)
11603                        {   /* EXACTFUP is a special node for this character */
11604                            op = (ASCII_FOLD_RESTRICTED)
11605                                 ? EXACTFAA
11606                                 : EXACTFUP;
11607                            value = MICRO_SIGN;
11608                        }
11609                        else if (     ASCII_FOLD_RESTRICTED
11610                                 && ! isASCII(lowest_cp))
11611                        {   /* For ASCII under /iaa, we can use EXACTFU below
11612                             */
11613                            op = EXACTFAA;
11614                            value = folded;
11615                        }
11616                        else {
11617                            op = EXACTFU;
11618                            value = folded;
11619                        }
11620                    }
11621                }
11622
11623                SvREFCNT_dec_NN(fold_list);
11624                SvREFCNT_dec(all_cp_list);
11625            }
11626        }
11627
11628        if (op != END) {
11629            U8 len;
11630
11631            /* Here, we have calculated what EXACTish node to use.  Have to
11632             * convert to UTF-8 if not already there */
11633            if (value > 255) {
11634                if (! UTF) {
11635                    SvREFCNT_dec(cp_list);;
11636                    REQUIRE_UTF8(flagp);
11637                }
11638
11639                /* This is a kludge to the special casing issues with this
11640                 * ligature under /aa.  FB05 should fold to FB06, but the call
11641                 * above to _to_uni_fold_flags() didn't find this, as it didn't
11642                 * use the /aa restriction in order to not miss other folds
11643                 * that would be affected.  This is the only instance likely to
11644                 * ever be a problem in all of Unicode.  So special case it. */
11645                if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
11646                    && ASCII_FOLD_RESTRICTED)
11647                {
11648                    value = LATIN_SMALL_LIGATURE_ST;
11649                }
11650            }
11651
11652            len = (UTF) ? UVCHR_SKIP(value) : 1;
11653
11654            *ret = REGNODE_GUTS(pRExC_state, op, len);
11655            FILL_NODE(*ret, op);
11656            RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
11657            setSTR_LEN(REGNODE_p(*ret), len);
11658            if (len == 1) {
11659                *STRINGs(REGNODE_p(*ret)) = (U8) value;
11660            }
11661            else {
11662                uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
11663            }
11664
11665            return op;
11666        }
11667    }
11668
11669    if (! has_runtime_dependency) {
11670
11671        /* See if this can be turned into an ANYOFM node.  Think about the bit
11672         * patterns in two different bytes.  In some positions, the bits in
11673         * each will be 1; and in other positions both will be 0; and in some
11674         * positions the bit will be 1 in one byte, and 0 in the other.  Let
11675         * 'n' be the number of positions where the bits differ.  We create a
11676         * mask which has exactly 'n' 0 bits, each in a position where the two
11677         * bytes differ.  Now take the set of all bytes that when ANDed with
11678         * the mask yield the same result.  That set has 2**n elements, and is
11679         * representable by just two 8 bit numbers: the result and the mask.
11680         * Importantly, matching the set can be vectorized by creating a word
11681         * full of the result bytes, and a word full of the mask bytes,
11682         * yielding a significant speed up.  Here, see if this node matches
11683         * such a set.  As a concrete example consider [01], and the byte
11684         * representing '0' which is 0x30 on ASCII machines.  It has the bits
11685         * 0011 0000.  Take the mask 1111 1110.  If we AND 0x31 and 0x30 with
11686         * that mask we get 0x30.  Any other bytes ANDed yield something else.
11687         * So [01], which is a common usage, is optimizable into ANYOFM, and
11688         * can benefit from the speed up.  We can only do this on UTF-8
11689         * invariant bytes, because they have the same bit patterns under UTF-8
11690         * as not. */
11691        PERL_UINT_FAST8_T inverted = 0;
11692
11693        /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
11694         * EBCDIC */
11695        const PERL_UINT_FAST8_T max_permissible
11696                                    = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
11697
11698        /* If doesn't fit the criteria for ANYOFM, invert and try again.  If
11699         * that works we will instead later generate an NANYOFM, and invert
11700         * back when through */
11701        if (highest_cp > max_permissible) {
11702            _invlist_invert(cp_list);
11703            inverted = 1;
11704        }
11705
11706        if (invlist_highest(cp_list) <= max_permissible) {
11707            UV this_start, this_end;
11708            UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
11709            U8 bits_differing = 0;
11710            Size_t full_cp_count = 0;
11711            bool first_time = TRUE;
11712
11713            /* Go through the bytes and find the bit positions that differ */
11714            invlist_iterinit(cp_list);
11715            while (invlist_iternext(cp_list, &this_start, &this_end)) {
11716                unsigned int i = this_start;
11717
11718                if (first_time) {
11719                    if (! UVCHR_IS_INVARIANT(i)) {
11720                        goto done_anyofm;
11721                    }
11722
11723                    first_time = FALSE;
11724                    lowest_cp = this_start;
11725
11726                    /* We have set up the code point to compare with.  Don't
11727                     * compare it with itself */
11728                    i++;
11729                }
11730
11731                /* Find the bit positions that differ from the lowest code
11732                 * point in the node.  Keep track of all such positions by
11733                 * OR'ing */
11734                for (; i <= this_end; i++) {
11735                    if (! UVCHR_IS_INVARIANT(i)) {
11736                        goto done_anyofm;
11737                    }
11738
11739                    bits_differing  |= i ^ lowest_cp;
11740                }
11741
11742                full_cp_count += this_end - this_start + 1;
11743            }
11744
11745            /* At the end of the loop, we count how many bits differ from the
11746             * bits in lowest code point, call the count 'd'.  If the set we
11747             * found contains 2**d elements, it is the closure of all code
11748             * points that differ only in those bit positions.  To convince
11749             * yourself of that, first note that the number in the closure must
11750             * be a power of 2, which we test for.  The only way we could have
11751             * that count and it be some differing set, is if we got some code
11752             * points that don't differ from the lowest code point in any
11753             * position, but do differ from each other in some other position.
11754             * That means one code point has a 1 in that position, and another
11755             * has a 0.  But that would mean that one of them differs from the
11756             * lowest code point in that position, which possibility we've
11757             * already excluded.  */
11758            if (  (inverted || full_cp_count > 1)
11759                && full_cp_count == 1U << PL_bitcount[bits_differing])
11760            {
11761                U8 ANYOFM_mask;
11762
11763                op = ANYOFM + inverted;;
11764
11765                /* We need to make the bits that differ be 0's */
11766                ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
11767
11768                /* The argument is the lowest code point */
11769                *ret = reg1node(pRExC_state, op, lowest_cp);
11770                FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
11771            }
11772
11773          done_anyofm:
11774            invlist_iterfinish(cp_list);
11775        }
11776
11777        if (inverted) {
11778            _invlist_invert(cp_list);
11779        }
11780
11781        if (op != END) {
11782            return op;
11783        }
11784
11785        /* XXX We could create an ANYOFR_LOW node here if we saved above if all
11786         * were invariants, it wasn't inverted, and there is a single range.
11787         * This would be faster than some of the posix nodes we create below
11788         * like /\d/a, but would be twice the size.  Without having actually
11789         * measured the gain, khw doesn't think the tradeoff is really worth it
11790         * */
11791    }
11792
11793    if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
11794        PERL_UINT_FAST8_T type;
11795        SV * intersection = NULL;
11796        SV* d_invlist = NULL;
11797
11798        /* See if this matches any of the POSIX classes.  The POSIXA and POSIXD
11799         * ones are about the same speed as ANYOF ops, but take less room; the
11800         * ones that have above-Latin1 code point matches are somewhat faster
11801         * than ANYOF. */
11802
11803        for (type = POSIXA; type >= POSIXD; type--) {
11804            int posix_class;
11805
11806            if (type == POSIXL) {   /* But not /l posix classes */
11807                continue;
11808            }
11809
11810            for (posix_class = 0;
11811                 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
11812                 posix_class++)
11813            {
11814                SV** our_code_points = &cp_list;
11815                SV** official_code_points;
11816                int try_inverted;
11817
11818                if (type == POSIXA) {
11819                    official_code_points = &PL_Posix_ptrs[posix_class];
11820                }
11821                else {
11822                    official_code_points = &PL_XPosix_ptrs[posix_class];
11823                }
11824
11825                /* Skip non-existent classes of this type.  e.g. \v only has an
11826                 * entry in PL_XPosix_ptrs */
11827                if (! *official_code_points) {
11828                    continue;
11829                }
11830
11831                /* Try both the regular class, and its inversion */
11832                for (try_inverted = 0; try_inverted < 2; try_inverted++) {
11833                    bool this_inverted = *invert ^ try_inverted;
11834
11835                    if (type != POSIXD) {
11836
11837                        /* This class that isn't /d can't match if we have /d
11838                         * dependencies */
11839                        if (has_runtime_dependency
11840                                                & HAS_D_RUNTIME_DEPENDENCY)
11841                        {
11842                            continue;
11843                        }
11844                    }
11845                    else /* is /d */ if (! this_inverted) {
11846
11847                        /* /d classes don't match anything non-ASCII below 256
11848                         * unconditionally (which cp_list contains) */
11849                        _invlist_intersection(cp_list, PL_UpperLatin1,
11850                                                       &intersection);
11851                        if (_invlist_len(intersection) != 0) {
11852                            continue;
11853                        }
11854
11855                        SvREFCNT_dec(d_invlist);
11856                        d_invlist = invlist_clone(cp_list, NULL);
11857
11858                        /* But under UTF-8 it turns into using /u rules.  Add
11859                         * the things it matches under these conditions so that
11860                         * we check below that these are identical to what the
11861                         * tested class should match */
11862                        if (upper_latin1_only_utf8_matches) {
11863                            _invlist_union(
11864                                        d_invlist,
11865                                        upper_latin1_only_utf8_matches,
11866                                        &d_invlist);
11867                        }
11868                        our_code_points = &d_invlist;
11869                    }
11870                    else {  /* POSIXD, inverted.  If this doesn't have this
11871                               flag set, it isn't /d. */
11872                        if (! ( *anyof_flags
11873                               & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
11874                        {
11875                            continue;
11876                        }
11877
11878                        our_code_points = &cp_list;
11879                    }
11880
11881                    /* Here, have weeded out some things.  We want to see if
11882                     * the list of characters this node contains
11883                     * ('*our_code_points') precisely matches those of the
11884                     * class we are currently checking against
11885                     * ('*official_code_points'). */
11886                    if (_invlistEQ(*our_code_points,
11887                                   *official_code_points,
11888                                   try_inverted))
11889                    {
11890                        /* Here, they precisely match.  Optimize this ANYOF
11891                         * node into its equivalent POSIX one of the correct
11892                         * type, possibly inverted.
11893                         *
11894                         * Some of these nodes match a single range of
11895                         * characters (or [:alpha:] matches two parallel ranges
11896                         * on ASCII platforms).  The array lookup at execution
11897                         * time could be replaced by a range check for such
11898                         * nodes.  But regnodes are a finite resource, and the
11899                         * possible performance boost isn't large, so this
11900                         * hasn't been done.  An attempt to use just one node
11901                         * (and its inverse) to encompass all such cases was
11902                         * made in d62feba66bf43f35d092bb026694f927e9f94d38.
11903                         * But the shifting/masking it used ended up being
11904                         * slower than the array look up, so it was reverted */
11905                        op = (try_inverted)
11906                            ? type + NPOSIXA - POSIXA
11907                            : type;
11908                        *ret = reg_node(pRExC_state, op);
11909                        FLAGS(REGNODE_p(*ret)) = posix_class;
11910                        SvREFCNT_dec(d_invlist);
11911                        SvREFCNT_dec(intersection);
11912                        return op;
11913                    }
11914                }
11915            }
11916        }
11917        SvREFCNT_dec(d_invlist);
11918        SvREFCNT_dec(intersection);
11919    }
11920
11921    /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
11922     * in size and speed.  Currently, a 20 bit range base (smallest code point
11923     * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
11924     * This allows for using it on all of the Unicode code points except for
11925     * the highest plane, which is only for private use code points.  khw
11926     * doubts that a bigger delta is likely in real world applications */
11927    if (     single_range
11928        && ! has_runtime_dependency
11929        &&   *anyof_flags == 0
11930        &&   start[0] < (1 << ANYOFR_BASE_BITS)
11931        &&   end[0] - start[0]
11932                < ((1U << (sizeof(ARG1u_LOC(NULL))
11933                               * CHARBITS - ANYOFR_BASE_BITS))))
11934
11935    {
11936        U8 low_utf8[UTF8_MAXBYTES+1];
11937        U8 high_utf8[UTF8_MAXBYTES+1];
11938
11939        op = ANYOFR;
11940        *ret = reg1node(pRExC_state, op,
11941                        (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
11942
11943        /* Place the lowest UTF-8 start byte in the flags field, so as to allow
11944         * efficient ruling out at run time of many possible inputs.  */
11945        (void) uvchr_to_utf8(low_utf8, start[0]);
11946        (void) uvchr_to_utf8(high_utf8, end[0]);
11947
11948        /* If all code points share the same first byte, this can be an
11949         * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
11950         * quickly rule out many inputs at run-time without having to compute
11951         * the code point from UTF-8.  For EBCDIC, we use I8, as not doing that
11952         * transformation would not rule out nearly so many things */
11953        if (low_utf8[0] == high_utf8[0]) {
11954            op = ANYOFRb;
11955            OP(REGNODE_p(*ret)) = op;
11956            ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
11957        }
11958        else {
11959            ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
11960        }
11961
11962        return op;
11963    }
11964
11965    /* If didn't find an optimization and there is no need for a bitmap,
11966     * of the lowest code points, optimize to indicate that */
11967    if (     lowest_cp >= NUM_ANYOF_CODE_POINTS
11968        && ! LOC
11969        && ! upper_latin1_only_utf8_matches
11970        &&   *anyof_flags == 0)
11971    {
11972        U8 low_utf8[UTF8_MAXBYTES+1];
11973        UV highest_cp = invlist_highest(cp_list);
11974
11975        /* Currently the maximum allowed code point by the system is IV_MAX.
11976         * Higher ones are reserved for future internal use.  This particular
11977         * regnode can be used for higher ones, but we can't calculate the code
11978         * point of those.  IV_MAX suffices though, as it will be a large first
11979         * byte */
11980        Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
11981                       - low_utf8;
11982
11983        /* We store the lowest possible first byte of the UTF-8 representation,
11984         * using the flags field.  This allows for quick ruling out of some
11985         * inputs without having to convert from UTF-8 to code point.  For
11986         * EBCDIC, we use I8, as not doing that transformation would not rule
11987         * out nearly so many things */
11988        *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
11989
11990        op = ANYOFH;
11991
11992        /* If the first UTF-8 start byte for the highest code point in the
11993         * range is suitably small, we may be able to get an upper bound as
11994         * well */
11995        if (highest_cp <= IV_MAX) {
11996            U8 high_utf8[UTF8_MAXBYTES+1];
11997            Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
11998
11999            /* If the lowest and highest are the same, we can get an exact
12000             * first byte instead of a just minimum or even a sequence of exact
12001             * leading bytes.  We signal these with different regnodes */
12002            if (low_utf8[0] == high_utf8[0]) {
12003                Size_t len = find_first_differing_byte_pos(low_utf8,
12004                                                           high_utf8,
12005                                                   MIN(low_len, high_len));
12006                if (len == 1) {
12007
12008                    /* No need to convert to I8 for EBCDIC as this is an exact
12009                     * match */
12010                    *anyof_flags = low_utf8[0];
12011
12012                    if (high_len == 2) {
12013                        /* If the elements matched all have a 2-byte UTF-8
12014                         * representation, with the first byte being the same,
12015                         * we can use a compact, fast regnode. capable of
12016                         * matching any combination of continuation byte
12017                         * patterns.
12018                         *
12019                         * (A similar regnode could be created for the Latin1
12020                         * range; the complication being that it could match
12021                         * non-UTF8 targets.  The internal bitmap would serve
12022                         * both cases; with some extra code in regexec.c) */
12023                        op = ANYOFHbbm;
12024                        *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12025                        FILL_NODE(*ret, op);
12026                        FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0],
12027
12028                        /* The 64 bit (or 32 on EBCCDIC) map can be looked up
12029                         * directly based on the continuation byte, without
12030                         * needing to convert to code point */
12031                        populate_bitmap_from_invlist(
12032                            cp_list,
12033
12034                            /* The base code point is from the start byte */
12035                            TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
12036                                                    UTF_CONTINUATION_MARK | 0),
12037
12038                            ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
12039                            REGNODE_BBM_BITMAP_LEN);
12040                        RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
12041                        return op;
12042                    }
12043                    else {
12044                        op = ANYOFHb;
12045                    }
12046                }
12047                else {
12048                    op = ANYOFHs;
12049                    *ret = REGNODE_GUTS(pRExC_state, op,
12050                                       REGNODE_ARG_LEN(op) + STR_SZ(len));
12051                    FILL_NODE(*ret, op);
12052                    STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret))
12053                                                                    = len;
12054                    Copy(low_utf8,  /* Add the common bytes */
12055                    ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
12056                       len, U8);
12057                    RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
12058                    set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
12059                                              NULL, only_utf8_locale_list);
12060                    return op;
12061                }
12062            }
12063            else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
12064
12065                /* Here, the high byte is not the same as the low, but is small
12066                 * enough that its reasonable to have a loose upper bound,
12067                 * which is packed in with the strict lower bound.  See
12068                 * comments at the definition of MAX_ANYOF_HRx_BYTE.  On EBCDIC
12069                 * platforms, I8 is used.  On ASCII platforms I8 is the same
12070                 * thing as UTF-8 */
12071
12072                U8 bits = 0;
12073                U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
12074                U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
12075                            - *anyof_flags;
12076
12077                if (range_diff <= max_range_diff / 8) {
12078                    bits = 3;
12079                }
12080                else if (range_diff <= max_range_diff / 4) {
12081                    bits = 2;
12082                }
12083                else if (range_diff <= max_range_diff / 2) {
12084                    bits = 1;
12085                }
12086                *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
12087                op = ANYOFHr;
12088            }
12089        }
12090    }
12091
12092    return op;
12093
12094  return_OPFAIL:
12095    op = OPFAIL;
12096    *ret = reg1node(pRExC_state, op, 0);
12097    return op;
12098
12099  return_SANY:
12100    op = SANY;
12101    *ret = reg_node(pRExC_state, op);
12102    MARK_NAUGHTY(1);
12103    return op;
12104}
12105
12106#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12107
12108void
12109Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
12110                regnode* const node,
12111                SV* const cp_list,
12112                SV* const runtime_defns,
12113                SV* const only_utf8_locale_list)
12114{
12115    /* Sets the arg field of an ANYOF-type node 'node', using information about
12116     * the node passed-in.  If only the bitmap is needed to determine what
12117     * matches, the arg is set appropriately to either
12118     *      1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
12119     *      2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
12120     *
12121     * Otherwise, it sets the argument to the count returned by reg_add_data(),
12122     * having allocated and stored an array, av, as follows:
12123     *  av[0] stores the inversion list defining this class as far as known at
12124     *        this time, or PL_sv_undef if nothing definite is now known.
12125     *  av[1] stores the inversion list of code points that match only if the
12126     *        current locale is UTF-8, or if none, PL_sv_undef if there is an
12127     *        av[2], or no entry otherwise.
12128     *  av[2] stores the list of user-defined properties whose subroutine
12129     *        definitions aren't known at this time, or no entry if none. */
12130
12131    UV n;
12132
12133    PERL_ARGS_ASSERT_SET_ANYOF_ARG;
12134
12135    /* If this is set, the final disposition won't be known until runtime, so
12136     * we can't do any of the compile time optimizations */
12137    if (! runtime_defns) {
12138
12139        /* On plain ANYOF nodes without the possibility of a runtime locale
12140         * making a difference, maybe there's no information to be gleaned
12141         * except for what's in the bitmap */
12142        if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
12143
12144            /* There are two such cases:
12145             *  1)  there is no list of code points matched outside the bitmap
12146             */
12147            if (! cp_list) {
12148                ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
12149                return;
12150            }
12151
12152            /*  2)  the list indicates everything outside the bitmap matches */
12153            if (   invlist_highest(cp_list) == UV_MAX
12154                && invlist_highest_range_start(cp_list)
12155                                                       <= NUM_ANYOF_CODE_POINTS)
12156            {
12157                ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
12158                return;
12159            }
12160
12161            /* In all other cases there are things outside the bitmap that we
12162             * may need to check at runtime. */
12163        }
12164
12165        /* Here, we have resolved all the possible run-time matches, and they
12166         * are stored in one or both of two possible lists.  (While some match
12167         * only under certain runtime circumstances, we know all the possible
12168         * ones for each such circumstance.)
12169         *
12170         * It may very well be that the pattern being compiled contains an
12171         * identical class, already encountered.  Reusing that class here saves
12172         * space.  Look through all classes so far encountered. */
12173        U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
12174        for (unsigned int i = 0; i < existing_items; i++) {
12175
12176            /* Only look at auxiliary data of this type */
12177            if (RExC_rxi->data->what[i] != 's') {
12178                continue;
12179            }
12180
12181            SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
12182            AV * const av = MUTABLE_AV(SvRV(rv));
12183
12184            /* If the already encountered class has data that won't be known
12185             * until runtime (stored in the final element of the array), we
12186             * can't share */
12187            if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
12188                continue;
12189            }
12190
12191            SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
12192                                                false /* no lvalue */);
12193
12194            /* The new and the existing one both have to have or both not
12195             * have this element, for this one to duplicate that one */
12196            if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
12197                continue;
12198            }
12199
12200            /* If the inversion lists aren't equivalent, can't share */
12201            if (cp_list && ! _invlistEQ(cp_list,
12202                                        *stored_cp_list_ptr,
12203                                        FALSE /* don't complement */))
12204            {
12205                continue;
12206            }
12207
12208            /* Similarly for the other list */
12209            SV ** stored_only_utf8_locale_list_ptr = av_fetch(
12210                                                av,
12211                                                ONLY_LOCALE_MATCHES_INDEX,
12212                                                false /* no lvalue */);
12213            if (   cBOOL(only_utf8_locale_list)
12214                != cBOOL(stored_only_utf8_locale_list_ptr))
12215            {
12216                continue;
12217            }
12218
12219            if (only_utf8_locale_list && ! _invlistEQ(
12220                                         only_utf8_locale_list,
12221                                         *stored_only_utf8_locale_list_ptr,
12222                                         FALSE /* don't complement */))
12223            {
12224                continue;
12225            }
12226
12227            /* Here, the existence and contents of both compile-time lists
12228             * are identical between the new and existing data.  Re-use the
12229             * existing one */
12230            ARG1u_SET(node, i);
12231            return;
12232        } /* end of loop through existing classes */
12233    }
12234
12235    /* Here, we need to create a new auxiliary data element; either because
12236     * this doesn't duplicate an existing one, or we can't tell at this time if
12237     * it eventually will */
12238
12239    AV * const av = newAV();
12240    SV *rv;
12241
12242    if (cp_list) {
12243        av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
12244    }
12245
12246    /* (Note that if any of this changes, the size calculations in
12247     * S_optimize_regclass() might need to be updated.) */
12248
12249    if (only_utf8_locale_list) {
12250        av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
12251                                       SvREFCNT_inc_NN(only_utf8_locale_list));
12252    }
12253
12254    if (runtime_defns) {
12255        av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
12256                     SvREFCNT_inc_NN(runtime_defns));
12257    }
12258
12259    rv = newRV_noinc(MUTABLE_SV(av));
12260    n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
12261    RExC_rxi->data->data[n] = (void*)rv;
12262    ARG1u_SET(node, n);
12263}
12264
12265SV *
12266
12267#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12268Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12269#else
12270Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12271#endif
12272
12273{
12274    /* For internal core use only.
12275     * Returns the inversion list for the input 'node' in the regex 'prog'.
12276     * If <doinit> is 'true', will attempt to create the inversion list if not
12277     *    already done.  If it is created, it will add to the normal inversion
12278     *    list any that comes from user-defined properties.  It croaks if this
12279     *    is called before such a list is ready to be generated, that is when a
12280     *    user-defined property has been declared, buyt still not yet defined.
12281     * If <listsvp> is non-null, will return the printable contents of the
12282     *    property definition.  This can be used to get debugging information
12283     *    even before the inversion list exists, by calling this function with
12284     *    'doinit' set to false, in which case the components that will be used
12285     *    to eventually create the inversion list are returned  (in a printable
12286     *    form).
12287     * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
12288     *    store an inversion list of code points that should match only if the
12289     *    execution-time locale is a UTF-8 one.
12290     * If <output_invlist> is not NULL, it is where this routine is to store an
12291     *    inversion list of the code points that would be instead returned in
12292     *    <listsvp> if this were NULL.  Thus, what gets output in <listsvp>
12293     *    when this parameter is used, is just the non-code point data that
12294     *    will go into creating the inversion list.  This currently should be just
12295     *    user-defined properties whose definitions were not known at compile
12296     *    time.  Using this parameter allows for easier manipulation of the
12297     *    inversion list's data by the caller.  It is illegal to call this
12298     *    function with this parameter set, but not <listsvp>
12299     *
12300     * Tied intimately to how S_set_ANYOF_arg sets up the data structure.  Note
12301     * that, in spite of this function's name, the inversion list it returns
12302     * may include the bitmap data as well */
12303
12304    SV *si  = NULL;         /* Input initialization string */
12305    SV* invlist = NULL;
12306
12307    RXi_GET_DECL_NULL(prog, progi);
12308    const struct reg_data * const data = prog ? progi->data : NULL;
12309
12310#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12311    PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
12312#else
12313    PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
12314#endif
12315    assert(! output_invlist || listsvp);
12316
12317    if (data && data->count) {
12318        const U32 n = ARG1u(node);
12319
12320        if (data->what[n] == 's') {
12321            SV * const rv = MUTABLE_SV(data->data[n]);
12322            AV * const av = MUTABLE_AV(SvRV(rv));
12323            SV **const ary = AvARRAY(av);
12324
12325            invlist = ary[INVLIST_INDEX];
12326
12327            if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
12328                *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
12329            }
12330
12331            if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
12332                si = ary[DEFERRED_USER_DEFINED_INDEX];
12333            }
12334
12335            if (doinit && (si || invlist)) {
12336                if (si) {
12337                    bool user_defined;
12338                    SV * msg = newSVpvs_flags("", SVs_TEMP);
12339
12340                    SV * prop_definition = handle_user_defined_property(
12341                            "", 0, FALSE,   /* There is no \p{}, \P{} */
12342                            SvPVX_const(si)[1] - '0',   /* /i or not has been
12343                                                           stored here for just
12344                                                           this occasion */
12345                            TRUE,           /* run time */
12346                            FALSE,          /* This call must find the defn */
12347                            si,             /* The property definition  */
12348                            &user_defined,
12349                            msg,
12350                            0               /* base level call */
12351                           );
12352
12353                    if (SvCUR(msg)) {
12354                        assert(prop_definition == NULL);
12355
12356                        Perl_croak(aTHX_ "%" UTF8f,
12357                                UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
12358                    }
12359
12360                    if (invlist) {
12361                        _invlist_union(invlist, prop_definition, &invlist);
12362                        SvREFCNT_dec_NN(prop_definition);
12363                    }
12364                    else {
12365                        invlist = prop_definition;
12366                    }
12367
12368                    STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
12369                    STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
12370
12371                    ary[INVLIST_INDEX] = invlist;
12372                    av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
12373                                 ? ONLY_LOCALE_MATCHES_INDEX
12374                                 : INVLIST_INDEX);
12375                    si = NULL;
12376                }
12377            }
12378        }
12379    }
12380
12381    /* If requested, return a printable version of what this ANYOF node matches
12382     * */
12383    if (listsvp) {
12384        SV* matches_string = NULL;
12385
12386        /* This function can be called at compile-time, before everything gets
12387         * resolved, in which case we return the currently best available
12388         * information, which is the string that will eventually be used to do
12389         * that resolving, 'si' */
12390        if (si) {
12391            /* Here, we only have 'si' (and possibly some passed-in data in
12392             * 'invlist', which is handled below)  If the caller only wants
12393             * 'si', use that.  */
12394            if (! output_invlist) {
12395                matches_string = newSVsv(si);
12396            }
12397            else {
12398                /* But if the caller wants an inversion list of the node, we
12399                 * need to parse 'si' and place as much as possible in the
12400                 * desired output inversion list, making 'matches_string' only
12401                 * contain the currently unresolvable things */
12402                const char *si_string = SvPVX(si);
12403                STRLEN remaining = SvCUR(si);
12404                UV prev_cp = 0;
12405                U8 count = 0;
12406
12407                /* Ignore everything before and including the first new-line */
12408                si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
12409                assert (si_string != NULL);
12410                si_string++;
12411                remaining = SvPVX(si) + SvCUR(si) - si_string;
12412
12413                while (remaining > 0) {
12414
12415                    /* The data consists of just strings defining user-defined
12416                     * property names, but in prior incarnations, and perhaps
12417                     * somehow from pluggable regex engines, it could still
12418                     * hold hex code point definitions, all of which should be
12419                     * legal (or it wouldn't have gotten this far).  Each
12420                     * component of a range would be separated by a tab, and
12421                     * each range by a new-line.  If these are found, instead
12422                     * add them to the inversion list */
12423                    I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
12424                                     |PERL_SCAN_SILENT_NON_PORTABLE;
12425                    STRLEN len = remaining;
12426                    UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
12427
12428                    /* If the hex decode routine found something, it should go
12429                     * up to the next \n */
12430                    if (   *(si_string + len) == '\n') {
12431                        if (count) {    /* 2nd code point on line */
12432                            *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
12433                        }
12434                        else {
12435                            *output_invlist = add_cp_to_invlist(*output_invlist, cp);
12436                        }
12437                        count = 0;
12438                        goto prepare_for_next_iteration;
12439                    }
12440
12441                    /* If the hex decode was instead for the lower range limit,
12442                     * save it, and go parse the upper range limit */
12443                    if (*(si_string + len) == '\t') {
12444                        assert(count == 0);
12445
12446                        prev_cp = cp;
12447                        count = 1;
12448                      prepare_for_next_iteration:
12449                        si_string += len + 1;
12450                        remaining -= len + 1;
12451                        continue;
12452                    }
12453
12454                    /* Here, didn't find a legal hex number.  Just add the text
12455                     * from here up to the next \n, omitting any trailing
12456                     * markers. */
12457
12458                    remaining -= len;
12459                    len = strcspn(si_string,
12460                                        DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
12461                    remaining -= len;
12462                    if (matches_string) {
12463                        sv_catpvn(matches_string, si_string, len);
12464                    }
12465                    else {
12466                        matches_string = newSVpvn(si_string, len);
12467                    }
12468                    sv_catpvs(matches_string, " ");
12469
12470                    si_string += len;
12471                    if (   remaining
12472                        && UCHARAT(si_string)
12473                                            == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
12474                    {
12475                        si_string++;
12476                        remaining--;
12477                    }
12478                    if (remaining && UCHARAT(si_string) == '\n') {
12479                        si_string++;
12480                        remaining--;
12481                    }
12482                } /* end of loop through the text */
12483
12484                assert(matches_string);
12485                if (SvCUR(matches_string)) {  /* Get rid of trailing blank */
12486                    SvCUR_set(matches_string, SvCUR(matches_string) - 1);
12487                }
12488            } /* end of has an 'si' */
12489        }
12490
12491        /* Add the stuff that's already known */
12492        if (invlist) {
12493
12494            /* Again, if the caller doesn't want the output inversion list, put
12495             * everything in 'matches-string' */
12496            if (! output_invlist) {
12497                if ( ! matches_string) {
12498                    matches_string = newSVpvs("\n");
12499                }
12500                sv_catsv(matches_string, invlist_contents(invlist,
12501                                                  TRUE /* traditional style */
12502                                                  ));
12503            }
12504            else if (! *output_invlist) {
12505                *output_invlist = invlist_clone(invlist, NULL);
12506            }
12507            else {
12508                _invlist_union(*output_invlist, invlist, output_invlist);
12509            }
12510        }
12511
12512        *listsvp = matches_string;
12513    }
12514
12515    return invlist;
12516}
12517
12518/* reg_skipcomment()
12519
12520   Absorbs an /x style # comment from the input stream,
12521   returning a pointer to the first character beyond the comment, or if the
12522   comment terminates the pattern without anything following it, this returns
12523   one past the final character of the pattern (in other words, RExC_end) and
12524   sets the REG_RUN_ON_COMMENT_SEEN flag.
12525
12526   Note it's the callers responsibility to ensure that we are
12527   actually in /x mode
12528
12529*/
12530
12531PERL_STATIC_INLINE char*
12532S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
12533{
12534    PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12535
12536    assert(*p == '#');
12537
12538    while (p < RExC_end) {
12539        if (*(++p) == '\n') {
12540            return p+1;
12541        }
12542    }
12543
12544    /* we ran off the end of the pattern without ending the comment, so we have
12545     * to add an \n when wrapping */
12546    RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12547    return p;
12548}
12549
12550STATIC void
12551S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
12552                                char ** p,
12553                                const bool force_to_xmod
12554                         )
12555{
12556    /* If the text at the current parse position '*p' is a '(?#...)' comment,
12557     * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
12558     * is /x whitespace, advance '*p' so that on exit it points to the first
12559     * byte past all such white space and comments */
12560
12561    const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
12562
12563    PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
12564
12565    assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
12566
12567    for (;;) {
12568        if (RExC_end - (*p) >= 3
12569            && *(*p)     == '('
12570            && *(*p + 1) == '?'
12571            && *(*p + 2) == '#')
12572        {
12573            while (*(*p) != ')') {
12574                if ((*p) == RExC_end)
12575                    FAIL("Sequence (?#... not terminated");
12576                (*p)++;
12577            }
12578            (*p)++;
12579            continue;
12580        }
12581
12582        if (use_xmod) {
12583            const char * save_p = *p;
12584            while ((*p) < RExC_end) {
12585                STRLEN len;
12586                if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
12587                    (*p) += len;
12588                }
12589                else if (*(*p) == '#') {
12590                    (*p) = reg_skipcomment(pRExC_state, (*p));
12591                }
12592                else {
12593                    break;
12594                }
12595            }
12596            if (*p != save_p) {
12597                continue;
12598            }
12599        }
12600
12601        break;
12602    }
12603
12604    return;
12605}
12606
12607/* nextchar()
12608
12609   Advances the parse position by one byte, unless that byte is the beginning
12610   of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
12611   those two cases, the parse position is advanced beyond all such comments and
12612   white space.
12613
12614   This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
12615*/
12616
12617STATIC void
12618S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12619{
12620    PERL_ARGS_ASSERT_NEXTCHAR;
12621
12622    if (RExC_parse < RExC_end) {
12623        assert(   ! UTF
12624               || UTF8_IS_INVARIANT(*RExC_parse)
12625               || UTF8_IS_START(*RExC_parse));
12626
12627        RExC_parse_inc_safe();
12628
12629        skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12630                                FALSE /* Don't force /x */ );
12631    }
12632}
12633
12634STATIC void
12635S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
12636{
12637    /* 'size' is the delta number of smallest regnode equivalents to add or
12638     * subtract from the current memory allocated to the regex engine being
12639     * constructed. */
12640
12641    PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
12642
12643    RExC_size += size;
12644
12645    Renewc(RExC_rxi,
12646           sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
12647                                                /* +1 for REG_MAGIC */
12648           char,
12649           regexp_internal);
12650    if ( RExC_rxi == NULL )
12651        FAIL("Regexp out of space");
12652    RXi_SET(RExC_rx, RExC_rxi);
12653
12654    RExC_emit_start = RExC_rxi->program;
12655    if (size > 0) {
12656        Zero(REGNODE_p(RExC_emit), size, regnode);
12657    }
12658}
12659
12660STATIC regnode_offset
12661S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
12662{
12663    /* Allocate a regnode that is (1 + extra_size) times as big as the
12664     * smallest regnode worth of space, and also aligns and increments
12665     * RExC_size appropriately.
12666     *
12667     * It returns the regnode's offset into the regex engine program */
12668
12669    const regnode_offset ret = RExC_emit;
12670
12671    PERL_ARGS_ASSERT_REGNODE_GUTS;
12672
12673    SIZE_ALIGN(RExC_size);
12674    change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
12675    NODE_ALIGN_FILL(REGNODE_p(ret));
12676    return(ret);
12677}
12678
12679#ifdef DEBUGGING
12680
12681STATIC regnode_offset
12682S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
12683    PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
12684    assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
12685    return S_regnode_guts(aTHX_ pRExC_state, extra_size);
12686}
12687
12688#endif
12689
12690
12691
12692/*
12693- reg_node - emit a node
12694*/
12695STATIC regnode_offset /* Location. */
12696S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12697{
12698    const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12699    regnode_offset ptr = ret;
12700
12701    PERL_ARGS_ASSERT_REG_NODE;
12702
12703    assert(REGNODE_ARG_LEN(op) == 0);
12704
12705    FILL_ADVANCE_NODE(ptr, op);
12706    RExC_emit = ptr;
12707    return(ret);
12708}
12709
12710/*
12711- reg1node - emit a node with an argument
12712*/
12713STATIC regnode_offset /* Location. */
12714S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12715{
12716    const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12717    regnode_offset ptr = ret;
12718
12719    PERL_ARGS_ASSERT_REG1NODE;
12720
12721    /* ANYOF are special cased to allow non-length 1 args */
12722    assert(REGNODE_ARG_LEN(op) == 1);
12723
12724    FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
12725    RExC_emit = ptr;
12726    return(ret);
12727}
12728
12729/*
12730- regpnode - emit a temporary node with a SV* argument
12731*/
12732STATIC regnode_offset /* Location. */
12733S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
12734{
12735    const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12736    regnode_offset ptr = ret;
12737
12738    PERL_ARGS_ASSERT_REGPNODE;
12739
12740    FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
12741    RExC_emit = ptr;
12742    return(ret);
12743}
12744
12745STATIC regnode_offset
12746S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
12747{
12748    /* emit a node with U32 and I32 arguments */
12749
12750    const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12751    regnode_offset ptr = ret;
12752
12753    PERL_ARGS_ASSERT_REG2NODE;
12754
12755    assert(REGNODE_ARG_LEN(op) == 2);
12756
12757    FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
12758    RExC_emit = ptr;
12759    return(ret);
12760}
12761
12762/*
12763- reginsert - insert an operator in front of already-emitted operand
12764*
12765* That means that on exit 'operand' is the offset of the newly inserted
12766* operator, and the original operand has been relocated.
12767*
12768* IMPORTANT NOTE - it is the *callers* responsibility to correctly
12769* set up NEXT_OFF() of the inserted node if needed. Something like this:
12770*
12771*   reginsert(pRExC, OPFAIL, orig_emit, depth+1);
12772*   NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
12773*
12774* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
12775*/
12776STATIC void
12777S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
12778                  const regnode_offset operand, const U32 depth)
12779{
12780    regnode *src;
12781    regnode *dst;
12782    regnode *place;
12783    const int offset = REGNODE_ARG_LEN((U8)op);
12784    const int size = NODE_STEP_REGNODE + offset;
12785    DECLARE_AND_GET_RE_DEBUG_FLAGS;
12786
12787    PERL_ARGS_ASSERT_REGINSERT;
12788    PERL_UNUSED_CONTEXT;
12789    PERL_UNUSED_ARG(depth);
12790    DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
12791    assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
12792                                    studying. If this is wrong then we need to adjust RExC_recurse
12793                                    below like we do with RExC_open_parens/RExC_close_parens. */
12794    change_engine_size(pRExC_state, (Ptrdiff_t) size);
12795    src = REGNODE_p(RExC_emit);
12796    RExC_emit += size;
12797    dst = REGNODE_p(RExC_emit);
12798
12799    /* If we are in a "count the parentheses" pass, the numbers are unreliable,
12800     * and [perl #133871] shows this can lead to problems, so skip this
12801     * realignment of parens until a later pass when they are reliable */
12802    if (! IN_PARENS_PASS && RExC_open_parens) {
12803        int paren;
12804        /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
12805        /* remember that RExC_npar is rex->nparens + 1,
12806         * iow it is 1 more than the number of parens seen in
12807         * the pattern so far. */
12808        for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12809            /* note, RExC_open_parens[0] is the start of the
12810             * regex, it can't move. RExC_close_parens[0] is the end
12811             * of the regex, it *can* move. */
12812            if ( paren && RExC_open_parens[paren] >= operand ) {
12813                /*DEBUG_PARSE_FMT("open"," - %d", size);*/
12814                RExC_open_parens[paren] += size;
12815            } else {
12816                /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12817            }
12818            if ( RExC_close_parens[paren] >= operand ) {
12819                /*DEBUG_PARSE_FMT("close"," - %d", size);*/
12820                RExC_close_parens[paren] += size;
12821            } else {
12822                /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12823            }
12824        }
12825    }
12826    if (RExC_end_op)
12827        RExC_end_op += size;
12828
12829    while (src > REGNODE_p(operand)) {
12830        StructCopy(--src, --dst, regnode);
12831    }
12832
12833    place = REGNODE_p(operand);	/* Op node, where operand used to be. */
12834    src = place + 1; /* NOT REGNODE_AFTER! */
12835    FLAGS(place) = 0;
12836    FILL_NODE(operand, op);
12837
12838    /* Zero out any arguments in the new node */
12839    Zero(src, offset, regnode);
12840}
12841
12842/*
12843- regtail - set the next-pointer at the end of a node chain of p to val.  If
12844            that value won't fit in the space available, instead returns FALSE.
12845            (Except asserts if we can't fit in the largest space the regex
12846            engine is designed for.)
12847- SEE ALSO: regtail_study
12848*/
12849STATIC bool
12850S_regtail(pTHX_ RExC_state_t * pRExC_state,
12851                const regnode_offset p,
12852                const regnode_offset val,
12853                const U32 depth)
12854{
12855    regnode_offset scan;
12856    DECLARE_AND_GET_RE_DEBUG_FLAGS;
12857
12858    PERL_ARGS_ASSERT_REGTAIL;
12859#ifndef DEBUGGING
12860    PERL_UNUSED_ARG(depth);
12861#endif
12862
12863    /* The final node in the chain is the first one with a nonzero next pointer
12864     * */
12865    scan = (regnode_offset) p;
12866    for (;;) {
12867        regnode * const temp = regnext(REGNODE_p(scan));
12868        DEBUG_PARSE_r({
12869            DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12870            regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12871            Perl_re_printf( aTHX_  "~ %s (%zu) %s %s\n",
12872                SvPV_nolen_const(RExC_mysv), scan,
12873                    (temp == NULL ? "->" : ""),
12874                    (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
12875            );
12876        });
12877        if (temp == NULL)
12878            break;
12879        scan = REGNODE_OFFSET(temp);
12880    }
12881
12882    /* Populate this node's next pointer */
12883    assert(val >= scan);
12884    if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12885        assert((UV) (val - scan) <= U32_MAX);
12886        ARG1u_SET(REGNODE_p(scan), val - scan);
12887    }
12888    else {
12889        if (val - scan > U16_MAX) {
12890            /* Populate this with something that won't loop and will likely
12891             * lead to a crash if the caller ignores the failure return, and
12892             * execution continues */
12893            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12894            return FALSE;
12895        }
12896        NEXT_OFF(REGNODE_p(scan)) = val - scan;
12897    }
12898
12899    return TRUE;
12900}
12901
12902#ifdef DEBUGGING
12903/*
12904- regtail_study - set the next-pointer at the end of a node chain of p to val.
12905- Look for optimizable sequences at the same time.
12906- currently only looks for EXACT chains.
12907
12908This is experimental code. The idea is to use this routine to perform
12909in place optimizations on branches and groups as they are constructed,
12910with the long term intention of removing optimization from study_chunk so
12911that it is purely analytical.
12912
12913Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12914to control which is which.
12915
12916This used to return a value that was ignored.  It was a problem that it is
12917#ifdef'd to be another function that didn't return a value.  khw has changed it
12918so both currently return a pass/fail return.
12919
12920*/
12921/* TODO: All four parms should be const */
12922
12923STATIC bool
12924S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
12925                      const regnode_offset val, U32 depth)
12926{
12927    regnode_offset scan;
12928    U8 exact = PSEUDO;
12929#ifdef EXPERIMENTAL_INPLACESCAN
12930    I32 min = 0;
12931#endif
12932    DECLARE_AND_GET_RE_DEBUG_FLAGS;
12933
12934    PERL_ARGS_ASSERT_REGTAIL_STUDY;
12935
12936
12937    /* Find last node. */
12938
12939    scan = p;
12940    for (;;) {
12941        regnode * const temp = regnext(REGNODE_p(scan));
12942#ifdef EXPERIMENTAL_INPLACESCAN
12943        if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12944            bool unfolded_multi_char;	/* Unexamined in this routine */
12945            if (join_exact(pRExC_state, scan, &min,
12946                           &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
12947                return TRUE; /* Was return EXACT */
12948        }
12949#endif
12950        if ( exact ) {
12951            if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12952                if (exact == PSEUDO )
12953                    exact= OP(REGNODE_p(scan));
12954                else if (exact != OP(REGNODE_p(scan)) )
12955                    exact= 0;
12956            }
12957            else if (OP(REGNODE_p(scan)) != NOTHING) {
12958                exact= 0;
12959            }
12960        }
12961        DEBUG_PARSE_r({
12962            DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12963            regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12964            Perl_re_printf( aTHX_  "~ %s (%zu) -> %s\n",
12965                SvPV_nolen_const(RExC_mysv),
12966                scan,
12967                REGNODE_NAME(exact));
12968        });
12969        if (temp == NULL)
12970            break;
12971        scan = REGNODE_OFFSET(temp);
12972    }
12973    DEBUG_PARSE_r({
12974        DEBUG_PARSE_MSG("");
12975        regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
12976        Perl_re_printf( aTHX_
12977                      "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
12978                      SvPV_nolen_const(RExC_mysv),
12979                      (IV)val,
12980                      (IV)(val - scan)
12981        );
12982    });
12983    if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12984        assert((UV) (val - scan) <= U32_MAX);
12985        ARG1u_SET(REGNODE_p(scan), val - scan);
12986    }
12987    else {
12988        if (val - scan > U16_MAX) {
12989            /* Populate this with something that won't loop and will likely
12990             * lead to a crash if the caller ignores the failure return, and
12991             * execution continues */
12992            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12993            return FALSE;
12994        }
12995        NEXT_OFF(REGNODE_p(scan)) = val - scan;
12996    }
12997
12998    return TRUE; /* Was 'return exact' */
12999}
13000#endif
13001
13002SV*
13003Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
13004
13005    /* Returns an inversion list of all the code points matched by the
13006     * ANYOFM/NANYOFM node 'n' */
13007
13008    SV * cp_list = _new_invlist(-1);
13009    const U8 lowest = (U8) ARG1u(n);
13010    unsigned int i;
13011    U8 count = 0;
13012    U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
13013
13014    PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
13015
13016    /* Starting with the lowest code point, any code point that ANDed with the
13017     * mask yields the lowest code point is in the set */
13018    for (i = lowest; i <= 0xFF; i++) {
13019        if ((i & FLAGS(n)) == ARG1u(n)) {
13020            cp_list = add_cp_to_invlist(cp_list, i);
13021            count++;
13022
13023            /* We know how many code points (a power of two) that are in the
13024             * set.  No use looking once we've got that number */
13025            if (count >= needed) break;
13026        }
13027    }
13028
13029    if (OP(n) == NANYOFM) {
13030        _invlist_invert(cp_list);
13031    }
13032    return cp_list;
13033}
13034
13035SV *
13036Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
13037    PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
13038
13039    SV * cp_list = NULL;
13040    populate_invlist_from_bitmap(
13041              ((struct regnode_bbm *) n)->bitmap,
13042              REGNODE_BBM_BITMAP_LEN * CHARBITS,
13043              &cp_list,
13044
13045              /* The base cp is from the start byte plus a zero continuation */
13046              TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
13047                                      UTF_CONTINUATION_MARK | 0));
13048    return cp_list;
13049}
13050
13051
13052
13053SV *
13054Perl_re_intuit_string(pTHX_ REGEXP * const r)
13055{				/* Assume that RE_INTUIT is set */
13056    /* Returns an SV containing a string that must appear in the target for it
13057     * to match, or NULL if nothing is known that must match.
13058     *
13059     * CAUTION: the SV can be freed during execution of the regex engine */
13060
13061    struct regexp *const prog = ReANY(r);
13062    DECLARE_AND_GET_RE_DEBUG_FLAGS;
13063
13064    PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13065    PERL_UNUSED_CONTEXT;
13066
13067    DEBUG_COMPILE_r(
13068        {
13069            if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
13070                const char * const s = SvPV_nolen_const(RX_UTF8(r)
13071                      ? prog->check_utf8 : prog->check_substr);
13072
13073                if (!PL_colorset) reginitcolors();
13074                Perl_re_printf( aTHX_
13075                      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13076                      PL_colors[4],
13077                      RX_UTF8(r) ? "utf8 " : "",
13078                      PL_colors[5], PL_colors[0],
13079                      s,
13080                      PL_colors[1],
13081                      (strlen(s) > PL_dump_re_max_len ? "..." : ""));
13082            }
13083        } );
13084
13085    /* use UTF8 check substring if regexp pattern itself is in UTF8 */
13086    return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
13087}
13088
13089/*
13090   pregfree()
13091
13092   handles refcounting and freeing the perl core regexp structure. When
13093   it is necessary to actually free the structure the first thing it
13094   does is call the 'free' method of the regexp_engine associated to
13095   the regexp, allowing the handling of the void *pprivate; member
13096   first. (This routine is not overridable by extensions, which is why
13097   the extensions free is called first.)
13098
13099   See regdupe and regdupe_internal if you change anything here.
13100*/
13101#ifndef PERL_IN_XSUB_RE
13102void
13103Perl_pregfree(pTHX_ REGEXP *r)
13104{
13105    SvREFCNT_dec(r);
13106}
13107
13108void
13109Perl_pregfree2(pTHX_ REGEXP *rx)
13110{
13111    struct regexp *const r = ReANY(rx);
13112    DECLARE_AND_GET_RE_DEBUG_FLAGS;
13113
13114    PERL_ARGS_ASSERT_PREGFREE2;
13115
13116    if (! r)
13117        return;
13118
13119    if (r->mother_re) {
13120        ReREFCNT_dec(r->mother_re);
13121    } else {
13122        CALLREGFREE_PVT(rx); /* free the private data */
13123        SvREFCNT_dec(RXp_PAREN_NAMES(r));
13124    }
13125    if (r->substrs) {
13126        int i;
13127        for (i = 0; i < 2; i++) {
13128            SvREFCNT_dec(r->substrs->data[i].substr);
13129            SvREFCNT_dec(r->substrs->data[i].utf8_substr);
13130        }
13131        Safefree(r->substrs);
13132    }
13133    RX_MATCH_COPY_FREE(rx);
13134#ifdef PERL_ANY_COW
13135    SvREFCNT_dec(r->saved_copy);
13136#endif
13137    Safefree(RXp_OFFSp(r));
13138    if (r->logical_to_parno) {
13139        Safefree(r->logical_to_parno);
13140        Safefree(r->parno_to_logical);
13141        Safefree(r->parno_to_logical_next);
13142    }
13143
13144    SvREFCNT_dec(r->qr_anoncv);
13145    if (r->recurse_locinput)
13146        Safefree(r->recurse_locinput);
13147}
13148
13149
13150/*  reg_temp_copy()
13151
13152    Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
13153    except that dsv will be created if NULL.
13154
13155    This function is used in two main ways. First to implement
13156        $r = qr/....; $s = $$r;
13157
13158    Secondly, it is used as a hacky workaround to the structural issue of
13159    match results
13160    being stored in the regexp structure which is in turn stored in
13161    PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13162    could be PL_curpm in multiple contexts, and could require multiple
13163    result sets being associated with the pattern simultaneously, such
13164    as when doing a recursive match with (??{$qr})
13165
13166    The solution is to make a lightweight copy of the regexp structure
13167    when a qr// is returned from the code executed by (??{$qr}) this
13168    lightweight copy doesn't actually own any of its data except for
13169    the starp/end and the actual regexp structure itself.
13170
13171*/
13172
13173
13174REGEXP *
13175Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
13176{
13177    struct regexp *drx;
13178    struct regexp *const srx = ReANY(ssv);
13179    const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
13180
13181    PERL_ARGS_ASSERT_REG_TEMP_COPY;
13182
13183    if (!dsv)
13184        dsv = (REGEXP*) newSV_type(SVt_REGEXP);
13185    else {
13186        assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
13187
13188        /* our only valid caller, sv_setsv_flags(), should have done
13189         * a SV_CHECK_THINKFIRST_COW_DROP() by now */
13190        assert(!SvOOK(dsv));
13191        assert(!SvIsCOW(dsv));
13192        assert(!SvROK(dsv));
13193
13194        if (SvPVX_const(dsv)) {
13195            if (SvLEN(dsv))
13196                Safefree(SvPVX(dsv));
13197            SvPVX(dsv) = NULL;
13198        }
13199        SvLEN_set(dsv, 0);
13200        SvCUR_set(dsv, 0);
13201        SvOK_off((SV *)dsv);
13202
13203        if (islv) {
13204            /* For PVLVs, the head (sv_any) points to an XPVLV, while
13205             * the LV's xpvlenu_rx will point to a regexp body, which
13206             * we allocate here */
13207            REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13208            assert(!SvPVX(dsv));
13209            /* We "steal" the body from the newly allocated SV temp, changing
13210             * the pointer in its HEAD to NULL. We then change its type to
13211             * SVt_NULL so that when we immediately release its only reference,
13212             * no memory deallocation happens.
13213             *
13214             * The body will eventually be freed (from the PVLV) either in
13215             * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
13216             * the regexp body needs to be removed)
13217             * or in Perl_sv_clear() (if the PVLV still holds the pointer until
13218             * the PVLV itself is deallocated). */
13219            ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
13220            temp->sv_any = NULL;
13221            SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13222            SvREFCNT_dec_NN(temp);
13223            /* SvCUR still resides in the xpvlv struct, so the regexp copy-
13224               ing below will not set it. */
13225            SvCUR_set(dsv, SvCUR(ssv));
13226        }
13227    }
13228    /* This ensures that SvTHINKFIRST(sv) is true, and hence that
13229       sv_force_normal(sv) is called.  */
13230    SvFAKE_on(dsv);
13231    drx = ReANY(dsv);
13232
13233    SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
13234    SvPV_set(dsv, RX_WRAPPED(ssv));
13235    /* We share the same string buffer as the original regexp, on which we
13236       hold a reference count, incremented when mother_re is set below.
13237       The string pointer is copied here, being part of the regexp struct.
13238     */
13239    memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
13240           sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13241
13242    if (!islv)
13243        SvLEN_set(dsv, 0);
13244    if (RXp_OFFSp(srx)) {
13245        const I32 npar = srx->nparens+1;
13246        NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
13247    }
13248    if (srx->substrs) {
13249        int i;
13250        Newx(drx->substrs, 1, struct reg_substr_data);
13251        StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
13252
13253        for (i = 0; i < 2; i++) {
13254            SvREFCNT_inc_void(drx->substrs->data[i].substr);
13255            SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
13256        }
13257
13258        /* check_substr and check_utf8, if non-NULL, point to either their
13259           anchored or float namesakes, and don't hold a second reference.  */
13260    }
13261    if (srx->logical_to_parno) {
13262        NewCopy(srx->logical_to_parno,
13263                drx->logical_to_parno,
13264                srx->nparens+1, I32);
13265        NewCopy(srx->parno_to_logical,
13266                drx->parno_to_logical,
13267                srx->nparens+1, I32);
13268        NewCopy(srx->parno_to_logical_next,
13269                drx->parno_to_logical_next,
13270                srx->nparens+1, I32);
13271    } else {
13272        drx->logical_to_parno = NULL;
13273        drx->parno_to_logical = NULL;
13274        drx->parno_to_logical_next = NULL;
13275    }
13276    drx->logical_nparens = srx->logical_nparens;
13277
13278    RX_MATCH_COPIED_off(dsv);
13279#ifdef PERL_ANY_COW
13280    RXp_SAVED_COPY(drx) = NULL;
13281#endif
13282    drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
13283    SvREFCNT_inc_void(drx->qr_anoncv);
13284    if (srx->recurse_locinput)
13285        Newx(drx->recurse_locinput, srx->nparens + 1, char *);
13286
13287    return dsv;
13288}
13289#endif
13290
13291
13292/* regfree_internal()
13293
13294   Free the private data in a regexp. This is overloadable by
13295   extensions. Perl takes care of the regexp structure in pregfree(),
13296   this covers the *pprivate pointer which technically perl doesn't
13297   know about, however of course we have to handle the
13298   regexp_internal structure when no extension is in use.
13299
13300   Note this is called before freeing anything in the regexp
13301   structure.
13302 */
13303
13304void
13305Perl_regfree_internal(pTHX_ REGEXP * const rx)
13306{
13307    struct regexp *const r = ReANY(rx);
13308    RXi_GET_DECL(r, ri);
13309    DECLARE_AND_GET_RE_DEBUG_FLAGS;
13310
13311    PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13312
13313    if (! ri) {
13314        return;
13315    }
13316
13317    DEBUG_COMPILE_r({
13318        if (!PL_colorset)
13319            reginitcolors();
13320        {
13321            SV *dsv= sv_newmortal();
13322            RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13323                dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
13324            Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
13325                PL_colors[4], PL_colors[5], s);
13326        }
13327    });
13328
13329    if (ri->code_blocks)
13330        S_free_codeblocks(aTHX_ ri->code_blocks);
13331
13332    if (ri->data) {
13333        int n = ri->data->count;
13334
13335        while (--n >= 0) {
13336          /* If you add a ->what type here, update the comment in regcomp.h */
13337            switch (ri->data->what[n]) {
13338            case 'a':
13339            case 'r':
13340            case 's':
13341            case 'S':
13342            case 'u':
13343                SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13344                break;
13345            case 'f':
13346                Safefree(ri->data->data[n]);
13347                break;
13348            case 'l':
13349            case 'L':
13350                break;
13351            case 'T':
13352                { /* Aho Corasick add-on structure for a trie node.
13353                     Used in stclass optimization only */
13354                    U32 refcount;
13355                    reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13356                    OP_REFCNT_LOCK;
13357                    refcount = --aho->refcount;
13358                    OP_REFCNT_UNLOCK;
13359                    if ( !refcount ) {
13360                        PerlMemShared_free(aho->states);
13361                        PerlMemShared_free(aho->fail);
13362                         /* do this last!!!! */
13363                        PerlMemShared_free(ri->data->data[n]);
13364                        /* we should only ever get called once, so
13365                         * assert as much, and also guard the free
13366                         * which /might/ happen twice. At the least
13367                         * it will make code anlyzers happy and it
13368                         * doesn't cost much. - Yves */
13369                        assert(ri->regstclass);
13370                        if (ri->regstclass) {
13371                            PerlMemShared_free(ri->regstclass);
13372                            ri->regstclass = 0;
13373                        }
13374                    }
13375                }
13376                break;
13377            case 't':
13378                {
13379                    /* trie structure. */
13380                    U32 refcount;
13381                    reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13382                    OP_REFCNT_LOCK;
13383                    refcount = --trie->refcount;
13384                    OP_REFCNT_UNLOCK;
13385                    if ( !refcount ) {
13386                        PerlMemShared_free(trie->charmap);
13387                        PerlMemShared_free(trie->states);
13388                        PerlMemShared_free(trie->trans);
13389                        if (trie->bitmap)
13390                            PerlMemShared_free(trie->bitmap);
13391                        if (trie->jump)
13392                            PerlMemShared_free(trie->jump);
13393                        if (trie->j_before_paren)
13394                            PerlMemShared_free(trie->j_before_paren);
13395                        if (trie->j_after_paren)
13396                            PerlMemShared_free(trie->j_after_paren);
13397                        PerlMemShared_free(trie->wordinfo);
13398                        /* do this last!!!! */
13399                        PerlMemShared_free(ri->data->data[n]);
13400                    }
13401                }
13402                break;
13403            case '%':
13404                /* NO-OP a '%' data contains a null pointer, so that reg_add_data
13405                 * always returns non-zero, this should only ever happen in the
13406                 * 0 index */
13407                assert(n==0);
13408                break;
13409            default:
13410                Perl_croak(aTHX_ "panic: regfree data code '%c'",
13411                                                    ri->data->what[n]);
13412            }
13413        }
13414        Safefree(ri->data->what);
13415        Safefree(ri->data);
13416    }
13417
13418    Safefree(ri);
13419}
13420
13421#define SAVEPVN(p, n)	((p) ? savepvn(p, n) : NULL)
13422
13423/*
13424=for apidoc re_dup_guts
13425Duplicate a regexp.
13426
13427This routine is expected to clone a given regexp structure. It is only
13428compiled under USE_ITHREADS.
13429
13430After all of the core data stored in struct regexp is duplicated
13431the C<regexp_engine.dupe> method is used to copy any private data
13432stored in the *pprivate pointer. This allows extensions to handle
13433any duplication they need to do.
13434
13435=cut
13436
13437   See pregfree() and regfree_internal() if you change anything here.
13438*/
13439#if defined(USE_ITHREADS)
13440#ifndef PERL_IN_XSUB_RE
13441void
13442Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13443{
13444    I32 npar;
13445    const struct regexp *r = ReANY(sstr);
13446    struct regexp *ret = ReANY(dstr);
13447
13448    PERL_ARGS_ASSERT_RE_DUP_GUTS;
13449
13450    npar = r->nparens+1;
13451    NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
13452
13453    if (ret->substrs) {
13454        /* Do it this way to avoid reading from *r after the StructCopy().
13455           That way, if any of the sv_dup_inc()s dislodge *r from the L1
13456           cache, it doesn't matter.  */
13457        int i;
13458        const bool anchored = r->check_substr
13459            ? r->check_substr == r->substrs->data[0].substr
13460            : r->check_utf8   == r->substrs->data[0].utf8_substr;
13461        Newx(ret->substrs, 1, struct reg_substr_data);
13462        StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13463
13464        for (i = 0; i < 2; i++) {
13465            ret->substrs->data[i].substr =
13466                        sv_dup_inc(ret->substrs->data[i].substr, param);
13467            ret->substrs->data[i].utf8_substr =
13468                        sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
13469        }
13470
13471        /* check_substr and check_utf8, if non-NULL, point to either their
13472           anchored or float namesakes, and don't hold a second reference.  */
13473
13474        if (ret->check_substr) {
13475            if (anchored) {
13476                assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
13477
13478                ret->check_substr = ret->substrs->data[0].substr;
13479                ret->check_utf8   = ret->substrs->data[0].utf8_substr;
13480            } else {
13481                assert(r->check_substr == r->substrs->data[1].substr);
13482                assert(r->check_utf8   == r->substrs->data[1].utf8_substr);
13483
13484                ret->check_substr = ret->substrs->data[1].substr;
13485                ret->check_utf8   = ret->substrs->data[1].utf8_substr;
13486            }
13487        } else if (ret->check_utf8) {
13488            if (anchored) {
13489                ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13490            } else {
13491                ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13492            }
13493        }
13494    }
13495
13496    RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13497    ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13498    if (r->recurse_locinput)
13499        Newx(ret->recurse_locinput, r->nparens + 1, char *);
13500
13501    if (ret->pprivate)
13502        RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
13503
13504    if (RX_MATCH_COPIED(dstr))
13505        RXp_SUBBEG(ret)  = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
13506    else
13507        RXp_SUBBEG(ret) = NULL;
13508#ifdef PERL_ANY_COW
13509    RXp_SAVED_COPY(ret) = NULL;
13510#endif
13511
13512    if (r->logical_to_parno) {
13513        /* we use total_parens for all three just for symmetry */
13514        ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
13515        ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
13516        ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
13517    } else {
13518        ret->logical_to_parno = NULL;
13519        ret->parno_to_logical = NULL;
13520        ret->parno_to_logical_next = NULL;
13521    }
13522
13523    ret->logical_nparens = r->logical_nparens;
13524
13525    /* Whether mother_re be set or no, we need to copy the string.  We
13526       cannot refrain from copying it when the storage points directly to
13527       our mother regexp, because that's
13528               1: a buffer in a different thread
13529               2: something we no longer hold a reference on
13530               so we need to copy it locally.  */
13531    RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
13532    /* set malloced length to a non-zero value so it will be freed
13533     * (otherwise in combination with SVf_FAKE it looks like an alien
13534     * buffer). It doesn't have to be the actual malloced size, since it
13535     * should never be grown */
13536    SvLEN_set(dstr, SvCUR(sstr)+1);
13537    ret->mother_re   = NULL;
13538}
13539#endif /* PERL_IN_XSUB_RE */
13540
13541/*
13542   regdupe_internal()
13543
13544   This is the internal complement to regdupe() which is used to copy
13545   the structure pointed to by the *pprivate pointer in the regexp.
13546   This is the core version of the extension overridable cloning hook.
13547   The regexp structure being duplicated will be copied by perl prior
13548   to this and will be provided as the regexp *r argument, however
13549   with the /old/ structures pprivate pointer value. Thus this routine
13550   may override any copying normally done by perl.
13551
13552   It returns a pointer to the new regexp_internal structure.
13553*/
13554
13555void *
13556Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13557{
13558    struct regexp *const r = ReANY(rx);
13559    regexp_internal *reti;
13560    int len;
13561    RXi_GET_DECL(r, ri);
13562
13563    PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13564
13565    len = ProgLen(ri);
13566
13567    Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
13568          char, regexp_internal);
13569    Copy(ri->program, reti->program, len+1, regnode);
13570
13571
13572    if (ri->code_blocks) {
13573        int n;
13574        Newx(reti->code_blocks, 1, struct reg_code_blocks);
13575        Newx(reti->code_blocks->cb, ri->code_blocks->count,
13576                    struct reg_code_block);
13577        Copy(ri->code_blocks->cb, reti->code_blocks->cb,
13578             ri->code_blocks->count, struct reg_code_block);
13579        for (n = 0; n < ri->code_blocks->count; n++)
13580             reti->code_blocks->cb[n].src_regex = (REGEXP*)
13581                    sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
13582        reti->code_blocks->count = ri->code_blocks->count;
13583        reti->code_blocks->refcnt = 1;
13584    }
13585    else
13586        reti->code_blocks = NULL;
13587
13588    reti->regstclass = NULL;
13589
13590    if (ri->data) {
13591        struct reg_data *d;
13592        const int count = ri->data->count;
13593        int i;
13594
13595        Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13596                char, struct reg_data);
13597        Newx(d->what, count, U8);
13598
13599        d->count = count;
13600        for (i = 0; i < count; i++) {
13601            d->what[i] = ri->data->what[i];
13602            switch (d->what[i]) {
13603                /* see also regcomp.h and regfree_internal() */
13604            case 'a': /* actually an AV, but the dup function is identical.
13605                         values seem to be "plain sv's" generally. */
13606            case 'r': /* a compiled regex (but still just another SV) */
13607            case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
13608                         this use case should go away, the code could have used
13609                         'a' instead - see S_set_ANYOF_arg() for array contents. */
13610            case 'S': /* actually an SV, but the dup function is identical.  */
13611            case 'u': /* actually an HV, but the dup function is identical.
13612                         values are "plain sv's" */
13613                d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13614                break;
13615            case 'f':
13616                /* Synthetic Start Class - "Fake" charclass we generate to optimize
13617                 * patterns which could start with several different things. Pre-TRIE
13618                 * this was more important than it is now, however this still helps
13619                 * in some places, for instance /x?a+/ might produce a SSC equivalent
13620                 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
13621                 * in regexec.c
13622                 */
13623                /* This is cheating. */
13624                Newx(d->data[i], 1, regnode_ssc);
13625                StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
13626                reti->regstclass = (regnode*)d->data[i];
13627                break;
13628            case 'T':
13629                /* AHO-CORASICK fail table */
13630                /* Trie stclasses are readonly and can thus be shared
13631                 * without duplication. We free the stclass in pregfree
13632                 * when the corresponding reg_ac_data struct is freed.
13633                 */
13634                reti->regstclass= ri->regstclass;
13635                /* FALLTHROUGH */
13636            case 't':
13637                /* TRIE transition table */
13638                OP_REFCNT_LOCK;
13639                ((reg_trie_data*)ri->data->data[i])->refcount++;
13640                OP_REFCNT_UNLOCK;
13641                /* FALLTHROUGH */
13642            case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
13643            case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
13644                         is not from another regexp */
13645                d->data[i] = ri->data->data[i];
13646                break;
13647            case '%':
13648                /* this is a placeholder type, it exists purely so that
13649                 * reg_add_data always returns a non-zero value, this type of
13650                 * entry should ONLY be present in the 0 slot of the array */
13651                assert(i == 0);
13652                d->data[i]= ri->data->data[i];
13653                break;
13654            default:
13655                Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
13656                                                           ri->data->what[i]);
13657            }
13658        }
13659
13660        reti->data = d;
13661    }
13662    else
13663        reti->data = NULL;
13664
13665    if (ri->regstclass && !reti->regstclass) {
13666        /* Assume that the regstclass is a regnode which is inside of the
13667         * program which we have to copy over */
13668        regnode *node= ri->regstclass;
13669        assert(node >= ri->program && (node - ri->program) < len);
13670        reti->regstclass = reti->program + (node - ri->program);
13671    }
13672
13673
13674    reti->name_list_idx = ri->name_list_idx;
13675
13676    SetProgLen(reti, len);
13677
13678    return (void*)reti;
13679}
13680
13681#endif    /* USE_ITHREADS */
13682
13683STATIC void
13684S_re_croak(pTHX_ bool utf8, const char* pat,...)
13685{
13686    va_list args;
13687    STRLEN len = strlen(pat);
13688    char buf[512];
13689    SV *msv;
13690    const char *message;
13691
13692    PERL_ARGS_ASSERT_RE_CROAK;
13693
13694    if (len > 510)
13695        len = 510;
13696    Copy(pat, buf, len , char);
13697    buf[len] = '\n';
13698    buf[len + 1] = '\0';
13699    va_start(args, pat);
13700    msv = vmess(buf, &args);
13701    va_end(args);
13702    message = SvPV_const(msv, len);
13703    if (len > 512)
13704        len = 512;
13705    Copy(message, buf, len , char);
13706    /* len-1 to avoid \n */
13707    Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
13708}
13709
13710/* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
13711
13712#ifndef PERL_IN_XSUB_RE
13713void
13714Perl_save_re_context(pTHX)
13715{
13716    I32 nparens = -1;
13717    I32 i;
13718
13719    /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13720
13721    if (PL_curpm) {
13722        const REGEXP * const rx = PM_GETRE(PL_curpm);
13723        if (rx)
13724            nparens = RX_NPARENS(rx);
13725    }
13726
13727    /* RT #124109. This is a complete hack; in the SWASHNEW case we know
13728     * that PL_curpm will be null, but that utf8.pm and the modules it
13729     * loads will only use $1..$3.
13730     * The t/porting/re_context.t test file checks this assumption.
13731     */
13732    if (nparens == -1)
13733        nparens = 3;
13734
13735    for (i = 1; i <= nparens; i++) {
13736        char digits[TYPE_CHARS(long)];
13737        const STRLEN len = my_snprintf(digits, sizeof(digits),
13738                                       "%lu", (long)i);
13739        GV *const *const gvp
13740            = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13741
13742        if (gvp) {
13743            GV * const gv = *gvp;
13744            if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13745                save_scalar(gv);
13746        }
13747    }
13748}
13749#endif
13750
13751#ifndef PERL_IN_XSUB_RE
13752
13753#  include "uni_keywords.h"
13754
13755void
13756Perl_init_uniprops(pTHX)
13757{
13758
13759#  ifdef DEBUGGING
13760    char * dump_len_string;
13761
13762    dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
13763    if (   ! dump_len_string
13764        || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
13765    {
13766        PL_dump_re_max_len = 60;    /* A reasonable default */
13767    }
13768#  endif
13769
13770    PL_user_def_props = newHV();
13771
13772#  ifdef USE_ITHREADS
13773
13774    HvSHAREKEYS_off(PL_user_def_props);
13775    PL_user_def_props_aTHX = aTHX;
13776
13777#  endif
13778
13779    /* Set up the inversion list interpreter-level variables */
13780
13781    PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13782    PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
13783    PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
13784    PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
13785    PL_XPosix_ptrs[CC_CASED_] =  _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
13786    PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
13787    PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
13788    PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
13789    PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
13790    PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
13791    PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
13792    PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
13793    PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
13794    PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
13795    PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
13796    PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
13797
13798    PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13799    PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
13800    PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
13801    PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
13802    PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
13803    PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
13804    PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
13805    PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
13806    PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
13807    PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
13808    PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
13809    PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
13810    PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
13811    PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
13812    PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
13813    PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
13814
13815    PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
13816    PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
13817    PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
13818    PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
13819    PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
13820
13821    PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
13822    PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
13823    PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
13824    PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
13825
13826    PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
13827
13828    PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
13829    PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
13830
13831    PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
13832    PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
13833
13834    PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
13835    PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13836                                            UNI__PERL_FOLDS_TO_MULTI_CHAR]);
13837    PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13838                                            UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
13839    PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
13840    PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
13841    PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
13842    PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
13843    PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
13844    PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
13845    PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
13846    PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
13847    PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
13848
13849#  ifdef UNI_XIDC
13850    /* The below are used only by deprecated functions.  They could be removed */
13851    PL_utf8_xidcont  = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
13852    PL_utf8_idcont   = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
13853    PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
13854#  endif
13855}
13856
13857/* These four functions are compiled only in regcomp.c, where they have access
13858 * to the data they return.  They are a way for re_comp.c to get access to that
13859 * data without having to compile the whole data structures. */
13860
13861I16
13862Perl_do_uniprop_match(const char * const key, const U16 key_len)
13863{
13864    PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
13865
13866    return match_uniprop((U8 *) key, key_len);
13867}
13868
13869SV *
13870Perl_get_prop_definition(pTHX_ const int table_index)
13871{
13872    PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
13873
13874    /* Create and return the inversion list */
13875    return _new_invlist_C_array(uni_prop_ptrs[table_index]);
13876}
13877
13878const char * const *
13879Perl_get_prop_values(const int table_index)
13880{
13881    PERL_ARGS_ASSERT_GET_PROP_VALUES;
13882
13883    return UNI_prop_value_ptrs[table_index];
13884}
13885
13886const char *
13887Perl_get_deprecated_property_msg(const Size_t warning_offset)
13888{
13889    PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
13890
13891    return deprecated_property_msgs[warning_offset];
13892}
13893
13894#  if 0
13895
13896This code was mainly added for backcompat to give a warning for non-portable
13897code points in user-defined properties.  But experiments showed that the
13898warning in earlier perls were only omitted on overflow, which should be an
13899error, so there really isnt a backcompat issue, and actually adding the
13900warning when none was present before might cause breakage, for little gain.  So
13901khw left this code in, but not enabled.  Tests were never added.
13902
13903embed.fnc entry:
13904Ei	|const char *|get_extended_utf8_msg|const UV cp
13905
13906PERL_STATIC_INLINE const char *
13907S_get_extended_utf8_msg(pTHX_ const UV cp)
13908{
13909    U8 dummy[UTF8_MAXBYTES + 1];
13910    HV *msgs;
13911    SV **msg;
13912
13913    uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
13914                             &msgs);
13915
13916    msg = hv_fetchs(msgs, "text", 0);
13917    assert(msg);
13918
13919    (void) sv_2mortal((SV *) msgs);
13920
13921    return SvPVX(*msg);
13922}
13923
13924#  endif
13925#endif /* end of ! PERL_IN_XSUB_RE */
13926
13927STATIC REGEXP *
13928S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
13929                         const bool ignore_case)
13930{
13931    /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
13932     * possibly with /i if the 'ignore_case' parameter is true.  Use /aa
13933     * because nothing outside of ASCII will match.  Use /m because the input
13934     * string may be a bunch of lines strung together.
13935     *
13936     * Also sets up the debugging info */
13937
13938    U32 flags = PMf_MULTILINE|PMf_WILDCARD;
13939    U32 rx_flags;
13940    SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
13941    REGEXP * subpattern_re;
13942    DECLARE_AND_GET_RE_DEBUG_FLAGS;
13943
13944    PERL_ARGS_ASSERT_COMPILE_WILDCARD;
13945
13946    if (ignore_case) {
13947        flags |= PMf_FOLD;
13948    }
13949    set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
13950
13951    /* Like in op.c, we copy the compile time pm flags to the rx ones */
13952    rx_flags = flags & RXf_PMf_COMPILETIME;
13953
13954#ifndef PERL_IN_XSUB_RE
13955    /* Use the core engine if this file is regcomp.c.  That means no
13956     * 'use re "Debug ..." is in effect, so the core engine is sufficient */
13957    subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13958                                             &PL_core_reg_engine,
13959                                             NULL, NULL,
13960                                             rx_flags, flags);
13961#else
13962    if (isDEBUG_WILDCARD) {
13963        /* Use the special debugging engine if this file is re_comp.c and wants
13964         * to output the wildcard matching.  This uses whatever
13965         * 'use re "Debug ..." is in effect */
13966        subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13967                                                 &my_reg_engine,
13968                                                 NULL, NULL,
13969                                                 rx_flags, flags);
13970    }
13971    else {
13972        /* Use the special wildcard engine if this file is re_comp.c and
13973         * doesn't want to output the wildcard matching.  This uses whatever
13974         * 'use re "Debug ..." is in effect for compilation, but this engine
13975         * structure has been set up so that it uses the core engine for
13976         * execution, so no execution debugging as a result of re.pm will be
13977         * displayed. */
13978        subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13979                                                 &wild_reg_engine,
13980                                                 NULL, NULL,
13981                                                 rx_flags, flags);
13982        /* XXX The above has the effect that any user-supplied regex engine
13983         * won't be called for matching wildcards.  That might be good, or bad.
13984         * It could be changed in several ways.  The reason it is done the
13985         * current way is to avoid having to save and restore
13986         * ^{^RE_DEBUG_FLAGS} around the execution.  save_scalar() perhaps
13987         * could be used.  Another suggestion is to keep the authoritative
13988         * value of the debug flags in a thread-local variable and add set/get
13989         * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
13990         * Still another is to pass a flag, say in the engine's intflags that
13991         * would be checked each time before doing the debug output */
13992    }
13993#endif
13994
13995    assert(subpattern_re);  /* Should have died if didn't compile successfully */
13996    return subpattern_re;
13997}
13998
13999STATIC I32
14000S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
14001         char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
14002{
14003    I32 result;
14004    DECLARE_AND_GET_RE_DEBUG_FLAGS;
14005
14006    PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
14007
14008    ENTER;
14009
14010    /* The compilation has set things up so that if the program doesn't want to
14011     * see the wildcard matching procedure, it will get the core execution
14012     * engine, which is subject only to -Dr.  So we have to turn that off
14013     * around this procedure */
14014    if (! isDEBUG_WILDCARD) {
14015        /* Note! Casts away 'volatile' */
14016        SAVEI32(PL_debug);
14017        PL_debug &= ~ DEBUG_r_FLAG;
14018    }
14019
14020    result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
14021                         NULL, nosave);
14022    LEAVE;
14023
14024    return result;
14025}
14026
14027SV *
14028S_handle_user_defined_property(pTHX_
14029
14030    /* Parses the contents of a user-defined property definition; returning the
14031     * expanded definition if possible.  If so, the return is an inversion
14032     * list.
14033     *
14034     * If there are subroutines that are part of the expansion and which aren't
14035     * known at the time of the call to this function, this returns what
14036     * parse_uniprop_string() returned for the first one encountered.
14037     *
14038     * If an error was found, NULL is returned, and 'msg' gets a suitable
14039     * message appended to it.  (Appending allows the back trace of how we got
14040     * to the faulty definition to be displayed through nested calls of
14041     * user-defined subs.)
14042     *
14043     * The caller IS responsible for freeing any returned SV.
14044     *
14045     * The syntax of the contents is pretty much described in perlunicode.pod,
14046     * but we also allow comments on each line */
14047
14048    const char * name,          /* Name of property */
14049    const STRLEN name_len,      /* The name's length in bytes */
14050    const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14051    const bool to_fold,         /* ? Is this under /i */
14052    const bool runtime,         /* ? Are we in compile- or run-time */
14053    const bool deferrable,      /* Is it ok for this property's full definition
14054                                   to be deferred until later? */
14055    SV* contents,               /* The property's definition */
14056    bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
14057                                   getting called unless this is thought to be
14058                                   a user-defined property */
14059    SV * msg,                   /* Any error or warning msg(s) are appended to
14060                                   this */
14061    const STRLEN level)         /* Recursion level of this call */
14062{
14063    STRLEN len;
14064    const char * string         = SvPV_const(contents, len);
14065    const char * const e        = string + len;
14066    const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
14067    const STRLEN msgs_length_on_entry = SvCUR(msg);
14068
14069    const char * s0 = string;   /* Points to first byte in the current line
14070                                   being parsed in 'string' */
14071    const char overflow_msg[] = "Code point too large in \"";
14072    SV* running_definition = NULL;
14073
14074    PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
14075
14076    *user_defined_ptr = TRUE;
14077
14078    /* Look at each line */
14079    while (s0 < e) {
14080        const char * s;     /* Current byte */
14081        char op = '+';      /* Default operation is 'union' */
14082        IV   min = 0;       /* range begin code point */
14083        IV   max = -1;      /* and range end */
14084        SV* this_definition;
14085
14086        /* Skip comment lines */
14087        if (*s0 == '#') {
14088            s0 = strchr(s0, '\n');
14089            if (s0 == NULL) {
14090                break;
14091            }
14092            s0++;
14093            continue;
14094        }
14095
14096        /* For backcompat, allow an empty first line */
14097        if (*s0 == '\n') {
14098            s0++;
14099            continue;
14100        }
14101
14102        /* First character in the line may optionally be the operation */
14103        if (   *s0 == '+'
14104            || *s0 == '!'
14105            || *s0 == '-'
14106            || *s0 == '&')
14107        {
14108            op = *s0++;
14109        }
14110
14111        /* If the line is one or two hex digits separated by blank space, its
14112         * a range; otherwise it is either another user-defined property or an
14113         * error */
14114
14115        s = s0;
14116
14117        if (! isXDIGIT(*s)) {
14118            goto check_if_property;
14119        }
14120
14121        do { /* Each new hex digit will add 4 bits. */
14122            if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
14123                s = strchr(s, '\n');
14124                if (s == NULL) {
14125                    s = e;
14126                }
14127                if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14128                sv_catpv(msg, overflow_msg);
14129                Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14130                                     UTF8fARG(is_contents_utf8, s - s0, s0));
14131                sv_catpvs(msg, "\"");
14132                goto return_failure;
14133            }
14134
14135            /* Accumulate this digit into the value */
14136            min = (min << 4) + READ_XDIGIT(s);
14137        } while (isXDIGIT(*s));
14138
14139        while (isBLANK(*s)) { s++; }
14140
14141        /* We allow comments at the end of the line */
14142        if (*s == '#') {
14143            s = strchr(s, '\n');
14144            if (s == NULL) {
14145                s = e;
14146            }
14147            s++;
14148        }
14149        else if (s < e && *s != '\n') {
14150            if (! isXDIGIT(*s)) {
14151                goto check_if_property;
14152            }
14153
14154            /* Look for the high point of the range */
14155            max = 0;
14156            do {
14157                if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
14158                    s = strchr(s, '\n');
14159                    if (s == NULL) {
14160                        s = e;
14161                    }
14162                    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14163                    sv_catpv(msg, overflow_msg);
14164                    Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14165                                      UTF8fARG(is_contents_utf8, s - s0, s0));
14166                    sv_catpvs(msg, "\"");
14167                    goto return_failure;
14168                }
14169
14170                max = (max << 4) + READ_XDIGIT(s);
14171            } while (isXDIGIT(*s));
14172
14173            while (isBLANK(*s)) { s++; }
14174
14175            if (*s == '#') {
14176                s = strchr(s, '\n');
14177                if (s == NULL) {
14178                    s = e;
14179                }
14180            }
14181            else if (s < e && *s != '\n') {
14182                goto check_if_property;
14183            }
14184        }
14185
14186        if (max == -1) {    /* The line only had one entry */
14187            max = min;
14188        }
14189        else if (max < min) {
14190            if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14191            sv_catpvs(msg, "Illegal range in \"");
14192            Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14193                                UTF8fARG(is_contents_utf8, s - s0, s0));
14194            sv_catpvs(msg, "\"");
14195            goto return_failure;
14196        }
14197
14198#  if 0   /* See explanation at definition above of get_extended_utf8_msg() */
14199
14200        if (   UNICODE_IS_PERL_EXTENDED(min)
14201            || UNICODE_IS_PERL_EXTENDED(max))
14202        {
14203            if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14204
14205            /* If both code points are non-portable, warn only on the lower
14206             * one. */
14207            sv_catpv(msg, get_extended_utf8_msg(
14208                                            (UNICODE_IS_PERL_EXTENDED(min))
14209                                            ? min : max));
14210            sv_catpvs(msg, " in \"");
14211            Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14212                                 UTF8fARG(is_contents_utf8, s - s0, s0));
14213            sv_catpvs(msg, "\"");
14214        }
14215
14216#  endif
14217
14218        /* Here, this line contains a legal range */
14219        this_definition = sv_2mortal(_new_invlist(2));
14220        this_definition = _add_range_to_invlist(this_definition, min, max);
14221        goto calculate;
14222
14223      check_if_property:
14224
14225        /* Here it isn't a legal range line.  See if it is a legal property
14226         * line.  First find the end of the meat of the line */
14227        s = strpbrk(s, "#\n");
14228        if (s == NULL) {
14229            s = e;
14230        }
14231
14232        /* Ignore trailing blanks in keeping with the requirements of
14233         * parse_uniprop_string() */
14234        s--;
14235        while (s > s0 && isBLANK_A(*s)) {
14236            s--;
14237        }
14238        s++;
14239
14240        this_definition = parse_uniprop_string(s0, s - s0,
14241                                               is_utf8, to_fold, runtime,
14242                                               deferrable,
14243                                               NULL,
14244                                               user_defined_ptr, msg,
14245                                               (name_len == 0)
14246                                                ? level /* Don't increase level
14247                                                           if input is empty */
14248                                                : level + 1
14249                                              );
14250        if (this_definition == NULL) {
14251            goto return_failure;    /* 'msg' should have had the reason
14252                                       appended to it by the above call */
14253        }
14254
14255        if (! is_invlist(this_definition)) {    /* Unknown at this time */
14256            return newSVsv(this_definition);
14257        }
14258
14259        if (*s != '\n') {
14260            s = strchr(s, '\n');
14261            if (s == NULL) {
14262                s = e;
14263            }
14264        }
14265
14266      calculate:
14267
14268        switch (op) {
14269            case '+':
14270                _invlist_union(running_definition, this_definition,
14271                                                        &running_definition);
14272                break;
14273            case '-':
14274                _invlist_subtract(running_definition, this_definition,
14275                                                        &running_definition);
14276                break;
14277            case '&':
14278                _invlist_intersection(running_definition, this_definition,
14279                                                        &running_definition);
14280                break;
14281            case '!':
14282                _invlist_union_complement_2nd(running_definition,
14283                                        this_definition, &running_definition);
14284                break;
14285            default:
14286                Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
14287                                 __FILE__, __LINE__, op);
14288                break;
14289        }
14290
14291        /* Position past the '\n' */
14292        s0 = s + 1;
14293    }   /* End of loop through the lines of 'contents' */
14294
14295    /* Here, we processed all the lines in 'contents' without error.  If we
14296     * didn't add any warnings, simply return success */
14297    if (msgs_length_on_entry == SvCUR(msg)) {
14298
14299        /* If the expansion was empty, the answer isn't nothing: its an empty
14300         * inversion list */
14301        if (running_definition == NULL) {
14302            running_definition = _new_invlist(1);
14303        }
14304
14305        return running_definition;
14306    }
14307
14308    /* Otherwise, add some explanatory text, but we will return success */
14309    goto return_msg;
14310
14311  return_failure:
14312    running_definition = NULL;
14313
14314  return_msg:
14315
14316    if (name_len > 0) {
14317        sv_catpvs(msg, " in expansion of ");
14318        Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
14319    }
14320
14321    return running_definition;
14322}
14323
14324/* As explained below, certain operations need to take place in the first
14325 * thread created.  These macros switch contexts */
14326#  ifdef USE_ITHREADS
14327#    define DECLARATION_FOR_GLOBAL_CONTEXT                                  \
14328                                        PerlInterpreter * save_aTHX = aTHX;
14329#    define SWITCH_TO_GLOBAL_CONTEXT                                        \
14330                           PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
14331#    define RESTORE_CONTEXT  PERL_SET_CONTEXT((aTHX = save_aTHX));
14332#    define CUR_CONTEXT      aTHX
14333#    define ORIGINAL_CONTEXT save_aTHX
14334#  else
14335#    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
14336#    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
14337#    define RESTORE_CONTEXT                   NOOP
14338#    define CUR_CONTEXT                       NULL
14339#    define ORIGINAL_CONTEXT                  NULL
14340#  endif
14341
14342STATIC void
14343S_delete_recursion_entry(pTHX_ void *key)
14344{
14345    /* Deletes the entry used to detect recursion when expanding user-defined
14346     * properties.  This is a function so it can be set up to be called even if
14347     * the program unexpectedly quits */
14348
14349    SV ** current_entry;
14350    const STRLEN key_len = strlen((const char *) key);
14351    DECLARATION_FOR_GLOBAL_CONTEXT;
14352
14353    SWITCH_TO_GLOBAL_CONTEXT;
14354
14355    /* If the entry is one of these types, it is a permanent entry, and not the
14356     * one used to detect recursions.  This function should delete only the
14357     * recursion entry */
14358    current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
14359    if (     current_entry
14360        && ! is_invlist(*current_entry)
14361        && ! SvPOK(*current_entry))
14362    {
14363        (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
14364                                                                    G_DISCARD);
14365    }
14366
14367    RESTORE_CONTEXT;
14368}
14369
14370STATIC SV *
14371S_get_fq_name(pTHX_
14372              const char * const name,    /* The first non-blank in the \p{}, \P{} */
14373              const Size_t name_len,      /* Its length in bytes, not including any trailing space */
14374              const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14375              const bool has_colon_colon
14376             )
14377{
14378    /* Returns a mortal SV containing the fully qualified version of the input
14379     * name */
14380
14381    SV * fq_name;
14382
14383    fq_name = newSVpvs_flags("", SVs_TEMP);
14384
14385    /* Use the current package if it wasn't included in our input */
14386    if (! has_colon_colon) {
14387        const HV * pkg = (IN_PERL_COMPILETIME)
14388                         ? PL_curstash
14389                         : CopSTASH(PL_curcop);
14390        const char* pkgname = HvNAME(pkg);
14391
14392        Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14393                      UTF8fARG(is_utf8, strlen(pkgname), pkgname));
14394        sv_catpvs(fq_name, "::");
14395    }
14396
14397    Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14398                         UTF8fARG(is_utf8, name_len, name));
14399    return fq_name;
14400}
14401
14402STATIC SV *
14403S_parse_uniprop_string(pTHX_
14404
14405    /* Parse the interior of a \p{}, \P{}.  Returns its definition if knowable
14406     * now.  If so, the return is an inversion list.
14407     *
14408     * If the property is user-defined, it is a subroutine, which in turn
14409     * may call other subroutines.  This function will call the whole nest of
14410     * them to get the definition they return; if some aren't known at the time
14411     * of the call to this function, the fully qualified name of the highest
14412     * level sub is returned.  It is an error to call this function at runtime
14413     * without every sub defined.
14414     *
14415     * If an error was found, NULL is returned, and 'msg' gets a suitable
14416     * message appended to it.  (Appending allows the back trace of how we got
14417     * to the faulty definition to be displayed through nested calls of
14418     * user-defined subs.)
14419     *
14420     * The caller should NOT try to free any returned inversion list.
14421     *
14422     * Other parameters will be set on return as described below */
14423
14424    const char * const name,    /* The first non-blank in the \p{}, \P{} */
14425    Size_t name_len,            /* Its length in bytes, not including any
14426                                   trailing space */
14427    const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
14428    const bool to_fold,         /* ? Is this under /i */
14429    const bool runtime,         /* TRUE if this is being called at run time */
14430    const bool deferrable,      /* TRUE if it's ok for the definition to not be
14431                                   known at this call */
14432    AV ** strings,              /* To return string property values, like named
14433                                   sequences */
14434    bool *user_defined_ptr,     /* Upon return from this function it will be
14435                                   set to TRUE if any component is a
14436                                   user-defined property */
14437    SV * msg,                   /* Any error or warning msg(s) are appended to
14438                                   this */
14439    const STRLEN level)         /* Recursion level of this call */
14440{
14441    char* lookup_name;          /* normalized name for lookup in our tables */
14442    unsigned lookup_len;        /* Its length */
14443    enum { Not_Strict = 0,      /* Some properties have stricter name */
14444           Strict,              /* normalization rules, which we decide */
14445           As_Is                /* upon based on parsing */
14446         } stricter = Not_Strict;
14447
14448    /* nv= or numeric_value=, or possibly one of the cjk numeric properties
14449     * (though it requires extra effort to download them from Unicode and
14450     * compile perl to know about them) */
14451    bool is_nv_type = FALSE;
14452
14453    unsigned int i = 0, i_zero = 0, j = 0;
14454    int equals_pos = -1;    /* Where the '=' is found, or negative if none */
14455    int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
14456    int table_index = 0;    /* The entry number for this property in the table
14457                               of all Unicode property names */
14458    bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
14459    Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
14460                                   the normalized name in certain situations */
14461    Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
14462                                   part of a package name */
14463    Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
14464    bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
14465                                             property rather than a Unicode
14466                                             one. */
14467    SV * prop_definition = NULL;  /* The returned definition of 'name' or NULL
14468                                     if an error.  If it is an inversion list,
14469                                     it is the definition.  Otherwise it is a
14470                                     string containing the fully qualified sub
14471                                     name of 'name' */
14472    SV * fq_name = NULL;        /* For user-defined properties, the fully
14473                                   qualified name */
14474    bool invert_return = FALSE; /* ? Do we need to complement the result before
14475                                     returning it */
14476    bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
14477                                       explicit utf8:: package that we strip
14478                                       off  */
14479    /* The expansion of properties that could be either user-defined or
14480     * official unicode ones is deferred until runtime, including a marker for
14481     * those that might be in the latter category.  This boolean indicates if
14482     * we've seen that marker.  If not, what we're parsing can't be such an
14483     * official Unicode property whose expansion was deferred */
14484    bool could_be_deferred_official = FALSE;
14485
14486    PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
14487
14488    /* The input will be normalized into 'lookup_name' */
14489    Newx(lookup_name, name_len, char);
14490    SAVEFREEPV(lookup_name);
14491
14492    /* Parse the input. */
14493    for (i = 0; i < name_len; i++) {
14494        char cur = name[i];
14495
14496        /* Most of the characters in the input will be of this ilk, being parts
14497         * of a name */
14498        if (isIDCONT_A(cur)) {
14499
14500            /* Case differences are ignored.  Our lookup routine assumes
14501             * everything is lowercase, so normalize to that */
14502            if (isUPPER_A(cur)) {
14503                lookup_name[j++] = toLOWER_A(cur);
14504                continue;
14505            }
14506
14507            if (cur == '_') { /* Don't include these in the normalized name */
14508                continue;
14509            }
14510
14511            lookup_name[j++] = cur;
14512
14513            /* The first character in a user-defined name must be of this type.
14514             * */
14515            if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
14516                could_be_user_defined = FALSE;
14517            }
14518
14519            continue;
14520        }
14521
14522        /* Here, the character is not something typically in a name,  But these
14523         * two types of characters (and the '_' above) can be freely ignored in
14524         * most situations.  Later it may turn out we shouldn't have ignored
14525         * them, and we have to reparse, but we don't have enough information
14526         * yet to make that decision */
14527        if (cur == '-' || isSPACE_A(cur)) {
14528            could_be_user_defined = FALSE;
14529            continue;
14530        }
14531
14532        /* An equals sign or single colon mark the end of the first part of
14533         * the property name */
14534        if (    cur == '='
14535            || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
14536        {
14537            lookup_name[j++] = '='; /* Treat the colon as an '=' */
14538            equals_pos = j; /* Note where it occurred in the input */
14539            could_be_user_defined = FALSE;
14540            break;
14541        }
14542
14543        /* If this looks like it is a marker we inserted at compile time,
14544         * set a flag and otherwise ignore it.  If it isn't in the final
14545         * position, keep it as it would have been user input. */
14546        if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
14547            && ! deferrable
14548            &&   could_be_user_defined
14549            &&   i == name_len - 1)
14550        {
14551            name_len--;
14552            could_be_deferred_official = TRUE;
14553            continue;
14554        }
14555
14556        /* Otherwise, this character is part of the name. */
14557        lookup_name[j++] = cur;
14558
14559        /* Here it isn't a single colon, so if it is a colon, it must be a
14560         * double colon */
14561        if (cur == ':') {
14562
14563            /* A double colon should be a package qualifier.  We note its
14564             * position and continue.  Note that one could have
14565             *      pkg1::pkg2::...::foo
14566             * so that the position at the end of the loop will be just after
14567             * the final qualifier */
14568
14569            i++;
14570            non_pkg_begin = i + 1;
14571            lookup_name[j++] = ':';
14572            lun_non_pkg_begin = j;
14573        }
14574        else { /* Only word chars (and '::') can be in a user-defined name */
14575            could_be_user_defined = FALSE;
14576        }
14577    } /* End of parsing through the lhs of the property name (or all of it if
14578         no rhs) */
14579
14580    /* If there is a single package name 'utf8::', it is ambiguous.  It could
14581     * be for a user-defined property, or it could be a Unicode property, as
14582     * all of them are considered to be for that package.  For the purposes of
14583     * parsing the rest of the property, strip it off */
14584    if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
14585        lookup_name += STRLENs("utf8::");
14586        j           -= STRLENs("utf8::");
14587        equals_pos  -= STRLENs("utf8::");
14588        i_zero       = STRLENs("utf8::");   /* When resetting 'i' to reparse
14589                                               from the beginning, it has to be
14590                                               set past what we're stripping
14591                                               off */
14592        stripped_utf8_pkg = TRUE;
14593    }
14594
14595    /* Here, we are either done with the whole property name, if it was simple;
14596     * or are positioned just after the '=' if it is compound. */
14597
14598    if (equals_pos >= 0) {
14599        assert(stricter == Not_Strict); /* We shouldn't have set this yet */
14600
14601        /* Space immediately after the '=' is ignored */
14602        i++;
14603        for (; i < name_len; i++) {
14604            if (! isSPACE_A(name[i])) {
14605                break;
14606            }
14607        }
14608
14609        /* Most punctuation after the equals indicates a subpattern, like
14610         * \p{foo=/bar/} */
14611        if (   isPUNCT_A(name[i])
14612            &&  name[i] != '-'
14613            &&  name[i] != '+'
14614            &&  name[i] != '_'
14615            &&  name[i] != '{'
14616                /* A backslash means the real delimiter is the next character,
14617                 * but it must be punctuation */
14618            && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
14619        {
14620            bool special_property = memEQs(lookup_name, j - 1, "name")
14621                                 || memEQs(lookup_name, j - 1, "na");
14622            if (! special_property) {
14623                /* Find the property.  The table includes the equals sign, so
14624                 * we use 'j' as-is */
14625                table_index = do_uniprop_match(lookup_name, j);
14626            }
14627            if (special_property || table_index) {
14628                REGEXP * subpattern_re;
14629                char open = name[i++];
14630                char close;
14631                const char * pos_in_brackets;
14632                const char * const * prop_values;
14633                bool escaped = 0;
14634
14635                /* Backslash => delimiter is the character following.  We
14636                 * already checked that it is punctuation */
14637                if (open == '\\') {
14638                    open = name[i++];
14639                    escaped = 1;
14640                }
14641
14642                /* This data structure is constructed so that the matching
14643                 * closing bracket is 3 past its matching opening.  The second
14644                 * set of closing is so that if the opening is something like
14645                 * ']', the closing will be that as well.  Something similar is
14646                 * done in toke.c */
14647                pos_in_brackets = memCHRs("([<)]>)]>", open);
14648                close = (pos_in_brackets) ? pos_in_brackets[3] : open;
14649
14650                if (    i >= name_len
14651                    ||  name[name_len-1] != close
14652                    || (escaped && name[name_len-2] != '\\')
14653                        /* Also make sure that there are enough characters.
14654                         * e.g., '\\\' would show up incorrectly as legal even
14655                         * though it is too short */
14656                    || (SSize_t) (name_len - i - 1 - escaped) < 0)
14657                {
14658                    sv_catpvs(msg, "Unicode property wildcard not terminated");
14659                    goto append_name_to_msg;
14660                }
14661
14662                Perl_ck_warner_d(aTHX_
14663                    packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
14664                    "The Unicode property wildcards feature is experimental");
14665
14666                if (special_property) {
14667                    const char * error_msg;
14668                    const char * revised_name = name + i;
14669                    Size_t revised_name_len = name_len - (i + 1 + escaped);
14670
14671                    /* Currently, the only 'special_property' is name, which we
14672                     * lookup in _charnames.pm */
14673
14674                    if (! load_charnames(newSVpvs("placeholder"),
14675                                         revised_name, revised_name_len,
14676                                         &error_msg))
14677                    {
14678                        sv_catpv(msg, error_msg);
14679                        goto append_name_to_msg;
14680                    }
14681
14682                    /* Farm this out to a function just to make the current
14683                     * function less unwieldy */
14684                    if (handle_names_wildcard(revised_name, revised_name_len,
14685                                              &prop_definition,
14686                                              strings))
14687                    {
14688                        return prop_definition;
14689                    }
14690
14691                    goto failed;
14692                }
14693
14694                prop_values = get_prop_values(table_index);
14695
14696                /* Now create and compile the wildcard subpattern.  Use /i
14697                 * because the property values are supposed to match with case
14698                 * ignored. */
14699                subpattern_re = compile_wildcard(name + i,
14700                                                 name_len - i - 1 - escaped,
14701                                                 TRUE /* /i */
14702                                                );
14703
14704                /* For each legal property value, see if the supplied pattern
14705                 * matches it. */
14706                while (*prop_values) {
14707                    const char * const entry = *prop_values;
14708                    const Size_t len = strlen(entry);
14709                    SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
14710
14711                    if (execute_wildcard(subpattern_re,
14712                                 (char *) entry,
14713                                 (char *) entry + len,
14714                                 (char *) entry, 0,
14715                                 entry_sv,
14716                                 0))
14717                    { /* Here, matched.  Add to the returned list */
14718                        Size_t total_len = j + len;
14719                        SV * sub_invlist = NULL;
14720                        char * this_string;
14721
14722                        /* We know this is a legal \p{property=value}.  Call
14723                         * the function to return the list of code points that
14724                         * match it */
14725                        Newxz(this_string, total_len + 1, char);
14726                        Copy(lookup_name, this_string, j, char);
14727                        my_strlcat(this_string, entry, total_len + 1);
14728                        SAVEFREEPV(this_string);
14729                        sub_invlist = parse_uniprop_string(this_string,
14730                                                           total_len,
14731                                                           is_utf8,
14732                                                           to_fold,
14733                                                           runtime,
14734                                                           deferrable,
14735                                                           NULL,
14736                                                           user_defined_ptr,
14737                                                           msg,
14738                                                           level + 1);
14739                        _invlist_union(prop_definition, sub_invlist,
14740                                       &prop_definition);
14741                    }
14742
14743                    prop_values++;  /* Next iteration, look at next propvalue */
14744                } /* End of looking through property values; (the data
14745                     structure is terminated by a NULL ptr) */
14746
14747                SvREFCNT_dec_NN(subpattern_re);
14748
14749                if (prop_definition) {
14750                    return prop_definition;
14751                }
14752
14753                sv_catpvs(msg, "No Unicode property value wildcard matches:");
14754                goto append_name_to_msg;
14755            }
14756
14757            /* Here's how khw thinks we should proceed to handle the properties
14758             * not yet done:    Bidi Mirroring Glyph        can map to ""
14759                                Bidi Paired Bracket         can map to ""
14760                                Case Folding  (both full and simple)
14761                                            Shouldn't /i be good enough for Full
14762                                Decomposition Mapping
14763                                Equivalent Unified Ideograph    can map to ""
14764                                Lowercase Mapping  (both full and simple)
14765                                NFKC Case Fold                  can map to ""
14766                                Titlecase Mapping  (both full and simple)
14767                                Uppercase Mapping  (both full and simple)
14768             * Handle these the same way Name is done, using say, _wild.pm, but
14769             * having both loose and full, like in charclass_invlists.h.
14770             * Perhaps move block and script to that as they are somewhat large
14771             * in charclass_invlists.h.
14772             * For properties where the default is the code point itself, such
14773             * as any of the case changing mappings, the string would otherwise
14774             * consist of all Unicode code points in UTF-8 strung together.
14775             * This would be impractical.  So instead, examine their compiled
14776             * pattern, looking at the ssc.  If none, reject the pattern as an
14777             * error.  Otherwise run the pattern against every code point in
14778             * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
14779             * And it might be good to create an API to return the ssc.
14780             * Or handle them like the algorithmic names are done
14781             */
14782        } /* End of is a wildcard subppattern */
14783
14784        /* \p{name=...} is handled specially.  Instead of using the normal
14785         * mechanism involving charclass_invlists.h, it uses _charnames.pm
14786         * which has the necessary (huge) data accessible to it, and which
14787         * doesn't get loaded unless necessary.  The legal syntax for names is
14788         * somewhat different than other properties due both to the vagaries of
14789         * a few outlier official names, and the fact that only a few ASCII
14790         * characters are permitted in them */
14791        if (   memEQs(lookup_name, j - 1, "name")
14792            || memEQs(lookup_name, j - 1, "na"))
14793        {
14794            dSP;
14795            HV * table;
14796            SV * character;
14797            const char * error_msg;
14798            CV* lookup_loose;
14799            SV * character_name;
14800            STRLEN character_len;
14801            UV cp;
14802
14803            stricter = As_Is;
14804
14805            /* Since the RHS (after skipping initial space) is passed unchanged
14806             * to charnames, and there are different criteria for what are
14807             * legal characters in the name, just parse it here.  A character
14808             * name must begin with an ASCII alphabetic */
14809            if (! isALPHA(name[i])) {
14810                goto failed;
14811            }
14812            lookup_name[j++] = name[i];
14813
14814            for (++i; i < name_len; i++) {
14815                /* Official names can only be in the ASCII range, and only
14816                 * certain characters */
14817                if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
14818                    goto failed;
14819                }
14820                lookup_name[j++] = name[i];
14821            }
14822
14823            /* Finished parsing, save the name into an SV */
14824            character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
14825
14826            /* Make sure _charnames is loaded.  (The parameters give context
14827             * for any errors generated */
14828            table = load_charnames(character_name, name, name_len, &error_msg);
14829            if (table == NULL) {
14830                sv_catpv(msg, error_msg);
14831                goto append_name_to_msg;
14832            }
14833
14834            lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
14835            if (! lookup_loose) {
14836                Perl_croak(aTHX_
14837                       "panic: Can't find '_charnames::_loose_regcomp_lookup");
14838            }
14839
14840            PUSHSTACKi(PERLSI_REGCOMP);
14841            ENTER ;
14842            SAVETMPS;
14843            save_re_context();
14844
14845            PUSHMARK(SP) ;
14846            XPUSHs(character_name);
14847            PUTBACK;
14848            call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
14849
14850            SPAGAIN ;
14851
14852            character = POPs;
14853            SvREFCNT_inc_simple_void_NN(character);
14854
14855            PUTBACK ;
14856            FREETMPS ;
14857            LEAVE ;
14858            POPSTACK;
14859
14860            if (! SvOK(character)) {
14861                goto failed;
14862            }
14863
14864            cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
14865            if (character_len == SvCUR(character)) {
14866                prop_definition = add_cp_to_invlist(NULL, cp);
14867            }
14868            else {
14869                AV * this_string;
14870
14871                /* First of the remaining characters in the string. */
14872                char * remaining = SvPVX(character) + character_len;
14873
14874                if (strings == NULL) {
14875                    goto failed;    /* XXX Perhaps a specific msg instead, like
14876                                       'not available here' */
14877                }
14878
14879                if (*strings == NULL) {
14880                    *strings = newAV();
14881                }
14882
14883                this_string = newAV();
14884                av_push_simple(this_string, newSVuv(cp));
14885
14886                do {
14887                    cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
14888                    av_push_simple(this_string, newSVuv(cp));
14889                    remaining += character_len;
14890                } while (remaining < SvEND(character));
14891
14892                av_push_simple(*strings, (SV *) this_string);
14893            }
14894
14895            return prop_definition;
14896        }
14897
14898        /* Certain properties whose values are numeric need special handling.
14899         * They may optionally be prefixed by 'is'.  Ignore that prefix for the
14900         * purposes of checking if this is one of those properties */
14901        if (memBEGINPs(lookup_name, j, "is")) {
14902            lookup_offset = 2;
14903        }
14904
14905        /* Then check if it is one of these specially-handled properties.  The
14906         * possibilities are hard-coded because easier this way, and the list
14907         * is unlikely to change.
14908         *
14909         * All numeric value type properties are of this ilk, and are also
14910         * special in a different way later on.  So find those first.  There
14911         * are several numeric value type properties in the Unihan DB (which is
14912         * unlikely to be compiled with perl, but we handle it here in case it
14913         * does get compiled).  They all end with 'numeric'.  The interiors
14914         * aren't checked for the precise property.  This would stop working if
14915         * a cjk property were to be created that ended with 'numeric' and
14916         * wasn't a numeric type */
14917        is_nv_type = memEQs(lookup_name + lookup_offset,
14918                       j - 1 - lookup_offset, "numericvalue")
14919                  || memEQs(lookup_name + lookup_offset,
14920                      j - 1 - lookup_offset, "nv")
14921                  || (   memENDPs(lookup_name + lookup_offset,
14922                            j - 1 - lookup_offset, "numeric")
14923                      && (   memBEGINPs(lookup_name + lookup_offset,
14924                                      j - 1 - lookup_offset, "cjk")
14925                          || memBEGINPs(lookup_name + lookup_offset,
14926                                      j - 1 - lookup_offset, "k")));
14927        if (   is_nv_type
14928            || memEQs(lookup_name + lookup_offset,
14929                      j - 1 - lookup_offset, "canonicalcombiningclass")
14930            || memEQs(lookup_name + lookup_offset,
14931                      j - 1 - lookup_offset, "ccc")
14932            || memEQs(lookup_name + lookup_offset,
14933                      j - 1 - lookup_offset, "age")
14934            || memEQs(lookup_name + lookup_offset,
14935                      j - 1 - lookup_offset, "in")
14936            || memEQs(lookup_name + lookup_offset,
14937                      j - 1 - lookup_offset, "presentin"))
14938        {
14939            unsigned int k;
14940
14941            /* Since the stuff after the '=' is a number, we can't throw away
14942             * '-' willy-nilly, as those could be a minus sign.  Other stricter
14943             * rules also apply.  However, these properties all can have the
14944             * rhs not be a number, in which case they contain at least one
14945             * alphabetic.  In those cases, the stricter rules don't apply.
14946             * But the numeric type properties can have the alphas [Ee] to
14947             * signify an exponent, and it is still a number with stricter
14948             * rules.  So look for an alpha that signifies not-strict */
14949            stricter = Strict;
14950            for (k = i; k < name_len; k++) {
14951                if (   isALPHA_A(name[k])
14952                    && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
14953                {
14954                    stricter = Not_Strict;
14955                    break;
14956                }
14957            }
14958        }
14959
14960        if (stricter) {
14961
14962            /* A number may have a leading '+' or '-'.  The latter is retained
14963             * */
14964            if (name[i] == '+') {
14965                i++;
14966            }
14967            else if (name[i] == '-') {
14968                lookup_name[j++] = '-';
14969                i++;
14970            }
14971
14972            /* Skip leading zeros including single underscores separating the
14973             * zeros, or between the final leading zero and the first other
14974             * digit */
14975            for (; i < name_len - 1; i++) {
14976                if (    name[i] != '0'
14977                    && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
14978                {
14979                    break;
14980                }
14981            }
14982
14983            /* Turn nv=-0 into nv=0.  These should be equivalent, but vary by
14984             * underling libc implementation. */
14985            if (   i == name_len - 1
14986                && name[name_len-1] == '0'
14987                && lookup_name[j-1] == '-')
14988            {
14989                j--;
14990            }
14991        }
14992    }
14993    else {  /* No '=' */
14994
14995       /* Only a few properties without an '=' should be parsed with stricter
14996        * rules.  The list is unlikely to change. */
14997        if (   memBEGINPs(lookup_name, j, "perl")
14998            && memNEs(lookup_name + 4, j - 4, "space")
14999            && memNEs(lookup_name + 4, j - 4, "word"))
15000        {
15001            stricter = Strict;
15002
15003            /* We set the inputs back to 0 and the code below will reparse,
15004             * using strict */
15005            i = i_zero;
15006            j = 0;
15007        }
15008    }
15009
15010    /* Here, we have either finished the property, or are positioned to parse
15011     * the remainder, and we know if stricter rules apply.  Finish out, if not
15012     * already done */
15013    for (; i < name_len; i++) {
15014        char cur = name[i];
15015
15016        /* In all instances, case differences are ignored, and we normalize to
15017         * lowercase */
15018        if (isUPPER_A(cur)) {
15019            lookup_name[j++] = toLOWER(cur);
15020            continue;
15021        }
15022
15023        /* An underscore is skipped, but not under strict rules unless it
15024         * separates two digits */
15025        if (cur == '_') {
15026            if (    stricter
15027                && (   i == i_zero || (int) i == equals_pos || i == name_len- 1
15028                    || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
15029            {
15030                lookup_name[j++] = '_';
15031            }
15032            continue;
15033        }
15034
15035        /* Hyphens are skipped except under strict */
15036        if (cur == '-' && ! stricter) {
15037            continue;
15038        }
15039
15040        /* XXX Bug in documentation.  It says white space skipped adjacent to
15041         * non-word char.  Maybe we should, but shouldn't skip it next to a dot
15042         * in a number */
15043        if (isSPACE_A(cur) && ! stricter) {
15044            continue;
15045        }
15046
15047        lookup_name[j++] = cur;
15048
15049        /* Unless this is a non-trailing slash, we are done with it */
15050        if (i >= name_len - 1 || cur != '/') {
15051            continue;
15052        }
15053
15054        slash_pos = j;
15055
15056        /* A slash in the 'numeric value' property indicates that what follows
15057         * is a denominator.  It can have a leading '+' and '0's that should be
15058         * skipped.  But we have never allowed a negative denominator, so treat
15059         * a minus like every other character.  (No need to rule out a second
15060         * '/', as that won't match anything anyway */
15061        if (is_nv_type) {
15062            i++;
15063            if (i < name_len && name[i] == '+') {
15064                i++;
15065            }
15066
15067            /* Skip leading zeros including underscores separating digits */
15068            for (; i < name_len - 1; i++) {
15069                if (   name[i] != '0'
15070                    && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15071                {
15072                    break;
15073                }
15074            }
15075
15076            /* Store the first real character in the denominator */
15077            if (i < name_len) {
15078                lookup_name[j++] = name[i];
15079            }
15080        }
15081    }
15082
15083    /* Here are completely done parsing the input 'name', and 'lookup_name'
15084     * contains a copy, normalized.
15085     *
15086     * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
15087     * different from without the underscores.  */
15088    if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
15089           || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
15090        && UNLIKELY(name[name_len-1] == '_'))
15091    {
15092        lookup_name[j++] = '&';
15093    }
15094
15095    /* If the original input began with 'In' or 'Is', it could be a subroutine
15096     * call to a user-defined property instead of a Unicode property name. */
15097    if (    name_len - non_pkg_begin > 2
15098        &&  name[non_pkg_begin+0] == 'I'
15099        && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
15100    {
15101        /* Names that start with In have different characteristics than those
15102         * that start with Is */
15103        if (name[non_pkg_begin+1] == 's') {
15104            starts_with_Is = TRUE;
15105        }
15106    }
15107    else {
15108        could_be_user_defined = FALSE;
15109    }
15110
15111    if (could_be_user_defined) {
15112        CV* user_sub;
15113
15114        /* If the user defined property returns the empty string, it could
15115         * easily be because the pattern is being compiled before the data it
15116         * actually needs to compile is available.  This could be argued to be
15117         * a bug in the perl code, but this is a change of behavior for Perl,
15118         * so we handle it.  This means that intentionally returning nothing
15119         * will not be resolved until runtime */
15120        bool empty_return = FALSE;
15121
15122        /* Here, the name could be for a user defined property, which are
15123         * implemented as subs. */
15124        user_sub = get_cvn_flags(name, name_len, 0);
15125        if (! user_sub) {
15126
15127            /* Here, the property name could be a user-defined one, but there
15128             * is no subroutine to handle it (as of now).   Defer handling it
15129             * until runtime.  Otherwise, a block defined by Unicode in a later
15130             * release would get the synonym InFoo added for it, and existing
15131             * code that used that name would suddenly break if it referred to
15132             * the property before the sub was declared.  See [perl #134146] */
15133            if (deferrable) {
15134                goto definition_deferred;
15135            }
15136
15137            /* Here, we are at runtime, and didn't find the user property.  It
15138             * could be an official property, but only if no package was
15139             * specified, or just the utf8:: package. */
15140            if (could_be_deferred_official) {
15141                lookup_name += lun_non_pkg_begin;
15142                j -= lun_non_pkg_begin;
15143            }
15144            else if (! stripped_utf8_pkg) {
15145                goto unknown_user_defined;
15146            }
15147
15148            /* Drop down to look up in the official properties */
15149        }
15150        else {
15151            const char insecure[] = "Insecure user-defined property";
15152
15153            /* Here, there is a sub by the correct name.  Normally we call it
15154             * to get the property definition */
15155            dSP;
15156            SV * user_sub_sv = MUTABLE_SV(user_sub);
15157            SV * error;     /* Any error returned by calling 'user_sub' */
15158            SV * key;       /* The key into the hash of user defined sub names
15159                             */
15160            SV * placeholder;
15161            SV ** saved_user_prop_ptr;      /* Hash entry for this property */
15162
15163            /* How many times to retry when another thread is in the middle of
15164             * expanding the same definition we want */
15165            PERL_INT_FAST8_T retry_countdown = 10;
15166
15167            DECLARATION_FOR_GLOBAL_CONTEXT;
15168
15169            /* If we get here, we know this property is user-defined */
15170            *user_defined_ptr = TRUE;
15171
15172            /* We refuse to call a potentially tainted subroutine; returning an
15173             * error instead */
15174            if (TAINT_get) {
15175                if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15176                sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15177                goto append_name_to_msg;
15178            }
15179
15180            /* In principal, we only call each subroutine property definition
15181             * once during the life of the program.  This guarantees that the
15182             * property definition never changes.  The results of the single
15183             * sub call are stored in a hash, which is used instead for future
15184             * references to this property.  The property definition is thus
15185             * immutable.  But, to allow the user to have a /i-dependent
15186             * definition, we call the sub once for non-/i, and once for /i,
15187             * should the need arise, passing the /i status as a parameter.
15188             *
15189             * We start by constructing the hash key name, consisting of the
15190             * fully qualified subroutine name, preceded by the /i status, so
15191             * that there is a key for /i and a different key for non-/i */
15192            key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
15193            fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15194                                          non_pkg_begin != 0);
15195            sv_catsv(key, fq_name);
15196
15197            /* We only call the sub once throughout the life of the program
15198             * (with the /i, non-/i exception noted above).  That means the
15199             * hash must be global and accessible to all threads.  It is
15200             * created at program start-up, before any threads are created, so
15201             * is accessible to all children.  But this creates some
15202             * complications.
15203             *
15204             * 1) The keys can't be shared, or else problems arise; sharing is
15205             *    turned off at hash creation time
15206             * 2) All SVs in it are there for the remainder of the life of the
15207             *    program, and must be created in the same interpreter context
15208             *    as the hash, or else they will be freed from the wrong pool
15209             *    at global destruction time.  This is handled by switching to
15210             *    the hash's context to create each SV going into it, and then
15211             *    immediately switching back
15212             * 3) All accesses to the hash must be controlled by a mutex, to
15213             *    prevent two threads from getting an unstable state should
15214             *    they simultaneously be accessing it.  The code below is
15215             *    crafted so that the mutex is locked whenever there is an
15216             *    access and unlocked only when the next stable state is
15217             *    achieved.
15218             *
15219             * The hash stores either the definition of the property if it was
15220             * valid, or, if invalid, the error message that was raised.  We
15221             * use the type of SV to distinguish.
15222             *
15223             * There's also the need to guard against the definition expansion
15224             * from infinitely recursing.  This is handled by storing the aTHX
15225             * of the expanding thread during the expansion.  Again the SV type
15226             * is used to distinguish this from the other two cases.  If we
15227             * come to here and the hash entry for this property is our aTHX,
15228             * it means we have recursed, and the code assumes that we would
15229             * infinitely recurse, so instead stops and raises an error.
15230             * (Any recursion has always been treated as infinite recursion in
15231             * this feature.)
15232             *
15233             * If instead, the entry is for a different aTHX, it means that
15234             * that thread has gotten here first, and hasn't finished expanding
15235             * the definition yet.  We just have to wait until it is done.  We
15236             * sleep and retry a few times, returning an error if the other
15237             * thread doesn't complete. */
15238
15239          re_fetch:
15240            USER_PROP_MUTEX_LOCK;
15241
15242            /* If we have an entry for this key, the subroutine has already
15243             * been called once with this /i status. */
15244            saved_user_prop_ptr = hv_fetch(PL_user_def_props,
15245                                                   SvPVX(key), SvCUR(key), 0);
15246            if (saved_user_prop_ptr) {
15247
15248                /* If the saved result is an inversion list, it is the valid
15249                 * definition of this property */
15250                if (is_invlist(*saved_user_prop_ptr)) {
15251                    prop_definition = *saved_user_prop_ptr;
15252
15253                    /* The SV in the hash won't be removed until global
15254                     * destruction, so it is stable and we can unlock */
15255                    USER_PROP_MUTEX_UNLOCK;
15256
15257                    /* The caller shouldn't try to free this SV */
15258                    return prop_definition;
15259                }
15260
15261                /* Otherwise, if it is a string, it is the error message
15262                 * that was returned when we first tried to evaluate this
15263                 * property.  Fail, and append the message */
15264                if (SvPOK(*saved_user_prop_ptr)) {
15265                    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15266                    sv_catsv(msg, *saved_user_prop_ptr);
15267
15268                    /* The SV in the hash won't be removed until global
15269                     * destruction, so it is stable and we can unlock */
15270                    USER_PROP_MUTEX_UNLOCK;
15271
15272                    return NULL;
15273                }
15274
15275                assert(SvIOK(*saved_user_prop_ptr));
15276
15277                /* Here, we have an unstable entry in the hash.  Either another
15278                 * thread is in the middle of expanding the property's
15279                 * definition, or we are ourselves recursing.  We use the aTHX
15280                 * in it to distinguish */
15281                if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
15282
15283                    /* Here, it's another thread doing the expanding.  We've
15284                     * looked as much as we are going to at the contents of the
15285                     * hash entry.  It's safe to unlock. */
15286                    USER_PROP_MUTEX_UNLOCK;
15287
15288                    /* Retry a few times */
15289                    if (retry_countdown-- > 0) {
15290                        PerlProc_sleep(1);
15291                        goto re_fetch;
15292                    }
15293
15294                    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15295                    sv_catpvs(msg, "Timeout waiting for another thread to "
15296                                   "define");
15297                    goto append_name_to_msg;
15298                }
15299
15300                /* Here, we are recursing; don't dig any deeper */
15301                USER_PROP_MUTEX_UNLOCK;
15302
15303                if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15304                sv_catpvs(msg,
15305                          "Infinite recursion in user-defined property");
15306                goto append_name_to_msg;
15307            }
15308
15309            /* Here, this thread has exclusive control, and there is no entry
15310             * for this property in the hash.  So we have the go ahead to
15311             * expand the definition ourselves. */
15312
15313            PUSHSTACKi(PERLSI_REGCOMP);
15314            ENTER;
15315
15316            /* Create a temporary placeholder in the hash to detect recursion
15317             * */
15318            SWITCH_TO_GLOBAL_CONTEXT;
15319            placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
15320            (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
15321            RESTORE_CONTEXT;
15322
15323            /* Now that we have a placeholder, we can let other threads
15324             * continue */
15325            USER_PROP_MUTEX_UNLOCK;
15326
15327            /* Make sure the placeholder always gets destroyed */
15328            SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
15329
15330            PUSHMARK(SP);
15331            SAVETMPS;
15332
15333            /* Call the user's function, with the /i status as a parameter.
15334             * Note that we have gone to a lot of trouble to keep this call
15335             * from being within the locked mutex region. */
15336            XPUSHs(boolSV(to_fold));
15337            PUTBACK;
15338
15339            /* The following block was taken from swash_init().  Presumably
15340             * they apply to here as well, though we no longer use a swash --
15341             * khw */
15342            SAVEHINTS();
15343            save_re_context();
15344            /* We might get here via a subroutine signature which uses a utf8
15345             * parameter name, at which point PL_subname will have been set
15346             * but not yet used. */
15347            save_item(PL_subname);
15348
15349            /* G_SCALAR guarantees a single return value */
15350            (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
15351
15352            SPAGAIN;
15353
15354            error = ERRSV;
15355            if (TAINT_get || SvTRUE(error)) {
15356                if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15357                if (SvTRUE(error)) {
15358                    sv_catpvs(msg, "Error \"");
15359                    sv_catsv(msg, error);
15360                    sv_catpvs(msg, "\"");
15361                }
15362                if (TAINT_get) {
15363                    if (SvTRUE(error)) sv_catpvs(msg, "; ");
15364                    sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15365                }
15366
15367                if (name_len > 0) {
15368                    sv_catpvs(msg, " in expansion of ");
15369                    Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
15370                                                                  name_len,
15371                                                                  name));
15372                }
15373
15374                (void) POPs;
15375                prop_definition = NULL;
15376            }
15377            else {
15378                SV * contents = POPs;
15379
15380                /* The contents is supposed to be the expansion of the property
15381                 * definition.  If the definition is deferrable, and we got an
15382                 * empty string back, set a flag to later defer it (after clean
15383                 * up below). */
15384                if (      deferrable
15385                    && (! SvPOK(contents) || SvCUR(contents) == 0))
15386                {
15387                        empty_return = TRUE;
15388                }
15389                else { /* Otherwise, call a function to check for valid syntax,
15390                          and handle it */
15391
15392                    prop_definition = handle_user_defined_property(
15393                                                    name, name_len,
15394                                                    is_utf8, to_fold, runtime,
15395                                                    deferrable,
15396                                                    contents, user_defined_ptr,
15397                                                    msg,
15398                                                    level);
15399                }
15400            }
15401
15402            /* Here, we have the results of the expansion.  Delete the
15403             * placeholder, and if the definition is now known, replace it with
15404             * that definition.  We need exclusive access to the hash, and we
15405             * can't let anyone else in, between when we delete the placeholder
15406             * and add the permanent entry */
15407            USER_PROP_MUTEX_LOCK;
15408
15409            S_delete_recursion_entry(aTHX_ SvPVX(key));
15410
15411            if (    ! empty_return
15412                && (! prop_definition || is_invlist(prop_definition)))
15413            {
15414                /* If we got success we use the inversion list defining the
15415                 * property; otherwise use the error message */
15416                SWITCH_TO_GLOBAL_CONTEXT;
15417                (void) hv_store_ent(PL_user_def_props,
15418                                    key,
15419                                    ((prop_definition)
15420                                     ? newSVsv(prop_definition)
15421                                     : newSVsv(msg)),
15422                                    0);
15423                RESTORE_CONTEXT;
15424            }
15425
15426            /* All done, and the hash now has a permanent entry for this
15427             * property.  Give up exclusive control */
15428            USER_PROP_MUTEX_UNLOCK;
15429
15430            FREETMPS;
15431            LEAVE;
15432            POPSTACK;
15433
15434            if (empty_return) {
15435                goto definition_deferred;
15436            }
15437
15438            if (prop_definition) {
15439
15440                /* If the definition is for something not known at this time,
15441                 * we toss it, and go return the main property name, as that's
15442                 * the one the user will be aware of */
15443                if (! is_invlist(prop_definition)) {
15444                    SvREFCNT_dec_NN(prop_definition);
15445                    goto definition_deferred;
15446                }
15447
15448                sv_2mortal(prop_definition);
15449            }
15450
15451            /* And return */
15452            return prop_definition;
15453
15454        }   /* End of calling the subroutine for the user-defined property */
15455    }       /* End of it could be a user-defined property */
15456
15457    /* Here it wasn't a user-defined property that is known at this time.  See
15458     * if it is a Unicode property */
15459
15460    lookup_len = j;     /* This is a more mnemonic name than 'j' */
15461
15462    /* Get the index into our pointer table of the inversion list corresponding
15463     * to the property */
15464    table_index = do_uniprop_match(lookup_name, lookup_len);
15465
15466    /* If it didn't find the property ... */
15467    if (table_index == 0) {
15468
15469        /* Try again stripping off any initial 'Is'.  This is because we
15470         * promise that an initial Is is optional.  The same isn't true of
15471         * names that start with 'In'.  Those can match only blocks, and the
15472         * lookup table already has those accounted for.  The lookup table also
15473         * has already accounted for Perl extensions (without and = sign)
15474         * starting with 'i's'. */
15475        if (starts_with_Is && equals_pos >= 0) {
15476            lookup_name += 2;
15477            lookup_len -= 2;
15478            equals_pos -= 2;
15479            slash_pos -= 2;
15480
15481            table_index = do_uniprop_match(lookup_name, lookup_len);
15482        }
15483
15484        if (table_index == 0) {
15485            char * canonical;
15486
15487            /* Here, we didn't find it.  If not a numeric type property, and
15488             * can't be a user-defined one, it isn't a legal property */
15489            if (! is_nv_type) {
15490                if (! could_be_user_defined) {
15491                    goto failed;
15492                }
15493
15494                /* Here, the property name is legal as a user-defined one.   At
15495                 * compile time, it might just be that the subroutine for that
15496                 * property hasn't been encountered yet, but at runtime, it's
15497                 * an error to try to use an undefined one */
15498                if (! deferrable) {
15499                    goto unknown_user_defined;;
15500                }
15501
15502                goto definition_deferred;
15503            } /* End of isn't a numeric type property */
15504
15505            /* The numeric type properties need more work to decide.  What we
15506             * do is make sure we have the number in canonical form and look
15507             * that up. */
15508
15509            if (slash_pos < 0) {    /* No slash */
15510
15511                /* When it isn't a rational, take the input, convert it to a
15512                 * NV, then create a canonical string representation of that
15513                 * NV. */
15514
15515                NV value;
15516                SSize_t value_len = lookup_len - equals_pos;
15517
15518                /* Get the value */
15519                if (   value_len <= 0
15520                    || my_atof3(lookup_name + equals_pos, &value,
15521                                value_len)
15522                          != lookup_name + lookup_len)
15523                {
15524                    goto failed;
15525                }
15526
15527                /* If the value is an integer, the canonical value is integral
15528                 * */
15529                if (Perl_ceil(value) == value) {
15530                    canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
15531                                            equals_pos, lookup_name, value);
15532                }
15533                else {  /* Otherwise, it is %e with a known precision */
15534                    char * exp_ptr;
15535
15536                    canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
15537                                                equals_pos, lookup_name,
15538                                                PL_E_FORMAT_PRECISION, value);
15539
15540                    /* The exponent generated is expecting two digits, whereas
15541                     * %e on some systems will generate three.  Remove leading
15542                     * zeros in excess of 2 from the exponent.  We start
15543                     * looking for them after the '=' */
15544                    exp_ptr = strchr(canonical + equals_pos, 'e');
15545                    if (exp_ptr) {
15546                        char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
15547                        SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
15548
15549                        assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
15550
15551                        if (excess_exponent_len > 0) {
15552                            SSize_t leading_zeros = strspn(cur_ptr, "0");
15553                            SSize_t excess_leading_zeros
15554                                    = MIN(leading_zeros, excess_exponent_len);
15555                            if (excess_leading_zeros > 0) {
15556                                Move(cur_ptr + excess_leading_zeros,
15557                                     cur_ptr,
15558                                     strlen(cur_ptr) - excess_leading_zeros
15559                                       + 1,  /* Copy the NUL as well */
15560                                     char);
15561                            }
15562                        }
15563                    }
15564                }
15565            }
15566            else {  /* Has a slash.  Create a rational in canonical form  */
15567                UV numerator, denominator, gcd, trial;
15568                const char * end_ptr;
15569                const char * sign = "";
15570
15571                /* We can't just find the numerator, denominator, and do the
15572                 * division, then use the method above, because that is
15573                 * inexact.  And the input could be a rational that is within
15574                 * epsilon (given our precision) of a valid rational, and would
15575                 * then incorrectly compare valid.
15576                 *
15577                 * We're only interested in the part after the '=' */
15578                const char * this_lookup_name = lookup_name + equals_pos;
15579                lookup_len -= equals_pos;
15580                slash_pos -= equals_pos;
15581
15582                /* Handle any leading minus */
15583                if (this_lookup_name[0] == '-') {
15584                    sign = "-";
15585                    this_lookup_name++;
15586                    lookup_len--;
15587                    slash_pos--;
15588                }
15589
15590                /* Convert the numerator to numeric */
15591                end_ptr = this_lookup_name + slash_pos;
15592                if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
15593                    goto failed;
15594                }
15595
15596                /* It better have included all characters before the slash */
15597                if (*end_ptr != '/') {
15598                    goto failed;
15599                }
15600
15601                /* Set to look at just the denominator */
15602                this_lookup_name += slash_pos;
15603                lookup_len -= slash_pos;
15604                end_ptr = this_lookup_name + lookup_len;
15605
15606                /* Convert the denominator to numeric */
15607                if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
15608                    goto failed;
15609                }
15610
15611                /* It better be the rest of the characters, and don't divide by
15612                 * 0 */
15613                if (   end_ptr != this_lookup_name + lookup_len
15614                    || denominator == 0)
15615                {
15616                    goto failed;
15617                }
15618
15619                /* Get the greatest common denominator using
15620                   http://en.wikipedia.org/wiki/Euclidean_algorithm */
15621                gcd = numerator;
15622                trial = denominator;
15623                while (trial != 0) {
15624                    UV temp = trial;
15625                    trial = gcd % trial;
15626                    gcd = temp;
15627                }
15628
15629                /* If already in lowest possible terms, we have already tried
15630                 * looking this up */
15631                if (gcd == 1) {
15632                    goto failed;
15633                }
15634
15635                /* Reduce the rational, which should put it in canonical form
15636                 * */
15637                numerator /= gcd;
15638                denominator /= gcd;
15639
15640                canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
15641                        equals_pos, lookup_name, sign, numerator, denominator);
15642            }
15643
15644            /* Here, we have the number in canonical form.  Try that */
15645            table_index = do_uniprop_match(canonical, strlen(canonical));
15646            if (table_index == 0) {
15647                goto failed;
15648            }
15649        }   /* End of still didn't find the property in our table */
15650    }       /* End of       didn't find the property in our table */
15651
15652    /* Here, we have a non-zero return, which is an index into a table of ptrs.
15653     * A negative return signifies that the real index is the absolute value,
15654     * but the result needs to be inverted */
15655    if (table_index < 0) {
15656        invert_return = TRUE;
15657        table_index = -table_index;
15658    }
15659
15660    /* Out-of band indices indicate a deprecated property.  The proper index is
15661     * modulo it with the table size.  And dividing by the table size yields
15662     * an offset into a table constructed by regen/mk_invlists.pl to contain
15663     * the corresponding warning message */
15664    if (table_index > MAX_UNI_KEYWORD_INDEX) {
15665        Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
15666        table_index %= MAX_UNI_KEYWORD_INDEX;
15667        Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
15668                "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
15669                (int) name_len, name,
15670                get_deprecated_property_msg(warning_offset));
15671    }
15672
15673    /* In a few properties, a different property is used under /i.  These are
15674     * unlikely to change, so are hard-coded here. */
15675    if (to_fold) {
15676        if (   table_index == UNI_XPOSIXUPPER
15677            || table_index == UNI_XPOSIXLOWER
15678            || table_index == UNI_TITLE)
15679        {
15680            table_index = UNI_CASED;
15681        }
15682        else if (   table_index == UNI_UPPERCASELETTER
15683                 || table_index == UNI_LOWERCASELETTER
15684#  ifdef UNI_TITLECASELETTER   /* Missing from early Unicodes */
15685                 || table_index == UNI_TITLECASELETTER
15686#  endif
15687        ) {
15688            table_index = UNI_CASEDLETTER;
15689        }
15690        else if (  table_index == UNI_POSIXUPPER
15691                || table_index == UNI_POSIXLOWER)
15692        {
15693            table_index = UNI_POSIXALPHA;
15694        }
15695    }
15696
15697    /* Create and return the inversion list */
15698    prop_definition = get_prop_definition(table_index);
15699    sv_2mortal(prop_definition);
15700
15701    /* See if there is a private use override to add to this definition */
15702    {
15703        COPHH * hinthash = (IN_PERL_COMPILETIME)
15704                           ? CopHINTHASH_get(&PL_compiling)
15705                           : CopHINTHASH_get(PL_curcop);
15706        SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
15707
15708        if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
15709
15710            /* See if there is an element in the hints hash for this table */
15711            SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
15712            const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
15713
15714            if (pos) {
15715                bool dummy;
15716                SV * pu_definition;
15717                SV * pu_invlist;
15718                SV * expanded_prop_definition =
15719                            sv_2mortal(invlist_clone(prop_definition, NULL));
15720
15721                /* If so, it's definition is the string from here to the next
15722                 * \a character.  And its format is the same as a user-defined
15723                 * property */
15724                pos += SvCUR(pu_lookup);
15725                pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
15726                pu_invlist = handle_user_defined_property(lookup_name,
15727                                                          lookup_len,
15728                                                          0, /* Not UTF-8 */
15729                                                          0, /* Not folded */
15730                                                          runtime,
15731                                                          deferrable,
15732                                                          pu_definition,
15733                                                          &dummy,
15734                                                          msg,
15735                                                          level);
15736                if (TAINT_get) {
15737                    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15738                    sv_catpvs(msg, "Insecure private-use override");
15739                    goto append_name_to_msg;
15740                }
15741
15742                /* For now, as a safety measure, make sure that it doesn't
15743                 * override non-private use code points */
15744                _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
15745
15746                /* Add it to the list to be returned */
15747                _invlist_union(prop_definition, pu_invlist,
15748                               &expanded_prop_definition);
15749                prop_definition = expanded_prop_definition;
15750                Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
15751            }
15752        }
15753    }
15754
15755    if (invert_return) {
15756        _invlist_invert(prop_definition);
15757    }
15758    return prop_definition;
15759
15760  unknown_user_defined:
15761    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15762    sv_catpvs(msg, "Unknown user-defined property name");
15763    goto append_name_to_msg;
15764
15765  failed:
15766    if (non_pkg_begin != 0) {
15767        if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15768        sv_catpvs(msg, "Illegal user-defined property name");
15769    }
15770    else {
15771        if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15772        sv_catpvs(msg, "Can't find Unicode property definition");
15773    }
15774    /* FALLTHROUGH */
15775
15776  append_name_to_msg:
15777    {
15778        const char * prefix = (runtime && level == 0) ?  " \\p{" : " \"";
15779        const char * suffix = (runtime && level == 0) ?  "}" : "\"";
15780
15781        sv_catpv(msg, prefix);
15782        Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
15783        sv_catpv(msg, suffix);
15784    }
15785
15786    return NULL;
15787
15788  definition_deferred:
15789
15790    {
15791        bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
15792
15793        /* Here it could yet to be defined, so defer evaluation of this until
15794         * its needed at runtime.  We need the fully qualified property name to
15795         * avoid ambiguity */
15796        if (! fq_name) {
15797            fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15798                                                                is_qualified);
15799        }
15800
15801        /* If it didn't come with a package, or the package is utf8::, this
15802         * actually could be an official Unicode property whose inclusion we
15803         * are deferring until runtime to make sure that it isn't overridden by
15804         * a user-defined property of the same name (which we haven't
15805         * encountered yet).  Add a marker to indicate this possibility, for
15806         * use at such time when we first need the definition during pattern
15807         * matching execution */
15808        if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
15809            sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
15810        }
15811
15812        /* We also need a trailing newline */
15813        sv_catpvs(fq_name, "\n");
15814
15815        *user_defined_ptr = TRUE;
15816        return fq_name;
15817    }
15818}
15819
15820STATIC bool
15821S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
15822                              const STRLEN wname_len, /* Its length */
15823                              SV ** prop_definition,
15824                              AV ** strings)
15825{
15826    /* Deal with Name property wildcard subpatterns; returns TRUE if there were
15827     * any matches, adding them to prop_definition */
15828
15829    dSP;
15830
15831    CV * get_names_info;        /* entry to charnames.pm to get info we need */
15832    SV * names_string;          /* Contains all character names, except algo */
15833    SV * algorithmic_names;     /* Contains info about algorithmically
15834                                   generated character names */
15835    REGEXP * subpattern_re;     /* The user's pattern to match with */
15836    struct regexp * prog;       /* The compiled pattern */
15837    char * all_names_start;     /* lib/unicore/Name.pl string of every
15838                                   (non-algorithmic) character name */
15839    char * cur_pos;             /* We match, effectively using /gc; this is
15840                                   where we are now */
15841    bool found_matches = FALSE; /* Did any name match so far? */
15842    SV * empty;                 /* For matching zero length names */
15843    SV * must_sv;               /* Contains the substring, if any, that must be
15844                                   in a name for the subpattern to match */
15845    const char * must;          /* The PV of 'must' */
15846    STRLEN must_len;            /* And its length */
15847    SV * syllable_name = NULL;  /* For Hangul syllables */
15848    const char hangul_prefix[] = "HANGUL SYLLABLE ";
15849    const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
15850
15851    /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
15852     * syllable name, and these are immutable and guaranteed by the Unicode
15853     * standard to never be extended */
15854    const STRLEN syl_max_len = hangul_prefix_len + 7;
15855
15856    IV i;
15857
15858    PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
15859
15860    /* Make sure _charnames is loaded.  (The parameters give context
15861     * for any errors generated */
15862    get_names_info = get_cv("_charnames::_get_names_info", 0);
15863    if (! get_names_info) {
15864        Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
15865    }
15866
15867    /* Get the charnames data */
15868    PUSHSTACKi(PERLSI_REGCOMP);
15869    ENTER ;
15870    SAVETMPS;
15871    save_re_context();
15872
15873    PUSHMARK(SP) ;
15874    PUTBACK;
15875
15876    /* Special _charnames entry point that returns the info this routine
15877     * requires */
15878    call_sv(MUTABLE_SV(get_names_info), G_LIST);
15879
15880    SPAGAIN ;
15881
15882    /* Data structure for names which end in their very own code points */
15883    algorithmic_names = POPs;
15884    SvREFCNT_inc_simple_void_NN(algorithmic_names);
15885
15886    /* The lib/unicore/Name.pl string */
15887    names_string = POPs;
15888    SvREFCNT_inc_simple_void_NN(names_string);
15889
15890    PUTBACK ;
15891    FREETMPS ;
15892    LEAVE ;
15893    POPSTACK;
15894
15895    if (   ! SvROK(names_string)
15896        || ! SvROK(algorithmic_names))
15897    {   /* Perhaps should panic instead XXX */
15898        SvREFCNT_dec(names_string);
15899        SvREFCNT_dec(algorithmic_names);
15900        return FALSE;
15901    }
15902
15903    names_string = sv_2mortal(SvRV(names_string));
15904    all_names_start = SvPVX(names_string);
15905    cur_pos = all_names_start;
15906
15907    algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
15908
15909    /* Compile the subpattern consisting of the name being looked for */
15910    subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
15911
15912    must_sv = re_intuit_string(subpattern_re);
15913    if (must_sv) {
15914        /* regexec.c can free the re_intuit_string() return. GH #17734 */
15915        must_sv = sv_2mortal(newSVsv(must_sv));
15916        must = SvPV(must_sv, must_len);
15917    }
15918    else {
15919        must = "";
15920        must_len = 0;
15921    }
15922
15923    /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
15924     * This works because the NUL causes the function to return early, thus
15925     * showing that there are characters in it other than the acceptable ones,
15926     * which is our desired result.) */
15927
15928    prog = ReANY(subpattern_re);
15929
15930    /* If only nothing is matched, skip to where empty names are looked for */
15931    if (prog->maxlen == 0) {
15932        goto check_empty;
15933    }
15934
15935    /* And match against the string of all names /gc.  Don't even try if it
15936     * must match a character not found in any name. */
15937    if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
15938    {
15939        while (execute_wildcard(subpattern_re,
15940                                cur_pos,
15941                                SvEND(names_string),
15942                                all_names_start, 0,
15943                                names_string,
15944                                0))
15945        { /* Here, matched. */
15946
15947            /* Note the string entries look like
15948             *      00001\nSTART OF HEADING\n\n
15949             * so we could match anywhere in that string.  We have to rule out
15950             * matching a code point line */
15951            char * this_name_start = all_names_start
15952                                                + RX_OFFS_START(subpattern_re,0);
15953            char * this_name_end   = all_names_start
15954                                                + RX_OFFS_END(subpattern_re,0);
15955            char * cp_start;
15956            char * cp_end;
15957            UV cp = 0;      /* Silences some compilers */
15958            AV * this_string = NULL;
15959            bool is_multi = FALSE;
15960
15961            /* If matched nothing, advance to next possible match */
15962            if (this_name_start == this_name_end) {
15963                cur_pos = (char *) memchr(this_name_end + 1, '\n',
15964                                          SvEND(names_string) - this_name_end);
15965                if (cur_pos == NULL) {
15966                    break;
15967                }
15968            }
15969            else {
15970                /* Position the next match to start beyond the current returned
15971                 * entry */
15972                cur_pos = (char *) memchr(this_name_end, '\n',
15973                                          SvEND(names_string) - this_name_end);
15974            }
15975
15976            /* Back up to the \n just before the beginning of the character. */
15977            cp_end = (char *) my_memrchr(all_names_start,
15978                                         '\n',
15979                                         this_name_start - all_names_start);
15980
15981            /* If we didn't find a \n, it means it matched somewhere in the
15982             * initial '00000' in the string, so isn't a real match */
15983            if (cp_end == NULL) {
15984                continue;
15985            }
15986
15987            this_name_start = cp_end + 1;   /* The name starts just after */
15988            cp_end--;                       /* the \n, and the code point */
15989                                            /* ends just before it */
15990
15991            /* All code points are 5 digits long */
15992            cp_start = cp_end - 4;
15993
15994            /* This shouldn't happen, as we found a \n, and the first \n is
15995             * further along than what we subtracted */
15996            assert(cp_start >= all_names_start);
15997
15998            if (cp_start == all_names_start) {
15999                *prop_definition = add_cp_to_invlist(*prop_definition, 0);
16000                continue;
16001            }
16002
16003            /* If the character is a blank, we either have a named sequence, or
16004             * something is wrong */
16005            if (*(cp_start - 1) == ' ') {
16006                cp_start = (char *) my_memrchr(all_names_start,
16007                                               '\n',
16008                                               cp_start - all_names_start);
16009                cp_start++;
16010            }
16011
16012            assert(cp_start != NULL && cp_start >= all_names_start + 2);
16013
16014            /* Except for the first line in the string, the sequence before the
16015             * code point is \n\n.  If that isn't the case here, we didn't
16016             * match the name of a character.  (We could have matched a named
16017             * sequence, not currently handled */
16018            if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
16019                continue;
16020            }
16021
16022            /* We matched!  Add this to the list */
16023            found_matches = TRUE;
16024
16025            /* Loop through all the code points in the sequence */
16026            while (cp_start < cp_end) {
16027
16028                /* Calculate this code point from its 5 digits */
16029                cp = (XDIGIT_VALUE(cp_start[0]) << 16)
16030                   + (XDIGIT_VALUE(cp_start[1]) << 12)
16031                   + (XDIGIT_VALUE(cp_start[2]) << 8)
16032                   + (XDIGIT_VALUE(cp_start[3]) << 4)
16033                   +  XDIGIT_VALUE(cp_start[4]);
16034
16035                cp_start += 6;  /* Go past any blank */
16036
16037                if (cp_start < cp_end || is_multi) {
16038                    if (this_string == NULL) {
16039                        this_string = newAV();
16040                    }
16041
16042                    is_multi = TRUE;
16043                    av_push_simple(this_string, newSVuv(cp));
16044                }
16045            }
16046
16047            if (is_multi) { /* Was more than one code point */
16048                if (*strings == NULL) {
16049                    *strings = newAV();
16050                }
16051
16052                av_push_simple(*strings, (SV *) this_string);
16053            }
16054            else {  /* Only a single code point */
16055                *prop_definition = add_cp_to_invlist(*prop_definition, cp);
16056            }
16057        } /* End of loop through the non-algorithmic names string */
16058    }
16059
16060    /* There are also character names not in 'names_string'.  These are
16061     * algorithmically generatable.  Try this pattern on each possible one.
16062     * (khw originally planned to leave this out given the large number of
16063     * matches attempted; but the speed turned out to be quite acceptable
16064     *
16065     * There are plenty of opportunities to optimize to skip many of the tests.
16066     * beyond the rudimentary ones already here */
16067
16068    /* First see if the subpattern matches any of the algorithmic generatable
16069     * Hangul syllable names.
16070     *
16071     * We know none of these syllable names will match if the input pattern
16072     * requires more bytes than any syllable has, or if the input pattern only
16073     * matches an empty name, or if the pattern has something it must match and
16074     * one of the characters in that isn't in any Hangul syllable. */
16075    if (    prog->minlen <= (SSize_t) syl_max_len
16076        &&  prog->maxlen > 0
16077        && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
16078    {
16079        /* These constants, names, values, and algorithm are adapted from the
16080         * Unicode standard, version 5.1, section 3.12, and should never
16081         * change. */
16082        const char * JamoL[] = {
16083            "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
16084            "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
16085        };
16086        const int LCount = C_ARRAY_LENGTH(JamoL);
16087
16088        const char * JamoV[] = {
16089            "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
16090            "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
16091            "I"
16092        };
16093        const int VCount = C_ARRAY_LENGTH(JamoV);
16094
16095        const char * JamoT[] = {
16096            "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
16097            "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
16098            "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
16099        };
16100        const int TCount = C_ARRAY_LENGTH(JamoT);
16101
16102        int L, V, T;
16103
16104        /* This is the initial Hangul syllable code point; each time through the
16105         * inner loop, it maps to the next higher code point.  For more info,
16106         * see the Hangul syllable section of the Unicode standard. */
16107        int cp = 0xAC00;
16108
16109        syllable_name = sv_2mortal(newSV(syl_max_len));
16110        sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
16111
16112        for (L = 0; L < LCount; L++) {
16113            for (V = 0; V < VCount; V++) {
16114                for (T = 0; T < TCount; T++) {
16115
16116                    /* Truncate back to the prefix, which is unvarying */
16117                    SvCUR_set(syllable_name, hangul_prefix_len);
16118
16119                    sv_catpv(syllable_name, JamoL[L]);
16120                    sv_catpv(syllable_name, JamoV[V]);
16121                    sv_catpv(syllable_name, JamoT[T]);
16122
16123                    if (execute_wildcard(subpattern_re,
16124                                SvPVX(syllable_name),
16125                                SvEND(syllable_name),
16126                                SvPVX(syllable_name), 0,
16127                                syllable_name,
16128                                0))
16129                    {
16130                        *prop_definition = add_cp_to_invlist(*prop_definition,
16131                                                             cp);
16132                        found_matches = TRUE;
16133                    }
16134
16135                    cp++;
16136                }
16137            }
16138        }
16139    }
16140
16141    /* The rest of the algorithmically generatable names are of the form
16142     * "PREFIX-code_point".  The prefixes and the code point limits of each
16143     * were returned to us in the array 'algorithmic_names' from data in
16144     * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
16145    for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
16146        IV j;
16147
16148        /* Each element of the array is a hash, giving the details for the
16149         * series of names it covers.  There is the base name of the characters
16150         * in the series, and the low and high code points in the series.  And,
16151         * for optimization purposes a string containing all the legal
16152         * characters that could possibly be in a name in this series. */
16153        HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
16154        SV * prefix = * hv_fetchs(this_series, "name", 0);
16155        IV low = SvIV(* hv_fetchs(this_series, "low", 0));
16156        IV high = SvIV(* hv_fetchs(this_series, "high", 0));
16157        char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
16158
16159        /* Pre-allocate an SV with enough space */
16160        SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
16161                                                        SvPVX(prefix)));
16162        if (high >= 0x10000) {
16163            sv_catpvs(algo_name, "0");
16164        }
16165
16166        /* This series can be skipped entirely if the pattern requires
16167         * something longer than any name in the series, or can only match an
16168         * empty name, or contains a character not found in any name in the
16169         * series */
16170        if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
16171            &&  prog->maxlen > 0
16172            && (strspn(must, legal) == must_len))
16173        {
16174            for (j = low; j <= high; j++) { /* For each code point in the series */
16175
16176                /* Get its name, and see if it matches the subpattern */
16177                Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
16178                                     (unsigned) j);
16179
16180                if (execute_wildcard(subpattern_re,
16181                                    SvPVX(algo_name),
16182                                    SvEND(algo_name),
16183                                    SvPVX(algo_name), 0,
16184                                    algo_name,
16185                                    0))
16186                {
16187                    *prop_definition = add_cp_to_invlist(*prop_definition, j);
16188                    found_matches = TRUE;
16189                }
16190            }
16191        }
16192    }
16193
16194  check_empty:
16195    /* Finally, see if the subpattern matches an empty string */
16196    empty = newSVpvs("");
16197    if (execute_wildcard(subpattern_re,
16198                         SvPVX(empty),
16199                         SvEND(empty),
16200                         SvPVX(empty), 0,
16201                         empty,
16202                         0))
16203    {
16204        /* Many code points have empty names.  Currently these are the \p{GC=C}
16205         * ones, minus CC and CF */
16206
16207        SV * empty_names_ref = get_prop_definition(UNI_C);
16208        SV * empty_names = invlist_clone(empty_names_ref, NULL);
16209
16210        SV * subtract = get_prop_definition(UNI_CC);
16211
16212        _invlist_subtract(empty_names, subtract, &empty_names);
16213        SvREFCNT_dec_NN(empty_names_ref);
16214        SvREFCNT_dec_NN(subtract);
16215
16216        subtract = get_prop_definition(UNI_CF);
16217        _invlist_subtract(empty_names, subtract, &empty_names);
16218        SvREFCNT_dec_NN(subtract);
16219
16220        _invlist_union(*prop_definition, empty_names, prop_definition);
16221        found_matches = TRUE;
16222        SvREFCNT_dec_NN(empty_names);
16223    }
16224    SvREFCNT_dec_NN(empty);
16225
16226#if 0
16227    /* If we ever were to accept aliases for, say private use names, we would
16228     * need to do something fancier to find empty names.  The code below works
16229     * (at the time it was written), and is slower than the above */
16230    const char empties_pat[] = "^.";
16231    if (strNE(name, empties_pat)) {
16232        SV * empty = newSVpvs("");
16233        if (execute_wildcard(subpattern_re,
16234                    SvPVX(empty),
16235                    SvEND(empty),
16236                    SvPVX(empty), 0,
16237                    empty,
16238                    0))
16239        {
16240            SV * empties = NULL;
16241
16242            (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
16243
16244            _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
16245            SvREFCNT_dec_NN(empties);
16246
16247            found_matches = TRUE;
16248        }
16249        SvREFCNT_dec_NN(empty);
16250    }
16251#endif
16252
16253    SvREFCNT_dec_NN(subpattern_re);
16254    return found_matches;
16255}
16256
16257/*
16258 * ex: set ts=8 sts=4 sw=4 et:
16259 */
16260