1/*    inline.h
2 *
3 *    Copyright (C) 2012 by Larry Wall and others
4 *
5 *    You may distribute under the terms of either the GNU General Public
6 *    License or the Artistic License, as specified in the README file.
7 *
8 *    This file contains tables and code adapted from
9 *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10 *    copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
36 *
37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38 * whose details should be exposed to the compiler, for such things as tail
39 * call optimization.
40 *
41 * Each section names the header file that the functions "belong" to.
42 */
43
44/* ------------------------------- av.h ------------------------------- */
45
46/*
47=for apidoc_section $AV
48=for apidoc av_count
49Returns the number of elements in the array C<av>.  This is the true length of
50the array, including any undefined elements.  It is always the same as
51S<C<av_top_index(av) + 1>>.
52
53=cut
54*/
55PERL_STATIC_INLINE Size_t
56Perl_av_count(pTHX_ AV *av)
57{
58    PERL_ARGS_ASSERT_AV_COUNT;
59    assert(SvTYPE(av) == SVt_PVAV);
60
61    return AvFILL(av) + 1;
62}
63
64/* ------------------------------- av.c ------------------------------- */
65
66/*
67=for apidoc av_store_simple
68
69This is a cut-down version of av_store that assumes that the array is
70very straightforward - no magic, not readonly, and AvREAL - and that
71C<key> is not negative. This function MUST NOT be used in situations
72where any of those assumptions may not hold.
73
74Stores an SV in an array.  The array index is specified as C<key>. It
75can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76
77Note that the caller is responsible for suitably incrementing the reference
78count of C<val> before the call.
79
80Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81
82=cut
83*/
84
85PERL_STATIC_INLINE SV**
86Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87{
88    SV** ary;
89
90    PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91    assert(SvTYPE(av) == SVt_PVAV);
92    assert(!SvMAGICAL(av));
93    assert(!SvREADONLY(av));
94    assert(AvREAL(av));
95    assert(key > -1);
96
97    ary = AvARRAY(av);
98
99    if (AvFILLp(av) < key) {
100        if (key > AvMAX(av)) {
101            av_extend(av,key);
102            ary = AvARRAY(av);
103        }
104        AvFILLp(av) = key;
105    } else
106        SvREFCNT_dec(ary[key]);
107
108    ary[key] = val;
109    return &ary[key];
110}
111
112/*
113=for apidoc av_fetch_simple
114
115This is a cut-down version of av_fetch that assumes that the array is
116very straightforward - no magic, not readonly, and AvREAL - and that
117C<key> is not negative. This function MUST NOT be used in situations
118where any of those assumptions may not hold.
119
120Returns the SV at the specified index in the array.  The C<key> is the
121index.  If lval is true, you are guaranteed to get a real SV back (in case
122it wasn't real before), which you can then modify.  Check that the return
123value is non-null before dereferencing it to a C<SV*>.
124
125The rough perl equivalent is C<$myarray[$key]>.
126
127=cut
128*/
129
130PERL_STATIC_INLINE SV**
131Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132{
133    PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134    assert(SvTYPE(av) == SVt_PVAV);
135    assert(!SvMAGICAL(av));
136    assert(!SvREADONLY(av));
137    assert(AvREAL(av));
138    assert(key > -1);
139
140    if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
141        return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
142    } else {
143        return &AvARRAY(av)[key];
144    }
145}
146
147/*
148=for apidoc av_push_simple
149
150This is a cut-down version of av_push that assumes that the array is very
151straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152not less than -1. This function MUST NOT be used in situations where any
153of those assumptions may not hold.
154
155Pushes an SV (transferring control of one reference count) onto the end of the
156array.  The array will grow automatically to accommodate the addition.
157
158Perl equivalent: C<push @myarray, $val;>.
159
160=cut
161*/
162
163PERL_STATIC_INLINE void
164Perl_av_push_simple(pTHX_ AV *av, SV *val)
165{
166    PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167    assert(SvTYPE(av) == SVt_PVAV);
168    assert(!SvMAGICAL(av));
169    assert(!SvREADONLY(av));
170    assert(AvREAL(av));
171    assert(AvFILLp(av) > -2);
172
173    (void)av_store_simple(av,AvFILLp(av)+1,val);
174}
175
176/*
177=for apidoc av_new_alloc
178
179This implements L<perlapi/C<newAV_alloc_x>>
180and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181functionality.
182
183Creates a new AV and allocates its SV* array.
184
185This is similar to, but more efficient than doing:
186
187    AV *av = newAV();
188    av_extend(av, key);
189
190The size parameter is used to pre-allocate a SV* array large enough to
191hold at least elements C<0..(size-1)>.  C<size> must be at least 1.
192
193The C<zeroflag> parameter controls whether or not the array is NULL
194initialized.
195
196=cut
197*/
198
199PERL_STATIC_INLINE AV *
200Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201{
202    AV * const av = newAV();
203    SV** ary;
204    PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205    assert(size > 0);
206
207    Newx(ary, size, SV*); /* Newx performs the memwrap check */
208    AvALLOC(av) = ary;
209    AvARRAY(av) = ary;
210    AvMAX(av) = size - 1;
211
212    if (zeroflag)
213        Zero(ary, size, SV*);
214
215    return av;
216}
217
218
219/* ------------------------------- cv.h ------------------------------- */
220
221/*
222=for apidoc_section $CV
223=for apidoc CvGV
224Returns the GV associated with the CV C<sv>, reifying it if necessary.
225
226=cut
227*/
228PERL_STATIC_INLINE GV *
229Perl_CvGV(pTHX_ CV *sv)
230{
231    PERL_ARGS_ASSERT_CVGV;
232
233    return CvNAMED(sv)
234        ? Perl_cvgv_from_hek(aTHX_ sv)
235        : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
236}
237
238/*
239=for apidoc CvDEPTH
240Returns the recursion level of the CV C<sv>.  Hence >= 2 indicates we are in a
241recursive call.
242
243=cut
244*/
245PERL_STATIC_INLINE I32 *
246Perl_CvDEPTH(const CV * const sv)
247{
248    PERL_ARGS_ASSERT_CVDEPTH;
249    assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
250
251    return &((XPVCV*)SvANY(sv))->xcv_depth;
252}
253
254/*
255 CvPROTO returns the prototype as stored, which is not necessarily what
256 the interpreter should be using. Specifically, the interpreter assumes
257 that spaces have been stripped, which has been the case if the prototype
258 was added by toke.c, but is generally not the case if it was added elsewhere.
259 Since we can't enforce the spacelessness at assignment time, this routine
260 provides a temporary copy at parse time with spaces removed.
261 I<orig> is the start of the original buffer, I<len> is the length of the
262 prototype and will be updated when this returns.
263 */
264
265#ifdef PERL_CORE
266PERL_STATIC_INLINE char *
267S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
268{
269    SV * tmpsv;
270    char * tmps;
271    tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
272    tmps = SvPVX(tmpsv);
273    while ((*len)--) {
274        if (!isSPACE(*orig))
275            *tmps++ = *orig;
276        orig++;
277    }
278    *tmps = '\0';
279    *len = tmps - SvPVX(tmpsv);
280                return SvPVX(tmpsv);
281}
282#endif
283
284/* ------------------------------- iperlsys.h ------------------------------- */
285#if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS)
286
287/* Otherwise this function is implemented as macros in iperlsys.h */
288
289PERL_STATIC_INLINE bool
290S_PerlEnv_putenv(pTHX_ char * str)
291{
292    PERL_ARGS_ASSERT_PERLENV_PUTENV;
293
294    ENV_LOCK;
295    bool retval = putenv(str);
296    ENV_UNLOCK;
297
298    return retval;
299}
300
301#endif
302
303/* ------------------------------- mg.h ------------------------------- */
304
305#if defined(PERL_CORE) || defined(PERL_EXT)
306/* assumes get-magic and stringification have already occurred */
307PERL_STATIC_INLINE STRLEN
308S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
309{
310    assert(mg->mg_type == PERL_MAGIC_regex_global);
311    assert(mg->mg_len != -1);
312    if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
313        return (STRLEN)mg->mg_len;
314    else {
315        const STRLEN pos = (STRLEN)mg->mg_len;
316        /* Without this check, we may read past the end of the buffer: */
317        if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
318        return sv_or_pv_pos_u2b(sv, s, pos, NULL);
319    }
320}
321#endif
322
323/* ------------------------------- pad.h ------------------------------ */
324
325#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
326PERL_STATIC_INLINE bool
327S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
328{
329    PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
330
331    /* is seq within the range _LOW to _HIGH ?
332     * This is complicated by the fact that PL_cop_seqmax
333     * may have wrapped around at some point */
334    if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
335        return FALSE; /* not yet introduced */
336
337    if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
338    /* in compiling scope */
339        if (
340            (seq >  COP_SEQ_RANGE_LOW(pn))
341            ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
342            : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
343        )
344            return TRUE;
345    }
346    else if (
347        (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
348        ?
349            (  seq >  COP_SEQ_RANGE_LOW(pn)
350            || seq <= COP_SEQ_RANGE_HIGH(pn))
351
352        :    (  seq >  COP_SEQ_RANGE_LOW(pn)
353             && seq <= COP_SEQ_RANGE_HIGH(pn))
354    )
355        return TRUE;
356    return FALSE;
357}
358#endif
359
360/* ------------------------------- pp.h ------------------------------- */
361
362PERL_STATIC_INLINE I32
363Perl_TOPMARK(pTHX)
364{
365    DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
366                                 "MARK top  %p %" IVdf "\n",
367                                  PL_markstack_ptr,
368                                  (IV)*PL_markstack_ptr)));
369    return *PL_markstack_ptr;
370}
371
372PERL_STATIC_INLINE I32
373Perl_POPMARK(pTHX)
374{
375    DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
376                                 "MARK pop  %p %" IVdf "\n",
377                                  (PL_markstack_ptr-1),
378                                  (IV)*(PL_markstack_ptr-1))));
379    assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
380    return *PL_markstack_ptr--;
381}
382
383/* ----------------------------- regexp.h ----------------------------- */
384
385/* PVLVs need to act as a superset of all scalar types - they are basically
386 * PVMGs with a few extra fields.
387 * REGEXPs are first class scalars, but have many fields that can't be copied
388 * into a PVLV body.
389 *
390 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
391 * back to the original body. To avoid increasing the size of PVLVs just for the
392 * rare case of REGEXP assignment, this pointer is stored in the memory usually
393 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
394 * read the pointer from the two possible locations. The macro SvLEN() wraps the
395 * access to the union's member xpvlenu_len, but there is no equivalent macro
396 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
397 *
398 * See commit df6b4bd56551f2d3 for more details. */
399
400PERL_STATIC_INLINE struct regexp *
401Perl_ReANY(const REGEXP * const re)
402{
403    XPV* const p = (XPV*)SvANY(re);
404
405    PERL_ARGS_ASSERT_REANY;
406    assert(isREGEXP(re));
407
408    return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
409                                   : (struct regexp *)p;
410}
411
412/* ------------------------------- utf8.h ------------------------------- */
413
414/*
415=for apidoc_section $unicode
416*/
417
418PERL_STATIC_INLINE void
419Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
420{
421    /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
422     * encoded string at '*dest', updating '*dest' to include it */
423
424    PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
425
426    if (NATIVE_BYTE_IS_INVARIANT(byte))
427        *((*dest)++) = byte;
428    else {
429        *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
430        *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
431    }
432}
433
434/*
435=for apidoc valid_utf8_to_uvchr
436Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
437known that the next character in the input UTF-8 string C<s> is well-formed
438(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
439points, and non-Unicode code points are allowed.
440
441=cut
442
443 */
444
445PERL_STATIC_INLINE UV
446Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
447{
448    const UV expectlen = UTF8SKIP(s);
449    const U8* send = s + expectlen;
450    UV uv = *s;
451
452    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
453
454    if (retlen) {
455        *retlen = expectlen;
456    }
457
458    /* An invariant is trivially returned */
459    if (expectlen == 1) {
460        return uv;
461    }
462
463    /* Remove the leading bits that indicate the number of bytes, leaving just
464     * the bits that are part of the value */
465    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
466
467    /* Now, loop through the remaining bytes, accumulating each into the
468     * working total as we go.  (I khw tried unrolling the loop for up to 4
469     * bytes, but there was no performance improvement) */
470    for (++s; s < send; s++) {
471        uv = UTF8_ACCUMULATE(uv, *s);
472    }
473
474    return UNI_TO_NATIVE(uv);
475
476}
477
478/*
479=for apidoc is_utf8_invariant_string
480
481Returns TRUE if the first C<len> bytes of the string C<s> are the same
482regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
483EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
484are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
485the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
486characters are invariant, but so also are the C1 controls.
487
488If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
489use this option, that C<s> can't have embedded C<NUL> characters and has to
490have a terminating C<NUL> byte).
491
492See also
493C<L</is_utf8_string>>,
494C<L</is_utf8_string_flags>>,
495C<L</is_utf8_string_loc>>,
496C<L</is_utf8_string_loc_flags>>,
497C<L</is_utf8_string_loclen>>,
498C<L</is_utf8_string_loclen_flags>>,
499C<L</is_utf8_fixed_width_buf_flags>>,
500C<L</is_utf8_fixed_width_buf_loc_flags>>,
501C<L</is_utf8_fixed_width_buf_loclen_flags>>,
502C<L</is_strict_utf8_string>>,
503C<L</is_strict_utf8_string_loc>>,
504C<L</is_strict_utf8_string_loclen>>,
505C<L</is_c9strict_utf8_string>>,
506C<L</is_c9strict_utf8_string_loc>>,
507and
508C<L</is_c9strict_utf8_string_loclen>>.
509
510=cut
511
512*/
513
514#define is_utf8_invariant_string(s, len)                                    \
515                                is_utf8_invariant_string_loc(s, len, NULL)
516
517/*
518=for apidoc is_utf8_invariant_string_loc
519
520Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
521the first UTF-8 variant character in the C<ep> pointer; if all characters are
522UTF-8 invariant, this function does not change the contents of C<*ep>.
523
524=cut
525
526*/
527
528PERL_STATIC_INLINE bool
529Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
530{
531    const U8* send;
532    const U8* x = s;
533
534    PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
535
536    if (len == 0) {
537        len = strlen((const char *)s);
538    }
539
540    send = s + len;
541
542/* This looks like 0x010101... */
543#  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
544
545/* This looks like 0x808080... */
546#  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
547#  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
548#  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
549
550/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
551 * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
552 * optimized out completely on a 32-bit system, and its mask gets optimized out
553 * on a 64-bit system */
554#  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
555                                      |   (  PTR2nat(x) >> 1)                 \
556                                      | ( ( (PTR2nat(x)                       \
557                                           & PERL_WORD_BOUNDARY_MASK) >> 2))))
558
559#ifndef EBCDIC
560
561    /* Do the word-at-a-time iff there is at least one usable full word.  That
562     * means that after advancing to a word boundary, there still is at least a
563     * full word left.  The number of bytes needed to advance is 'wordsize -
564     * offset' unless offset is 0. */
565    if ((STRLEN) (send - x) >= PERL_WORDSIZE
566
567                            /* This term is wordsize if subword; 0 if not */
568                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
569
570                            /* 'offset' */
571                          - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
572    {
573
574        /* Process per-byte until reach word boundary.  XXX This loop could be
575         * eliminated if we knew that this platform had fast unaligned reads */
576        while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
577            if (! UTF8_IS_INVARIANT(*x)) {
578                if (ep) {
579                    *ep = x;
580                }
581
582                return FALSE;
583            }
584            x++;
585        }
586
587        /* Here, we know we have at least one full word to process.  Process
588         * per-word as long as we have at least a full word left */
589        do {
590            if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
591
592                /* Found a variant.  Just return if caller doesn't want its
593                 * exact position */
594                if (! ep) {
595                    return FALSE;
596                }
597
598#  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
599     || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
600
601                *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x);
602                assert(*ep >= s && *ep < send);
603
604                return FALSE;
605
606#  else   /* If weird byte order, drop into next loop to do byte-at-a-time
607           checks. */
608
609                break;
610#  endif
611            }
612
613            x += PERL_WORDSIZE;
614
615        } while (x + PERL_WORDSIZE <= send);
616    }
617
618#endif      /* End of ! EBCDIC */
619
620    /* Process per-byte */
621    while (x < send) {
622        if (! UTF8_IS_INVARIANT(*x)) {
623            if (ep) {
624                *ep = x;
625            }
626
627            return FALSE;
628        }
629
630        x++;
631    }
632
633    return TRUE;
634}
635
636/* See if the platform has builtins for finding the most/least significant bit,
637 * and which one is right for using on 32 and 64 bit operands */
638#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
639#  if U32SIZE == INTSIZE
640#    define PERL_CLZ_32 __builtin_clz
641#  endif
642#  if defined(U64TYPE) && U64SIZE == INTSIZE
643#    define PERL_CLZ_64 __builtin_clz
644#  endif
645#endif
646#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
647#  if U32SIZE == INTSIZE
648#    define PERL_CTZ_32 __builtin_ctz
649#  endif
650#  if defined(U64TYPE) && U64SIZE == INTSIZE
651#    define PERL_CTZ_64 __builtin_ctz
652#  endif
653#endif
654
655#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
656#  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
657#    define PERL_CLZ_32 __builtin_clzl
658#  endif
659#  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
660#    define PERL_CLZ_64 __builtin_clzl
661#  endif
662#endif
663#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
664#  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
665#    define PERL_CTZ_32 __builtin_ctzl
666#  endif
667#  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
668#    define PERL_CTZ_64 __builtin_ctzl
669#  endif
670#endif
671
672#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
673#  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
674#    define PERL_CLZ_32 __builtin_clzll
675#  endif
676#  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
677#    define PERL_CLZ_64 __builtin_clzll
678#  endif
679#endif
680#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
681#  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
682#    define PERL_CTZ_32 __builtin_ctzll
683#  endif
684#  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
685#    define PERL_CTZ_64 __builtin_ctzll
686#  endif
687#endif
688
689#if defined(_MSC_VER)
690#  include <intrin.h>
691#  pragma intrinsic(_BitScanForward)
692#  pragma intrinsic(_BitScanReverse)
693#  ifdef _WIN64
694#    pragma intrinsic(_BitScanForward64)
695#    pragma intrinsic(_BitScanReverse64)
696#  endif
697#endif
698
699/* The reason there are not checks to see if ffs() and ffsl() are available for
700 * determining the lsb, is because these don't improve on the deBruijn method
701 * fallback, which is just a branchless integer multiply, array element
702 * retrieval, and shift.  The others, even if the function call overhead is
703 * optimized out, have to cope with the possibility of the input being all
704 * zeroes, and almost certainly will have conditionals for this eventuality.
705 * khw, at the time of this commit, looked at the source for both gcc and clang
706 * to verify this.  (gcc used a method inferior to deBruijn.) */
707
708/* Below are functions to find the first, last, or only set bit in a word.  On
709 * platforms with 64-bit capability, there is a pair for each operation; the
710 * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
711 * the same in each pair, so the second is stripped of most comments. */
712
713#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
714
715PERL_STATIC_INLINE unsigned
716Perl_lsbit_pos64(U64 word)
717{
718    /* Find the position (0..63) of the least significant set bit in the input
719     * word */
720
721    ASSUME(word != 0);
722
723    /* If we can determine that the platform has a usable fast method to get
724     * this info, use that */
725
726#  if defined(PERL_CTZ_64)
727#    define PERL_HAS_FAST_GET_LSB_POS64
728
729    return (unsigned) PERL_CTZ_64(word);
730
731#  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
732#    define PERL_HAS_FAST_GET_LSB_POS64
733
734    {
735        unsigned long index;
736        _BitScanForward64(&index, word);
737        return (unsigned)index;
738    }
739
740#  else
741
742    /* Here, we didn't find a fast method for finding the lsb.  Fall back to
743     * making the lsb the only set bit in the word, and use our function that
744     * works on words with a single bit set.
745     *
746     * Isolate the lsb;
747     * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
748     *
749     * The word will look like this, with a rightmost set bit in position 's':
750     * ('x's are don't cares, and 'y's are their complements)
751     *      s
752     *  x..x100..00
753     *  y..y011..11      Complement
754     *  y..y100..00      Add 1
755     *  0..0100..00      And with the original
756     *
757     *  (Yes, complementing and adding 1 is just taking the negative on 2's
758     *  complement machines, but not on 1's complement ones, and some compilers
759     *  complain about negating an unsigned.)
760     */
761    return single_1bit_pos64(word & (~word + 1));
762
763#  endif
764
765}
766
767#  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
768#else   /* ! QUAD */
769#  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
770#endif
771
772PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
773Perl_lsbit_pos32(U32 word)
774{
775    /* Find the position (0..31) of the least significant set bit in the input
776     * word */
777
778    ASSUME(word != 0);
779
780#if defined(PERL_CTZ_32)
781#  define PERL_HAS_FAST_GET_LSB_POS32
782
783    return (unsigned) PERL_CTZ_32(word);
784
785#elif U32SIZE == 4 && defined(_MSC_VER)
786#  define PERL_HAS_FAST_GET_LSB_POS32
787
788    {
789        unsigned long index;
790        _BitScanForward(&index, word);
791        return (unsigned)index;
792    }
793
794#else
795
796    return single_1bit_pos32(word & (~word + 1));
797
798#endif
799
800}
801
802
803/* Convert the leading zeros count to the bit position of the first set bit.
804 * This just subtracts from the highest position, 31 or 63.  But some compilers
805 * don't optimize this optimally, and so a bit of bit twiddling encourages them
806 * to do the right thing.  It turns out that subtracting a smaller non-negative
807 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
808 * the two numbers.  To see why, first note that the sum of any number, x, and
809 * its complement, x', is all ones.  So all ones minus x is x'.  Then note that
810 * the xor of x and all ones is x'. */
811#define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
812
813#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
814
815PERL_STATIC_INLINE unsigned
816Perl_msbit_pos64(U64 word)
817{
818    /* Find the position (0..63) of the most significant set bit in the input
819     * word */
820
821    ASSUME(word != 0);
822
823    /* If we can determine that the platform has a usable fast method to get
824     * this, use that */
825
826#  if defined(PERL_CLZ_64)
827#    define PERL_HAS_FAST_GET_MSB_POS64
828
829    return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
830
831#  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
832#    define PERL_HAS_FAST_GET_MSB_POS64
833
834    {
835        unsigned long index;
836        _BitScanReverse64(&index, word);
837        return (unsigned)index;
838    }
839
840#  else
841
842    /* Here, we didn't find a fast method for finding the msb.  Fall back to
843     * making the msb the only set bit in the word, and use our function that
844     * works on words with a single bit set.
845     *
846     * Isolate the msb; http://codeforces.com/blog/entry/10330
847     *
848     * Only the most significant set bit matters.  Or'ing word with its right
849     * shift of 1 makes that bit and the next one to its right both 1.
850     * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
851     * ...  We end with the msb and all to the right being 1. */
852    word |= (word >>  1);
853    word |= (word >>  2);
854    word |= (word >>  4);
855    word |= (word >>  8);
856    word |= (word >> 16);
857    word |= (word >> 32);
858
859    /* Then subtracting the right shift by 1 clears all but the left-most of
860     * the 1 bits, which is our desired result */
861    word -= (word >> 1);
862
863    /* Now we have a single bit set */
864    return single_1bit_pos64(word);
865
866#  endif
867
868}
869
870#  define msbit_pos_uintmax_(word) msbit_pos64(word)
871#else   /* ! QUAD */
872#  define msbit_pos_uintmax_(word) msbit_pos32(word)
873#endif
874
875PERL_STATIC_INLINE unsigned
876Perl_msbit_pos32(U32 word)
877{
878    /* Find the position (0..31) of the most significant set bit in the input
879     * word */
880
881    ASSUME(word != 0);
882
883#if defined(PERL_CLZ_32)
884#  define PERL_HAS_FAST_GET_MSB_POS32
885
886    return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
887
888#elif U32SIZE == 4 && defined(_MSC_VER)
889#  define PERL_HAS_FAST_GET_MSB_POS32
890
891    {
892        unsigned long index;
893        _BitScanReverse(&index, word);
894        return (unsigned)index;
895    }
896
897#else
898
899    word |= (word >>  1);
900    word |= (word >>  2);
901    word |= (word >>  4);
902    word |= (word >>  8);
903    word |= (word >> 16);
904    word -= (word >> 1);
905    return single_1bit_pos32(word);
906
907#endif
908
909}
910
911#if UVSIZE == U64SIZE
912#  define msbit_pos(word)  msbit_pos64(word)
913#  define lsbit_pos(word)  lsbit_pos64(word)
914#elif UVSIZE == U32SIZE
915#  define msbit_pos(word)  msbit_pos32(word)
916#  define lsbit_pos(word)  lsbit_pos32(word)
917#endif
918
919#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
920
921PERL_STATIC_INLINE unsigned
922Perl_single_1bit_pos64(U64 word)
923{
924    /* Given a 64-bit word known to contain all zero bits except one 1 bit,
925     * find and return the 1's position: 0..63 */
926
927#  ifdef PERL_CORE    /* macro not exported */
928    ASSUME(isPOWER_OF_2(word));
929#  else
930    ASSUME(word && (word & (word-1)) == 0);
931#  endif
932
933    /* The only set bit is both the most and least significant bit.  If we have
934     * a fast way of finding either one, use that.
935     *
936     * It may appear at first glance that those functions call this one, but
937     * they don't if the corresponding #define is set */
938
939#  ifdef PERL_HAS_FAST_GET_MSB_POS64
940
941    return msbit_pos64(word);
942
943#  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
944
945    return lsbit_pos64(word);
946
947#  else
948
949    /* The position of the only set bit in a word can be quickly calculated
950     * using deBruijn sequences.  See for example
951     * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
952    return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
953                                                    >> PERL_deBruijnShift64_];
954#  endif
955
956}
957
958#endif
959
960PERL_STATIC_INLINE unsigned
961Perl_single_1bit_pos32(U32 word)
962{
963    /* Given a 32-bit word known to contain all zero bits except one 1 bit,
964     * find and return the 1's position: 0..31 */
965
966#ifdef PERL_CORE    /* macro not exported */
967    ASSUME(isPOWER_OF_2(word));
968#else
969    ASSUME(word && (word & (word-1)) == 0);
970#endif
971#ifdef PERL_HAS_FAST_GET_MSB_POS32
972
973    return msbit_pos32(word);
974
975#elif defined(PERL_HAS_FAST_GET_LSB_POS32)
976
977    return lsbit_pos32(word);
978
979/* Unlikely, but possible for the platform to have a wider fast operation but
980 * not a narrower one.  But easy enough to handle the case by widening the
981 * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
982 * would be slower than the deBruijn method.) */
983#elif defined(PERL_HAS_FAST_GET_MSB_POS64)
984
985    return msbit_pos64(word);
986
987#elif defined(PERL_HAS_FAST_GET_LSB_POS64)
988
989    return lsbit_pos64(word);
990
991#else
992
993    return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
994                                                    >> PERL_deBruijnShift32_];
995#endif
996
997}
998
999#ifndef EBCDIC
1000
1001PERL_STATIC_INLINE unsigned int
1002Perl_variant_byte_number(PERL_UINTMAX_T word)
1003{
1004    /* This returns the position in a word (0..7) of the first variant byte in
1005     * it.  This is a helper function.  Note that there are no branches */
1006
1007    /* Get just the msb bits of each byte */
1008    word &= PERL_VARIANTS_WORD_MASK;
1009
1010    /* This should only be called if we know there is a variant byte in the
1011     * word */
1012    assert(word);
1013
1014#  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1015
1016    /* Bytes are stored like
1017     *  Byte8 ... Byte2 Byte1
1018     *  63..56...15...8 7...0
1019     * so getting the lsb of the whole modified word is getting the msb of the
1020     * first byte that has its msb set */
1021    word = lsbit_pos_uintmax_(word);
1022
1023    /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
1024     * to 0..7 */
1025    return (unsigned int) ((word + 1) >> 3) - 1;
1026
1027#  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1028
1029    /* Bytes are stored like
1030     *  Byte1 Byte2  ... Byte8
1031     * 63..56 55..47 ... 7...0
1032     * so getting the msb of the whole modified word is getting the msb of the
1033     * first byte that has its msb set */
1034    word = msbit_pos_uintmax_(word);
1035
1036    /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
1037     * to 0..7 */
1038    word = ((word + 1) >> 3) - 1;
1039
1040    /* And invert the result because of the reversed byte order on this
1041     * platform */
1042    word = CHARBITS - word - 1;
1043
1044    return (unsigned int) word;
1045
1046#  else
1047#    error Unexpected byte order
1048#  endif
1049
1050}
1051
1052#endif
1053#if defined(PERL_CORE) || defined(PERL_EXT)
1054
1055/*
1056=for apidoc variant_under_utf8_count
1057
1058This function looks at the sequence of bytes between C<s> and C<e>, which are
1059assumed to be encoded in ASCII/Latin1, and returns how many of them would
1060change should the string be translated into UTF-8.  Due to the nature of UTF-8,
1061each of these would occupy two bytes instead of the single one in the input
1062string.  Thus, this function returns the precise number of bytes the string
1063would expand by when translated to UTF-8.
1064
1065Unlike most of the other functions that have C<utf8> in their name, the input
1066to this function is NOT a UTF-8-encoded string.  The function name is slightly
1067I<odd> to emphasize this.
1068
1069This function is internal to Perl because khw thinks that any XS code that
1070would want this is probably operating too close to the internals.  Presenting a
1071valid use case could change that.
1072
1073See also
1074C<L<perlapi/is_utf8_invariant_string>>
1075and
1076C<L<perlapi/is_utf8_invariant_string_loc>>,
1077
1078=cut
1079
1080*/
1081
1082PERL_STATIC_INLINE Size_t
1083S_variant_under_utf8_count(const U8* const s, const U8* const e)
1084{
1085    const U8* x = s;
1086    Size_t count = 0;
1087
1088    PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1089
1090#  ifndef EBCDIC
1091
1092    /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1093     * same as for is_utf8_invariant_string()) */
1094    if ((STRLEN) (e - x) >= PERL_WORDSIZE
1095                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1096                          - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1097    {
1098
1099        /* Process per-byte until reach word boundary.  XXX This loop could be
1100         * eliminated if we knew that this platform had fast unaligned reads */
1101        while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1102            count += ! UTF8_IS_INVARIANT(*x++);
1103        }
1104
1105        /* Process per-word as long as we have at least a full word left */
1106        do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1107                   explanation of how this works */
1108            PERL_UINTMAX_T increment
1109                = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1110                      * PERL_COUNT_MULTIPLIER)
1111                    >> ((PERL_WORDSIZE - 1) * CHARBITS);
1112            count += (Size_t) increment;
1113            x += PERL_WORDSIZE;
1114        } while (x + PERL_WORDSIZE <= e);
1115    }
1116
1117#  endif
1118
1119    /* Process per-byte */
1120    while (x < e) {
1121        if (! UTF8_IS_INVARIANT(*x)) {
1122            count++;
1123        }
1124
1125        x++;
1126    }
1127
1128    return count;
1129}
1130
1131#endif
1132
1133   /* Keep  these around for these files */
1134#if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
1135#  undef PERL_WORDSIZE
1136#  undef PERL_COUNT_MULTIPLIER
1137#  undef PERL_WORD_BOUNDARY_MASK
1138#  undef PERL_VARIANTS_WORD_MASK
1139#endif
1140
1141/*
1142=for apidoc is_utf8_string
1143
1144Returns TRUE if the first C<len> bytes of string C<s> form a valid
1145Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1146be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1147can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1148byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1149
1150This function considers Perl's extended UTF-8 to be valid.  That means that
1151code points above Unicode, surrogates, and non-character code points are
1152considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1153C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1154code points are considered valid.
1155
1156See also
1157C<L</is_utf8_invariant_string>>,
1158C<L</is_utf8_invariant_string_loc>>,
1159C<L</is_utf8_string_loc>>,
1160C<L</is_utf8_string_loclen>>,
1161C<L</is_utf8_fixed_width_buf_flags>>,
1162C<L</is_utf8_fixed_width_buf_loc_flags>>,
1163C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1164
1165=cut
1166*/
1167
1168#define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1169
1170#if defined(PERL_CORE) || defined (PERL_EXT)
1171
1172/*
1173=for apidoc is_utf8_non_invariant_string
1174
1175Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1176C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1177UTF-8; otherwise returns FALSE.
1178
1179A TRUE return means that at least one code point represented by the sequence
1180either is a wide character not representable as a single byte, or the
1181representation differs depending on whether the sequence is encoded in UTF-8 or
1182not.
1183
1184See also
1185C<L<perlapi/is_utf8_invariant_string>>,
1186C<L<perlapi/is_utf8_string>>
1187
1188=cut
1189
1190This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1191It generally needn't be if its string is entirely UTF-8 invariant, and it
1192shouldn't be if it otherwise contains invalid UTF-8.
1193
1194It is an internal function because khw thinks that XS code shouldn't be working
1195at this low a level.  A valid use case could change that.
1196
1197*/
1198
1199PERL_STATIC_INLINE bool
1200Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1201{
1202    const U8 * first_variant;
1203
1204    PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1205
1206    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1207        return FALSE;
1208    }
1209
1210    return is_utf8_string(first_variant, len - (first_variant - s));
1211}
1212
1213#endif
1214
1215/*
1216=for apidoc is_strict_utf8_string
1217
1218Returns TRUE if the first C<len> bytes of string C<s> form a valid
1219UTF-8-encoded string that is fully interchangeable by any application using
1220Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
1221calculated using C<strlen(s)> (which means if you use this option, that C<s>
1222can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1223byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1224
1225This function returns FALSE for strings containing any
1226code points above the Unicode max of 0x10FFFF, surrogate code points, or
1227non-character code points.
1228
1229See also
1230C<L</is_utf8_invariant_string>>,
1231C<L</is_utf8_invariant_string_loc>>,
1232C<L</is_utf8_string>>,
1233C<L</is_utf8_string_flags>>,
1234C<L</is_utf8_string_loc>>,
1235C<L</is_utf8_string_loc_flags>>,
1236C<L</is_utf8_string_loclen>>,
1237C<L</is_utf8_string_loclen_flags>>,
1238C<L</is_utf8_fixed_width_buf_flags>>,
1239C<L</is_utf8_fixed_width_buf_loc_flags>>,
1240C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1241C<L</is_strict_utf8_string_loc>>,
1242C<L</is_strict_utf8_string_loclen>>,
1243C<L</is_c9strict_utf8_string>>,
1244C<L</is_c9strict_utf8_string_loc>>,
1245and
1246C<L</is_c9strict_utf8_string_loclen>>.
1247
1248=cut
1249*/
1250
1251#define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
1252
1253/*
1254=for apidoc is_c9strict_utf8_string
1255
1256Returns TRUE if the first C<len> bytes of string C<s> form a valid
1257UTF-8-encoded string that conforms to
1258L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1259otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
1260C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1261C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
1262characters being ASCII constitute 'a valid UTF-8 string'.
1263
1264This function returns FALSE for strings containing any code points above the
1265Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1266code points per
1267L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1268
1269See also
1270C<L</is_utf8_invariant_string>>,
1271C<L</is_utf8_invariant_string_loc>>,
1272C<L</is_utf8_string>>,
1273C<L</is_utf8_string_flags>>,
1274C<L</is_utf8_string_loc>>,
1275C<L</is_utf8_string_loc_flags>>,
1276C<L</is_utf8_string_loclen>>,
1277C<L</is_utf8_string_loclen_flags>>,
1278C<L</is_utf8_fixed_width_buf_flags>>,
1279C<L</is_utf8_fixed_width_buf_loc_flags>>,
1280C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1281C<L</is_strict_utf8_string>>,
1282C<L</is_strict_utf8_string_loc>>,
1283C<L</is_strict_utf8_string_loclen>>,
1284C<L</is_c9strict_utf8_string_loc>>,
1285and
1286C<L</is_c9strict_utf8_string_loclen>>.
1287
1288=cut
1289*/
1290
1291#define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1292
1293/*
1294=for apidoc is_utf8_string_flags
1295
1296Returns TRUE if the first C<len> bytes of string C<s> form a valid
1297UTF-8 string, subject to the restrictions imposed by C<flags>;
1298returns FALSE otherwise.  If C<len> is 0, it will be calculated
1299using C<strlen(s)> (which means if you use this option, that C<s> can't have
1300embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
1301that all characters being ASCII constitute 'a valid UTF-8 string'.
1302
1303If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1304C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1305as C<L</is_strict_utf8_string>>; and if C<flags> is
1306C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1307C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
1308combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1309C<L</utf8n_to_uvchr>>, with the same meanings.
1310
1311See also
1312C<L</is_utf8_invariant_string>>,
1313C<L</is_utf8_invariant_string_loc>>,
1314C<L</is_utf8_string>>,
1315C<L</is_utf8_string_loc>>,
1316C<L</is_utf8_string_loc_flags>>,
1317C<L</is_utf8_string_loclen>>,
1318C<L</is_utf8_string_loclen_flags>>,
1319C<L</is_utf8_fixed_width_buf_flags>>,
1320C<L</is_utf8_fixed_width_buf_loc_flags>>,
1321C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1322C<L</is_strict_utf8_string>>,
1323C<L</is_strict_utf8_string_loc>>,
1324C<L</is_strict_utf8_string_loclen>>,
1325C<L</is_c9strict_utf8_string>>,
1326C<L</is_c9strict_utf8_string_loc>>,
1327and
1328C<L</is_c9strict_utf8_string_loclen>>.
1329
1330=cut
1331*/
1332
1333PERL_STATIC_INLINE bool
1334Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1335{
1336    const U8 * first_variant;
1337
1338    PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1339    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1340                          |UTF8_DISALLOW_PERL_EXTENDED)));
1341
1342    if (len == 0) {
1343        len = strlen((const char *)s);
1344    }
1345
1346    if (flags == 0) {
1347        return is_utf8_string(s, len);
1348    }
1349
1350    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1351                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1352    {
1353        return is_strict_utf8_string(s, len);
1354    }
1355
1356    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1357                                       == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1358    {
1359        return is_c9strict_utf8_string(s, len);
1360    }
1361
1362    if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1363        const U8* const send = s + len;
1364        const U8* x = first_variant;
1365
1366        while (x < send) {
1367            STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1368            if (UNLIKELY(! cur_len)) {
1369                return FALSE;
1370            }
1371            x += cur_len;
1372        }
1373    }
1374
1375    return TRUE;
1376}
1377
1378/*
1379
1380=for apidoc is_utf8_string_loc
1381
1382Like C<L</is_utf8_string>> but stores the location of the failure (in the
1383case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1384"utf8ness success") in the C<ep> pointer.
1385
1386See also C<L</is_utf8_string_loclen>>.
1387
1388=cut
1389*/
1390
1391#define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
1392
1393/*
1394
1395=for apidoc is_utf8_string_loclen
1396
1397Like C<L</is_utf8_string>> but stores the location of the failure (in the
1398case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1399"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1400encoded characters in the C<el> pointer.
1401
1402See also C<L</is_utf8_string_loc>>.
1403
1404=cut
1405*/
1406
1407PERL_STATIC_INLINE bool
1408Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1409{
1410    const U8 * first_variant;
1411
1412    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1413
1414    if (len == 0) {
1415        len = strlen((const char *) s);
1416    }
1417
1418    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1419        if (el)
1420            *el = len;
1421
1422        if (ep) {
1423            *ep = s + len;
1424        }
1425
1426        return TRUE;
1427    }
1428
1429    {
1430        const U8* const send = s + len;
1431        const U8* x = first_variant;
1432        STRLEN outlen = first_variant - s;
1433
1434        while (x < send) {
1435            const STRLEN cur_len = isUTF8_CHAR(x, send);
1436            if (UNLIKELY(! cur_len)) {
1437                break;
1438            }
1439            x += cur_len;
1440            outlen++;
1441        }
1442
1443        if (el)
1444            *el = outlen;
1445
1446        if (ep) {
1447            *ep = x;
1448        }
1449
1450        return (x == send);
1451    }
1452}
1453
1454/* The perl core arranges to never call the DFA below without there being at
1455 * least one byte available to look at.  This allows the DFA to use a do {}
1456 * while loop which means that calling it with a UTF-8 invariant has a single
1457 * conditional, same as the calling code checking for invariance ahead of time.
1458 * And having the calling code remove that conditional speeds up by that
1459 * conditional, the case where it wasn't invariant.  So there's no reason to
1460 * check before caling this.
1461 *
1462 * But we don't know this for non-core calls, so have to retain the check for
1463 * them. */
1464#ifdef PERL_CORE
1465#  define PERL_NON_CORE_CHECK_EMPTY(s,e)  assert((e) > (s))
1466#else
1467#  define PERL_NON_CORE_CHECK_EMPTY(s,e)  if ((e) <= (s)) return FALSE
1468#endif
1469
1470/*
1471 * DFA for checking input is valid UTF-8 syntax.
1472 *
1473 * This uses adaptations of the table and algorithm given in
1474 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1475 * documentation of the original version.  A copyright notice for the original
1476 * version is given at the beginning of this file.  The Perl adaptations are
1477 * documented at the definition of PL_extended_utf8_dfa_tab[].
1478 *
1479 * This dfa is fast.  There are three exit conditions:
1480 *  1) a well-formed code point, acceptable to the table
1481 *  2) the beginning bytes of an incomplete character, whose completion might
1482 *     or might not be acceptable
1483 *  3) unacceptable to the table.  Some of the adaptations have certain,
1484 *     hopefully less likely to occur, legal inputs be unacceptable to the
1485 *     table, so these must be sorted out afterwards.
1486 *
1487 * This macro is a complete implementation of the code executing the DFA.  It
1488 * is passed the input sequence bounds and the table to use, and what to do
1489 * for each of the exit conditions.  There are three canned actions, likely to
1490 * be the ones you want:
1491 *      DFA_RETURN_SUCCESS_
1492 *      DFA_RETURN_FAILURE_
1493 *      DFA_GOTO_TEASE_APART_FF_
1494 *
1495 * You pass a parameter giving the action to take for each of the three
1496 * possible exit conditions:
1497 *
1498 * 'accept_action'  This is executed when the DFA accepts the input.
1499 *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
1500 * 'reject_action'  This is executed when the DFA rejects the input.
1501 *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1502 *                  you have written code to distinguish the rejecting state
1503 *                  results.  Because it happens in several places, and
1504 *                  involves #ifdefs, the special action
1505 *                  DFA_GOTO_TEASE_APART_FF_ is what you want with
1506 *                  PL_extended_utf8_dfa_tab.  On platforms without
1507 *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
1508 *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1509 *                  need to have a label 'tease_apart_FF' that it will transfer
1510 *                  to.
1511 * 'incomplete_char_action'  This is executed when the DFA ran off the end
1512 *                  before accepting or rejecting the input.
1513 *                  DFA_RETURN_FAILURE_ is the likely action, but you could
1514 *                  have a 'goto', or NOOP.  In the latter case the DFA drops
1515 *                  off the end, and you place your code to handle this case
1516 *                  immediately after it.
1517 */
1518
1519#define DFA_RETURN_SUCCESS_      return s - s0
1520#define DFA_RETURN_FAILURE_      return 0
1521#ifdef HAS_EXTRA_LONG_UTF8
1522#  define DFA_TEASE_APART_FF_  goto tease_apart_FF
1523#else
1524#  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
1525#endif
1526
1527#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
1528                              accept_action,                                \
1529                              reject_action,                                \
1530                              incomplete_char_action)                       \
1531    STMT_START {                                                            \
1532        const U8 * s = s0;                                                  \
1533        const U8 * e_ = e;                                                  \
1534        UV state = 0;                                                       \
1535                                                                            \
1536        PERL_NON_CORE_CHECK_EMPTY(s, e_);                                   \
1537                                                                            \
1538        do {                                                                \
1539            state = dfa_tab[256 + state + dfa_tab[*s]];                     \
1540            s++;                                                            \
1541                                                                            \
1542            if (state == 0) {   /* Accepting state */                       \
1543                accept_action;                                              \
1544            }                                                               \
1545                                                                            \
1546            if (UNLIKELY(state == 1)) { /* Rejecting state */               \
1547                reject_action;                                              \
1548            }                                                               \
1549        } while (s < e_);                                                   \
1550                                                                            \
1551        /* Here, dropped out of loop before end-of-char */                  \
1552        incomplete_char_action;                                             \
1553    } STMT_END
1554
1555
1556/*
1557
1558=for apidoc isUTF8_CHAR
1559
1560Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1561looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1562that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1563value gives how many bytes starting at C<s> comprise the code point's
1564representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1565form the first code point in C<s>, are not examined.
1566
1567The code point can be any that will fit in an IV on this machine, using Perl's
1568extension to official UTF-8 to represent those higher than the Unicode maximum
1569of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1570next few bytes in C<s> is legal UTF-8 for a single character.
1571
1572Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1573defined by Unicode to be fully interchangeable across applications;
1574C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1575#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1576code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1577
1578Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1579C<L</is_utf8_string_loclen>> to check entire strings.
1580
1581Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1582machines) is a valid UTF-8 character.
1583
1584=cut
1585
1586This uses an adaptation of the table and algorithm given in
1587https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1588documentation of the original version.  A copyright notice for the original
1589version is given at the beginning of this file.  The Perl adaptation is
1590documented at the definition of PL_extended_utf8_dfa_tab[].
1591*/
1592
1593PERL_STATIC_INLINE Size_t
1594Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1595{
1596    PERL_ARGS_ASSERT_ISUTF8_CHAR;
1597
1598    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1599                          DFA_RETURN_SUCCESS_,
1600                          DFA_TEASE_APART_FF_,
1601                          DFA_RETURN_FAILURE_);
1602
1603    /* Here, we didn't return success, but dropped out of the loop.  In the
1604     * case of PL_extended_utf8_dfa_tab, this means the input is either
1605     * malformed, or the start byte was FF on a platform that the dfa doesn't
1606     * handle FF's.  Call a helper function. */
1607
1608#ifdef HAS_EXTRA_LONG_UTF8
1609
1610  tease_apart_FF:
1611
1612    /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1613     * either malformed, or was for the largest possible start byte, which we
1614     * now check, not inline */
1615    if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1616        return 0;
1617    }
1618
1619    return is_utf8_FF_helper_(s0, e,
1620                              FALSE /* require full, not partial char */
1621                             );
1622#endif
1623
1624}
1625
1626/*
1627
1628=for apidoc isSTRICT_UTF8_CHAR
1629
1630Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1631looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1632Unicode code point completely acceptable for open interchange between all
1633applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1634many bytes starting at C<s> comprise the code point's representation.  Any
1635bytes remaining before C<e>, but beyond the ones needed to form the first code
1636point in C<s>, are not examined.
1637
1638The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1639be a surrogate nor a non-character code point.  Thus this excludes any code
1640point from Perl's extended UTF-8.
1641
1642This is used to efficiently decide if the next few bytes in C<s> is
1643legal Unicode-acceptable UTF-8 for a single character.
1644
1645Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1646#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1647code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1648and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1649
1650Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1651C<L</is_strict_utf8_string_loclen>> to check entire strings.
1652
1653=cut
1654
1655This uses an adaptation of the tables and algorithm given in
1656https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1657documentation of the original version.  A copyright notice for the original
1658version is given at the beginning of this file.  The Perl adaptation is
1659documented at the definition of strict_extended_utf8_dfa_tab[].
1660
1661*/
1662
1663PERL_STATIC_INLINE Size_t
1664Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1665{
1666    PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1667
1668    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1669                          DFA_RETURN_SUCCESS_,
1670                          goto check_hanguls,
1671                          DFA_RETURN_FAILURE_);
1672  check_hanguls:
1673
1674    /* Here, we didn't return success, but dropped out of the loop.  In the
1675     * case of PL_strict_utf8_dfa_tab, this means the input is either
1676     * malformed, or was for certain Hanguls; handle them specially */
1677
1678    /* The dfa above drops out for incomplete or illegal inputs, and certain
1679     * legal Hanguls; check and return accordingly */
1680    return is_HANGUL_ED_utf8_safe(s0, e);
1681}
1682
1683/*
1684
1685=for apidoc isC9_STRICT_UTF8_CHAR
1686
1687Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1688looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1689Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1690the value gives how many bytes starting at C<s> comprise the code point's
1691representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1692form the first code point in C<s>, are not examined.
1693
1694The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1695differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1696code points.  This corresponds to
1697L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1698which said that non-character code points are merely discouraged rather than
1699completely forbidden in open interchange.  See
1700L<perlunicode/Noncharacter code points>.
1701
1702Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1703C<L</isUTF8_CHAR_flags>> for a more customized definition.
1704
1705Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1706C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1707
1708=cut
1709
1710This uses an adaptation of the tables and algorithm given in
1711https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1712documentation of the original version.  A copyright notice for the original
1713version is given at the beginning of this file.  The Perl adaptation is
1714documented at the definition of PL_c9_utf8_dfa_tab[].
1715
1716*/
1717
1718PERL_STATIC_INLINE Size_t
1719Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1720{
1721    PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1722
1723    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1724                          DFA_RETURN_SUCCESS_,
1725                          DFA_RETURN_FAILURE_,
1726                          DFA_RETURN_FAILURE_);
1727}
1728
1729/*
1730
1731=for apidoc is_strict_utf8_string_loc
1732
1733Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1734case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1735"utf8ness success") in the C<ep> pointer.
1736
1737See also C<L</is_strict_utf8_string_loclen>>.
1738
1739=cut
1740*/
1741
1742#define is_strict_utf8_string_loc(s, len, ep)                               \
1743                                is_strict_utf8_string_loclen(s, len, ep, 0)
1744
1745/*
1746
1747=for apidoc is_strict_utf8_string_loclen
1748
1749Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1750case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1751"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1752encoded characters in the C<el> pointer.
1753
1754See also C<L</is_strict_utf8_string_loc>>.
1755
1756=cut
1757*/
1758
1759PERL_STATIC_INLINE bool
1760Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1761{
1762    const U8 * first_variant;
1763
1764    PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1765
1766    if (len == 0) {
1767        len = strlen((const char *) s);
1768    }
1769
1770    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1771        if (el)
1772            *el = len;
1773
1774        if (ep) {
1775            *ep = s + len;
1776        }
1777
1778        return TRUE;
1779    }
1780
1781    {
1782        const U8* const send = s + len;
1783        const U8* x = first_variant;
1784        STRLEN outlen = first_variant - s;
1785
1786        while (x < send) {
1787            const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1788            if (UNLIKELY(! cur_len)) {
1789                break;
1790            }
1791            x += cur_len;
1792            outlen++;
1793        }
1794
1795        if (el)
1796            *el = outlen;
1797
1798        if (ep) {
1799            *ep = x;
1800        }
1801
1802        return (x == send);
1803    }
1804}
1805
1806/*
1807
1808=for apidoc is_c9strict_utf8_string_loc
1809
1810Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1811the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1812"utf8ness success") in the C<ep> pointer.
1813
1814See also C<L</is_c9strict_utf8_string_loclen>>.
1815
1816=cut
1817*/
1818
1819#define is_c9strict_utf8_string_loc(s, len, ep)	                            \
1820                            is_c9strict_utf8_string_loclen(s, len, ep, 0)
1821
1822/*
1823
1824=for apidoc is_c9strict_utf8_string_loclen
1825
1826Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1827the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1828"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1829characters in the C<el> pointer.
1830
1831See also C<L</is_c9strict_utf8_string_loc>>.
1832
1833=cut
1834*/
1835
1836PERL_STATIC_INLINE bool
1837Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1838{
1839    const U8 * first_variant;
1840
1841    PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1842
1843    if (len == 0) {
1844        len = strlen((const char *) s);
1845    }
1846
1847    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1848        if (el)
1849            *el = len;
1850
1851        if (ep) {
1852            *ep = s + len;
1853        }
1854
1855        return TRUE;
1856    }
1857
1858    {
1859        const U8* const send = s + len;
1860        const U8* x = first_variant;
1861        STRLEN outlen = first_variant - s;
1862
1863        while (x < send) {
1864            const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1865            if (UNLIKELY(! cur_len)) {
1866                break;
1867            }
1868            x += cur_len;
1869            outlen++;
1870        }
1871
1872        if (el)
1873            *el = outlen;
1874
1875        if (ep) {
1876            *ep = x;
1877        }
1878
1879        return (x == send);
1880    }
1881}
1882
1883/*
1884
1885=for apidoc is_utf8_string_loc_flags
1886
1887Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1888case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1889"utf8ness success") in the C<ep> pointer.
1890
1891See also C<L</is_utf8_string_loclen_flags>>.
1892
1893=cut
1894*/
1895
1896#define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1897                        is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1898
1899
1900/* The above 3 actual functions could have been moved into the more general one
1901 * just below, and made #defines that call it with the right 'flags'.  They are
1902 * currently kept separate to increase their chances of getting inlined */
1903
1904/*
1905
1906=for apidoc is_utf8_string_loclen_flags
1907
1908Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1909case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1910"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1911encoded characters in the C<el> pointer.
1912
1913See also C<L</is_utf8_string_loc_flags>>.
1914
1915=cut
1916*/
1917
1918PERL_STATIC_INLINE bool
1919Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1920{
1921    const U8 * first_variant;
1922
1923    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1924    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1925                          |UTF8_DISALLOW_PERL_EXTENDED)));
1926
1927    if (len == 0) {
1928        len = strlen((const char *) s);
1929    }
1930
1931    if (flags == 0) {
1932        return is_utf8_string_loclen(s, len, ep, el);
1933    }
1934
1935    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1936                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1937    {
1938        return is_strict_utf8_string_loclen(s, len, ep, el);
1939    }
1940
1941    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1942                                    == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1943    {
1944        return is_c9strict_utf8_string_loclen(s, len, ep, el);
1945    }
1946
1947    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1948        if (el)
1949            *el = len;
1950
1951        if (ep) {
1952            *ep = s + len;
1953        }
1954
1955        return TRUE;
1956    }
1957
1958    {
1959        const U8* send = s + len;
1960        const U8* x = first_variant;
1961        STRLEN outlen = first_variant - s;
1962
1963        while (x < send) {
1964            const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1965            if (UNLIKELY(! cur_len)) {
1966                break;
1967            }
1968            x += cur_len;
1969            outlen++;
1970        }
1971
1972        if (el)
1973            *el = outlen;
1974
1975        if (ep) {
1976            *ep = x;
1977        }
1978
1979        return (x == send);
1980    }
1981}
1982
1983/*
1984=for apidoc utf8_distance
1985
1986Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1987and C<b>.
1988
1989WARNING: use only if you *know* that the pointers point inside the
1990same UTF-8 buffer.
1991
1992=cut
1993*/
1994
1995PERL_STATIC_INLINE IV
1996Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1997{
1998    PERL_ARGS_ASSERT_UTF8_DISTANCE;
1999
2000    return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2001}
2002
2003/*
2004=for apidoc utf8_hop
2005
2006Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2007forward (if C<off> is positive) or backward (if negative).  C<s> does not need
2008to be pointing to the starting byte of a character.  If it isn't, one count of
2009C<off> will be used up to get to the start of the next character for forward
2010hops, and to the start of the current character for negative ones.
2011
2012WARNING: Prefer L</utf8_hop_safe> to this one.
2013
2014Do NOT use this function unless you B<know> C<off> is within
2015the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
2016on the first byte of a character or just after the last byte of a character.
2017
2018=cut
2019*/
2020
2021PERL_STATIC_INLINE U8 *
2022Perl_utf8_hop(const U8 *s, SSize_t off)
2023{
2024    PERL_ARGS_ASSERT_UTF8_HOP;
2025
2026    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2027     * the XXX bitops (especially ~) can create illegal UTF-8.
2028     * In other words: in Perl UTF-8 is not just for Unicode. */
2029
2030    if (off > 0) {
2031
2032        /* Get to next non-continuation byte */
2033        if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2034            do {
2035                s++;
2036            }
2037            while (UTF8_IS_CONTINUATION(*s));
2038            off--;
2039        }
2040
2041        while (off--)
2042            s += UTF8SKIP(s);
2043    }
2044    else {
2045        while (off++) {
2046            s--;
2047            while (UTF8_IS_CONTINUATION(*s))
2048                s--;
2049        }
2050    }
2051
2052    GCC_DIAG_IGNORE(-Wcast-qual)
2053    return (U8 *)s;
2054    GCC_DIAG_RESTORE
2055}
2056
2057/*
2058=for apidoc utf8_hop_forward
2059
2060Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2061forward.  C<s> does not need to be pointing to the starting byte of a
2062character.  If it isn't, one count of C<off> will be used up to get to the
2063start of the next character.
2064
2065C<off> must be non-negative.
2066
2067C<s> must be before or equal to C<end>.
2068
2069When moving forward it will not move beyond C<end>.
2070
2071Will not exceed this limit even if the string is not valid "UTF-8".
2072
2073=cut
2074*/
2075
2076PERL_STATIC_INLINE U8 *
2077Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2078{
2079    PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2080
2081    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2082     * the bitops (especially ~) can create illegal UTF-8.
2083     * In other words: in Perl UTF-8 is not just for Unicode. */
2084
2085    assert(s <= end);
2086    assert(off >= 0);
2087
2088    if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2089        /* Get to next non-continuation byte */
2090        do {
2091            s++;
2092        }
2093        while (UTF8_IS_CONTINUATION(*s));
2094        off--;
2095    }
2096
2097    while (off--) {
2098        STRLEN skip = UTF8SKIP(s);
2099        if ((STRLEN)(end - s) <= skip) {
2100            GCC_DIAG_IGNORE(-Wcast-qual)
2101            return (U8 *)end;
2102            GCC_DIAG_RESTORE
2103        }
2104        s += skip;
2105    }
2106
2107    GCC_DIAG_IGNORE(-Wcast-qual)
2108    return (U8 *)s;
2109    GCC_DIAG_RESTORE
2110}
2111
2112/*
2113=for apidoc utf8_hop_back
2114
2115Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2116backward.  C<s> does not need to be pointing to the starting byte of a
2117character.  If it isn't, one count of C<off> will be used up to get to that
2118start.
2119
2120C<off> must be non-positive.
2121
2122C<s> must be after or equal to C<start>.
2123
2124When moving backward it will not move before C<start>.
2125
2126Will not exceed this limit even if the string is not valid "UTF-8".
2127
2128=cut
2129*/
2130
2131PERL_STATIC_INLINE U8 *
2132Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2133{
2134    PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2135
2136    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2137     * the bitops (especially ~) can create illegal UTF-8.
2138     * In other words: in Perl UTF-8 is not just for Unicode. */
2139
2140    assert(start <= s);
2141    assert(off <= 0);
2142
2143    /* Note: if we know that the input is well-formed, we can do per-word
2144     * hop-back.  Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2145     * that.  But it was reverted because doing per-word has some
2146     * start-up/tear-down overhead, so only makes sense if the distance to be
2147     * moved is large, and core perl doesn't currently move more than a few
2148     * characters at a time.  You can reinstate it if it does become
2149     * advantageous. */
2150    while (off++ && s > start) {
2151        do {
2152            s--;
2153        } while (UTF8_IS_CONTINUATION(*s) && s > start);
2154    }
2155
2156    GCC_DIAG_IGNORE(-Wcast-qual)
2157    return (U8 *)s;
2158    GCC_DIAG_RESTORE
2159}
2160
2161/*
2162=for apidoc utf8_hop_safe
2163
2164Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2165either forward or backward.  C<s> does not need to be pointing to the starting
2166byte of a character.  If it isn't, one count of C<off> will be used up to get
2167to the start of the next character for forward hops, and to the start of the
2168current character for negative ones.
2169
2170When moving backward it will not move before C<start>.
2171
2172When moving forward it will not move beyond C<end>.
2173
2174Will not exceed those limits even if the string is not valid "UTF-8".
2175
2176=cut
2177*/
2178
2179PERL_STATIC_INLINE U8 *
2180Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2181{
2182    PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2183
2184    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2185     * the bitops (especially ~) can create illegal UTF-8.
2186     * In other words: in Perl UTF-8 is not just for Unicode. */
2187
2188    assert(start <= s && s <= end);
2189
2190    if (off >= 0) {
2191        return utf8_hop_forward(s, off, end);
2192    }
2193    else {
2194        return utf8_hop_back(s, off, start);
2195    }
2196}
2197
2198/*
2199
2200=for apidoc isUTF8_CHAR_flags
2201
2202Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2203looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2204that represents some code point, subject to the restrictions given by C<flags>;
2205otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
2206starting at C<s> comprise the code point's representation.  Any bytes remaining
2207before C<e>, but beyond the ones needed to form the first code point in C<s>,
2208are not examined.
2209
2210If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2211if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2212as C<L</isSTRICT_UTF8_CHAR>>;
2213and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2214the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2215Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2216understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2217
2218The three alternative macros are for the most commonly needed validations; they
2219are likely to run somewhat faster than this more general one, as they can be
2220inlined into your code.
2221
2222Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2223L</is_utf8_string_loclen_flags> to check entire strings.
2224
2225=cut
2226*/
2227
2228PERL_STATIC_INLINE STRLEN
2229Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2230{
2231    PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2232    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2233                          |UTF8_DISALLOW_PERL_EXTENDED)));
2234
2235    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2236                          goto check_success,
2237                          DFA_TEASE_APART_FF_,
2238                          DFA_RETURN_FAILURE_);
2239
2240  check_success:
2241
2242    return is_utf8_char_helper_(s0, e, flags);
2243
2244#ifdef HAS_EXTRA_LONG_UTF8
2245
2246  tease_apart_FF:
2247
2248    /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2249     * either malformed, or was for the largest possible start byte, which
2250     * indicates perl extended UTF-8, well above the Unicode maximum */
2251    if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2252        || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2253    {
2254        return 0;
2255    }
2256
2257    /* Otherwise examine the sequence not inline */
2258    return is_utf8_FF_helper_(s0, e,
2259                              FALSE /* require full, not partial char */
2260                             );
2261#endif
2262
2263}
2264
2265/*
2266
2267=for apidoc is_utf8_valid_partial_char
2268
2269Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2270S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2271points.  Otherwise, it returns 1 if there exists at least one non-empty
2272sequence of bytes that when appended to sequence C<s>, starting at position
2273C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2274otherwise returns 0.
2275
2276In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2277point.
2278
2279This is useful when a fixed-length buffer is being tested for being well-formed
2280UTF-8, but the final few bytes in it don't comprise a full character; that is,
2281it is split somewhere in the middle of the final code point's UTF-8
2282representation.  (Presumably when the buffer is refreshed with the next chunk
2283of data, the new first bytes will complete the partial code point.)   This
2284function is used to verify that the final bytes in the current buffer are in
2285fact the legal beginning of some code point, so that if they aren't, the
2286failure can be signalled without having to wait for the next read.
2287
2288=cut
2289*/
2290#define is_utf8_valid_partial_char(s, e)                                    \
2291                                is_utf8_valid_partial_char_flags(s, e, 0)
2292
2293/*
2294
2295=for apidoc is_utf8_valid_partial_char_flags
2296
2297Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2298or not the input is a valid UTF-8 encoded partial character, but it takes an
2299extra parameter, C<flags>, which can further restrict which code points are
2300considered valid.
2301
2302If C<flags> is 0, this behaves identically to
2303C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
2304of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
2305there is any sequence of bytes that can complete the input partial character in
2306such a way that a non-prohibited character is formed, the function returns
2307TRUE; otherwise FALSE.  Non character code points cannot be determined based on
2308partial character input.  But many  of the other possible excluded types can be
2309determined from just the first one or two bytes.
2310
2311=cut
2312 */
2313
2314PERL_STATIC_INLINE bool
2315Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2316{
2317    PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2318    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2319                          |UTF8_DISALLOW_PERL_EXTENDED)));
2320
2321    PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2322                          DFA_RETURN_FAILURE_,
2323                          DFA_TEASE_APART_FF_,
2324                          NOOP);
2325
2326    /* The NOOP above causes the DFA to drop down here iff the input was a
2327     * partial character.  flags=0 => can return TRUE immediately; otherwise we
2328     * need to check (not inline) if the partial character is the beginning of
2329     * a disallowed one */
2330    if (flags == 0) {
2331        return TRUE;
2332    }
2333
2334    return cBOOL(is_utf8_char_helper_(s0, e, flags));
2335
2336#ifdef HAS_EXTRA_LONG_UTF8
2337
2338  tease_apart_FF:
2339
2340    /* Getting here means the input is either malformed, or, in the case of
2341     * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
2342     * latter case has to be extended UTF-8, so can fail immediately if that is
2343     * forbidden */
2344
2345    if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2346        || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2347    {
2348        return 0;
2349    }
2350
2351    return is_utf8_FF_helper_(s0, e,
2352                              TRUE /* Require to be a partial character */
2353                             );
2354#endif
2355
2356}
2357
2358/*
2359
2360=for apidoc is_utf8_fixed_width_buf_flags
2361
2362Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2363is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2364otherwise it returns FALSE.
2365
2366If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2367without restriction.  If the final few bytes of the buffer do not form a
2368complete code point, this will return TRUE anyway, provided that
2369C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2370
2371If C<flags> in non-zero, it can be any combination of the
2372C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2373same meanings.
2374
2375This function differs from C<L</is_utf8_string_flags>> only in that the latter
2376returns FALSE if the final few bytes of the string don't form a complete code
2377point.
2378
2379=cut
2380 */
2381#define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
2382                is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2383
2384/*
2385
2386=for apidoc is_utf8_fixed_width_buf_loc_flags
2387
2388Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2389failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
2390to the beginning of any partial character at the end of the buffer; if there is
2391no partial character C<*ep> will contain C<s>+C<len>.
2392
2393See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2394
2395=cut
2396*/
2397
2398#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
2399                is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2400
2401/*
2402
2403=for apidoc is_utf8_fixed_width_buf_loclen_flags
2404
2405Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2406complete, valid characters found in the C<el> pointer.
2407
2408=cut
2409*/
2410
2411PERL_STATIC_INLINE bool
2412Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2413                                       STRLEN len,
2414                                       const U8 **ep,
2415                                       STRLEN *el,
2416                                       const U32 flags)
2417{
2418    const U8 * maybe_partial;
2419
2420    PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2421
2422    if (! ep) {
2423        ep  = &maybe_partial;
2424    }
2425
2426    /* If it's entirely valid, return that; otherwise see if the only error is
2427     * that the final few bytes are for a partial character */
2428    return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
2429           || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2430}
2431
2432PERL_STATIC_INLINE UV
2433Perl_utf8n_to_uvchr_msgs(const U8 *s,
2434                         STRLEN curlen,
2435                         STRLEN *retlen,
2436                         const U32 flags,
2437                         U32 * errors,
2438                         AV ** msgs)
2439{
2440    /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
2441     * simple cases, and, if necessary calls a helper function to deal with the
2442     * more complex ones.  Almost all well-formed non-problematic code points
2443     * are considered simple, so that it's unlikely that the helper function
2444     * will need to be called.
2445     *
2446     * This is an adaptation of the tables and algorithm given in
2447     * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2448     * comprehensive documentation of the original version.  A copyright notice
2449     * for the original version is given at the beginning of this file.  The
2450     * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[].
2451     */
2452
2453    const U8 * const s0 = s;
2454    const U8 * send = s0 + curlen;
2455    UV type;
2456    UV uv;
2457
2458    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2459
2460    /* This dfa is fast.  If it accepts the input, it was for a well-formed,
2461     * non-problematic code point, which can be returned immediately.
2462     * Otherwise we call a helper function to figure out the more complicated
2463     * cases. */
2464
2465    /* No calls from core pass in an empty string; non-core need a check */
2466#ifdef PERL_CORE
2467    assert(curlen > 0);
2468#else
2469    if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2470                                                        flags, errors, msgs);
2471#endif
2472
2473    type = PL_strict_utf8_dfa_tab[*s];
2474
2475    /* The table is structured so that 'type' is 0 iff the input byte is
2476     * represented identically regardless of the UTF-8ness of the string */
2477    if (type == 0) {   /* UTF-8 invariants are returned unchanged */
2478        uv = *s;
2479    }
2480    else {
2481        UV state = PL_strict_utf8_dfa_tab[256 + type];
2482        uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2483
2484        while (++s < send) {
2485            type  = PL_strict_utf8_dfa_tab[*s];
2486            state = PL_strict_utf8_dfa_tab[256 + state + type];
2487
2488            uv = UTF8_ACCUMULATE(uv, *s);
2489
2490            if (state == 0) {
2491#ifdef EBCDIC
2492                uv = UNI_TO_NATIVE(uv);
2493#endif
2494                goto success;
2495            }
2496
2497            if (UNLIKELY(state == 1)) {
2498                break;
2499            }
2500        }
2501
2502        /* Here is potentially problematic.  Use the full mechanism */
2503        return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2504                                           errors, msgs);
2505    }
2506
2507  success:
2508    if (retlen) {
2509        *retlen = s - s0 + 1;
2510    }
2511    if (errors) {
2512        *errors = 0;
2513    }
2514    if (msgs) {
2515        *msgs = NULL;
2516    }
2517
2518    return uv;
2519}
2520
2521PERL_STATIC_INLINE UV
2522Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2523{
2524    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2525
2526    assert(s < send);
2527
2528    if (! ckWARN_d(WARN_UTF8)) {
2529
2530        /* EMPTY is not really allowed, and asserts on debugging builds.  But
2531         * on non-debugging we have to deal with it, and this causes it to
2532         * return the REPLACEMENT CHARACTER, as the documentation indicates */
2533        return utf8n_to_uvchr(s, send - s, retlen,
2534                              (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2535    }
2536    else {
2537        UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2538        if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2539            *retlen = (STRLEN) -1;
2540        }
2541
2542        return ret;
2543    }
2544}
2545
2546/* ------------------------------- perl.h ----------------------------- */
2547
2548/*
2549=for apidoc_section $utility
2550
2551=for apidoc is_safe_syscall
2552
2553Test that the given C<pv> (with length C<len>) doesn't contain any internal
2554C<NUL> characters.
2555If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2556category, and return FALSE.
2557
2558Return TRUE if the name is safe.
2559
2560C<what> and C<op_name> are used in any warning.
2561
2562Used by the C<IS_SAFE_SYSCALL()> macro.
2563
2564=cut
2565*/
2566
2567PERL_STATIC_INLINE bool
2568Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2569{
2570    /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2571     * perl itself uses xce*() functions which accept 8-bit strings.
2572     */
2573
2574    PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2575
2576    if (len > 1) {
2577        char *null_at;
2578        if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2579                SETERRNO(ENOENT, LIB_INVARG);
2580                Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2581                                   "Invalid \\0 character in %s for %s: %s\\0%s",
2582                                   what, op_name, pv, null_at+1);
2583                return FALSE;
2584        }
2585    }
2586
2587    return TRUE;
2588}
2589
2590/*
2591
2592Return true if the supplied filename has a newline character
2593immediately before the first (hopefully only) NUL.
2594
2595My original look at this incorrectly used the len from SvPV(), but
2596that's incorrect, since we allow for a NUL in pv[len-1].
2597
2598So instead, strlen() and work from there.
2599
2600This allow for the user reading a filename, forgetting to chomp it,
2601then calling:
2602
2603  open my $foo, "$file\0";
2604
2605*/
2606
2607#ifdef PERL_CORE
2608
2609PERL_STATIC_INLINE bool
2610S_should_warn_nl(const char *pv)
2611{
2612    STRLEN len;
2613
2614    PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2615
2616    len = strlen(pv);
2617
2618    return len > 0 && pv[len-1] == '\n';
2619}
2620
2621#endif
2622
2623#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2624
2625PERL_STATIC_INLINE bool
2626S_lossless_NV_to_IV(const NV nv, IV *ivp)
2627{
2628    /* This function determines if the input NV 'nv' may be converted without
2629     * loss of data to an IV.  If not, it returns FALSE taking no other action.
2630     * But if it is possible, it does the conversion, returning TRUE, and
2631     * storing the converted result in '*ivp' */
2632
2633    PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2634
2635#  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2636    /* Normally any comparison with a NaN returns false; if we can't rely
2637     * on that behaviour, check explicitly */
2638    if (UNLIKELY(Perl_isnan(nv))) {
2639        return FALSE;
2640    }
2641#  endif
2642
2643    /* Written this way so that with an always-false NaN comparison we
2644     * return false */
2645    if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2646        return FALSE;
2647    }
2648
2649    if ((IV) nv != nv) {
2650        return FALSE;
2651    }
2652
2653    *ivp = (IV) nv;
2654    return TRUE;
2655}
2656
2657#endif
2658
2659/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2660
2661#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2662
2663#define MAX_CHARSET_NAME_LENGTH 2
2664
2665PERL_STATIC_INLINE const char *
2666S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2667{
2668    PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2669
2670    /* Returns a string that corresponds to the name of the regex character set
2671     * given by 'flags', and *lenp is set the length of that string, which
2672     * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2673
2674    *lenp = 1;
2675    switch (get_regex_charset(flags)) {
2676        case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2677        case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2678        case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2679        case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2680        case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2681            *lenp = 2;
2682            return ASCII_MORE_RESTRICT_PAT_MODS;
2683    }
2684    /* The NOT_REACHED; hides an assert() which has a rather complex
2685     * definition in perl.h. */
2686    NOT_REACHED; /* NOTREACHED */
2687    return "?";	    /* Unknown */
2688}
2689
2690#endif
2691
2692/*
2693
2694Return false if any get magic is on the SV other than taint magic.
2695
2696*/
2697
2698PERL_STATIC_INLINE bool
2699Perl_sv_only_taint_gmagic(SV *sv)
2700{
2701    MAGIC *mg = SvMAGIC(sv);
2702
2703    PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2704
2705    while (mg) {
2706        if (mg->mg_type != PERL_MAGIC_taint
2707            && !(mg->mg_flags & MGf_GSKIP)
2708            && mg->mg_virtual->svt_get) {
2709            return FALSE;
2710        }
2711        mg = mg->mg_moremagic;
2712    }
2713
2714    return TRUE;
2715}
2716
2717/* ------------------ cop.h ------------------------------------------- */
2718
2719/* implement GIMME_V() macro */
2720
2721PERL_STATIC_INLINE U8
2722Perl_gimme_V(pTHX)
2723{
2724    I32 cxix;
2725    U8  gimme = (PL_op->op_flags & OPf_WANT);
2726
2727    if (gimme)
2728        return gimme;
2729    cxix = PL_curstackinfo->si_cxsubix;
2730    if (cxix < 0)
2731        return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2732    assert(cxstack[cxix].blk_gimme & G_WANT);
2733    return (cxstack[cxix].blk_gimme & G_WANT);
2734}
2735
2736
2737/* Enter a block. Push a new base context and return its address. */
2738
2739PERL_STATIC_INLINE PERL_CONTEXT *
2740Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2741{
2742    PERL_CONTEXT * cx;
2743
2744    PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2745
2746    CXINC;
2747    cx = CX_CUR();
2748    cx->cx_type        = type;
2749    cx->blk_gimme      = gimme;
2750    cx->blk_oldsaveix  = saveix;
2751    cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2752    cx->blk_oldcop     = PL_curcop;
2753    cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2754    cx->blk_oldscopesp = PL_scopestack_ix;
2755    cx->blk_oldpm      = PL_curpm;
2756    cx->blk_old_tmpsfloor = PL_tmps_floor;
2757
2758    PL_tmps_floor        = PL_tmps_ix;
2759    CX_DEBUG(cx, "PUSH");
2760    return cx;
2761}
2762
2763
2764/* Exit a block (RETURN and LAST). */
2765
2766PERL_STATIC_INLINE void
2767Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2768{
2769    PERL_ARGS_ASSERT_CX_POPBLOCK;
2770
2771    CX_DEBUG(cx, "POP");
2772    /* these 3 are common to cx_popblock and cx_topblock */
2773    PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2774    PL_scopestack_ix = cx->blk_oldscopesp;
2775    PL_curpm         = cx->blk_oldpm;
2776
2777    /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2778     * and leaves a CX entry lying around for repeated use, so
2779     * skip for multicall */                  \
2780    assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2781            || PL_savestack_ix == cx->blk_oldsaveix);
2782    PL_curcop     = cx->blk_oldcop;
2783    PL_tmps_floor = cx->blk_old_tmpsfloor;
2784}
2785
2786/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2787 * Whereas cx_popblock() restores the state to the point just before
2788 * cx_pushblock() was called,  cx_topblock() restores it to the point just
2789 * *after* cx_pushblock() was called. */
2790
2791PERL_STATIC_INLINE void
2792Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2793{
2794    PERL_ARGS_ASSERT_CX_TOPBLOCK;
2795
2796    CX_DEBUG(cx, "TOP");
2797    /* these 3 are common to cx_popblock and cx_topblock */
2798    PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2799    PL_scopestack_ix = cx->blk_oldscopesp;
2800    PL_curpm         = cx->blk_oldpm;
2801
2802    PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2803}
2804
2805
2806PERL_STATIC_INLINE void
2807Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2808{
2809    U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2810
2811    PERL_ARGS_ASSERT_CX_PUSHSUB;
2812
2813    PERL_DTRACE_PROBE_ENTRY(cv);
2814    cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2815    PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2816    cx->blk_sub.cv = cv;
2817    cx->blk_sub.olddepth = CvDEPTH(cv);
2818    cx->blk_sub.prevcomppad = PL_comppad;
2819    cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2820    cx->blk_sub.retop = retop;
2821    SvREFCNT_inc_simple_void_NN(cv);
2822    cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2823}
2824
2825
2826/* subsets of cx_popsub() */
2827
2828PERL_STATIC_INLINE void
2829Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2830{
2831    CV *cv;
2832
2833    PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2834    assert(CxTYPE(cx) == CXt_SUB);
2835
2836    PL_comppad = cx->blk_sub.prevcomppad;
2837    PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2838    cv = cx->blk_sub.cv;
2839    CvDEPTH(cv) = cx->blk_sub.olddepth;
2840    cx->blk_sub.cv = NULL;
2841    SvREFCNT_dec(cv);
2842    PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2843}
2844
2845
2846/* handle the @_ part of leaving a sub */
2847
2848PERL_STATIC_INLINE void
2849Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2850{
2851    AV *av;
2852
2853    PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2854    assert(CxTYPE(cx) == CXt_SUB);
2855    assert(AvARRAY(MUTABLE_AV(
2856        PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2857                CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2858
2859    CX_POP_SAVEARRAY(cx);
2860    av = MUTABLE_AV(PAD_SVl(0));
2861    if (UNLIKELY(AvREAL(av)))
2862        /* abandon @_ if it got reified */
2863        clear_defarray(av, 0);
2864    else {
2865        CLEAR_ARGARRAY(av);
2866    }
2867}
2868
2869
2870PERL_STATIC_INLINE void
2871Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2872{
2873    PERL_ARGS_ASSERT_CX_POPSUB;
2874    assert(CxTYPE(cx) == CXt_SUB);
2875
2876    PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2877
2878    if (CxHASARGS(cx))
2879        cx_popsub_args(cx);
2880    cx_popsub_common(cx);
2881}
2882
2883
2884PERL_STATIC_INLINE void
2885Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2886{
2887    PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2888
2889    cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2890    PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2891    cx->blk_format.cv          = cv;
2892    cx->blk_format.retop       = retop;
2893    cx->blk_format.gv          = gv;
2894    cx->blk_format.dfoutgv     = PL_defoutgv;
2895    cx->blk_format.prevcomppad = PL_comppad;
2896    cx->blk_u16                = 0;
2897
2898    SvREFCNT_inc_simple_void_NN(cv);
2899    CvDEPTH(cv)++;
2900    SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2901}
2902
2903
2904PERL_STATIC_INLINE void
2905Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2906{
2907    CV *cv;
2908    GV *dfout;
2909
2910    PERL_ARGS_ASSERT_CX_POPFORMAT;
2911    assert(CxTYPE(cx) == CXt_FORMAT);
2912
2913    dfout = cx->blk_format.dfoutgv;
2914    setdefout(dfout);
2915    cx->blk_format.dfoutgv = NULL;
2916    SvREFCNT_dec_NN(dfout);
2917
2918    PL_comppad = cx->blk_format.prevcomppad;
2919    PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2920    cv = cx->blk_format.cv;
2921    cx->blk_format.cv = NULL;
2922    --CvDEPTH(cv);
2923    SvREFCNT_dec_NN(cv);
2924    PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2925}
2926
2927
2928PERL_STATIC_INLINE void
2929Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2930{
2931    cx->blk_eval.retop         = retop;
2932    cx->blk_eval.old_namesv    = namesv;
2933    cx->blk_eval.old_eval_root = PL_eval_root;
2934    cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2935    cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2936    cx->blk_eval.cur_top_env   = PL_top_env;
2937
2938    assert(!(PL_in_eval     & ~ 0x3F));
2939    assert(!(PL_op->op_type & ~0x1FF));
2940    cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2941}
2942
2943PERL_STATIC_INLINE void
2944Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2945{
2946    PERL_ARGS_ASSERT_CX_PUSHEVAL;
2947
2948    Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2949
2950    cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
2951    PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2952}
2953
2954PERL_STATIC_INLINE void
2955Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2956{
2957    PERL_ARGS_ASSERT_CX_PUSHTRY;
2958
2959    Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2960
2961    /* Don't actually change it, just store the current value so it's restored
2962     * by the common popeval */
2963    cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2964}
2965
2966
2967PERL_STATIC_INLINE void
2968Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2969{
2970    SV *sv;
2971
2972    PERL_ARGS_ASSERT_CX_POPEVAL;
2973    assert(CxTYPE(cx) == CXt_EVAL);
2974
2975    PL_in_eval = CxOLD_IN_EVAL(cx);
2976    assert(!(PL_in_eval & 0xc0));
2977    PL_eval_root = cx->blk_eval.old_eval_root;
2978    sv = cx->blk_eval.cur_text;
2979    if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2980        cx->blk_eval.cur_text = NULL;
2981        SvREFCNT_dec_NN(sv);
2982    }
2983
2984    sv = cx->blk_eval.old_namesv;
2985    if (sv) {
2986        cx->blk_eval.old_namesv = NULL;
2987        SvREFCNT_dec_NN(sv);
2988    }
2989    PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2990}
2991
2992
2993/* push a plain loop, i.e.
2994 *     { block }
2995 *     while (cond) { block }
2996 *     for (init;cond;continue) { block }
2997 * This loop can be last/redo'ed etc.
2998 */
2999
3000PERL_STATIC_INLINE void
3001Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3002{
3003    PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3004    cx->blk_loop.my_op = cLOOP;
3005}
3006
3007
3008/* push a true for loop, i.e.
3009 *     for var (list) { block }
3010 */
3011
3012PERL_STATIC_INLINE void
3013Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3014{
3015    PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3016
3017    /* this one line is common with cx_pushloop_plain */
3018    cx->blk_loop.my_op = cLOOP;
3019
3020    cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3021    cx->blk_loop.itersave      = itersave;
3022#ifdef USE_ITHREADS
3023    cx->blk_loop.oldcomppad = PL_comppad;
3024#endif
3025}
3026
3027
3028/* pop all loop types, including plain */
3029
3030PERL_STATIC_INLINE void
3031Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3032{
3033    PERL_ARGS_ASSERT_CX_POPLOOP;
3034
3035    assert(CxTYPE_is_LOOP(cx));
3036    if (  CxTYPE(cx) == CXt_LOOP_ARY
3037       || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3038    {
3039        /* Free ary or cur. This assumes that state_u.ary.ary
3040         * aligns with state_u.lazysv.cur. See cx_dup() */
3041        SV *sv = cx->blk_loop.state_u.lazysv.cur;
3042        cx->blk_loop.state_u.lazysv.cur = NULL;
3043        SvREFCNT_dec_NN(sv);
3044        if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3045            sv = cx->blk_loop.state_u.lazysv.end;
3046            cx->blk_loop.state_u.lazysv.end = NULL;
3047            SvREFCNT_dec_NN(sv);
3048        }
3049    }
3050    if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3051        SV *cursv;
3052        SV **svp = (cx)->blk_loop.itervar_u.svp;
3053        if ((cx->cx_type & CXp_FOR_GV))
3054            svp = &GvSV((GV*)svp);
3055        cursv = *svp;
3056        *svp = cx->blk_loop.itersave;
3057        cx->blk_loop.itersave = NULL;
3058        SvREFCNT_dec(cursv);
3059    }
3060    if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF))
3061        SvREFCNT_dec(cx->blk_loop.itervar_u.svp);
3062}
3063
3064
3065PERL_STATIC_INLINE void
3066Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3067{
3068    PERL_ARGS_ASSERT_CX_PUSHWHEN;
3069
3070    cx->blk_givwhen.leave_op = cLOGOP->op_other;
3071}
3072
3073
3074PERL_STATIC_INLINE void
3075Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3076{
3077    PERL_ARGS_ASSERT_CX_POPWHEN;
3078    assert(CxTYPE(cx) == CXt_WHEN);
3079
3080    PERL_UNUSED_ARG(cx);
3081    PERL_UNUSED_CONTEXT;
3082    /* currently NOOP */
3083}
3084
3085
3086PERL_STATIC_INLINE void
3087Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3088{
3089    PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3090
3091    cx->blk_givwhen.leave_op = cLOGOP->op_other;
3092    cx->blk_givwhen.defsv_save = orig_defsv;
3093}
3094
3095
3096PERL_STATIC_INLINE void
3097Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3098{
3099    SV *sv;
3100
3101    PERL_ARGS_ASSERT_CX_POPGIVEN;
3102    assert(CxTYPE(cx) == CXt_GIVEN);
3103
3104    sv = GvSV(PL_defgv);
3105    GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3106    cx->blk_givwhen.defsv_save = NULL;
3107    SvREFCNT_dec(sv);
3108}
3109
3110/*
3111=for apidoc newPADxVOP
3112
3113Constructs, checks and returns an op containing a pad offset.  C<type> is
3114the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
3115or C<OP_PADCV>.  The returned op will have the C<op_targ> field set by
3116the C<padix> argument.
3117
3118This is convenient when constructing a large optree in nested function
3119calls, as it avoids needing to store the pad op directly to set the
3120C<op_targ> field as a side-effect. For example
3121
3122    o = op_append_elem(OP_LINESEQ, o,
3123        newPADxVOP(OP_PADSV, 0, padix));
3124
3125=cut
3126*/
3127
3128PERL_STATIC_INLINE OP *
3129Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
3130{
3131    PERL_ARGS_ASSERT_NEWPADXVOP;
3132
3133    assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
3134            || type == OP_PADCV);
3135    OP *o = newOP(type, flags);
3136    o->op_targ = padix;
3137    return o;
3138}
3139
3140/* ------------------ util.h ------------------------------------------- */
3141
3142/*
3143=for apidoc_section $string
3144
3145=for apidoc foldEQ
3146
3147Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3148same
3149case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
3150match themselves and their opposite case counterparts.  Non-cased and non-ASCII
3151range bytes match only themselves.
3152
3153=cut
3154*/
3155
3156PERL_STATIC_INLINE I32
3157Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
3158{
3159    const U8 *a = (const U8 *)s1;
3160    const U8 *b = (const U8 *)s2;
3161
3162    PERL_ARGS_ASSERT_FOLDEQ;
3163
3164    assert(len >= 0);
3165
3166    while (len--) {
3167        if (*a != *b && *a != PL_fold[*b])
3168            return 0;
3169        a++,b++;
3170    }
3171    return 1;
3172}
3173
3174PERL_STATIC_INLINE I32
3175Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
3176{
3177    /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
3178     * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3179     * does not check for this.  Nor does it check that the strings each have
3180     * at least 'len' characters. */
3181
3182    const U8 *a = (const U8 *)s1;
3183    const U8 *b = (const U8 *)s2;
3184
3185    PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3186
3187    assert(len >= 0);
3188
3189    while (len--) {
3190        if (*a != *b && *a != PL_fold_latin1[*b]) {
3191            return 0;
3192        }
3193        a++, b++;
3194    }
3195    return 1;
3196}
3197
3198/*
3199=for apidoc_section $locale
3200=for apidoc foldEQ_locale
3201
3202Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3203same case-insensitively in the current locale; false otherwise.
3204
3205=cut
3206*/
3207
3208PERL_STATIC_INLINE I32
3209Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
3210{
3211    const U8 *a = (const U8 *)s1;
3212    const U8 *b = (const U8 *)s2;
3213
3214    PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3215
3216    assert(len >= 0);
3217
3218    while (len--) {
3219        if (*a != *b && *a != PL_fold_locale[*b]) {
3220            DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3221                     "%s:%d: Our records indicate %02x is not a fold of %02x"
3222                     " or its mate %02x\n",
3223                     __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
3224
3225            return 0;
3226        }
3227        a++,b++;
3228    }
3229    return 1;
3230}
3231
3232/*
3233=for apidoc_section $string
3234=for apidoc my_strnlen
3235
3236The C library C<strnlen> if available, or a Perl implementation of it.
3237
3238C<my_strnlen()> computes the length of the string, up to C<maxlen>
3239characters.  It will never attempt to address more than C<maxlen>
3240characters, making it suitable for use with strings that are not
3241guaranteed to be NUL-terminated.
3242
3243=cut
3244
3245Description stolen from http://man.openbsd.org/strnlen.3,
3246implementation stolen from PostgreSQL.
3247*/
3248#ifndef HAS_STRNLEN
3249
3250PERL_STATIC_INLINE Size_t
3251Perl_my_strnlen(const char *str, Size_t maxlen)
3252{
3253    const char *end = (char *) memchr(str, '\0', maxlen);
3254
3255    PERL_ARGS_ASSERT_MY_STRNLEN;
3256
3257    if (end == NULL) return maxlen;
3258    return end - str;
3259}
3260
3261#endif
3262
3263#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3264
3265PERL_STATIC_INLINE void *
3266S_my_memrchr(const char * s, const char c, const STRLEN len)
3267{
3268    /* memrchr(), since many platforms lack it */
3269
3270    const char * t = s + len - 1;
3271
3272    PERL_ARGS_ASSERT_MY_MEMRCHR;
3273
3274    while (t >= s) {
3275        if (*t == c) {
3276            return (void *) t;
3277        }
3278        t--;
3279    }
3280
3281    return NULL;
3282}
3283
3284#endif
3285
3286PERL_STATIC_INLINE char *
3287Perl_mortal_getenv(const char * str)
3288{
3289    /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3290     *
3291     * It's (mostly) thread-safe because it uses a mutex to prevent other
3292     * threads (that look at this mutex) from destroying the result before this
3293     * routine has a chance to copy the result to a place that won't be
3294     * destroyed before the caller gets a chance to handle it.  That place is a
3295     * mortal SV.  khw chose this over SAVEFREEPV because he is under the
3296     * impression that the SV will hang around longer under more circumstances
3297     *
3298     * The reason it isn't completely thread-safe is that other code could
3299     * simply not pay attention to the mutex.  All of the Perl core uses the
3300     * mutex, but it is possible for code from, say XS, to not use this mutex,
3301     * defeating the safety.
3302     *
3303     * getenv() returns, in some implementations, a pointer to a spot in the
3304     * **environ array, which could be invalidated at any time by this or
3305     * another thread changing the environment.  Other implementations copy the
3306     * **environ value to a static buffer, returning a pointer to that.  That
3307     * buffer might or might not be invalidated by a getenv() call in another
3308     * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
3309     * many getenv() calls can safely be running simultaneously, so a
3310     * many-reader (but no simultaneous writers) lock is ok.  There is a
3311     * Configure probe to see if another thread destroys the buffer, and the
3312     * mutex is defined accordingly.
3313     *
3314     * But in all cases, using the mutex prevents these problems, as long as
3315     * all code uses the same mutex.
3316     *
3317     * A complication is that this can be called during phases where the
3318     * mortalization process isn't available.  These are in interpreter
3319     * destruction or early in construction.  khw believes that at these times
3320     * there shouldn't be anything else going on, so plain getenv is safe AS
3321     * LONG AS the caller acts on the return before calling it again. */
3322
3323    char * ret;
3324    dTHX;
3325
3326    PERL_ARGS_ASSERT_MORTAL_GETENV;
3327
3328    /* Can't mortalize without stacks.  khw believes that no other threads
3329     * should be running, so no need to lock things, and this may be during a
3330     * phase when locking isn't even available */
3331    if (UNLIKELY(PL_scopestack_ix == 0)) {
3332        return getenv(str);
3333    }
3334
3335#ifdef PERL_MEM_LOG
3336
3337    /* A major complication arises under PERL_MEM_LOG.  When that is active,
3338     * every memory allocation may result in logging, depending on the value of
3339     * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
3340     * saving ENV{foo}'s value (but before saving it), the logging code will
3341     * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
3342     * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3343     * lock a boolean mutex recursively); 3) destroying the getenv() static
3344     * buffer; or 4) destroying the temporary created by this for the copy
3345     * causes a log entry to be made which could cause a new temporary to be
3346     * created, which will need to be destroyed at some point, leading to an
3347     * infinite loop.
3348     *
3349     * The solution adopted here (after some gnashing of teeth) is to detect
3350     * the recursive calls and calls from the logger, and treat them specially.
3351     * Let's say we want to do getenv("foo").  We first find
3352     * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3353     * variable, so no temporary is required.  Then we do getenv(foo), and in
3354     * the process of creating a temporary to save it, this function will be
3355     * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
3356     * we detect that it is such a call and return our saved value instead of
3357     * locking and doing a new getenv().  This solves all of problems 1), 2),
3358     * and 3).  Because all the getenv()s are done while the mutex is locked,
3359     * the state cannot have changed.  To solve 4), we don't create a temporary
3360     * when this is called from the logging code.  That code disposes of the
3361     * return value while the mutex is still locked.
3362     *
3363     * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3364     * digits and 3 particular letters are significant; the rest are ignored by
3365     * the memory logging code.  Thus the per-interpreter variable only needs
3366     * to be large enough to save the significant information, the size of
3367     * which is known at compile time.  The first byte is extra, reserved for
3368     * flags for our use.  To protect against overflowing, only the reserved
3369     * byte, as many digits as don't overflow, and the three letters are
3370     * stored.
3371     *
3372     * The reserved byte has two bits:
3373     *      0x1 if set indicates that if we get here, it is a recursive call of
3374     *          getenv()
3375     *      0x2 if set indicates that the call is from the logging code.
3376     *
3377     * If the flag indicates this is a recursive call, just return the stored
3378     * value of PL_mem_log;  An empty value gets turned into NULL. */
3379    if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3380        if (PL_mem_log[1] == '\0') {
3381            return NULL;
3382        } else {
3383            return PL_mem_log + 1;
3384        }
3385    }
3386
3387#endif
3388
3389    GETENV_LOCK;
3390
3391#ifdef PERL_MEM_LOG
3392
3393    /* Here we are in a critical section.  As explained above, we do our own
3394     * getenv(PERL_MEM_LOG), saving the result safely. */
3395    ret = getenv("PERL_MEM_LOG");
3396    if (ret == NULL) {  /* No logging active */
3397
3398        /* Return that immediately if called from the logging code */
3399        if (PL_mem_log[0] & 0x2) {
3400            GETENV_UNLOCK;
3401            return NULL;
3402        }
3403
3404        PL_mem_log[1] = '\0';
3405    }
3406    else {
3407        char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
3408
3409        /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3410         * extremely long string.  But we want only a few characters from it.
3411         * PL_mem_log has been made large enough to hold just the ones we need.
3412         * First the file descriptor. */
3413        if (isDIGIT(*ret)) {
3414            const char * s = ret;
3415            if (UNLIKELY(*s == '0')) {
3416
3417                /* Reduce multiple leading zeros to a single one.  This is to
3418                 * allow the caller to change what to do with leading zeros. */
3419                *mem_log_meat++ = '0';
3420                s++;
3421                while (*s == '0') {
3422                    s++;
3423                }
3424            }
3425
3426            /* If the input overflows, copy just enough for the result to also
3427             * overflow, plus 1 to make sure */
3428            while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3429                *mem_log_meat++ = *s++;
3430            }
3431        }
3432
3433        /* Then each of the four significant characters */
3434        if (strchr(ret, 'm')) {
3435            *mem_log_meat++ = 'm';
3436        }
3437        if (strchr(ret, 's')) {
3438            *mem_log_meat++ = 's';
3439        }
3440        if (strchr(ret, 't')) {
3441            *mem_log_meat++ = 't';
3442        }
3443        if (strchr(ret, 'c')) {
3444            *mem_log_meat++ = 'c';
3445        }
3446        *mem_log_meat = '\0';
3447
3448        assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3449    }
3450
3451    /* If we are being called from the logger, it only needs the significant
3452     * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3453    if (PL_mem_log[0] & 0x2) {
3454        assert(strEQ(str, "PERL_MEM_LOG"));
3455        GETENV_UNLOCK;
3456        return PL_mem_log + 1;
3457    }
3458
3459    /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
3460     * is coming from other than the logging code, so it should be treated the
3461     * same as any other getenv(), returning the full value, not just the
3462     * significant part, and having its value saved.  Set the flag that
3463     * indicates any call to this routine will be a recursion from here */
3464    PL_mem_log[0] = 0x1;
3465
3466#endif
3467
3468    /* Now get the value of the real desired variable, and save a copy */
3469    ret = getenv(str);
3470
3471    if (ret != NULL) {
3472        ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3473    }
3474
3475    GETENV_UNLOCK;
3476
3477#ifdef PERL_MEM_LOG
3478
3479    /* Clear the buffer */
3480    Zero(PL_mem_log, sizeof(PL_mem_log), char);
3481
3482#endif
3483
3484    return ret;
3485}
3486
3487PERL_STATIC_INLINE bool
3488Perl_sv_isbool(pTHX_ const SV *sv)
3489{
3490    return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
3491}
3492
3493#ifdef USE_ITHREADS
3494
3495PERL_STATIC_INLINE AV *
3496Perl_cop_file_avn(pTHX_ const COP *cop) {
3497
3498    PERL_ARGS_ASSERT_COP_FILE_AVN;
3499
3500    const char *file = CopFILE(cop);
3501    if (file) {
3502        GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3503        if (gv) {
3504            return GvAVn(gv);
3505        }
3506        else
3507            return NULL;
3508     }
3509     else
3510         return NULL;
3511}
3512
3513#endif
3514
3515PERL_STATIC_INLINE PADNAME *
3516Perl_padname_refcnt_inc(PADNAME *pn)
3517{
3518    PadnameREFCNT(pn)++;
3519    return pn;
3520}
3521
3522PERL_STATIC_INLINE PADNAMELIST *
3523Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl)
3524{
3525    PadnamelistREFCNT(pnl)++;
3526    return pnl;
3527}
3528
3529/* copy a string to a safe spot */
3530
3531/*
3532=for apidoc_section $string
3533=for apidoc savepv
3534
3535Perl's version of C<strdup()>.  Returns a pointer to a newly allocated
3536string which is a duplicate of C<pv>.  The size of the string is
3537determined by C<strlen()>, which means it may not contain embedded C<NUL>
3538characters and must have a trailing C<NUL>.  To prevent memory leaks, the
3539memory allocated for the new string needs to be freed when no longer needed.
3540This can be done with the C<L</Safefree>> function, or
3541L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>.
3542
3543On some platforms, Windows for example, all allocated memory owned by a thread
3544is deallocated when that thread ends.  So if you need that not to happen, you
3545need to use the shared memory functions, such as C<L</savesharedpv>>.
3546
3547=cut
3548*/
3549
3550PERL_STATIC_INLINE char *
3551Perl_savepv(pTHX_ const char *pv)
3552{
3553    PERL_UNUSED_CONTEXT;
3554    if (!pv)
3555        return NULL;
3556    else {
3557        char *newaddr;
3558        const STRLEN pvlen = strlen(pv)+1;
3559        Newx(newaddr, pvlen, char);
3560        return (char*)memcpy(newaddr, pv, pvlen);
3561    }
3562}
3563
3564/* same thing but with a known length */
3565
3566/*
3567=for apidoc savepvn
3568
3569Perl's version of what C<strndup()> would be if it existed.  Returns a
3570pointer to a newly allocated string which is a duplicate of the first
3571C<len> bytes from C<pv>, plus a trailing
3572C<NUL> byte.  The memory allocated for
3573the new string can be freed with the C<Safefree()> function.
3574
3575On some platforms, Windows for example, all allocated memory owned by a thread
3576is deallocated when that thread ends.  So if you need that not to happen, you
3577need to use the shared memory functions, such as C<L</savesharedpvn>>.
3578
3579=cut
3580*/
3581
3582PERL_STATIC_INLINE char *
3583Perl_savepvn(pTHX_ const char *pv, Size_t len)
3584{
3585    char *newaddr;
3586    PERL_UNUSED_CONTEXT;
3587
3588    Newx(newaddr,len+1,char);
3589    /* Give a meaning to NULL pointer mainly for the use in sv_magic() */
3590    if (pv) {
3591        /* might not be null terminated */
3592        newaddr[len] = '\0';
3593        return (char *) CopyD(pv,newaddr,len,char);
3594    }
3595    else {
3596        return (char *) ZeroD(newaddr,len+1,char);
3597    }
3598}
3599
3600/*
3601=for apidoc savesvpv
3602
3603A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from
3604the passed in SV using C<SvPV()>
3605
3606On some platforms, Windows for example, all allocated memory owned by a thread
3607is deallocated when that thread ends.  So if you need that not to happen, you
3608need to use the shared memory functions, such as C<L</savesharedsvpv>>.
3609
3610=cut
3611*/
3612
3613PERL_STATIC_INLINE char *
3614Perl_savesvpv(pTHX_ SV *sv)
3615{
3616    STRLEN len;
3617    const char * const pv = SvPV_const(sv, len);
3618    char *newaddr;
3619
3620    PERL_ARGS_ASSERT_SAVESVPV;
3621
3622    ++len;
3623    Newx(newaddr,len,char);
3624    return (char *) CopyD(pv,newaddr,len,char);
3625}
3626
3627/*
3628=for apidoc savesharedsvpv
3629
3630A version of C<savesharedpv()> which allocates the duplicate string in
3631memory which is shared between threads.
3632
3633=cut
3634*/
3635
3636PERL_STATIC_INLINE char *
3637Perl_savesharedsvpv(pTHX_ SV *sv)
3638{
3639    STRLEN len;
3640    const char * const pv = SvPV_const(sv, len);
3641
3642    PERL_ARGS_ASSERT_SAVESHAREDSVPV;
3643
3644    return savesharedpvn(pv, len);
3645}
3646
3647#ifndef PERL_GET_CONTEXT_DEFINED
3648
3649/*
3650=for apidoc_section $embedding
3651=for apidoc get_context
3652
3653Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead.
3654
3655=cut
3656*/
3657
3658PERL_STATIC_INLINE void *
3659Perl_get_context(void)
3660{
3661#  if defined(USE_ITHREADS)
3662#    ifdef OLD_PTHREADS_API
3663    pthread_addr_t t;
3664    int error = pthread_getspecific(PL_thr_key, &t);
3665    if (error)
3666        Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error);
3667    return (void*)t;
3668#    elif defined(I_MACH_CTHREADS)
3669    return (void*)cthread_data(cthread_self());
3670#    else
3671    return (void*)PTHREAD_GETSPECIFIC(PL_thr_key);
3672#    endif
3673#  else
3674    return (void*)NULL;
3675#  endif
3676}
3677
3678#endif
3679
3680PERL_STATIC_INLINE MGVTBL*
3681Perl_get_vtbl(pTHX_ int vtbl_id)
3682{
3683    PERL_UNUSED_CONTEXT;
3684
3685    return (vtbl_id < 0 || vtbl_id >= magic_vtable_max)
3686        ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id;
3687}
3688
3689/*
3690=for apidoc my_strlcat
3691
3692The C library C<strlcat> if available, or a Perl implementation of it.
3693This operates on C C<NUL>-terminated strings.
3694
3695C<my_strlcat()> appends string C<src> to the end of C<dst>.  It will append at
3696most S<C<size - strlen(dst) - 1>> characters.  It will then C<NUL>-terminate,
3697unless C<size> is 0 or the original C<dst> string was longer than C<size> (in
3698practice this should not happen as it means that either C<size> is incorrect or
3699that C<dst> is not a proper C<NUL>-terminated string).
3700
3701Note that C<size> is the full size of the destination buffer and
3702the result is guaranteed to be C<NUL>-terminated if there is room.  Note that
3703room for the C<NUL> should be included in C<size>.
3704
3705The return value is the total length that C<dst> would have if C<size> is
3706sufficiently large.  Thus it is the initial length of C<dst> plus the length of
3707C<src>.  If C<size> is smaller than the return, the excess was not appended.
3708
3709=cut
3710
3711Description stolen from http://man.openbsd.org/strlcat.3
3712*/
3713#ifndef HAS_STRLCAT
3714PERL_STATIC_INLINE Size_t
3715Perl_my_strlcat(char *dst, const char *src, Size_t size)
3716{
3717    Size_t used, length, copy;
3718
3719    used = strlen(dst);
3720    length = strlen(src);
3721    if (size > 0 && used < size - 1) {
3722        copy = (length >= size - used) ? size - used - 1 : length;
3723        memcpy(dst + used, src, copy);
3724        dst[used + copy] = '\0';
3725    }
3726    return used + length;
3727}
3728#endif
3729
3730
3731/*
3732=for apidoc my_strlcpy
3733
3734The C library C<strlcpy> if available, or a Perl implementation of it.
3735This operates on C C<NUL>-terminated strings.
3736
3737C<my_strlcpy()> copies up to S<C<size - 1>> characters from the string C<src>
3738to C<dst>, C<NUL>-terminating the result if C<size> is not 0.
3739
3740The return value is the total length C<src> would be if the copy completely
3741succeeded.  If it is larger than C<size>, the excess was not copied.
3742
3743=cut
3744
3745Description stolen from http://man.openbsd.org/strlcpy.3
3746*/
3747#ifndef HAS_STRLCPY
3748PERL_STATIC_INLINE Size_t
3749Perl_my_strlcpy(char *dst, const char *src, Size_t size)
3750{
3751    Size_t length, copy;
3752
3753    length = strlen(src);
3754    if (size > 0) {
3755        copy = (length >= size) ? size - 1 : length;
3756        memcpy(dst, src, copy);
3757        dst[copy] = '\0';
3758    }
3759    return length;
3760}
3761#endif
3762
3763/*
3764 * ex: set ts=8 sts=4 sw=4 et:
3765 */
3766