Cross Reference: /openbsd-current/gnu/usr.bin/perl/dist/Devel-PPPort/parts/inc/utf8

=provides

__UNDEFINED__
SvUTF8
UTF8f
UTF8fARG
utf8_to_uvchr_buf
sv_len_utf8
sv_len_utf8_nomg

=implementation

#ifdef SVf_UTF8
__UNDEFINED__ SvUTF8(sv) (SvFLAGS(sv) & SVf_UTF8)
#endif

#if { VERSION == 5.19.1 } /* 5.19.1 does not have UTF8fARG, only broken UTF8f */
#undef UTF8f
#endif

#ifdef SVf_UTF8
__UNDEFINED__  UTF8f           SVf
__UNDEFINED__  UTF8fARG(u,l,p) newSVpvn_flags((p), (l), ((u) ? SVf_UTF8 : 0) | SVs_TEMP)
#endif

#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))

__UNDEFINED__ UNICODE_REPLACEMENT  0xFFFD

#ifdef UTF8_MAXLEN
__UNDEFINED__  UTF8_MAXBYTES   UTF8_MAXLEN
#endif

__UNDEF_NOT_PROVIDED__ UTF_START_MARK(len)                                   \
                    (((len) >  7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))

/* On non-EBCDIC was valid for some releases earlier than this, but easier to
 * just do one check */
#if { VERSION < 5.018 }
#  undef UTF8_MAXBYTES_CASE
#endif

#if 'A' == 65
#  define D_PPP_BYTE_INFO_BITS 6  /* 6 bits meaningful in continuation bytes */
__UNDEFINED__          UTF8_MAXBYTES_CASE 13
#else
#  define D_PPP_BYTE_INFO_BITS 5  /* 5 bits meaningful in continuation bytes */
__UNDEFINED__          UTF8_MAXBYTES_CASE 15
#endif

__UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT D_PPP_BYTE_INFO_BITS

#ifdef NATIVE_TO_UTF
__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c)  NATIVE_TO_UTF(c)
#else   /* System doesn't support EBCDIC */
__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c)  (c)
#endif

#ifdef UTF_TO_NATIVE
__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c)  UTF_TO_NATIVE(c)
#else   /* System doesn't support EBCDIC */
__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c)  (c)
#endif

__UNDEF_NOT_PROVIDED__ UTF_START_MASK(len)                                 \
                                (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
__UNDEF_NOT_PROVIDED__ UTF_IS_CONTINUATION_MASK                            \
                                    ((U8) (0xFF << UTF_ACCUMULATION_SHIFT))
__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MARK                               \
                                          (UTF_IS_CONTINUATION_MASK & 0xB0)
__UNDEF_NOT_PROVIDED__ UTF_MIN_START_BYTE                                  \
    ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))

__UNDEF_NOT_PROVIDED__ UTF_MIN_ABOVE_LATIN1_BYTE                           \
                    ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))

#if { VERSION < 5.007 }     /* Was the complement of what should have been */
#  undef UTF8_IS_DOWNGRADEABLE_START
#endif
__UNDEF_NOT_PROVIDED__ UTF8_IS_DOWNGRADEABLE_START(c)                       \
                inRANGE(NATIVE_UTF8_TO_I8(c),                               \
                        UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1)
__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MASK                                \
                                ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))

__UNDEF_NOT_PROVIDED__ UTF8_ACCUMULATE(base, added)                         \
                                  (((base) << UTF_ACCUMULATION_SHIFT)       \
                                   | ((NATIVE_UTF8_TO_I8(added))            \
                                       & UTF_CONTINUATION_MASK))

__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANYUV                 0
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_EMPTY            0x0001
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_CONTINUATION     0x0002
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_SHORT            0x0008
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_LONG             0x0010
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_OVERFLOW         0x0080
__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANY  ( UTF8_ALLOW_CONTINUATION      \
                                        |UTF8_ALLOW_NON_CONTINUATION  \
                                        |UTF8_ALLOW_SHORT             \
                                        |UTF8_ALLOW_LONG              \
                                        |UTF8_ALLOW_OVERFLOW)

#if defined UTF8SKIP

/* Don't use official versions because they use MIN, which may not be available */
#undef UTF8_SAFE_SKIP
#undef UTF8_CHK_SKIP

__UNDEFINED__  UTF8_SAFE_SKIP(s, e)  (                                          \
                                      ((((e) - (s)) <= 0)                       \
                                      ? 0                                       \
                                      : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))

__UNDEFINED__ UTF8_CHK_SKIP(s)                                                  \
    (s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)),  \
                                      UTF8SKIP(s))))
/* UTF8_CHK_SKIP depends on my_strnlen */
__UNDEFINED__ UTF8_SKIP(s)  UTF8SKIP(s)
#endif

#if 'A' == 65
__UNDEFINED__ UTF8_IS_INVARIANT(c)   isASCII(c)
#else
__UNDEFINED__ UTF8_IS_INVARIANT(c)  (isASCII(c) || isCNTRL_L1(c))
#endif

__UNDEFINED__ UVCHR_IS_INVARIANT(c)  UTF8_IS_INVARIANT(c)

#ifdef UVCHR_IS_INVARIANT
#  if 'A' != 65 || UVSIZE < 8
     /* 32 bit platform, which includes UTF-EBCDIC on the releases this is
      * backported to */
#    define D_PPP_UVCHR_SKIP_UPPER(c) 7
#  else
#    define D_PPP_UVCHR_SKIP_UPPER(c)                                       \
        (((WIDEST_UTYPE) (c)) <                                             \
         (((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13)
#  endif

__UNDEFINED__ UVCHR_SKIP(c)                                                     \
          UVCHR_IS_INVARIANT(c)                                          ? 1 :  \
          (WIDEST_UTYPE) (c) < (32 * (1U << (    D_PPP_BYTE_INFO_BITS))) ? 2 :  \
          (WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 :  \
          (WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 :  \
          (WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 :  \
          (WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 :  \
          D_PPP_UVCHR_SKIP_UPPER(c)
#endif

#ifdef is_ascii_string
__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)

/* Hint: is_ascii_string, is_invariant_string
   is_utf8_invariant_string() does the same thing and is preferred because its
   name is more accurate as to what it does */
#endif

#ifdef ibcmp_utf8
__UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2)                            \
                                cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2))
#endif

#if defined(is_utf8_string) && defined(UTF8SKIP)
__UNDEFINED__ isUTF8_CHAR(s, e)    (                                            \
    (e) <= (s) || ! is_utf8_string(s, UTF8_SAFE_SKIP(s, e))                     \
    ? 0                                                                         \
    : UTF8SKIP(s))
#endif

#if 'A' == 65
__UNDEFINED__ BOM_UTF8                    "\xEF\xBB\xBF"
__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8  "\xEF\xBF\xBD"
#elif '^' == 95
__UNDEFINED__ BOM_UTF8                    "\xDD\x73\x66\x73"
__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8  "\xDD\x73\x73\x71"
#elif '^' == 176
__UNDEFINED__ BOM_UTF8                    "\xDD\x72\x65\x72"
__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8  "\xDD\x72\x72\x70"
#else
#  error Unknown character set
#endif

#if { VERSION < 5.35.10 }
        /* Versions prior to 5.31.4 accepted things that are now considered
         * malformations, and didn't return -1 on error with warnings enabled.
         * Versions before 5.35.10 dereferenced empty input without checking */
#  undef utf8_to_uvchr_buf
#endif

/* This implementation brings modern, generally more restricted standards to
 * utf8_to_uvchr_buf.  Some of these are security related, and clearly must
 * be done.  But its arguable that the others need not, and hence should not.
 * The reason they're here is that a module that intends to play with the
 * latest perls should be able to work the same in all releases.  An example is
 * that perl no longer accepts any UV for a code point, but limits them to
 * IV_MAX or below.  This is for future internal use of the larger code points.
 * If it turns out that some of these changes are breaking code that isn't
 * intended to work with modern perls, the tighter restrictions could be
 * relaxed.  khw thinks this is unlikely, but has been wrong in the past. */

/* 5.6.0 is the first release with UTF-8, and we don't implement this function
 * there due to its likely lack of still being in use, and the underlying
 * implementation is very different from later ones, without the later
 * safeguards, so would require extra work to deal with */
#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
   /* Choose which underlying implementation to use.  At least one must be
    * present or the perl is too early to handle this function */
#  if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv)
#    if defined(utf8n_to_uvchr)   /* This is the preferred implementation */
#      define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
#    elif /* Must be at least 5.6.1 from #if above;                             \
             If have both regular and _simple, regular has all args */          \
          defined(utf8_to_uv) && defined(utf8_to_uv_simple)
#      define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv
#    elif defined(utf8_to_uvchr)  /* The below won't work well on error input */
#      define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags)          \
                                            utf8_to_uvchr((U8 *)(s), (retlen))
#    else
#      define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags)          \
                                            utf8_to_uv((U8 *)(s), (retlen))
#    endif
#  endif

#  if { NEED utf8_to_uvchr_buf }

UV
utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
#    if { VERSION >= 5.31.4 }   /* But from above, must be < 5.35.10 */
#      if { VERSION != 5.35.9 }

    /* Versions less than 5.35.9 could dereference s on zero length, so
     * pass it something where no harm comes from that. */
    if (send <= s) s = send = (U8 *) "?";
    return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen);

#      else /* Below is 5.35.9, which also works on non-empty input, but
               for empty input, can wrongly dereference, and additionally is
               also just plain broken */
    if (send > s) return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen);
    if (! ckWARN_d(WARN_UTF8)) {
        if (retlen) *retlen = 0;
        return UNICODE_REPLACEMENT;
    }
    else {
        s = send = (U8 *) "?";

        /* Call just for its warning */
        (void) Perl__utf8n_to_uvchr_msgs_helper(s, 0, NULL, 0, NULL, NULL);
        if (retlen) *retlen = (STRLEN) -1;
        return 0;
    }

#      endif
#    else

    UV ret;
    STRLEN curlen;
    bool overflows = 0;
    const U8 *cur_s = s;
    const bool do_warnings = ckWARN_d(WARN_UTF8);
#    if { VERSION < 5.26.0 } && ! defined(EBCDIC)
    STRLEN overflow_length = 0;
#    endif

    if (send > s) {
        curlen = send - s;
    }
    else {
        assert(0);  /* Modern perls die under this circumstance */
        curlen = 0;
        if (! do_warnings) {    /* Handle empty here if no warnings needed */
            if (retlen) *retlen = 0;
            return UNICODE_REPLACEMENT;
        }
    }

#      if { VERSION < 5.26.0 } && ! defined(EBCDIC)

    /* Perl did not properly detect overflow for much of its history on
     * non-EBCDIC platforms, often returning an overlong value which may or may
     * not have been tolerated in the call.  Also, earlier versions, when they
     * did detect overflow, may have disallowed it completely.  Modern ones can
     * replace it with the REPLACEMENT CHARACTER, depending on calling
     * parameters.  Therefore detect it ourselves in  releases it was
     * problematic in. */

    if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {

        /* First, on a 32-bit machine the first byte being at least \xFE
         * automatically is overflow, as it indicates something requiring more
         * than 31 bits */
        if (sizeof(ret) < 8) {
            overflows = 1;
            overflow_length = (*s == 0xFE) ? 7 : 13;
        }
        else {
            const U8 highest[] =    /* 2*63-1 */
                        "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
            const U8 *cur_h = highest;

            for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
                if (UNLIKELY(*cur_s == *cur_h)) {
                    continue;
                }

                /* If this byte is larger than the corresponding highest UTF-8
                 * byte, the sequence overflows; otherwise the byte is less
                 * than (as we handled the equality case above), and so the
                 * sequence doesn't overflow */
                overflows = *cur_s > *cur_h;
                break;

            }

            /* Here, either we set the bool and broke out of the loop, or got
             * to the end and all bytes are the same which indicates it doesn't
             * overflow.  If it did overflow, it would be this number of bytes
             * */
            overflow_length = 13;
        }
    }

    if (UNLIKELY(overflows)) {
        ret = 0;

        if (! do_warnings && retlen) {
            *retlen = overflow_length;
        }
    }
    else

#      endif  /* < 5.26 */

        /* Here, we are either in a release that properly detects overflow, or
         * we have checked for overflow and the next statement is executing as
         * part of the above conditional where we know we don't have overflow.
         *
         * The modern versions allow anything that evaluates to a legal UV, but
         * not overlongs nor an empty input */
        ret = D_PPP_utf8_to_uvchr_buf_callee(
              (U8 *) /* Early perls: no const */
                    s, curlen, retlen,   (UTF8_ALLOW_ANYUV
                                      & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));

#      if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }

    /* But actually, more modern versions restrict the UV to being no more than
     * what an IV can hold, so it could still have gotten it wrong about
     * overflowing. */
    if (UNLIKELY(ret > IV_MAX)) {
        overflows = 1;
    }

#      endif

    if (UNLIKELY(overflows)) {
        if (! do_warnings) {
            if (retlen) {
                *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
                *retlen = D_PPP_MIN(*retlen, curlen);
            }
            return UNICODE_REPLACEMENT;
        }
        else {

            /* We use the error message in use from 5.8-5.26 */
            Perl_warner(aTHX_ packWARN(WARN_UTF8),
                "Malformed UTF-8 character (overflow at 0x%" UVxf
                ", byte 0x%02x, after start byte 0x%02x)",
                ret, *cur_s, *s);
            if (retlen) {
                *retlen = (STRLEN) -1;
            }
            return 0;
        }
    }

    /* Here, did not overflow, but if it failed for some other reason, and
     * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
     * try again, allowing anything.  (Note a return of 0 is ok if the input
     * was '\0') */
    if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {

        /* If curlen is 0, we already handled the case where warnings are
         * disabled, so this 'if' will be true, and so later on, we know that
         * 's' is dereferencible */
        if (do_warnings) {
            if (retlen) {
                *retlen = (STRLEN) -1;
            }
        }
        else {
            ret = D_PPP_utf8_to_uvchr_buf_callee(
                                     (U8 *) /* Early perls: no const */
                                            s, curlen, retlen, UTF8_ALLOW_ANY);
            /* Override with the REPLACEMENT character, as that is what the
             * modern version of this function returns */
            ret = UNICODE_REPLACEMENT;

#      if { VERSION < 5.16.0 }

            /* Versions earlier than this don't necessarily return the proper
             * length.  It should not extend past the end of string, nor past
             * what the first byte indicates the length is, nor past the
             * continuation characters */
            if (retlen && (IV) *retlen >= 0) {
                unsigned int i = 1;

                *retlen = D_PPP_MIN(*retlen, curlen);
                *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
                do {
#        ifdef UTF8_IS_CONTINUATION
                    if (! UTF8_IS_CONTINUATION(s[i]))
#        else       /* Versions without the above don't support EBCDIC anyway */
                    if (s[i] < 0x80 || s[i] > 0xBF)
#        endif
                    {
                        *retlen = i;
                        break;
                    }
                } while (++i < *retlen);
            }

#      endif  /* end of < 5.16.0 */

        }
    }

    return ret;

#    endif    /* end of < 5.31.4 */

}

#  endif
#endif

#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
                        to read past a NUL, making it much less likely to read
                        off the end of the buffer.  A NUL indicates the start
                        of the next character anyway.  If the input isn't
                        NUL-terminated, the function remains unsafe, as it
                        always has been. */

__UNDEFINED__  utf8_to_uvchr(s, lp)                                             \
    ((*(s) == '\0')                                                             \
    ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */        \
    : utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp)))

#endif

/* Hint: utf8_to_uvchr
    Use utf8_to_uvchr_buf() instead.  But ONLY if you KNOW the upper bound
    of the input string (not resorting to using UTF8SKIP, etc., to infer it).
    The backported utf8_to_uvchr() will do a better job to prevent most cases
    of trying to read beyond the end of the buffer */

/* Replace utf8_to_uvchr with utf8_to_uvchr_buf */

#ifdef sv_len_utf8
#  if { VERSION >= 5.17.5 }
#    ifndef  sv_len_utf8_nomg
#      if defined(PERL_USE_GCC_BRACE_GROUPS)
#        define sv_len_utf8_nomg(sv)                                    \
            ({                                                          \
                SV *sv_ = (sv);                                         \
                sv_len_utf8(!SvGMAGICAL(sv_)                            \
                            ? sv_                                       \
                            : sv_mortalcopy_flags(sv_, SV_NOSTEAL));    \
            })
#      else
         PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv)
         {
             dTHX;
             if (SvGMAGICAL(sv))
                 return sv_len_utf8(sv_mortalcopy_flags(sv,
                                                        SV_NOSTEAL));
             else return sv_len_utf8(sv);
         }
#        define  sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv)
#      endif
#    endif
#  else /* < 5.17.5 */
    /* Older Perl versions have broken sv_len_utf8() when passed sv does not
     * have SVf_UTF8 flag set */
    /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */
#    undef sv_len_utf8
#    if defined(PERL_USE_GCC_BRACE_GROUPS)
#      define sv_len_utf8_nomg(sv)                                          \
        ({                                                                  \
            SV *sv2 = (sv);                                                 \
            STRLEN len;                                                     \
            if (SvUTF8(sv2)) {                                              \
                if (SvGMAGICAL(sv2))                                        \
                    len = Perl_sv_len_utf8(aTHX_                            \
                                           sv_mortalcopy_flags(sv2,         \
                                                               SV_NOSTEAL));\
                else                                                        \
                    len = Perl_sv_len_utf8(aTHX_ sv2);                      \
             }                                                              \
             else SvPV_nomg(sv2, len);                                      \
             len;                                                           \
       })
#      define sv_len_utf8(sv) ({ SV *_sv1 = (sv);                           \
                                 SvGETMAGIC(_sv1);                          \
                                 sv_len_utf8_nomg(_sv1);                    \
                              })
#    else   /* Below is no brace groups */
       PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv)
       {
          dTHX;
          STRLEN len;
          if (SvUTF8(sv)) {
              if (SvGMAGICAL(sv))
                  len = Perl_sv_len_utf8(aTHX_
                                         sv_mortalcopy_flags(sv,
                                                             SV_NOSTEAL));
              else
                  len = Perl_sv_len_utf8(aTHX_ sv);
          }
          else SvPV_nomg(sv, len);
          return len;
       }
#      define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv)

       PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8(SV * sv)
       {
          dTHX;
          SvGETMAGIC(sv);
          return sv_len_utf8_nomg(sv);
       }
#      define sv_len_utf8(sv) D_PPP_sv_len_utf8(sv)
#    endif
#  endif    /* End of < 5.17.5 */
#endif

=xsinit

#define NEED_utf8_to_uvchr_buf

=xsubs

#if defined(UTF8f) && defined(newSVpvf)

void
UTF8f(x)
        SV *x
        PREINIT:
                U32 u;
                STRLEN len;
                char *ptr;
        INIT:
                ptr = SvPV(x, len);
                u = SvUTF8(x);
        PPCODE:
                x = sv_2mortal(newSVpvf("[%" UTF8f "]", UTF8fARG(u, len, ptr)));
                XPUSHs(x);
                XSRETURN(1);

#endif

#if { VERSION >= 5.006 } /* This is just a helper fcn, not publicized */  \
                         /* as being available and params not what the  */  \
                         /* API function has; works on EBCDIC too */

SV *
uvchr_to_utf8(native)

    UV native
    PREINIT:
        int len;
        U8 string[UTF8_MAXBYTES+1];
        int i;
        UV uni;

    CODE:
	len = UVCHR_SKIP(native);

        for (i = 0; i < len; i++) {
            string[i] = '\0';
        }

        if (len <= 1) {
            string[0] = native;
        }
        else {
            i = len;
            uni = NATIVE_TO_UNI(native);
            while (i-- > 1) {
                string[i] = I8_TO_NATIVE_UTF8((uni & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
                uni >>= UTF_ACCUMULATION_SHIFT;
            }
            string[0] = I8_TO_NATIVE_UTF8((uni & UTF_START_MASK(len)) | UTF_START_MARK(len));
        }

        RETVAL = newSVpvn((char *) string, len);
        SvUTF8_on(RETVAL);
    OUTPUT:
        RETVAL

#endif
#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)

STRLEN
UTF8_SAFE_SKIP(s, adjustment)
        char * s
        int adjustment
        PREINIT:
            const char *const_s;
        CODE:
            const_s = s;
            /* Instead of passing in an 'e' ptr, use the real end, adjusted */
            RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
        OUTPUT:
            RETVAL

#endif

#ifdef isUTF8_CHAR

STRLEN
isUTF8_CHAR(s, adjustment)
        unsigned char * s
        int adjustment
        PREINIT:
            const unsigned char *const_s;
            const unsigned char *const_e;
        CODE:
            const_s = s;
            /* Instead of passing in an 'e' ptr, use the real end, adjusted */
            const_e = const_s + UTF8SKIP(const_s) + adjustment;
            RETVAL = isUTF8_CHAR(const_s, const_e);
        OUTPUT:
            RETVAL

#endif


#ifdef foldEQ_utf8

STRLEN
foldEQ_utf8(s1, l1, u1, s2, l2, u2)
        char *s1
        UV l1
        bool u1
        char *s2
        UV l2
        bool u2
        PREINIT:
            const char *const_s1;
            const char *const_s2;
        CODE:
            const_s1 = s1;
            const_s2 = s2;
            RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2);
        OUTPUT:
            RETVAL

#endif

#ifdef utf8_to_uvchr_buf

AV *
utf8_to_uvchr_buf(s, adjustment)
        unsigned char *s
        int adjustment
        PREINIT:
            AV *av;
            STRLEN len;
            const unsigned char *const_s;
        CODE:
            av = newAV();
            const_s = s;
            av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
                                                  s + UTF8SKIP(s) + adjustment,
                                                  &len)));
            if (len == (STRLEN) -1) {
                av_push(av, newSViv(-1));
            }
            else {
                av_push(av, newSVuv(len));
            }
            RETVAL = av;
        OUTPUT:
                RETVAL

#endif

#ifdef utf8_to_uvchr

AV *
utf8_to_uvchr(s)
        unsigned char *s
        PREINIT:
            AV *av;
            STRLEN len;
            const unsigned char *const_s;
        CODE:
            av = newAV();
            const_s = s;
            av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
            if (len == (STRLEN) -1) {
                av_push(av, newSViv(-1));
            }
            else {
                av_push(av, newSVuv(len));
            }
            RETVAL = av;
        OUTPUT:
                RETVAL

#endif

#ifdef sv_len_utf8

STRLEN
sv_len_utf8(sv)
        SV *sv
        CODE:
                RETVAL = sv_len_utf8(sv);
        OUTPUT:
                RETVAL

#endif

#ifdef sv_len_utf8_nomg

STRLEN
sv_len_utf8_nomg(sv)
        SV *sv
        CODE:
                RETVAL = sv_len_utf8_nomg(sv);
        OUTPUT:
                RETVAL

#endif

#ifdef UVCHR_IS_INVARIANT

bool
UVCHR_IS_INVARIANT(c)
        unsigned c
        PREINIT:
        CODE:
            RETVAL = UVCHR_IS_INVARIANT(c);
        OUTPUT:
            RETVAL

#endif

#ifdef UVCHR_SKIP

STRLEN
UVCHR_SKIP(c)
        UV c
        PREINIT:
        CODE:
            RETVAL = UVCHR_SKIP(c);
        OUTPUT:
            RETVAL

#endif

=tests plan => 98

BEGIN {
    # skip tests on 5.6.0 and earlier, plus 5.7.0
    if (ivers($]) <= ivers(5.6) || ivers($]) == ivers(5.7) ) {
        skip 'skip: broken utf8 support', 98;
        exit;
    }
    require warnings;
}

is(Devel::PPPort::UTF8f(42), '[42]');
is(Devel::PPPort::UTF8f('abc'), '[abc]');
is(Devel::PPPort::UTF8f("\x{263a}"), "[\x{263a}]");

my $str = "\x{A8}";
if (ivers($]) >= ivers(5.8)) { eval q{utf8::upgrade($str)} }
is(Devel::PPPort::UTF8f($str), "[\x{A8}]");
if (ivers($]) >= ivers(5.8)) { eval q{utf8::downgrade($str)} }
is(Devel::PPPort::UTF8f($str), "[\x{A8}]");

is(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
is(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);

is(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
is(&Devel::PPPort::isUTF8_CHAR("A",  0), 1);
is(&Devel::PPPort::isUTF8_CHAR("\x{100}",  -1), 0);
is(&Devel::PPPort::isUTF8_CHAR("\x{100}",  0), 2);

is(&Devel::PPPort::UVCHR_IS_INVARIANT(ord("A")), 1);
ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0xb6));
ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0x100));

is(&Devel::PPPort::UVCHR_SKIP(ord("A")), 1);
is(&Devel::PPPort::UVCHR_SKIP(0xb6),     2, "This is a test");
is(&Devel::PPPort::UVCHR_SKIP(0x3FF),    2);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFF),   3);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFFF),  4);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFF), 5);
is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFFF), ord("A") == 65 ? 5 : 6);
is(&Devel::PPPort::UVCHR_SKIP(0x4000000), ord("A") == 65 ? 6 : 7);
if (ord("A") != 65) {
    skip("Test not valid on EBCDIC", 1)
}
else {
    is(&Devel::PPPort::UVCHR_SKIP(0xFFFFFFFF), 7);
}

if (ivers($]) < ivers(5.8)) {
    skip("Perl version too early", 3);
}
else {
    is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1);
    is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0);
    is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0);
}

my $ret = &Devel::PPPort::utf8_to_uvchr("A");
is($ret->[0], ord("A"));
is($ret->[1], 1);

$ret = &Devel::PPPort::utf8_to_uvchr("\0");
is($ret->[0], 0);
is($ret->[1], 1);

$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
is($ret->[0], ord("A"));
is($ret->[1], 1);

$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
is($ret->[0], 0);
is($ret->[1], 1);

my @buf_tests = (
    {
        input      => "A",
        adjustment => -1,
        warning    => eval "qr/empty/",
        no_warnings_returned_length => 0,
    },
    {
        input      => "\xc4\xc5",
        adjustment => 0,
        warning    => eval "qr/non-continuation/",
        no_warnings_returned_length => 1,
    },
    {
        input      => "\xc4\x80",
        adjustment => -1,
        warning    => eval "qr/short|1 byte, need 2/",
        no_warnings_returned_length => 1,
    },
    {
        input      => "\xc0\x81",
        adjustment => 0,
        warning    => eval "qr/overlong|2 bytes, need 1/",
        no_warnings_returned_length => 2,
    },
    {
        input      => "\xe0\x80\x81",
        adjustment => 0,
        warning    => eval "qr/overlong|3 bytes, need 1/",
        no_warnings_returned_length => 3,
    },
    {
        input      => "\xf0\x80\x80\x81",
        adjustment => 0,
        warning    => eval "qr/overlong|4 bytes, need 1/",
        no_warnings_returned_length => 4,
    },
    {                 # Old algorithm failed to detect this
        input      => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
        adjustment => 0,
        warning    => eval "qr/overflow/",
        no_warnings_returned_length => 13,
    },
);

if (ord("A") != 65) {   # tests not valid for EBCDIC
    skip("Perl version too early", 2 + 4 + (scalar @buf_tests * 5));
}
else {
    $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
    is($ret->[0], 0x100);
    is($ret->[1], 2);

    my @warnings;
    local $SIG{__WARN__} = sub { push @warnings, @_; };

    {
        use warnings 'utf8';
        $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
        is($ret->[0], 0);
        is($ret->[1], -1);

        no warnings 'utf8';
        $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
        is($ret->[0], 0xFFFD);
        is($ret->[1], 1);
    }


    # An empty input is an assertion failure on debugging builds.  It is
    # deliberately the first test.
    require Config; Config->import;
    use vars '%Config';

    # VMS doesn't put DEBUGGING in ccflags, and Windows doesn't have
    # $Config{config_args}.  When 5.14 or later can be assumed, use
    # Config::non_bincompat_options(), but for now we're stuck with this.
    if (   $Config{ccflags} =~ /-DDEBUGGING/
        || $^O eq 'VMS' && $Config{config_args} =~ /\bDDEBUGGING\b/)
    {
        shift @buf_tests;
        skip("Test not valid on DEBUGGING builds", 5);
    }

    my $test;
    for $test (@buf_tests) {
        my $input = $test->{'input'};
        my $adjustment = $test->{'adjustment'};
        my $display = 'utf8_to_uvchr_buf("';
        my $i;
        for ($i = 0; $i < length($input) + $adjustment; $i++) {
            $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
        }

        $display .= '")';
        my $warning = $test->{'warning'};

        undef @warnings;
        use warnings 'utf8';
        $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
        is($ret->[0], 0,  "returned value $display; warnings enabled");
        is($ret->[1], -1, "returned length $display; warnings enabled");
        my $all_warnings = join "; ", @warnings;
        my $contains = grep { $_ =~ $warning } $all_warnings;
        is($contains, 1, $display
                    . "; Got: '$all_warnings', which should contain '$warning'");

        undef @warnings;
        no warnings 'utf8';
        $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
        is($ret->[0], 0xFFFD,  "returned value $display; warnings disabled");
        is($ret->[1], $test->{'no_warnings_returned_length'},
                      "returned length $display; warnings disabled");
    }
}

if (ivers($]) ge ivers(5.008)) {
    BEGIN { if (ivers($]) ge ivers(5.008)) { require utf8; "utf8"->import() } }

    is(Devel::PPPort::sv_len_utf8("a������"), 4);
    is(Devel::PPPort::sv_len_utf8_nomg("a������"), 4);

    my $str = "������";
    utf8::downgrade($str);
    is(Devel::PPPort::sv_len_utf8($str), 3);
    utf8::downgrade($str);
    is(Devel::PPPort::sv_len_utf8_nomg($str), 3);
    utf8::upgrade($str);
    is(Devel::PPPort::sv_len_utf8($str), 3);
    utf8::upgrade($str);
    is(Devel::PPPort::sv_len_utf8_nomg($str), 3);

    tie my $scalar, 'TieScalarCounter', "��";

    is(tied($scalar)->{fetch}, 0);
    is(tied($scalar)->{store}, 0);
    is(Devel::PPPort::sv_len_utf8($scalar), 2);
    is(tied($scalar)->{fetch}, 1);
    is(tied($scalar)->{store}, 0);
    is(Devel::PPPort::sv_len_utf8($scalar), 3);
    is(tied($scalar)->{fetch}, 2);
    is(tied($scalar)->{store}, 0);
    is(Devel::PPPort::sv_len_utf8($scalar), 4);
    is(tied($scalar)->{fetch}, 3);
    is(tied($scalar)->{store}, 0);
    is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
    is(tied($scalar)->{fetch}, 3);
    is(tied($scalar)->{store}, 0);
    is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
    is(tied($scalar)->{fetch}, 3);
    is(tied($scalar)->{store}, 0);
} else {
    skip 'skip: no utf8::downgrade/utf8::upgrade support', 23;
}

package TieScalarCounter;

sub TIESCALAR {
    my ($class, $value) = @_;
    return bless { fetch => 0, store => 0, value => $value }, $class;
}

sub FETCH {
    BEGIN { if (main::ivers($]) ge main::ivers(5.008)) { require utf8; "utf8"->import() } }
    my ($self) = @_;
    $self->{fetch}++;
    return $self->{value} .= "��";
}

sub STORE {
    my ($self, $value) = @_;
    $self->{store}++;
    $self->{value} = $value;
}