1=provides
2
3__UNDEFINED__
4SvUTF8
5UTF8f
6UTF8fARG
7utf8_to_uvchr_buf
8sv_len_utf8
9sv_len_utf8_nomg
10
11=implementation
12
13#ifdef SVf_UTF8
14__UNDEFINED__ SvUTF8(sv) (SvFLAGS(sv) & SVf_UTF8)
15#endif
16
17#if { VERSION == 5.19.1 } /* 5.19.1 does not have UTF8fARG, only broken UTF8f */
18#undef UTF8f
19#endif
20
21#ifdef SVf_UTF8
22__UNDEFINED__  UTF8f           SVf
23__UNDEFINED__  UTF8fARG(u,l,p) newSVpvn_flags((p), (l), ((u) ? SVf_UTF8 : 0) | SVs_TEMP)
24#endif
25
26#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
27
28__UNDEFINED__ UNICODE_REPLACEMENT  0xFFFD
29
30#ifdef UTF8_MAXLEN
31__UNDEFINED__  UTF8_MAXBYTES   UTF8_MAXLEN
32#endif
33
34__UNDEF_NOT_PROVIDED__ UTF_START_MARK(len)                                   \
35                    (((len) >  7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))
36
37/* On non-EBCDIC was valid for some releases earlier than this, but easier to
38 * just do one check */
39#if { VERSION < 5.018 }
40#  undef UTF8_MAXBYTES_CASE
41#endif
42
43#if 'A' == 65
44#  define D_PPP_BYTE_INFO_BITS 6  /* 6 bits meaningful in continuation bytes */
45__UNDEFINED__          UTF8_MAXBYTES_CASE 13
46#else
47#  define D_PPP_BYTE_INFO_BITS 5  /* 5 bits meaningful in continuation bytes */
48__UNDEFINED__          UTF8_MAXBYTES_CASE 15
49#endif
50
51__UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT D_PPP_BYTE_INFO_BITS
52
53#ifdef NATIVE_TO_UTF
54__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c)  NATIVE_TO_UTF(c)
55#else   /* System doesn't support EBCDIC */
56__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c)  (c)
57#endif
58
59#ifdef UTF_TO_NATIVE
60__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c)  UTF_TO_NATIVE(c)
61#else   /* System doesn't support EBCDIC */
62__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c)  (c)
63#endif
64
65__UNDEF_NOT_PROVIDED__ UTF_START_MASK(len)                                 \
66                                (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
67__UNDEF_NOT_PROVIDED__ UTF_IS_CONTINUATION_MASK                            \
68                                    ((U8) (0xFF << UTF_ACCUMULATION_SHIFT))
69__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MARK                               \
70                                          (UTF_IS_CONTINUATION_MASK & 0xB0)
71__UNDEF_NOT_PROVIDED__ UTF_MIN_START_BYTE                                  \
72    ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
73
74__UNDEF_NOT_PROVIDED__ UTF_MIN_ABOVE_LATIN1_BYTE                           \
75                    ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
76
77#if { VERSION < 5.007 }     /* Was the complement of what should have been */
78#  undef UTF8_IS_DOWNGRADEABLE_START
79#endif
80__UNDEF_NOT_PROVIDED__ UTF8_IS_DOWNGRADEABLE_START(c)                       \
81                inRANGE(NATIVE_UTF8_TO_I8(c),                               \
82                        UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1)
83__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MASK                                \
84                                ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
85
86__UNDEF_NOT_PROVIDED__ UTF8_ACCUMULATE(base, added)                         \
87                                  (((base) << UTF_ACCUMULATION_SHIFT)       \
88                                   | ((NATIVE_UTF8_TO_I8(added))            \
89                                       & UTF_CONTINUATION_MASK))
90
91__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANYUV                 0
92__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_EMPTY            0x0001
93__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_CONTINUATION     0x0002
94__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
95__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_SHORT            0x0008
96__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_LONG             0x0010
97__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_OVERFLOW         0x0080
98__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANY  ( UTF8_ALLOW_CONTINUATION      \
99                                        |UTF8_ALLOW_NON_CONTINUATION  \
100                                        |UTF8_ALLOW_SHORT             \
101                                        |UTF8_ALLOW_LONG              \
102                                        |UTF8_ALLOW_OVERFLOW)
103
104#if defined UTF8SKIP
105
106/* Don't use official versions because they use MIN, which may not be available */
107#undef UTF8_SAFE_SKIP
108#undef UTF8_CHK_SKIP
109
110__UNDEFINED__  UTF8_SAFE_SKIP(s, e)  (                                          \
111                                      ((((e) - (s)) <= 0)                       \
112                                      ? 0                                       \
113                                      : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
114
115__UNDEFINED__ UTF8_CHK_SKIP(s)                                                  \
116    (s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)),  \
117                                      UTF8SKIP(s))))
118/* UTF8_CHK_SKIP depends on my_strnlen */
119__UNDEFINED__ UTF8_SKIP(s)  UTF8SKIP(s)
120#endif
121
122#if 'A' == 65
123__UNDEFINED__ UTF8_IS_INVARIANT(c)   isASCII(c)
124#else
125__UNDEFINED__ UTF8_IS_INVARIANT(c)  (isASCII(c) || isCNTRL_L1(c))
126#endif
127
128__UNDEFINED__ UVCHR_IS_INVARIANT(c)  UTF8_IS_INVARIANT(c)
129
130#ifdef UVCHR_IS_INVARIANT
131#  if 'A' != 65 || UVSIZE < 8
132     /* 32 bit platform, which includes UTF-EBCDIC on the releases this is
133      * backported to */
134#    define D_PPP_UVCHR_SKIP_UPPER(c) 7
135#  else
136#    define D_PPP_UVCHR_SKIP_UPPER(c)                                       \
137        (((WIDEST_UTYPE) (c)) <                                             \
138         (((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13)
139#  endif
140
141__UNDEFINED__ UVCHR_SKIP(c)                                                     \
142          UVCHR_IS_INVARIANT(c)                                          ? 1 :  \
143          (WIDEST_UTYPE) (c) < (32 * (1U << (    D_PPP_BYTE_INFO_BITS))) ? 2 :  \
144          (WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 :  \
145          (WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 :  \
146          (WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 :  \
147          (WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 :  \
148          D_PPP_UVCHR_SKIP_UPPER(c)
149#endif
150
151#ifdef is_ascii_string
152__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
153__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)
154
155/* Hint: is_ascii_string, is_invariant_string
156   is_utf8_invariant_string() does the same thing and is preferred because its
157   name is more accurate as to what it does */
158#endif
159
160#ifdef ibcmp_utf8
161__UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2)                            \
162                                cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2))
163#endif
164
165#if defined(is_utf8_string) && defined(UTF8SKIP)
166__UNDEFINED__ isUTF8_CHAR(s, e)    (                                            \
167    (e) <= (s) || ! is_utf8_string(s, UTF8_SAFE_SKIP(s, e))                     \
168    ? 0                                                                         \
169    : UTF8SKIP(s))
170#endif
171
172#if 'A' == 65
173__UNDEFINED__ BOM_UTF8                    "\xEF\xBB\xBF"
174__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8  "\xEF\xBF\xBD"
175#elif '^' == 95
176__UNDEFINED__ BOM_UTF8                    "\xDD\x73\x66\x73"
177__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8  "\xDD\x73\x73\x71"
178#elif '^' == 176
179__UNDEFINED__ BOM_UTF8                    "\xDD\x72\x65\x72"
180__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8  "\xDD\x72\x72\x70"
181#else
182#  error Unknown character set
183#endif
184
185#if { VERSION < 5.35.10 }
186        /* Versions prior to 5.31.4 accepted things that are now considered
187         * malformations, and didn't return -1 on error with warnings enabled.
188         * Versions before 5.35.10 dereferenced empty input without checking */
189#  undef utf8_to_uvchr_buf
190#endif
191
192/* This implementation brings modern, generally more restricted standards to
193 * utf8_to_uvchr_buf.  Some of these are security related, and clearly must
194 * be done.  But its arguable that the others need not, and hence should not.
195 * The reason they're here is that a module that intends to play with the
196 * latest perls should be able to work the same in all releases.  An example is
197 * that perl no longer accepts any UV for a code point, but limits them to
198 * IV_MAX or below.  This is for future internal use of the larger code points.
199 * If it turns out that some of these changes are breaking code that isn't
200 * intended to work with modern perls, the tighter restrictions could be
201 * relaxed.  khw thinks this is unlikely, but has been wrong in the past. */
202
203/* 5.6.0 is the first release with UTF-8, and we don't implement this function
204 * there due to its likely lack of still being in use, and the underlying
205 * implementation is very different from later ones, without the later
206 * safeguards, so would require extra work to deal with */
207#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
208   /* Choose which underlying implementation to use.  At least one must be
209    * present or the perl is too early to handle this function */
210#  if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv)
211#    if defined(utf8n_to_uvchr)   /* This is the preferred implementation */
212#      define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
213#    elif /* Must be at least 5.6.1 from #if above;                             \
214             If have both regular and _simple, regular has all args */          \
215          defined(utf8_to_uv) && defined(utf8_to_uv_simple)
216#      define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv
217#    elif defined(utf8_to_uvchr)  /* The below won't work well on error input */
218#      define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags)          \
219                                            utf8_to_uvchr((U8 *)(s), (retlen))
220#    else
221#      define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags)          \
222                                            utf8_to_uv((U8 *)(s), (retlen))
223#    endif
224#  endif
225
226#  if { NEED utf8_to_uvchr_buf }
227
228UV
229utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
230{
231#    if { VERSION >= 5.31.4 }   /* But from above, must be < 5.35.10 */
232#      if { VERSION != 5.35.9 }
233
234    /* Versions less than 5.35.9 could dereference s on zero length, so
235     * pass it something where no harm comes from that. */
236    if (send <= s) s = send = (U8 *) "?";
237    return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen);
238
239#      else /* Below is 5.35.9, which also works on non-empty input, but
240               for empty input, can wrongly dereference, and additionally is
241               also just plain broken */
242    if (send > s) return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen);
243    if (! ckWARN_d(WARN_UTF8)) {
244        if (retlen) *retlen = 0;
245        return UNICODE_REPLACEMENT;
246    }
247    else {
248        s = send = (U8 *) "?";
249
250        /* Call just for its warning */
251        (void) Perl__utf8n_to_uvchr_msgs_helper(s, 0, NULL, 0, NULL, NULL);
252        if (retlen) *retlen = (STRLEN) -1;
253        return 0;
254    }
255
256#      endif
257#    else
258
259    UV ret;
260    STRLEN curlen;
261    bool overflows = 0;
262    const U8 *cur_s = s;
263    const bool do_warnings = ckWARN_d(WARN_UTF8);
264#    if { VERSION < 5.26.0 } && ! defined(EBCDIC)
265    STRLEN overflow_length = 0;
266#    endif
267
268    if (send > s) {
269        curlen = send - s;
270    }
271    else {
272        assert(0);  /* Modern perls die under this circumstance */
273        curlen = 0;
274        if (! do_warnings) {    /* Handle empty here if no warnings needed */
275            if (retlen) *retlen = 0;
276            return UNICODE_REPLACEMENT;
277        }
278    }
279
280#      if { VERSION < 5.26.0 } && ! defined(EBCDIC)
281
282    /* Perl did not properly detect overflow for much of its history on
283     * non-EBCDIC platforms, often returning an overlong value which may or may
284     * not have been tolerated in the call.  Also, earlier versions, when they
285     * did detect overflow, may have disallowed it completely.  Modern ones can
286     * replace it with the REPLACEMENT CHARACTER, depending on calling
287     * parameters.  Therefore detect it ourselves in  releases it was
288     * problematic in. */
289
290    if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
291
292        /* First, on a 32-bit machine the first byte being at least \xFE
293         * automatically is overflow, as it indicates something requiring more
294         * than 31 bits */
295        if (sizeof(ret) < 8) {
296            overflows = 1;
297            overflow_length = (*s == 0xFE) ? 7 : 13;
298        }
299        else {
300            const U8 highest[] =    /* 2*63-1 */
301                        "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
302            const U8 *cur_h = highest;
303
304            for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
305                if (UNLIKELY(*cur_s == *cur_h)) {
306                    continue;
307                }
308
309                /* If this byte is larger than the corresponding highest UTF-8
310                 * byte, the sequence overflows; otherwise the byte is less
311                 * than (as we handled the equality case above), and so the
312                 * sequence doesn't overflow */
313                overflows = *cur_s > *cur_h;
314                break;
315
316            }
317
318            /* Here, either we set the bool and broke out of the loop, or got
319             * to the end and all bytes are the same which indicates it doesn't
320             * overflow.  If it did overflow, it would be this number of bytes
321             * */
322            overflow_length = 13;
323        }
324    }
325
326    if (UNLIKELY(overflows)) {
327        ret = 0;
328
329        if (! do_warnings && retlen) {
330            *retlen = overflow_length;
331        }
332    }
333    else
334
335#      endif  /* < 5.26 */
336
337        /* Here, we are either in a release that properly detects overflow, or
338         * we have checked for overflow and the next statement is executing as
339         * part of the above conditional where we know we don't have overflow.
340         *
341         * The modern versions allow anything that evaluates to a legal UV, but
342         * not overlongs nor an empty input */
343        ret = D_PPP_utf8_to_uvchr_buf_callee(
344              (U8 *) /* Early perls: no const */
345                    s, curlen, retlen,   (UTF8_ALLOW_ANYUV
346                                      & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
347
348#      if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
349
350    /* But actually, more modern versions restrict the UV to being no more than
351     * what an IV can hold, so it could still have gotten it wrong about
352     * overflowing. */
353    if (UNLIKELY(ret > IV_MAX)) {
354        overflows = 1;
355    }
356
357#      endif
358
359    if (UNLIKELY(overflows)) {
360        if (! do_warnings) {
361            if (retlen) {
362                *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
363                *retlen = D_PPP_MIN(*retlen, curlen);
364            }
365            return UNICODE_REPLACEMENT;
366        }
367        else {
368
369            /* We use the error message in use from 5.8-5.26 */
370            Perl_warner(aTHX_ packWARN(WARN_UTF8),
371                "Malformed UTF-8 character (overflow at 0x%" UVxf
372                ", byte 0x%02x, after start byte 0x%02x)",
373                ret, *cur_s, *s);
374            if (retlen) {
375                *retlen = (STRLEN) -1;
376            }
377            return 0;
378        }
379    }
380
381    /* Here, did not overflow, but if it failed for some other reason, and
382     * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
383     * try again, allowing anything.  (Note a return of 0 is ok if the input
384     * was '\0') */
385    if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
386
387        /* If curlen is 0, we already handled the case where warnings are
388         * disabled, so this 'if' will be true, and so later on, we know that
389         * 's' is dereferencible */
390        if (do_warnings) {
391            if (retlen) {
392                *retlen = (STRLEN) -1;
393            }
394        }
395        else {
396            ret = D_PPP_utf8_to_uvchr_buf_callee(
397                                     (U8 *) /* Early perls: no const */
398                                            s, curlen, retlen, UTF8_ALLOW_ANY);
399            /* Override with the REPLACEMENT character, as that is what the
400             * modern version of this function returns */
401            ret = UNICODE_REPLACEMENT;
402
403#      if { VERSION < 5.16.0 }
404
405            /* Versions earlier than this don't necessarily return the proper
406             * length.  It should not extend past the end of string, nor past
407             * what the first byte indicates the length is, nor past the
408             * continuation characters */
409            if (retlen && (IV) *retlen >= 0) {
410                unsigned int i = 1;
411
412                *retlen = D_PPP_MIN(*retlen, curlen);
413                *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
414                do {
415#        ifdef UTF8_IS_CONTINUATION
416                    if (! UTF8_IS_CONTINUATION(s[i]))
417#        else       /* Versions without the above don't support EBCDIC anyway */
418                    if (s[i] < 0x80 || s[i] > 0xBF)
419#        endif
420                    {
421                        *retlen = i;
422                        break;
423                    }
424                } while (++i < *retlen);
425            }
426
427#      endif  /* end of < 5.16.0 */
428
429        }
430    }
431
432    return ret;
433
434#    endif    /* end of < 5.31.4 */
435
436}
437
438#  endif
439#endif
440
441#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
442#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
443                        to read past a NUL, making it much less likely to read
444                        off the end of the buffer.  A NUL indicates the start
445                        of the next character anyway.  If the input isn't
446                        NUL-terminated, the function remains unsafe, as it
447                        always has been. */
448
449__UNDEFINED__  utf8_to_uvchr(s, lp)                                             \
450    ((*(s) == '\0')                                                             \
451    ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */        \
452    : utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp)))
453
454#endif
455
456/* Hint: utf8_to_uvchr
457    Use utf8_to_uvchr_buf() instead.  But ONLY if you KNOW the upper bound
458    of the input string (not resorting to using UTF8SKIP, etc., to infer it).
459    The backported utf8_to_uvchr() will do a better job to prevent most cases
460    of trying to read beyond the end of the buffer */
461
462/* Replace utf8_to_uvchr with utf8_to_uvchr_buf */
463
464#ifdef sv_len_utf8
465#  if { VERSION >= 5.17.5 }
466#    ifndef  sv_len_utf8_nomg
467#      if defined(PERL_USE_GCC_BRACE_GROUPS)
468#        define sv_len_utf8_nomg(sv)                                    \
469            ({                                                          \
470                SV *sv_ = (sv);                                         \
471                sv_len_utf8(!SvGMAGICAL(sv_)                            \
472                            ? sv_                                       \
473                            : sv_mortalcopy_flags(sv_, SV_NOSTEAL));    \
474            })
475#      else
476         PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv)
477         {
478             dTHX;
479             if (SvGMAGICAL(sv))
480                 return sv_len_utf8(sv_mortalcopy_flags(sv,
481                                                        SV_NOSTEAL));
482             else return sv_len_utf8(sv);
483         }
484#        define  sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv)
485#      endif
486#    endif
487#  else /* < 5.17.5 */
488    /* Older Perl versions have broken sv_len_utf8() when passed sv does not
489     * have SVf_UTF8 flag set */
490    /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */
491#    undef sv_len_utf8
492#    if defined(PERL_USE_GCC_BRACE_GROUPS)
493#      define sv_len_utf8_nomg(sv)                                          \
494        ({                                                                  \
495            SV *sv2 = (sv);                                                 \
496            STRLEN len;                                                     \
497            if (SvUTF8(sv2)) {                                              \
498                if (SvGMAGICAL(sv2))                                        \
499                    len = Perl_sv_len_utf8(aTHX_                            \
500                                           sv_mortalcopy_flags(sv2,         \
501                                                               SV_NOSTEAL));\
502                else                                                        \
503                    len = Perl_sv_len_utf8(aTHX_ sv2);                      \
504             }                                                              \
505             else SvPV_nomg(sv2, len);                                      \
506             len;                                                           \
507       })
508#      define sv_len_utf8(sv) ({ SV *_sv1 = (sv);                           \
509                                 SvGETMAGIC(_sv1);                          \
510                                 sv_len_utf8_nomg(_sv1);                    \
511                              })
512#    else   /* Below is no brace groups */
513       PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv)
514       {
515          dTHX;
516          STRLEN len;
517          if (SvUTF8(sv)) {
518              if (SvGMAGICAL(sv))
519                  len = Perl_sv_len_utf8(aTHX_
520                                         sv_mortalcopy_flags(sv,
521                                                             SV_NOSTEAL));
522              else
523                  len = Perl_sv_len_utf8(aTHX_ sv);
524          }
525          else SvPV_nomg(sv, len);
526          return len;
527       }
528#      define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv)
529
530       PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8(SV * sv)
531       {
532          dTHX;
533          SvGETMAGIC(sv);
534          return sv_len_utf8_nomg(sv);
535       }
536#      define sv_len_utf8(sv) D_PPP_sv_len_utf8(sv)
537#    endif
538#  endif    /* End of < 5.17.5 */
539#endif
540
541=xsinit
542
543#define NEED_utf8_to_uvchr_buf
544
545=xsubs
546
547#if defined(UTF8f) && defined(newSVpvf)
548
549void
550UTF8f(x)
551        SV *x
552        PREINIT:
553                U32 u;
554                STRLEN len;
555                char *ptr;
556        INIT:
557                ptr = SvPV(x, len);
558                u = SvUTF8(x);
559        PPCODE:
560                x = sv_2mortal(newSVpvf("[%" UTF8f "]", UTF8fARG(u, len, ptr)));
561                XPUSHs(x);
562                XSRETURN(1);
563
564#endif
565
566#if { VERSION >= 5.006 } /* This is just a helper fcn, not publicized */  \
567                         /* as being available and params not what the  */  \
568                         /* API function has; works on EBCDIC too */
569
570SV *
571uvchr_to_utf8(native)
572
573    UV native
574    PREINIT:
575        int len;
576        U8 string[UTF8_MAXBYTES+1];
577        int i;
578        UV uni;
579
580    CODE:
581	len = UVCHR_SKIP(native);
582
583        for (i = 0; i < len; i++) {
584            string[i] = '\0';
585        }
586
587        if (len <= 1) {
588            string[0] = native;
589        }
590        else {
591            i = len;
592            uni = NATIVE_TO_UNI(native);
593            while (i-- > 1) {
594                string[i] = I8_TO_NATIVE_UTF8((uni & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
595                uni >>= UTF_ACCUMULATION_SHIFT;
596            }
597            string[0] = I8_TO_NATIVE_UTF8((uni & UTF_START_MASK(len)) | UTF_START_MARK(len));
598        }
599
600        RETVAL = newSVpvn((char *) string, len);
601        SvUTF8_on(RETVAL);
602    OUTPUT:
603        RETVAL
604
605#endif
606#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
607
608STRLEN
609UTF8_SAFE_SKIP(s, adjustment)
610        char * s
611        int adjustment
612        PREINIT:
613            const char *const_s;
614        CODE:
615            const_s = s;
616            /* Instead of passing in an 'e' ptr, use the real end, adjusted */
617            RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
618        OUTPUT:
619            RETVAL
620
621#endif
622
623#ifdef isUTF8_CHAR
624
625STRLEN
626isUTF8_CHAR(s, adjustment)
627        unsigned char * s
628        int adjustment
629        PREINIT:
630            const unsigned char *const_s;
631            const unsigned char *const_e;
632        CODE:
633            const_s = s;
634            /* Instead of passing in an 'e' ptr, use the real end, adjusted */
635            const_e = const_s + UTF8SKIP(const_s) + adjustment;
636            RETVAL = isUTF8_CHAR(const_s, const_e);
637        OUTPUT:
638            RETVAL
639
640#endif
641
642
643#ifdef foldEQ_utf8
644
645STRLEN
646foldEQ_utf8(s1, l1, u1, s2, l2, u2)
647        char *s1
648        UV l1
649        bool u1
650        char *s2
651        UV l2
652        bool u2
653        PREINIT:
654            const char *const_s1;
655            const char *const_s2;
656        CODE:
657            const_s1 = s1;
658            const_s2 = s2;
659            RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2);
660        OUTPUT:
661            RETVAL
662
663#endif
664
665#ifdef utf8_to_uvchr_buf
666
667AV *
668utf8_to_uvchr_buf(s, adjustment)
669        unsigned char *s
670        int adjustment
671        PREINIT:
672            AV *av;
673            STRLEN len;
674            const unsigned char *const_s;
675        CODE:
676            av = newAV();
677            const_s = s;
678            av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
679                                                  s + UTF8SKIP(s) + adjustment,
680                                                  &len)));
681            if (len == (STRLEN) -1) {
682                av_push(av, newSViv(-1));
683            }
684            else {
685                av_push(av, newSVuv(len));
686            }
687            RETVAL = av;
688        OUTPUT:
689                RETVAL
690
691#endif
692
693#ifdef utf8_to_uvchr
694
695AV *
696utf8_to_uvchr(s)
697        unsigned char *s
698        PREINIT:
699            AV *av;
700            STRLEN len;
701            const unsigned char *const_s;
702        CODE:
703            av = newAV();
704            const_s = s;
705            av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
706            if (len == (STRLEN) -1) {
707                av_push(av, newSViv(-1));
708            }
709            else {
710                av_push(av, newSVuv(len));
711            }
712            RETVAL = av;
713        OUTPUT:
714                RETVAL
715
716#endif
717
718#ifdef sv_len_utf8
719
720STRLEN
721sv_len_utf8(sv)
722        SV *sv
723        CODE:
724                RETVAL = sv_len_utf8(sv);
725        OUTPUT:
726                RETVAL
727
728#endif
729
730#ifdef sv_len_utf8_nomg
731
732STRLEN
733sv_len_utf8_nomg(sv)
734        SV *sv
735        CODE:
736                RETVAL = sv_len_utf8_nomg(sv);
737        OUTPUT:
738                RETVAL
739
740#endif
741
742#ifdef UVCHR_IS_INVARIANT
743
744bool
745UVCHR_IS_INVARIANT(c)
746        unsigned c
747        PREINIT:
748        CODE:
749            RETVAL = UVCHR_IS_INVARIANT(c);
750        OUTPUT:
751            RETVAL
752
753#endif
754
755#ifdef UVCHR_SKIP
756
757STRLEN
758UVCHR_SKIP(c)
759        UV c
760        PREINIT:
761        CODE:
762            RETVAL = UVCHR_SKIP(c);
763        OUTPUT:
764            RETVAL
765
766#endif
767
768=tests plan => 98
769
770BEGIN {
771    # skip tests on 5.6.0 and earlier, plus 5.7.0
772    if (ivers($]) <= ivers(5.6) || ivers($]) == ivers(5.7) ) {
773        skip 'skip: broken utf8 support', 98;
774        exit;
775    }
776    require warnings;
777}
778
779is(Devel::PPPort::UTF8f(42), '[42]');
780is(Devel::PPPort::UTF8f('abc'), '[abc]');
781is(Devel::PPPort::UTF8f("\x{263a}"), "[\x{263a}]");
782
783my $str = "\x{A8}";
784if (ivers($]) >= ivers(5.8)) { eval q{utf8::upgrade($str)} }
785is(Devel::PPPort::UTF8f($str), "[\x{A8}]");
786if (ivers($]) >= ivers(5.8)) { eval q{utf8::downgrade($str)} }
787is(Devel::PPPort::UTF8f($str), "[\x{A8}]");
788
789is(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
790is(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
791
792is(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
793is(&Devel::PPPort::isUTF8_CHAR("A",  0), 1);
794is(&Devel::PPPort::isUTF8_CHAR("\x{100}",  -1), 0);
795is(&Devel::PPPort::isUTF8_CHAR("\x{100}",  0), 2);
796
797is(&Devel::PPPort::UVCHR_IS_INVARIANT(ord("A")), 1);
798ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0xb6));
799ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0x100));
800
801is(&Devel::PPPort::UVCHR_SKIP(ord("A")), 1);
802is(&Devel::PPPort::UVCHR_SKIP(0xb6),     2, "This is a test");
803is(&Devel::PPPort::UVCHR_SKIP(0x3FF),    2);
804is(&Devel::PPPort::UVCHR_SKIP(0x3FFF),   3);
805is(&Devel::PPPort::UVCHR_SKIP(0x3FFFF),  4);
806is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFF), 5);
807is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFFF), ord("A") == 65 ? 5 : 6);
808is(&Devel::PPPort::UVCHR_SKIP(0x4000000), ord("A") == 65 ? 6 : 7);
809if (ord("A") != 65) {
810    skip("Test not valid on EBCDIC", 1)
811}
812else {
813    is(&Devel::PPPort::UVCHR_SKIP(0xFFFFFFFF), 7);
814}
815
816if (ivers($]) < ivers(5.8)) {
817    skip("Perl version too early", 3);
818}
819else {
820    is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1);
821    is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0);
822    is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0);
823}
824
825my $ret = &Devel::PPPort::utf8_to_uvchr("A");
826is($ret->[0], ord("A"));
827is($ret->[1], 1);
828
829$ret = &Devel::PPPort::utf8_to_uvchr("\0");
830is($ret->[0], 0);
831is($ret->[1], 1);
832
833$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
834is($ret->[0], ord("A"));
835is($ret->[1], 1);
836
837$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
838is($ret->[0], 0);
839is($ret->[1], 1);
840
841my @buf_tests = (
842    {
843        input      => "A",
844        adjustment => -1,
845        warning    => eval "qr/empty/",
846        no_warnings_returned_length => 0,
847    },
848    {
849        input      => "\xc4\xc5",
850        adjustment => 0,
851        warning    => eval "qr/non-continuation/",
852        no_warnings_returned_length => 1,
853    },
854    {
855        input      => "\xc4\x80",
856        adjustment => -1,
857        warning    => eval "qr/short|1 byte, need 2/",
858        no_warnings_returned_length => 1,
859    },
860    {
861        input      => "\xc0\x81",
862        adjustment => 0,
863        warning    => eval "qr/overlong|2 bytes, need 1/",
864        no_warnings_returned_length => 2,
865    },
866    {
867        input      => "\xe0\x80\x81",
868        adjustment => 0,
869        warning    => eval "qr/overlong|3 bytes, need 1/",
870        no_warnings_returned_length => 3,
871    },
872    {
873        input      => "\xf0\x80\x80\x81",
874        adjustment => 0,
875        warning    => eval "qr/overlong|4 bytes, need 1/",
876        no_warnings_returned_length => 4,
877    },
878    {                 # Old algorithm failed to detect this
879        input      => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
880        adjustment => 0,
881        warning    => eval "qr/overflow/",
882        no_warnings_returned_length => 13,
883    },
884);
885
886if (ord("A") != 65) {   # tests not valid for EBCDIC
887    skip("Perl version too early", 2 + 4 + (scalar @buf_tests * 5));
888}
889else {
890    $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
891    is($ret->[0], 0x100);
892    is($ret->[1], 2);
893
894    my @warnings;
895    local $SIG{__WARN__} = sub { push @warnings, @_; };
896
897    {
898        use warnings 'utf8';
899        $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
900        is($ret->[0], 0);
901        is($ret->[1], -1);
902
903        no warnings 'utf8';
904        $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
905        is($ret->[0], 0xFFFD);
906        is($ret->[1], 1);
907    }
908
909
910    # An empty input is an assertion failure on debugging builds.  It is
911    # deliberately the first test.
912    require Config; Config->import;
913    use vars '%Config';
914
915    # VMS doesn't put DEBUGGING in ccflags, and Windows doesn't have
916    # $Config{config_args}.  When 5.14 or later can be assumed, use
917    # Config::non_bincompat_options(), but for now we're stuck with this.
918    if (   $Config{ccflags} =~ /-DDEBUGGING/
919        || $^O eq 'VMS' && $Config{config_args} =~ /\bDDEBUGGING\b/)
920    {
921        shift @buf_tests;
922        skip("Test not valid on DEBUGGING builds", 5);
923    }
924
925    my $test;
926    for $test (@buf_tests) {
927        my $input = $test->{'input'};
928        my $adjustment = $test->{'adjustment'};
929        my $display = 'utf8_to_uvchr_buf("';
930        my $i;
931        for ($i = 0; $i < length($input) + $adjustment; $i++) {
932            $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
933        }
934
935        $display .= '")';
936        my $warning = $test->{'warning'};
937
938        undef @warnings;
939        use warnings 'utf8';
940        $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
941        is($ret->[0], 0,  "returned value $display; warnings enabled");
942        is($ret->[1], -1, "returned length $display; warnings enabled");
943        my $all_warnings = join "; ", @warnings;
944        my $contains = grep { $_ =~ $warning } $all_warnings;
945        is($contains, 1, $display
946                    . "; Got: '$all_warnings', which should contain '$warning'");
947
948        undef @warnings;
949        no warnings 'utf8';
950        $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
951        is($ret->[0], 0xFFFD,  "returned value $display; warnings disabled");
952        is($ret->[1], $test->{'no_warnings_returned_length'},
953                      "returned length $display; warnings disabled");
954    }
955}
956
957if (ivers($]) ge ivers(5.008)) {
958    BEGIN { if (ivers($]) ge ivers(5.008)) { require utf8; "utf8"->import() } }
959
960    is(Devel::PPPort::sv_len_utf8("a������"), 4);
961    is(Devel::PPPort::sv_len_utf8_nomg("a������"), 4);
962
963    my $str = "������";
964    utf8::downgrade($str);
965    is(Devel::PPPort::sv_len_utf8($str), 3);
966    utf8::downgrade($str);
967    is(Devel::PPPort::sv_len_utf8_nomg($str), 3);
968    utf8::upgrade($str);
969    is(Devel::PPPort::sv_len_utf8($str), 3);
970    utf8::upgrade($str);
971    is(Devel::PPPort::sv_len_utf8_nomg($str), 3);
972
973    tie my $scalar, 'TieScalarCounter', "��";
974
975    is(tied($scalar)->{fetch}, 0);
976    is(tied($scalar)->{store}, 0);
977    is(Devel::PPPort::sv_len_utf8($scalar), 2);
978    is(tied($scalar)->{fetch}, 1);
979    is(tied($scalar)->{store}, 0);
980    is(Devel::PPPort::sv_len_utf8($scalar), 3);
981    is(tied($scalar)->{fetch}, 2);
982    is(tied($scalar)->{store}, 0);
983    is(Devel::PPPort::sv_len_utf8($scalar), 4);
984    is(tied($scalar)->{fetch}, 3);
985    is(tied($scalar)->{store}, 0);
986    is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
987    is(tied($scalar)->{fetch}, 3);
988    is(tied($scalar)->{store}, 0);
989    is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
990    is(tied($scalar)->{fetch}, 3);
991    is(tied($scalar)->{store}, 0);
992} else {
993    skip 'skip: no utf8::downgrade/utf8::upgrade support', 23;
994}
995
996package TieScalarCounter;
997
998sub TIESCALAR {
999    my ($class, $value) = @_;
1000    return bless { fetch => 0, store => 0, value => $value }, $class;
1001}
1002
1003sub FETCH {
1004    BEGIN { if (main::ivers($]) ge main::ivers(5.008)) { require utf8; "utf8"->import() } }
1005    my ($self) = @_;
1006    $self->{fetch}++;
1007    return $self->{value} .= "��";
1008}
1009
1010sub STORE {
1011    my ($self, $value) = @_;
1012    $self->{store}++;
1013    $self->{value} = $value;
1014}
1015