1=provides 2 3__UNDEFINED__ 4SvUTF8 5UTF8f 6UTF8fARG 7utf8_to_uvchr_buf 8sv_len_utf8 9sv_len_utf8_nomg 10 11=implementation 12 13#ifdef SVf_UTF8 14__UNDEFINED__ SvUTF8(sv) (SvFLAGS(sv) & SVf_UTF8) 15#endif 16 17#if { VERSION == 5.19.1 } /* 5.19.1 does not have UTF8fARG, only broken UTF8f */ 18#undef UTF8f 19#endif 20 21#ifdef SVf_UTF8 22__UNDEFINED__ UTF8f SVf 23__UNDEFINED__ UTF8fARG(u,l,p) newSVpvn_flags((p), (l), ((u) ? SVf_UTF8 : 0) | SVs_TEMP) 24#endif 25 26#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b)) 27 28__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD 29 30#ifdef UTF8_MAXLEN 31__UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN 32#endif 33 34__UNDEF_NOT_PROVIDED__ UTF_START_MARK(len) \ 35 (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len))))) 36 37/* On non-EBCDIC was valid for some releases earlier than this, but easier to 38 * just do one check */ 39#if { VERSION < 5.018 } 40# undef UTF8_MAXBYTES_CASE 41#endif 42 43#if 'A' == 65 44# define D_PPP_BYTE_INFO_BITS 6 /* 6 bits meaningful in continuation bytes */ 45__UNDEFINED__ UTF8_MAXBYTES_CASE 13 46#else 47# define D_PPP_BYTE_INFO_BITS 5 /* 5 bits meaningful in continuation bytes */ 48__UNDEFINED__ UTF8_MAXBYTES_CASE 15 49#endif 50 51__UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT D_PPP_BYTE_INFO_BITS 52 53#ifdef NATIVE_TO_UTF 54__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) NATIVE_TO_UTF(c) 55#else /* System doesn't support EBCDIC */ 56__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) (c) 57#endif 58 59#ifdef UTF_TO_NATIVE 60__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) UTF_TO_NATIVE(c) 61#else /* System doesn't support EBCDIC */ 62__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) (c) 63#endif 64 65__UNDEF_NOT_PROVIDED__ UTF_START_MASK(len) \ 66 (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2))) 67__UNDEF_NOT_PROVIDED__ UTF_IS_CONTINUATION_MASK \ 68 ((U8) (0xFF << UTF_ACCUMULATION_SHIFT)) 69__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MARK \ 70 (UTF_IS_CONTINUATION_MASK & 0xB0) 71__UNDEF_NOT_PROVIDED__ UTF_MIN_START_BYTE \ 72 ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) 73 74__UNDEF_NOT_PROVIDED__ UTF_MIN_ABOVE_LATIN1_BYTE \ 75 ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) 76 77#if { VERSION < 5.007 } /* Was the complement of what should have been */ 78# undef UTF8_IS_DOWNGRADEABLE_START 79#endif 80__UNDEF_NOT_PROVIDED__ UTF8_IS_DOWNGRADEABLE_START(c) \ 81 inRANGE(NATIVE_UTF8_TO_I8(c), \ 82 UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1) 83__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MASK \ 84 ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1)) 85 86__UNDEF_NOT_PROVIDED__ UTF8_ACCUMULATE(base, added) \ 87 (((base) << UTF_ACCUMULATION_SHIFT) \ 88 | ((NATIVE_UTF8_TO_I8(added)) \ 89 & UTF_CONTINUATION_MASK)) 90 91__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANYUV 0 92__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_EMPTY 0x0001 93__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_CONTINUATION 0x0002 94__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_NON_CONTINUATION 0x0004 95__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_SHORT 0x0008 96__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_LONG 0x0010 97__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_OVERFLOW 0x0080 98__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \ 99 |UTF8_ALLOW_NON_CONTINUATION \ 100 |UTF8_ALLOW_SHORT \ 101 |UTF8_ALLOW_LONG \ 102 |UTF8_ALLOW_OVERFLOW) 103 104#if defined UTF8SKIP 105 106/* Don't use official versions because they use MIN, which may not be available */ 107#undef UTF8_SAFE_SKIP 108#undef UTF8_CHK_SKIP 109 110__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ 111 ((((e) - (s)) <= 0) \ 112 ? 0 \ 113 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s)))) 114 115__UNDEFINED__ UTF8_CHK_SKIP(s) \ 116 (s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)), \ 117 UTF8SKIP(s)))) 118/* UTF8_CHK_SKIP depends on my_strnlen */ 119__UNDEFINED__ UTF8_SKIP(s) UTF8SKIP(s) 120#endif 121 122#if 'A' == 65 123__UNDEFINED__ UTF8_IS_INVARIANT(c) isASCII(c) 124#else 125__UNDEFINED__ UTF8_IS_INVARIANT(c) (isASCII(c) || isCNTRL_L1(c)) 126#endif 127 128__UNDEFINED__ UVCHR_IS_INVARIANT(c) UTF8_IS_INVARIANT(c) 129 130#ifdef UVCHR_IS_INVARIANT 131# if 'A' != 65 || UVSIZE < 8 132 /* 32 bit platform, which includes UTF-EBCDIC on the releases this is 133 * backported to */ 134# define D_PPP_UVCHR_SKIP_UPPER(c) 7 135# else 136# define D_PPP_UVCHR_SKIP_UPPER(c) \ 137 (((WIDEST_UTYPE) (c)) < \ 138 (((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13) 139# endif 140 141__UNDEFINED__ UVCHR_SKIP(c) \ 142 UVCHR_IS_INVARIANT(c) ? 1 : \ 143 (WIDEST_UTYPE) (c) < (32 * (1U << ( D_PPP_BYTE_INFO_BITS))) ? 2 : \ 144 (WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 : \ 145 (WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 : \ 146 (WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 : \ 147 (WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 : \ 148 D_PPP_UVCHR_SKIP_UPPER(c) 149#endif 150 151#ifdef is_ascii_string 152__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l) 153__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l) 154 155/* Hint: is_ascii_string, is_invariant_string 156 is_utf8_invariant_string() does the same thing and is preferred because its 157 name is more accurate as to what it does */ 158#endif 159 160#ifdef ibcmp_utf8 161__UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \ 162 cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2)) 163#endif 164 165#if defined(is_utf8_string) && defined(UTF8SKIP) 166__UNDEFINED__ isUTF8_CHAR(s, e) ( \ 167 (e) <= (s) || ! is_utf8_string(s, UTF8_SAFE_SKIP(s, e)) \ 168 ? 0 \ 169 : UTF8SKIP(s)) 170#endif 171 172#if 'A' == 65 173__UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF" 174__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD" 175#elif '^' == 95 176__UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73" 177__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71" 178#elif '^' == 176 179__UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72" 180__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70" 181#else 182# error Unknown character set 183#endif 184 185#if { VERSION < 5.35.10 } 186 /* Versions prior to 5.31.4 accepted things that are now considered 187 * malformations, and didn't return -1 on error with warnings enabled. 188 * Versions before 5.35.10 dereferenced empty input without checking */ 189# undef utf8_to_uvchr_buf 190#endif 191 192/* This implementation brings modern, generally more restricted standards to 193 * utf8_to_uvchr_buf. Some of these are security related, and clearly must 194 * be done. But its arguable that the others need not, and hence should not. 195 * The reason they're here is that a module that intends to play with the 196 * latest perls should be able to work the same in all releases. An example is 197 * that perl no longer accepts any UV for a code point, but limits them to 198 * IV_MAX or below. This is for future internal use of the larger code points. 199 * If it turns out that some of these changes are breaking code that isn't 200 * intended to work with modern perls, the tighter restrictions could be 201 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ 202 203/* 5.6.0 is the first release with UTF-8, and we don't implement this function 204 * there due to its likely lack of still being in use, and the underlying 205 * implementation is very different from later ones, without the later 206 * safeguards, so would require extra work to deal with */ 207#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf) 208 /* Choose which underlying implementation to use. At least one must be 209 * present or the perl is too early to handle this function */ 210# if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv) 211# if defined(utf8n_to_uvchr) /* This is the preferred implementation */ 212# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr 213# elif /* Must be at least 5.6.1 from #if above; \ 214 If have both regular and _simple, regular has all args */ \ 215 defined(utf8_to_uv) && defined(utf8_to_uv_simple) 216# define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv 217# elif defined(utf8_to_uvchr) /* The below won't work well on error input */ 218# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ 219 utf8_to_uvchr((U8 *)(s), (retlen)) 220# else 221# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ 222 utf8_to_uv((U8 *)(s), (retlen)) 223# endif 224# endif 225 226# if { NEED utf8_to_uvchr_buf } 227 228UV 229utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 230{ 231# if { VERSION >= 5.31.4 } /* But from above, must be < 5.35.10 */ 232# if { VERSION != 5.35.9 } 233 234 /* Versions less than 5.35.9 could dereference s on zero length, so 235 * pass it something where no harm comes from that. */ 236 if (send <= s) s = send = (U8 *) "?"; 237 return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen); 238 239# else /* Below is 5.35.9, which also works on non-empty input, but 240 for empty input, can wrongly dereference, and additionally is 241 also just plain broken */ 242 if (send > s) return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen); 243 if (! ckWARN_d(WARN_UTF8)) { 244 if (retlen) *retlen = 0; 245 return UNICODE_REPLACEMENT; 246 } 247 else { 248 s = send = (U8 *) "?"; 249 250 /* Call just for its warning */ 251 (void) Perl__utf8n_to_uvchr_msgs_helper(s, 0, NULL, 0, NULL, NULL); 252 if (retlen) *retlen = (STRLEN) -1; 253 return 0; 254 } 255 256# endif 257# else 258 259 UV ret; 260 STRLEN curlen; 261 bool overflows = 0; 262 const U8 *cur_s = s; 263 const bool do_warnings = ckWARN_d(WARN_UTF8); 264# if { VERSION < 5.26.0 } && ! defined(EBCDIC) 265 STRLEN overflow_length = 0; 266# endif 267 268 if (send > s) { 269 curlen = send - s; 270 } 271 else { 272 assert(0); /* Modern perls die under this circumstance */ 273 curlen = 0; 274 if (! do_warnings) { /* Handle empty here if no warnings needed */ 275 if (retlen) *retlen = 0; 276 return UNICODE_REPLACEMENT; 277 } 278 } 279 280# if { VERSION < 5.26.0 } && ! defined(EBCDIC) 281 282 /* Perl did not properly detect overflow for much of its history on 283 * non-EBCDIC platforms, often returning an overlong value which may or may 284 * not have been tolerated in the call. Also, earlier versions, when they 285 * did detect overflow, may have disallowed it completely. Modern ones can 286 * replace it with the REPLACEMENT CHARACTER, depending on calling 287 * parameters. Therefore detect it ourselves in releases it was 288 * problematic in. */ 289 290 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) { 291 292 /* First, on a 32-bit machine the first byte being at least \xFE 293 * automatically is overflow, as it indicates something requiring more 294 * than 31 bits */ 295 if (sizeof(ret) < 8) { 296 overflows = 1; 297 overflow_length = (*s == 0xFE) ? 7 : 13; 298 } 299 else { 300 const U8 highest[] = /* 2*63-1 */ 301 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; 302 const U8 *cur_h = highest; 303 304 for (cur_s = s; cur_s < send; cur_s++, cur_h++) { 305 if (UNLIKELY(*cur_s == *cur_h)) { 306 continue; 307 } 308 309 /* If this byte is larger than the corresponding highest UTF-8 310 * byte, the sequence overflows; otherwise the byte is less 311 * than (as we handled the equality case above), and so the 312 * sequence doesn't overflow */ 313 overflows = *cur_s > *cur_h; 314 break; 315 316 } 317 318 /* Here, either we set the bool and broke out of the loop, or got 319 * to the end and all bytes are the same which indicates it doesn't 320 * overflow. If it did overflow, it would be this number of bytes 321 * */ 322 overflow_length = 13; 323 } 324 } 325 326 if (UNLIKELY(overflows)) { 327 ret = 0; 328 329 if (! do_warnings && retlen) { 330 *retlen = overflow_length; 331 } 332 } 333 else 334 335# endif /* < 5.26 */ 336 337 /* Here, we are either in a release that properly detects overflow, or 338 * we have checked for overflow and the next statement is executing as 339 * part of the above conditional where we know we don't have overflow. 340 * 341 * The modern versions allow anything that evaluates to a legal UV, but 342 * not overlongs nor an empty input */ 343 ret = D_PPP_utf8_to_uvchr_buf_callee( 344 (U8 *) /* Early perls: no const */ 345 s, curlen, retlen, (UTF8_ALLOW_ANYUV 346 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); 347 348# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 } 349 350 /* But actually, more modern versions restrict the UV to being no more than 351 * what an IV can hold, so it could still have gotten it wrong about 352 * overflowing. */ 353 if (UNLIKELY(ret > IV_MAX)) { 354 overflows = 1; 355 } 356 357# endif 358 359 if (UNLIKELY(overflows)) { 360 if (! do_warnings) { 361 if (retlen) { 362 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); 363 *retlen = D_PPP_MIN(*retlen, curlen); 364 } 365 return UNICODE_REPLACEMENT; 366 } 367 else { 368 369 /* We use the error message in use from 5.8-5.26 */ 370 Perl_warner(aTHX_ packWARN(WARN_UTF8), 371 "Malformed UTF-8 character (overflow at 0x%" UVxf 372 ", byte 0x%02x, after start byte 0x%02x)", 373 ret, *cur_s, *s); 374 if (retlen) { 375 *retlen = (STRLEN) -1; 376 } 377 return 0; 378 } 379 } 380 381 /* Here, did not overflow, but if it failed for some other reason, and 382 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(), 383 * try again, allowing anything. (Note a return of 0 is ok if the input 384 * was '\0') */ 385 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { 386 387 /* If curlen is 0, we already handled the case where warnings are 388 * disabled, so this 'if' will be true, and so later on, we know that 389 * 's' is dereferencible */ 390 if (do_warnings) { 391 if (retlen) { 392 *retlen = (STRLEN) -1; 393 } 394 } 395 else { 396 ret = D_PPP_utf8_to_uvchr_buf_callee( 397 (U8 *) /* Early perls: no const */ 398 s, curlen, retlen, UTF8_ALLOW_ANY); 399 /* Override with the REPLACEMENT character, as that is what the 400 * modern version of this function returns */ 401 ret = UNICODE_REPLACEMENT; 402 403# if { VERSION < 5.16.0 } 404 405 /* Versions earlier than this don't necessarily return the proper 406 * length. It should not extend past the end of string, nor past 407 * what the first byte indicates the length is, nor past the 408 * continuation characters */ 409 if (retlen && (IV) *retlen >= 0) { 410 unsigned int i = 1; 411 412 *retlen = D_PPP_MIN(*retlen, curlen); 413 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); 414 do { 415# ifdef UTF8_IS_CONTINUATION 416 if (! UTF8_IS_CONTINUATION(s[i])) 417# else /* Versions without the above don't support EBCDIC anyway */ 418 if (s[i] < 0x80 || s[i] > 0xBF) 419# endif 420 { 421 *retlen = i; 422 break; 423 } 424 } while (++i < *retlen); 425 } 426 427# endif /* end of < 5.16.0 */ 428 429 } 430 } 431 432 return ret; 433 434# endif /* end of < 5.31.4 */ 435 436} 437 438# endif 439#endif 440 441#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) 442#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses 443 to read past a NUL, making it much less likely to read 444 off the end of the buffer. A NUL indicates the start 445 of the next character anyway. If the input isn't 446 NUL-terminated, the function remains unsafe, as it 447 always has been. */ 448 449__UNDEFINED__ utf8_to_uvchr(s, lp) \ 450 ((*(s) == '\0') \ 451 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ 452 : utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp))) 453 454#endif 455 456/* Hint: utf8_to_uvchr 457 Use utf8_to_uvchr_buf() instead. But ONLY if you KNOW the upper bound 458 of the input string (not resorting to using UTF8SKIP, etc., to infer it). 459 The backported utf8_to_uvchr() will do a better job to prevent most cases 460 of trying to read beyond the end of the buffer */ 461 462/* Replace utf8_to_uvchr with utf8_to_uvchr_buf */ 463 464#ifdef sv_len_utf8 465# if { VERSION >= 5.17.5 } 466# ifndef sv_len_utf8_nomg 467# if defined(PERL_USE_GCC_BRACE_GROUPS) 468# define sv_len_utf8_nomg(sv) \ 469 ({ \ 470 SV *sv_ = (sv); \ 471 sv_len_utf8(!SvGMAGICAL(sv_) \ 472 ? sv_ \ 473 : sv_mortalcopy_flags(sv_, SV_NOSTEAL)); \ 474 }) 475# else 476 PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv) 477 { 478 dTHX; 479 if (SvGMAGICAL(sv)) 480 return sv_len_utf8(sv_mortalcopy_flags(sv, 481 SV_NOSTEAL)); 482 else return sv_len_utf8(sv); 483 } 484# define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv) 485# endif 486# endif 487# else /* < 5.17.5 */ 488 /* Older Perl versions have broken sv_len_utf8() when passed sv does not 489 * have SVf_UTF8 flag set */ 490 /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */ 491# undef sv_len_utf8 492# if defined(PERL_USE_GCC_BRACE_GROUPS) 493# define sv_len_utf8_nomg(sv) \ 494 ({ \ 495 SV *sv2 = (sv); \ 496 STRLEN len; \ 497 if (SvUTF8(sv2)) { \ 498 if (SvGMAGICAL(sv2)) \ 499 len = Perl_sv_len_utf8(aTHX_ \ 500 sv_mortalcopy_flags(sv2, \ 501 SV_NOSTEAL));\ 502 else \ 503 len = Perl_sv_len_utf8(aTHX_ sv2); \ 504 } \ 505 else SvPV_nomg(sv2, len); \ 506 len; \ 507 }) 508# define sv_len_utf8(sv) ({ SV *_sv1 = (sv); \ 509 SvGETMAGIC(_sv1); \ 510 sv_len_utf8_nomg(_sv1); \ 511 }) 512# else /* Below is no brace groups */ 513 PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv) 514 { 515 dTHX; 516 STRLEN len; 517 if (SvUTF8(sv)) { 518 if (SvGMAGICAL(sv)) 519 len = Perl_sv_len_utf8(aTHX_ 520 sv_mortalcopy_flags(sv, 521 SV_NOSTEAL)); 522 else 523 len = Perl_sv_len_utf8(aTHX_ sv); 524 } 525 else SvPV_nomg(sv, len); 526 return len; 527 } 528# define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv) 529 530 PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8(SV * sv) 531 { 532 dTHX; 533 SvGETMAGIC(sv); 534 return sv_len_utf8_nomg(sv); 535 } 536# define sv_len_utf8(sv) D_PPP_sv_len_utf8(sv) 537# endif 538# endif /* End of < 5.17.5 */ 539#endif 540 541=xsinit 542 543#define NEED_utf8_to_uvchr_buf 544 545=xsubs 546 547#if defined(UTF8f) && defined(newSVpvf) 548 549void 550UTF8f(x) 551 SV *x 552 PREINIT: 553 U32 u; 554 STRLEN len; 555 char *ptr; 556 INIT: 557 ptr = SvPV(x, len); 558 u = SvUTF8(x); 559 PPCODE: 560 x = sv_2mortal(newSVpvf("[%" UTF8f "]", UTF8fARG(u, len, ptr))); 561 XPUSHs(x); 562 XSRETURN(1); 563 564#endif 565 566#if { VERSION >= 5.006 } /* This is just a helper fcn, not publicized */ \ 567 /* as being available and params not what the */ \ 568 /* API function has; works on EBCDIC too */ 569 570SV * 571uvchr_to_utf8(native) 572 573 UV native 574 PREINIT: 575 int len; 576 U8 string[UTF8_MAXBYTES+1]; 577 int i; 578 UV uni; 579 580 CODE: 581 len = UVCHR_SKIP(native); 582 583 for (i = 0; i < len; i++) { 584 string[i] = '\0'; 585 } 586 587 if (len <= 1) { 588 string[0] = native; 589 } 590 else { 591 i = len; 592 uni = NATIVE_TO_UNI(native); 593 while (i-- > 1) { 594 string[i] = I8_TO_NATIVE_UTF8((uni & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); 595 uni >>= UTF_ACCUMULATION_SHIFT; 596 } 597 string[0] = I8_TO_NATIVE_UTF8((uni & UTF_START_MASK(len)) | UTF_START_MARK(len)); 598 } 599 600 RETVAL = newSVpvn((char *) string, len); 601 SvUTF8_on(RETVAL); 602 OUTPUT: 603 RETVAL 604 605#endif 606#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP) 607 608STRLEN 609UTF8_SAFE_SKIP(s, adjustment) 610 char * s 611 int adjustment 612 PREINIT: 613 const char *const_s; 614 CODE: 615 const_s = s; 616 /* Instead of passing in an 'e' ptr, use the real end, adjusted */ 617 RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment); 618 OUTPUT: 619 RETVAL 620 621#endif 622 623#ifdef isUTF8_CHAR 624 625STRLEN 626isUTF8_CHAR(s, adjustment) 627 unsigned char * s 628 int adjustment 629 PREINIT: 630 const unsigned char *const_s; 631 const unsigned char *const_e; 632 CODE: 633 const_s = s; 634 /* Instead of passing in an 'e' ptr, use the real end, adjusted */ 635 const_e = const_s + UTF8SKIP(const_s) + adjustment; 636 RETVAL = isUTF8_CHAR(const_s, const_e); 637 OUTPUT: 638 RETVAL 639 640#endif 641 642 643#ifdef foldEQ_utf8 644 645STRLEN 646foldEQ_utf8(s1, l1, u1, s2, l2, u2) 647 char *s1 648 UV l1 649 bool u1 650 char *s2 651 UV l2 652 bool u2 653 PREINIT: 654 const char *const_s1; 655 const char *const_s2; 656 CODE: 657 const_s1 = s1; 658 const_s2 = s2; 659 RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2); 660 OUTPUT: 661 RETVAL 662 663#endif 664 665#ifdef utf8_to_uvchr_buf 666 667AV * 668utf8_to_uvchr_buf(s, adjustment) 669 unsigned char *s 670 int adjustment 671 PREINIT: 672 AV *av; 673 STRLEN len; 674 const unsigned char *const_s; 675 CODE: 676 av = newAV(); 677 const_s = s; 678 av_push(av, newSVuv(utf8_to_uvchr_buf(const_s, 679 s + UTF8SKIP(s) + adjustment, 680 &len))); 681 if (len == (STRLEN) -1) { 682 av_push(av, newSViv(-1)); 683 } 684 else { 685 av_push(av, newSVuv(len)); 686 } 687 RETVAL = av; 688 OUTPUT: 689 RETVAL 690 691#endif 692 693#ifdef utf8_to_uvchr 694 695AV * 696utf8_to_uvchr(s) 697 unsigned char *s 698 PREINIT: 699 AV *av; 700 STRLEN len; 701 const unsigned char *const_s; 702 CODE: 703 av = newAV(); 704 const_s = s; 705 av_push(av, newSVuv(utf8_to_uvchr(const_s, &len))); 706 if (len == (STRLEN) -1) { 707 av_push(av, newSViv(-1)); 708 } 709 else { 710 av_push(av, newSVuv(len)); 711 } 712 RETVAL = av; 713 OUTPUT: 714 RETVAL 715 716#endif 717 718#ifdef sv_len_utf8 719 720STRLEN 721sv_len_utf8(sv) 722 SV *sv 723 CODE: 724 RETVAL = sv_len_utf8(sv); 725 OUTPUT: 726 RETVAL 727 728#endif 729 730#ifdef sv_len_utf8_nomg 731 732STRLEN 733sv_len_utf8_nomg(sv) 734 SV *sv 735 CODE: 736 RETVAL = sv_len_utf8_nomg(sv); 737 OUTPUT: 738 RETVAL 739 740#endif 741 742#ifdef UVCHR_IS_INVARIANT 743 744bool 745UVCHR_IS_INVARIANT(c) 746 unsigned c 747 PREINIT: 748 CODE: 749 RETVAL = UVCHR_IS_INVARIANT(c); 750 OUTPUT: 751 RETVAL 752 753#endif 754 755#ifdef UVCHR_SKIP 756 757STRLEN 758UVCHR_SKIP(c) 759 UV c 760 PREINIT: 761 CODE: 762 RETVAL = UVCHR_SKIP(c); 763 OUTPUT: 764 RETVAL 765 766#endif 767 768=tests plan => 98 769 770BEGIN { 771 # skip tests on 5.6.0 and earlier, plus 5.7.0 772 if (ivers($]) <= ivers(5.6) || ivers($]) == ivers(5.7) ) { 773 skip 'skip: broken utf8 support', 98; 774 exit; 775 } 776 require warnings; 777} 778 779is(Devel::PPPort::UTF8f(42), '[42]'); 780is(Devel::PPPort::UTF8f('abc'), '[abc]'); 781is(Devel::PPPort::UTF8f("\x{263a}"), "[\x{263a}]"); 782 783my $str = "\x{A8}"; 784if (ivers($]) >= ivers(5.8)) { eval q{utf8::upgrade($str)} } 785is(Devel::PPPort::UTF8f($str), "[\x{A8}]"); 786if (ivers($]) >= ivers(5.8)) { eval q{utf8::downgrade($str)} } 787is(Devel::PPPort::UTF8f($str), "[\x{A8}]"); 788 789is(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); 790is(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); 791 792is(&Devel::PPPort::isUTF8_CHAR("A", -1), 0); 793is(&Devel::PPPort::isUTF8_CHAR("A", 0), 1); 794is(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0); 795is(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2); 796 797is(&Devel::PPPort::UVCHR_IS_INVARIANT(ord("A")), 1); 798ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0xb6)); 799ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0x100)); 800 801is(&Devel::PPPort::UVCHR_SKIP(ord("A")), 1); 802is(&Devel::PPPort::UVCHR_SKIP(0xb6), 2, "This is a test"); 803is(&Devel::PPPort::UVCHR_SKIP(0x3FF), 2); 804is(&Devel::PPPort::UVCHR_SKIP(0x3FFF), 3); 805is(&Devel::PPPort::UVCHR_SKIP(0x3FFFF), 4); 806is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFF), 5); 807is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFFF), ord("A") == 65 ? 5 : 6); 808is(&Devel::PPPort::UVCHR_SKIP(0x4000000), ord("A") == 65 ? 6 : 7); 809if (ord("A") != 65) { 810 skip("Test not valid on EBCDIC", 1) 811} 812else { 813 is(&Devel::PPPort::UVCHR_SKIP(0xFFFFFFFF), 7); 814} 815 816if (ivers($]) < ivers(5.8)) { 817 skip("Perl version too early", 3); 818} 819else { 820 is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1); 821 is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0); 822 is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0); 823} 824 825my $ret = &Devel::PPPort::utf8_to_uvchr("A"); 826is($ret->[0], ord("A")); 827is($ret->[1], 1); 828 829$ret = &Devel::PPPort::utf8_to_uvchr("\0"); 830is($ret->[0], 0); 831is($ret->[1], 1); 832 833$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); 834is($ret->[0], ord("A")); 835is($ret->[1], 1); 836 837$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); 838is($ret->[0], 0); 839is($ret->[1], 1); 840 841my @buf_tests = ( 842 { 843 input => "A", 844 adjustment => -1, 845 warning => eval "qr/empty/", 846 no_warnings_returned_length => 0, 847 }, 848 { 849 input => "\xc4\xc5", 850 adjustment => 0, 851 warning => eval "qr/non-continuation/", 852 no_warnings_returned_length => 1, 853 }, 854 { 855 input => "\xc4\x80", 856 adjustment => -1, 857 warning => eval "qr/short|1 byte, need 2/", 858 no_warnings_returned_length => 1, 859 }, 860 { 861 input => "\xc0\x81", 862 adjustment => 0, 863 warning => eval "qr/overlong|2 bytes, need 1/", 864 no_warnings_returned_length => 2, 865 }, 866 { 867 input => "\xe0\x80\x81", 868 adjustment => 0, 869 warning => eval "qr/overlong|3 bytes, need 1/", 870 no_warnings_returned_length => 3, 871 }, 872 { 873 input => "\xf0\x80\x80\x81", 874 adjustment => 0, 875 warning => eval "qr/overlong|4 bytes, need 1/", 876 no_warnings_returned_length => 4, 877 }, 878 { # Old algorithm failed to detect this 879 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", 880 adjustment => 0, 881 warning => eval "qr/overflow/", 882 no_warnings_returned_length => 13, 883 }, 884); 885 886if (ord("A") != 65) { # tests not valid for EBCDIC 887 skip("Perl version too early", 2 + 4 + (scalar @buf_tests * 5)); 888} 889else { 890 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); 891 is($ret->[0], 0x100); 892 is($ret->[1], 2); 893 894 my @warnings; 895 local $SIG{__WARN__} = sub { push @warnings, @_; }; 896 897 { 898 use warnings 'utf8'; 899 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); 900 is($ret->[0], 0); 901 is($ret->[1], -1); 902 903 no warnings 'utf8'; 904 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); 905 is($ret->[0], 0xFFFD); 906 is($ret->[1], 1); 907 } 908 909 910 # An empty input is an assertion failure on debugging builds. It is 911 # deliberately the first test. 912 require Config; Config->import; 913 use vars '%Config'; 914 915 # VMS doesn't put DEBUGGING in ccflags, and Windows doesn't have 916 # $Config{config_args}. When 5.14 or later can be assumed, use 917 # Config::non_bincompat_options(), but for now we're stuck with this. 918 if ( $Config{ccflags} =~ /-DDEBUGGING/ 919 || $^O eq 'VMS' && $Config{config_args} =~ /\bDDEBUGGING\b/) 920 { 921 shift @buf_tests; 922 skip("Test not valid on DEBUGGING builds", 5); 923 } 924 925 my $test; 926 for $test (@buf_tests) { 927 my $input = $test->{'input'}; 928 my $adjustment = $test->{'adjustment'}; 929 my $display = 'utf8_to_uvchr_buf("'; 930 my $i; 931 for ($i = 0; $i < length($input) + $adjustment; $i++) { 932 $display .= sprintf "\\x%02x", ord substr($input, $i, 1); 933 } 934 935 $display .= '")'; 936 my $warning = $test->{'warning'}; 937 938 undef @warnings; 939 use warnings 'utf8'; 940 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); 941 is($ret->[0], 0, "returned value $display; warnings enabled"); 942 is($ret->[1], -1, "returned length $display; warnings enabled"); 943 my $all_warnings = join "; ", @warnings; 944 my $contains = grep { $_ =~ $warning } $all_warnings; 945 is($contains, 1, $display 946 . "; Got: '$all_warnings', which should contain '$warning'"); 947 948 undef @warnings; 949 no warnings 'utf8'; 950 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); 951 is($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); 952 is($ret->[1], $test->{'no_warnings_returned_length'}, 953 "returned length $display; warnings disabled"); 954 } 955} 956 957if (ivers($]) ge ivers(5.008)) { 958 BEGIN { if (ivers($]) ge ivers(5.008)) { require utf8; "utf8"->import() } } 959 960 is(Devel::PPPort::sv_len_utf8("a������"), 4); 961 is(Devel::PPPort::sv_len_utf8_nomg("a������"), 4); 962 963 my $str = "������"; 964 utf8::downgrade($str); 965 is(Devel::PPPort::sv_len_utf8($str), 3); 966 utf8::downgrade($str); 967 is(Devel::PPPort::sv_len_utf8_nomg($str), 3); 968 utf8::upgrade($str); 969 is(Devel::PPPort::sv_len_utf8($str), 3); 970 utf8::upgrade($str); 971 is(Devel::PPPort::sv_len_utf8_nomg($str), 3); 972 973 tie my $scalar, 'TieScalarCounter', "��"; 974 975 is(tied($scalar)->{fetch}, 0); 976 is(tied($scalar)->{store}, 0); 977 is(Devel::PPPort::sv_len_utf8($scalar), 2); 978 is(tied($scalar)->{fetch}, 1); 979 is(tied($scalar)->{store}, 0); 980 is(Devel::PPPort::sv_len_utf8($scalar), 3); 981 is(tied($scalar)->{fetch}, 2); 982 is(tied($scalar)->{store}, 0); 983 is(Devel::PPPort::sv_len_utf8($scalar), 4); 984 is(tied($scalar)->{fetch}, 3); 985 is(tied($scalar)->{store}, 0); 986 is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4); 987 is(tied($scalar)->{fetch}, 3); 988 is(tied($scalar)->{store}, 0); 989 is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4); 990 is(tied($scalar)->{fetch}, 3); 991 is(tied($scalar)->{store}, 0); 992} else { 993 skip 'skip: no utf8::downgrade/utf8::upgrade support', 23; 994} 995 996package TieScalarCounter; 997 998sub TIESCALAR { 999 my ($class, $value) = @_; 1000 return bless { fetch => 0, store => 0, value => $value }, $class; 1001} 1002 1003sub FETCH { 1004 BEGIN { if (main::ivers($]) ge main::ivers(5.008)) { require utf8; "utf8"->import() } } 1005 my ($self) = @_; 1006 $self->{fetch}++; 1007 return $self->{value} .= "��"; 1008} 1009 1010sub STORE { 1011 my ($self, $value) = @_; 1012 $self->{store}++; 1013 $self->{value} = $value; 1014} 1015