1/* inline.h 2 * 3 * Copyright (C) 2012 by Larry Wall and others 4 * 5 * You may distribute under the terms of either the GNU General Public 6 * License or the Artistic License, as specified in the README file. 7 * 8 * This file contains tables and code adapted from 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this 10 * copyright notice: 11 12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 13 14Permission is hereby granted, free of charge, to any person obtaining a copy of 15this software and associated documentation files (the "Software"), to deal in 16the Software without restriction, including without limitation the rights to 17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 18of the Software, and to permit persons to whom the Software is furnished to do 19so, subject to the following conditions: 20 21The above copyright notice and this permission notice shall be included in all 22copies or substantial portions of the Software. 23 24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30SOFTWARE. 31 32 * 33 * This file is a home for static inline functions that cannot go in other 34 * header files, because they depend on proto.h (included after most other 35 * headers) or struct definitions. 36 * 37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but 38 * whose details should be exposed to the compiler, for such things as tail 39 * call optimization. 40 * 41 * Each section names the header file that the functions "belong" to. 42 */ 43 44/* ------------------------------- av.h ------------------------------- */ 45 46/* 47=for apidoc_section $AV 48=for apidoc av_count 49Returns the number of elements in the array C<av>. This is the true length of 50the array, including any undefined elements. It is always the same as 51S<C<av_top_index(av) + 1>>. 52 53=cut 54*/ 55PERL_STATIC_INLINE Size_t 56Perl_av_count(pTHX_ AV *av) 57{ 58 PERL_ARGS_ASSERT_AV_COUNT; 59 assert(SvTYPE(av) == SVt_PVAV); 60 61 return AvFILL(av) + 1; 62} 63 64/* ------------------------------- av.c ------------------------------- */ 65 66/* 67=for apidoc av_store_simple 68 69This is a cut-down version of av_store that assumes that the array is 70very straightforward - no magic, not readonly, and AvREAL - and that 71C<key> is not negative. This function MUST NOT be used in situations 72where any of those assumptions may not hold. 73 74Stores an SV in an array. The array index is specified as C<key>. It 75can be dereferenced to get the C<SV*> that was stored there (= C<val>)). 76 77Note that the caller is responsible for suitably incrementing the reference 78count of C<val> before the call. 79 80Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>. 81 82=cut 83*/ 84 85PERL_STATIC_INLINE SV** 86Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val) 87{ 88 SV** ary; 89 90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE; 91 assert(SvTYPE(av) == SVt_PVAV); 92 assert(!SvMAGICAL(av)); 93 assert(!SvREADONLY(av)); 94 assert(AvREAL(av)); 95 assert(key > -1); 96 97 ary = AvARRAY(av); 98 99 if (AvFILLp(av) < key) { 100 if (key > AvMAX(av)) { 101 av_extend(av,key); 102 ary = AvARRAY(av); 103 } 104 AvFILLp(av) = key; 105 } else 106 SvREFCNT_dec(ary[key]); 107 108 ary[key] = val; 109 return &ary[key]; 110} 111 112/* 113=for apidoc av_fetch_simple 114 115This is a cut-down version of av_fetch that assumes that the array is 116very straightforward - no magic, not readonly, and AvREAL - and that 117C<key> is not negative. This function MUST NOT be used in situations 118where any of those assumptions may not hold. 119 120Returns the SV at the specified index in the array. The C<key> is the 121index. If lval is true, you are guaranteed to get a real SV back (in case 122it wasn't real before), which you can then modify. Check that the return 123value is non-null before dereferencing it to a C<SV*>. 124 125The rough perl equivalent is C<$myarray[$key]>. 126 127=cut 128*/ 129 130PERL_STATIC_INLINE SV** 131Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval) 132{ 133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE; 134 assert(SvTYPE(av) == SVt_PVAV); 135 assert(!SvMAGICAL(av)); 136 assert(!SvREADONLY(av)); 137 assert(AvREAL(av)); 138 assert(key > -1); 139 140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) { 141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL; 142 } else { 143 return &AvARRAY(av)[key]; 144 } 145} 146 147/* 148=for apidoc av_push_simple 149 150This is a cut-down version of av_push that assumes that the array is very 151straightforward - no magic, not readonly, and AvREAL - and that C<key> is 152not less than -1. This function MUST NOT be used in situations where any 153of those assumptions may not hold. 154 155Pushes an SV (transferring control of one reference count) onto the end of the 156array. The array will grow automatically to accommodate the addition. 157 158Perl equivalent: C<push @myarray, $val;>. 159 160=cut 161*/ 162 163PERL_STATIC_INLINE void 164Perl_av_push_simple(pTHX_ AV *av, SV *val) 165{ 166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE; 167 assert(SvTYPE(av) == SVt_PVAV); 168 assert(!SvMAGICAL(av)); 169 assert(!SvREADONLY(av)); 170 assert(AvREAL(av)); 171 assert(AvFILLp(av) > -2); 172 173 (void)av_store_simple(av,AvFILLp(av)+1,val); 174} 175 176/* 177=for apidoc av_new_alloc 178 179This implements L<perlapi/C<newAV_alloc_x>> 180and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this 181functionality. 182 183Creates a new AV and allocates its SV* array. 184 185This is similar to, but more efficient than doing: 186 187 AV *av = newAV(); 188 av_extend(av, key); 189 190The size parameter is used to pre-allocate a SV* array large enough to 191hold at least elements C<0..(size-1)>. C<size> must be at least 1. 192 193The C<zeroflag> parameter controls whether or not the array is NULL 194initialized. 195 196=cut 197*/ 198 199PERL_STATIC_INLINE AV * 200Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag) 201{ 202 AV * const av = newAV(); 203 SV** ary; 204 PERL_ARGS_ASSERT_AV_NEW_ALLOC; 205 assert(size > 0); 206 207 Newx(ary, size, SV*); /* Newx performs the memwrap check */ 208 AvALLOC(av) = ary; 209 AvARRAY(av) = ary; 210 AvMAX(av) = size - 1; 211 212 if (zeroflag) 213 Zero(ary, size, SV*); 214 215 return av; 216} 217 218 219/* ------------------------------- cv.h ------------------------------- */ 220 221/* 222=for apidoc_section $CV 223=for apidoc CvGV 224Returns the GV associated with the CV C<sv>, reifying it if necessary. 225 226=cut 227*/ 228PERL_STATIC_INLINE GV * 229Perl_CvGV(pTHX_ CV *sv) 230{ 231 PERL_ARGS_ASSERT_CVGV; 232 233 return CvNAMED(sv) 234 ? Perl_cvgv_from_hek(aTHX_ sv) 235 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; 236} 237 238/* 239=for apidoc CvDEPTH 240Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a 241recursive call. 242 243=cut 244*/ 245PERL_STATIC_INLINE I32 * 246Perl_CvDEPTH(const CV * const sv) 247{ 248 PERL_ARGS_ASSERT_CVDEPTH; 249 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM); 250 251 return &((XPVCV*)SvANY(sv))->xcv_depth; 252} 253 254/* 255 CvPROTO returns the prototype as stored, which is not necessarily what 256 the interpreter should be using. Specifically, the interpreter assumes 257 that spaces have been stripped, which has been the case if the prototype 258 was added by toke.c, but is generally not the case if it was added elsewhere. 259 Since we can't enforce the spacelessness at assignment time, this routine 260 provides a temporary copy at parse time with spaces removed. 261 I<orig> is the start of the original buffer, I<len> is the length of the 262 prototype and will be updated when this returns. 263 */ 264 265#ifdef PERL_CORE 266PERL_STATIC_INLINE char * 267S_strip_spaces(pTHX_ const char * orig, STRLEN * const len) 268{ 269 SV * tmpsv; 270 char * tmps; 271 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP); 272 tmps = SvPVX(tmpsv); 273 while ((*len)--) { 274 if (!isSPACE(*orig)) 275 *tmps++ = *orig; 276 orig++; 277 } 278 *tmps = '\0'; 279 *len = tmps - SvPVX(tmpsv); 280 return SvPVX(tmpsv); 281} 282#endif 283 284/* ------------------------------- iperlsys.h ------------------------------- */ 285#if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS) 286 287/* Otherwise this function is implemented as macros in iperlsys.h */ 288 289PERL_STATIC_INLINE bool 290S_PerlEnv_putenv(pTHX_ char * str) 291{ 292 PERL_ARGS_ASSERT_PERLENV_PUTENV; 293 294 ENV_LOCK; 295 bool retval = putenv(str); 296 ENV_UNLOCK; 297 298 return retval; 299} 300 301#endif 302 303/* ------------------------------- mg.h ------------------------------- */ 304 305#if defined(PERL_CORE) || defined(PERL_EXT) 306/* assumes get-magic and stringification have already occurred */ 307PERL_STATIC_INLINE STRLEN 308S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) 309{ 310 assert(mg->mg_type == PERL_MAGIC_regex_global); 311 assert(mg->mg_len != -1); 312 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv)) 313 return (STRLEN)mg->mg_len; 314 else { 315 const STRLEN pos = (STRLEN)mg->mg_len; 316 /* Without this check, we may read past the end of the buffer: */ 317 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1; 318 return sv_or_pv_pos_u2b(sv, s, pos, NULL); 319 } 320} 321#endif 322 323/* ------------------------------- pad.h ------------------------------ */ 324 325#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C) 326PERL_STATIC_INLINE bool 327S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) 328{ 329 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE; 330 331 /* is seq within the range _LOW to _HIGH ? 332 * This is complicated by the fact that PL_cop_seqmax 333 * may have wrapped around at some point */ 334 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO) 335 return FALSE; /* not yet introduced */ 336 337 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) { 338 /* in compiling scope */ 339 if ( 340 (seq > COP_SEQ_RANGE_LOW(pn)) 341 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1)) 342 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1)) 343 ) 344 return TRUE; 345 } 346 else if ( 347 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) 348 ? 349 ( seq > COP_SEQ_RANGE_LOW(pn) 350 || seq <= COP_SEQ_RANGE_HIGH(pn)) 351 352 : ( seq > COP_SEQ_RANGE_LOW(pn) 353 && seq <= COP_SEQ_RANGE_HIGH(pn)) 354 ) 355 return TRUE; 356 return FALSE; 357} 358#endif 359 360/* ------------------------------- pp.h ------------------------------- */ 361 362PERL_STATIC_INLINE I32 363Perl_TOPMARK(pTHX) 364{ 365 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 366 "MARK top %p %" IVdf "\n", 367 PL_markstack_ptr, 368 (IV)*PL_markstack_ptr))); 369 return *PL_markstack_ptr; 370} 371 372PERL_STATIC_INLINE I32 373Perl_POPMARK(pTHX) 374{ 375 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 376 "MARK pop %p %" IVdf "\n", 377 (PL_markstack_ptr-1), 378 (IV)*(PL_markstack_ptr-1)))); 379 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow"); 380 return *PL_markstack_ptr--; 381} 382 383/* ----------------------------- regexp.h ----------------------------- */ 384 385/* PVLVs need to act as a superset of all scalar types - they are basically 386 * PVMGs with a few extra fields. 387 * REGEXPs are first class scalars, but have many fields that can't be copied 388 * into a PVLV body. 389 * 390 * Hence we take a different approach - instead of a copy, PVLVs store a pointer 391 * back to the original body. To avoid increasing the size of PVLVs just for the 392 * rare case of REGEXP assignment, this pointer is stored in the memory usually 393 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to 394 * read the pointer from the two possible locations. The macro SvLEN() wraps the 395 * access to the union's member xpvlenu_len, but there is no equivalent macro 396 * for wrapping the union's member xpvlenu_rx, hence the direct reference here. 397 * 398 * See commit df6b4bd56551f2d3 for more details. */ 399 400PERL_STATIC_INLINE struct regexp * 401Perl_ReANY(const REGEXP * const re) 402{ 403 XPV* const p = (XPV*)SvANY(re); 404 405 PERL_ARGS_ASSERT_REANY; 406 assert(isREGEXP(re)); 407 408 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx 409 : (struct regexp *)p; 410} 411 412/* ------------------------------- utf8.h ------------------------------- */ 413 414/* 415=for apidoc_section $unicode 416*/ 417 418PERL_STATIC_INLINE void 419Perl_append_utf8_from_native_byte(const U8 byte, U8** dest) 420{ 421 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 422 * encoded string at '*dest', updating '*dest' to include it */ 423 424 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; 425 426 if (NATIVE_BYTE_IS_INVARIANT(byte)) 427 *((*dest)++) = byte; 428 else { 429 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); 430 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); 431 } 432} 433 434/* 435=for apidoc valid_utf8_to_uvchr 436Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is 437known that the next character in the input UTF-8 string C<s> is well-formed 438(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code 439points, and non-Unicode code points are allowed. 440 441=cut 442 443 */ 444 445PERL_STATIC_INLINE UV 446Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) 447{ 448 const UV expectlen = UTF8SKIP(s); 449 const U8* send = s + expectlen; 450 UV uv = *s; 451 452 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; 453 454 if (retlen) { 455 *retlen = expectlen; 456 } 457 458 /* An invariant is trivially returned */ 459 if (expectlen == 1) { 460 return uv; 461 } 462 463 /* Remove the leading bits that indicate the number of bytes, leaving just 464 * the bits that are part of the value */ 465 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 466 467 /* Now, loop through the remaining bytes, accumulating each into the 468 * working total as we go. (I khw tried unrolling the loop for up to 4 469 * bytes, but there was no performance improvement) */ 470 for (++s; s < send; s++) { 471 uv = UTF8_ACCUMULATE(uv, *s); 472 } 473 474 return UNI_TO_NATIVE(uv); 475 476} 477 478/* 479=for apidoc is_utf8_invariant_string 480 481Returns TRUE if the first C<len> bytes of the string C<s> are the same 482regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on 483EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they 484are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only 485the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range 486characters are invariant, but so also are the C1 controls. 487 488If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you 489use this option, that C<s> can't have embedded C<NUL> characters and has to 490have a terminating C<NUL> byte). 491 492See also 493C<L</is_utf8_string>>, 494C<L</is_utf8_string_flags>>, 495C<L</is_utf8_string_loc>>, 496C<L</is_utf8_string_loc_flags>>, 497C<L</is_utf8_string_loclen>>, 498C<L</is_utf8_string_loclen_flags>>, 499C<L</is_utf8_fixed_width_buf_flags>>, 500C<L</is_utf8_fixed_width_buf_loc_flags>>, 501C<L</is_utf8_fixed_width_buf_loclen_flags>>, 502C<L</is_strict_utf8_string>>, 503C<L</is_strict_utf8_string_loc>>, 504C<L</is_strict_utf8_string_loclen>>, 505C<L</is_c9strict_utf8_string>>, 506C<L</is_c9strict_utf8_string_loc>>, 507and 508C<L</is_c9strict_utf8_string_loclen>>. 509 510=cut 511 512*/ 513 514#define is_utf8_invariant_string(s, len) \ 515 is_utf8_invariant_string_loc(s, len, NULL) 516 517/* 518=for apidoc is_utf8_invariant_string_loc 519 520Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of 521the first UTF-8 variant character in the C<ep> pointer; if all characters are 522UTF-8 invariant, this function does not change the contents of C<*ep>. 523 524=cut 525 526*/ 527 528PERL_STATIC_INLINE bool 529Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) 530{ 531 const U8* send; 532 const U8* x = s; 533 534 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; 535 536 if (len == 0) { 537 len = strlen((const char *)s); 538 } 539 540 send = s + len; 541 542/* This looks like 0x010101... */ 543# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) 544 545/* This looks like 0x808080... */ 546# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) 547# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) 548# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) 549 550/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by 551 * or'ing together the lowest bits of 'x'. Hopefully the final term gets 552 * optimized out completely on a 32-bit system, and its mask gets optimized out 553 * on a 64-bit system */ 554# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ 555 | ( PTR2nat(x) >> 1) \ 556 | ( ( (PTR2nat(x) \ 557 & PERL_WORD_BOUNDARY_MASK) >> 2)))) 558 559#ifndef EBCDIC 560 561 /* Do the word-at-a-time iff there is at least one usable full word. That 562 * means that after advancing to a word boundary, there still is at least a 563 * full word left. The number of bytes needed to advance is 'wordsize - 564 * offset' unless offset is 0. */ 565 if ((STRLEN) (send - x) >= PERL_WORDSIZE 566 567 /* This term is wordsize if subword; 0 if not */ 568 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 569 570 /* 'offset' */ 571 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 572 { 573 574 /* Process per-byte until reach word boundary. XXX This loop could be 575 * eliminated if we knew that this platform had fast unaligned reads */ 576 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 577 if (! UTF8_IS_INVARIANT(*x)) { 578 if (ep) { 579 *ep = x; 580 } 581 582 return FALSE; 583 } 584 x++; 585 } 586 587 /* Here, we know we have at least one full word to process. Process 588 * per-word as long as we have at least a full word left */ 589 do { 590 if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { 591 592 /* Found a variant. Just return if caller doesn't want its 593 * exact position */ 594 if (! ep) { 595 return FALSE; 596 } 597 598# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ 599 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 600 601 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x); 602 assert(*ep >= s && *ep < send); 603 604 return FALSE; 605 606# else /* If weird byte order, drop into next loop to do byte-at-a-time 607 checks. */ 608 609 break; 610# endif 611 } 612 613 x += PERL_WORDSIZE; 614 615 } while (x + PERL_WORDSIZE <= send); 616 } 617 618#endif /* End of ! EBCDIC */ 619 620 /* Process per-byte */ 621 while (x < send) { 622 if (! UTF8_IS_INVARIANT(*x)) { 623 if (ep) { 624 *ep = x; 625 } 626 627 return FALSE; 628 } 629 630 x++; 631 } 632 633 return TRUE; 634} 635 636/* See if the platform has builtins for finding the most/least significant bit, 637 * and which one is right for using on 32 and 64 bit operands */ 638#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0)) 639# if U32SIZE == INTSIZE 640# define PERL_CLZ_32 __builtin_clz 641# endif 642# if defined(U64TYPE) && U64SIZE == INTSIZE 643# define PERL_CLZ_64 __builtin_clz 644# endif 645#endif 646#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0)) 647# if U32SIZE == INTSIZE 648# define PERL_CTZ_32 __builtin_ctz 649# endif 650# if defined(U64TYPE) && U64SIZE == INTSIZE 651# define PERL_CTZ_64 __builtin_ctz 652# endif 653#endif 654 655#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0)) 656# if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32) 657# define PERL_CLZ_32 __builtin_clzl 658# endif 659# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64) 660# define PERL_CLZ_64 __builtin_clzl 661# endif 662#endif 663#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0)) 664# if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32) 665# define PERL_CTZ_32 __builtin_ctzl 666# endif 667# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64) 668# define PERL_CTZ_64 __builtin_ctzl 669# endif 670#endif 671 672#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0)) 673# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32) 674# define PERL_CLZ_32 __builtin_clzll 675# endif 676# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64) 677# define PERL_CLZ_64 __builtin_clzll 678# endif 679#endif 680#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0)) 681# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32) 682# define PERL_CTZ_32 __builtin_ctzll 683# endif 684# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64) 685# define PERL_CTZ_64 __builtin_ctzll 686# endif 687#endif 688 689#if defined(_MSC_VER) 690# include <intrin.h> 691# pragma intrinsic(_BitScanForward) 692# pragma intrinsic(_BitScanReverse) 693# ifdef _WIN64 694# pragma intrinsic(_BitScanForward64) 695# pragma intrinsic(_BitScanReverse64) 696# endif 697#endif 698 699/* The reason there are not checks to see if ffs() and ffsl() are available for 700 * determining the lsb, is because these don't improve on the deBruijn method 701 * fallback, which is just a branchless integer multiply, array element 702 * retrieval, and shift. The others, even if the function call overhead is 703 * optimized out, have to cope with the possibility of the input being all 704 * zeroes, and almost certainly will have conditionals for this eventuality. 705 * khw, at the time of this commit, looked at the source for both gcc and clang 706 * to verify this. (gcc used a method inferior to deBruijn.) */ 707 708/* Below are functions to find the first, last, or only set bit in a word. On 709 * platforms with 64-bit capability, there is a pair for each operation; the 710 * first taking a 64 bit operand, and the second a 32 bit one. The logic is 711 * the same in each pair, so the second is stripped of most comments. */ 712 713#ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 714 715PERL_STATIC_INLINE unsigned 716Perl_lsbit_pos64(U64 word) 717{ 718 /* Find the position (0..63) of the least significant set bit in the input 719 * word */ 720 721 ASSUME(word != 0); 722 723 /* If we can determine that the platform has a usable fast method to get 724 * this info, use that */ 725 726# if defined(PERL_CTZ_64) 727# define PERL_HAS_FAST_GET_LSB_POS64 728 729 return (unsigned) PERL_CTZ_64(word); 730 731# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) 732# define PERL_HAS_FAST_GET_LSB_POS64 733 734 { 735 unsigned long index; 736 _BitScanForward64(&index, word); 737 return (unsigned)index; 738 } 739 740# else 741 742 /* Here, we didn't find a fast method for finding the lsb. Fall back to 743 * making the lsb the only set bit in the word, and use our function that 744 * works on words with a single bit set. 745 * 746 * Isolate the lsb; 747 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set 748 * 749 * The word will look like this, with a rightmost set bit in position 's': 750 * ('x's are don't cares, and 'y's are their complements) 751 * s 752 * x..x100..00 753 * y..y011..11 Complement 754 * y..y100..00 Add 1 755 * 0..0100..00 And with the original 756 * 757 * (Yes, complementing and adding 1 is just taking the negative on 2's 758 * complement machines, but not on 1's complement ones, and some compilers 759 * complain about negating an unsigned.) 760 */ 761 return single_1bit_pos64(word & (~word + 1)); 762 763# endif 764 765} 766 767# define lsbit_pos_uintmax_(word) lsbit_pos64(word) 768#else /* ! QUAD */ 769# define lsbit_pos_uintmax_(word) lsbit_pos32(word) 770#endif 771 772PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */ 773Perl_lsbit_pos32(U32 word) 774{ 775 /* Find the position (0..31) of the least significant set bit in the input 776 * word */ 777 778 ASSUME(word != 0); 779 780#if defined(PERL_CTZ_32) 781# define PERL_HAS_FAST_GET_LSB_POS32 782 783 return (unsigned) PERL_CTZ_32(word); 784 785#elif U32SIZE == 4 && defined(_MSC_VER) 786# define PERL_HAS_FAST_GET_LSB_POS32 787 788 { 789 unsigned long index; 790 _BitScanForward(&index, word); 791 return (unsigned)index; 792 } 793 794#else 795 796 return single_1bit_pos32(word & (~word + 1)); 797 798#endif 799 800} 801 802 803/* Convert the leading zeros count to the bit position of the first set bit. 804 * This just subtracts from the highest position, 31 or 63. But some compilers 805 * don't optimize this optimally, and so a bit of bit twiddling encourages them 806 * to do the right thing. It turns out that subtracting a smaller non-negative 807 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of 808 * the two numbers. To see why, first note that the sum of any number, x, and 809 * its complement, x', is all ones. So all ones minus x is x'. Then note that 810 * the xor of x and all ones is x'. */ 811#define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc)) 812 813#ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 814 815PERL_STATIC_INLINE unsigned 816Perl_msbit_pos64(U64 word) 817{ 818 /* Find the position (0..63) of the most significant set bit in the input 819 * word */ 820 821 ASSUME(word != 0); 822 823 /* If we can determine that the platform has a usable fast method to get 824 * this, use that */ 825 826# if defined(PERL_CLZ_64) 827# define PERL_HAS_FAST_GET_MSB_POS64 828 829 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word)); 830 831# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) 832# define PERL_HAS_FAST_GET_MSB_POS64 833 834 { 835 unsigned long index; 836 _BitScanReverse64(&index, word); 837 return (unsigned)index; 838 } 839 840# else 841 842 /* Here, we didn't find a fast method for finding the msb. Fall back to 843 * making the msb the only set bit in the word, and use our function that 844 * works on words with a single bit set. 845 * 846 * Isolate the msb; http://codeforces.com/blog/entry/10330 847 * 848 * Only the most significant set bit matters. Or'ing word with its right 849 * shift of 1 makes that bit and the next one to its right both 1. 850 * Repeating that with the right shift of 2 makes for 4 1-bits in a row. 851 * ... We end with the msb and all to the right being 1. */ 852 word |= (word >> 1); 853 word |= (word >> 2); 854 word |= (word >> 4); 855 word |= (word >> 8); 856 word |= (word >> 16); 857 word |= (word >> 32); 858 859 /* Then subtracting the right shift by 1 clears all but the left-most of 860 * the 1 bits, which is our desired result */ 861 word -= (word >> 1); 862 863 /* Now we have a single bit set */ 864 return single_1bit_pos64(word); 865 866# endif 867 868} 869 870# define msbit_pos_uintmax_(word) msbit_pos64(word) 871#else /* ! QUAD */ 872# define msbit_pos_uintmax_(word) msbit_pos32(word) 873#endif 874 875PERL_STATIC_INLINE unsigned 876Perl_msbit_pos32(U32 word) 877{ 878 /* Find the position (0..31) of the most significant set bit in the input 879 * word */ 880 881 ASSUME(word != 0); 882 883#if defined(PERL_CLZ_32) 884# define PERL_HAS_FAST_GET_MSB_POS32 885 886 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word)); 887 888#elif U32SIZE == 4 && defined(_MSC_VER) 889# define PERL_HAS_FAST_GET_MSB_POS32 890 891 { 892 unsigned long index; 893 _BitScanReverse(&index, word); 894 return (unsigned)index; 895 } 896 897#else 898 899 word |= (word >> 1); 900 word |= (word >> 2); 901 word |= (word >> 4); 902 word |= (word >> 8); 903 word |= (word >> 16); 904 word -= (word >> 1); 905 return single_1bit_pos32(word); 906 907#endif 908 909} 910 911#if UVSIZE == U64SIZE 912# define msbit_pos(word) msbit_pos64(word) 913# define lsbit_pos(word) lsbit_pos64(word) 914#elif UVSIZE == U32SIZE 915# define msbit_pos(word) msbit_pos32(word) 916# define lsbit_pos(word) lsbit_pos32(word) 917#endif 918 919#ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 920 921PERL_STATIC_INLINE unsigned 922Perl_single_1bit_pos64(U64 word) 923{ 924 /* Given a 64-bit word known to contain all zero bits except one 1 bit, 925 * find and return the 1's position: 0..63 */ 926 927# ifdef PERL_CORE /* macro not exported */ 928 ASSUME(isPOWER_OF_2(word)); 929# else 930 ASSUME(word && (word & (word-1)) == 0); 931# endif 932 933 /* The only set bit is both the most and least significant bit. If we have 934 * a fast way of finding either one, use that. 935 * 936 * It may appear at first glance that those functions call this one, but 937 * they don't if the corresponding #define is set */ 938 939# ifdef PERL_HAS_FAST_GET_MSB_POS64 940 941 return msbit_pos64(word); 942 943# elif defined(PERL_HAS_FAST_GET_LSB_POS64) 944 945 return lsbit_pos64(word); 946 947# else 948 949 /* The position of the only set bit in a word can be quickly calculated 950 * using deBruijn sequences. See for example 951 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */ 952 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_) 953 >> PERL_deBruijnShift64_]; 954# endif 955 956} 957 958#endif 959 960PERL_STATIC_INLINE unsigned 961Perl_single_1bit_pos32(U32 word) 962{ 963 /* Given a 32-bit word known to contain all zero bits except one 1 bit, 964 * find and return the 1's position: 0..31 */ 965 966#ifdef PERL_CORE /* macro not exported */ 967 ASSUME(isPOWER_OF_2(word)); 968#else 969 ASSUME(word && (word & (word-1)) == 0); 970#endif 971#ifdef PERL_HAS_FAST_GET_MSB_POS32 972 973 return msbit_pos32(word); 974 975#elif defined(PERL_HAS_FAST_GET_LSB_POS32) 976 977 return lsbit_pos32(word); 978 979/* Unlikely, but possible for the platform to have a wider fast operation but 980 * not a narrower one. But easy enough to handle the case by widening the 981 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops 982 * would be slower than the deBruijn method.) */ 983#elif defined(PERL_HAS_FAST_GET_MSB_POS64) 984 985 return msbit_pos64(word); 986 987#elif defined(PERL_HAS_FAST_GET_LSB_POS64) 988 989 return lsbit_pos64(word); 990 991#else 992 993 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_) 994 >> PERL_deBruijnShift32_]; 995#endif 996 997} 998 999#ifndef EBCDIC 1000 1001PERL_STATIC_INLINE unsigned int 1002Perl_variant_byte_number(PERL_UINTMAX_T word) 1003{ 1004 /* This returns the position in a word (0..7) of the first variant byte in 1005 * it. This is a helper function. Note that there are no branches */ 1006 1007 /* Get just the msb bits of each byte */ 1008 word &= PERL_VARIANTS_WORD_MASK; 1009 1010 /* This should only be called if we know there is a variant byte in the 1011 * word */ 1012 assert(word); 1013 1014# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 1015 1016 /* Bytes are stored like 1017 * Byte8 ... Byte2 Byte1 1018 * 63..56...15...8 7...0 1019 * so getting the lsb of the whole modified word is getting the msb of the 1020 * first byte that has its msb set */ 1021 word = lsbit_pos_uintmax_(word); 1022 1023 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert 1024 * to 0..7 */ 1025 return (unsigned int) ((word + 1) >> 3) - 1; 1026 1027# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 1028 1029 /* Bytes are stored like 1030 * Byte1 Byte2 ... Byte8 1031 * 63..56 55..47 ... 7...0 1032 * so getting the msb of the whole modified word is getting the msb of the 1033 * first byte that has its msb set */ 1034 word = msbit_pos_uintmax_(word); 1035 1036 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert 1037 * to 0..7 */ 1038 word = ((word + 1) >> 3) - 1; 1039 1040 /* And invert the result because of the reversed byte order on this 1041 * platform */ 1042 word = CHARBITS - word - 1; 1043 1044 return (unsigned int) word; 1045 1046# else 1047# error Unexpected byte order 1048# endif 1049 1050} 1051 1052#endif 1053#if defined(PERL_CORE) || defined(PERL_EXT) 1054 1055/* 1056=for apidoc variant_under_utf8_count 1057 1058This function looks at the sequence of bytes between C<s> and C<e>, which are 1059assumed to be encoded in ASCII/Latin1, and returns how many of them would 1060change should the string be translated into UTF-8. Due to the nature of UTF-8, 1061each of these would occupy two bytes instead of the single one in the input 1062string. Thus, this function returns the precise number of bytes the string 1063would expand by when translated to UTF-8. 1064 1065Unlike most of the other functions that have C<utf8> in their name, the input 1066to this function is NOT a UTF-8-encoded string. The function name is slightly 1067I<odd> to emphasize this. 1068 1069This function is internal to Perl because khw thinks that any XS code that 1070would want this is probably operating too close to the internals. Presenting a 1071valid use case could change that. 1072 1073See also 1074C<L<perlapi/is_utf8_invariant_string>> 1075and 1076C<L<perlapi/is_utf8_invariant_string_loc>>, 1077 1078=cut 1079 1080*/ 1081 1082PERL_STATIC_INLINE Size_t 1083S_variant_under_utf8_count(const U8* const s, const U8* const e) 1084{ 1085 const U8* x = s; 1086 Size_t count = 0; 1087 1088 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; 1089 1090# ifndef EBCDIC 1091 1092 /* Test if the string is long enough to use word-at-a-time. (Logic is the 1093 * same as for is_utf8_invariant_string()) */ 1094 if ((STRLEN) (e - x) >= PERL_WORDSIZE 1095 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 1096 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 1097 { 1098 1099 /* Process per-byte until reach word boundary. XXX This loop could be 1100 * eliminated if we knew that this platform had fast unaligned reads */ 1101 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 1102 count += ! UTF8_IS_INVARIANT(*x++); 1103 } 1104 1105 /* Process per-word as long as we have at least a full word left */ 1106 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an 1107 explanation of how this works */ 1108 PERL_UINTMAX_T increment 1109 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) 1110 * PERL_COUNT_MULTIPLIER) 1111 >> ((PERL_WORDSIZE - 1) * CHARBITS); 1112 count += (Size_t) increment; 1113 x += PERL_WORDSIZE; 1114 } while (x + PERL_WORDSIZE <= e); 1115 } 1116 1117# endif 1118 1119 /* Process per-byte */ 1120 while (x < e) { 1121 if (! UTF8_IS_INVARIANT(*x)) { 1122 count++; 1123 } 1124 1125 x++; 1126 } 1127 1128 return count; 1129} 1130 1131#endif 1132 1133 /* Keep these around for these files */ 1134#if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C) 1135# undef PERL_WORDSIZE 1136# undef PERL_COUNT_MULTIPLIER 1137# undef PERL_WORD_BOUNDARY_MASK 1138# undef PERL_VARIANTS_WORD_MASK 1139#endif 1140 1141/* 1142=for apidoc is_utf8_string 1143 1144Returns TRUE if the first C<len> bytes of string C<s> form a valid 1145Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will 1146be calculated using C<strlen(s)> (which means if you use this option, that C<s> 1147can't have embedded C<NUL> characters and has to have a terminating C<NUL> 1148byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 1149 1150This function considers Perl's extended UTF-8 to be valid. That means that 1151code points above Unicode, surrogates, and non-character code points are 1152considered valid by this function. Use C<L</is_strict_utf8_string>>, 1153C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what 1154code points are considered valid. 1155 1156See also 1157C<L</is_utf8_invariant_string>>, 1158C<L</is_utf8_invariant_string_loc>>, 1159C<L</is_utf8_string_loc>>, 1160C<L</is_utf8_string_loclen>>, 1161C<L</is_utf8_fixed_width_buf_flags>>, 1162C<L</is_utf8_fixed_width_buf_loc_flags>>, 1163C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1164 1165=cut 1166*/ 1167 1168#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL) 1169 1170#if defined(PERL_CORE) || defined (PERL_EXT) 1171 1172/* 1173=for apidoc is_utf8_non_invariant_string 1174 1175Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first 1176C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended 1177UTF-8; otherwise returns FALSE. 1178 1179A TRUE return means that at least one code point represented by the sequence 1180either is a wide character not representable as a single byte, or the 1181representation differs depending on whether the sequence is encoded in UTF-8 or 1182not. 1183 1184See also 1185C<L<perlapi/is_utf8_invariant_string>>, 1186C<L<perlapi/is_utf8_string>> 1187 1188=cut 1189 1190This is commonly used to determine if a SV's UTF-8 flag should be turned on. 1191It generally needn't be if its string is entirely UTF-8 invariant, and it 1192shouldn't be if it otherwise contains invalid UTF-8. 1193 1194It is an internal function because khw thinks that XS code shouldn't be working 1195at this low a level. A valid use case could change that. 1196 1197*/ 1198 1199PERL_STATIC_INLINE bool 1200Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len) 1201{ 1202 const U8 * first_variant; 1203 1204 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING; 1205 1206 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1207 return FALSE; 1208 } 1209 1210 return is_utf8_string(first_variant, len - (first_variant - s)); 1211} 1212 1213#endif 1214 1215/* 1216=for apidoc is_strict_utf8_string 1217 1218Returns TRUE if the first C<len> bytes of string C<s> form a valid 1219UTF-8-encoded string that is fully interchangeable by any application using 1220Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be 1221calculated using C<strlen(s)> (which means if you use this option, that C<s> 1222can't have embedded C<NUL> characters and has to have a terminating C<NUL> 1223byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 1224 1225This function returns FALSE for strings containing any 1226code points above the Unicode max of 0x10FFFF, surrogate code points, or 1227non-character code points. 1228 1229See also 1230C<L</is_utf8_invariant_string>>, 1231C<L</is_utf8_invariant_string_loc>>, 1232C<L</is_utf8_string>>, 1233C<L</is_utf8_string_flags>>, 1234C<L</is_utf8_string_loc>>, 1235C<L</is_utf8_string_loc_flags>>, 1236C<L</is_utf8_string_loclen>>, 1237C<L</is_utf8_string_loclen_flags>>, 1238C<L</is_utf8_fixed_width_buf_flags>>, 1239C<L</is_utf8_fixed_width_buf_loc_flags>>, 1240C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1241C<L</is_strict_utf8_string_loc>>, 1242C<L</is_strict_utf8_string_loclen>>, 1243C<L</is_c9strict_utf8_string>>, 1244C<L</is_c9strict_utf8_string_loc>>, 1245and 1246C<L</is_c9strict_utf8_string_loclen>>. 1247 1248=cut 1249*/ 1250 1251#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL) 1252 1253/* 1254=for apidoc is_c9strict_utf8_string 1255 1256Returns TRUE if the first C<len> bytes of string C<s> form a valid 1257UTF-8-encoded string that conforms to 1258L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>; 1259otherwise it returns FALSE. If C<len> is 0, it will be calculated using 1260C<strlen(s)> (which means if you use this option, that C<s> can't have embedded 1261C<NUL> characters and has to have a terminating C<NUL> byte). Note that all 1262characters being ASCII constitute 'a valid UTF-8 string'. 1263 1264This function returns FALSE for strings containing any code points above the 1265Unicode max of 0x10FFFF or surrogate code points, but accepts non-character 1266code points per 1267L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 1268 1269See also 1270C<L</is_utf8_invariant_string>>, 1271C<L</is_utf8_invariant_string_loc>>, 1272C<L</is_utf8_string>>, 1273C<L</is_utf8_string_flags>>, 1274C<L</is_utf8_string_loc>>, 1275C<L</is_utf8_string_loc_flags>>, 1276C<L</is_utf8_string_loclen>>, 1277C<L</is_utf8_string_loclen_flags>>, 1278C<L</is_utf8_fixed_width_buf_flags>>, 1279C<L</is_utf8_fixed_width_buf_loc_flags>>, 1280C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1281C<L</is_strict_utf8_string>>, 1282C<L</is_strict_utf8_string_loc>>, 1283C<L</is_strict_utf8_string_loclen>>, 1284C<L</is_c9strict_utf8_string_loc>>, 1285and 1286C<L</is_c9strict_utf8_string_loclen>>. 1287 1288=cut 1289*/ 1290 1291#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0) 1292 1293/* 1294=for apidoc is_utf8_string_flags 1295 1296Returns TRUE if the first C<len> bytes of string C<s> form a valid 1297UTF-8 string, subject to the restrictions imposed by C<flags>; 1298returns FALSE otherwise. If C<len> is 0, it will be calculated 1299using C<strlen(s)> (which means if you use this option, that C<s> can't have 1300embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note 1301that all characters being ASCII constitute 'a valid UTF-8 string'. 1302 1303If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if 1304C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 1305as C<L</is_strict_utf8_string>>; and if C<flags> is 1306C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as 1307C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any 1308combination of the C<UTF8_DISALLOW_I<foo>> flags understood by 1309C<L</utf8n_to_uvchr>>, with the same meanings. 1310 1311See also 1312C<L</is_utf8_invariant_string>>, 1313C<L</is_utf8_invariant_string_loc>>, 1314C<L</is_utf8_string>>, 1315C<L</is_utf8_string_loc>>, 1316C<L</is_utf8_string_loc_flags>>, 1317C<L</is_utf8_string_loclen>>, 1318C<L</is_utf8_string_loclen_flags>>, 1319C<L</is_utf8_fixed_width_buf_flags>>, 1320C<L</is_utf8_fixed_width_buf_loc_flags>>, 1321C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1322C<L</is_strict_utf8_string>>, 1323C<L</is_strict_utf8_string_loc>>, 1324C<L</is_strict_utf8_string_loclen>>, 1325C<L</is_c9strict_utf8_string>>, 1326C<L</is_c9strict_utf8_string_loc>>, 1327and 1328C<L</is_c9strict_utf8_string_loclen>>. 1329 1330=cut 1331*/ 1332 1333PERL_STATIC_INLINE bool 1334Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) 1335{ 1336 const U8 * first_variant; 1337 1338 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; 1339 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1340 |UTF8_DISALLOW_PERL_EXTENDED))); 1341 1342 if (len == 0) { 1343 len = strlen((const char *)s); 1344 } 1345 1346 if (flags == 0) { 1347 return is_utf8_string(s, len); 1348 } 1349 1350 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1351 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 1352 { 1353 return is_strict_utf8_string(s, len); 1354 } 1355 1356 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1357 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 1358 { 1359 return is_c9strict_utf8_string(s, len); 1360 } 1361 1362 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { 1363 const U8* const send = s + len; 1364 const U8* x = first_variant; 1365 1366 while (x < send) { 1367 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 1368 if (UNLIKELY(! cur_len)) { 1369 return FALSE; 1370 } 1371 x += cur_len; 1372 } 1373 } 1374 1375 return TRUE; 1376} 1377 1378/* 1379 1380=for apidoc is_utf8_string_loc 1381 1382Like C<L</is_utf8_string>> but stores the location of the failure (in the 1383case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1384"utf8ness success") in the C<ep> pointer. 1385 1386See also C<L</is_utf8_string_loclen>>. 1387 1388=cut 1389*/ 1390 1391#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) 1392 1393/* 1394 1395=for apidoc is_utf8_string_loclen 1396 1397Like C<L</is_utf8_string>> but stores the location of the failure (in the 1398case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1399"utf8ness success") in the C<ep> pointer, and the number of UTF-8 1400encoded characters in the C<el> pointer. 1401 1402See also C<L</is_utf8_string_loc>>. 1403 1404=cut 1405*/ 1406 1407PERL_STATIC_INLINE bool 1408Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1409{ 1410 const U8 * first_variant; 1411 1412 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; 1413 1414 if (len == 0) { 1415 len = strlen((const char *) s); 1416 } 1417 1418 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1419 if (el) 1420 *el = len; 1421 1422 if (ep) { 1423 *ep = s + len; 1424 } 1425 1426 return TRUE; 1427 } 1428 1429 { 1430 const U8* const send = s + len; 1431 const U8* x = first_variant; 1432 STRLEN outlen = first_variant - s; 1433 1434 while (x < send) { 1435 const STRLEN cur_len = isUTF8_CHAR(x, send); 1436 if (UNLIKELY(! cur_len)) { 1437 break; 1438 } 1439 x += cur_len; 1440 outlen++; 1441 } 1442 1443 if (el) 1444 *el = outlen; 1445 1446 if (ep) { 1447 *ep = x; 1448 } 1449 1450 return (x == send); 1451 } 1452} 1453 1454/* The perl core arranges to never call the DFA below without there being at 1455 * least one byte available to look at. This allows the DFA to use a do {} 1456 * while loop which means that calling it with a UTF-8 invariant has a single 1457 * conditional, same as the calling code checking for invariance ahead of time. 1458 * And having the calling code remove that conditional speeds up by that 1459 * conditional, the case where it wasn't invariant. So there's no reason to 1460 * check before caling this. 1461 * 1462 * But we don't know this for non-core calls, so have to retain the check for 1463 * them. */ 1464#ifdef PERL_CORE 1465# define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s)) 1466#else 1467# define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE 1468#endif 1469 1470/* 1471 * DFA for checking input is valid UTF-8 syntax. 1472 * 1473 * This uses adaptations of the table and algorithm given in 1474 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1475 * documentation of the original version. A copyright notice for the original 1476 * version is given at the beginning of this file. The Perl adaptations are 1477 * documented at the definition of PL_extended_utf8_dfa_tab[]. 1478 * 1479 * This dfa is fast. There are three exit conditions: 1480 * 1) a well-formed code point, acceptable to the table 1481 * 2) the beginning bytes of an incomplete character, whose completion might 1482 * or might not be acceptable 1483 * 3) unacceptable to the table. Some of the adaptations have certain, 1484 * hopefully less likely to occur, legal inputs be unacceptable to the 1485 * table, so these must be sorted out afterwards. 1486 * 1487 * This macro is a complete implementation of the code executing the DFA. It 1488 * is passed the input sequence bounds and the table to use, and what to do 1489 * for each of the exit conditions. There are three canned actions, likely to 1490 * be the ones you want: 1491 * DFA_RETURN_SUCCESS_ 1492 * DFA_RETURN_FAILURE_ 1493 * DFA_GOTO_TEASE_APART_FF_ 1494 * 1495 * You pass a parameter giving the action to take for each of the three 1496 * possible exit conditions: 1497 * 1498 * 'accept_action' This is executed when the DFA accepts the input. 1499 * DFA_RETURN_SUCCESS_ is the most likely candidate. 1500 * 'reject_action' This is executed when the DFA rejects the input. 1501 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where 1502 * you have written code to distinguish the rejecting state 1503 * results. Because it happens in several places, and 1504 * involves #ifdefs, the special action 1505 * DFA_GOTO_TEASE_APART_FF_ is what you want with 1506 * PL_extended_utf8_dfa_tab. On platforms without 1507 * EXTRA_LONG_UTF8, there is no need to tease anything apart, 1508 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you 1509 * need to have a label 'tease_apart_FF' that it will transfer 1510 * to. 1511 * 'incomplete_char_action' This is executed when the DFA ran off the end 1512 * before accepting or rejecting the input. 1513 * DFA_RETURN_FAILURE_ is the likely action, but you could 1514 * have a 'goto', or NOOP. In the latter case the DFA drops 1515 * off the end, and you place your code to handle this case 1516 * immediately after it. 1517 */ 1518 1519#define DFA_RETURN_SUCCESS_ return s - s0 1520#define DFA_RETURN_FAILURE_ return 0 1521#ifdef HAS_EXTRA_LONG_UTF8 1522# define DFA_TEASE_APART_FF_ goto tease_apart_FF 1523#else 1524# define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_ 1525#endif 1526 1527#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \ 1528 accept_action, \ 1529 reject_action, \ 1530 incomplete_char_action) \ 1531 STMT_START { \ 1532 const U8 * s = s0; \ 1533 const U8 * e_ = e; \ 1534 UV state = 0; \ 1535 \ 1536 PERL_NON_CORE_CHECK_EMPTY(s, e_); \ 1537 \ 1538 do { \ 1539 state = dfa_tab[256 + state + dfa_tab[*s]]; \ 1540 s++; \ 1541 \ 1542 if (state == 0) { /* Accepting state */ \ 1543 accept_action; \ 1544 } \ 1545 \ 1546 if (UNLIKELY(state == 1)) { /* Rejecting state */ \ 1547 reject_action; \ 1548 } \ 1549 } while (s < e_); \ 1550 \ 1551 /* Here, dropped out of loop before end-of-char */ \ 1552 incomplete_char_action; \ 1553 } STMT_END 1554 1555 1556/* 1557 1558=for apidoc isUTF8_CHAR 1559 1560Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1561looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 1562that represents some code point; otherwise it evaluates to 0. If non-zero, the 1563value gives how many bytes starting at C<s> comprise the code point's 1564representation. Any bytes remaining before C<e>, but beyond the ones needed to 1565form the first code point in C<s>, are not examined. 1566 1567The code point can be any that will fit in an IV on this machine, using Perl's 1568extension to official UTF-8 to represent those higher than the Unicode maximum 1569of 0x10FFFF. That means that this macro is used to efficiently decide if the 1570next few bytes in C<s> is legal UTF-8 for a single character. 1571 1572Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those 1573defined by Unicode to be fully interchangeable across applications; 1574C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1575#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1576code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1577 1578Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and 1579C<L</is_utf8_string_loclen>> to check entire strings. 1580 1581Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC 1582machines) is a valid UTF-8 character. 1583 1584=cut 1585 1586This uses an adaptation of the table and algorithm given in 1587https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1588documentation of the original version. A copyright notice for the original 1589version is given at the beginning of this file. The Perl adaptation is 1590documented at the definition of PL_extended_utf8_dfa_tab[]. 1591*/ 1592 1593PERL_STATIC_INLINE Size_t 1594Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) 1595{ 1596 PERL_ARGS_ASSERT_ISUTF8_CHAR; 1597 1598 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 1599 DFA_RETURN_SUCCESS_, 1600 DFA_TEASE_APART_FF_, 1601 DFA_RETURN_FAILURE_); 1602 1603 /* Here, we didn't return success, but dropped out of the loop. In the 1604 * case of PL_extended_utf8_dfa_tab, this means the input is either 1605 * malformed, or the start byte was FF on a platform that the dfa doesn't 1606 * handle FF's. Call a helper function. */ 1607 1608#ifdef HAS_EXTRA_LONG_UTF8 1609 1610 tease_apart_FF: 1611 1612 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 1613 * either malformed, or was for the largest possible start byte, which we 1614 * now check, not inline */ 1615 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) { 1616 return 0; 1617 } 1618 1619 return is_utf8_FF_helper_(s0, e, 1620 FALSE /* require full, not partial char */ 1621 ); 1622#endif 1623 1624} 1625 1626/* 1627 1628=for apidoc isSTRICT_UTF8_CHAR 1629 1630Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1631looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1632Unicode code point completely acceptable for open interchange between all 1633applications; otherwise it evaluates to 0. If non-zero, the value gives how 1634many bytes starting at C<s> comprise the code point's representation. Any 1635bytes remaining before C<e>, but beyond the ones needed to form the first code 1636point in C<s>, are not examined. 1637 1638The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not 1639be a surrogate nor a non-character code point. Thus this excludes any code 1640point from Perl's extended UTF-8. 1641 1642This is used to efficiently decide if the next few bytes in C<s> is 1643legal Unicode-acceptable UTF-8 for a single character. 1644 1645Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1646#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1647code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; 1648and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1649 1650Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and 1651C<L</is_strict_utf8_string_loclen>> to check entire strings. 1652 1653=cut 1654 1655This uses an adaptation of the tables and algorithm given in 1656https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1657documentation of the original version. A copyright notice for the original 1658version is given at the beginning of this file. The Perl adaptation is 1659documented at the definition of strict_extended_utf8_dfa_tab[]. 1660 1661*/ 1662 1663PERL_STATIC_INLINE Size_t 1664Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1665{ 1666 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; 1667 1668 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, 1669 DFA_RETURN_SUCCESS_, 1670 goto check_hanguls, 1671 DFA_RETURN_FAILURE_); 1672 check_hanguls: 1673 1674 /* Here, we didn't return success, but dropped out of the loop. In the 1675 * case of PL_strict_utf8_dfa_tab, this means the input is either 1676 * malformed, or was for certain Hanguls; handle them specially */ 1677 1678 /* The dfa above drops out for incomplete or illegal inputs, and certain 1679 * legal Hanguls; check and return accordingly */ 1680 return is_HANGUL_ED_utf8_safe(s0, e); 1681} 1682 1683/* 1684 1685=for apidoc isC9_STRICT_UTF8_CHAR 1686 1687Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1688looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1689Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, 1690the value gives how many bytes starting at C<s> comprise the code point's 1691representation. Any bytes remaining before C<e>, but beyond the ones needed to 1692form the first code point in C<s>, are not examined. 1693 1694The largest acceptable code point is the Unicode maximum 0x10FFFF. This 1695differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character 1696code points. This corresponds to 1697L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 1698which said that non-character code points are merely discouraged rather than 1699completely forbidden in open interchange. See 1700L<perlunicode/Noncharacter code points>. 1701 1702Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and 1703C<L</isUTF8_CHAR_flags>> for a more customized definition. 1704 1705Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and 1706C<L</is_c9strict_utf8_string_loclen>> to check entire strings. 1707 1708=cut 1709 1710This uses an adaptation of the tables and algorithm given in 1711https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1712documentation of the original version. A copyright notice for the original 1713version is given at the beginning of this file. The Perl adaptation is 1714documented at the definition of PL_c9_utf8_dfa_tab[]. 1715 1716*/ 1717 1718PERL_STATIC_INLINE Size_t 1719Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1720{ 1721 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; 1722 1723 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab, 1724 DFA_RETURN_SUCCESS_, 1725 DFA_RETURN_FAILURE_, 1726 DFA_RETURN_FAILURE_); 1727} 1728 1729/* 1730 1731=for apidoc is_strict_utf8_string_loc 1732 1733Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1734case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1735"utf8ness success") in the C<ep> pointer. 1736 1737See also C<L</is_strict_utf8_string_loclen>>. 1738 1739=cut 1740*/ 1741 1742#define is_strict_utf8_string_loc(s, len, ep) \ 1743 is_strict_utf8_string_loclen(s, len, ep, 0) 1744 1745/* 1746 1747=for apidoc is_strict_utf8_string_loclen 1748 1749Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1750case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1751"utf8ness success") in the C<ep> pointer, and the number of UTF-8 1752encoded characters in the C<el> pointer. 1753 1754See also C<L</is_strict_utf8_string_loc>>. 1755 1756=cut 1757*/ 1758 1759PERL_STATIC_INLINE bool 1760Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1761{ 1762 const U8 * first_variant; 1763 1764 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; 1765 1766 if (len == 0) { 1767 len = strlen((const char *) s); 1768 } 1769 1770 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1771 if (el) 1772 *el = len; 1773 1774 if (ep) { 1775 *ep = s + len; 1776 } 1777 1778 return TRUE; 1779 } 1780 1781 { 1782 const U8* const send = s + len; 1783 const U8* x = first_variant; 1784 STRLEN outlen = first_variant - s; 1785 1786 while (x < send) { 1787 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); 1788 if (UNLIKELY(! cur_len)) { 1789 break; 1790 } 1791 x += cur_len; 1792 outlen++; 1793 } 1794 1795 if (el) 1796 *el = outlen; 1797 1798 if (ep) { 1799 *ep = x; 1800 } 1801 1802 return (x == send); 1803 } 1804} 1805 1806/* 1807 1808=for apidoc is_c9strict_utf8_string_loc 1809 1810Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1811the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1812"utf8ness success") in the C<ep> pointer. 1813 1814See also C<L</is_c9strict_utf8_string_loclen>>. 1815 1816=cut 1817*/ 1818 1819#define is_c9strict_utf8_string_loc(s, len, ep) \ 1820 is_c9strict_utf8_string_loclen(s, len, ep, 0) 1821 1822/* 1823 1824=for apidoc is_c9strict_utf8_string_loclen 1825 1826Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1827the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1828"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded 1829characters in the C<el> pointer. 1830 1831See also C<L</is_c9strict_utf8_string_loc>>. 1832 1833=cut 1834*/ 1835 1836PERL_STATIC_INLINE bool 1837Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1838{ 1839 const U8 * first_variant; 1840 1841 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; 1842 1843 if (len == 0) { 1844 len = strlen((const char *) s); 1845 } 1846 1847 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1848 if (el) 1849 *el = len; 1850 1851 if (ep) { 1852 *ep = s + len; 1853 } 1854 1855 return TRUE; 1856 } 1857 1858 { 1859 const U8* const send = s + len; 1860 const U8* x = first_variant; 1861 STRLEN outlen = first_variant - s; 1862 1863 while (x < send) { 1864 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); 1865 if (UNLIKELY(! cur_len)) { 1866 break; 1867 } 1868 x += cur_len; 1869 outlen++; 1870 } 1871 1872 if (el) 1873 *el = outlen; 1874 1875 if (ep) { 1876 *ep = x; 1877 } 1878 1879 return (x == send); 1880 } 1881} 1882 1883/* 1884 1885=for apidoc is_utf8_string_loc_flags 1886 1887Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1888case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1889"utf8ness success") in the C<ep> pointer. 1890 1891See also C<L</is_utf8_string_loclen_flags>>. 1892 1893=cut 1894*/ 1895 1896#define is_utf8_string_loc_flags(s, len, ep, flags) \ 1897 is_utf8_string_loclen_flags(s, len, ep, 0, flags) 1898 1899 1900/* The above 3 actual functions could have been moved into the more general one 1901 * just below, and made #defines that call it with the right 'flags'. They are 1902 * currently kept separate to increase their chances of getting inlined */ 1903 1904/* 1905 1906=for apidoc is_utf8_string_loclen_flags 1907 1908Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1909case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1910"utf8ness success") in the C<ep> pointer, and the number of UTF-8 1911encoded characters in the C<el> pointer. 1912 1913See also C<L</is_utf8_string_loc_flags>>. 1914 1915=cut 1916*/ 1917 1918PERL_STATIC_INLINE bool 1919Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) 1920{ 1921 const U8 * first_variant; 1922 1923 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; 1924 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1925 |UTF8_DISALLOW_PERL_EXTENDED))); 1926 1927 if (len == 0) { 1928 len = strlen((const char *) s); 1929 } 1930 1931 if (flags == 0) { 1932 return is_utf8_string_loclen(s, len, ep, el); 1933 } 1934 1935 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1936 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 1937 { 1938 return is_strict_utf8_string_loclen(s, len, ep, el); 1939 } 1940 1941 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1942 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 1943 { 1944 return is_c9strict_utf8_string_loclen(s, len, ep, el); 1945 } 1946 1947 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1948 if (el) 1949 *el = len; 1950 1951 if (ep) { 1952 *ep = s + len; 1953 } 1954 1955 return TRUE; 1956 } 1957 1958 { 1959 const U8* send = s + len; 1960 const U8* x = first_variant; 1961 STRLEN outlen = first_variant - s; 1962 1963 while (x < send) { 1964 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 1965 if (UNLIKELY(! cur_len)) { 1966 break; 1967 } 1968 x += cur_len; 1969 outlen++; 1970 } 1971 1972 if (el) 1973 *el = outlen; 1974 1975 if (ep) { 1976 *ep = x; 1977 } 1978 1979 return (x == send); 1980 } 1981} 1982 1983/* 1984=for apidoc utf8_distance 1985 1986Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 1987and C<b>. 1988 1989WARNING: use only if you *know* that the pointers point inside the 1990same UTF-8 buffer. 1991 1992=cut 1993*/ 1994 1995PERL_STATIC_INLINE IV 1996Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 1997{ 1998 PERL_ARGS_ASSERT_UTF8_DISTANCE; 1999 2000 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 2001} 2002 2003/* 2004=for apidoc utf8_hop 2005 2006Return the UTF-8 pointer C<s> displaced by C<off> characters, either 2007forward (if C<off> is positive) or backward (if negative). C<s> does not need 2008to be pointing to the starting byte of a character. If it isn't, one count of 2009C<off> will be used up to get to the start of the next character for forward 2010hops, and to the start of the current character for negative ones. 2011 2012WARNING: Prefer L</utf8_hop_safe> to this one. 2013 2014Do NOT use this function unless you B<know> C<off> is within 2015the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned 2016on the first byte of a character or just after the last byte of a character. 2017 2018=cut 2019*/ 2020 2021PERL_STATIC_INLINE U8 * 2022Perl_utf8_hop(const U8 *s, SSize_t off) 2023{ 2024 PERL_ARGS_ASSERT_UTF8_HOP; 2025 2026 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2027 * the XXX bitops (especially ~) can create illegal UTF-8. 2028 * In other words: in Perl UTF-8 is not just for Unicode. */ 2029 2030 if (off > 0) { 2031 2032 /* Get to next non-continuation byte */ 2033 if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) { 2034 do { 2035 s++; 2036 } 2037 while (UTF8_IS_CONTINUATION(*s)); 2038 off--; 2039 } 2040 2041 while (off--) 2042 s += UTF8SKIP(s); 2043 } 2044 else { 2045 while (off++) { 2046 s--; 2047 while (UTF8_IS_CONTINUATION(*s)) 2048 s--; 2049 } 2050 } 2051 2052 GCC_DIAG_IGNORE(-Wcast-qual) 2053 return (U8 *)s; 2054 GCC_DIAG_RESTORE 2055} 2056 2057/* 2058=for apidoc utf8_hop_forward 2059 2060Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2061forward. C<s> does not need to be pointing to the starting byte of a 2062character. If it isn't, one count of C<off> will be used up to get to the 2063start of the next character. 2064 2065C<off> must be non-negative. 2066 2067C<s> must be before or equal to C<end>. 2068 2069When moving forward it will not move beyond C<end>. 2070 2071Will not exceed this limit even if the string is not valid "UTF-8". 2072 2073=cut 2074*/ 2075 2076PERL_STATIC_INLINE U8 * 2077Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) 2078{ 2079 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD; 2080 2081 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2082 * the bitops (especially ~) can create illegal UTF-8. 2083 * In other words: in Perl UTF-8 is not just for Unicode. */ 2084 2085 assert(s <= end); 2086 assert(off >= 0); 2087 2088 if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) { 2089 /* Get to next non-continuation byte */ 2090 do { 2091 s++; 2092 } 2093 while (UTF8_IS_CONTINUATION(*s)); 2094 off--; 2095 } 2096 2097 while (off--) { 2098 STRLEN skip = UTF8SKIP(s); 2099 if ((STRLEN)(end - s) <= skip) { 2100 GCC_DIAG_IGNORE(-Wcast-qual) 2101 return (U8 *)end; 2102 GCC_DIAG_RESTORE 2103 } 2104 s += skip; 2105 } 2106 2107 GCC_DIAG_IGNORE(-Wcast-qual) 2108 return (U8 *)s; 2109 GCC_DIAG_RESTORE 2110} 2111 2112/* 2113=for apidoc utf8_hop_back 2114 2115Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2116backward. C<s> does not need to be pointing to the starting byte of a 2117character. If it isn't, one count of C<off> will be used up to get to that 2118start. 2119 2120C<off> must be non-positive. 2121 2122C<s> must be after or equal to C<start>. 2123 2124When moving backward it will not move before C<start>. 2125 2126Will not exceed this limit even if the string is not valid "UTF-8". 2127 2128=cut 2129*/ 2130 2131PERL_STATIC_INLINE U8 * 2132Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) 2133{ 2134 PERL_ARGS_ASSERT_UTF8_HOP_BACK; 2135 2136 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2137 * the bitops (especially ~) can create illegal UTF-8. 2138 * In other words: in Perl UTF-8 is not just for Unicode. */ 2139 2140 assert(start <= s); 2141 assert(off <= 0); 2142 2143 /* Note: if we know that the input is well-formed, we can do per-word 2144 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented 2145 * that. But it was reverted because doing per-word has some 2146 * start-up/tear-down overhead, so only makes sense if the distance to be 2147 * moved is large, and core perl doesn't currently move more than a few 2148 * characters at a time. You can reinstate it if it does become 2149 * advantageous. */ 2150 while (off++ && s > start) { 2151 do { 2152 s--; 2153 } while (UTF8_IS_CONTINUATION(*s) && s > start); 2154 } 2155 2156 GCC_DIAG_IGNORE(-Wcast-qual) 2157 return (U8 *)s; 2158 GCC_DIAG_RESTORE 2159} 2160 2161/* 2162=for apidoc utf8_hop_safe 2163 2164Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2165either forward or backward. C<s> does not need to be pointing to the starting 2166byte of a character. If it isn't, one count of C<off> will be used up to get 2167to the start of the next character for forward hops, and to the start of the 2168current character for negative ones. 2169 2170When moving backward it will not move before C<start>. 2171 2172When moving forward it will not move beyond C<end>. 2173 2174Will not exceed those limits even if the string is not valid "UTF-8". 2175 2176=cut 2177*/ 2178 2179PERL_STATIC_INLINE U8 * 2180Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) 2181{ 2182 PERL_ARGS_ASSERT_UTF8_HOP_SAFE; 2183 2184 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2185 * the bitops (especially ~) can create illegal UTF-8. 2186 * In other words: in Perl UTF-8 is not just for Unicode. */ 2187 2188 assert(start <= s && s <= end); 2189 2190 if (off >= 0) { 2191 return utf8_hop_forward(s, off, end); 2192 } 2193 else { 2194 return utf8_hop_back(s, off, start); 2195 } 2196} 2197 2198/* 2199 2200=for apidoc isUTF8_CHAR_flags 2201 2202Evaluates to non-zero if the first few bytes of the string starting at C<s> and 2203looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 2204that represents some code point, subject to the restrictions given by C<flags>; 2205otherwise it evaluates to 0. If non-zero, the value gives how many bytes 2206starting at C<s> comprise the code point's representation. Any bytes remaining 2207before C<e>, but beyond the ones needed to form the first code point in C<s>, 2208are not examined. 2209 2210If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>; 2211if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 2212as C<L</isSTRICT_UTF8_CHAR>>; 2213and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives 2214the same results as C<L</isC9_STRICT_UTF8_CHAR>>. 2215Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags 2216understood by C<L</utf8n_to_uvchr>>, with the same meanings. 2217 2218The three alternative macros are for the most commonly needed validations; they 2219are likely to run somewhat faster than this more general one, as they can be 2220inlined into your code. 2221 2222Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and 2223L</is_utf8_string_loclen_flags> to check entire strings. 2224 2225=cut 2226*/ 2227 2228PERL_STATIC_INLINE STRLEN 2229Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags) 2230{ 2231 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS; 2232 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2233 |UTF8_DISALLOW_PERL_EXTENDED))); 2234 2235 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 2236 goto check_success, 2237 DFA_TEASE_APART_FF_, 2238 DFA_RETURN_FAILURE_); 2239 2240 check_success: 2241 2242 return is_utf8_char_helper_(s0, e, flags); 2243 2244#ifdef HAS_EXTRA_LONG_UTF8 2245 2246 tease_apart_FF: 2247 2248 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 2249 * either malformed, or was for the largest possible start byte, which 2250 * indicates perl extended UTF-8, well above the Unicode maximum */ 2251 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 2252 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 2253 { 2254 return 0; 2255 } 2256 2257 /* Otherwise examine the sequence not inline */ 2258 return is_utf8_FF_helper_(s0, e, 2259 FALSE /* require full, not partial char */ 2260 ); 2261#endif 2262 2263} 2264 2265/* 2266 2267=for apidoc is_utf8_valid_partial_char 2268 2269Returns 0 if the sequence of bytes starting at C<s> and looking no further than 2270S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code 2271points. Otherwise, it returns 1 if there exists at least one non-empty 2272sequence of bytes that when appended to sequence C<s>, starting at position 2273C<e> causes the entire sequence to be the well-formed UTF-8 of some code point; 2274otherwise returns 0. 2275 2276In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code 2277point. 2278 2279This is useful when a fixed-length buffer is being tested for being well-formed 2280UTF-8, but the final few bytes in it don't comprise a full character; that is, 2281it is split somewhere in the middle of the final code point's UTF-8 2282representation. (Presumably when the buffer is refreshed with the next chunk 2283of data, the new first bytes will complete the partial code point.) This 2284function is used to verify that the final bytes in the current buffer are in 2285fact the legal beginning of some code point, so that if they aren't, the 2286failure can be signalled without having to wait for the next read. 2287 2288=cut 2289*/ 2290#define is_utf8_valid_partial_char(s, e) \ 2291 is_utf8_valid_partial_char_flags(s, e, 0) 2292 2293/* 2294 2295=for apidoc is_utf8_valid_partial_char_flags 2296 2297Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether 2298or not the input is a valid UTF-8 encoded partial character, but it takes an 2299extra parameter, C<flags>, which can further restrict which code points are 2300considered valid. 2301 2302If C<flags> is 0, this behaves identically to 2303C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination 2304of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If 2305there is any sequence of bytes that can complete the input partial character in 2306such a way that a non-prohibited character is formed, the function returns 2307TRUE; otherwise FALSE. Non character code points cannot be determined based on 2308partial character input. But many of the other possible excluded types can be 2309determined from just the first one or two bytes. 2310 2311=cut 2312 */ 2313 2314PERL_STATIC_INLINE bool 2315Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags) 2316{ 2317 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; 2318 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2319 |UTF8_DISALLOW_PERL_EXTENDED))); 2320 2321 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 2322 DFA_RETURN_FAILURE_, 2323 DFA_TEASE_APART_FF_, 2324 NOOP); 2325 2326 /* The NOOP above causes the DFA to drop down here iff the input was a 2327 * partial character. flags=0 => can return TRUE immediately; otherwise we 2328 * need to check (not inline) if the partial character is the beginning of 2329 * a disallowed one */ 2330 if (flags == 0) { 2331 return TRUE; 2332 } 2333 2334 return cBOOL(is_utf8_char_helper_(s0, e, flags)); 2335 2336#ifdef HAS_EXTRA_LONG_UTF8 2337 2338 tease_apart_FF: 2339 2340 /* Getting here means the input is either malformed, or, in the case of 2341 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The 2342 * latter case has to be extended UTF-8, so can fail immediately if that is 2343 * forbidden */ 2344 2345 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 2346 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 2347 { 2348 return 0; 2349 } 2350 2351 return is_utf8_FF_helper_(s0, e, 2352 TRUE /* Require to be a partial character */ 2353 ); 2354#endif 2355 2356} 2357 2358/* 2359 2360=for apidoc is_utf8_fixed_width_buf_flags 2361 2362Returns TRUE if the fixed-width buffer starting at C<s> with length C<len> 2363is entirely valid UTF-8, subject to the restrictions given by C<flags>; 2364otherwise it returns FALSE. 2365 2366If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted 2367without restriction. If the final few bytes of the buffer do not form a 2368complete code point, this will return TRUE anyway, provided that 2369C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them. 2370 2371If C<flags> in non-zero, it can be any combination of the 2372C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the 2373same meanings. 2374 2375This function differs from C<L</is_utf8_string_flags>> only in that the latter 2376returns FALSE if the final few bytes of the string don't form a complete code 2377point. 2378 2379=cut 2380 */ 2381#define is_utf8_fixed_width_buf_flags(s, len, flags) \ 2382 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags) 2383 2384/* 2385 2386=for apidoc is_utf8_fixed_width_buf_loc_flags 2387 2388Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the 2389failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point 2390to the beginning of any partial character at the end of the buffer; if there is 2391no partial character C<*ep> will contain C<s>+C<len>. 2392 2393See also C<L</is_utf8_fixed_width_buf_loclen_flags>>. 2394 2395=cut 2396*/ 2397 2398#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \ 2399 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags) 2400 2401/* 2402 2403=for apidoc is_utf8_fixed_width_buf_loclen_flags 2404 2405Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of 2406complete, valid characters found in the C<el> pointer. 2407 2408=cut 2409*/ 2410 2411PERL_STATIC_INLINE bool 2412Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, 2413 STRLEN len, 2414 const U8 **ep, 2415 STRLEN *el, 2416 const U32 flags) 2417{ 2418 const U8 * maybe_partial; 2419 2420 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS; 2421 2422 if (! ep) { 2423 ep = &maybe_partial; 2424 } 2425 2426 /* If it's entirely valid, return that; otherwise see if the only error is 2427 * that the final few bytes are for a partial character */ 2428 return is_utf8_string_loclen_flags(s, len, ep, el, flags) 2429 || is_utf8_valid_partial_char_flags(*ep, s + len, flags); 2430} 2431 2432PERL_STATIC_INLINE UV 2433Perl_utf8n_to_uvchr_msgs(const U8 *s, 2434 STRLEN curlen, 2435 STRLEN *retlen, 2436 const U32 flags, 2437 U32 * errors, 2438 AV ** msgs) 2439{ 2440 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the 2441 * simple cases, and, if necessary calls a helper function to deal with the 2442 * more complex ones. Almost all well-formed non-problematic code points 2443 * are considered simple, so that it's unlikely that the helper function 2444 * will need to be called. 2445 * 2446 * This is an adaptation of the tables and algorithm given in 2447 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides 2448 * comprehensive documentation of the original version. A copyright notice 2449 * for the original version is given at the beginning of this file. The 2450 * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[]. 2451 */ 2452 2453 const U8 * const s0 = s; 2454 const U8 * send = s0 + curlen; 2455 UV type; 2456 UV uv; 2457 2458 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; 2459 2460 /* This dfa is fast. If it accepts the input, it was for a well-formed, 2461 * non-problematic code point, which can be returned immediately. 2462 * Otherwise we call a helper function to figure out the more complicated 2463 * cases. */ 2464 2465 /* No calls from core pass in an empty string; non-core need a check */ 2466#ifdef PERL_CORE 2467 assert(curlen > 0); 2468#else 2469 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen, 2470 flags, errors, msgs); 2471#endif 2472 2473 type = PL_strict_utf8_dfa_tab[*s]; 2474 2475 /* The table is structured so that 'type' is 0 iff the input byte is 2476 * represented identically regardless of the UTF-8ness of the string */ 2477 if (type == 0) { /* UTF-8 invariants are returned unchanged */ 2478 uv = *s; 2479 } 2480 else { 2481 UV state = PL_strict_utf8_dfa_tab[256 + type]; 2482 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s); 2483 2484 while (++s < send) { 2485 type = PL_strict_utf8_dfa_tab[*s]; 2486 state = PL_strict_utf8_dfa_tab[256 + state + type]; 2487 2488 uv = UTF8_ACCUMULATE(uv, *s); 2489 2490 if (state == 0) { 2491#ifdef EBCDIC 2492 uv = UNI_TO_NATIVE(uv); 2493#endif 2494 goto success; 2495 } 2496 2497 if (UNLIKELY(state == 1)) { 2498 break; 2499 } 2500 } 2501 2502 /* Here is potentially problematic. Use the full mechanism */ 2503 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, 2504 errors, msgs); 2505 } 2506 2507 success: 2508 if (retlen) { 2509 *retlen = s - s0 + 1; 2510 } 2511 if (errors) { 2512 *errors = 0; 2513 } 2514 if (msgs) { 2515 *msgs = NULL; 2516 } 2517 2518 return uv; 2519} 2520 2521PERL_STATIC_INLINE UV 2522Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 2523{ 2524 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; 2525 2526 assert(s < send); 2527 2528 if (! ckWARN_d(WARN_UTF8)) { 2529 2530 /* EMPTY is not really allowed, and asserts on debugging builds. But 2531 * on non-debugging we have to deal with it, and this causes it to 2532 * return the REPLACEMENT CHARACTER, as the documentation indicates */ 2533 return utf8n_to_uvchr(s, send - s, retlen, 2534 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); 2535 } 2536 else { 2537 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); 2538 if (retlen && ret == 0 && (send <= s || *s != '\0')) { 2539 *retlen = (STRLEN) -1; 2540 } 2541 2542 return ret; 2543 } 2544} 2545 2546/* ------------------------------- perl.h ----------------------------- */ 2547 2548/* 2549=for apidoc_section $utility 2550 2551=for apidoc is_safe_syscall 2552 2553Test that the given C<pv> (with length C<len>) doesn't contain any internal 2554C<NUL> characters. 2555If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls> 2556category, and return FALSE. 2557 2558Return TRUE if the name is safe. 2559 2560C<what> and C<op_name> are used in any warning. 2561 2562Used by the C<IS_SAFE_SYSCALL()> macro. 2563 2564=cut 2565*/ 2566 2567PERL_STATIC_INLINE bool 2568Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) 2569{ 2570 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs 2571 * perl itself uses xce*() functions which accept 8-bit strings. 2572 */ 2573 2574 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL; 2575 2576 if (len > 1) { 2577 char *null_at; 2578 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) { 2579 SETERRNO(ENOENT, LIB_INVARG); 2580 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS), 2581 "Invalid \\0 character in %s for %s: %s\\0%s", 2582 what, op_name, pv, null_at+1); 2583 return FALSE; 2584 } 2585 } 2586 2587 return TRUE; 2588} 2589 2590/* 2591 2592Return true if the supplied filename has a newline character 2593immediately before the first (hopefully only) NUL. 2594 2595My original look at this incorrectly used the len from SvPV(), but 2596that's incorrect, since we allow for a NUL in pv[len-1]. 2597 2598So instead, strlen() and work from there. 2599 2600This allow for the user reading a filename, forgetting to chomp it, 2601then calling: 2602 2603 open my $foo, "$file\0"; 2604 2605*/ 2606 2607#ifdef PERL_CORE 2608 2609PERL_STATIC_INLINE bool 2610S_should_warn_nl(const char *pv) 2611{ 2612 STRLEN len; 2613 2614 PERL_ARGS_ASSERT_SHOULD_WARN_NL; 2615 2616 len = strlen(pv); 2617 2618 return len > 0 && pv[len-1] == '\n'; 2619} 2620 2621#endif 2622 2623#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) 2624 2625PERL_STATIC_INLINE bool 2626S_lossless_NV_to_IV(const NV nv, IV *ivp) 2627{ 2628 /* This function determines if the input NV 'nv' may be converted without 2629 * loss of data to an IV. If not, it returns FALSE taking no other action. 2630 * But if it is possible, it does the conversion, returning TRUE, and 2631 * storing the converted result in '*ivp' */ 2632 2633 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; 2634 2635# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) 2636 /* Normally any comparison with a NaN returns false; if we can't rely 2637 * on that behaviour, check explicitly */ 2638 if (UNLIKELY(Perl_isnan(nv))) { 2639 return FALSE; 2640 } 2641# endif 2642 2643 /* Written this way so that with an always-false NaN comparison we 2644 * return false */ 2645 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) { 2646 return FALSE; 2647 } 2648 2649 if ((IV) nv != nv) { 2650 return FALSE; 2651 } 2652 2653 *ivp = (IV) nv; 2654 return TRUE; 2655} 2656 2657#endif 2658 2659/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ 2660 2661#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) 2662 2663#define MAX_CHARSET_NAME_LENGTH 2 2664 2665PERL_STATIC_INLINE const char * 2666S_get_regex_charset_name(const U32 flags, STRLEN* const lenp) 2667{ 2668 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME; 2669 2670 /* Returns a string that corresponds to the name of the regex character set 2671 * given by 'flags', and *lenp is set the length of that string, which 2672 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ 2673 2674 *lenp = 1; 2675 switch (get_regex_charset(flags)) { 2676 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS; 2677 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; 2678 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; 2679 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; 2680 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 2681 *lenp = 2; 2682 return ASCII_MORE_RESTRICT_PAT_MODS; 2683 } 2684 /* The NOT_REACHED; hides an assert() which has a rather complex 2685 * definition in perl.h. */ 2686 NOT_REACHED; /* NOTREACHED */ 2687 return "?"; /* Unknown */ 2688} 2689 2690#endif 2691 2692/* 2693 2694Return false if any get magic is on the SV other than taint magic. 2695 2696*/ 2697 2698PERL_STATIC_INLINE bool 2699Perl_sv_only_taint_gmagic(SV *sv) 2700{ 2701 MAGIC *mg = SvMAGIC(sv); 2702 2703 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC; 2704 2705 while (mg) { 2706 if (mg->mg_type != PERL_MAGIC_taint 2707 && !(mg->mg_flags & MGf_GSKIP) 2708 && mg->mg_virtual->svt_get) { 2709 return FALSE; 2710 } 2711 mg = mg->mg_moremagic; 2712 } 2713 2714 return TRUE; 2715} 2716 2717/* ------------------ cop.h ------------------------------------------- */ 2718 2719/* implement GIMME_V() macro */ 2720 2721PERL_STATIC_INLINE U8 2722Perl_gimme_V(pTHX) 2723{ 2724 I32 cxix; 2725 U8 gimme = (PL_op->op_flags & OPf_WANT); 2726 2727 if (gimme) 2728 return gimme; 2729 cxix = PL_curstackinfo->si_cxsubix; 2730 if (cxix < 0) 2731 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID; 2732 assert(cxstack[cxix].blk_gimme & G_WANT); 2733 return (cxstack[cxix].blk_gimme & G_WANT); 2734} 2735 2736 2737/* Enter a block. Push a new base context and return its address. */ 2738 2739PERL_STATIC_INLINE PERL_CONTEXT * 2740Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) 2741{ 2742 PERL_CONTEXT * cx; 2743 2744 PERL_ARGS_ASSERT_CX_PUSHBLOCK; 2745 2746 CXINC; 2747 cx = CX_CUR(); 2748 cx->cx_type = type; 2749 cx->blk_gimme = gimme; 2750 cx->blk_oldsaveix = saveix; 2751 cx->blk_oldsp = (I32)(sp - PL_stack_base); 2752 cx->blk_oldcop = PL_curcop; 2753 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack); 2754 cx->blk_oldscopesp = PL_scopestack_ix; 2755 cx->blk_oldpm = PL_curpm; 2756 cx->blk_old_tmpsfloor = PL_tmps_floor; 2757 2758 PL_tmps_floor = PL_tmps_ix; 2759 CX_DEBUG(cx, "PUSH"); 2760 return cx; 2761} 2762 2763 2764/* Exit a block (RETURN and LAST). */ 2765 2766PERL_STATIC_INLINE void 2767Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx) 2768{ 2769 PERL_ARGS_ASSERT_CX_POPBLOCK; 2770 2771 CX_DEBUG(cx, "POP"); 2772 /* these 3 are common to cx_popblock and cx_topblock */ 2773 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2774 PL_scopestack_ix = cx->blk_oldscopesp; 2775 PL_curpm = cx->blk_oldpm; 2776 2777 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats 2778 * and leaves a CX entry lying around for repeated use, so 2779 * skip for multicall */ \ 2780 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx)) 2781 || PL_savestack_ix == cx->blk_oldsaveix); 2782 PL_curcop = cx->blk_oldcop; 2783 PL_tmps_floor = cx->blk_old_tmpsfloor; 2784} 2785 2786/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO). 2787 * Whereas cx_popblock() restores the state to the point just before 2788 * cx_pushblock() was called, cx_topblock() restores it to the point just 2789 * *after* cx_pushblock() was called. */ 2790 2791PERL_STATIC_INLINE void 2792Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx) 2793{ 2794 PERL_ARGS_ASSERT_CX_TOPBLOCK; 2795 2796 CX_DEBUG(cx, "TOP"); 2797 /* these 3 are common to cx_popblock and cx_topblock */ 2798 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2799 PL_scopestack_ix = cx->blk_oldscopesp; 2800 PL_curpm = cx->blk_oldpm; 2801 2802 PL_stack_sp = PL_stack_base + cx->blk_oldsp; 2803} 2804 2805 2806PERL_STATIC_INLINE void 2807Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) 2808{ 2809 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub); 2810 2811 PERL_ARGS_ASSERT_CX_PUSHSUB; 2812 2813 PERL_DTRACE_PROBE_ENTRY(cv); 2814 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix; 2815 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack; 2816 cx->blk_sub.cv = cv; 2817 cx->blk_sub.olddepth = CvDEPTH(cv); 2818 cx->blk_sub.prevcomppad = PL_comppad; 2819 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0; 2820 cx->blk_sub.retop = retop; 2821 SvREFCNT_inc_simple_void_NN(cv); 2822 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF); 2823} 2824 2825 2826/* subsets of cx_popsub() */ 2827 2828PERL_STATIC_INLINE void 2829Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) 2830{ 2831 CV *cv; 2832 2833 PERL_ARGS_ASSERT_CX_POPSUB_COMMON; 2834 assert(CxTYPE(cx) == CXt_SUB); 2835 2836 PL_comppad = cx->blk_sub.prevcomppad; 2837 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2838 cv = cx->blk_sub.cv; 2839 CvDEPTH(cv) = cx->blk_sub.olddepth; 2840 cx->blk_sub.cv = NULL; 2841 SvREFCNT_dec(cv); 2842 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix; 2843} 2844 2845 2846/* handle the @_ part of leaving a sub */ 2847 2848PERL_STATIC_INLINE void 2849Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) 2850{ 2851 AV *av; 2852 2853 PERL_ARGS_ASSERT_CX_POPSUB_ARGS; 2854 assert(CxTYPE(cx) == CXt_SUB); 2855 assert(AvARRAY(MUTABLE_AV( 2856 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[ 2857 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad); 2858 2859 CX_POP_SAVEARRAY(cx); 2860 av = MUTABLE_AV(PAD_SVl(0)); 2861 if (UNLIKELY(AvREAL(av))) 2862 /* abandon @_ if it got reified */ 2863 clear_defarray(av, 0); 2864 else { 2865 CLEAR_ARGARRAY(av); 2866 } 2867} 2868 2869 2870PERL_STATIC_INLINE void 2871Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx) 2872{ 2873 PERL_ARGS_ASSERT_CX_POPSUB; 2874 assert(CxTYPE(cx) == CXt_SUB); 2875 2876 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv); 2877 2878 if (CxHASARGS(cx)) 2879 cx_popsub_args(cx); 2880 cx_popsub_common(cx); 2881} 2882 2883 2884PERL_STATIC_INLINE void 2885Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) 2886{ 2887 PERL_ARGS_ASSERT_CX_PUSHFORMAT; 2888 2889 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix; 2890 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack; 2891 cx->blk_format.cv = cv; 2892 cx->blk_format.retop = retop; 2893 cx->blk_format.gv = gv; 2894 cx->blk_format.dfoutgv = PL_defoutgv; 2895 cx->blk_format.prevcomppad = PL_comppad; 2896 cx->blk_u16 = 0; 2897 2898 SvREFCNT_inc_simple_void_NN(cv); 2899 CvDEPTH(cv)++; 2900 SvREFCNT_inc_void(cx->blk_format.dfoutgv); 2901} 2902 2903 2904PERL_STATIC_INLINE void 2905Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx) 2906{ 2907 CV *cv; 2908 GV *dfout; 2909 2910 PERL_ARGS_ASSERT_CX_POPFORMAT; 2911 assert(CxTYPE(cx) == CXt_FORMAT); 2912 2913 dfout = cx->blk_format.dfoutgv; 2914 setdefout(dfout); 2915 cx->blk_format.dfoutgv = NULL; 2916 SvREFCNT_dec_NN(dfout); 2917 2918 PL_comppad = cx->blk_format.prevcomppad; 2919 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2920 cv = cx->blk_format.cv; 2921 cx->blk_format.cv = NULL; 2922 --CvDEPTH(cv); 2923 SvREFCNT_dec_NN(cv); 2924 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix; 2925} 2926 2927 2928PERL_STATIC_INLINE void 2929Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 2930{ 2931 cx->blk_eval.retop = retop; 2932 cx->blk_eval.old_namesv = namesv; 2933 cx->blk_eval.old_eval_root = PL_eval_root; 2934 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL; 2935 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */ 2936 cx->blk_eval.cur_top_env = PL_top_env; 2937 2938 assert(!(PL_in_eval & ~ 0x3F)); 2939 assert(!(PL_op->op_type & ~0x1FF)); 2940 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7); 2941} 2942 2943PERL_STATIC_INLINE void 2944Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 2945{ 2946 PERL_ARGS_ASSERT_CX_PUSHEVAL; 2947 2948 Perl_push_evalortry_common(aTHX_ cx, retop, namesv); 2949 2950 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 2951 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack; 2952} 2953 2954PERL_STATIC_INLINE void 2955Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop) 2956{ 2957 PERL_ARGS_ASSERT_CX_PUSHTRY; 2958 2959 Perl_push_evalortry_common(aTHX_ cx, retop, NULL); 2960 2961 /* Don't actually change it, just store the current value so it's restored 2962 * by the common popeval */ 2963 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 2964} 2965 2966 2967PERL_STATIC_INLINE void 2968Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx) 2969{ 2970 SV *sv; 2971 2972 PERL_ARGS_ASSERT_CX_POPEVAL; 2973 assert(CxTYPE(cx) == CXt_EVAL); 2974 2975 PL_in_eval = CxOLD_IN_EVAL(cx); 2976 assert(!(PL_in_eval & 0xc0)); 2977 PL_eval_root = cx->blk_eval.old_eval_root; 2978 sv = cx->blk_eval.cur_text; 2979 if (sv && CxEVAL_TXT_REFCNTED(cx)) { 2980 cx->blk_eval.cur_text = NULL; 2981 SvREFCNT_dec_NN(sv); 2982 } 2983 2984 sv = cx->blk_eval.old_namesv; 2985 if (sv) { 2986 cx->blk_eval.old_namesv = NULL; 2987 SvREFCNT_dec_NN(sv); 2988 } 2989 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix; 2990} 2991 2992 2993/* push a plain loop, i.e. 2994 * { block } 2995 * while (cond) { block } 2996 * for (init;cond;continue) { block } 2997 * This loop can be last/redo'ed etc. 2998 */ 2999 3000PERL_STATIC_INLINE void 3001Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) 3002{ 3003 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN; 3004 cx->blk_loop.my_op = cLOOP; 3005} 3006 3007 3008/* push a true for loop, i.e. 3009 * for var (list) { block } 3010 */ 3011 3012PERL_STATIC_INLINE void 3013Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) 3014{ 3015 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR; 3016 3017 /* this one line is common with cx_pushloop_plain */ 3018 cx->blk_loop.my_op = cLOOP; 3019 3020 cx->blk_loop.itervar_u.svp = (SV**)itervarp; 3021 cx->blk_loop.itersave = itersave; 3022#ifdef USE_ITHREADS 3023 cx->blk_loop.oldcomppad = PL_comppad; 3024#endif 3025} 3026 3027 3028/* pop all loop types, including plain */ 3029 3030PERL_STATIC_INLINE void 3031Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx) 3032{ 3033 PERL_ARGS_ASSERT_CX_POPLOOP; 3034 3035 assert(CxTYPE_is_LOOP(cx)); 3036 if ( CxTYPE(cx) == CXt_LOOP_ARY 3037 || CxTYPE(cx) == CXt_LOOP_LAZYSV) 3038 { 3039 /* Free ary or cur. This assumes that state_u.ary.ary 3040 * aligns with state_u.lazysv.cur. See cx_dup() */ 3041 SV *sv = cx->blk_loop.state_u.lazysv.cur; 3042 cx->blk_loop.state_u.lazysv.cur = NULL; 3043 SvREFCNT_dec_NN(sv); 3044 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) { 3045 sv = cx->blk_loop.state_u.lazysv.end; 3046 cx->blk_loop.state_u.lazysv.end = NULL; 3047 SvREFCNT_dec_NN(sv); 3048 } 3049 } 3050 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) { 3051 SV *cursv; 3052 SV **svp = (cx)->blk_loop.itervar_u.svp; 3053 if ((cx->cx_type & CXp_FOR_GV)) 3054 svp = &GvSV((GV*)svp); 3055 cursv = *svp; 3056 *svp = cx->blk_loop.itersave; 3057 cx->blk_loop.itersave = NULL; 3058 SvREFCNT_dec(cursv); 3059 } 3060 if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF)) 3061 SvREFCNT_dec(cx->blk_loop.itervar_u.svp); 3062} 3063 3064 3065PERL_STATIC_INLINE void 3066Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) 3067{ 3068 PERL_ARGS_ASSERT_CX_PUSHWHEN; 3069 3070 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3071} 3072 3073 3074PERL_STATIC_INLINE void 3075Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx) 3076{ 3077 PERL_ARGS_ASSERT_CX_POPWHEN; 3078 assert(CxTYPE(cx) == CXt_WHEN); 3079 3080 PERL_UNUSED_ARG(cx); 3081 PERL_UNUSED_CONTEXT; 3082 /* currently NOOP */ 3083} 3084 3085 3086PERL_STATIC_INLINE void 3087Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) 3088{ 3089 PERL_ARGS_ASSERT_CX_PUSHGIVEN; 3090 3091 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3092 cx->blk_givwhen.defsv_save = orig_defsv; 3093} 3094 3095 3096PERL_STATIC_INLINE void 3097Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx) 3098{ 3099 SV *sv; 3100 3101 PERL_ARGS_ASSERT_CX_POPGIVEN; 3102 assert(CxTYPE(cx) == CXt_GIVEN); 3103 3104 sv = GvSV(PL_defgv); 3105 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save; 3106 cx->blk_givwhen.defsv_save = NULL; 3107 SvREFCNT_dec(sv); 3108} 3109 3110/* 3111=for apidoc newPADxVOP 3112 3113Constructs, checks and returns an op containing a pad offset. C<type> is 3114the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV> 3115or C<OP_PADCV>. The returned op will have the C<op_targ> field set by 3116the C<padix> argument. 3117 3118This is convenient when constructing a large optree in nested function 3119calls, as it avoids needing to store the pad op directly to set the 3120C<op_targ> field as a side-effect. For example 3121 3122 o = op_append_elem(OP_LINESEQ, o, 3123 newPADxVOP(OP_PADSV, 0, padix)); 3124 3125=cut 3126*/ 3127 3128PERL_STATIC_INLINE OP * 3129Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix) 3130{ 3131 PERL_ARGS_ASSERT_NEWPADXVOP; 3132 3133 assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV 3134 || type == OP_PADCV); 3135 OP *o = newOP(type, flags); 3136 o->op_targ = padix; 3137 return o; 3138} 3139 3140/* ------------------ util.h ------------------------------------------- */ 3141 3142/* 3143=for apidoc_section $string 3144 3145=for apidoc foldEQ 3146 3147Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 3148same 3149case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes 3150match themselves and their opposite case counterparts. Non-cased and non-ASCII 3151range bytes match only themselves. 3152 3153=cut 3154*/ 3155 3156PERL_STATIC_INLINE I32 3157Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len) 3158{ 3159 const U8 *a = (const U8 *)s1; 3160 const U8 *b = (const U8 *)s2; 3161 3162 PERL_ARGS_ASSERT_FOLDEQ; 3163 3164 assert(len >= 0); 3165 3166 while (len--) { 3167 if (*a != *b && *a != PL_fold[*b]) 3168 return 0; 3169 a++,b++; 3170 } 3171 return 1; 3172} 3173 3174PERL_STATIC_INLINE I32 3175Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len) 3176{ 3177 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds 3178 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and 3179 * does not check for this. Nor does it check that the strings each have 3180 * at least 'len' characters. */ 3181 3182 const U8 *a = (const U8 *)s1; 3183 const U8 *b = (const U8 *)s2; 3184 3185 PERL_ARGS_ASSERT_FOLDEQ_LATIN1; 3186 3187 assert(len >= 0); 3188 3189 while (len--) { 3190 if (*a != *b && *a != PL_fold_latin1[*b]) { 3191 return 0; 3192 } 3193 a++, b++; 3194 } 3195 return 1; 3196} 3197 3198/* 3199=for apidoc_section $locale 3200=for apidoc foldEQ_locale 3201 3202Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 3203same case-insensitively in the current locale; false otherwise. 3204 3205=cut 3206*/ 3207 3208PERL_STATIC_INLINE I32 3209Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len) 3210{ 3211 const U8 *a = (const U8 *)s1; 3212 const U8 *b = (const U8 *)s2; 3213 3214 PERL_ARGS_ASSERT_FOLDEQ_LOCALE; 3215 3216 assert(len >= 0); 3217 3218 while (len--) { 3219 if (*a != *b && *a != PL_fold_locale[*b]) { 3220 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 3221 "%s:%d: Our records indicate %02x is not a fold of %02x" 3222 " or its mate %02x\n", 3223 __FILE__, __LINE__, *a, *b, PL_fold_locale[*b])); 3224 3225 return 0; 3226 } 3227 a++,b++; 3228 } 3229 return 1; 3230} 3231 3232/* 3233=for apidoc_section $string 3234=for apidoc my_strnlen 3235 3236The C library C<strnlen> if available, or a Perl implementation of it. 3237 3238C<my_strnlen()> computes the length of the string, up to C<maxlen> 3239characters. It will never attempt to address more than C<maxlen> 3240characters, making it suitable for use with strings that are not 3241guaranteed to be NUL-terminated. 3242 3243=cut 3244 3245Description stolen from http://man.openbsd.org/strnlen.3, 3246implementation stolen from PostgreSQL. 3247*/ 3248#ifndef HAS_STRNLEN 3249 3250PERL_STATIC_INLINE Size_t 3251Perl_my_strnlen(const char *str, Size_t maxlen) 3252{ 3253 const char *end = (char *) memchr(str, '\0', maxlen); 3254 3255 PERL_ARGS_ASSERT_MY_STRNLEN; 3256 3257 if (end == NULL) return maxlen; 3258 return end - str; 3259} 3260 3261#endif 3262 3263#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) 3264 3265PERL_STATIC_INLINE void * 3266S_my_memrchr(const char * s, const char c, const STRLEN len) 3267{ 3268 /* memrchr(), since many platforms lack it */ 3269 3270 const char * t = s + len - 1; 3271 3272 PERL_ARGS_ASSERT_MY_MEMRCHR; 3273 3274 while (t >= s) { 3275 if (*t == c) { 3276 return (void *) t; 3277 } 3278 t--; 3279 } 3280 3281 return NULL; 3282} 3283 3284#endif 3285 3286PERL_STATIC_INLINE char * 3287Perl_mortal_getenv(const char * str) 3288{ 3289 /* This implements a (mostly) thread-safe, sequential-call-safe getenv(). 3290 * 3291 * It's (mostly) thread-safe because it uses a mutex to prevent other 3292 * threads (that look at this mutex) from destroying the result before this 3293 * routine has a chance to copy the result to a place that won't be 3294 * destroyed before the caller gets a chance to handle it. That place is a 3295 * mortal SV. khw chose this over SAVEFREEPV because he is under the 3296 * impression that the SV will hang around longer under more circumstances 3297 * 3298 * The reason it isn't completely thread-safe is that other code could 3299 * simply not pay attention to the mutex. All of the Perl core uses the 3300 * mutex, but it is possible for code from, say XS, to not use this mutex, 3301 * defeating the safety. 3302 * 3303 * getenv() returns, in some implementations, a pointer to a spot in the 3304 * **environ array, which could be invalidated at any time by this or 3305 * another thread changing the environment. Other implementations copy the 3306 * **environ value to a static buffer, returning a pointer to that. That 3307 * buffer might or might not be invalidated by a getenv() call in another 3308 * thread. If it does get zapped, we need an exclusive lock. Otherwise, 3309 * many getenv() calls can safely be running simultaneously, so a 3310 * many-reader (but no simultaneous writers) lock is ok. There is a 3311 * Configure probe to see if another thread destroys the buffer, and the 3312 * mutex is defined accordingly. 3313 * 3314 * But in all cases, using the mutex prevents these problems, as long as 3315 * all code uses the same mutex. 3316 * 3317 * A complication is that this can be called during phases where the 3318 * mortalization process isn't available. These are in interpreter 3319 * destruction or early in construction. khw believes that at these times 3320 * there shouldn't be anything else going on, so plain getenv is safe AS 3321 * LONG AS the caller acts on the return before calling it again. */ 3322 3323 char * ret; 3324 dTHX; 3325 3326 PERL_ARGS_ASSERT_MORTAL_GETENV; 3327 3328 /* Can't mortalize without stacks. khw believes that no other threads 3329 * should be running, so no need to lock things, and this may be during a 3330 * phase when locking isn't even available */ 3331 if (UNLIKELY(PL_scopestack_ix == 0)) { 3332 return getenv(str); 3333 } 3334 3335#ifdef PERL_MEM_LOG 3336 3337 /* A major complication arises under PERL_MEM_LOG. When that is active, 3338 * every memory allocation may result in logging, depending on the value of 3339 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for 3340 * saving ENV{foo}'s value (but before saving it), the logging code will 3341 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some 3342 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to 3343 * lock a boolean mutex recursively); 3) destroying the getenv() static 3344 * buffer; or 4) destroying the temporary created by this for the copy 3345 * causes a log entry to be made which could cause a new temporary to be 3346 * created, which will need to be destroyed at some point, leading to an 3347 * infinite loop. 3348 * 3349 * The solution adopted here (after some gnashing of teeth) is to detect 3350 * the recursive calls and calls from the logger, and treat them specially. 3351 * Let's say we want to do getenv("foo"). We first find 3352 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter 3353 * variable, so no temporary is required. Then we do getenv(foo), and in 3354 * the process of creating a temporary to save it, this function will be 3355 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call, 3356 * we detect that it is such a call and return our saved value instead of 3357 * locking and doing a new getenv(). This solves all of problems 1), 2), 3358 * and 3). Because all the getenv()s are done while the mutex is locked, 3359 * the state cannot have changed. To solve 4), we don't create a temporary 3360 * when this is called from the logging code. That code disposes of the 3361 * return value while the mutex is still locked. 3362 * 3363 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial 3364 * digits and 3 particular letters are significant; the rest are ignored by 3365 * the memory logging code. Thus the per-interpreter variable only needs 3366 * to be large enough to save the significant information, the size of 3367 * which is known at compile time. The first byte is extra, reserved for 3368 * flags for our use. To protect against overflowing, only the reserved 3369 * byte, as many digits as don't overflow, and the three letters are 3370 * stored. 3371 * 3372 * The reserved byte has two bits: 3373 * 0x1 if set indicates that if we get here, it is a recursive call of 3374 * getenv() 3375 * 0x2 if set indicates that the call is from the logging code. 3376 * 3377 * If the flag indicates this is a recursive call, just return the stored 3378 * value of PL_mem_log; An empty value gets turned into NULL. */ 3379 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) { 3380 if (PL_mem_log[1] == '\0') { 3381 return NULL; 3382 } else { 3383 return PL_mem_log + 1; 3384 } 3385 } 3386 3387#endif 3388 3389 GETENV_LOCK; 3390 3391#ifdef PERL_MEM_LOG 3392 3393 /* Here we are in a critical section. As explained above, we do our own 3394 * getenv(PERL_MEM_LOG), saving the result safely. */ 3395 ret = getenv("PERL_MEM_LOG"); 3396 if (ret == NULL) { /* No logging active */ 3397 3398 /* Return that immediately if called from the logging code */ 3399 if (PL_mem_log[0] & 0x2) { 3400 GETENV_UNLOCK; 3401 return NULL; 3402 } 3403 3404 PL_mem_log[1] = '\0'; 3405 } 3406 else { 3407 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */ 3408 3409 /* There is nothing to prevent the value of PERL_MEM_LOG from being an 3410 * extremely long string. But we want only a few characters from it. 3411 * PL_mem_log has been made large enough to hold just the ones we need. 3412 * First the file descriptor. */ 3413 if (isDIGIT(*ret)) { 3414 const char * s = ret; 3415 if (UNLIKELY(*s == '0')) { 3416 3417 /* Reduce multiple leading zeros to a single one. This is to 3418 * allow the caller to change what to do with leading zeros. */ 3419 *mem_log_meat++ = '0'; 3420 s++; 3421 while (*s == '0') { 3422 s++; 3423 } 3424 } 3425 3426 /* If the input overflows, copy just enough for the result to also 3427 * overflow, plus 1 to make sure */ 3428 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) { 3429 *mem_log_meat++ = *s++; 3430 } 3431 } 3432 3433 /* Then each of the four significant characters */ 3434 if (strchr(ret, 'm')) { 3435 *mem_log_meat++ = 'm'; 3436 } 3437 if (strchr(ret, 's')) { 3438 *mem_log_meat++ = 's'; 3439 } 3440 if (strchr(ret, 't')) { 3441 *mem_log_meat++ = 't'; 3442 } 3443 if (strchr(ret, 'c')) { 3444 *mem_log_meat++ = 'c'; 3445 } 3446 *mem_log_meat = '\0'; 3447 3448 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log)); 3449 } 3450 3451 /* If we are being called from the logger, it only needs the significant 3452 * portion of PERL_MEM_LOG, and doesn't need a safe copy */ 3453 if (PL_mem_log[0] & 0x2) { 3454 assert(strEQ(str, "PERL_MEM_LOG")); 3455 GETENV_UNLOCK; 3456 return PL_mem_log + 1; 3457 } 3458 3459 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that 3460 * is coming from other than the logging code, so it should be treated the 3461 * same as any other getenv(), returning the full value, not just the 3462 * significant part, and having its value saved. Set the flag that 3463 * indicates any call to this routine will be a recursion from here */ 3464 PL_mem_log[0] = 0x1; 3465 3466#endif 3467 3468 /* Now get the value of the real desired variable, and save a copy */ 3469 ret = getenv(str); 3470 3471 if (ret != NULL) { 3472 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) ); 3473 } 3474 3475 GETENV_UNLOCK; 3476 3477#ifdef PERL_MEM_LOG 3478 3479 /* Clear the buffer */ 3480 Zero(PL_mem_log, sizeof(PL_mem_log), char); 3481 3482#endif 3483 3484 return ret; 3485} 3486 3487PERL_STATIC_INLINE bool 3488Perl_sv_isbool(pTHX_ const SV *sv) 3489{ 3490 return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv); 3491} 3492 3493#ifdef USE_ITHREADS 3494 3495PERL_STATIC_INLINE AV * 3496Perl_cop_file_avn(pTHX_ const COP *cop) { 3497 3498 PERL_ARGS_ASSERT_COP_FILE_AVN; 3499 3500 const char *file = CopFILE(cop); 3501 if (file) { 3502 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD); 3503 if (gv) { 3504 return GvAVn(gv); 3505 } 3506 else 3507 return NULL; 3508 } 3509 else 3510 return NULL; 3511} 3512 3513#endif 3514 3515PERL_STATIC_INLINE PADNAME * 3516Perl_padname_refcnt_inc(PADNAME *pn) 3517{ 3518 PadnameREFCNT(pn)++; 3519 return pn; 3520} 3521 3522PERL_STATIC_INLINE PADNAMELIST * 3523Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl) 3524{ 3525 PadnamelistREFCNT(pnl)++; 3526 return pnl; 3527} 3528 3529/* copy a string to a safe spot */ 3530 3531/* 3532=for apidoc_section $string 3533=for apidoc savepv 3534 3535Perl's version of C<strdup()>. Returns a pointer to a newly allocated 3536string which is a duplicate of C<pv>. The size of the string is 3537determined by C<strlen()>, which means it may not contain embedded C<NUL> 3538characters and must have a trailing C<NUL>. To prevent memory leaks, the 3539memory allocated for the new string needs to be freed when no longer needed. 3540This can be done with the C<L</Safefree>> function, or 3541L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>. 3542 3543On some platforms, Windows for example, all allocated memory owned by a thread 3544is deallocated when that thread ends. So if you need that not to happen, you 3545need to use the shared memory functions, such as C<L</savesharedpv>>. 3546 3547=cut 3548*/ 3549 3550PERL_STATIC_INLINE char * 3551Perl_savepv(pTHX_ const char *pv) 3552{ 3553 PERL_UNUSED_CONTEXT; 3554 if (!pv) 3555 return NULL; 3556 else { 3557 char *newaddr; 3558 const STRLEN pvlen = strlen(pv)+1; 3559 Newx(newaddr, pvlen, char); 3560 return (char*)memcpy(newaddr, pv, pvlen); 3561 } 3562} 3563 3564/* same thing but with a known length */ 3565 3566/* 3567=for apidoc savepvn 3568 3569Perl's version of what C<strndup()> would be if it existed. Returns a 3570pointer to a newly allocated string which is a duplicate of the first 3571C<len> bytes from C<pv>, plus a trailing 3572C<NUL> byte. The memory allocated for 3573the new string can be freed with the C<Safefree()> function. 3574 3575On some platforms, Windows for example, all allocated memory owned by a thread 3576is deallocated when that thread ends. So if you need that not to happen, you 3577need to use the shared memory functions, such as C<L</savesharedpvn>>. 3578 3579=cut 3580*/ 3581 3582PERL_STATIC_INLINE char * 3583Perl_savepvn(pTHX_ const char *pv, Size_t len) 3584{ 3585 char *newaddr; 3586 PERL_UNUSED_CONTEXT; 3587 3588 Newx(newaddr,len+1,char); 3589 /* Give a meaning to NULL pointer mainly for the use in sv_magic() */ 3590 if (pv) { 3591 /* might not be null terminated */ 3592 newaddr[len] = '\0'; 3593 return (char *) CopyD(pv,newaddr,len,char); 3594 } 3595 else { 3596 return (char *) ZeroD(newaddr,len+1,char); 3597 } 3598} 3599 3600/* 3601=for apidoc savesvpv 3602 3603A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from 3604the passed in SV using C<SvPV()> 3605 3606On some platforms, Windows for example, all allocated memory owned by a thread 3607is deallocated when that thread ends. So if you need that not to happen, you 3608need to use the shared memory functions, such as C<L</savesharedsvpv>>. 3609 3610=cut 3611*/ 3612 3613PERL_STATIC_INLINE char * 3614Perl_savesvpv(pTHX_ SV *sv) 3615{ 3616 STRLEN len; 3617 const char * const pv = SvPV_const(sv, len); 3618 char *newaddr; 3619 3620 PERL_ARGS_ASSERT_SAVESVPV; 3621 3622 ++len; 3623 Newx(newaddr,len,char); 3624 return (char *) CopyD(pv,newaddr,len,char); 3625} 3626 3627/* 3628=for apidoc savesharedsvpv 3629 3630A version of C<savesharedpv()> which allocates the duplicate string in 3631memory which is shared between threads. 3632 3633=cut 3634*/ 3635 3636PERL_STATIC_INLINE char * 3637Perl_savesharedsvpv(pTHX_ SV *sv) 3638{ 3639 STRLEN len; 3640 const char * const pv = SvPV_const(sv, len); 3641 3642 PERL_ARGS_ASSERT_SAVESHAREDSVPV; 3643 3644 return savesharedpvn(pv, len); 3645} 3646 3647#ifndef PERL_GET_CONTEXT_DEFINED 3648 3649/* 3650=for apidoc_section $embedding 3651=for apidoc get_context 3652 3653Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead. 3654 3655=cut 3656*/ 3657 3658PERL_STATIC_INLINE void * 3659Perl_get_context(void) 3660{ 3661# if defined(USE_ITHREADS) 3662# ifdef OLD_PTHREADS_API 3663 pthread_addr_t t; 3664 int error = pthread_getspecific(PL_thr_key, &t); 3665 if (error) 3666 Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error); 3667 return (void*)t; 3668# elif defined(I_MACH_CTHREADS) 3669 return (void*)cthread_data(cthread_self()); 3670# else 3671 return (void*)PTHREAD_GETSPECIFIC(PL_thr_key); 3672# endif 3673# else 3674 return (void*)NULL; 3675# endif 3676} 3677 3678#endif 3679 3680PERL_STATIC_INLINE MGVTBL* 3681Perl_get_vtbl(pTHX_ int vtbl_id) 3682{ 3683 PERL_UNUSED_CONTEXT; 3684 3685 return (vtbl_id < 0 || vtbl_id >= magic_vtable_max) 3686 ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id; 3687} 3688 3689/* 3690=for apidoc my_strlcat 3691 3692The C library C<strlcat> if available, or a Perl implementation of it. 3693This operates on C C<NUL>-terminated strings. 3694 3695C<my_strlcat()> appends string C<src> to the end of C<dst>. It will append at 3696most S<C<size - strlen(dst) - 1>> characters. It will then C<NUL>-terminate, 3697unless C<size> is 0 or the original C<dst> string was longer than C<size> (in 3698practice this should not happen as it means that either C<size> is incorrect or 3699that C<dst> is not a proper C<NUL>-terminated string). 3700 3701Note that C<size> is the full size of the destination buffer and 3702the result is guaranteed to be C<NUL>-terminated if there is room. Note that 3703room for the C<NUL> should be included in C<size>. 3704 3705The return value is the total length that C<dst> would have if C<size> is 3706sufficiently large. Thus it is the initial length of C<dst> plus the length of 3707C<src>. If C<size> is smaller than the return, the excess was not appended. 3708 3709=cut 3710 3711Description stolen from http://man.openbsd.org/strlcat.3 3712*/ 3713#ifndef HAS_STRLCAT 3714PERL_STATIC_INLINE Size_t 3715Perl_my_strlcat(char *dst, const char *src, Size_t size) 3716{ 3717 Size_t used, length, copy; 3718 3719 used = strlen(dst); 3720 length = strlen(src); 3721 if (size > 0 && used < size - 1) { 3722 copy = (length >= size - used) ? size - used - 1 : length; 3723 memcpy(dst + used, src, copy); 3724 dst[used + copy] = '\0'; 3725 } 3726 return used + length; 3727} 3728#endif 3729 3730 3731/* 3732=for apidoc my_strlcpy 3733 3734The C library C<strlcpy> if available, or a Perl implementation of it. 3735This operates on C C<NUL>-terminated strings. 3736 3737C<my_strlcpy()> copies up to S<C<size - 1>> characters from the string C<src> 3738to C<dst>, C<NUL>-terminating the result if C<size> is not 0. 3739 3740The return value is the total length C<src> would be if the copy completely 3741succeeded. If it is larger than C<size>, the excess was not copied. 3742 3743=cut 3744 3745Description stolen from http://man.openbsd.org/strlcpy.3 3746*/ 3747#ifndef HAS_STRLCPY 3748PERL_STATIC_INLINE Size_t 3749Perl_my_strlcpy(char *dst, const char *src, Size_t size) 3750{ 3751 Size_t length, copy; 3752 3753 length = strlen(src); 3754 if (size > 0) { 3755 copy = (length >= size) ? size - 1 : length; 3756 memcpy(dst, src, copy); 3757 dst[copy] = '\0'; 3758 } 3759 return length; 3760} 3761#endif 3762 3763/* 3764 * ex: set ts=8 sts=4 sw=4 et: 3765 */ 3766