1 /* inline.h 2 * 3 * Copyright (C) 2012 by Larry Wall and others 4 * 5 * You may distribute under the terms of either the GNU General Public 6 * License or the Artistic License, as specified in the README file. 7 * 8 * This file contains tables and code adapted from 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this 10 * copyright notice: 11 12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 13 14 Permission is hereby granted, free of charge, to any person obtaining a copy of 15 this software and associated documentation files (the "Software"), to deal in 16 the Software without restriction, including without limitation the rights to 17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 18 of the Software, and to permit persons to whom the Software is furnished to do 19 so, subject to the following conditions: 20 21 The above copyright notice and this permission notice shall be included in all 22 copies or substantial portions of the Software. 23 24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 SOFTWARE. 31 32 * 33 * This file is a home for static inline functions that cannot go in other 34 * header files, because they depend on proto.h (included after most other 35 * headers) or struct definitions. 36 * 37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but 38 * whose details should be exposed to the compiler, for such things as tail 39 * call optimization. 40 * 41 * Each section names the header file that the functions "belong" to. 42 */ 43 44 /* ------------------------------- av.h ------------------------------- */ 45 46 /* 47 =for apidoc_section $AV 48 =for apidoc av_count 49 Returns the number of elements in the array C<av>. This is the true length of 50 the array, including any undefined elements. It is always the same as 51 S<C<av_top_index(av) + 1>>. 52 53 =cut 54 */ 55 PERL_STATIC_INLINE Size_t 56 Perl_av_count(pTHX_ AV *av) 57 { 58 PERL_ARGS_ASSERT_AV_COUNT; 59 assert(SvTYPE(av) == SVt_PVAV); 60 61 return AvFILL(av) + 1; 62 } 63 64 /* ------------------------------- av.c ------------------------------- */ 65 66 /* 67 =for apidoc av_store_simple 68 69 This is a cut-down version of av_store that assumes that the array is 70 very straightforward - no magic, not readonly, and AvREAL - and that 71 C<key> is not negative. This function MUST NOT be used in situations 72 where any of those assumptions may not hold. 73 74 Stores an SV in an array. The array index is specified as C<key>. It 75 can be dereferenced to get the C<SV*> that was stored there (= C<val>)). 76 77 Note that the caller is responsible for suitably incrementing the reference 78 count of C<val> before the call. 79 80 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>. 81 82 =cut 83 */ 84 85 PERL_STATIC_INLINE SV** 86 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val) 87 { 88 SV** ary; 89 90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE; 91 assert(SvTYPE(av) == SVt_PVAV); 92 assert(!SvMAGICAL(av)); 93 assert(!SvREADONLY(av)); 94 assert(AvREAL(av)); 95 assert(key > -1); 96 97 ary = AvARRAY(av); 98 99 if (AvFILLp(av) < key) { 100 if (key > AvMAX(av)) { 101 av_extend(av,key); 102 ary = AvARRAY(av); 103 } 104 AvFILLp(av) = key; 105 } else 106 SvREFCNT_dec(ary[key]); 107 108 ary[key] = val; 109 return &ary[key]; 110 } 111 112 /* 113 =for apidoc av_fetch_simple 114 115 This is a cut-down version of av_fetch that assumes that the array is 116 very straightforward - no magic, not readonly, and AvREAL - and that 117 C<key> is not negative. This function MUST NOT be used in situations 118 where any of those assumptions may not hold. 119 120 Returns the SV at the specified index in the array. The C<key> is the 121 index. If lval is true, you are guaranteed to get a real SV back (in case 122 it wasn't real before), which you can then modify. Check that the return 123 value is non-null before dereferencing it to a C<SV*>. 124 125 The rough perl equivalent is C<$myarray[$key]>. 126 127 =cut 128 */ 129 130 PERL_STATIC_INLINE SV** 131 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval) 132 { 133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE; 134 assert(SvTYPE(av) == SVt_PVAV); 135 assert(!SvMAGICAL(av)); 136 assert(!SvREADONLY(av)); 137 assert(AvREAL(av)); 138 assert(key > -1); 139 140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) { 141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL; 142 } else { 143 return &AvARRAY(av)[key]; 144 } 145 } 146 147 /* 148 =for apidoc av_push_simple 149 150 This is a cut-down version of av_push that assumes that the array is very 151 straightforward - no magic, not readonly, and AvREAL - and that C<key> is 152 not less than -1. This function MUST NOT be used in situations where any 153 of those assumptions may not hold. 154 155 Pushes an SV (transferring control of one reference count) onto the end of the 156 array. The array will grow automatically to accommodate the addition. 157 158 Perl equivalent: C<push @myarray, $val;>. 159 160 =cut 161 */ 162 163 PERL_STATIC_INLINE void 164 Perl_av_push_simple(pTHX_ AV *av, SV *val) 165 { 166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE; 167 assert(SvTYPE(av) == SVt_PVAV); 168 assert(!SvMAGICAL(av)); 169 assert(!SvREADONLY(av)); 170 assert(AvREAL(av)); 171 assert(AvFILLp(av) > -2); 172 173 (void)av_store_simple(av,AvFILLp(av)+1,val); 174 } 175 176 /* 177 =for apidoc av_new_alloc 178 179 This implements L<perlapi/C<newAV_alloc_x>> 180 and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this 181 functionality. 182 183 Creates a new AV and allocates its SV* array. 184 185 This is similar to, but more efficient than doing: 186 187 AV *av = newAV(); 188 av_extend(av, key); 189 190 The size parameter is used to pre-allocate a SV* array large enough to 191 hold at least elements C<0..(size-1)>. C<size> must be at least 1. 192 193 The C<zeroflag> parameter controls whether or not the array is NULL 194 initialized. 195 196 =cut 197 */ 198 199 PERL_STATIC_INLINE AV * 200 Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag) 201 { 202 AV * const av = newAV(); 203 SV** ary; 204 PERL_ARGS_ASSERT_AV_NEW_ALLOC; 205 assert(size > 0); 206 207 Newx(ary, size, SV*); /* Newx performs the memwrap check */ 208 AvALLOC(av) = ary; 209 AvARRAY(av) = ary; 210 AvMAX(av) = size - 1; 211 212 if (zeroflag) 213 Zero(ary, size, SV*); 214 215 return av; 216 } 217 218 219 /* ------------------------------- cv.h ------------------------------- */ 220 221 /* 222 =for apidoc_section $CV 223 =for apidoc CvGV 224 Returns the GV associated with the CV C<sv>, reifying it if necessary. 225 226 =cut 227 */ 228 PERL_STATIC_INLINE GV * 229 Perl_CvGV(pTHX_ CV *sv) 230 { 231 PERL_ARGS_ASSERT_CVGV; 232 233 return CvNAMED(sv) 234 ? Perl_cvgv_from_hek(aTHX_ sv) 235 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; 236 } 237 238 /* 239 =for apidoc CvDEPTH 240 Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a 241 recursive call. 242 243 =cut 244 */ 245 PERL_STATIC_INLINE I32 * 246 Perl_CvDEPTH(const CV * const sv) 247 { 248 PERL_ARGS_ASSERT_CVDEPTH; 249 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM); 250 251 return &((XPVCV*)SvANY(sv))->xcv_depth; 252 } 253 254 /* 255 CvPROTO returns the prototype as stored, which is not necessarily what 256 the interpreter should be using. Specifically, the interpreter assumes 257 that spaces have been stripped, which has been the case if the prototype 258 was added by toke.c, but is generally not the case if it was added elsewhere. 259 Since we can't enforce the spacelessness at assignment time, this routine 260 provides a temporary copy at parse time with spaces removed. 261 I<orig> is the start of the original buffer, I<len> is the length of the 262 prototype and will be updated when this returns. 263 */ 264 265 #ifdef PERL_CORE 266 PERL_STATIC_INLINE char * 267 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len) 268 { 269 SV * tmpsv; 270 char * tmps; 271 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP); 272 tmps = SvPVX(tmpsv); 273 while ((*len)--) { 274 if (!isSPACE(*orig)) 275 *tmps++ = *orig; 276 orig++; 277 } 278 *tmps = '\0'; 279 *len = tmps - SvPVX(tmpsv); 280 return SvPVX(tmpsv); 281 } 282 #endif 283 284 /* ------------------------------- iperlsys.h ------------------------------- */ 285 #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS) 286 287 /* Otherwise this function is implemented as macros in iperlsys.h */ 288 289 PERL_STATIC_INLINE bool 290 S_PerlEnv_putenv(pTHX_ char * str) 291 { 292 PERL_ARGS_ASSERT_PERLENV_PUTENV; 293 294 ENV_LOCK; 295 bool retval = putenv(str); 296 ENV_UNLOCK; 297 298 return retval; 299 } 300 301 #endif 302 303 /* ------------------------------- mg.h ------------------------------- */ 304 305 #if defined(PERL_CORE) || defined(PERL_EXT) 306 /* assumes get-magic and stringification have already occurred */ 307 PERL_STATIC_INLINE STRLEN 308 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) 309 { 310 assert(mg->mg_type == PERL_MAGIC_regex_global); 311 assert(mg->mg_len != -1); 312 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv)) 313 return (STRLEN)mg->mg_len; 314 else { 315 const STRLEN pos = (STRLEN)mg->mg_len; 316 /* Without this check, we may read past the end of the buffer: */ 317 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1; 318 return sv_or_pv_pos_u2b(sv, s, pos, NULL); 319 } 320 } 321 #endif 322 323 /* ------------------------------- pad.h ------------------------------ */ 324 325 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C) 326 PERL_STATIC_INLINE bool 327 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) 328 { 329 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE; 330 331 /* is seq within the range _LOW to _HIGH ? 332 * This is complicated by the fact that PL_cop_seqmax 333 * may have wrapped around at some point */ 334 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO) 335 return FALSE; /* not yet introduced */ 336 337 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) { 338 /* in compiling scope */ 339 if ( 340 (seq > COP_SEQ_RANGE_LOW(pn)) 341 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1)) 342 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1)) 343 ) 344 return TRUE; 345 } 346 else if ( 347 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) 348 ? 349 ( seq > COP_SEQ_RANGE_LOW(pn) 350 || seq <= COP_SEQ_RANGE_HIGH(pn)) 351 352 : ( seq > COP_SEQ_RANGE_LOW(pn) 353 && seq <= COP_SEQ_RANGE_HIGH(pn)) 354 ) 355 return TRUE; 356 return FALSE; 357 } 358 #endif 359 360 /* ------------------------------- pp.h ------------------------------- */ 361 362 PERL_STATIC_INLINE I32 363 Perl_TOPMARK(pTHX) 364 { 365 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 366 "MARK top %p %" IVdf "\n", 367 PL_markstack_ptr, 368 (IV)*PL_markstack_ptr))); 369 return *PL_markstack_ptr; 370 } 371 372 PERL_STATIC_INLINE I32 373 Perl_POPMARK(pTHX) 374 { 375 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 376 "MARK pop %p %" IVdf "\n", 377 (PL_markstack_ptr-1), 378 (IV)*(PL_markstack_ptr-1)))); 379 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow"); 380 return *PL_markstack_ptr--; 381 } 382 383 /* ----------------------------- regexp.h ----------------------------- */ 384 385 /* PVLVs need to act as a superset of all scalar types - they are basically 386 * PVMGs with a few extra fields. 387 * REGEXPs are first class scalars, but have many fields that can't be copied 388 * into a PVLV body. 389 * 390 * Hence we take a different approach - instead of a copy, PVLVs store a pointer 391 * back to the original body. To avoid increasing the size of PVLVs just for the 392 * rare case of REGEXP assignment, this pointer is stored in the memory usually 393 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to 394 * read the pointer from the two possible locations. The macro SvLEN() wraps the 395 * access to the union's member xpvlenu_len, but there is no equivalent macro 396 * for wrapping the union's member xpvlenu_rx, hence the direct reference here. 397 * 398 * See commit df6b4bd56551f2d3 for more details. */ 399 400 PERL_STATIC_INLINE struct regexp * 401 Perl_ReANY(const REGEXP * const re) 402 { 403 XPV* const p = (XPV*)SvANY(re); 404 405 PERL_ARGS_ASSERT_REANY; 406 assert(isREGEXP(re)); 407 408 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx 409 : (struct regexp *)p; 410 } 411 412 /* ------------------------------- utf8.h ------------------------------- */ 413 414 /* 415 =for apidoc_section $unicode 416 */ 417 418 PERL_STATIC_INLINE void 419 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest) 420 { 421 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 422 * encoded string at '*dest', updating '*dest' to include it */ 423 424 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; 425 426 if (NATIVE_BYTE_IS_INVARIANT(byte)) 427 *((*dest)++) = byte; 428 else { 429 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); 430 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); 431 } 432 } 433 434 /* 435 =for apidoc valid_utf8_to_uvchr 436 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is 437 known that the next character in the input UTF-8 string C<s> is well-formed 438 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code 439 points, and non-Unicode code points are allowed. 440 441 =cut 442 443 */ 444 445 PERL_STATIC_INLINE UV 446 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) 447 { 448 const UV expectlen = UTF8SKIP(s); 449 const U8* send = s + expectlen; 450 UV uv = *s; 451 452 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; 453 454 if (retlen) { 455 *retlen = expectlen; 456 } 457 458 /* An invariant is trivially returned */ 459 if (expectlen == 1) { 460 return uv; 461 } 462 463 /* Remove the leading bits that indicate the number of bytes, leaving just 464 * the bits that are part of the value */ 465 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 466 467 /* Now, loop through the remaining bytes, accumulating each into the 468 * working total as we go. (I khw tried unrolling the loop for up to 4 469 * bytes, but there was no performance improvement) */ 470 for (++s; s < send; s++) { 471 uv = UTF8_ACCUMULATE(uv, *s); 472 } 473 474 return UNI_TO_NATIVE(uv); 475 476 } 477 478 /* 479 =for apidoc is_utf8_invariant_string 480 481 Returns TRUE if the first C<len> bytes of the string C<s> are the same 482 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on 483 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they 484 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only 485 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range 486 characters are invariant, but so also are the C1 controls. 487 488 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you 489 use this option, that C<s> can't have embedded C<NUL> characters and has to 490 have a terminating C<NUL> byte). 491 492 See also 493 C<L</is_utf8_string>>, 494 C<L</is_utf8_string_flags>>, 495 C<L</is_utf8_string_loc>>, 496 C<L</is_utf8_string_loc_flags>>, 497 C<L</is_utf8_string_loclen>>, 498 C<L</is_utf8_string_loclen_flags>>, 499 C<L</is_utf8_fixed_width_buf_flags>>, 500 C<L</is_utf8_fixed_width_buf_loc_flags>>, 501 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 502 C<L</is_strict_utf8_string>>, 503 C<L</is_strict_utf8_string_loc>>, 504 C<L</is_strict_utf8_string_loclen>>, 505 C<L</is_c9strict_utf8_string>>, 506 C<L</is_c9strict_utf8_string_loc>>, 507 and 508 C<L</is_c9strict_utf8_string_loclen>>. 509 510 =cut 511 512 */ 513 514 #define is_utf8_invariant_string(s, len) \ 515 is_utf8_invariant_string_loc(s, len, NULL) 516 517 /* 518 =for apidoc is_utf8_invariant_string_loc 519 520 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of 521 the first UTF-8 variant character in the C<ep> pointer; if all characters are 522 UTF-8 invariant, this function does not change the contents of C<*ep>. 523 524 =cut 525 526 */ 527 528 PERL_STATIC_INLINE bool 529 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) 530 { 531 const U8* send; 532 const U8* x = s; 533 534 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; 535 536 if (len == 0) { 537 len = strlen((const char *)s); 538 } 539 540 send = s + len; 541 542 /* This looks like 0x010101... */ 543 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) 544 545 /* This looks like 0x808080... */ 546 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) 547 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) 548 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) 549 550 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by 551 * or'ing together the lowest bits of 'x'. Hopefully the final term gets 552 * optimized out completely on a 32-bit system, and its mask gets optimized out 553 * on a 64-bit system */ 554 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ 555 | ( PTR2nat(x) >> 1) \ 556 | ( ( (PTR2nat(x) \ 557 & PERL_WORD_BOUNDARY_MASK) >> 2)))) 558 559 #ifndef EBCDIC 560 561 /* Do the word-at-a-time iff there is at least one usable full word. That 562 * means that after advancing to a word boundary, there still is at least a 563 * full word left. The number of bytes needed to advance is 'wordsize - 564 * offset' unless offset is 0. */ 565 if ((STRLEN) (send - x) >= PERL_WORDSIZE 566 567 /* This term is wordsize if subword; 0 if not */ 568 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 569 570 /* 'offset' */ 571 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 572 { 573 574 /* Process per-byte until reach word boundary. XXX This loop could be 575 * eliminated if we knew that this platform had fast unaligned reads */ 576 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 577 if (! UTF8_IS_INVARIANT(*x)) { 578 if (ep) { 579 *ep = x; 580 } 581 582 return FALSE; 583 } 584 x++; 585 } 586 587 /* Here, we know we have at least one full word to process. Process 588 * per-word as long as we have at least a full word left */ 589 do { 590 if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { 591 592 /* Found a variant. Just return if caller doesn't want its 593 * exact position */ 594 if (! ep) { 595 return FALSE; 596 } 597 598 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ 599 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 600 601 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x); 602 assert(*ep >= s && *ep < send); 603 604 return FALSE; 605 606 # else /* If weird byte order, drop into next loop to do byte-at-a-time 607 checks. */ 608 609 break; 610 # endif 611 } 612 613 x += PERL_WORDSIZE; 614 615 } while (x + PERL_WORDSIZE <= send); 616 } 617 618 #endif /* End of ! EBCDIC */ 619 620 /* Process per-byte */ 621 while (x < send) { 622 if (! UTF8_IS_INVARIANT(*x)) { 623 if (ep) { 624 *ep = x; 625 } 626 627 return FALSE; 628 } 629 630 x++; 631 } 632 633 return TRUE; 634 } 635 636 /* See if the platform has builtins for finding the most/least significant bit, 637 * and which one is right for using on 32 and 64 bit operands */ 638 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0)) 639 # if U32SIZE == INTSIZE 640 # define PERL_CLZ_32 __builtin_clz 641 # endif 642 # if defined(U64TYPE) && U64SIZE == INTSIZE 643 # define PERL_CLZ_64 __builtin_clz 644 # endif 645 #endif 646 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0)) 647 # if U32SIZE == INTSIZE 648 # define PERL_CTZ_32 __builtin_ctz 649 # endif 650 # if defined(U64TYPE) && U64SIZE == INTSIZE 651 # define PERL_CTZ_64 __builtin_ctz 652 # endif 653 #endif 654 655 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0)) 656 # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32) 657 # define PERL_CLZ_32 __builtin_clzl 658 # endif 659 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64) 660 # define PERL_CLZ_64 __builtin_clzl 661 # endif 662 #endif 663 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0)) 664 # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32) 665 # define PERL_CTZ_32 __builtin_ctzl 666 # endif 667 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64) 668 # define PERL_CTZ_64 __builtin_ctzl 669 # endif 670 #endif 671 672 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0)) 673 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32) 674 # define PERL_CLZ_32 __builtin_clzll 675 # endif 676 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64) 677 # define PERL_CLZ_64 __builtin_clzll 678 # endif 679 #endif 680 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0)) 681 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32) 682 # define PERL_CTZ_32 __builtin_ctzll 683 # endif 684 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64) 685 # define PERL_CTZ_64 __builtin_ctzll 686 # endif 687 #endif 688 689 #if defined(_MSC_VER) 690 # include <intrin.h> 691 # pragma intrinsic(_BitScanForward) 692 # pragma intrinsic(_BitScanReverse) 693 # ifdef _WIN64 694 # pragma intrinsic(_BitScanForward64) 695 # pragma intrinsic(_BitScanReverse64) 696 # endif 697 #endif 698 699 /* The reason there are not checks to see if ffs() and ffsl() are available for 700 * determining the lsb, is because these don't improve on the deBruijn method 701 * fallback, which is just a branchless integer multiply, array element 702 * retrieval, and shift. The others, even if the function call overhead is 703 * optimized out, have to cope with the possibility of the input being all 704 * zeroes, and almost certainly will have conditionals for this eventuality. 705 * khw, at the time of this commit, looked at the source for both gcc and clang 706 * to verify this. (gcc used a method inferior to deBruijn.) */ 707 708 /* Below are functions to find the first, last, or only set bit in a word. On 709 * platforms with 64-bit capability, there is a pair for each operation; the 710 * first taking a 64 bit operand, and the second a 32 bit one. The logic is 711 * the same in each pair, so the second is stripped of most comments. */ 712 713 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 714 715 PERL_STATIC_INLINE unsigned 716 Perl_lsbit_pos64(U64 word) 717 { 718 /* Find the position (0..63) of the least significant set bit in the input 719 * word */ 720 721 ASSUME(word != 0); 722 723 /* If we can determine that the platform has a usable fast method to get 724 * this info, use that */ 725 726 # if defined(PERL_CTZ_64) 727 # define PERL_HAS_FAST_GET_LSB_POS64 728 729 return (unsigned) PERL_CTZ_64(word); 730 731 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) 732 # define PERL_HAS_FAST_GET_LSB_POS64 733 734 { 735 unsigned long index; 736 _BitScanForward64(&index, word); 737 return (unsigned)index; 738 } 739 740 # else 741 742 /* Here, we didn't find a fast method for finding the lsb. Fall back to 743 * making the lsb the only set bit in the word, and use our function that 744 * works on words with a single bit set. 745 * 746 * Isolate the lsb; 747 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set 748 * 749 * The word will look like this, with a rightmost set bit in position 's': 750 * ('x's are don't cares, and 'y's are their complements) 751 * s 752 * x..x100..00 753 * y..y011..11 Complement 754 * y..y100..00 Add 1 755 * 0..0100..00 And with the original 756 * 757 * (Yes, complementing and adding 1 is just taking the negative on 2's 758 * complement machines, but not on 1's complement ones, and some compilers 759 * complain about negating an unsigned.) 760 */ 761 return single_1bit_pos64(word & (~word + 1)); 762 763 # endif 764 765 } 766 767 # define lsbit_pos_uintmax_(word) lsbit_pos64(word) 768 #else /* ! QUAD */ 769 # define lsbit_pos_uintmax_(word) lsbit_pos32(word) 770 #endif 771 772 PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */ 773 Perl_lsbit_pos32(U32 word) 774 { 775 /* Find the position (0..31) of the least significant set bit in the input 776 * word */ 777 778 ASSUME(word != 0); 779 780 #if defined(PERL_CTZ_32) 781 # define PERL_HAS_FAST_GET_LSB_POS32 782 783 return (unsigned) PERL_CTZ_32(word); 784 785 #elif U32SIZE == 4 && defined(_MSC_VER) 786 # define PERL_HAS_FAST_GET_LSB_POS32 787 788 { 789 unsigned long index; 790 _BitScanForward(&index, word); 791 return (unsigned)index; 792 } 793 794 #else 795 796 return single_1bit_pos32(word & (~word + 1)); 797 798 #endif 799 800 } 801 802 803 /* Convert the leading zeros count to the bit position of the first set bit. 804 * This just subtracts from the highest position, 31 or 63. But some compilers 805 * don't optimize this optimally, and so a bit of bit twiddling encourages them 806 * to do the right thing. It turns out that subtracting a smaller non-negative 807 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of 808 * the two numbers. To see why, first note that the sum of any number, x, and 809 * its complement, x', is all ones. So all ones minus x is x'. Then note that 810 * the xor of x and all ones is x'. */ 811 #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc)) 812 813 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 814 815 PERL_STATIC_INLINE unsigned 816 Perl_msbit_pos64(U64 word) 817 { 818 /* Find the position (0..63) of the most significant set bit in the input 819 * word */ 820 821 ASSUME(word != 0); 822 823 /* If we can determine that the platform has a usable fast method to get 824 * this, use that */ 825 826 # if defined(PERL_CLZ_64) 827 # define PERL_HAS_FAST_GET_MSB_POS64 828 829 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word)); 830 831 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) 832 # define PERL_HAS_FAST_GET_MSB_POS64 833 834 { 835 unsigned long index; 836 _BitScanReverse64(&index, word); 837 return (unsigned)index; 838 } 839 840 # else 841 842 /* Here, we didn't find a fast method for finding the msb. Fall back to 843 * making the msb the only set bit in the word, and use our function that 844 * works on words with a single bit set. 845 * 846 * Isolate the msb; http://codeforces.com/blog/entry/10330 847 * 848 * Only the most significant set bit matters. Or'ing word with its right 849 * shift of 1 makes that bit and the next one to its right both 1. 850 * Repeating that with the right shift of 2 makes for 4 1-bits in a row. 851 * ... We end with the msb and all to the right being 1. */ 852 word |= (word >> 1); 853 word |= (word >> 2); 854 word |= (word >> 4); 855 word |= (word >> 8); 856 word |= (word >> 16); 857 word |= (word >> 32); 858 859 /* Then subtracting the right shift by 1 clears all but the left-most of 860 * the 1 bits, which is our desired result */ 861 word -= (word >> 1); 862 863 /* Now we have a single bit set */ 864 return single_1bit_pos64(word); 865 866 # endif 867 868 } 869 870 # define msbit_pos_uintmax_(word) msbit_pos64(word) 871 #else /* ! QUAD */ 872 # define msbit_pos_uintmax_(word) msbit_pos32(word) 873 #endif 874 875 PERL_STATIC_INLINE unsigned 876 Perl_msbit_pos32(U32 word) 877 { 878 /* Find the position (0..31) of the most significant set bit in the input 879 * word */ 880 881 ASSUME(word != 0); 882 883 #if defined(PERL_CLZ_32) 884 # define PERL_HAS_FAST_GET_MSB_POS32 885 886 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word)); 887 888 #elif U32SIZE == 4 && defined(_MSC_VER) 889 # define PERL_HAS_FAST_GET_MSB_POS32 890 891 { 892 unsigned long index; 893 _BitScanReverse(&index, word); 894 return (unsigned)index; 895 } 896 897 #else 898 899 word |= (word >> 1); 900 word |= (word >> 2); 901 word |= (word >> 4); 902 word |= (word >> 8); 903 word |= (word >> 16); 904 word -= (word >> 1); 905 return single_1bit_pos32(word); 906 907 #endif 908 909 } 910 911 #if UVSIZE == U64SIZE 912 # define msbit_pos(word) msbit_pos64(word) 913 # define lsbit_pos(word) lsbit_pos64(word) 914 #elif UVSIZE == U32SIZE 915 # define msbit_pos(word) msbit_pos32(word) 916 # define lsbit_pos(word) lsbit_pos32(word) 917 #endif 918 919 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 920 921 PERL_STATIC_INLINE unsigned 922 Perl_single_1bit_pos64(U64 word) 923 { 924 /* Given a 64-bit word known to contain all zero bits except one 1 bit, 925 * find and return the 1's position: 0..63 */ 926 927 # ifdef PERL_CORE /* macro not exported */ 928 ASSUME(isPOWER_OF_2(word)); 929 # else 930 ASSUME(word && (word & (word-1)) == 0); 931 # endif 932 933 /* The only set bit is both the most and least significant bit. If we have 934 * a fast way of finding either one, use that. 935 * 936 * It may appear at first glance that those functions call this one, but 937 * they don't if the corresponding #define is set */ 938 939 # ifdef PERL_HAS_FAST_GET_MSB_POS64 940 941 return msbit_pos64(word); 942 943 # elif defined(PERL_HAS_FAST_GET_LSB_POS64) 944 945 return lsbit_pos64(word); 946 947 # else 948 949 /* The position of the only set bit in a word can be quickly calculated 950 * using deBruijn sequences. See for example 951 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */ 952 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_) 953 >> PERL_deBruijnShift64_]; 954 # endif 955 956 } 957 958 #endif 959 960 PERL_STATIC_INLINE unsigned 961 Perl_single_1bit_pos32(U32 word) 962 { 963 /* Given a 32-bit word known to contain all zero bits except one 1 bit, 964 * find and return the 1's position: 0..31 */ 965 966 #ifdef PERL_CORE /* macro not exported */ 967 ASSUME(isPOWER_OF_2(word)); 968 #else 969 ASSUME(word && (word & (word-1)) == 0); 970 #endif 971 #ifdef PERL_HAS_FAST_GET_MSB_POS32 972 973 return msbit_pos32(word); 974 975 #elif defined(PERL_HAS_FAST_GET_LSB_POS32) 976 977 return lsbit_pos32(word); 978 979 /* Unlikely, but possible for the platform to have a wider fast operation but 980 * not a narrower one. But easy enough to handle the case by widening the 981 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops 982 * would be slower than the deBruijn method.) */ 983 #elif defined(PERL_HAS_FAST_GET_MSB_POS64) 984 985 return msbit_pos64(word); 986 987 #elif defined(PERL_HAS_FAST_GET_LSB_POS64) 988 989 return lsbit_pos64(word); 990 991 #else 992 993 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_) 994 >> PERL_deBruijnShift32_]; 995 #endif 996 997 } 998 999 #ifndef EBCDIC 1000 1001 PERL_STATIC_INLINE unsigned int 1002 Perl_variant_byte_number(PERL_UINTMAX_T word) 1003 { 1004 /* This returns the position in a word (0..7) of the first variant byte in 1005 * it. This is a helper function. Note that there are no branches */ 1006 1007 /* Get just the msb bits of each byte */ 1008 word &= PERL_VARIANTS_WORD_MASK; 1009 1010 /* This should only be called if we know there is a variant byte in the 1011 * word */ 1012 assert(word); 1013 1014 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 1015 1016 /* Bytes are stored like 1017 * Byte8 ... Byte2 Byte1 1018 * 63..56...15...8 7...0 1019 * so getting the lsb of the whole modified word is getting the msb of the 1020 * first byte that has its msb set */ 1021 word = lsbit_pos_uintmax_(word); 1022 1023 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert 1024 * to 0..7 */ 1025 return (unsigned int) ((word + 1) >> 3) - 1; 1026 1027 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 1028 1029 /* Bytes are stored like 1030 * Byte1 Byte2 ... Byte8 1031 * 63..56 55..47 ... 7...0 1032 * so getting the msb of the whole modified word is getting the msb of the 1033 * first byte that has its msb set */ 1034 word = msbit_pos_uintmax_(word); 1035 1036 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert 1037 * to 0..7 */ 1038 word = ((word + 1) >> 3) - 1; 1039 1040 /* And invert the result because of the reversed byte order on this 1041 * platform */ 1042 word = CHARBITS - word - 1; 1043 1044 return (unsigned int) word; 1045 1046 # else 1047 # error Unexpected byte order 1048 # endif 1049 1050 } 1051 1052 #endif 1053 #if defined(PERL_CORE) || defined(PERL_EXT) 1054 1055 /* 1056 =for apidoc variant_under_utf8_count 1057 1058 This function looks at the sequence of bytes between C<s> and C<e>, which are 1059 assumed to be encoded in ASCII/Latin1, and returns how many of them would 1060 change should the string be translated into UTF-8. Due to the nature of UTF-8, 1061 each of these would occupy two bytes instead of the single one in the input 1062 string. Thus, this function returns the precise number of bytes the string 1063 would expand by when translated to UTF-8. 1064 1065 Unlike most of the other functions that have C<utf8> in their name, the input 1066 to this function is NOT a UTF-8-encoded string. The function name is slightly 1067 I<odd> to emphasize this. 1068 1069 This function is internal to Perl because khw thinks that any XS code that 1070 would want this is probably operating too close to the internals. Presenting a 1071 valid use case could change that. 1072 1073 See also 1074 C<L<perlapi/is_utf8_invariant_string>> 1075 and 1076 C<L<perlapi/is_utf8_invariant_string_loc>>, 1077 1078 =cut 1079 1080 */ 1081 1082 PERL_STATIC_INLINE Size_t 1083 S_variant_under_utf8_count(const U8* const s, const U8* const e) 1084 { 1085 const U8* x = s; 1086 Size_t count = 0; 1087 1088 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; 1089 1090 # ifndef EBCDIC 1091 1092 /* Test if the string is long enough to use word-at-a-time. (Logic is the 1093 * same as for is_utf8_invariant_string()) */ 1094 if ((STRLEN) (e - x) >= PERL_WORDSIZE 1095 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 1096 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 1097 { 1098 1099 /* Process per-byte until reach word boundary. XXX This loop could be 1100 * eliminated if we knew that this platform had fast unaligned reads */ 1101 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 1102 count += ! UTF8_IS_INVARIANT(*x++); 1103 } 1104 1105 /* Process per-word as long as we have at least a full word left */ 1106 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an 1107 explanation of how this works */ 1108 PERL_UINTMAX_T increment 1109 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) 1110 * PERL_COUNT_MULTIPLIER) 1111 >> ((PERL_WORDSIZE - 1) * CHARBITS); 1112 count += (Size_t) increment; 1113 x += PERL_WORDSIZE; 1114 } while (x + PERL_WORDSIZE <= e); 1115 } 1116 1117 # endif 1118 1119 /* Process per-byte */ 1120 while (x < e) { 1121 if (! UTF8_IS_INVARIANT(*x)) { 1122 count++; 1123 } 1124 1125 x++; 1126 } 1127 1128 return count; 1129 } 1130 1131 #endif 1132 1133 /* Keep these around for these files */ 1134 #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C) 1135 # undef PERL_WORDSIZE 1136 # undef PERL_COUNT_MULTIPLIER 1137 # undef PERL_WORD_BOUNDARY_MASK 1138 # undef PERL_VARIANTS_WORD_MASK 1139 #endif 1140 1141 /* 1142 =for apidoc is_utf8_string 1143 1144 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1145 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will 1146 be calculated using C<strlen(s)> (which means if you use this option, that C<s> 1147 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 1148 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 1149 1150 This function considers Perl's extended UTF-8 to be valid. That means that 1151 code points above Unicode, surrogates, and non-character code points are 1152 considered valid by this function. Use C<L</is_strict_utf8_string>>, 1153 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what 1154 code points are considered valid. 1155 1156 See also 1157 C<L</is_utf8_invariant_string>>, 1158 C<L</is_utf8_invariant_string_loc>>, 1159 C<L</is_utf8_string_loc>>, 1160 C<L</is_utf8_string_loclen>>, 1161 C<L</is_utf8_fixed_width_buf_flags>>, 1162 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1163 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1164 1165 =cut 1166 */ 1167 1168 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL) 1169 1170 #if defined(PERL_CORE) || defined (PERL_EXT) 1171 1172 /* 1173 =for apidoc is_utf8_non_invariant_string 1174 1175 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first 1176 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended 1177 UTF-8; otherwise returns FALSE. 1178 1179 A TRUE return means that at least one code point represented by the sequence 1180 either is a wide character not representable as a single byte, or the 1181 representation differs depending on whether the sequence is encoded in UTF-8 or 1182 not. 1183 1184 See also 1185 C<L<perlapi/is_utf8_invariant_string>>, 1186 C<L<perlapi/is_utf8_string>> 1187 1188 =cut 1189 1190 This is commonly used to determine if a SV's UTF-8 flag should be turned on. 1191 It generally needn't be if its string is entirely UTF-8 invariant, and it 1192 shouldn't be if it otherwise contains invalid UTF-8. 1193 1194 It is an internal function because khw thinks that XS code shouldn't be working 1195 at this low a level. A valid use case could change that. 1196 1197 */ 1198 1199 PERL_STATIC_INLINE bool 1200 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len) 1201 { 1202 const U8 * first_variant; 1203 1204 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING; 1205 1206 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1207 return FALSE; 1208 } 1209 1210 return is_utf8_string(first_variant, len - (first_variant - s)); 1211 } 1212 1213 #endif 1214 1215 /* 1216 =for apidoc is_strict_utf8_string 1217 1218 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1219 UTF-8-encoded string that is fully interchangeable by any application using 1220 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be 1221 calculated using C<strlen(s)> (which means if you use this option, that C<s> 1222 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 1223 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 1224 1225 This function returns FALSE for strings containing any 1226 code points above the Unicode max of 0x10FFFF, surrogate code points, or 1227 non-character code points. 1228 1229 See also 1230 C<L</is_utf8_invariant_string>>, 1231 C<L</is_utf8_invariant_string_loc>>, 1232 C<L</is_utf8_string>>, 1233 C<L</is_utf8_string_flags>>, 1234 C<L</is_utf8_string_loc>>, 1235 C<L</is_utf8_string_loc_flags>>, 1236 C<L</is_utf8_string_loclen>>, 1237 C<L</is_utf8_string_loclen_flags>>, 1238 C<L</is_utf8_fixed_width_buf_flags>>, 1239 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1240 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1241 C<L</is_strict_utf8_string_loc>>, 1242 C<L</is_strict_utf8_string_loclen>>, 1243 C<L</is_c9strict_utf8_string>>, 1244 C<L</is_c9strict_utf8_string_loc>>, 1245 and 1246 C<L</is_c9strict_utf8_string_loclen>>. 1247 1248 =cut 1249 */ 1250 1251 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL) 1252 1253 /* 1254 =for apidoc is_c9strict_utf8_string 1255 1256 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1257 UTF-8-encoded string that conforms to 1258 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>; 1259 otherwise it returns FALSE. If C<len> is 0, it will be calculated using 1260 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded 1261 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all 1262 characters being ASCII constitute 'a valid UTF-8 string'. 1263 1264 This function returns FALSE for strings containing any code points above the 1265 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character 1266 code points per 1267 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 1268 1269 See also 1270 C<L</is_utf8_invariant_string>>, 1271 C<L</is_utf8_invariant_string_loc>>, 1272 C<L</is_utf8_string>>, 1273 C<L</is_utf8_string_flags>>, 1274 C<L</is_utf8_string_loc>>, 1275 C<L</is_utf8_string_loc_flags>>, 1276 C<L</is_utf8_string_loclen>>, 1277 C<L</is_utf8_string_loclen_flags>>, 1278 C<L</is_utf8_fixed_width_buf_flags>>, 1279 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1280 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1281 C<L</is_strict_utf8_string>>, 1282 C<L</is_strict_utf8_string_loc>>, 1283 C<L</is_strict_utf8_string_loclen>>, 1284 C<L</is_c9strict_utf8_string_loc>>, 1285 and 1286 C<L</is_c9strict_utf8_string_loclen>>. 1287 1288 =cut 1289 */ 1290 1291 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0) 1292 1293 /* 1294 =for apidoc is_utf8_string_flags 1295 1296 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1297 UTF-8 string, subject to the restrictions imposed by C<flags>; 1298 returns FALSE otherwise. If C<len> is 0, it will be calculated 1299 using C<strlen(s)> (which means if you use this option, that C<s> can't have 1300 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note 1301 that all characters being ASCII constitute 'a valid UTF-8 string'. 1302 1303 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if 1304 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 1305 as C<L</is_strict_utf8_string>>; and if C<flags> is 1306 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as 1307 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any 1308 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by 1309 C<L</utf8n_to_uvchr>>, with the same meanings. 1310 1311 See also 1312 C<L</is_utf8_invariant_string>>, 1313 C<L</is_utf8_invariant_string_loc>>, 1314 C<L</is_utf8_string>>, 1315 C<L</is_utf8_string_loc>>, 1316 C<L</is_utf8_string_loc_flags>>, 1317 C<L</is_utf8_string_loclen>>, 1318 C<L</is_utf8_string_loclen_flags>>, 1319 C<L</is_utf8_fixed_width_buf_flags>>, 1320 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1321 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1322 C<L</is_strict_utf8_string>>, 1323 C<L</is_strict_utf8_string_loc>>, 1324 C<L</is_strict_utf8_string_loclen>>, 1325 C<L</is_c9strict_utf8_string>>, 1326 C<L</is_c9strict_utf8_string_loc>>, 1327 and 1328 C<L</is_c9strict_utf8_string_loclen>>. 1329 1330 =cut 1331 */ 1332 1333 PERL_STATIC_INLINE bool 1334 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) 1335 { 1336 const U8 * first_variant; 1337 1338 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; 1339 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1340 |UTF8_DISALLOW_PERL_EXTENDED))); 1341 1342 if (len == 0) { 1343 len = strlen((const char *)s); 1344 } 1345 1346 if (flags == 0) { 1347 return is_utf8_string(s, len); 1348 } 1349 1350 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1351 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 1352 { 1353 return is_strict_utf8_string(s, len); 1354 } 1355 1356 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1357 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 1358 { 1359 return is_c9strict_utf8_string(s, len); 1360 } 1361 1362 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { 1363 const U8* const send = s + len; 1364 const U8* x = first_variant; 1365 1366 while (x < send) { 1367 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 1368 if (UNLIKELY(! cur_len)) { 1369 return FALSE; 1370 } 1371 x += cur_len; 1372 } 1373 } 1374 1375 return TRUE; 1376 } 1377 1378 /* 1379 1380 =for apidoc is_utf8_string_loc 1381 1382 Like C<L</is_utf8_string>> but stores the location of the failure (in the 1383 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1384 "utf8ness success") in the C<ep> pointer. 1385 1386 See also C<L</is_utf8_string_loclen>>. 1387 1388 =cut 1389 */ 1390 1391 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) 1392 1393 /* 1394 1395 =for apidoc is_utf8_string_loclen 1396 1397 Like C<L</is_utf8_string>> but stores the location of the failure (in the 1398 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1399 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1400 encoded characters in the C<el> pointer. 1401 1402 See also C<L</is_utf8_string_loc>>. 1403 1404 =cut 1405 */ 1406 1407 PERL_STATIC_INLINE bool 1408 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1409 { 1410 const U8 * first_variant; 1411 1412 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; 1413 1414 if (len == 0) { 1415 len = strlen((const char *) s); 1416 } 1417 1418 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1419 if (el) 1420 *el = len; 1421 1422 if (ep) { 1423 *ep = s + len; 1424 } 1425 1426 return TRUE; 1427 } 1428 1429 { 1430 const U8* const send = s + len; 1431 const U8* x = first_variant; 1432 STRLEN outlen = first_variant - s; 1433 1434 while (x < send) { 1435 const STRLEN cur_len = isUTF8_CHAR(x, send); 1436 if (UNLIKELY(! cur_len)) { 1437 break; 1438 } 1439 x += cur_len; 1440 outlen++; 1441 } 1442 1443 if (el) 1444 *el = outlen; 1445 1446 if (ep) { 1447 *ep = x; 1448 } 1449 1450 return (x == send); 1451 } 1452 } 1453 1454 /* The perl core arranges to never call the DFA below without there being at 1455 * least one byte available to look at. This allows the DFA to use a do {} 1456 * while loop which means that calling it with a UTF-8 invariant has a single 1457 * conditional, same as the calling code checking for invariance ahead of time. 1458 * And having the calling code remove that conditional speeds up by that 1459 * conditional, the case where it wasn't invariant. So there's no reason to 1460 * check before caling this. 1461 * 1462 * But we don't know this for non-core calls, so have to retain the check for 1463 * them. */ 1464 #ifdef PERL_CORE 1465 # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s)) 1466 #else 1467 # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE 1468 #endif 1469 1470 /* 1471 * DFA for checking input is valid UTF-8 syntax. 1472 * 1473 * This uses adaptations of the table and algorithm given in 1474 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1475 * documentation of the original version. A copyright notice for the original 1476 * version is given at the beginning of this file. The Perl adaptations are 1477 * documented at the definition of PL_extended_utf8_dfa_tab[]. 1478 * 1479 * This dfa is fast. There are three exit conditions: 1480 * 1) a well-formed code point, acceptable to the table 1481 * 2) the beginning bytes of an incomplete character, whose completion might 1482 * or might not be acceptable 1483 * 3) unacceptable to the table. Some of the adaptations have certain, 1484 * hopefully less likely to occur, legal inputs be unacceptable to the 1485 * table, so these must be sorted out afterwards. 1486 * 1487 * This macro is a complete implementation of the code executing the DFA. It 1488 * is passed the input sequence bounds and the table to use, and what to do 1489 * for each of the exit conditions. There are three canned actions, likely to 1490 * be the ones you want: 1491 * DFA_RETURN_SUCCESS_ 1492 * DFA_RETURN_FAILURE_ 1493 * DFA_GOTO_TEASE_APART_FF_ 1494 * 1495 * You pass a parameter giving the action to take for each of the three 1496 * possible exit conditions: 1497 * 1498 * 'accept_action' This is executed when the DFA accepts the input. 1499 * DFA_RETURN_SUCCESS_ is the most likely candidate. 1500 * 'reject_action' This is executed when the DFA rejects the input. 1501 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where 1502 * you have written code to distinguish the rejecting state 1503 * results. Because it happens in several places, and 1504 * involves #ifdefs, the special action 1505 * DFA_GOTO_TEASE_APART_FF_ is what you want with 1506 * PL_extended_utf8_dfa_tab. On platforms without 1507 * EXTRA_LONG_UTF8, there is no need to tease anything apart, 1508 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you 1509 * need to have a label 'tease_apart_FF' that it will transfer 1510 * to. 1511 * 'incomplete_char_action' This is executed when the DFA ran off the end 1512 * before accepting or rejecting the input. 1513 * DFA_RETURN_FAILURE_ is the likely action, but you could 1514 * have a 'goto', or NOOP. In the latter case the DFA drops 1515 * off the end, and you place your code to handle this case 1516 * immediately after it. 1517 */ 1518 1519 #define DFA_RETURN_SUCCESS_ return s - s0 1520 #define DFA_RETURN_FAILURE_ return 0 1521 #ifdef HAS_EXTRA_LONG_UTF8 1522 # define DFA_TEASE_APART_FF_ goto tease_apart_FF 1523 #else 1524 # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_ 1525 #endif 1526 1527 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \ 1528 accept_action, \ 1529 reject_action, \ 1530 incomplete_char_action) \ 1531 STMT_START { \ 1532 const U8 * s = s0; \ 1533 const U8 * e_ = e; \ 1534 UV state = 0; \ 1535 \ 1536 PERL_NON_CORE_CHECK_EMPTY(s, e_); \ 1537 \ 1538 do { \ 1539 state = dfa_tab[256 + state + dfa_tab[*s]]; \ 1540 s++; \ 1541 \ 1542 if (state == 0) { /* Accepting state */ \ 1543 accept_action; \ 1544 } \ 1545 \ 1546 if (UNLIKELY(state == 1)) { /* Rejecting state */ \ 1547 reject_action; \ 1548 } \ 1549 } while (s < e_); \ 1550 \ 1551 /* Here, dropped out of loop before end-of-char */ \ 1552 incomplete_char_action; \ 1553 } STMT_END 1554 1555 1556 /* 1557 1558 =for apidoc isUTF8_CHAR 1559 1560 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1561 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 1562 that represents some code point; otherwise it evaluates to 0. If non-zero, the 1563 value gives how many bytes starting at C<s> comprise the code point's 1564 representation. Any bytes remaining before C<e>, but beyond the ones needed to 1565 form the first code point in C<s>, are not examined. 1566 1567 The code point can be any that will fit in an IV on this machine, using Perl's 1568 extension to official UTF-8 to represent those higher than the Unicode maximum 1569 of 0x10FFFF. That means that this macro is used to efficiently decide if the 1570 next few bytes in C<s> is legal UTF-8 for a single character. 1571 1572 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those 1573 defined by Unicode to be fully interchangeable across applications; 1574 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1575 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1576 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1577 1578 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and 1579 C<L</is_utf8_string_loclen>> to check entire strings. 1580 1581 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC 1582 machines) is a valid UTF-8 character. 1583 1584 =cut 1585 1586 This uses an adaptation of the table and algorithm given in 1587 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1588 documentation of the original version. A copyright notice for the original 1589 version is given at the beginning of this file. The Perl adaptation is 1590 documented at the definition of PL_extended_utf8_dfa_tab[]. 1591 */ 1592 1593 PERL_STATIC_INLINE Size_t 1594 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) 1595 { 1596 PERL_ARGS_ASSERT_ISUTF8_CHAR; 1597 1598 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 1599 DFA_RETURN_SUCCESS_, 1600 DFA_TEASE_APART_FF_, 1601 DFA_RETURN_FAILURE_); 1602 1603 /* Here, we didn't return success, but dropped out of the loop. In the 1604 * case of PL_extended_utf8_dfa_tab, this means the input is either 1605 * malformed, or the start byte was FF on a platform that the dfa doesn't 1606 * handle FF's. Call a helper function. */ 1607 1608 #ifdef HAS_EXTRA_LONG_UTF8 1609 1610 tease_apart_FF: 1611 1612 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 1613 * either malformed, or was for the largest possible start byte, which we 1614 * now check, not inline */ 1615 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) { 1616 return 0; 1617 } 1618 1619 return is_utf8_FF_helper_(s0, e, 1620 FALSE /* require full, not partial char */ 1621 ); 1622 #endif 1623 1624 } 1625 1626 /* 1627 1628 =for apidoc isSTRICT_UTF8_CHAR 1629 1630 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1631 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1632 Unicode code point completely acceptable for open interchange between all 1633 applications; otherwise it evaluates to 0. If non-zero, the value gives how 1634 many bytes starting at C<s> comprise the code point's representation. Any 1635 bytes remaining before C<e>, but beyond the ones needed to form the first code 1636 point in C<s>, are not examined. 1637 1638 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not 1639 be a surrogate nor a non-character code point. Thus this excludes any code 1640 point from Perl's extended UTF-8. 1641 1642 This is used to efficiently decide if the next few bytes in C<s> is 1643 legal Unicode-acceptable UTF-8 for a single character. 1644 1645 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1646 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1647 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; 1648 and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1649 1650 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and 1651 C<L</is_strict_utf8_string_loclen>> to check entire strings. 1652 1653 =cut 1654 1655 This uses an adaptation of the tables and algorithm given in 1656 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1657 documentation of the original version. A copyright notice for the original 1658 version is given at the beginning of this file. The Perl adaptation is 1659 documented at the definition of strict_extended_utf8_dfa_tab[]. 1660 1661 */ 1662 1663 PERL_STATIC_INLINE Size_t 1664 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1665 { 1666 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; 1667 1668 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, 1669 DFA_RETURN_SUCCESS_, 1670 goto check_hanguls, 1671 DFA_RETURN_FAILURE_); 1672 check_hanguls: 1673 1674 /* Here, we didn't return success, but dropped out of the loop. In the 1675 * case of PL_strict_utf8_dfa_tab, this means the input is either 1676 * malformed, or was for certain Hanguls; handle them specially */ 1677 1678 /* The dfa above drops out for incomplete or illegal inputs, and certain 1679 * legal Hanguls; check and return accordingly */ 1680 return is_HANGUL_ED_utf8_safe(s0, e); 1681 } 1682 1683 /* 1684 1685 =for apidoc isC9_STRICT_UTF8_CHAR 1686 1687 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1688 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1689 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, 1690 the value gives how many bytes starting at C<s> comprise the code point's 1691 representation. Any bytes remaining before C<e>, but beyond the ones needed to 1692 form the first code point in C<s>, are not examined. 1693 1694 The largest acceptable code point is the Unicode maximum 0x10FFFF. This 1695 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character 1696 code points. This corresponds to 1697 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 1698 which said that non-character code points are merely discouraged rather than 1699 completely forbidden in open interchange. See 1700 L<perlunicode/Noncharacter code points>. 1701 1702 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and 1703 C<L</isUTF8_CHAR_flags>> for a more customized definition. 1704 1705 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and 1706 C<L</is_c9strict_utf8_string_loclen>> to check entire strings. 1707 1708 =cut 1709 1710 This uses an adaptation of the tables and algorithm given in 1711 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1712 documentation of the original version. A copyright notice for the original 1713 version is given at the beginning of this file. The Perl adaptation is 1714 documented at the definition of PL_c9_utf8_dfa_tab[]. 1715 1716 */ 1717 1718 PERL_STATIC_INLINE Size_t 1719 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1720 { 1721 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; 1722 1723 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab, 1724 DFA_RETURN_SUCCESS_, 1725 DFA_RETURN_FAILURE_, 1726 DFA_RETURN_FAILURE_); 1727 } 1728 1729 /* 1730 1731 =for apidoc is_strict_utf8_string_loc 1732 1733 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1734 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1735 "utf8ness success") in the C<ep> pointer. 1736 1737 See also C<L</is_strict_utf8_string_loclen>>. 1738 1739 =cut 1740 */ 1741 1742 #define is_strict_utf8_string_loc(s, len, ep) \ 1743 is_strict_utf8_string_loclen(s, len, ep, 0) 1744 1745 /* 1746 1747 =for apidoc is_strict_utf8_string_loclen 1748 1749 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1750 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1751 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1752 encoded characters in the C<el> pointer. 1753 1754 See also C<L</is_strict_utf8_string_loc>>. 1755 1756 =cut 1757 */ 1758 1759 PERL_STATIC_INLINE bool 1760 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1761 { 1762 const U8 * first_variant; 1763 1764 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; 1765 1766 if (len == 0) { 1767 len = strlen((const char *) s); 1768 } 1769 1770 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1771 if (el) 1772 *el = len; 1773 1774 if (ep) { 1775 *ep = s + len; 1776 } 1777 1778 return TRUE; 1779 } 1780 1781 { 1782 const U8* const send = s + len; 1783 const U8* x = first_variant; 1784 STRLEN outlen = first_variant - s; 1785 1786 while (x < send) { 1787 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); 1788 if (UNLIKELY(! cur_len)) { 1789 break; 1790 } 1791 x += cur_len; 1792 outlen++; 1793 } 1794 1795 if (el) 1796 *el = outlen; 1797 1798 if (ep) { 1799 *ep = x; 1800 } 1801 1802 return (x == send); 1803 } 1804 } 1805 1806 /* 1807 1808 =for apidoc is_c9strict_utf8_string_loc 1809 1810 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1811 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1812 "utf8ness success") in the C<ep> pointer. 1813 1814 See also C<L</is_c9strict_utf8_string_loclen>>. 1815 1816 =cut 1817 */ 1818 1819 #define is_c9strict_utf8_string_loc(s, len, ep) \ 1820 is_c9strict_utf8_string_loclen(s, len, ep, 0) 1821 1822 /* 1823 1824 =for apidoc is_c9strict_utf8_string_loclen 1825 1826 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1827 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1828 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded 1829 characters in the C<el> pointer. 1830 1831 See also C<L</is_c9strict_utf8_string_loc>>. 1832 1833 =cut 1834 */ 1835 1836 PERL_STATIC_INLINE bool 1837 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1838 { 1839 const U8 * first_variant; 1840 1841 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; 1842 1843 if (len == 0) { 1844 len = strlen((const char *) s); 1845 } 1846 1847 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1848 if (el) 1849 *el = len; 1850 1851 if (ep) { 1852 *ep = s + len; 1853 } 1854 1855 return TRUE; 1856 } 1857 1858 { 1859 const U8* const send = s + len; 1860 const U8* x = first_variant; 1861 STRLEN outlen = first_variant - s; 1862 1863 while (x < send) { 1864 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); 1865 if (UNLIKELY(! cur_len)) { 1866 break; 1867 } 1868 x += cur_len; 1869 outlen++; 1870 } 1871 1872 if (el) 1873 *el = outlen; 1874 1875 if (ep) { 1876 *ep = x; 1877 } 1878 1879 return (x == send); 1880 } 1881 } 1882 1883 /* 1884 1885 =for apidoc is_utf8_string_loc_flags 1886 1887 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1888 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1889 "utf8ness success") in the C<ep> pointer. 1890 1891 See also C<L</is_utf8_string_loclen_flags>>. 1892 1893 =cut 1894 */ 1895 1896 #define is_utf8_string_loc_flags(s, len, ep, flags) \ 1897 is_utf8_string_loclen_flags(s, len, ep, 0, flags) 1898 1899 1900 /* The above 3 actual functions could have been moved into the more general one 1901 * just below, and made #defines that call it with the right 'flags'. They are 1902 * currently kept separate to increase their chances of getting inlined */ 1903 1904 /* 1905 1906 =for apidoc is_utf8_string_loclen_flags 1907 1908 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1909 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1910 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1911 encoded characters in the C<el> pointer. 1912 1913 See also C<L</is_utf8_string_loc_flags>>. 1914 1915 =cut 1916 */ 1917 1918 PERL_STATIC_INLINE bool 1919 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) 1920 { 1921 const U8 * first_variant; 1922 1923 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; 1924 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1925 |UTF8_DISALLOW_PERL_EXTENDED))); 1926 1927 if (len == 0) { 1928 len = strlen((const char *) s); 1929 } 1930 1931 if (flags == 0) { 1932 return is_utf8_string_loclen(s, len, ep, el); 1933 } 1934 1935 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1936 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 1937 { 1938 return is_strict_utf8_string_loclen(s, len, ep, el); 1939 } 1940 1941 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1942 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 1943 { 1944 return is_c9strict_utf8_string_loclen(s, len, ep, el); 1945 } 1946 1947 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1948 if (el) 1949 *el = len; 1950 1951 if (ep) { 1952 *ep = s + len; 1953 } 1954 1955 return TRUE; 1956 } 1957 1958 { 1959 const U8* send = s + len; 1960 const U8* x = first_variant; 1961 STRLEN outlen = first_variant - s; 1962 1963 while (x < send) { 1964 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 1965 if (UNLIKELY(! cur_len)) { 1966 break; 1967 } 1968 x += cur_len; 1969 outlen++; 1970 } 1971 1972 if (el) 1973 *el = outlen; 1974 1975 if (ep) { 1976 *ep = x; 1977 } 1978 1979 return (x == send); 1980 } 1981 } 1982 1983 /* 1984 =for apidoc utf8_distance 1985 1986 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 1987 and C<b>. 1988 1989 WARNING: use only if you *know* that the pointers point inside the 1990 same UTF-8 buffer. 1991 1992 =cut 1993 */ 1994 1995 PERL_STATIC_INLINE IV 1996 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 1997 { 1998 PERL_ARGS_ASSERT_UTF8_DISTANCE; 1999 2000 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 2001 } 2002 2003 /* 2004 =for apidoc utf8_hop 2005 2006 Return the UTF-8 pointer C<s> displaced by C<off> characters, either 2007 forward (if C<off> is positive) or backward (if negative). C<s> does not need 2008 to be pointing to the starting byte of a character. If it isn't, one count of 2009 C<off> will be used up to get to the start of the next character for forward 2010 hops, and to the start of the current character for negative ones. 2011 2012 WARNING: Prefer L</utf8_hop_safe> to this one. 2013 2014 Do NOT use this function unless you B<know> C<off> is within 2015 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned 2016 on the first byte of a character or just after the last byte of a character. 2017 2018 =cut 2019 */ 2020 2021 PERL_STATIC_INLINE U8 * 2022 Perl_utf8_hop(const U8 *s, SSize_t off) 2023 { 2024 PERL_ARGS_ASSERT_UTF8_HOP; 2025 2026 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2027 * the XXX bitops (especially ~) can create illegal UTF-8. 2028 * In other words: in Perl UTF-8 is not just for Unicode. */ 2029 2030 if (off > 0) { 2031 2032 /* Get to next non-continuation byte */ 2033 if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) { 2034 do { 2035 s++; 2036 } 2037 while (UTF8_IS_CONTINUATION(*s)); 2038 off--; 2039 } 2040 2041 while (off--) 2042 s += UTF8SKIP(s); 2043 } 2044 else { 2045 while (off++) { 2046 s--; 2047 while (UTF8_IS_CONTINUATION(*s)) 2048 s--; 2049 } 2050 } 2051 2052 GCC_DIAG_IGNORE(-Wcast-qual) 2053 return (U8 *)s; 2054 GCC_DIAG_RESTORE 2055 } 2056 2057 /* 2058 =for apidoc utf8_hop_forward 2059 2060 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2061 forward. C<s> does not need to be pointing to the starting byte of a 2062 character. If it isn't, one count of C<off> will be used up to get to the 2063 start of the next character. 2064 2065 C<off> must be non-negative. 2066 2067 C<s> must be before or equal to C<end>. 2068 2069 When moving forward it will not move beyond C<end>. 2070 2071 Will not exceed this limit even if the string is not valid "UTF-8". 2072 2073 =cut 2074 */ 2075 2076 PERL_STATIC_INLINE U8 * 2077 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) 2078 { 2079 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD; 2080 2081 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2082 * the bitops (especially ~) can create illegal UTF-8. 2083 * In other words: in Perl UTF-8 is not just for Unicode. */ 2084 2085 assert(s <= end); 2086 assert(off >= 0); 2087 2088 if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) { 2089 /* Get to next non-continuation byte */ 2090 do { 2091 s++; 2092 } 2093 while (UTF8_IS_CONTINUATION(*s)); 2094 off--; 2095 } 2096 2097 while (off--) { 2098 STRLEN skip = UTF8SKIP(s); 2099 if ((STRLEN)(end - s) <= skip) { 2100 GCC_DIAG_IGNORE(-Wcast-qual) 2101 return (U8 *)end; 2102 GCC_DIAG_RESTORE 2103 } 2104 s += skip; 2105 } 2106 2107 GCC_DIAG_IGNORE(-Wcast-qual) 2108 return (U8 *)s; 2109 GCC_DIAG_RESTORE 2110 } 2111 2112 /* 2113 =for apidoc utf8_hop_back 2114 2115 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2116 backward. C<s> does not need to be pointing to the starting byte of a 2117 character. If it isn't, one count of C<off> will be used up to get to that 2118 start. 2119 2120 C<off> must be non-positive. 2121 2122 C<s> must be after or equal to C<start>. 2123 2124 When moving backward it will not move before C<start>. 2125 2126 Will not exceed this limit even if the string is not valid "UTF-8". 2127 2128 =cut 2129 */ 2130 2131 PERL_STATIC_INLINE U8 * 2132 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) 2133 { 2134 PERL_ARGS_ASSERT_UTF8_HOP_BACK; 2135 2136 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2137 * the bitops (especially ~) can create illegal UTF-8. 2138 * In other words: in Perl UTF-8 is not just for Unicode. */ 2139 2140 assert(start <= s); 2141 assert(off <= 0); 2142 2143 /* Note: if we know that the input is well-formed, we can do per-word 2144 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented 2145 * that. But it was reverted because doing per-word has some 2146 * start-up/tear-down overhead, so only makes sense if the distance to be 2147 * moved is large, and core perl doesn't currently move more than a few 2148 * characters at a time. You can reinstate it if it does become 2149 * advantageous. */ 2150 while (off++ && s > start) { 2151 do { 2152 s--; 2153 } while (UTF8_IS_CONTINUATION(*s) && s > start); 2154 } 2155 2156 GCC_DIAG_IGNORE(-Wcast-qual) 2157 return (U8 *)s; 2158 GCC_DIAG_RESTORE 2159 } 2160 2161 /* 2162 =for apidoc utf8_hop_safe 2163 2164 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2165 either forward or backward. C<s> does not need to be pointing to the starting 2166 byte of a character. If it isn't, one count of C<off> will be used up to get 2167 to the start of the next character for forward hops, and to the start of the 2168 current character for negative ones. 2169 2170 When moving backward it will not move before C<start>. 2171 2172 When moving forward it will not move beyond C<end>. 2173 2174 Will not exceed those limits even if the string is not valid "UTF-8". 2175 2176 =cut 2177 */ 2178 2179 PERL_STATIC_INLINE U8 * 2180 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) 2181 { 2182 PERL_ARGS_ASSERT_UTF8_HOP_SAFE; 2183 2184 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2185 * the bitops (especially ~) can create illegal UTF-8. 2186 * In other words: in Perl UTF-8 is not just for Unicode. */ 2187 2188 assert(start <= s && s <= end); 2189 2190 if (off >= 0) { 2191 return utf8_hop_forward(s, off, end); 2192 } 2193 else { 2194 return utf8_hop_back(s, off, start); 2195 } 2196 } 2197 2198 /* 2199 2200 =for apidoc isUTF8_CHAR_flags 2201 2202 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 2203 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 2204 that represents some code point, subject to the restrictions given by C<flags>; 2205 otherwise it evaluates to 0. If non-zero, the value gives how many bytes 2206 starting at C<s> comprise the code point's representation. Any bytes remaining 2207 before C<e>, but beyond the ones needed to form the first code point in C<s>, 2208 are not examined. 2209 2210 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>; 2211 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 2212 as C<L</isSTRICT_UTF8_CHAR>>; 2213 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives 2214 the same results as C<L</isC9_STRICT_UTF8_CHAR>>. 2215 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags 2216 understood by C<L</utf8n_to_uvchr>>, with the same meanings. 2217 2218 The three alternative macros are for the most commonly needed validations; they 2219 are likely to run somewhat faster than this more general one, as they can be 2220 inlined into your code. 2221 2222 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and 2223 L</is_utf8_string_loclen_flags> to check entire strings. 2224 2225 =cut 2226 */ 2227 2228 PERL_STATIC_INLINE STRLEN 2229 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags) 2230 { 2231 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS; 2232 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2233 |UTF8_DISALLOW_PERL_EXTENDED))); 2234 2235 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 2236 goto check_success, 2237 DFA_TEASE_APART_FF_, 2238 DFA_RETURN_FAILURE_); 2239 2240 check_success: 2241 2242 return is_utf8_char_helper_(s0, e, flags); 2243 2244 #ifdef HAS_EXTRA_LONG_UTF8 2245 2246 tease_apart_FF: 2247 2248 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 2249 * either malformed, or was for the largest possible start byte, which 2250 * indicates perl extended UTF-8, well above the Unicode maximum */ 2251 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 2252 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 2253 { 2254 return 0; 2255 } 2256 2257 /* Otherwise examine the sequence not inline */ 2258 return is_utf8_FF_helper_(s0, e, 2259 FALSE /* require full, not partial char */ 2260 ); 2261 #endif 2262 2263 } 2264 2265 /* 2266 2267 =for apidoc is_utf8_valid_partial_char 2268 2269 Returns 0 if the sequence of bytes starting at C<s> and looking no further than 2270 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code 2271 points. Otherwise, it returns 1 if there exists at least one non-empty 2272 sequence of bytes that when appended to sequence C<s>, starting at position 2273 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point; 2274 otherwise returns 0. 2275 2276 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code 2277 point. 2278 2279 This is useful when a fixed-length buffer is being tested for being well-formed 2280 UTF-8, but the final few bytes in it don't comprise a full character; that is, 2281 it is split somewhere in the middle of the final code point's UTF-8 2282 representation. (Presumably when the buffer is refreshed with the next chunk 2283 of data, the new first bytes will complete the partial code point.) This 2284 function is used to verify that the final bytes in the current buffer are in 2285 fact the legal beginning of some code point, so that if they aren't, the 2286 failure can be signalled without having to wait for the next read. 2287 2288 =cut 2289 */ 2290 #define is_utf8_valid_partial_char(s, e) \ 2291 is_utf8_valid_partial_char_flags(s, e, 0) 2292 2293 /* 2294 2295 =for apidoc is_utf8_valid_partial_char_flags 2296 2297 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether 2298 or not the input is a valid UTF-8 encoded partial character, but it takes an 2299 extra parameter, C<flags>, which can further restrict which code points are 2300 considered valid. 2301 2302 If C<flags> is 0, this behaves identically to 2303 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination 2304 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If 2305 there is any sequence of bytes that can complete the input partial character in 2306 such a way that a non-prohibited character is formed, the function returns 2307 TRUE; otherwise FALSE. Non character code points cannot be determined based on 2308 partial character input. But many of the other possible excluded types can be 2309 determined from just the first one or two bytes. 2310 2311 =cut 2312 */ 2313 2314 PERL_STATIC_INLINE bool 2315 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags) 2316 { 2317 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; 2318 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2319 |UTF8_DISALLOW_PERL_EXTENDED))); 2320 2321 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 2322 DFA_RETURN_FAILURE_, 2323 DFA_TEASE_APART_FF_, 2324 NOOP); 2325 2326 /* The NOOP above causes the DFA to drop down here iff the input was a 2327 * partial character. flags=0 => can return TRUE immediately; otherwise we 2328 * need to check (not inline) if the partial character is the beginning of 2329 * a disallowed one */ 2330 if (flags == 0) { 2331 return TRUE; 2332 } 2333 2334 return cBOOL(is_utf8_char_helper_(s0, e, flags)); 2335 2336 #ifdef HAS_EXTRA_LONG_UTF8 2337 2338 tease_apart_FF: 2339 2340 /* Getting here means the input is either malformed, or, in the case of 2341 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The 2342 * latter case has to be extended UTF-8, so can fail immediately if that is 2343 * forbidden */ 2344 2345 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 2346 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 2347 { 2348 return 0; 2349 } 2350 2351 return is_utf8_FF_helper_(s0, e, 2352 TRUE /* Require to be a partial character */ 2353 ); 2354 #endif 2355 2356 } 2357 2358 /* 2359 2360 =for apidoc is_utf8_fixed_width_buf_flags 2361 2362 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len> 2363 is entirely valid UTF-8, subject to the restrictions given by C<flags>; 2364 otherwise it returns FALSE. 2365 2366 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted 2367 without restriction. If the final few bytes of the buffer do not form a 2368 complete code point, this will return TRUE anyway, provided that 2369 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them. 2370 2371 If C<flags> in non-zero, it can be any combination of the 2372 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the 2373 same meanings. 2374 2375 This function differs from C<L</is_utf8_string_flags>> only in that the latter 2376 returns FALSE if the final few bytes of the string don't form a complete code 2377 point. 2378 2379 =cut 2380 */ 2381 #define is_utf8_fixed_width_buf_flags(s, len, flags) \ 2382 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags) 2383 2384 /* 2385 2386 =for apidoc is_utf8_fixed_width_buf_loc_flags 2387 2388 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the 2389 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point 2390 to the beginning of any partial character at the end of the buffer; if there is 2391 no partial character C<*ep> will contain C<s>+C<len>. 2392 2393 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>. 2394 2395 =cut 2396 */ 2397 2398 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \ 2399 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags) 2400 2401 /* 2402 2403 =for apidoc is_utf8_fixed_width_buf_loclen_flags 2404 2405 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of 2406 complete, valid characters found in the C<el> pointer. 2407 2408 =cut 2409 */ 2410 2411 PERL_STATIC_INLINE bool 2412 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, 2413 STRLEN len, 2414 const U8 **ep, 2415 STRLEN *el, 2416 const U32 flags) 2417 { 2418 const U8 * maybe_partial; 2419 2420 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS; 2421 2422 if (! ep) { 2423 ep = &maybe_partial; 2424 } 2425 2426 /* If it's entirely valid, return that; otherwise see if the only error is 2427 * that the final few bytes are for a partial character */ 2428 return is_utf8_string_loclen_flags(s, len, ep, el, flags) 2429 || is_utf8_valid_partial_char_flags(*ep, s + len, flags); 2430 } 2431 2432 PERL_STATIC_INLINE UV 2433 Perl_utf8n_to_uvchr_msgs(const U8 *s, 2434 STRLEN curlen, 2435 STRLEN *retlen, 2436 const U32 flags, 2437 U32 * errors, 2438 AV ** msgs) 2439 { 2440 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the 2441 * simple cases, and, if necessary calls a helper function to deal with the 2442 * more complex ones. Almost all well-formed non-problematic code points 2443 * are considered simple, so that it's unlikely that the helper function 2444 * will need to be called. 2445 * 2446 * This is an adaptation of the tables and algorithm given in 2447 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides 2448 * comprehensive documentation of the original version. A copyright notice 2449 * for the original version is given at the beginning of this file. The 2450 * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[]. 2451 */ 2452 2453 const U8 * const s0 = s; 2454 const U8 * send = s0 + curlen; 2455 UV type; 2456 UV uv; 2457 2458 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; 2459 2460 /* This dfa is fast. If it accepts the input, it was for a well-formed, 2461 * non-problematic code point, which can be returned immediately. 2462 * Otherwise we call a helper function to figure out the more complicated 2463 * cases. */ 2464 2465 /* No calls from core pass in an empty string; non-core need a check */ 2466 #ifdef PERL_CORE 2467 assert(curlen > 0); 2468 #else 2469 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen, 2470 flags, errors, msgs); 2471 #endif 2472 2473 type = PL_strict_utf8_dfa_tab[*s]; 2474 2475 /* The table is structured so that 'type' is 0 iff the input byte is 2476 * represented identically regardless of the UTF-8ness of the string */ 2477 if (type == 0) { /* UTF-8 invariants are returned unchanged */ 2478 uv = *s; 2479 } 2480 else { 2481 UV state = PL_strict_utf8_dfa_tab[256 + type]; 2482 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s); 2483 2484 while (++s < send) { 2485 type = PL_strict_utf8_dfa_tab[*s]; 2486 state = PL_strict_utf8_dfa_tab[256 + state + type]; 2487 2488 uv = UTF8_ACCUMULATE(uv, *s); 2489 2490 if (state == 0) { 2491 #ifdef EBCDIC 2492 uv = UNI_TO_NATIVE(uv); 2493 #endif 2494 goto success; 2495 } 2496 2497 if (UNLIKELY(state == 1)) { 2498 break; 2499 } 2500 } 2501 2502 /* Here is potentially problematic. Use the full mechanism */ 2503 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, 2504 errors, msgs); 2505 } 2506 2507 success: 2508 if (retlen) { 2509 *retlen = s - s0 + 1; 2510 } 2511 if (errors) { 2512 *errors = 0; 2513 } 2514 if (msgs) { 2515 *msgs = NULL; 2516 } 2517 2518 return uv; 2519 } 2520 2521 PERL_STATIC_INLINE UV 2522 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 2523 { 2524 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; 2525 2526 assert(s < send); 2527 2528 if (! ckWARN_d(WARN_UTF8)) { 2529 2530 /* EMPTY is not really allowed, and asserts on debugging builds. But 2531 * on non-debugging we have to deal with it, and this causes it to 2532 * return the REPLACEMENT CHARACTER, as the documentation indicates */ 2533 return utf8n_to_uvchr(s, send - s, retlen, 2534 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); 2535 } 2536 else { 2537 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); 2538 if (retlen && ret == 0 && (send <= s || *s != '\0')) { 2539 *retlen = (STRLEN) -1; 2540 } 2541 2542 return ret; 2543 } 2544 } 2545 2546 /* ------------------------------- perl.h ----------------------------- */ 2547 2548 /* 2549 =for apidoc_section $utility 2550 2551 =for apidoc is_safe_syscall 2552 2553 Test that the given C<pv> (with length C<len>) doesn't contain any internal 2554 C<NUL> characters. 2555 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls> 2556 category, and return FALSE. 2557 2558 Return TRUE if the name is safe. 2559 2560 C<what> and C<op_name> are used in any warning. 2561 2562 Used by the C<IS_SAFE_SYSCALL()> macro. 2563 2564 =cut 2565 */ 2566 2567 PERL_STATIC_INLINE bool 2568 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) 2569 { 2570 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs 2571 * perl itself uses xce*() functions which accept 8-bit strings. 2572 */ 2573 2574 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL; 2575 2576 if (len > 1) { 2577 char *null_at; 2578 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) { 2579 SETERRNO(ENOENT, LIB_INVARG); 2580 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS), 2581 "Invalid \\0 character in %s for %s: %s\\0%s", 2582 what, op_name, pv, null_at+1); 2583 return FALSE; 2584 } 2585 } 2586 2587 return TRUE; 2588 } 2589 2590 /* 2591 2592 Return true if the supplied filename has a newline character 2593 immediately before the first (hopefully only) NUL. 2594 2595 My original look at this incorrectly used the len from SvPV(), but 2596 that's incorrect, since we allow for a NUL in pv[len-1]. 2597 2598 So instead, strlen() and work from there. 2599 2600 This allow for the user reading a filename, forgetting to chomp it, 2601 then calling: 2602 2603 open my $foo, "$file\0"; 2604 2605 */ 2606 2607 #ifdef PERL_CORE 2608 2609 PERL_STATIC_INLINE bool 2610 S_should_warn_nl(const char *pv) 2611 { 2612 STRLEN len; 2613 2614 PERL_ARGS_ASSERT_SHOULD_WARN_NL; 2615 2616 len = strlen(pv); 2617 2618 return len > 0 && pv[len-1] == '\n'; 2619 } 2620 2621 #endif 2622 2623 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) 2624 2625 PERL_STATIC_INLINE bool 2626 S_lossless_NV_to_IV(const NV nv, IV *ivp) 2627 { 2628 /* This function determines if the input NV 'nv' may be converted without 2629 * loss of data to an IV. If not, it returns FALSE taking no other action. 2630 * But if it is possible, it does the conversion, returning TRUE, and 2631 * storing the converted result in '*ivp' */ 2632 2633 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; 2634 2635 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) 2636 /* Normally any comparison with a NaN returns false; if we can't rely 2637 * on that behaviour, check explicitly */ 2638 if (UNLIKELY(Perl_isnan(nv))) { 2639 return FALSE; 2640 } 2641 # endif 2642 2643 /* Written this way so that with an always-false NaN comparison we 2644 * return false */ 2645 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) { 2646 return FALSE; 2647 } 2648 2649 if ((IV) nv != nv) { 2650 return FALSE; 2651 } 2652 2653 *ivp = (IV) nv; 2654 return TRUE; 2655 } 2656 2657 #endif 2658 2659 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ 2660 2661 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) 2662 2663 #define MAX_CHARSET_NAME_LENGTH 2 2664 2665 PERL_STATIC_INLINE const char * 2666 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp) 2667 { 2668 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME; 2669 2670 /* Returns a string that corresponds to the name of the regex character set 2671 * given by 'flags', and *lenp is set the length of that string, which 2672 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ 2673 2674 *lenp = 1; 2675 switch (get_regex_charset(flags)) { 2676 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS; 2677 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; 2678 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; 2679 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; 2680 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 2681 *lenp = 2; 2682 return ASCII_MORE_RESTRICT_PAT_MODS; 2683 } 2684 /* The NOT_REACHED; hides an assert() which has a rather complex 2685 * definition in perl.h. */ 2686 NOT_REACHED; /* NOTREACHED */ 2687 return "?"; /* Unknown */ 2688 } 2689 2690 #endif 2691 2692 /* 2693 2694 Return false if any get magic is on the SV other than taint magic. 2695 2696 */ 2697 2698 PERL_STATIC_INLINE bool 2699 Perl_sv_only_taint_gmagic(SV *sv) 2700 { 2701 MAGIC *mg = SvMAGIC(sv); 2702 2703 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC; 2704 2705 while (mg) { 2706 if (mg->mg_type != PERL_MAGIC_taint 2707 && !(mg->mg_flags & MGf_GSKIP) 2708 && mg->mg_virtual->svt_get) { 2709 return FALSE; 2710 } 2711 mg = mg->mg_moremagic; 2712 } 2713 2714 return TRUE; 2715 } 2716 2717 /* ------------------ cop.h ------------------------------------------- */ 2718 2719 /* implement GIMME_V() macro */ 2720 2721 PERL_STATIC_INLINE U8 2722 Perl_gimme_V(pTHX) 2723 { 2724 I32 cxix; 2725 U8 gimme = (PL_op->op_flags & OPf_WANT); 2726 2727 if (gimme) 2728 return gimme; 2729 cxix = PL_curstackinfo->si_cxsubix; 2730 if (cxix < 0) 2731 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID; 2732 assert(cxstack[cxix].blk_gimme & G_WANT); 2733 return (cxstack[cxix].blk_gimme & G_WANT); 2734 } 2735 2736 2737 /* Enter a block. Push a new base context and return its address. */ 2738 2739 PERL_STATIC_INLINE PERL_CONTEXT * 2740 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) 2741 { 2742 PERL_CONTEXT * cx; 2743 2744 PERL_ARGS_ASSERT_CX_PUSHBLOCK; 2745 2746 CXINC; 2747 cx = CX_CUR(); 2748 cx->cx_type = type; 2749 cx->blk_gimme = gimme; 2750 cx->blk_oldsaveix = saveix; 2751 cx->blk_oldsp = (I32)(sp - PL_stack_base); 2752 cx->blk_oldcop = PL_curcop; 2753 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack); 2754 cx->blk_oldscopesp = PL_scopestack_ix; 2755 cx->blk_oldpm = PL_curpm; 2756 cx->blk_old_tmpsfloor = PL_tmps_floor; 2757 2758 PL_tmps_floor = PL_tmps_ix; 2759 CX_DEBUG(cx, "PUSH"); 2760 return cx; 2761 } 2762 2763 2764 /* Exit a block (RETURN and LAST). */ 2765 2766 PERL_STATIC_INLINE void 2767 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx) 2768 { 2769 PERL_ARGS_ASSERT_CX_POPBLOCK; 2770 2771 CX_DEBUG(cx, "POP"); 2772 /* these 3 are common to cx_popblock and cx_topblock */ 2773 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2774 PL_scopestack_ix = cx->blk_oldscopesp; 2775 PL_curpm = cx->blk_oldpm; 2776 2777 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats 2778 * and leaves a CX entry lying around for repeated use, so 2779 * skip for multicall */ \ 2780 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx)) 2781 || PL_savestack_ix == cx->blk_oldsaveix); 2782 PL_curcop = cx->blk_oldcop; 2783 PL_tmps_floor = cx->blk_old_tmpsfloor; 2784 } 2785 2786 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO). 2787 * Whereas cx_popblock() restores the state to the point just before 2788 * cx_pushblock() was called, cx_topblock() restores it to the point just 2789 * *after* cx_pushblock() was called. */ 2790 2791 PERL_STATIC_INLINE void 2792 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx) 2793 { 2794 PERL_ARGS_ASSERT_CX_TOPBLOCK; 2795 2796 CX_DEBUG(cx, "TOP"); 2797 /* these 3 are common to cx_popblock and cx_topblock */ 2798 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2799 PL_scopestack_ix = cx->blk_oldscopesp; 2800 PL_curpm = cx->blk_oldpm; 2801 2802 PL_stack_sp = PL_stack_base + cx->blk_oldsp; 2803 } 2804 2805 2806 PERL_STATIC_INLINE void 2807 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) 2808 { 2809 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub); 2810 2811 PERL_ARGS_ASSERT_CX_PUSHSUB; 2812 2813 PERL_DTRACE_PROBE_ENTRY(cv); 2814 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix; 2815 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack; 2816 cx->blk_sub.cv = cv; 2817 cx->blk_sub.olddepth = CvDEPTH(cv); 2818 cx->blk_sub.prevcomppad = PL_comppad; 2819 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0; 2820 cx->blk_sub.retop = retop; 2821 SvREFCNT_inc_simple_void_NN(cv); 2822 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF); 2823 } 2824 2825 2826 /* subsets of cx_popsub() */ 2827 2828 PERL_STATIC_INLINE void 2829 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) 2830 { 2831 CV *cv; 2832 2833 PERL_ARGS_ASSERT_CX_POPSUB_COMMON; 2834 assert(CxTYPE(cx) == CXt_SUB); 2835 2836 PL_comppad = cx->blk_sub.prevcomppad; 2837 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2838 cv = cx->blk_sub.cv; 2839 CvDEPTH(cv) = cx->blk_sub.olddepth; 2840 cx->blk_sub.cv = NULL; 2841 SvREFCNT_dec(cv); 2842 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix; 2843 } 2844 2845 2846 /* handle the @_ part of leaving a sub */ 2847 2848 PERL_STATIC_INLINE void 2849 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) 2850 { 2851 AV *av; 2852 2853 PERL_ARGS_ASSERT_CX_POPSUB_ARGS; 2854 assert(CxTYPE(cx) == CXt_SUB); 2855 assert(AvARRAY(MUTABLE_AV( 2856 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[ 2857 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad); 2858 2859 CX_POP_SAVEARRAY(cx); 2860 av = MUTABLE_AV(PAD_SVl(0)); 2861 if (UNLIKELY(AvREAL(av))) 2862 /* abandon @_ if it got reified */ 2863 clear_defarray(av, 0); 2864 else { 2865 CLEAR_ARGARRAY(av); 2866 } 2867 } 2868 2869 2870 PERL_STATIC_INLINE void 2871 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx) 2872 { 2873 PERL_ARGS_ASSERT_CX_POPSUB; 2874 assert(CxTYPE(cx) == CXt_SUB); 2875 2876 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv); 2877 2878 if (CxHASARGS(cx)) 2879 cx_popsub_args(cx); 2880 cx_popsub_common(cx); 2881 } 2882 2883 2884 PERL_STATIC_INLINE void 2885 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) 2886 { 2887 PERL_ARGS_ASSERT_CX_PUSHFORMAT; 2888 2889 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix; 2890 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack; 2891 cx->blk_format.cv = cv; 2892 cx->blk_format.retop = retop; 2893 cx->blk_format.gv = gv; 2894 cx->blk_format.dfoutgv = PL_defoutgv; 2895 cx->blk_format.prevcomppad = PL_comppad; 2896 cx->blk_u16 = 0; 2897 2898 SvREFCNT_inc_simple_void_NN(cv); 2899 CvDEPTH(cv)++; 2900 SvREFCNT_inc_void(cx->blk_format.dfoutgv); 2901 } 2902 2903 2904 PERL_STATIC_INLINE void 2905 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx) 2906 { 2907 CV *cv; 2908 GV *dfout; 2909 2910 PERL_ARGS_ASSERT_CX_POPFORMAT; 2911 assert(CxTYPE(cx) == CXt_FORMAT); 2912 2913 dfout = cx->blk_format.dfoutgv; 2914 setdefout(dfout); 2915 cx->blk_format.dfoutgv = NULL; 2916 SvREFCNT_dec_NN(dfout); 2917 2918 PL_comppad = cx->blk_format.prevcomppad; 2919 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2920 cv = cx->blk_format.cv; 2921 cx->blk_format.cv = NULL; 2922 --CvDEPTH(cv); 2923 SvREFCNT_dec_NN(cv); 2924 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix; 2925 } 2926 2927 2928 PERL_STATIC_INLINE void 2929 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 2930 { 2931 cx->blk_eval.retop = retop; 2932 cx->blk_eval.old_namesv = namesv; 2933 cx->blk_eval.old_eval_root = PL_eval_root; 2934 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL; 2935 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */ 2936 cx->blk_eval.cur_top_env = PL_top_env; 2937 2938 assert(!(PL_in_eval & ~ 0x3F)); 2939 assert(!(PL_op->op_type & ~0x1FF)); 2940 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7); 2941 } 2942 2943 PERL_STATIC_INLINE void 2944 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 2945 { 2946 PERL_ARGS_ASSERT_CX_PUSHEVAL; 2947 2948 Perl_push_evalortry_common(aTHX_ cx, retop, namesv); 2949 2950 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 2951 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack; 2952 } 2953 2954 PERL_STATIC_INLINE void 2955 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop) 2956 { 2957 PERL_ARGS_ASSERT_CX_PUSHTRY; 2958 2959 Perl_push_evalortry_common(aTHX_ cx, retop, NULL); 2960 2961 /* Don't actually change it, just store the current value so it's restored 2962 * by the common popeval */ 2963 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 2964 } 2965 2966 2967 PERL_STATIC_INLINE void 2968 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx) 2969 { 2970 SV *sv; 2971 2972 PERL_ARGS_ASSERT_CX_POPEVAL; 2973 assert(CxTYPE(cx) == CXt_EVAL); 2974 2975 PL_in_eval = CxOLD_IN_EVAL(cx); 2976 assert(!(PL_in_eval & 0xc0)); 2977 PL_eval_root = cx->blk_eval.old_eval_root; 2978 sv = cx->blk_eval.cur_text; 2979 if (sv && CxEVAL_TXT_REFCNTED(cx)) { 2980 cx->blk_eval.cur_text = NULL; 2981 SvREFCNT_dec_NN(sv); 2982 } 2983 2984 sv = cx->blk_eval.old_namesv; 2985 if (sv) { 2986 cx->blk_eval.old_namesv = NULL; 2987 SvREFCNT_dec_NN(sv); 2988 } 2989 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix; 2990 } 2991 2992 2993 /* push a plain loop, i.e. 2994 * { block } 2995 * while (cond) { block } 2996 * for (init;cond;continue) { block } 2997 * This loop can be last/redo'ed etc. 2998 */ 2999 3000 PERL_STATIC_INLINE void 3001 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) 3002 { 3003 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN; 3004 cx->blk_loop.my_op = cLOOP; 3005 } 3006 3007 3008 /* push a true for loop, i.e. 3009 * for var (list) { block } 3010 */ 3011 3012 PERL_STATIC_INLINE void 3013 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) 3014 { 3015 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR; 3016 3017 /* this one line is common with cx_pushloop_plain */ 3018 cx->blk_loop.my_op = cLOOP; 3019 3020 cx->blk_loop.itervar_u.svp = (SV**)itervarp; 3021 cx->blk_loop.itersave = itersave; 3022 #ifdef USE_ITHREADS 3023 cx->blk_loop.oldcomppad = PL_comppad; 3024 #endif 3025 } 3026 3027 3028 /* pop all loop types, including plain */ 3029 3030 PERL_STATIC_INLINE void 3031 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx) 3032 { 3033 PERL_ARGS_ASSERT_CX_POPLOOP; 3034 3035 assert(CxTYPE_is_LOOP(cx)); 3036 if ( CxTYPE(cx) == CXt_LOOP_ARY 3037 || CxTYPE(cx) == CXt_LOOP_LAZYSV) 3038 { 3039 /* Free ary or cur. This assumes that state_u.ary.ary 3040 * aligns with state_u.lazysv.cur. See cx_dup() */ 3041 SV *sv = cx->blk_loop.state_u.lazysv.cur; 3042 cx->blk_loop.state_u.lazysv.cur = NULL; 3043 SvREFCNT_dec_NN(sv); 3044 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) { 3045 sv = cx->blk_loop.state_u.lazysv.end; 3046 cx->blk_loop.state_u.lazysv.end = NULL; 3047 SvREFCNT_dec_NN(sv); 3048 } 3049 } 3050 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) { 3051 SV *cursv; 3052 SV **svp = (cx)->blk_loop.itervar_u.svp; 3053 if ((cx->cx_type & CXp_FOR_GV)) 3054 svp = &GvSV((GV*)svp); 3055 cursv = *svp; 3056 *svp = cx->blk_loop.itersave; 3057 cx->blk_loop.itersave = NULL; 3058 SvREFCNT_dec(cursv); 3059 } 3060 if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF)) 3061 SvREFCNT_dec(cx->blk_loop.itervar_u.svp); 3062 } 3063 3064 3065 PERL_STATIC_INLINE void 3066 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) 3067 { 3068 PERL_ARGS_ASSERT_CX_PUSHWHEN; 3069 3070 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3071 } 3072 3073 3074 PERL_STATIC_INLINE void 3075 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx) 3076 { 3077 PERL_ARGS_ASSERT_CX_POPWHEN; 3078 assert(CxTYPE(cx) == CXt_WHEN); 3079 3080 PERL_UNUSED_ARG(cx); 3081 PERL_UNUSED_CONTEXT; 3082 /* currently NOOP */ 3083 } 3084 3085 3086 PERL_STATIC_INLINE void 3087 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) 3088 { 3089 PERL_ARGS_ASSERT_CX_PUSHGIVEN; 3090 3091 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3092 cx->blk_givwhen.defsv_save = orig_defsv; 3093 } 3094 3095 3096 PERL_STATIC_INLINE void 3097 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx) 3098 { 3099 SV *sv; 3100 3101 PERL_ARGS_ASSERT_CX_POPGIVEN; 3102 assert(CxTYPE(cx) == CXt_GIVEN); 3103 3104 sv = GvSV(PL_defgv); 3105 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save; 3106 cx->blk_givwhen.defsv_save = NULL; 3107 SvREFCNT_dec(sv); 3108 } 3109 3110 /* 3111 =for apidoc newPADxVOP 3112 3113 Constructs, checks and returns an op containing a pad offset. C<type> is 3114 the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV> 3115 or C<OP_PADCV>. The returned op will have the C<op_targ> field set by 3116 the C<padix> argument. 3117 3118 This is convenient when constructing a large optree in nested function 3119 calls, as it avoids needing to store the pad op directly to set the 3120 C<op_targ> field as a side-effect. For example 3121 3122 o = op_append_elem(OP_LINESEQ, o, 3123 newPADxVOP(OP_PADSV, 0, padix)); 3124 3125 =cut 3126 */ 3127 3128 PERL_STATIC_INLINE OP * 3129 Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix) 3130 { 3131 PERL_ARGS_ASSERT_NEWPADXVOP; 3132 3133 assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV 3134 || type == OP_PADCV); 3135 OP *o = newOP(type, flags); 3136 o->op_targ = padix; 3137 return o; 3138 } 3139 3140 /* ------------------ util.h ------------------------------------------- */ 3141 3142 /* 3143 =for apidoc_section $string 3144 3145 =for apidoc foldEQ 3146 3147 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 3148 same 3149 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes 3150 match themselves and their opposite case counterparts. Non-cased and non-ASCII 3151 range bytes match only themselves. 3152 3153 =cut 3154 */ 3155 3156 PERL_STATIC_INLINE I32 3157 Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len) 3158 { 3159 const U8 *a = (const U8 *)s1; 3160 const U8 *b = (const U8 *)s2; 3161 3162 PERL_ARGS_ASSERT_FOLDEQ; 3163 3164 assert(len >= 0); 3165 3166 while (len--) { 3167 if (*a != *b && *a != PL_fold[*b]) 3168 return 0; 3169 a++,b++; 3170 } 3171 return 1; 3172 } 3173 3174 PERL_STATIC_INLINE I32 3175 Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len) 3176 { 3177 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds 3178 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and 3179 * does not check for this. Nor does it check that the strings each have 3180 * at least 'len' characters. */ 3181 3182 const U8 *a = (const U8 *)s1; 3183 const U8 *b = (const U8 *)s2; 3184 3185 PERL_ARGS_ASSERT_FOLDEQ_LATIN1; 3186 3187 assert(len >= 0); 3188 3189 while (len--) { 3190 if (*a != *b && *a != PL_fold_latin1[*b]) { 3191 return 0; 3192 } 3193 a++, b++; 3194 } 3195 return 1; 3196 } 3197 3198 /* 3199 =for apidoc_section $locale 3200 =for apidoc foldEQ_locale 3201 3202 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 3203 same case-insensitively in the current locale; false otherwise. 3204 3205 =cut 3206 */ 3207 3208 PERL_STATIC_INLINE I32 3209 Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len) 3210 { 3211 const U8 *a = (const U8 *)s1; 3212 const U8 *b = (const U8 *)s2; 3213 3214 PERL_ARGS_ASSERT_FOLDEQ_LOCALE; 3215 3216 assert(len >= 0); 3217 3218 while (len--) { 3219 if (*a != *b && *a != PL_fold_locale[*b]) { 3220 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 3221 "%s:%d: Our records indicate %02x is not a fold of %02x" 3222 " or its mate %02x\n", 3223 __FILE__, __LINE__, *a, *b, PL_fold_locale[*b])); 3224 3225 return 0; 3226 } 3227 a++,b++; 3228 } 3229 return 1; 3230 } 3231 3232 /* 3233 =for apidoc_section $string 3234 =for apidoc my_strnlen 3235 3236 The C library C<strnlen> if available, or a Perl implementation of it. 3237 3238 C<my_strnlen()> computes the length of the string, up to C<maxlen> 3239 characters. It will never attempt to address more than C<maxlen> 3240 characters, making it suitable for use with strings that are not 3241 guaranteed to be NUL-terminated. 3242 3243 =cut 3244 3245 Description stolen from http://man.openbsd.org/strnlen.3, 3246 implementation stolen from PostgreSQL. 3247 */ 3248 #ifndef HAS_STRNLEN 3249 3250 PERL_STATIC_INLINE Size_t 3251 Perl_my_strnlen(const char *str, Size_t maxlen) 3252 { 3253 const char *end = (char *) memchr(str, '\0', maxlen); 3254 3255 PERL_ARGS_ASSERT_MY_STRNLEN; 3256 3257 if (end == NULL) return maxlen; 3258 return end - str; 3259 } 3260 3261 #endif 3262 3263 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) 3264 3265 PERL_STATIC_INLINE void * 3266 S_my_memrchr(const char * s, const char c, const STRLEN len) 3267 { 3268 /* memrchr(), since many platforms lack it */ 3269 3270 const char * t = s + len - 1; 3271 3272 PERL_ARGS_ASSERT_MY_MEMRCHR; 3273 3274 while (t >= s) { 3275 if (*t == c) { 3276 return (void *) t; 3277 } 3278 t--; 3279 } 3280 3281 return NULL; 3282 } 3283 3284 #endif 3285 3286 PERL_STATIC_INLINE char * 3287 Perl_mortal_getenv(const char * str) 3288 { 3289 /* This implements a (mostly) thread-safe, sequential-call-safe getenv(). 3290 * 3291 * It's (mostly) thread-safe because it uses a mutex to prevent other 3292 * threads (that look at this mutex) from destroying the result before this 3293 * routine has a chance to copy the result to a place that won't be 3294 * destroyed before the caller gets a chance to handle it. That place is a 3295 * mortal SV. khw chose this over SAVEFREEPV because he is under the 3296 * impression that the SV will hang around longer under more circumstances 3297 * 3298 * The reason it isn't completely thread-safe is that other code could 3299 * simply not pay attention to the mutex. All of the Perl core uses the 3300 * mutex, but it is possible for code from, say XS, to not use this mutex, 3301 * defeating the safety. 3302 * 3303 * getenv() returns, in some implementations, a pointer to a spot in the 3304 * **environ array, which could be invalidated at any time by this or 3305 * another thread changing the environment. Other implementations copy the 3306 * **environ value to a static buffer, returning a pointer to that. That 3307 * buffer might or might not be invalidated by a getenv() call in another 3308 * thread. If it does get zapped, we need an exclusive lock. Otherwise, 3309 * many getenv() calls can safely be running simultaneously, so a 3310 * many-reader (but no simultaneous writers) lock is ok. There is a 3311 * Configure probe to see if another thread destroys the buffer, and the 3312 * mutex is defined accordingly. 3313 * 3314 * But in all cases, using the mutex prevents these problems, as long as 3315 * all code uses the same mutex. 3316 * 3317 * A complication is that this can be called during phases where the 3318 * mortalization process isn't available. These are in interpreter 3319 * destruction or early in construction. khw believes that at these times 3320 * there shouldn't be anything else going on, so plain getenv is safe AS 3321 * LONG AS the caller acts on the return before calling it again. */ 3322 3323 char * ret; 3324 dTHX; 3325 3326 PERL_ARGS_ASSERT_MORTAL_GETENV; 3327 3328 /* Can't mortalize without stacks. khw believes that no other threads 3329 * should be running, so no need to lock things, and this may be during a 3330 * phase when locking isn't even available */ 3331 if (UNLIKELY(PL_scopestack_ix == 0)) { 3332 return getenv(str); 3333 } 3334 3335 #ifdef PERL_MEM_LOG 3336 3337 /* A major complication arises under PERL_MEM_LOG. When that is active, 3338 * every memory allocation may result in logging, depending on the value of 3339 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for 3340 * saving ENV{foo}'s value (but before saving it), the logging code will 3341 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some 3342 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to 3343 * lock a boolean mutex recursively); 3) destroying the getenv() static 3344 * buffer; or 4) destroying the temporary created by this for the copy 3345 * causes a log entry to be made which could cause a new temporary to be 3346 * created, which will need to be destroyed at some point, leading to an 3347 * infinite loop. 3348 * 3349 * The solution adopted here (after some gnashing of teeth) is to detect 3350 * the recursive calls and calls from the logger, and treat them specially. 3351 * Let's say we want to do getenv("foo"). We first find 3352 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter 3353 * variable, so no temporary is required. Then we do getenv(foo), and in 3354 * the process of creating a temporary to save it, this function will be 3355 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call, 3356 * we detect that it is such a call and return our saved value instead of 3357 * locking and doing a new getenv(). This solves all of problems 1), 2), 3358 * and 3). Because all the getenv()s are done while the mutex is locked, 3359 * the state cannot have changed. To solve 4), we don't create a temporary 3360 * when this is called from the logging code. That code disposes of the 3361 * return value while the mutex is still locked. 3362 * 3363 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial 3364 * digits and 3 particular letters are significant; the rest are ignored by 3365 * the memory logging code. Thus the per-interpreter variable only needs 3366 * to be large enough to save the significant information, the size of 3367 * which is known at compile time. The first byte is extra, reserved for 3368 * flags for our use. To protect against overflowing, only the reserved 3369 * byte, as many digits as don't overflow, and the three letters are 3370 * stored. 3371 * 3372 * The reserved byte has two bits: 3373 * 0x1 if set indicates that if we get here, it is a recursive call of 3374 * getenv() 3375 * 0x2 if set indicates that the call is from the logging code. 3376 * 3377 * If the flag indicates this is a recursive call, just return the stored 3378 * value of PL_mem_log; An empty value gets turned into NULL. */ 3379 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) { 3380 if (PL_mem_log[1] == '\0') { 3381 return NULL; 3382 } else { 3383 return PL_mem_log + 1; 3384 } 3385 } 3386 3387 #endif 3388 3389 GETENV_LOCK; 3390 3391 #ifdef PERL_MEM_LOG 3392 3393 /* Here we are in a critical section. As explained above, we do our own 3394 * getenv(PERL_MEM_LOG), saving the result safely. */ 3395 ret = getenv("PERL_MEM_LOG"); 3396 if (ret == NULL) { /* No logging active */ 3397 3398 /* Return that immediately if called from the logging code */ 3399 if (PL_mem_log[0] & 0x2) { 3400 GETENV_UNLOCK; 3401 return NULL; 3402 } 3403 3404 PL_mem_log[1] = '\0'; 3405 } 3406 else { 3407 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */ 3408 3409 /* There is nothing to prevent the value of PERL_MEM_LOG from being an 3410 * extremely long string. But we want only a few characters from it. 3411 * PL_mem_log has been made large enough to hold just the ones we need. 3412 * First the file descriptor. */ 3413 if (isDIGIT(*ret)) { 3414 const char * s = ret; 3415 if (UNLIKELY(*s == '0')) { 3416 3417 /* Reduce multiple leading zeros to a single one. This is to 3418 * allow the caller to change what to do with leading zeros. */ 3419 *mem_log_meat++ = '0'; 3420 s++; 3421 while (*s == '0') { 3422 s++; 3423 } 3424 } 3425 3426 /* If the input overflows, copy just enough for the result to also 3427 * overflow, plus 1 to make sure */ 3428 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) { 3429 *mem_log_meat++ = *s++; 3430 } 3431 } 3432 3433 /* Then each of the four significant characters */ 3434 if (strchr(ret, 'm')) { 3435 *mem_log_meat++ = 'm'; 3436 } 3437 if (strchr(ret, 's')) { 3438 *mem_log_meat++ = 's'; 3439 } 3440 if (strchr(ret, 't')) { 3441 *mem_log_meat++ = 't'; 3442 } 3443 if (strchr(ret, 'c')) { 3444 *mem_log_meat++ = 'c'; 3445 } 3446 *mem_log_meat = '\0'; 3447 3448 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log)); 3449 } 3450 3451 /* If we are being called from the logger, it only needs the significant 3452 * portion of PERL_MEM_LOG, and doesn't need a safe copy */ 3453 if (PL_mem_log[0] & 0x2) { 3454 assert(strEQ(str, "PERL_MEM_LOG")); 3455 GETENV_UNLOCK; 3456 return PL_mem_log + 1; 3457 } 3458 3459 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that 3460 * is coming from other than the logging code, so it should be treated the 3461 * same as any other getenv(), returning the full value, not just the 3462 * significant part, and having its value saved. Set the flag that 3463 * indicates any call to this routine will be a recursion from here */ 3464 PL_mem_log[0] = 0x1; 3465 3466 #endif 3467 3468 /* Now get the value of the real desired variable, and save a copy */ 3469 ret = getenv(str); 3470 3471 if (ret != NULL) { 3472 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) ); 3473 } 3474 3475 GETENV_UNLOCK; 3476 3477 #ifdef PERL_MEM_LOG 3478 3479 /* Clear the buffer */ 3480 Zero(PL_mem_log, sizeof(PL_mem_log), char); 3481 3482 #endif 3483 3484 return ret; 3485 } 3486 3487 PERL_STATIC_INLINE bool 3488 Perl_sv_isbool(pTHX_ const SV *sv) 3489 { 3490 return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv); 3491 } 3492 3493 #ifdef USE_ITHREADS 3494 3495 PERL_STATIC_INLINE AV * 3496 Perl_cop_file_avn(pTHX_ const COP *cop) { 3497 3498 PERL_ARGS_ASSERT_COP_FILE_AVN; 3499 3500 const char *file = CopFILE(cop); 3501 if (file) { 3502 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD); 3503 if (gv) { 3504 return GvAVn(gv); 3505 } 3506 else 3507 return NULL; 3508 } 3509 else 3510 return NULL; 3511 } 3512 3513 #endif 3514 3515 PERL_STATIC_INLINE PADNAME * 3516 Perl_padname_refcnt_inc(PADNAME *pn) 3517 { 3518 PadnameREFCNT(pn)++; 3519 return pn; 3520 } 3521 3522 PERL_STATIC_INLINE PADNAMELIST * 3523 Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl) 3524 { 3525 PadnamelistREFCNT(pnl)++; 3526 return pnl; 3527 } 3528 3529 /* copy a string to a safe spot */ 3530 3531 /* 3532 =for apidoc_section $string 3533 =for apidoc savepv 3534 3535 Perl's version of C<strdup()>. Returns a pointer to a newly allocated 3536 string which is a duplicate of C<pv>. The size of the string is 3537 determined by C<strlen()>, which means it may not contain embedded C<NUL> 3538 characters and must have a trailing C<NUL>. To prevent memory leaks, the 3539 memory allocated for the new string needs to be freed when no longer needed. 3540 This can be done with the C<L</Safefree>> function, or 3541 L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>. 3542 3543 On some platforms, Windows for example, all allocated memory owned by a thread 3544 is deallocated when that thread ends. So if you need that not to happen, you 3545 need to use the shared memory functions, such as C<L</savesharedpv>>. 3546 3547 =cut 3548 */ 3549 3550 PERL_STATIC_INLINE char * 3551 Perl_savepv(pTHX_ const char *pv) 3552 { 3553 PERL_UNUSED_CONTEXT; 3554 if (!pv) 3555 return NULL; 3556 else { 3557 char *newaddr; 3558 const STRLEN pvlen = strlen(pv)+1; 3559 Newx(newaddr, pvlen, char); 3560 return (char*)memcpy(newaddr, pv, pvlen); 3561 } 3562 } 3563 3564 /* same thing but with a known length */ 3565 3566 /* 3567 =for apidoc savepvn 3568 3569 Perl's version of what C<strndup()> would be if it existed. Returns a 3570 pointer to a newly allocated string which is a duplicate of the first 3571 C<len> bytes from C<pv>, plus a trailing 3572 C<NUL> byte. The memory allocated for 3573 the new string can be freed with the C<Safefree()> function. 3574 3575 On some platforms, Windows for example, all allocated memory owned by a thread 3576 is deallocated when that thread ends. So if you need that not to happen, you 3577 need to use the shared memory functions, such as C<L</savesharedpvn>>. 3578 3579 =cut 3580 */ 3581 3582 PERL_STATIC_INLINE char * 3583 Perl_savepvn(pTHX_ const char *pv, Size_t len) 3584 { 3585 char *newaddr; 3586 PERL_UNUSED_CONTEXT; 3587 3588 Newx(newaddr,len+1,char); 3589 /* Give a meaning to NULL pointer mainly for the use in sv_magic() */ 3590 if (pv) { 3591 /* might not be null terminated */ 3592 newaddr[len] = '\0'; 3593 return (char *) CopyD(pv,newaddr,len,char); 3594 } 3595 else { 3596 return (char *) ZeroD(newaddr,len+1,char); 3597 } 3598 } 3599 3600 /* 3601 =for apidoc savesvpv 3602 3603 A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from 3604 the passed in SV using C<SvPV()> 3605 3606 On some platforms, Windows for example, all allocated memory owned by a thread 3607 is deallocated when that thread ends. So if you need that not to happen, you 3608 need to use the shared memory functions, such as C<L</savesharedsvpv>>. 3609 3610 =cut 3611 */ 3612 3613 PERL_STATIC_INLINE char * 3614 Perl_savesvpv(pTHX_ SV *sv) 3615 { 3616 STRLEN len; 3617 const char * const pv = SvPV_const(sv, len); 3618 char *newaddr; 3619 3620 PERL_ARGS_ASSERT_SAVESVPV; 3621 3622 ++len; 3623 Newx(newaddr,len,char); 3624 return (char *) CopyD(pv,newaddr,len,char); 3625 } 3626 3627 /* 3628 =for apidoc savesharedsvpv 3629 3630 A version of C<savesharedpv()> which allocates the duplicate string in 3631 memory which is shared between threads. 3632 3633 =cut 3634 */ 3635 3636 PERL_STATIC_INLINE char * 3637 Perl_savesharedsvpv(pTHX_ SV *sv) 3638 { 3639 STRLEN len; 3640 const char * const pv = SvPV_const(sv, len); 3641 3642 PERL_ARGS_ASSERT_SAVESHAREDSVPV; 3643 3644 return savesharedpvn(pv, len); 3645 } 3646 3647 #ifndef PERL_GET_CONTEXT_DEFINED 3648 3649 /* 3650 =for apidoc_section $embedding 3651 =for apidoc get_context 3652 3653 Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead. 3654 3655 =cut 3656 */ 3657 3658 PERL_STATIC_INLINE void * 3659 Perl_get_context(void) 3660 { 3661 # if defined(USE_ITHREADS) 3662 # ifdef OLD_PTHREADS_API 3663 pthread_addr_t t; 3664 int error = pthread_getspecific(PL_thr_key, &t); 3665 if (error) 3666 Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error); 3667 return (void*)t; 3668 # elif defined(I_MACH_CTHREADS) 3669 return (void*)cthread_data(cthread_self()); 3670 # else 3671 return (void*)PTHREAD_GETSPECIFIC(PL_thr_key); 3672 # endif 3673 # else 3674 return (void*)NULL; 3675 # endif 3676 } 3677 3678 #endif 3679 3680 PERL_STATIC_INLINE MGVTBL* 3681 Perl_get_vtbl(pTHX_ int vtbl_id) 3682 { 3683 PERL_UNUSED_CONTEXT; 3684 3685 return (vtbl_id < 0 || vtbl_id >= magic_vtable_max) 3686 ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id; 3687 } 3688 3689 /* 3690 =for apidoc my_strlcat 3691 3692 The C library C<strlcat> if available, or a Perl implementation of it. 3693 This operates on C C<NUL>-terminated strings. 3694 3695 C<my_strlcat()> appends string C<src> to the end of C<dst>. It will append at 3696 most S<C<size - strlen(dst) - 1>> characters. It will then C<NUL>-terminate, 3697 unless C<size> is 0 or the original C<dst> string was longer than C<size> (in 3698 practice this should not happen as it means that either C<size> is incorrect or 3699 that C<dst> is not a proper C<NUL>-terminated string). 3700 3701 Note that C<size> is the full size of the destination buffer and 3702 the result is guaranteed to be C<NUL>-terminated if there is room. Note that 3703 room for the C<NUL> should be included in C<size>. 3704 3705 The return value is the total length that C<dst> would have if C<size> is 3706 sufficiently large. Thus it is the initial length of C<dst> plus the length of 3707 C<src>. If C<size> is smaller than the return, the excess was not appended. 3708 3709 =cut 3710 3711 Description stolen from http://man.openbsd.org/strlcat.3 3712 */ 3713 #ifndef HAS_STRLCAT 3714 PERL_STATIC_INLINE Size_t 3715 Perl_my_strlcat(char *dst, const char *src, Size_t size) 3716 { 3717 Size_t used, length, copy; 3718 3719 used = strlen(dst); 3720 length = strlen(src); 3721 if (size > 0 && used < size - 1) { 3722 copy = (length >= size - used) ? size - used - 1 : length; 3723 memcpy(dst + used, src, copy); 3724 dst[used + copy] = '\0'; 3725 } 3726 return used + length; 3727 } 3728 #endif 3729 3730 3731 /* 3732 =for apidoc my_strlcpy 3733 3734 The C library C<strlcpy> if available, or a Perl implementation of it. 3735 This operates on C C<NUL>-terminated strings. 3736 3737 C<my_strlcpy()> copies up to S<C<size - 1>> characters from the string C<src> 3738 to C<dst>, C<NUL>-terminating the result if C<size> is not 0. 3739 3740 The return value is the total length C<src> would be if the copy completely 3741 succeeded. If it is larger than C<size>, the excess was not copied. 3742 3743 =cut 3744 3745 Description stolen from http://man.openbsd.org/strlcpy.3 3746 */ 3747 #ifndef HAS_STRLCPY 3748 PERL_STATIC_INLINE Size_t 3749 Perl_my_strlcpy(char *dst, const char *src, Size_t size) 3750 { 3751 Size_t length, copy; 3752 3753 length = strlen(src); 3754 if (size > 0) { 3755 copy = (length >= size) ? size - 1 : length; 3756 memcpy(dst, src, copy); 3757 dst[copy] = '\0'; 3758 } 3759 return length; 3760 } 3761 #endif 3762 3763 /* 3764 * ex: set ts=8 sts=4 sw=4 et: 3765 */ 3766