1 /* inline.h 2 * 3 * Copyright (C) 2012 by Larry Wall and others 4 * 5 * You may distribute under the terms of either the GNU General Public 6 * License or the Artistic License, as specified in the README file. 7 * 8 * This file contains tables and code adapted from 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this 10 * copyright notice: 11 12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 13 14 Permission is hereby granted, free of charge, to any person obtaining a copy of 15 this software and associated documentation files (the "Software"), to deal in 16 the Software without restriction, including without limitation the rights to 17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 18 of the Software, and to permit persons to whom the Software is furnished to do 19 so, subject to the following conditions: 20 21 The above copyright notice and this permission notice shall be included in all 22 copies or substantial portions of the Software. 23 24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 SOFTWARE. 31 32 * 33 * This file is a home for static inline functions that cannot go in other 34 * header files, because they depend on proto.h (included after most other 35 * headers) or struct definitions. 36 * 37 * Each section names the header file that the functions "belong" to. 38 */ 39 40 /* ------------------------------- av.h ------------------------------- */ 41 42 /* 43 =for apidoc_section $AV 44 =for apidoc av_count 45 Returns the number of elements in the array C<av>. This is the true length of 46 the array, including any undefined elements. It is always the same as 47 S<C<av_top_index(av) + 1>>. 48 49 =cut 50 */ 51 PERL_STATIC_INLINE Size_t 52 Perl_av_count(pTHX_ AV *av) 53 { 54 PERL_ARGS_ASSERT_AV_COUNT; 55 assert(SvTYPE(av) == SVt_PVAV); 56 57 return AvFILL(av) + 1; 58 } 59 60 /* ------------------------------- av.c ------------------------------- */ 61 62 /* 63 =for apidoc av_store_simple 64 65 This is a cut-down version of av_store that assumes that the array is 66 very straightforward - no magic, not readonly, and AvREAL - and that 67 C<key> is not negative. This function MUST NOT be used in situations 68 where any of those assumptions may not hold. 69 70 Stores an SV in an array. The array index is specified as C<key>. It 71 can be dereferenced to get the C<SV*> that was stored there (= C<val>)). 72 73 Note that the caller is responsible for suitably incrementing the reference 74 count of C<val> before the call. 75 76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>. 77 78 =cut 79 */ 80 81 PERL_STATIC_INLINE SV** 82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val) 83 { 84 SV** ary; 85 86 PERL_ARGS_ASSERT_AV_STORE_SIMPLE; 87 assert(SvTYPE(av) == SVt_PVAV); 88 assert(!SvMAGICAL(av)); 89 assert(!SvREADONLY(av)); 90 assert(AvREAL(av)); 91 assert(key > -1); 92 93 ary = AvARRAY(av); 94 95 if (AvFILLp(av) < key) { 96 if (key > AvMAX(av)) { 97 av_extend(av,key); 98 ary = AvARRAY(av); 99 } 100 AvFILLp(av) = key; 101 } else 102 SvREFCNT_dec(ary[key]); 103 104 ary[key] = val; 105 return &ary[key]; 106 } 107 108 /* 109 =for apidoc av_fetch_simple 110 111 This is a cut-down version of av_fetch that assumes that the array is 112 very straightforward - no magic, not readonly, and AvREAL - and that 113 C<key> is not negative. This function MUST NOT be used in situations 114 where any of those assumptions may not hold. 115 116 Returns the SV at the specified index in the array. The C<key> is the 117 index. If lval is true, you are guaranteed to get a real SV back (in case 118 it wasn't real before), which you can then modify. Check that the return 119 value is non-null before dereferencing it to a C<SV*>. 120 121 The rough perl equivalent is C<$myarray[$key]>. 122 123 =cut 124 */ 125 126 PERL_STATIC_INLINE SV** 127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval) 128 { 129 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE; 130 assert(SvTYPE(av) == SVt_PVAV); 131 assert(!SvMAGICAL(av)); 132 assert(!SvREADONLY(av)); 133 assert(AvREAL(av)); 134 assert(key > -1); 135 136 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) { 137 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL; 138 } else { 139 return &AvARRAY(av)[key]; 140 } 141 } 142 143 /* ------------------------------- cv.h ------------------------------- */ 144 145 /* 146 =for apidoc_section $CV 147 =for apidoc CvGV 148 Returns the GV associated with the CV C<sv>, reifying it if necessary. 149 150 =cut 151 */ 152 PERL_STATIC_INLINE GV * 153 Perl_CvGV(pTHX_ CV *sv) 154 { 155 PERL_ARGS_ASSERT_CVGV; 156 157 return CvNAMED(sv) 158 ? Perl_cvgv_from_hek(aTHX_ sv) 159 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; 160 } 161 162 /* 163 =for apidoc CvDEPTH 164 Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a 165 recursive call. 166 167 =cut 168 */ 169 PERL_STATIC_INLINE I32 * 170 Perl_CvDEPTH(const CV * const sv) 171 { 172 PERL_ARGS_ASSERT_CVDEPTH; 173 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM); 174 175 return &((XPVCV*)SvANY(sv))->xcv_depth; 176 } 177 178 /* 179 CvPROTO returns the prototype as stored, which is not necessarily what 180 the interpreter should be using. Specifically, the interpreter assumes 181 that spaces have been stripped, which has been the case if the prototype 182 was added by toke.c, but is generally not the case if it was added elsewhere. 183 Since we can't enforce the spacelessness at assignment time, this routine 184 provides a temporary copy at parse time with spaces removed. 185 I<orig> is the start of the original buffer, I<len> is the length of the 186 prototype and will be updated when this returns. 187 */ 188 189 #ifdef PERL_CORE 190 PERL_STATIC_INLINE char * 191 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len) 192 { 193 SV * tmpsv; 194 char * tmps; 195 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP); 196 tmps = SvPVX(tmpsv); 197 while ((*len)--) { 198 if (!isSPACE(*orig)) 199 *tmps++ = *orig; 200 orig++; 201 } 202 *tmps = '\0'; 203 *len = tmps - SvPVX(tmpsv); 204 return SvPVX(tmpsv); 205 } 206 #endif 207 208 /* ------------------------------- mg.h ------------------------------- */ 209 210 #if defined(PERL_CORE) || defined(PERL_EXT) 211 /* assumes get-magic and stringification have already occurred */ 212 PERL_STATIC_INLINE STRLEN 213 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) 214 { 215 assert(mg->mg_type == PERL_MAGIC_regex_global); 216 assert(mg->mg_len != -1); 217 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv)) 218 return (STRLEN)mg->mg_len; 219 else { 220 const STRLEN pos = (STRLEN)mg->mg_len; 221 /* Without this check, we may read past the end of the buffer: */ 222 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1; 223 return sv_or_pv_pos_u2b(sv, s, pos, NULL); 224 } 225 } 226 #endif 227 228 /* ------------------------------- pad.h ------------------------------ */ 229 230 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C) 231 PERL_STATIC_INLINE bool 232 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) 233 { 234 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE; 235 236 /* is seq within the range _LOW to _HIGH ? 237 * This is complicated by the fact that PL_cop_seqmax 238 * may have wrapped around at some point */ 239 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO) 240 return FALSE; /* not yet introduced */ 241 242 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) { 243 /* in compiling scope */ 244 if ( 245 (seq > COP_SEQ_RANGE_LOW(pn)) 246 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1)) 247 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1)) 248 ) 249 return TRUE; 250 } 251 else if ( 252 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) 253 ? 254 ( seq > COP_SEQ_RANGE_LOW(pn) 255 || seq <= COP_SEQ_RANGE_HIGH(pn)) 256 257 : ( seq > COP_SEQ_RANGE_LOW(pn) 258 && seq <= COP_SEQ_RANGE_HIGH(pn)) 259 ) 260 return TRUE; 261 return FALSE; 262 } 263 #endif 264 265 /* ------------------------------- pp.h ------------------------------- */ 266 267 PERL_STATIC_INLINE I32 268 Perl_TOPMARK(pTHX) 269 { 270 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 271 "MARK top %p %" IVdf "\n", 272 PL_markstack_ptr, 273 (IV)*PL_markstack_ptr))); 274 return *PL_markstack_ptr; 275 } 276 277 PERL_STATIC_INLINE I32 278 Perl_POPMARK(pTHX) 279 { 280 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 281 "MARK pop %p %" IVdf "\n", 282 (PL_markstack_ptr-1), 283 (IV)*(PL_markstack_ptr-1)))); 284 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow"); 285 return *PL_markstack_ptr--; 286 } 287 288 /* ----------------------------- regexp.h ----------------------------- */ 289 290 /* PVLVs need to act as a superset of all scalar types - they are basically 291 * PVMGs with a few extra fields. 292 * REGEXPs are first class scalars, but have many fields that can't be copied 293 * into a PVLV body. 294 * 295 * Hence we take a different approach - instead of a copy, PVLVs store a pointer 296 * back to the original body. To avoid increasing the size of PVLVs just for the 297 * rare case of REGEXP assignment, this pointer is stored in the memory usually 298 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to 299 * read the pointer from the two possible locations. The macro SvLEN() wraps the 300 * access to the union's member xpvlenu_len, but there is no equivalent macro 301 * for wrapping the union's member xpvlenu_rx, hence the direct reference here. 302 * 303 * See commit df6b4bd56551f2d3 for more details. */ 304 305 PERL_STATIC_INLINE struct regexp * 306 Perl_ReANY(const REGEXP * const re) 307 { 308 XPV* const p = (XPV*)SvANY(re); 309 310 PERL_ARGS_ASSERT_REANY; 311 assert(isREGEXP(re)); 312 313 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx 314 : (struct regexp *)p; 315 } 316 317 /* ------------------------------- sv.h ------------------------------- */ 318 319 PERL_STATIC_INLINE bool 320 Perl_SvTRUE(pTHX_ SV *sv) 321 { 322 PERL_ARGS_ASSERT_SVTRUE; 323 324 if (UNLIKELY(sv == NULL)) 325 return FALSE; 326 SvGETMAGIC(sv); 327 return SvTRUE_nomg_NN(sv); 328 } 329 330 PERL_STATIC_INLINE bool 331 Perl_SvTRUE_nomg(pTHX_ SV *sv) 332 { 333 PERL_ARGS_ASSERT_SVTRUE_NOMG; 334 335 if (UNLIKELY(sv == NULL)) 336 return FALSE; 337 return SvTRUE_nomg_NN(sv); 338 } 339 340 PERL_STATIC_INLINE bool 341 Perl_SvTRUE_NN(pTHX_ SV *sv) 342 { 343 PERL_ARGS_ASSERT_SVTRUE_NN; 344 345 SvGETMAGIC(sv); 346 return SvTRUE_nomg_NN(sv); 347 } 348 349 PERL_STATIC_INLINE bool 350 Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback) 351 { 352 PERL_ARGS_ASSERT_SVTRUE_COMMON; 353 354 if (UNLIKELY(SvIMMORTAL_INTERP(sv))) 355 return SvIMMORTAL_TRUE(sv); 356 357 if (! SvOK(sv)) 358 return FALSE; 359 360 if (SvPOK(sv)) 361 return SvPVXtrue(sv); 362 363 if (SvIOK(sv)) 364 return SvIVX(sv) != 0; /* casts to bool */ 365 366 if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv))))) 367 return TRUE; 368 369 if (sv_2bool_is_fallback) 370 return sv_2bool_nomg(sv); 371 372 return isGV_with_GP(sv); 373 } 374 375 376 PERL_STATIC_INLINE SV * 377 Perl_SvREFCNT_inc(SV *sv) 378 { 379 if (LIKELY(sv != NULL)) 380 SvREFCNT(sv)++; 381 return sv; 382 } 383 PERL_STATIC_INLINE SV * 384 Perl_SvREFCNT_inc_NN(SV *sv) 385 { 386 PERL_ARGS_ASSERT_SVREFCNT_INC_NN; 387 388 SvREFCNT(sv)++; 389 return sv; 390 } 391 PERL_STATIC_INLINE void 392 Perl_SvREFCNT_inc_void(SV *sv) 393 { 394 if (LIKELY(sv != NULL)) 395 SvREFCNT(sv)++; 396 } 397 PERL_STATIC_INLINE void 398 Perl_SvREFCNT_dec(pTHX_ SV *sv) 399 { 400 if (LIKELY(sv != NULL)) { 401 U32 rc = SvREFCNT(sv); 402 if (LIKELY(rc > 1)) 403 SvREFCNT(sv) = rc - 1; 404 else 405 Perl_sv_free2(aTHX_ sv, rc); 406 } 407 } 408 409 PERL_STATIC_INLINE void 410 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv) 411 { 412 U32 rc = SvREFCNT(sv); 413 414 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN; 415 416 if (LIKELY(rc > 1)) 417 SvREFCNT(sv) = rc - 1; 418 else 419 Perl_sv_free2(aTHX_ sv, rc); 420 } 421 422 /* 423 =for apidoc SvAMAGIC_on 424 425 Indicate that C<sv> has overloading (active magic) enabled. 426 427 =cut 428 */ 429 430 PERL_STATIC_INLINE void 431 Perl_SvAMAGIC_on(SV *sv) 432 { 433 PERL_ARGS_ASSERT_SVAMAGIC_ON; 434 assert(SvROK(sv)); 435 436 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv))); 437 } 438 439 /* 440 =for apidoc SvAMAGIC_off 441 442 Indicate that C<sv> has overloading (active magic) disabled. 443 444 =cut 445 */ 446 447 PERL_STATIC_INLINE void 448 Perl_SvAMAGIC_off(SV *sv) 449 { 450 PERL_ARGS_ASSERT_SVAMAGIC_OFF; 451 452 if (SvROK(sv) && SvOBJECT(SvRV(sv))) 453 HvAMAGIC_off(SvSTASH(SvRV(sv))); 454 } 455 456 PERL_STATIC_INLINE U32 457 Perl_SvPADSTALE_on(SV *sv) 458 { 459 assert(!(SvFLAGS(sv) & SVs_PADTMP)); 460 return SvFLAGS(sv) |= SVs_PADSTALE; 461 } 462 PERL_STATIC_INLINE U32 463 Perl_SvPADSTALE_off(SV *sv) 464 { 465 assert(!(SvFLAGS(sv) & SVs_PADTMP)); 466 return SvFLAGS(sv) &= ~SVs_PADSTALE; 467 } 468 #if defined(PERL_CORE) || defined (PERL_EXT) 469 PERL_STATIC_INLINE STRLEN 470 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp) 471 { 472 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B; 473 if (SvGAMAGIC(sv)) { 474 U8 *hopped = utf8_hop((U8 *)pv, pos); 475 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped); 476 return (STRLEN)(hopped - (U8 *)pv); 477 } 478 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN); 479 } 480 #endif 481 482 /* ------------------------------- utf8.h ------------------------------- */ 483 484 /* 485 =for apidoc_section $unicode 486 */ 487 488 PERL_STATIC_INLINE void 489 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest) 490 { 491 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 492 * encoded string at '*dest', updating '*dest' to include it */ 493 494 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; 495 496 if (NATIVE_BYTE_IS_INVARIANT(byte)) 497 *((*dest)++) = byte; 498 else { 499 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); 500 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); 501 } 502 } 503 504 /* 505 =for apidoc valid_utf8_to_uvchr 506 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is 507 known that the next character in the input UTF-8 string C<s> is well-formed 508 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code 509 points, and non-Unicode code points are allowed. 510 511 =cut 512 513 */ 514 515 PERL_STATIC_INLINE UV 516 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) 517 { 518 const UV expectlen = UTF8SKIP(s); 519 const U8* send = s + expectlen; 520 UV uv = *s; 521 522 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; 523 524 if (retlen) { 525 *retlen = expectlen; 526 } 527 528 /* An invariant is trivially returned */ 529 if (expectlen == 1) { 530 return uv; 531 } 532 533 /* Remove the leading bits that indicate the number of bytes, leaving just 534 * the bits that are part of the value */ 535 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 536 537 /* Now, loop through the remaining bytes, accumulating each into the 538 * working total as we go. (I khw tried unrolling the loop for up to 4 539 * bytes, but there was no performance improvement) */ 540 for (++s; s < send; s++) { 541 uv = UTF8_ACCUMULATE(uv, *s); 542 } 543 544 return UNI_TO_NATIVE(uv); 545 546 } 547 548 /* 549 =for apidoc is_utf8_invariant_string 550 551 Returns TRUE if the first C<len> bytes of the string C<s> are the same 552 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on 553 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they 554 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only 555 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range 556 characters are invariant, but so also are the C1 controls. 557 558 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you 559 use this option, that C<s> can't have embedded C<NUL> characters and has to 560 have a terminating C<NUL> byte). 561 562 See also 563 C<L</is_utf8_string>>, 564 C<L</is_utf8_string_flags>>, 565 C<L</is_utf8_string_loc>>, 566 C<L</is_utf8_string_loc_flags>>, 567 C<L</is_utf8_string_loclen>>, 568 C<L</is_utf8_string_loclen_flags>>, 569 C<L</is_utf8_fixed_width_buf_flags>>, 570 C<L</is_utf8_fixed_width_buf_loc_flags>>, 571 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 572 C<L</is_strict_utf8_string>>, 573 C<L</is_strict_utf8_string_loc>>, 574 C<L</is_strict_utf8_string_loclen>>, 575 C<L</is_c9strict_utf8_string>>, 576 C<L</is_c9strict_utf8_string_loc>>, 577 and 578 C<L</is_c9strict_utf8_string_loclen>>. 579 580 =cut 581 582 */ 583 584 #define is_utf8_invariant_string(s, len) \ 585 is_utf8_invariant_string_loc(s, len, NULL) 586 587 /* 588 =for apidoc is_utf8_invariant_string_loc 589 590 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of 591 the first UTF-8 variant character in the C<ep> pointer; if all characters are 592 UTF-8 invariant, this function does not change the contents of C<*ep>. 593 594 =cut 595 596 */ 597 598 PERL_STATIC_INLINE bool 599 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) 600 { 601 const U8* send; 602 const U8* x = s; 603 604 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; 605 606 if (len == 0) { 607 len = strlen((const char *)s); 608 } 609 610 send = s + len; 611 612 /* This looks like 0x010101... */ 613 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) 614 615 /* This looks like 0x808080... */ 616 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) 617 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) 618 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) 619 620 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by 621 * or'ing together the lowest bits of 'x'. Hopefully the final term gets 622 * optimized out completely on a 32-bit system, and its mask gets optimized out 623 * on a 64-bit system */ 624 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ 625 | ( PTR2nat(x) >> 1) \ 626 | ( ( (PTR2nat(x) \ 627 & PERL_WORD_BOUNDARY_MASK) >> 2)))) 628 629 #ifndef EBCDIC 630 631 /* Do the word-at-a-time iff there is at least one usable full word. That 632 * means that after advancing to a word boundary, there still is at least a 633 * full word left. The number of bytes needed to advance is 'wordsize - 634 * offset' unless offset is 0. */ 635 if ((STRLEN) (send - x) >= PERL_WORDSIZE 636 637 /* This term is wordsize if subword; 0 if not */ 638 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 639 640 /* 'offset' */ 641 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 642 { 643 644 /* Process per-byte until reach word boundary. XXX This loop could be 645 * eliminated if we knew that this platform had fast unaligned reads */ 646 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 647 if (! UTF8_IS_INVARIANT(*x)) { 648 if (ep) { 649 *ep = x; 650 } 651 652 return FALSE; 653 } 654 x++; 655 } 656 657 /* Here, we know we have at least one full word to process. Process 658 * per-word as long as we have at least a full word left */ 659 do { 660 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { 661 662 /* Found a variant. Just return if caller doesn't want its 663 * exact position */ 664 if (! ep) { 665 return FALSE; 666 } 667 668 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ 669 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 670 671 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x); 672 assert(*ep >= s && *ep < send); 673 674 return FALSE; 675 676 # else /* If weird byte order, drop into next loop to do byte-at-a-time 677 checks. */ 678 679 break; 680 # endif 681 } 682 683 x += PERL_WORDSIZE; 684 685 } while (x + PERL_WORDSIZE <= send); 686 } 687 688 #endif /* End of ! EBCDIC */ 689 690 /* Process per-byte */ 691 while (x < send) { 692 if (! UTF8_IS_INVARIANT(*x)) { 693 if (ep) { 694 *ep = x; 695 } 696 697 return FALSE; 698 } 699 700 x++; 701 } 702 703 return TRUE; 704 } 705 706 /* See if the platform has builtins for finding the most/least significant bit, 707 * and which one is right for using on 32 and 64 bit operands */ 708 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0)) 709 # if U32SIZE == INTSIZE 710 # define PERL_CLZ_32 __builtin_clz 711 # endif 712 # if defined(U64TYPE) && U64SIZE == INTSIZE 713 # define PERL_CLZ_64 __builtin_clz 714 # endif 715 #endif 716 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0)) 717 # if U32SIZE == INTSIZE 718 # define PERL_CTZ_32 __builtin_ctz 719 # endif 720 # if defined(U64TYPE) && U64SIZE == INTSIZE 721 # define PERL_CTZ_64 __builtin_ctz 722 # endif 723 #endif 724 725 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0)) 726 # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32) 727 # define PERL_CLZ_32 __builtin_clzl 728 # endif 729 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64) 730 # define PERL_CLZ_64 __builtin_clzl 731 # endif 732 #endif 733 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0)) 734 # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32) 735 # define PERL_CTZ_32 __builtin_ctzl 736 # endif 737 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64) 738 # define PERL_CTZ_64 __builtin_ctzl 739 # endif 740 #endif 741 742 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0)) 743 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32) 744 # define PERL_CLZ_32 __builtin_clzll 745 # endif 746 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64) 747 # define PERL_CLZ_64 __builtin_clzll 748 # endif 749 #endif 750 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0)) 751 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32) 752 # define PERL_CTZ_32 __builtin_ctzll 753 # endif 754 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64) 755 # define PERL_CTZ_64 __builtin_ctzll 756 # endif 757 #endif 758 759 #if defined(_MSC_VER) 760 # include <intrin.h> 761 # pragma intrinsic(_BitScanForward) 762 # pragma intrinsic(_BitScanReverse) 763 # ifdef _WIN64 764 # pragma intrinsic(_BitScanForward64) 765 # pragma intrinsic(_BitScanReverse64) 766 # endif 767 #endif 768 769 /* The reason there are not checks to see if ffs() and ffsl() are available for 770 * determining the lsb, is because these don't improve on the deBruijn method 771 * fallback, which is just a branchless integer multiply, array element 772 * retrieval, and shift. The others, even if the function call overhead is 773 * optimized out, have to cope with the possibility of the input being all 774 * zeroes, and almost certainly will have conditionals for this eventuality. 775 * khw, at the time of this commit, looked at the source for both gcc and clang 776 * to verify this. (gcc used a method inferior to deBruijn.) */ 777 778 /* Below are functions to find the first, last, or only set bit in a word. On 779 * platforms with 64-bit capability, there is a pair for each operation; the 780 * first taking a 64 bit operand, and the second a 32 bit one. The logic is 781 * the same in each pair, so the second is stripped of most comments. */ 782 783 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 784 785 PERL_STATIC_INLINE unsigned 786 Perl_lsbit_pos64(U64 word) 787 { 788 /* Find the position (0..63) of the least significant set bit in the input 789 * word */ 790 791 ASSUME(word != 0); 792 793 /* If we can determine that the platform has a usable fast method to get 794 * this info, use that */ 795 796 # if defined(PERL_CTZ_64) 797 # define PERL_HAS_FAST_GET_LSB_POS64 798 799 return (unsigned) PERL_CTZ_64(word); 800 801 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) 802 # define PERL_HAS_FAST_GET_LSB_POS64 803 804 { 805 unsigned long index; 806 _BitScanForward64(&index, word); 807 return (unsigned)index; 808 } 809 810 # else 811 812 /* Here, we didn't find a fast method for finding the lsb. Fall back to 813 * making the lsb the only set bit in the word, and use our function that 814 * works on words with a single bit set. 815 * 816 * Isolate the lsb; 817 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set 818 * 819 * The word will look like this, with a rightmost set bit in position 's': 820 * ('x's are don't cares, and 'y's are their complements) 821 * s 822 * x..x100..00 823 * y..y011..11 Complement 824 * y..y100..00 Add 1 825 * 0..0100..00 And with the original 826 * 827 * (Yes, complementing and adding 1 is just taking the negative on 2's 828 * complement machines, but not on 1's complement ones, and some compilers 829 * complain about negating an unsigned.) 830 */ 831 return single_1bit_pos64(word & (~word + 1)); 832 833 # endif 834 835 } 836 837 # define lsbit_pos_uintmax_(word) lsbit_pos64(word) 838 #else /* ! QUAD */ 839 # define lsbit_pos_uintmax_(word) lsbit_pos32(word) 840 #endif 841 842 PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */ 843 Perl_lsbit_pos32(U32 word) 844 { 845 /* Find the position (0..31) of the least significant set bit in the input 846 * word */ 847 848 ASSUME(word != 0); 849 850 #if defined(PERL_CTZ_32) 851 # define PERL_HAS_FAST_GET_LSB_POS32 852 853 return (unsigned) PERL_CTZ_32(word); 854 855 #elif U32SIZE == 4 && defined(_MSC_VER) 856 # define PERL_HAS_FAST_GET_LSB_POS32 857 858 { 859 unsigned long index; 860 _BitScanForward(&index, word); 861 return (unsigned)index; 862 } 863 864 #else 865 866 return single_1bit_pos32(word & (~word + 1)); 867 868 #endif 869 870 } 871 872 873 /* Convert the leading zeros count to the bit position of the first set bit. 874 * This just subtracts from the highest position, 31 or 63. But some compilers 875 * don't optimize this optimally, and so a bit of bit twiddling encourages them 876 * to do the right thing. It turns out that subtracting a smaller non-negative 877 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of 878 * the two numbers. To see why, first note that the sum of any number, x, and 879 * its complement, x', is all ones. So all ones minus x is x'. Then note that 880 * the xor of x and all ones is x'. */ 881 #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc)) 882 883 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 884 885 PERL_STATIC_INLINE unsigned 886 Perl_msbit_pos64(U64 word) 887 { 888 /* Find the position (0..63) of the most significant set bit in the input 889 * word */ 890 891 ASSUME(word != 0); 892 893 /* If we can determine that the platform has a usable fast method to get 894 * this, use that */ 895 896 # if defined(PERL_CLZ_64) 897 # define PERL_HAS_FAST_GET_MSB_POS64 898 899 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word)); 900 901 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) 902 # define PERL_HAS_FAST_GET_MSB_POS64 903 904 { 905 unsigned long index; 906 _BitScanReverse64(&index, word); 907 return (unsigned)index; 908 } 909 910 # else 911 912 /* Here, we didn't find a fast method for finding the msb. Fall back to 913 * making the msb the only set bit in the word, and use our function that 914 * works on words with a single bit set. 915 * 916 * Isolate the msb; http://codeforces.com/blog/entry/10330 917 * 918 * Only the most significant set bit matters. Or'ing word with its right 919 * shift of 1 makes that bit and the next one to its right both 1. 920 * Repeating that with the right shift of 2 makes for 4 1-bits in a row. 921 * ... We end with the msb and all to the right being 1. */ 922 word |= (word >> 1); 923 word |= (word >> 2); 924 word |= (word >> 4); 925 word |= (word >> 8); 926 word |= (word >> 16); 927 word |= (word >> 32); 928 929 /* Then subtracting the right shift by 1 clears all but the left-most of 930 * the 1 bits, which is our desired result */ 931 word -= (word >> 1); 932 933 /* Now we have a single bit set */ 934 return single_1bit_pos64(word); 935 936 # endif 937 938 } 939 940 # define msbit_pos_uintmax_(word) msbit_pos64(word) 941 #else /* ! QUAD */ 942 # define msbit_pos_uintmax_(word) msbit_pos32(word) 943 #endif 944 945 PERL_STATIC_INLINE unsigned 946 Perl_msbit_pos32(U32 word) 947 { 948 /* Find the position (0..31) of the most significant set bit in the input 949 * word */ 950 951 ASSUME(word != 0); 952 953 #if defined(PERL_CLZ_32) 954 # define PERL_HAS_FAST_GET_MSB_POS32 955 956 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word)); 957 958 #elif U32SIZE == 4 && defined(_MSC_VER) 959 # define PERL_HAS_FAST_GET_MSB_POS32 960 961 { 962 unsigned long index; 963 _BitScanReverse(&index, word); 964 return (unsigned)index; 965 } 966 967 #else 968 969 word |= (word >> 1); 970 word |= (word >> 2); 971 word |= (word >> 4); 972 word |= (word >> 8); 973 word |= (word >> 16); 974 word -= (word >> 1); 975 return single_1bit_pos32(word); 976 977 #endif 978 979 } 980 981 #if UVSIZE == U64SIZE 982 # define msbit_pos(word) msbit_pos64(word) 983 # define lsbit_pos(word) lsbit_pos64(word) 984 #elif UVSIZE == U32SIZE 985 # define msbit_pos(word) msbit_pos32(word) 986 # define lsbit_pos(word) lsbit_pos32(word) 987 #endif 988 989 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 990 991 PERL_STATIC_INLINE unsigned 992 Perl_single_1bit_pos64(U64 word) 993 { 994 /* Given a 64-bit word known to contain all zero bits except one 1 bit, 995 * find and return the 1's position: 0..63 */ 996 997 # ifdef PERL_CORE /* macro not exported */ 998 ASSUME(isPOWER_OF_2(word)); 999 # else 1000 ASSUME(word && (word & (word-1)) == 0); 1001 # endif 1002 1003 /* The only set bit is both the most and least significant bit. If we have 1004 * a fast way of finding either one, use that. 1005 * 1006 * It may appear at first glance that those functions call this one, but 1007 * they don't if the corresponding #define is set */ 1008 1009 # ifdef PERL_HAS_FAST_GET_MSB_POS64 1010 1011 return msbit_pos64(word); 1012 1013 # elif defined(PERL_HAS_FAST_GET_LSB_POS64) 1014 1015 return lsbit_pos64(word); 1016 1017 # else 1018 1019 /* The position of the only set bit in a word can be quickly calculated 1020 * using deBruijn sequences. See for example 1021 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */ 1022 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_) 1023 >> PERL_deBruijnShift64_]; 1024 # endif 1025 1026 } 1027 1028 #endif 1029 1030 PERL_STATIC_INLINE unsigned 1031 Perl_single_1bit_pos32(U32 word) 1032 { 1033 /* Given a 32-bit word known to contain all zero bits except one 1 bit, 1034 * find and return the 1's position: 0..31 */ 1035 1036 #ifdef PERL_CORE /* macro not exported */ 1037 ASSUME(isPOWER_OF_2(word)); 1038 #else 1039 ASSUME(word && (word & (word-1)) == 0); 1040 #endif 1041 #ifdef PERL_HAS_FAST_GET_MSB_POS32 1042 1043 return msbit_pos32(word); 1044 1045 #elif defined(PERL_HAS_FAST_GET_LSB_POS32) 1046 1047 return lsbit_pos32(word); 1048 1049 /* Unlikely, but possible for the platform to have a wider fast operation but 1050 * not a narrower one. But easy enough to handle the case by widening the 1051 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops 1052 * would be slower than the deBruijn method.) */ 1053 #elif defined(PERL_HAS_FAST_GET_MSB_POS64) 1054 1055 return msbit_pos64(word); 1056 1057 #elif defined(PERL_HAS_FAST_GET_LSB_POS64) 1058 1059 return lsbit_pos64(word); 1060 1061 #else 1062 1063 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_) 1064 >> PERL_deBruijnShift32_]; 1065 #endif 1066 1067 } 1068 1069 #ifndef EBCDIC 1070 1071 PERL_STATIC_INLINE unsigned int 1072 Perl_variant_byte_number(PERL_UINTMAX_T word) 1073 { 1074 /* This returns the position in a word (0..7) of the first variant byte in 1075 * it. This is a helper function. Note that there are no branches */ 1076 1077 /* Get just the msb bits of each byte */ 1078 word &= PERL_VARIANTS_WORD_MASK; 1079 1080 /* This should only be called if we know there is a variant byte in the 1081 * word */ 1082 assert(word); 1083 1084 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 1085 1086 /* Bytes are stored like 1087 * Byte8 ... Byte2 Byte1 1088 * 63..56...15...8 7...0 1089 * so getting the lsb of the whole modified word is getting the msb of the 1090 * first byte that has its msb set */ 1091 word = lsbit_pos_uintmax_(word); 1092 1093 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert 1094 * to 0..7 */ 1095 return (unsigned int) ((word + 1) >> 3) - 1; 1096 1097 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 1098 1099 /* Bytes are stored like 1100 * Byte1 Byte2 ... Byte8 1101 * 63..56 55..47 ... 7...0 1102 * so getting the msb of the whole modified word is getting the msb of the 1103 * first byte that has its msb set */ 1104 word = msbit_pos_uintmax_(word); 1105 1106 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert 1107 * to 0..7 */ 1108 word = ((word + 1) >> 3) - 1; 1109 1110 /* And invert the result because of the reversed byte order on this 1111 * platform */ 1112 word = CHARBITS - word - 1; 1113 1114 return (unsigned int) word; 1115 1116 # else 1117 # error Unexpected byte order 1118 # endif 1119 1120 } 1121 1122 #endif 1123 #if defined(PERL_CORE) || defined(PERL_EXT) 1124 1125 /* 1126 =for apidoc variant_under_utf8_count 1127 1128 This function looks at the sequence of bytes between C<s> and C<e>, which are 1129 assumed to be encoded in ASCII/Latin1, and returns how many of them would 1130 change should the string be translated into UTF-8. Due to the nature of UTF-8, 1131 each of these would occupy two bytes instead of the single one in the input 1132 string. Thus, this function returns the precise number of bytes the string 1133 would expand by when translated to UTF-8. 1134 1135 Unlike most of the other functions that have C<utf8> in their name, the input 1136 to this function is NOT a UTF-8-encoded string. The function name is slightly 1137 I<odd> to emphasize this. 1138 1139 This function is internal to Perl because khw thinks that any XS code that 1140 would want this is probably operating too close to the internals. Presenting a 1141 valid use case could change that. 1142 1143 See also 1144 C<L<perlapi/is_utf8_invariant_string>> 1145 and 1146 C<L<perlapi/is_utf8_invariant_string_loc>>, 1147 1148 =cut 1149 1150 */ 1151 1152 PERL_STATIC_INLINE Size_t 1153 S_variant_under_utf8_count(const U8* const s, const U8* const e) 1154 { 1155 const U8* x = s; 1156 Size_t count = 0; 1157 1158 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; 1159 1160 # ifndef EBCDIC 1161 1162 /* Test if the string is long enough to use word-at-a-time. (Logic is the 1163 * same as for is_utf8_invariant_string()) */ 1164 if ((STRLEN) (e - x) >= PERL_WORDSIZE 1165 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 1166 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 1167 { 1168 1169 /* Process per-byte until reach word boundary. XXX This loop could be 1170 * eliminated if we knew that this platform had fast unaligned reads */ 1171 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 1172 count += ! UTF8_IS_INVARIANT(*x++); 1173 } 1174 1175 /* Process per-word as long as we have at least a full word left */ 1176 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an 1177 explanation of how this works */ 1178 PERL_UINTMAX_T increment 1179 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) 1180 * PERL_COUNT_MULTIPLIER) 1181 >> ((PERL_WORDSIZE - 1) * CHARBITS); 1182 count += (Size_t) increment; 1183 x += PERL_WORDSIZE; 1184 } while (x + PERL_WORDSIZE <= e); 1185 } 1186 1187 # endif 1188 1189 /* Process per-byte */ 1190 while (x < e) { 1191 if (! UTF8_IS_INVARIANT(*x)) { 1192 count++; 1193 } 1194 1195 x++; 1196 } 1197 1198 return count; 1199 } 1200 1201 #endif 1202 1203 #ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */ 1204 # undef PERL_WORDSIZE 1205 # undef PERL_COUNT_MULTIPLIER 1206 # undef PERL_WORD_BOUNDARY_MASK 1207 # undef PERL_VARIANTS_WORD_MASK 1208 #endif 1209 1210 /* 1211 =for apidoc is_utf8_string 1212 1213 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1214 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will 1215 be calculated using C<strlen(s)> (which means if you use this option, that C<s> 1216 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 1217 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 1218 1219 This function considers Perl's extended UTF-8 to be valid. That means that 1220 code points above Unicode, surrogates, and non-character code points are 1221 considered valid by this function. Use C<L</is_strict_utf8_string>>, 1222 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what 1223 code points are considered valid. 1224 1225 See also 1226 C<L</is_utf8_invariant_string>>, 1227 C<L</is_utf8_invariant_string_loc>>, 1228 C<L</is_utf8_string_loc>>, 1229 C<L</is_utf8_string_loclen>>, 1230 C<L</is_utf8_fixed_width_buf_flags>>, 1231 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1232 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1233 1234 =cut 1235 */ 1236 1237 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL) 1238 1239 #if defined(PERL_CORE) || defined (PERL_EXT) 1240 1241 /* 1242 =for apidoc is_utf8_non_invariant_string 1243 1244 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first 1245 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended 1246 UTF-8; otherwise returns FALSE. 1247 1248 A TRUE return means that at least one code point represented by the sequence 1249 either is a wide character not representable as a single byte, or the 1250 representation differs depending on whether the sequence is encoded in UTF-8 or 1251 not. 1252 1253 See also 1254 C<L<perlapi/is_utf8_invariant_string>>, 1255 C<L<perlapi/is_utf8_string>> 1256 1257 =cut 1258 1259 This is commonly used to determine if a SV's UTF-8 flag should be turned on. 1260 It generally needn't be if its string is entirely UTF-8 invariant, and it 1261 shouldn't be if it otherwise contains invalid UTF-8. 1262 1263 It is an internal function because khw thinks that XS code shouldn't be working 1264 at this low a level. A valid use case could change that. 1265 1266 */ 1267 1268 PERL_STATIC_INLINE bool 1269 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len) 1270 { 1271 const U8 * first_variant; 1272 1273 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING; 1274 1275 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1276 return FALSE; 1277 } 1278 1279 return is_utf8_string(first_variant, len - (first_variant - s)); 1280 } 1281 1282 #endif 1283 1284 /* 1285 =for apidoc is_strict_utf8_string 1286 1287 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1288 UTF-8-encoded string that is fully interchangeable by any application using 1289 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be 1290 calculated using C<strlen(s)> (which means if you use this option, that C<s> 1291 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 1292 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 1293 1294 This function returns FALSE for strings containing any 1295 code points above the Unicode max of 0x10FFFF, surrogate code points, or 1296 non-character code points. 1297 1298 See also 1299 C<L</is_utf8_invariant_string>>, 1300 C<L</is_utf8_invariant_string_loc>>, 1301 C<L</is_utf8_string>>, 1302 C<L</is_utf8_string_flags>>, 1303 C<L</is_utf8_string_loc>>, 1304 C<L</is_utf8_string_loc_flags>>, 1305 C<L</is_utf8_string_loclen>>, 1306 C<L</is_utf8_string_loclen_flags>>, 1307 C<L</is_utf8_fixed_width_buf_flags>>, 1308 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1309 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1310 C<L</is_strict_utf8_string_loc>>, 1311 C<L</is_strict_utf8_string_loclen>>, 1312 C<L</is_c9strict_utf8_string>>, 1313 C<L</is_c9strict_utf8_string_loc>>, 1314 and 1315 C<L</is_c9strict_utf8_string_loclen>>. 1316 1317 =cut 1318 */ 1319 1320 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL) 1321 1322 /* 1323 =for apidoc is_c9strict_utf8_string 1324 1325 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1326 UTF-8-encoded string that conforms to 1327 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>; 1328 otherwise it returns FALSE. If C<len> is 0, it will be calculated using 1329 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded 1330 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all 1331 characters being ASCII constitute 'a valid UTF-8 string'. 1332 1333 This function returns FALSE for strings containing any code points above the 1334 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character 1335 code points per 1336 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 1337 1338 See also 1339 C<L</is_utf8_invariant_string>>, 1340 C<L</is_utf8_invariant_string_loc>>, 1341 C<L</is_utf8_string>>, 1342 C<L</is_utf8_string_flags>>, 1343 C<L</is_utf8_string_loc>>, 1344 C<L</is_utf8_string_loc_flags>>, 1345 C<L</is_utf8_string_loclen>>, 1346 C<L</is_utf8_string_loclen_flags>>, 1347 C<L</is_utf8_fixed_width_buf_flags>>, 1348 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1349 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1350 C<L</is_strict_utf8_string>>, 1351 C<L</is_strict_utf8_string_loc>>, 1352 C<L</is_strict_utf8_string_loclen>>, 1353 C<L</is_c9strict_utf8_string_loc>>, 1354 and 1355 C<L</is_c9strict_utf8_string_loclen>>. 1356 1357 =cut 1358 */ 1359 1360 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0) 1361 1362 /* 1363 =for apidoc is_utf8_string_flags 1364 1365 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1366 UTF-8 string, subject to the restrictions imposed by C<flags>; 1367 returns FALSE otherwise. If C<len> is 0, it will be calculated 1368 using C<strlen(s)> (which means if you use this option, that C<s> can't have 1369 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note 1370 that all characters being ASCII constitute 'a valid UTF-8 string'. 1371 1372 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if 1373 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 1374 as C<L</is_strict_utf8_string>>; and if C<flags> is 1375 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as 1376 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any 1377 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by 1378 C<L</utf8n_to_uvchr>>, with the same meanings. 1379 1380 See also 1381 C<L</is_utf8_invariant_string>>, 1382 C<L</is_utf8_invariant_string_loc>>, 1383 C<L</is_utf8_string>>, 1384 C<L</is_utf8_string_loc>>, 1385 C<L</is_utf8_string_loc_flags>>, 1386 C<L</is_utf8_string_loclen>>, 1387 C<L</is_utf8_string_loclen_flags>>, 1388 C<L</is_utf8_fixed_width_buf_flags>>, 1389 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1390 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1391 C<L</is_strict_utf8_string>>, 1392 C<L</is_strict_utf8_string_loc>>, 1393 C<L</is_strict_utf8_string_loclen>>, 1394 C<L</is_c9strict_utf8_string>>, 1395 C<L</is_c9strict_utf8_string_loc>>, 1396 and 1397 C<L</is_c9strict_utf8_string_loclen>>. 1398 1399 =cut 1400 */ 1401 1402 PERL_STATIC_INLINE bool 1403 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) 1404 { 1405 const U8 * first_variant; 1406 1407 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; 1408 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1409 |UTF8_DISALLOW_PERL_EXTENDED))); 1410 1411 if (len == 0) { 1412 len = strlen((const char *)s); 1413 } 1414 1415 if (flags == 0) { 1416 return is_utf8_string(s, len); 1417 } 1418 1419 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1420 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 1421 { 1422 return is_strict_utf8_string(s, len); 1423 } 1424 1425 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1426 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 1427 { 1428 return is_c9strict_utf8_string(s, len); 1429 } 1430 1431 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { 1432 const U8* const send = s + len; 1433 const U8* x = first_variant; 1434 1435 while (x < send) { 1436 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 1437 if (UNLIKELY(! cur_len)) { 1438 return FALSE; 1439 } 1440 x += cur_len; 1441 } 1442 } 1443 1444 return TRUE; 1445 } 1446 1447 /* 1448 1449 =for apidoc is_utf8_string_loc 1450 1451 Like C<L</is_utf8_string>> but stores the location of the failure (in the 1452 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1453 "utf8ness success") in the C<ep> pointer. 1454 1455 See also C<L</is_utf8_string_loclen>>. 1456 1457 =cut 1458 */ 1459 1460 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) 1461 1462 /* 1463 1464 =for apidoc is_utf8_string_loclen 1465 1466 Like C<L</is_utf8_string>> but stores the location of the failure (in the 1467 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1468 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1469 encoded characters in the C<el> pointer. 1470 1471 See also C<L</is_utf8_string_loc>>. 1472 1473 =cut 1474 */ 1475 1476 PERL_STATIC_INLINE bool 1477 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1478 { 1479 const U8 * first_variant; 1480 1481 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; 1482 1483 if (len == 0) { 1484 len = strlen((const char *) s); 1485 } 1486 1487 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1488 if (el) 1489 *el = len; 1490 1491 if (ep) { 1492 *ep = s + len; 1493 } 1494 1495 return TRUE; 1496 } 1497 1498 { 1499 const U8* const send = s + len; 1500 const U8* x = first_variant; 1501 STRLEN outlen = first_variant - s; 1502 1503 while (x < send) { 1504 const STRLEN cur_len = isUTF8_CHAR(x, send); 1505 if (UNLIKELY(! cur_len)) { 1506 break; 1507 } 1508 x += cur_len; 1509 outlen++; 1510 } 1511 1512 if (el) 1513 *el = outlen; 1514 1515 if (ep) { 1516 *ep = x; 1517 } 1518 1519 return (x == send); 1520 } 1521 } 1522 1523 /* The perl core arranges to never call the DFA below without there being at 1524 * least one byte available to look at. This allows the DFA to use a do {} 1525 * while loop which means that calling it with a UTF-8 invariant has a single 1526 * conditional, same as the calling code checking for invariance ahead of time. 1527 * And having the calling code remove that conditional speeds up by that 1528 * conditional, the case where it wasn't invariant. So there's no reason to 1529 * check before caling this. 1530 * 1531 * But we don't know this for non-core calls, so have to retain the check for 1532 * them. */ 1533 #ifdef PERL_CORE 1534 # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s)) 1535 #else 1536 # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE 1537 #endif 1538 1539 /* 1540 * DFA for checking input is valid UTF-8 syntax. 1541 * 1542 * This uses adaptations of the table and algorithm given in 1543 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1544 * documentation of the original version. A copyright notice for the original 1545 * version is given at the beginning of this file. The Perl adapations are 1546 * documented at the definition of PL_extended_utf8_dfa_tab[]. 1547 * 1548 * This dfa is fast. There are three exit conditions: 1549 * 1) a well-formed code point, acceptable to the table 1550 * 2) the beginning bytes of an incomplete character, whose completion might 1551 * or might not be acceptable 1552 * 3) unacceptable to the table. Some of the adaptations have certain, 1553 * hopefully less likely to occur, legal inputs be unacceptable to the 1554 * table, so these must be sorted out afterwards. 1555 * 1556 * This macro is a complete implementation of the code executing the DFA. It 1557 * is passed the input sequence bounds and the table to use, and what to do 1558 * for each of the exit conditions. There are three canned actions, likely to 1559 * be the ones you want: 1560 * DFA_RETURN_SUCCESS_ 1561 * DFA_RETURN_FAILURE_ 1562 * DFA_GOTO_TEASE_APART_FF_ 1563 * 1564 * You pass a parameter giving the action to take for each of the three 1565 * possible exit conditions: 1566 * 1567 * 'accept_action' This is executed when the DFA accepts the input. 1568 * DFA_RETURN_SUCCESS_ is the most likely candidate. 1569 * 'reject_action' This is executed when the DFA rejects the input. 1570 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where 1571 * you have written code to distinguish the rejecting state 1572 * results. Because it happens in several places, and 1573 * involves #ifdefs, the special action 1574 * DFA_GOTO_TEASE_APART_FF_ is what you want with 1575 * PL_extended_utf8_dfa_tab. On platforms without 1576 * EXTRA_LONG_UTF8, there is no need to tease anything apart, 1577 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you 1578 * need to have a label 'tease_apart_FF' that it will transfer 1579 * to. 1580 * 'incomplete_char_action' This is executed when the DFA ran off the end 1581 * before accepting or rejecting the input. 1582 * DFA_RETURN_FAILURE_ is the likely action, but you could 1583 * have a 'goto', or NOOP. In the latter case the DFA drops 1584 * off the end, and you place your code to handle this case 1585 * immediately after it. 1586 */ 1587 1588 #define DFA_RETURN_SUCCESS_ return s - s0 1589 #define DFA_RETURN_FAILURE_ return 0 1590 #ifdef HAS_EXTRA_LONG_UTF8 1591 # define DFA_TEASE_APART_FF_ goto tease_apart_FF 1592 #else 1593 # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_ 1594 #endif 1595 1596 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \ 1597 accept_action, \ 1598 reject_action, \ 1599 incomplete_char_action) \ 1600 STMT_START { \ 1601 const U8 * s = s0; \ 1602 UV state = 0; \ 1603 \ 1604 PERL_NON_CORE_CHECK_EMPTY(s,e); \ 1605 \ 1606 do { \ 1607 state = dfa_tab[256 + state + dfa_tab[*s]]; \ 1608 s++; \ 1609 \ 1610 if (state == 0) { /* Accepting state */ \ 1611 accept_action; \ 1612 } \ 1613 \ 1614 if (UNLIKELY(state == 1)) { /* Rejecting state */ \ 1615 reject_action; \ 1616 } \ 1617 } while (s < e); \ 1618 \ 1619 /* Here, dropped out of loop before end-of-char */ \ 1620 incomplete_char_action; \ 1621 } STMT_END 1622 1623 1624 /* 1625 1626 =for apidoc isUTF8_CHAR 1627 1628 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1629 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 1630 that represents some code point; otherwise it evaluates to 0. If non-zero, the 1631 value gives how many bytes starting at C<s> comprise the code point's 1632 representation. Any bytes remaining before C<e>, but beyond the ones needed to 1633 form the first code point in C<s>, are not examined. 1634 1635 The code point can be any that will fit in an IV on this machine, using Perl's 1636 extension to official UTF-8 to represent those higher than the Unicode maximum 1637 of 0x10FFFF. That means that this macro is used to efficiently decide if the 1638 next few bytes in C<s> is legal UTF-8 for a single character. 1639 1640 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those 1641 defined by Unicode to be fully interchangeable across applications; 1642 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1643 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1644 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1645 1646 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and 1647 C<L</is_utf8_string_loclen>> to check entire strings. 1648 1649 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC 1650 machines) is a valid UTF-8 character. 1651 1652 =cut 1653 1654 This uses an adaptation of the table and algorithm given in 1655 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1656 documentation of the original version. A copyright notice for the original 1657 version is given at the beginning of this file. The Perl adapation is 1658 documented at the definition of PL_extended_utf8_dfa_tab[]. 1659 */ 1660 1661 PERL_STATIC_INLINE Size_t 1662 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) 1663 { 1664 PERL_ARGS_ASSERT_ISUTF8_CHAR; 1665 1666 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 1667 DFA_RETURN_SUCCESS_, 1668 DFA_TEASE_APART_FF_, 1669 DFA_RETURN_FAILURE_); 1670 1671 /* Here, we didn't return success, but dropped out of the loop. In the 1672 * case of PL_extended_utf8_dfa_tab, this means the input is either 1673 * malformed, or the start byte was FF on a platform that the dfa doesn't 1674 * handle FF's. Call a helper function. */ 1675 1676 #ifdef HAS_EXTRA_LONG_UTF8 1677 1678 tease_apart_FF: 1679 1680 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 1681 * either malformed, or was for the largest possible start byte, which we 1682 * now check, not inline */ 1683 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) { 1684 return 0; 1685 } 1686 1687 return is_utf8_FF_helper_(s0, e, 1688 FALSE /* require full, not partial char */ 1689 ); 1690 #endif 1691 1692 } 1693 1694 /* 1695 1696 =for apidoc isSTRICT_UTF8_CHAR 1697 1698 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1699 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1700 Unicode code point completely acceptable for open interchange between all 1701 applications; otherwise it evaluates to 0. If non-zero, the value gives how 1702 many bytes starting at C<s> comprise the code point's representation. Any 1703 bytes remaining before C<e>, but beyond the ones needed to form the first code 1704 point in C<s>, are not examined. 1705 1706 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not 1707 be a surrogate nor a non-character code point. Thus this excludes any code 1708 point from Perl's extended UTF-8. 1709 1710 This is used to efficiently decide if the next few bytes in C<s> is 1711 legal Unicode-acceptable UTF-8 for a single character. 1712 1713 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1714 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1715 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; 1716 and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1717 1718 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and 1719 C<L</is_strict_utf8_string_loclen>> to check entire strings. 1720 1721 =cut 1722 1723 This uses an adaptation of the tables and algorithm given in 1724 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1725 documentation of the original version. A copyright notice for the original 1726 version is given at the beginning of this file. The Perl adapation is 1727 documented at the definition of strict_extended_utf8_dfa_tab[]. 1728 1729 */ 1730 1731 PERL_STATIC_INLINE Size_t 1732 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1733 { 1734 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; 1735 1736 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, 1737 DFA_RETURN_SUCCESS_, 1738 goto check_hanguls, 1739 DFA_RETURN_FAILURE_); 1740 check_hanguls: 1741 1742 /* Here, we didn't return success, but dropped out of the loop. In the 1743 * case of PL_strict_utf8_dfa_tab, this means the input is either 1744 * malformed, or was for certain Hanguls; handle them specially */ 1745 1746 /* The dfa above drops out for incomplete or illegal inputs, and certain 1747 * legal Hanguls; check and return accordingly */ 1748 return is_HANGUL_ED_utf8_safe(s0, e); 1749 } 1750 1751 /* 1752 1753 =for apidoc isC9_STRICT_UTF8_CHAR 1754 1755 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1756 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1757 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, 1758 the value gives how many bytes starting at C<s> comprise the code point's 1759 representation. Any bytes remaining before C<e>, but beyond the ones needed to 1760 form the first code point in C<s>, are not examined. 1761 1762 The largest acceptable code point is the Unicode maximum 0x10FFFF. This 1763 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character 1764 code points. This corresponds to 1765 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 1766 which said that non-character code points are merely discouraged rather than 1767 completely forbidden in open interchange. See 1768 L<perlunicode/Noncharacter code points>. 1769 1770 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and 1771 C<L</isUTF8_CHAR_flags>> for a more customized definition. 1772 1773 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and 1774 C<L</is_c9strict_utf8_string_loclen>> to check entire strings. 1775 1776 =cut 1777 1778 This uses an adaptation of the tables and algorithm given in 1779 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1780 documentation of the original version. A copyright notice for the original 1781 version is given at the beginning of this file. The Perl adapation is 1782 documented at the definition of PL_c9_utf8_dfa_tab[]. 1783 1784 */ 1785 1786 PERL_STATIC_INLINE Size_t 1787 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1788 { 1789 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; 1790 1791 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab, 1792 DFA_RETURN_SUCCESS_, 1793 DFA_RETURN_FAILURE_, 1794 DFA_RETURN_FAILURE_); 1795 } 1796 1797 /* 1798 1799 =for apidoc is_strict_utf8_string_loc 1800 1801 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1802 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1803 "utf8ness success") in the C<ep> pointer. 1804 1805 See also C<L</is_strict_utf8_string_loclen>>. 1806 1807 =cut 1808 */ 1809 1810 #define is_strict_utf8_string_loc(s, len, ep) \ 1811 is_strict_utf8_string_loclen(s, len, ep, 0) 1812 1813 /* 1814 1815 =for apidoc is_strict_utf8_string_loclen 1816 1817 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1818 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1819 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1820 encoded characters in the C<el> pointer. 1821 1822 See also C<L</is_strict_utf8_string_loc>>. 1823 1824 =cut 1825 */ 1826 1827 PERL_STATIC_INLINE bool 1828 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1829 { 1830 const U8 * first_variant; 1831 1832 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; 1833 1834 if (len == 0) { 1835 len = strlen((const char *) s); 1836 } 1837 1838 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1839 if (el) 1840 *el = len; 1841 1842 if (ep) { 1843 *ep = s + len; 1844 } 1845 1846 return TRUE; 1847 } 1848 1849 { 1850 const U8* const send = s + len; 1851 const U8* x = first_variant; 1852 STRLEN outlen = first_variant - s; 1853 1854 while (x < send) { 1855 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); 1856 if (UNLIKELY(! cur_len)) { 1857 break; 1858 } 1859 x += cur_len; 1860 outlen++; 1861 } 1862 1863 if (el) 1864 *el = outlen; 1865 1866 if (ep) { 1867 *ep = x; 1868 } 1869 1870 return (x == send); 1871 } 1872 } 1873 1874 /* 1875 1876 =for apidoc is_c9strict_utf8_string_loc 1877 1878 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1879 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1880 "utf8ness success") in the C<ep> pointer. 1881 1882 See also C<L</is_c9strict_utf8_string_loclen>>. 1883 1884 =cut 1885 */ 1886 1887 #define is_c9strict_utf8_string_loc(s, len, ep) \ 1888 is_c9strict_utf8_string_loclen(s, len, ep, 0) 1889 1890 /* 1891 1892 =for apidoc is_c9strict_utf8_string_loclen 1893 1894 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1895 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1896 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded 1897 characters in the C<el> pointer. 1898 1899 See also C<L</is_c9strict_utf8_string_loc>>. 1900 1901 =cut 1902 */ 1903 1904 PERL_STATIC_INLINE bool 1905 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1906 { 1907 const U8 * first_variant; 1908 1909 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; 1910 1911 if (len == 0) { 1912 len = strlen((const char *) s); 1913 } 1914 1915 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1916 if (el) 1917 *el = len; 1918 1919 if (ep) { 1920 *ep = s + len; 1921 } 1922 1923 return TRUE; 1924 } 1925 1926 { 1927 const U8* const send = s + len; 1928 const U8* x = first_variant; 1929 STRLEN outlen = first_variant - s; 1930 1931 while (x < send) { 1932 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); 1933 if (UNLIKELY(! cur_len)) { 1934 break; 1935 } 1936 x += cur_len; 1937 outlen++; 1938 } 1939 1940 if (el) 1941 *el = outlen; 1942 1943 if (ep) { 1944 *ep = x; 1945 } 1946 1947 return (x == send); 1948 } 1949 } 1950 1951 /* 1952 1953 =for apidoc is_utf8_string_loc_flags 1954 1955 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1956 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1957 "utf8ness success") in the C<ep> pointer. 1958 1959 See also C<L</is_utf8_string_loclen_flags>>. 1960 1961 =cut 1962 */ 1963 1964 #define is_utf8_string_loc_flags(s, len, ep, flags) \ 1965 is_utf8_string_loclen_flags(s, len, ep, 0, flags) 1966 1967 1968 /* The above 3 actual functions could have been moved into the more general one 1969 * just below, and made #defines that call it with the right 'flags'. They are 1970 * currently kept separate to increase their chances of getting inlined */ 1971 1972 /* 1973 1974 =for apidoc is_utf8_string_loclen_flags 1975 1976 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1977 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1978 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1979 encoded characters in the C<el> pointer. 1980 1981 See also C<L</is_utf8_string_loc_flags>>. 1982 1983 =cut 1984 */ 1985 1986 PERL_STATIC_INLINE bool 1987 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) 1988 { 1989 const U8 * first_variant; 1990 1991 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; 1992 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1993 |UTF8_DISALLOW_PERL_EXTENDED))); 1994 1995 if (len == 0) { 1996 len = strlen((const char *) s); 1997 } 1998 1999 if (flags == 0) { 2000 return is_utf8_string_loclen(s, len, ep, el); 2001 } 2002 2003 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 2004 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 2005 { 2006 return is_strict_utf8_string_loclen(s, len, ep, el); 2007 } 2008 2009 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 2010 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 2011 { 2012 return is_c9strict_utf8_string_loclen(s, len, ep, el); 2013 } 2014 2015 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 2016 if (el) 2017 *el = len; 2018 2019 if (ep) { 2020 *ep = s + len; 2021 } 2022 2023 return TRUE; 2024 } 2025 2026 { 2027 const U8* send = s + len; 2028 const U8* x = first_variant; 2029 STRLEN outlen = first_variant - s; 2030 2031 while (x < send) { 2032 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 2033 if (UNLIKELY(! cur_len)) { 2034 break; 2035 } 2036 x += cur_len; 2037 outlen++; 2038 } 2039 2040 if (el) 2041 *el = outlen; 2042 2043 if (ep) { 2044 *ep = x; 2045 } 2046 2047 return (x == send); 2048 } 2049 } 2050 2051 /* 2052 =for apidoc utf8_distance 2053 2054 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 2055 and C<b>. 2056 2057 WARNING: use only if you *know* that the pointers point inside the 2058 same UTF-8 buffer. 2059 2060 =cut 2061 */ 2062 2063 PERL_STATIC_INLINE IV 2064 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 2065 { 2066 PERL_ARGS_ASSERT_UTF8_DISTANCE; 2067 2068 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 2069 } 2070 2071 /* 2072 =for apidoc utf8_hop 2073 2074 Return the UTF-8 pointer C<s> displaced by C<off> characters, either 2075 forward or backward. 2076 2077 WARNING: do not use the following unless you *know* C<off> is within 2078 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned 2079 on the first byte of character or just after the last byte of a character. 2080 2081 =cut 2082 */ 2083 2084 PERL_STATIC_INLINE U8 * 2085 Perl_utf8_hop(const U8 *s, SSize_t off) 2086 { 2087 PERL_ARGS_ASSERT_UTF8_HOP; 2088 2089 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2090 * the bitops (especially ~) can create illegal UTF-8. 2091 * In other words: in Perl UTF-8 is not just for Unicode. */ 2092 2093 if (off >= 0) { 2094 while (off--) 2095 s += UTF8SKIP(s); 2096 } 2097 else { 2098 while (off++) { 2099 s--; 2100 while (UTF8_IS_CONTINUATION(*s)) 2101 s--; 2102 } 2103 } 2104 GCC_DIAG_IGNORE(-Wcast-qual) 2105 return (U8 *)s; 2106 GCC_DIAG_RESTORE 2107 } 2108 2109 /* 2110 =for apidoc utf8_hop_forward 2111 2112 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2113 forward. 2114 2115 C<off> must be non-negative. 2116 2117 C<s> must be before or equal to C<end>. 2118 2119 When moving forward it will not move beyond C<end>. 2120 2121 Will not exceed this limit even if the string is not valid "UTF-8". 2122 2123 =cut 2124 */ 2125 2126 PERL_STATIC_INLINE U8 * 2127 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) 2128 { 2129 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD; 2130 2131 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2132 * the bitops (especially ~) can create illegal UTF-8. 2133 * In other words: in Perl UTF-8 is not just for Unicode. */ 2134 2135 assert(s <= end); 2136 assert(off >= 0); 2137 2138 while (off--) { 2139 STRLEN skip = UTF8SKIP(s); 2140 if ((STRLEN)(end - s) <= skip) { 2141 GCC_DIAG_IGNORE(-Wcast-qual) 2142 return (U8 *)end; 2143 GCC_DIAG_RESTORE 2144 } 2145 s += skip; 2146 } 2147 2148 GCC_DIAG_IGNORE(-Wcast-qual) 2149 return (U8 *)s; 2150 GCC_DIAG_RESTORE 2151 } 2152 2153 /* 2154 =for apidoc utf8_hop_back 2155 2156 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2157 backward. 2158 2159 C<off> must be non-positive. 2160 2161 C<s> must be after or equal to C<start>. 2162 2163 When moving backward it will not move before C<start>. 2164 2165 Will not exceed this limit even if the string is not valid "UTF-8". 2166 2167 =cut 2168 */ 2169 2170 PERL_STATIC_INLINE U8 * 2171 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) 2172 { 2173 PERL_ARGS_ASSERT_UTF8_HOP_BACK; 2174 2175 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2176 * the bitops (especially ~) can create illegal UTF-8. 2177 * In other words: in Perl UTF-8 is not just for Unicode. */ 2178 2179 assert(start <= s); 2180 assert(off <= 0); 2181 2182 while (off++ && s > start) { 2183 do { 2184 s--; 2185 } while (UTF8_IS_CONTINUATION(*s) && s > start); 2186 } 2187 2188 GCC_DIAG_IGNORE(-Wcast-qual) 2189 return (U8 *)s; 2190 GCC_DIAG_RESTORE 2191 } 2192 2193 /* 2194 =for apidoc utf8_hop_safe 2195 2196 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2197 either forward or backward. 2198 2199 When moving backward it will not move before C<start>. 2200 2201 When moving forward it will not move beyond C<end>. 2202 2203 Will not exceed those limits even if the string is not valid "UTF-8". 2204 2205 =cut 2206 */ 2207 2208 PERL_STATIC_INLINE U8 * 2209 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) 2210 { 2211 PERL_ARGS_ASSERT_UTF8_HOP_SAFE; 2212 2213 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2214 * the bitops (especially ~) can create illegal UTF-8. 2215 * In other words: in Perl UTF-8 is not just for Unicode. */ 2216 2217 assert(start <= s && s <= end); 2218 2219 if (off >= 0) { 2220 return utf8_hop_forward(s, off, end); 2221 } 2222 else { 2223 return utf8_hop_back(s, off, start); 2224 } 2225 } 2226 2227 /* 2228 2229 =for apidoc isUTF8_CHAR_flags 2230 2231 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 2232 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 2233 that represents some code point, subject to the restrictions given by C<flags>; 2234 otherwise it evaluates to 0. If non-zero, the value gives how many bytes 2235 starting at C<s> comprise the code point's representation. Any bytes remaining 2236 before C<e>, but beyond the ones needed to form the first code point in C<s>, 2237 are not examined. 2238 2239 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>; 2240 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 2241 as C<L</isSTRICT_UTF8_CHAR>>; 2242 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives 2243 the same results as C<L</isC9_STRICT_UTF8_CHAR>>. 2244 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags 2245 understood by C<L</utf8n_to_uvchr>>, with the same meanings. 2246 2247 The three alternative macros are for the most commonly needed validations; they 2248 are likely to run somewhat faster than this more general one, as they can be 2249 inlined into your code. 2250 2251 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and 2252 L</is_utf8_string_loclen_flags> to check entire strings. 2253 2254 =cut 2255 */ 2256 2257 PERL_STATIC_INLINE STRLEN 2258 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags) 2259 { 2260 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS; 2261 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2262 |UTF8_DISALLOW_PERL_EXTENDED))); 2263 2264 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 2265 goto check_success, 2266 DFA_TEASE_APART_FF_, 2267 DFA_RETURN_FAILURE_); 2268 2269 check_success: 2270 2271 return is_utf8_char_helper_(s0, e, flags); 2272 2273 #ifdef HAS_EXTRA_LONG_UTF8 2274 2275 tease_apart_FF: 2276 2277 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 2278 * either malformed, or was for the largest possible start byte, which 2279 * indicates perl extended UTF-8, well above the Unicode maximum */ 2280 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 2281 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 2282 { 2283 return 0; 2284 } 2285 2286 /* Otherwise examine the sequence not inline */ 2287 return is_utf8_FF_helper_(s0, e, 2288 FALSE /* require full, not partial char */ 2289 ); 2290 #endif 2291 2292 } 2293 2294 /* 2295 2296 =for apidoc is_utf8_valid_partial_char 2297 2298 Returns 0 if the sequence of bytes starting at C<s> and looking no further than 2299 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code 2300 points. Otherwise, it returns 1 if there exists at least one non-empty 2301 sequence of bytes that when appended to sequence C<s>, starting at position 2302 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point; 2303 otherwise returns 0. 2304 2305 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code 2306 point. 2307 2308 This is useful when a fixed-length buffer is being tested for being well-formed 2309 UTF-8, but the final few bytes in it don't comprise a full character; that is, 2310 it is split somewhere in the middle of the final code point's UTF-8 2311 representation. (Presumably when the buffer is refreshed with the next chunk 2312 of data, the new first bytes will complete the partial code point.) This 2313 function is used to verify that the final bytes in the current buffer are in 2314 fact the legal beginning of some code point, so that if they aren't, the 2315 failure can be signalled without having to wait for the next read. 2316 2317 =cut 2318 */ 2319 #define is_utf8_valid_partial_char(s, e) \ 2320 is_utf8_valid_partial_char_flags(s, e, 0) 2321 2322 /* 2323 2324 =for apidoc is_utf8_valid_partial_char_flags 2325 2326 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether 2327 or not the input is a valid UTF-8 encoded partial character, but it takes an 2328 extra parameter, C<flags>, which can further restrict which code points are 2329 considered valid. 2330 2331 If C<flags> is 0, this behaves identically to 2332 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination 2333 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If 2334 there is any sequence of bytes that can complete the input partial character in 2335 such a way that a non-prohibited character is formed, the function returns 2336 TRUE; otherwise FALSE. Non character code points cannot be determined based on 2337 partial character input. But many of the other possible excluded types can be 2338 determined from just the first one or two bytes. 2339 2340 =cut 2341 */ 2342 2343 PERL_STATIC_INLINE bool 2344 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags) 2345 { 2346 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; 2347 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2348 |UTF8_DISALLOW_PERL_EXTENDED))); 2349 2350 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 2351 DFA_RETURN_FAILURE_, 2352 DFA_TEASE_APART_FF_, 2353 NOOP); 2354 2355 /* The NOOP above causes the DFA to drop down here iff the input was a 2356 * partial character. flags=0 => can return TRUE immediately; otherwise we 2357 * need to check (not inline) if the partial character is the beginning of 2358 * a disallowed one */ 2359 if (flags == 0) { 2360 return TRUE; 2361 } 2362 2363 return cBOOL(is_utf8_char_helper_(s0, e, flags)); 2364 2365 #ifdef HAS_EXTRA_LONG_UTF8 2366 2367 tease_apart_FF: 2368 2369 /* Getting here means the input is either malformed, or, in the case of 2370 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The 2371 * latter case has to be extended UTF-8, so can fail immediately if that is 2372 * forbidden */ 2373 2374 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 2375 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 2376 { 2377 return 0; 2378 } 2379 2380 return is_utf8_FF_helper_(s0, e, 2381 TRUE /* Require to be a partial character */ 2382 ); 2383 #endif 2384 2385 } 2386 2387 /* 2388 2389 =for apidoc is_utf8_fixed_width_buf_flags 2390 2391 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len> 2392 is entirely valid UTF-8, subject to the restrictions given by C<flags>; 2393 otherwise it returns FALSE. 2394 2395 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted 2396 without restriction. If the final few bytes of the buffer do not form a 2397 complete code point, this will return TRUE anyway, provided that 2398 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them. 2399 2400 If C<flags> in non-zero, it can be any combination of the 2401 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the 2402 same meanings. 2403 2404 This function differs from C<L</is_utf8_string_flags>> only in that the latter 2405 returns FALSE if the final few bytes of the string don't form a complete code 2406 point. 2407 2408 =cut 2409 */ 2410 #define is_utf8_fixed_width_buf_flags(s, len, flags) \ 2411 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags) 2412 2413 /* 2414 2415 =for apidoc is_utf8_fixed_width_buf_loc_flags 2416 2417 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the 2418 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point 2419 to the beginning of any partial character at the end of the buffer; if there is 2420 no partial character C<*ep> will contain C<s>+C<len>. 2421 2422 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>. 2423 2424 =cut 2425 */ 2426 2427 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \ 2428 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags) 2429 2430 /* 2431 2432 =for apidoc is_utf8_fixed_width_buf_loclen_flags 2433 2434 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of 2435 complete, valid characters found in the C<el> pointer. 2436 2437 =cut 2438 */ 2439 2440 PERL_STATIC_INLINE bool 2441 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, 2442 STRLEN len, 2443 const U8 **ep, 2444 STRLEN *el, 2445 const U32 flags) 2446 { 2447 const U8 * maybe_partial; 2448 2449 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS; 2450 2451 if (! ep) { 2452 ep = &maybe_partial; 2453 } 2454 2455 /* If it's entirely valid, return that; otherwise see if the only error is 2456 * that the final few bytes are for a partial character */ 2457 return is_utf8_string_loclen_flags(s, len, ep, el, flags) 2458 || is_utf8_valid_partial_char_flags(*ep, s + len, flags); 2459 } 2460 2461 PERL_STATIC_INLINE UV 2462 Perl_utf8n_to_uvchr_msgs(const U8 *s, 2463 STRLEN curlen, 2464 STRLEN *retlen, 2465 const U32 flags, 2466 U32 * errors, 2467 AV ** msgs) 2468 { 2469 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the 2470 * simple cases, and, if necessary calls a helper function to deal with the 2471 * more complex ones. Almost all well-formed non-problematic code points 2472 * are considered simple, so that it's unlikely that the helper function 2473 * will need to be called. 2474 * 2475 * This is an adaptation of the tables and algorithm given in 2476 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides 2477 * comprehensive documentation of the original version. A copyright notice 2478 * for the original version is given at the beginning of this file. The 2479 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[]. 2480 */ 2481 2482 const U8 * const s0 = s; 2483 const U8 * send = s0 + curlen; 2484 UV type; 2485 UV uv; 2486 2487 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; 2488 2489 /* This dfa is fast. If it accepts the input, it was for a well-formed, 2490 * non-problematic code point, which can be returned immediately. 2491 * Otherwise we call a helper function to figure out the more complicated 2492 * cases. */ 2493 2494 /* No calls from core pass in an empty string; non-core need a check */ 2495 #ifdef PERL_CORE 2496 assert(curlen > 0); 2497 #else 2498 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen, 2499 flags, errors, msgs); 2500 #endif 2501 2502 type = PL_strict_utf8_dfa_tab[*s]; 2503 2504 /* The table is structured so that 'type' is 0 iff the input byte is 2505 * represented identically regardless of the UTF-8ness of the string */ 2506 if (type == 0) { /* UTF-8 invariants are returned unchanged */ 2507 uv = *s; 2508 } 2509 else { 2510 UV state = PL_strict_utf8_dfa_tab[256 + type]; 2511 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s); 2512 2513 while (++s < send) { 2514 type = PL_strict_utf8_dfa_tab[*s]; 2515 state = PL_strict_utf8_dfa_tab[256 + state + type]; 2516 2517 uv = UTF8_ACCUMULATE(uv, *s); 2518 2519 if (state == 0) { 2520 #ifdef EBCDIC 2521 uv = UNI_TO_NATIVE(uv); 2522 #endif 2523 goto success; 2524 } 2525 2526 if (UNLIKELY(state == 1)) { 2527 break; 2528 } 2529 } 2530 2531 /* Here is potentially problematic. Use the full mechanism */ 2532 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, 2533 errors, msgs); 2534 } 2535 2536 success: 2537 if (retlen) { 2538 *retlen = s - s0 + 1; 2539 } 2540 if (errors) { 2541 *errors = 0; 2542 } 2543 if (msgs) { 2544 *msgs = NULL; 2545 } 2546 2547 return uv; 2548 } 2549 2550 PERL_STATIC_INLINE UV 2551 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 2552 { 2553 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; 2554 2555 assert(s < send); 2556 2557 if (! ckWARN_d(WARN_UTF8)) { 2558 2559 /* EMPTY is not really allowed, and asserts on debugging builds. But 2560 * on non-debugging we have to deal with it, and this causes it to 2561 * return the REPLACEMENT CHARACTER, as the documentation indicates */ 2562 return utf8n_to_uvchr(s, send - s, retlen, 2563 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); 2564 } 2565 else { 2566 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); 2567 if (retlen && ret == 0 && (send <= s || *s != '\0')) { 2568 *retlen = (STRLEN) -1; 2569 } 2570 2571 return ret; 2572 } 2573 } 2574 2575 /* ------------------------------- perl.h ----------------------------- */ 2576 2577 /* 2578 =for apidoc_section $utility 2579 2580 =for apidoc is_safe_syscall 2581 2582 Test that the given C<pv> (with length C<len>) doesn't contain any internal 2583 C<NUL> characters. 2584 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls> 2585 category, and return FALSE. 2586 2587 Return TRUE if the name is safe. 2588 2589 C<what> and C<op_name> are used in any warning. 2590 2591 Used by the C<IS_SAFE_SYSCALL()> macro. 2592 2593 =cut 2594 */ 2595 2596 PERL_STATIC_INLINE bool 2597 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) 2598 { 2599 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs 2600 * perl itself uses xce*() functions which accept 8-bit strings. 2601 */ 2602 2603 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL; 2604 2605 if (len > 1) { 2606 char *null_at; 2607 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) { 2608 SETERRNO(ENOENT, LIB_INVARG); 2609 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS), 2610 "Invalid \\0 character in %s for %s: %s\\0%s", 2611 what, op_name, pv, null_at+1); 2612 return FALSE; 2613 } 2614 } 2615 2616 return TRUE; 2617 } 2618 2619 /* 2620 2621 Return true if the supplied filename has a newline character 2622 immediately before the first (hopefully only) NUL. 2623 2624 My original look at this incorrectly used the len from SvPV(), but 2625 that's incorrect, since we allow for a NUL in pv[len-1]. 2626 2627 So instead, strlen() and work from there. 2628 2629 This allow for the user reading a filename, forgetting to chomp it, 2630 then calling: 2631 2632 open my $foo, "$file\0"; 2633 2634 */ 2635 2636 #ifdef PERL_CORE 2637 2638 PERL_STATIC_INLINE bool 2639 S_should_warn_nl(const char *pv) 2640 { 2641 STRLEN len; 2642 2643 PERL_ARGS_ASSERT_SHOULD_WARN_NL; 2644 2645 len = strlen(pv); 2646 2647 return len > 0 && pv[len-1] == '\n'; 2648 } 2649 2650 #endif 2651 2652 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) 2653 2654 PERL_STATIC_INLINE bool 2655 S_lossless_NV_to_IV(const NV nv, IV *ivp) 2656 { 2657 /* This function determines if the input NV 'nv' may be converted without 2658 * loss of data to an IV. If not, it returns FALSE taking no other action. 2659 * But if it is possible, it does the conversion, returning TRUE, and 2660 * storing the converted result in '*ivp' */ 2661 2662 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; 2663 2664 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) 2665 /* Normally any comparison with a NaN returns false; if we can't rely 2666 * on that behaviour, check explicitly */ 2667 if (UNLIKELY(Perl_isnan(nv))) { 2668 return FALSE; 2669 } 2670 # endif 2671 2672 /* Written this way so that with an always-false NaN comparison we 2673 * return false */ 2674 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) { 2675 return FALSE; 2676 } 2677 2678 if ((IV) nv != nv) { 2679 return FALSE; 2680 } 2681 2682 *ivp = (IV) nv; 2683 return TRUE; 2684 } 2685 2686 #endif 2687 2688 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ 2689 2690 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) 2691 2692 #define MAX_CHARSET_NAME_LENGTH 2 2693 2694 PERL_STATIC_INLINE const char * 2695 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp) 2696 { 2697 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME; 2698 2699 /* Returns a string that corresponds to the name of the regex character set 2700 * given by 'flags', and *lenp is set the length of that string, which 2701 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ 2702 2703 *lenp = 1; 2704 switch (get_regex_charset(flags)) { 2705 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS; 2706 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; 2707 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; 2708 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; 2709 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 2710 *lenp = 2; 2711 return ASCII_MORE_RESTRICT_PAT_MODS; 2712 } 2713 /* The NOT_REACHED; hides an assert() which has a rather complex 2714 * definition in perl.h. */ 2715 NOT_REACHED; /* NOTREACHED */ 2716 return "?"; /* Unknown */ 2717 } 2718 2719 #endif 2720 2721 /* 2722 2723 Return false if any get magic is on the SV other than taint magic. 2724 2725 */ 2726 2727 PERL_STATIC_INLINE bool 2728 Perl_sv_only_taint_gmagic(SV *sv) 2729 { 2730 MAGIC *mg = SvMAGIC(sv); 2731 2732 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC; 2733 2734 while (mg) { 2735 if (mg->mg_type != PERL_MAGIC_taint 2736 && !(mg->mg_flags & MGf_GSKIP) 2737 && mg->mg_virtual->svt_get) { 2738 return FALSE; 2739 } 2740 mg = mg->mg_moremagic; 2741 } 2742 2743 return TRUE; 2744 } 2745 2746 /* ------------------ cop.h ------------------------------------------- */ 2747 2748 /* implement GIMME_V() macro */ 2749 2750 PERL_STATIC_INLINE U8 2751 Perl_gimme_V(pTHX) 2752 { 2753 I32 cxix; 2754 U8 gimme = (PL_op->op_flags & OPf_WANT); 2755 2756 if (gimme) 2757 return gimme; 2758 cxix = PL_curstackinfo->si_cxsubix; 2759 if (cxix < 0) 2760 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID; 2761 assert(cxstack[cxix].blk_gimme & G_WANT); 2762 return (cxstack[cxix].blk_gimme & G_WANT); 2763 } 2764 2765 2766 /* Enter a block. Push a new base context and return its address. */ 2767 2768 PERL_STATIC_INLINE PERL_CONTEXT * 2769 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) 2770 { 2771 PERL_CONTEXT * cx; 2772 2773 PERL_ARGS_ASSERT_CX_PUSHBLOCK; 2774 2775 CXINC; 2776 cx = CX_CUR(); 2777 cx->cx_type = type; 2778 cx->blk_gimme = gimme; 2779 cx->blk_oldsaveix = saveix; 2780 cx->blk_oldsp = (I32)(sp - PL_stack_base); 2781 cx->blk_oldcop = PL_curcop; 2782 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack); 2783 cx->blk_oldscopesp = PL_scopestack_ix; 2784 cx->blk_oldpm = PL_curpm; 2785 cx->blk_old_tmpsfloor = PL_tmps_floor; 2786 2787 PL_tmps_floor = PL_tmps_ix; 2788 CX_DEBUG(cx, "PUSH"); 2789 return cx; 2790 } 2791 2792 2793 /* Exit a block (RETURN and LAST). */ 2794 2795 PERL_STATIC_INLINE void 2796 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx) 2797 { 2798 PERL_ARGS_ASSERT_CX_POPBLOCK; 2799 2800 CX_DEBUG(cx, "POP"); 2801 /* these 3 are common to cx_popblock and cx_topblock */ 2802 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2803 PL_scopestack_ix = cx->blk_oldscopesp; 2804 PL_curpm = cx->blk_oldpm; 2805 2806 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats 2807 * and leaves a CX entry lying around for repeated use, so 2808 * skip for multicall */ \ 2809 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx)) 2810 || PL_savestack_ix == cx->blk_oldsaveix); 2811 PL_curcop = cx->blk_oldcop; 2812 PL_tmps_floor = cx->blk_old_tmpsfloor; 2813 } 2814 2815 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO). 2816 * Whereas cx_popblock() restores the state to the point just before 2817 * cx_pushblock() was called, cx_topblock() restores it to the point just 2818 * *after* cx_pushblock() was called. */ 2819 2820 PERL_STATIC_INLINE void 2821 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx) 2822 { 2823 PERL_ARGS_ASSERT_CX_TOPBLOCK; 2824 2825 CX_DEBUG(cx, "TOP"); 2826 /* these 3 are common to cx_popblock and cx_topblock */ 2827 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2828 PL_scopestack_ix = cx->blk_oldscopesp; 2829 PL_curpm = cx->blk_oldpm; 2830 2831 PL_stack_sp = PL_stack_base + cx->blk_oldsp; 2832 } 2833 2834 2835 PERL_STATIC_INLINE void 2836 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) 2837 { 2838 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub); 2839 2840 PERL_ARGS_ASSERT_CX_PUSHSUB; 2841 2842 PERL_DTRACE_PROBE_ENTRY(cv); 2843 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix; 2844 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack; 2845 cx->blk_sub.cv = cv; 2846 cx->blk_sub.olddepth = CvDEPTH(cv); 2847 cx->blk_sub.prevcomppad = PL_comppad; 2848 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0; 2849 cx->blk_sub.retop = retop; 2850 SvREFCNT_inc_simple_void_NN(cv); 2851 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF); 2852 } 2853 2854 2855 /* subsets of cx_popsub() */ 2856 2857 PERL_STATIC_INLINE void 2858 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) 2859 { 2860 CV *cv; 2861 2862 PERL_ARGS_ASSERT_CX_POPSUB_COMMON; 2863 assert(CxTYPE(cx) == CXt_SUB); 2864 2865 PL_comppad = cx->blk_sub.prevcomppad; 2866 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2867 cv = cx->blk_sub.cv; 2868 CvDEPTH(cv) = cx->blk_sub.olddepth; 2869 cx->blk_sub.cv = NULL; 2870 SvREFCNT_dec(cv); 2871 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix; 2872 } 2873 2874 2875 /* handle the @_ part of leaving a sub */ 2876 2877 PERL_STATIC_INLINE void 2878 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) 2879 { 2880 AV *av; 2881 2882 PERL_ARGS_ASSERT_CX_POPSUB_ARGS; 2883 assert(CxTYPE(cx) == CXt_SUB); 2884 assert(AvARRAY(MUTABLE_AV( 2885 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[ 2886 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad); 2887 2888 CX_POP_SAVEARRAY(cx); 2889 av = MUTABLE_AV(PAD_SVl(0)); 2890 if (UNLIKELY(AvREAL(av))) 2891 /* abandon @_ if it got reified */ 2892 clear_defarray(av, 0); 2893 else { 2894 CLEAR_ARGARRAY(av); 2895 } 2896 } 2897 2898 2899 PERL_STATIC_INLINE void 2900 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx) 2901 { 2902 PERL_ARGS_ASSERT_CX_POPSUB; 2903 assert(CxTYPE(cx) == CXt_SUB); 2904 2905 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv); 2906 2907 if (CxHASARGS(cx)) 2908 cx_popsub_args(cx); 2909 cx_popsub_common(cx); 2910 } 2911 2912 2913 PERL_STATIC_INLINE void 2914 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) 2915 { 2916 PERL_ARGS_ASSERT_CX_PUSHFORMAT; 2917 2918 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix; 2919 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack; 2920 cx->blk_format.cv = cv; 2921 cx->blk_format.retop = retop; 2922 cx->blk_format.gv = gv; 2923 cx->blk_format.dfoutgv = PL_defoutgv; 2924 cx->blk_format.prevcomppad = PL_comppad; 2925 cx->blk_u16 = 0; 2926 2927 SvREFCNT_inc_simple_void_NN(cv); 2928 CvDEPTH(cv)++; 2929 SvREFCNT_inc_void(cx->blk_format.dfoutgv); 2930 } 2931 2932 2933 PERL_STATIC_INLINE void 2934 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx) 2935 { 2936 CV *cv; 2937 GV *dfout; 2938 2939 PERL_ARGS_ASSERT_CX_POPFORMAT; 2940 assert(CxTYPE(cx) == CXt_FORMAT); 2941 2942 dfout = cx->blk_format.dfoutgv; 2943 setdefout(dfout); 2944 cx->blk_format.dfoutgv = NULL; 2945 SvREFCNT_dec_NN(dfout); 2946 2947 PL_comppad = cx->blk_format.prevcomppad; 2948 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2949 cv = cx->blk_format.cv; 2950 cx->blk_format.cv = NULL; 2951 --CvDEPTH(cv); 2952 SvREFCNT_dec_NN(cv); 2953 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix; 2954 } 2955 2956 2957 PERL_STATIC_INLINE void 2958 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 2959 { 2960 cx->blk_eval.retop = retop; 2961 cx->blk_eval.old_namesv = namesv; 2962 cx->blk_eval.old_eval_root = PL_eval_root; 2963 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL; 2964 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */ 2965 cx->blk_eval.cur_top_env = PL_top_env; 2966 2967 assert(!(PL_in_eval & ~ 0x3F)); 2968 assert(!(PL_op->op_type & ~0x1FF)); 2969 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7); 2970 } 2971 2972 PERL_STATIC_INLINE void 2973 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 2974 { 2975 PERL_ARGS_ASSERT_CX_PUSHEVAL; 2976 2977 Perl_push_evalortry_common(aTHX_ cx, retop, namesv); 2978 2979 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 2980 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack; 2981 } 2982 2983 PERL_STATIC_INLINE void 2984 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop) 2985 { 2986 PERL_ARGS_ASSERT_CX_PUSHTRY; 2987 2988 Perl_push_evalortry_common(aTHX_ cx, retop, NULL); 2989 2990 /* Don't actually change it, just store the current value so it's restored 2991 * by the common popeval */ 2992 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 2993 } 2994 2995 2996 PERL_STATIC_INLINE void 2997 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx) 2998 { 2999 SV *sv; 3000 3001 PERL_ARGS_ASSERT_CX_POPEVAL; 3002 assert(CxTYPE(cx) == CXt_EVAL); 3003 3004 PL_in_eval = CxOLD_IN_EVAL(cx); 3005 assert(!(PL_in_eval & 0xc0)); 3006 PL_eval_root = cx->blk_eval.old_eval_root; 3007 sv = cx->blk_eval.cur_text; 3008 if (sv && CxEVAL_TXT_REFCNTED(cx)) { 3009 cx->blk_eval.cur_text = NULL; 3010 SvREFCNT_dec_NN(sv); 3011 } 3012 3013 sv = cx->blk_eval.old_namesv; 3014 if (sv) { 3015 cx->blk_eval.old_namesv = NULL; 3016 SvREFCNT_dec_NN(sv); 3017 } 3018 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix; 3019 } 3020 3021 3022 /* push a plain loop, i.e. 3023 * { block } 3024 * while (cond) { block } 3025 * for (init;cond;continue) { block } 3026 * This loop can be last/redo'ed etc. 3027 */ 3028 3029 PERL_STATIC_INLINE void 3030 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) 3031 { 3032 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN; 3033 cx->blk_loop.my_op = cLOOP; 3034 } 3035 3036 3037 /* push a true for loop, i.e. 3038 * for var (list) { block } 3039 */ 3040 3041 PERL_STATIC_INLINE void 3042 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) 3043 { 3044 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR; 3045 3046 /* this one line is common with cx_pushloop_plain */ 3047 cx->blk_loop.my_op = cLOOP; 3048 3049 cx->blk_loop.itervar_u.svp = (SV**)itervarp; 3050 cx->blk_loop.itersave = itersave; 3051 #ifdef USE_ITHREADS 3052 cx->blk_loop.oldcomppad = PL_comppad; 3053 #endif 3054 } 3055 3056 3057 /* pop all loop types, including plain */ 3058 3059 PERL_STATIC_INLINE void 3060 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx) 3061 { 3062 PERL_ARGS_ASSERT_CX_POPLOOP; 3063 3064 assert(CxTYPE_is_LOOP(cx)); 3065 if ( CxTYPE(cx) == CXt_LOOP_ARY 3066 || CxTYPE(cx) == CXt_LOOP_LAZYSV) 3067 { 3068 /* Free ary or cur. This assumes that state_u.ary.ary 3069 * aligns with state_u.lazysv.cur. See cx_dup() */ 3070 SV *sv = cx->blk_loop.state_u.lazysv.cur; 3071 cx->blk_loop.state_u.lazysv.cur = NULL; 3072 SvREFCNT_dec_NN(sv); 3073 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) { 3074 sv = cx->blk_loop.state_u.lazysv.end; 3075 cx->blk_loop.state_u.lazysv.end = NULL; 3076 SvREFCNT_dec_NN(sv); 3077 } 3078 } 3079 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) { 3080 SV *cursv; 3081 SV **svp = (cx)->blk_loop.itervar_u.svp; 3082 if ((cx->cx_type & CXp_FOR_GV)) 3083 svp = &GvSV((GV*)svp); 3084 cursv = *svp; 3085 *svp = cx->blk_loop.itersave; 3086 cx->blk_loop.itersave = NULL; 3087 SvREFCNT_dec(cursv); 3088 } 3089 } 3090 3091 3092 PERL_STATIC_INLINE void 3093 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) 3094 { 3095 PERL_ARGS_ASSERT_CX_PUSHWHEN; 3096 3097 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3098 } 3099 3100 3101 PERL_STATIC_INLINE void 3102 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx) 3103 { 3104 PERL_ARGS_ASSERT_CX_POPWHEN; 3105 assert(CxTYPE(cx) == CXt_WHEN); 3106 3107 PERL_UNUSED_ARG(cx); 3108 PERL_UNUSED_CONTEXT; 3109 /* currently NOOP */ 3110 } 3111 3112 3113 PERL_STATIC_INLINE void 3114 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) 3115 { 3116 PERL_ARGS_ASSERT_CX_PUSHGIVEN; 3117 3118 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3119 cx->blk_givwhen.defsv_save = orig_defsv; 3120 } 3121 3122 3123 PERL_STATIC_INLINE void 3124 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx) 3125 { 3126 SV *sv; 3127 3128 PERL_ARGS_ASSERT_CX_POPGIVEN; 3129 assert(CxTYPE(cx) == CXt_GIVEN); 3130 3131 sv = GvSV(PL_defgv); 3132 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save; 3133 cx->blk_givwhen.defsv_save = NULL; 3134 SvREFCNT_dec(sv); 3135 } 3136 3137 /* ------------------ util.h ------------------------------------------- */ 3138 3139 /* 3140 =for apidoc_section $string 3141 3142 =for apidoc foldEQ 3143 3144 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 3145 same 3146 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes 3147 match themselves and their opposite case counterparts. Non-cased and non-ASCII 3148 range bytes match only themselves. 3149 3150 =cut 3151 */ 3152 3153 PERL_STATIC_INLINE I32 3154 Perl_foldEQ(const char *s1, const char *s2, I32 len) 3155 { 3156 const U8 *a = (const U8 *)s1; 3157 const U8 *b = (const U8 *)s2; 3158 3159 PERL_ARGS_ASSERT_FOLDEQ; 3160 3161 assert(len >= 0); 3162 3163 while (len--) { 3164 if (*a != *b && *a != PL_fold[*b]) 3165 return 0; 3166 a++,b++; 3167 } 3168 return 1; 3169 } 3170 3171 PERL_STATIC_INLINE I32 3172 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) 3173 { 3174 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds 3175 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and 3176 * does not check for this. Nor does it check that the strings each have 3177 * at least 'len' characters. */ 3178 3179 const U8 *a = (const U8 *)s1; 3180 const U8 *b = (const U8 *)s2; 3181 3182 PERL_ARGS_ASSERT_FOLDEQ_LATIN1; 3183 3184 assert(len >= 0); 3185 3186 while (len--) { 3187 if (*a != *b && *a != PL_fold_latin1[*b]) { 3188 return 0; 3189 } 3190 a++, b++; 3191 } 3192 return 1; 3193 } 3194 3195 /* 3196 =for apidoc_section $locale 3197 =for apidoc foldEQ_locale 3198 3199 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 3200 same case-insensitively in the current locale; false otherwise. 3201 3202 =cut 3203 */ 3204 3205 PERL_STATIC_INLINE I32 3206 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) 3207 { 3208 const U8 *a = (const U8 *)s1; 3209 const U8 *b = (const U8 *)s2; 3210 3211 PERL_ARGS_ASSERT_FOLDEQ_LOCALE; 3212 3213 assert(len >= 0); 3214 3215 while (len--) { 3216 if (*a != *b && *a != PL_fold_locale[*b]) 3217 return 0; 3218 a++,b++; 3219 } 3220 return 1; 3221 } 3222 3223 /* 3224 =for apidoc_section $string 3225 =for apidoc my_strnlen 3226 3227 The C library C<strnlen> if available, or a Perl implementation of it. 3228 3229 C<my_strnlen()> computes the length of the string, up to C<maxlen> 3230 characters. It will never attempt to address more than C<maxlen> 3231 characters, making it suitable for use with strings that are not 3232 guaranteed to be NUL-terminated. 3233 3234 =cut 3235 3236 Description stolen from http://man.openbsd.org/strnlen.3, 3237 implementation stolen from PostgreSQL. 3238 */ 3239 #ifndef HAS_STRNLEN 3240 3241 PERL_STATIC_INLINE Size_t 3242 Perl_my_strnlen(const char *str, Size_t maxlen) 3243 { 3244 const char *end = (char *) memchr(str, '\0', maxlen); 3245 3246 PERL_ARGS_ASSERT_MY_STRNLEN; 3247 3248 if (end == NULL) return maxlen; 3249 return end - str; 3250 } 3251 3252 #endif 3253 3254 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) 3255 3256 PERL_STATIC_INLINE void * 3257 S_my_memrchr(const char * s, const char c, const STRLEN len) 3258 { 3259 /* memrchr(), since many platforms lack it */ 3260 3261 const char * t = s + len - 1; 3262 3263 PERL_ARGS_ASSERT_MY_MEMRCHR; 3264 3265 while (t >= s) { 3266 if (*t == c) { 3267 return (void *) t; 3268 } 3269 t--; 3270 } 3271 3272 return NULL; 3273 } 3274 3275 #endif 3276 3277 PERL_STATIC_INLINE char * 3278 Perl_mortal_getenv(const char * str) 3279 { 3280 /* This implements a (mostly) thread-safe, sequential-call-safe getenv(). 3281 * 3282 * It's (mostly) thread-safe because it uses a mutex to prevent other 3283 * threads (that look at this mutex) from destroying the result before this 3284 * routine has a chance to copy the result to a place that won't be 3285 * destroyed before the caller gets a chance to handle it. That place is a 3286 * mortal SV. khw chose this over SAVEFREEPV because he is under the 3287 * impression that the SV will hang around longer under more circumstances 3288 * 3289 * The reason it isn't completely thread-safe is that other code could 3290 * simply not pay attention to the mutex. All of the Perl core uses the 3291 * mutex, but it is possible for code from, say XS, to not use this mutex, 3292 * defeating the safety. 3293 * 3294 * getenv() returns, in some implementations, a pointer to a spot in the 3295 * **environ array, which could be invalidated at any time by this or 3296 * another thread changing the environment. Other implementations copy the 3297 * **environ value to a static buffer, returning a pointer to that. That 3298 * buffer might or might not be invalidated by a getenv() call in another 3299 * thread. If it does get zapped, we need an exclusive lock. Otherwise, 3300 * many getenv() calls can safely be running simultaneously, so a 3301 * many-reader (but no simultaneous writers) lock is ok. There is a 3302 * Configure probe to see if another thread destroys the buffer, and the 3303 * mutex is defined accordingly. 3304 * 3305 * But in all cases, using the mutex prevents these problems, as long as 3306 * all code uses the same mutex. 3307 * 3308 * A complication is that this can be called during phases where the 3309 * mortalization process isn't available. These are in interpreter 3310 * destruction or early in construction. khw believes that at these times 3311 * there shouldn't be anything else going on, so plain getenv is safe AS 3312 * LONG AS the caller acts on the return before calling it again. */ 3313 3314 char * ret; 3315 dTHX; 3316 3317 PERL_ARGS_ASSERT_MORTAL_GETENV; 3318 3319 /* Can't mortalize without stacks. khw believes that no other threads 3320 * should be running, so no need to lock things, and this may be during a 3321 * phase when locking isn't even available */ 3322 if (UNLIKELY(PL_scopestack_ix == 0)) { 3323 return getenv(str); 3324 } 3325 3326 #ifdef PERL_MEM_LOG 3327 3328 /* A major complication arises under PERL_MEM_LOG. When that is active, 3329 * every memory allocation may result in logging, depending on the value of 3330 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for 3331 * saving ENV{foo}'s value (but before saving it), the logging code will 3332 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some 3333 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to 3334 * lock a boolean mutex recursively); 3) destroying the getenv() static 3335 * buffer; or 4) destroying the temporary created by this for the copy 3336 * causes a log entry to be made which could cause a new temporary to be 3337 * created, which will need to be destroyed at some point, leading to an 3338 * infinite loop. 3339 * 3340 * The solution adopted here (after some gnashing of teeth) is to detect 3341 * the recursive calls and calls from the logger, and treat them specially. 3342 * Let's say we want to do getenv("foo"). We first find 3343 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter 3344 * variable, so no temporary is required. Then we do getenv(foo}, and in 3345 * the process of creating a temporary to save it, this function will be 3346 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call, 3347 * we detect that it is such a call and return our saved value instead of 3348 * locking and doing a new getenv(). This solves all of problems 1), 2), 3349 * and 3). Because all the getenv()s are done while the mutex is locked, 3350 * the state cannot have changed. To solve 4), we don't create a temporary 3351 * when this is called from the logging code. That code disposes of the 3352 * return value while the mutex is still locked. 3353 * 3354 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial 3355 * digits and 3 particular letters are significant; the rest are ignored by 3356 * the memory logging code. Thus the per-interpreter variable only needs 3357 * to be large enough to save the significant information, the size of 3358 * which is known at compile time. The first byte is extra, reserved for 3359 * flags for our use. To protect against overflowing, only the reserved 3360 * byte, as many digits as don't overflow, and the three letters are 3361 * stored. 3362 * 3363 * The reserved byte has two bits: 3364 * 0x1 if set indicates that if we get here, it is a recursive call of 3365 * getenv() 3366 * 0x2 if set indicates that the call is from the logging code. 3367 * 3368 * If the flag indicates this is a recursive call, just return the stored 3369 * value of PL_mem_log; An empty value gets turned into NULL. */ 3370 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) { 3371 if (PL_mem_log[1] == '\0') { 3372 return NULL; 3373 } else { 3374 return PL_mem_log + 1; 3375 } 3376 } 3377 3378 #endif 3379 3380 GETENV_LOCK; 3381 3382 #ifdef PERL_MEM_LOG 3383 3384 /* Here we are in a critical section. As explained above, we do our own 3385 * getenv(PERL_MEM_LOG), saving the result safely. */ 3386 ret = getenv("PERL_MEM_LOG"); 3387 if (ret == NULL) { /* No logging active */ 3388 3389 /* Return that immediately if called from the logging code */ 3390 if (PL_mem_log[0] & 0x2) { 3391 GETENV_UNLOCK; 3392 return NULL; 3393 } 3394 3395 PL_mem_log[1] = '\0'; 3396 } 3397 else { 3398 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */ 3399 3400 /* There is nothing to prevent the value of PERL_MEM_LOG from being an 3401 * extremely long string. But we want only a few characters from it. 3402 * PL_mem_log has been made large enough to hold just the ones we need. 3403 * First the file descriptor. */ 3404 if (isDIGIT(*ret)) { 3405 const char * s = ret; 3406 if (UNLIKELY(*s == '0')) { 3407 3408 /* Reduce multiple leading zeros to a single one. This is to 3409 * allow the caller to change what to do with leading zeros. */ 3410 *mem_log_meat++ = '0'; 3411 s++; 3412 while (*s == '0') { 3413 s++; 3414 } 3415 } 3416 3417 /* If the input overflows, copy just enough for the result to also 3418 * overflow, plus 1 to make sure */ 3419 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) { 3420 *mem_log_meat++ = *s++; 3421 } 3422 } 3423 3424 /* Then each of the three significant characters */ 3425 if (strchr(ret, 'm')) { 3426 *mem_log_meat++ = 'm'; 3427 } 3428 if (strchr(ret, 's')) { 3429 *mem_log_meat++ = 's'; 3430 } 3431 if (strchr(ret, 't')) { 3432 *mem_log_meat++ = 't'; 3433 } 3434 *mem_log_meat = '\0'; 3435 3436 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log)); 3437 } 3438 3439 /* If we are being called from the logger, it only needs the significant 3440 * portion of PERL_MEM_LOG, and doesn't need a safe copy */ 3441 if (PL_mem_log[0] & 0x2) { 3442 assert(strEQ(str, "PERL_MEM_LOG")); 3443 GETENV_UNLOCK; 3444 return PL_mem_log + 1; 3445 } 3446 3447 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that 3448 * is coming from other than the logging code, so it should be treated the 3449 * same as any other getenv(), returning the full value, not just the 3450 * significant part, and having its value saved. Set the flag that 3451 * indicates any call to this routine will be a recursion from here */ 3452 PL_mem_log[0] = 0x1; 3453 3454 #endif 3455 3456 /* Now get the value of the real desired variable, and save a copy */ 3457 ret = getenv(str); 3458 3459 if (ret != NULL) { 3460 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) ); 3461 } 3462 3463 GETENV_UNLOCK; 3464 3465 #ifdef PERL_MEM_LOG 3466 3467 /* Clear the buffer */ 3468 Zero(PL_mem_log, sizeof(PL_mem_log), char); 3469 3470 #endif 3471 3472 return ret; 3473 } 3474 3475 PERL_STATIC_INLINE bool 3476 Perl_sv_isbool(pTHX_ const SV *sv) 3477 { 3478 return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) && 3479 (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No); 3480 } 3481 3482 #ifdef USE_ITHREADS 3483 3484 PERL_STATIC_INLINE AV * 3485 Perl_cop_file_avn(pTHX_ const COP *cop) { 3486 3487 PERL_ARGS_ASSERT_COP_FILE_AVN; 3488 3489 const char *file = CopFILE(cop); 3490 if (file) { 3491 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD); 3492 if (gv) { 3493 return GvAVn(gv); 3494 } 3495 else 3496 return NULL; 3497 } 3498 else 3499 return NULL; 3500 } 3501 3502 #endif 3503 3504 /* 3505 * ex: set ts=8 sts=4 sw=4 et: 3506 */ 3507