1 /* inline.h 2 * 3 * Copyright (C) 2012 by Larry Wall and others 4 * 5 * You may distribute under the terms of either the GNU General Public 6 * License or the Artistic License, as specified in the README file. 7 * 8 * This file contains tables and code adapted from 9 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this 10 * copyright notice: 11 12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 13 14 Permission is hereby granted, free of charge, to any person obtaining a copy of 15 this software and associated documentation files (the "Software"), to deal in 16 the Software without restriction, including without limitation the rights to 17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 18 of the Software, and to permit persons to whom the Software is furnished to do 19 so, subject to the following conditions: 20 21 The above copyright notice and this permission notice shall be included in all 22 copies or substantial portions of the Software. 23 24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 SOFTWARE. 31 32 * 33 * This file is a home for static inline functions that cannot go in other 34 * header files, because they depend on proto.h (included after most other 35 * headers) or struct definitions. 36 * 37 * Each section names the header file that the functions "belong" to. 38 */ 39 40 /* ------------------------------- av.h ------------------------------- */ 41 42 PERL_STATIC_INLINE SSize_t 43 S_av_top_index(pTHX_ AV *av) 44 { 45 PERL_ARGS_ASSERT_AV_TOP_INDEX; 46 assert(SvTYPE(av) == SVt_PVAV); 47 48 return AvFILL(av); 49 } 50 51 /* ------------------------------- cv.h ------------------------------- */ 52 53 PERL_STATIC_INLINE GV * 54 S_CvGV(pTHX_ CV *sv) 55 { 56 return CvNAMED(sv) 57 ? Perl_cvgv_from_hek(aTHX_ sv) 58 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; 59 } 60 61 PERL_STATIC_INLINE I32 * 62 S_CvDEPTHp(const CV * const sv) 63 { 64 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM); 65 return &((XPVCV*)SvANY(sv))->xcv_depth; 66 } 67 68 /* 69 CvPROTO returns the prototype as stored, which is not necessarily what 70 the interpreter should be using. Specifically, the interpreter assumes 71 that spaces have been stripped, which has been the case if the prototype 72 was added by toke.c, but is generally not the case if it was added elsewhere. 73 Since we can't enforce the spacelessness at assignment time, this routine 74 provides a temporary copy at parse time with spaces removed. 75 I<orig> is the start of the original buffer, I<len> is the length of the 76 prototype and will be updated when this returns. 77 */ 78 79 #ifdef PERL_CORE 80 PERL_STATIC_INLINE char * 81 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len) 82 { 83 SV * tmpsv; 84 char * tmps; 85 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP); 86 tmps = SvPVX(tmpsv); 87 while ((*len)--) { 88 if (!isSPACE(*orig)) 89 *tmps++ = *orig; 90 orig++; 91 } 92 *tmps = '\0'; 93 *len = tmps - SvPVX(tmpsv); 94 return SvPVX(tmpsv); 95 } 96 #endif 97 98 /* ------------------------------- mg.h ------------------------------- */ 99 100 #if defined(PERL_CORE) || defined(PERL_EXT) 101 /* assumes get-magic and stringification have already occurred */ 102 PERL_STATIC_INLINE STRLEN 103 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) 104 { 105 assert(mg->mg_type == PERL_MAGIC_regex_global); 106 assert(mg->mg_len != -1); 107 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv)) 108 return (STRLEN)mg->mg_len; 109 else { 110 const STRLEN pos = (STRLEN)mg->mg_len; 111 /* Without this check, we may read past the end of the buffer: */ 112 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1; 113 return sv_or_pv_pos_u2b(sv, s, pos, NULL); 114 } 115 } 116 #endif 117 118 /* ------------------------------- pad.h ------------------------------ */ 119 120 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C) 121 PERL_STATIC_INLINE bool 122 PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) 123 { 124 /* is seq within the range _LOW to _HIGH ? 125 * This is complicated by the fact that PL_cop_seqmax 126 * may have wrapped around at some point */ 127 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO) 128 return FALSE; /* not yet introduced */ 129 130 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) { 131 /* in compiling scope */ 132 if ( 133 (seq > COP_SEQ_RANGE_LOW(pn)) 134 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1)) 135 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1)) 136 ) 137 return TRUE; 138 } 139 else if ( 140 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) 141 ? 142 ( seq > COP_SEQ_RANGE_LOW(pn) 143 || seq <= COP_SEQ_RANGE_HIGH(pn)) 144 145 : ( seq > COP_SEQ_RANGE_LOW(pn) 146 && seq <= COP_SEQ_RANGE_HIGH(pn)) 147 ) 148 return TRUE; 149 return FALSE; 150 } 151 #endif 152 153 /* ------------------------------- pp.h ------------------------------- */ 154 155 PERL_STATIC_INLINE I32 156 S_TOPMARK(pTHX) 157 { 158 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 159 "MARK top %p %" IVdf "\n", 160 PL_markstack_ptr, 161 (IV)*PL_markstack_ptr))); 162 return *PL_markstack_ptr; 163 } 164 165 PERL_STATIC_INLINE I32 166 S_POPMARK(pTHX) 167 { 168 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 169 "MARK pop %p %" IVdf "\n", 170 (PL_markstack_ptr-1), 171 (IV)*(PL_markstack_ptr-1)))); 172 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow"); 173 return *PL_markstack_ptr--; 174 } 175 176 /* ----------------------------- regexp.h ----------------------------- */ 177 178 PERL_STATIC_INLINE struct regexp * 179 S_ReANY(const REGEXP * const re) 180 { 181 XPV* const p = (XPV*)SvANY(re); 182 assert(isREGEXP(re)); 183 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx 184 : (struct regexp *)p; 185 } 186 187 /* ------------------------------- sv.h ------------------------------- */ 188 189 PERL_STATIC_INLINE SV * 190 S_SvREFCNT_inc(SV *sv) 191 { 192 if (LIKELY(sv != NULL)) 193 SvREFCNT(sv)++; 194 return sv; 195 } 196 PERL_STATIC_INLINE SV * 197 S_SvREFCNT_inc_NN(SV *sv) 198 { 199 SvREFCNT(sv)++; 200 return sv; 201 } 202 PERL_STATIC_INLINE void 203 S_SvREFCNT_inc_void(SV *sv) 204 { 205 if (LIKELY(sv != NULL)) 206 SvREFCNT(sv)++; 207 } 208 PERL_STATIC_INLINE void 209 S_SvREFCNT_dec(pTHX_ SV *sv) 210 { 211 if (LIKELY(sv != NULL)) { 212 U32 rc = SvREFCNT(sv); 213 if (LIKELY(rc > 1)) 214 SvREFCNT(sv) = rc - 1; 215 else 216 Perl_sv_free2(aTHX_ sv, rc); 217 } 218 } 219 220 PERL_STATIC_INLINE void 221 S_SvREFCNT_dec_NN(pTHX_ SV *sv) 222 { 223 U32 rc = SvREFCNT(sv); 224 if (LIKELY(rc > 1)) 225 SvREFCNT(sv) = rc - 1; 226 else 227 Perl_sv_free2(aTHX_ sv, rc); 228 } 229 230 PERL_STATIC_INLINE void 231 SvAMAGIC_on(SV *sv) 232 { 233 assert(SvROK(sv)); 234 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv))); 235 } 236 PERL_STATIC_INLINE void 237 SvAMAGIC_off(SV *sv) 238 { 239 if (SvROK(sv) && SvOBJECT(SvRV(sv))) 240 HvAMAGIC_off(SvSTASH(SvRV(sv))); 241 } 242 243 PERL_STATIC_INLINE U32 244 S_SvPADSTALE_on(SV *sv) 245 { 246 assert(!(SvFLAGS(sv) & SVs_PADTMP)); 247 return SvFLAGS(sv) |= SVs_PADSTALE; 248 } 249 PERL_STATIC_INLINE U32 250 S_SvPADSTALE_off(SV *sv) 251 { 252 assert(!(SvFLAGS(sv) & SVs_PADTMP)); 253 return SvFLAGS(sv) &= ~SVs_PADSTALE; 254 } 255 #if defined(PERL_CORE) || defined (PERL_EXT) 256 PERL_STATIC_INLINE STRLEN 257 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp) 258 { 259 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B; 260 if (SvGAMAGIC(sv)) { 261 U8 *hopped = utf8_hop((U8 *)pv, pos); 262 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped); 263 return (STRLEN)(hopped - (U8 *)pv); 264 } 265 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN); 266 } 267 #endif 268 269 /* ------------------------------- handy.h ------------------------------- */ 270 271 /* saves machine code for a common noreturn idiom typically used in Newx*() */ 272 GCC_DIAG_IGNORE_DECL(-Wunused-function); 273 static void 274 S_croak_memory_wrap(void) 275 { 276 Perl_croak_nocontext("%s",PL_memory_wrap); 277 } 278 GCC_DIAG_RESTORE_DECL; 279 280 /* ------------------------------- utf8.h ------------------------------- */ 281 282 /* 283 =head1 Unicode Support 284 */ 285 286 PERL_STATIC_INLINE void 287 S_append_utf8_from_native_byte(const U8 byte, U8** dest) 288 { 289 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 290 * encoded string at '*dest', updating '*dest' to include it */ 291 292 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; 293 294 if (NATIVE_BYTE_IS_INVARIANT(byte)) 295 *((*dest)++) = byte; 296 else { 297 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); 298 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); 299 } 300 } 301 302 /* 303 =for apidoc valid_utf8_to_uvchr 304 Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that 305 the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>, 306 it passes C<L</isUTF8_CHAR>>. Surrogates, non-character code points, and 307 non-Unicode code points are allowed. 308 309 =cut 310 311 */ 312 313 PERL_STATIC_INLINE UV 314 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) 315 { 316 const UV expectlen = UTF8SKIP(s); 317 const U8* send = s + expectlen; 318 UV uv = *s; 319 320 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; 321 322 if (retlen) { 323 *retlen = expectlen; 324 } 325 326 /* An invariant is trivially returned */ 327 if (expectlen == 1) { 328 return uv; 329 } 330 331 /* Remove the leading bits that indicate the number of bytes, leaving just 332 * the bits that are part of the value */ 333 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 334 335 /* Now, loop through the remaining bytes, accumulating each into the 336 * working total as we go. (I khw tried unrolling the loop for up to 4 337 * bytes, but there was no performance improvement) */ 338 for (++s; s < send; s++) { 339 uv = UTF8_ACCUMULATE(uv, *s); 340 } 341 342 return UNI_TO_NATIVE(uv); 343 344 } 345 346 /* 347 =for apidoc is_utf8_invariant_string 348 349 Returns TRUE if the first C<len> bytes of the string C<s> are the same 350 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on 351 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they 352 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only 353 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range 354 characters are invariant, but so also are the C1 controls. 355 356 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you 357 use this option, that C<s> can't have embedded C<NUL> characters and has to 358 have a terminating C<NUL> byte). 359 360 See also 361 C<L</is_utf8_string>>, 362 C<L</is_utf8_string_flags>>, 363 C<L</is_utf8_string_loc>>, 364 C<L</is_utf8_string_loc_flags>>, 365 C<L</is_utf8_string_loclen>>, 366 C<L</is_utf8_string_loclen_flags>>, 367 C<L</is_utf8_fixed_width_buf_flags>>, 368 C<L</is_utf8_fixed_width_buf_loc_flags>>, 369 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 370 C<L</is_strict_utf8_string>>, 371 C<L</is_strict_utf8_string_loc>>, 372 C<L</is_strict_utf8_string_loclen>>, 373 C<L</is_c9strict_utf8_string>>, 374 C<L</is_c9strict_utf8_string_loc>>, 375 and 376 C<L</is_c9strict_utf8_string_loclen>>. 377 378 =cut 379 380 */ 381 382 #define is_utf8_invariant_string(s, len) \ 383 is_utf8_invariant_string_loc(s, len, NULL) 384 385 /* 386 =for apidoc is_utf8_invariant_string_loc 387 388 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of 389 the first UTF-8 variant character in the C<ep> pointer; if all characters are 390 UTF-8 invariant, this function does not change the contents of C<*ep>. 391 392 =cut 393 394 */ 395 396 PERL_STATIC_INLINE bool 397 S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) 398 { 399 const U8* send; 400 const U8* x = s; 401 402 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; 403 404 if (len == 0) { 405 len = strlen((const char *)s); 406 } 407 408 send = s + len; 409 410 /* This looks like 0x010101... */ 411 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) 412 413 /* This looks like 0x808080... */ 414 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) 415 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) 416 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) 417 418 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by 419 * or'ing together the lowest bits of 'x'. Hopefully the final term gets 420 * optimized out completely on a 32-bit system, and its mask gets optimized out 421 * on a 64-bit system */ 422 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ 423 | ( PTR2nat(x) >> 1) \ 424 | ( ( (PTR2nat(x) \ 425 & PERL_WORD_BOUNDARY_MASK) >> 2)))) 426 427 #ifndef EBCDIC 428 429 /* Do the word-at-a-time iff there is at least one usable full word. That 430 * means that after advancing to a word boundary, there still is at least a 431 * full word left. The number of bytes needed to advance is 'wordsize - 432 * offset' unless offset is 0. */ 433 if ((STRLEN) (send - x) >= PERL_WORDSIZE 434 435 /* This term is wordsize if subword; 0 if not */ 436 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 437 438 /* 'offset' */ 439 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 440 { 441 442 /* Process per-byte until reach word boundary. XXX This loop could be 443 * eliminated if we knew that this platform had fast unaligned reads */ 444 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 445 if (! UTF8_IS_INVARIANT(*x)) { 446 if (ep) { 447 *ep = x; 448 } 449 450 return FALSE; 451 } 452 x++; 453 } 454 455 /* Here, we know we have at least one full word to process. Process 456 * per-word as long as we have at least a full word left */ 457 do { 458 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { 459 460 /* Found a variant. Just return if caller doesn't want its 461 * exact position */ 462 if (! ep) { 463 return FALSE; 464 } 465 466 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ 467 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 468 469 *ep = x + _variant_byte_number(* (PERL_UINTMAX_T *) x); 470 assert(*ep >= s && *ep < send); 471 472 return FALSE; 473 474 # else /* If weird byte order, drop into next loop to do byte-at-a-time 475 checks. */ 476 477 break; 478 # endif 479 } 480 481 x += PERL_WORDSIZE; 482 483 } while (x + PERL_WORDSIZE <= send); 484 } 485 486 #endif /* End of ! EBCDIC */ 487 488 /* Process per-byte */ 489 while (x < send) { 490 if (! UTF8_IS_INVARIANT(*x)) { 491 if (ep) { 492 *ep = x; 493 } 494 495 return FALSE; 496 } 497 498 x++; 499 } 500 501 return TRUE; 502 } 503 504 #ifndef EBCDIC 505 506 PERL_STATIC_INLINE unsigned int 507 S__variant_byte_number(PERL_UINTMAX_T word) 508 { 509 510 /* This returns the position in a word (0..7) of the first variant byte in 511 * it. This is a helper function. Note that there are no branches */ 512 513 assert(word); 514 515 /* Get just the msb bits of each byte */ 516 word &= PERL_VARIANTS_WORD_MASK; 517 518 # ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the 519 easiest thing is to hide that from the callers */ 520 { 521 unsigned int i; 522 const U8 * s = (U8 *) &word; 523 dTHX; 524 525 for (i = 0; i < sizeof(word); i++ ) { 526 if (s[i]) { 527 return i; 528 } 529 } 530 531 Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n", 532 __FILE__, __LINE__); 533 } 534 535 # elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 536 537 /* Bytes are stored like 538 * Byte8 ... Byte2 Byte1 539 * 63..56...15...8 7...0 540 * 541 * Isolate the lsb; 542 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set 543 * 544 * The word will look this this, with a rightmost set bit in position 's': 545 * ('x's are don't cares) 546 * s 547 * x..x100..0 548 * x..xx10..0 Right shift (rightmost 0 is shifted off) 549 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and 550 * the 1 just to their left into a 0; the remainder is 551 * untouched 552 * 0..0011..1 The xor with the original, x..xx10..0, clears that 553 * remainder, sets the bottom to all 1 554 * 0..0100..0 Add 1 to clear the word except for the bit in 's' 555 * 556 * Another method is to do 'word &= -word'; but it generates a compiler 557 * message on some platforms about taking the negative of an unsigned */ 558 559 word >>= 1; 560 word = 1 + (word ^ (word - 1)); 561 562 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 563 564 /* Bytes are stored like 565 * Byte1 Byte2 ... Byte8 566 * 63..56 55..47 ... 7...0 567 * 568 * Isolate the msb; http://codeforces.com/blog/entry/10330 569 * 570 * Only the most significant set bit matters. Or'ing word with its right 571 * shift of 1 makes that bit and the next one to its right both 1. Then 572 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the 573 * msb and all to the right being 1. */ 574 word |= word >> 1; 575 word |= word >> 2; 576 word |= word >> 4; 577 word |= word >> 8; 578 word |= word >> 16; 579 word |= word >> 32; /* This should get optimized out on 32-bit systems. */ 580 581 /* Then subtracting the right shift by 1 clears all but the left-most of 582 * the 1 bits, which is our desired result */ 583 word -= (word >> 1); 584 585 # else 586 # error Unexpected byte order 587 # endif 588 589 /* Here 'word' has a single bit set: the msb of the first byte in which it 590 * is set. Calculate that position in the word. We can use this 591 * specialized solution: https://stackoverflow.com/a/32339674/1626653, 592 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should 593 * just get shifted off at compile time) */ 594 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48) 595 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32) 596 | (39 << 24) | (47 << 16) 597 | (55 << 8) | (63 << 0)); 598 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */ 599 600 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */ 601 word = ((word + 1) >> 3) - 1; 602 603 # if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 604 605 /* And invert the result */ 606 word = CHARBITS - word - 1; 607 608 # endif 609 610 return (unsigned int) word; 611 } 612 613 #endif 614 #if defined(PERL_CORE) || defined(PERL_EXT) 615 616 /* 617 =for apidoc variant_under_utf8_count 618 619 This function looks at the sequence of bytes between C<s> and C<e>, which are 620 assumed to be encoded in ASCII/Latin1, and returns how many of them would 621 change should the string be translated into UTF-8. Due to the nature of UTF-8, 622 each of these would occupy two bytes instead of the single one in the input 623 string. Thus, this function returns the precise number of bytes the string 624 would expand by when translated to UTF-8. 625 626 Unlike most of the other functions that have C<utf8> in their name, the input 627 to this function is NOT a UTF-8-encoded string. The function name is slightly 628 I<odd> to emphasize this. 629 630 This function is internal to Perl because khw thinks that any XS code that 631 would want this is probably operating too close to the internals. Presenting a 632 valid use case could change that. 633 634 See also 635 C<L<perlapi/is_utf8_invariant_string>> 636 and 637 C<L<perlapi/is_utf8_invariant_string_loc>>, 638 639 =cut 640 641 */ 642 643 PERL_STATIC_INLINE Size_t 644 S_variant_under_utf8_count(const U8* const s, const U8* const e) 645 { 646 const U8* x = s; 647 Size_t count = 0; 648 649 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; 650 651 # ifndef EBCDIC 652 653 /* Test if the string is long enough to use word-at-a-time. (Logic is the 654 * same as for is_utf8_invariant_string()) */ 655 if ((STRLEN) (e - x) >= PERL_WORDSIZE 656 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 657 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 658 { 659 660 /* Process per-byte until reach word boundary. XXX This loop could be 661 * eliminated if we knew that this platform had fast unaligned reads */ 662 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 663 count += ! UTF8_IS_INVARIANT(*x++); 664 } 665 666 /* Process per-word as long as we have at least a full word left */ 667 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an 668 explanation of how this works */ 669 PERL_UINTMAX_T increment 670 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) 671 * PERL_COUNT_MULTIPLIER) 672 >> ((PERL_WORDSIZE - 1) * CHARBITS); 673 count += (Size_t) increment; 674 x += PERL_WORDSIZE; 675 } while (x + PERL_WORDSIZE <= e); 676 } 677 678 # endif 679 680 /* Process per-byte */ 681 while (x < e) { 682 if (! UTF8_IS_INVARIANT(*x)) { 683 count++; 684 } 685 686 x++; 687 } 688 689 return count; 690 } 691 692 #endif 693 694 #ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */ 695 # undef PERL_WORDSIZE 696 # undef PERL_COUNT_MULTIPLIER 697 # undef PERL_WORD_BOUNDARY_MASK 698 # undef PERL_VARIANTS_WORD_MASK 699 #endif 700 701 /* 702 =for apidoc is_utf8_string 703 704 Returns TRUE if the first C<len> bytes of string C<s> form a valid 705 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will 706 be calculated using C<strlen(s)> (which means if you use this option, that C<s> 707 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 708 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 709 710 This function considers Perl's extended UTF-8 to be valid. That means that 711 code points above Unicode, surrogates, and non-character code points are 712 considered valid by this function. Use C<L</is_strict_utf8_string>>, 713 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what 714 code points are considered valid. 715 716 See also 717 C<L</is_utf8_invariant_string>>, 718 C<L</is_utf8_invariant_string_loc>>, 719 C<L</is_utf8_string_loc>>, 720 C<L</is_utf8_string_loclen>>, 721 C<L</is_utf8_fixed_width_buf_flags>>, 722 C<L</is_utf8_fixed_width_buf_loc_flags>>, 723 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 724 725 =cut 726 */ 727 728 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL) 729 730 #if defined(PERL_CORE) || defined (PERL_EXT) 731 732 /* 733 =for apidoc is_utf8_non_invariant_string 734 735 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first 736 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended 737 UTF-8; otherwise returns FALSE. 738 739 A TRUE return means that at least one code point represented by the sequence 740 either is a wide character not representable as a single byte, or the 741 representation differs depending on whether the sequence is encoded in UTF-8 or 742 not. 743 744 See also 745 C<L<perlapi/is_utf8_invariant_string>>, 746 C<L<perlapi/is_utf8_string>> 747 748 =cut 749 750 This is commonly used to determine if a SV's UTF-8 flag should be turned on. 751 It generally needn't be if its string is entirely UTF-8 invariant, and it 752 shouldn't be if it otherwise contains invalid UTF-8. 753 754 It is an internal function because khw thinks that XS code shouldn't be working 755 at this low a level. A valid use case could change that. 756 757 */ 758 759 PERL_STATIC_INLINE bool 760 S_is_utf8_non_invariant_string(const U8* const s, STRLEN len) 761 { 762 const U8 * first_variant; 763 764 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING; 765 766 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 767 return FALSE; 768 } 769 770 return is_utf8_string(first_variant, len - (first_variant - s)); 771 } 772 773 #endif 774 775 /* 776 =for apidoc is_strict_utf8_string 777 778 Returns TRUE if the first C<len> bytes of string C<s> form a valid 779 UTF-8-encoded string that is fully interchangeable by any application using 780 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be 781 calculated using C<strlen(s)> (which means if you use this option, that C<s> 782 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 783 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 784 785 This function returns FALSE for strings containing any 786 code points above the Unicode max of 0x10FFFF, surrogate code points, or 787 non-character code points. 788 789 See also 790 C<L</is_utf8_invariant_string>>, 791 C<L</is_utf8_invariant_string_loc>>, 792 C<L</is_utf8_string>>, 793 C<L</is_utf8_string_flags>>, 794 C<L</is_utf8_string_loc>>, 795 C<L</is_utf8_string_loc_flags>>, 796 C<L</is_utf8_string_loclen>>, 797 C<L</is_utf8_string_loclen_flags>>, 798 C<L</is_utf8_fixed_width_buf_flags>>, 799 C<L</is_utf8_fixed_width_buf_loc_flags>>, 800 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 801 C<L</is_strict_utf8_string_loc>>, 802 C<L</is_strict_utf8_string_loclen>>, 803 C<L</is_c9strict_utf8_string>>, 804 C<L</is_c9strict_utf8_string_loc>>, 805 and 806 C<L</is_c9strict_utf8_string_loclen>>. 807 808 =cut 809 */ 810 811 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL) 812 813 /* 814 =for apidoc is_c9strict_utf8_string 815 816 Returns TRUE if the first C<len> bytes of string C<s> form a valid 817 UTF-8-encoded string that conforms to 818 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>; 819 otherwise it returns FALSE. If C<len> is 0, it will be calculated using 820 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded 821 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all 822 characters being ASCII constitute 'a valid UTF-8 string'. 823 824 This function returns FALSE for strings containing any code points above the 825 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character 826 code points per 827 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 828 829 See also 830 C<L</is_utf8_invariant_string>>, 831 C<L</is_utf8_invariant_string_loc>>, 832 C<L</is_utf8_string>>, 833 C<L</is_utf8_string_flags>>, 834 C<L</is_utf8_string_loc>>, 835 C<L</is_utf8_string_loc_flags>>, 836 C<L</is_utf8_string_loclen>>, 837 C<L</is_utf8_string_loclen_flags>>, 838 C<L</is_utf8_fixed_width_buf_flags>>, 839 C<L</is_utf8_fixed_width_buf_loc_flags>>, 840 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 841 C<L</is_strict_utf8_string>>, 842 C<L</is_strict_utf8_string_loc>>, 843 C<L</is_strict_utf8_string_loclen>>, 844 C<L</is_c9strict_utf8_string_loc>>, 845 and 846 C<L</is_c9strict_utf8_string_loclen>>. 847 848 =cut 849 */ 850 851 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0) 852 853 /* 854 =for apidoc is_utf8_string_flags 855 856 Returns TRUE if the first C<len> bytes of string C<s> form a valid 857 UTF-8 string, subject to the restrictions imposed by C<flags>; 858 returns FALSE otherwise. If C<len> is 0, it will be calculated 859 using C<strlen(s)> (which means if you use this option, that C<s> can't have 860 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note 861 that all characters being ASCII constitute 'a valid UTF-8 string'. 862 863 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if 864 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 865 as C<L</is_strict_utf8_string>>; and if C<flags> is 866 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as 867 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any 868 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by 869 C<L</utf8n_to_uvchr>>, with the same meanings. 870 871 See also 872 C<L</is_utf8_invariant_string>>, 873 C<L</is_utf8_invariant_string_loc>>, 874 C<L</is_utf8_string>>, 875 C<L</is_utf8_string_loc>>, 876 C<L</is_utf8_string_loc_flags>>, 877 C<L</is_utf8_string_loclen>>, 878 C<L</is_utf8_string_loclen_flags>>, 879 C<L</is_utf8_fixed_width_buf_flags>>, 880 C<L</is_utf8_fixed_width_buf_loc_flags>>, 881 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 882 C<L</is_strict_utf8_string>>, 883 C<L</is_strict_utf8_string_loc>>, 884 C<L</is_strict_utf8_string_loclen>>, 885 C<L</is_c9strict_utf8_string>>, 886 C<L</is_c9strict_utf8_string_loc>>, 887 and 888 C<L</is_c9strict_utf8_string_loclen>>. 889 890 =cut 891 */ 892 893 PERL_STATIC_INLINE bool 894 S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) 895 { 896 const U8 * first_variant; 897 898 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; 899 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 900 |UTF8_DISALLOW_PERL_EXTENDED))); 901 902 if (len == 0) { 903 len = strlen((const char *)s); 904 } 905 906 if (flags == 0) { 907 return is_utf8_string(s, len); 908 } 909 910 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 911 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 912 { 913 return is_strict_utf8_string(s, len); 914 } 915 916 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 917 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 918 { 919 return is_c9strict_utf8_string(s, len); 920 } 921 922 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { 923 const U8* const send = s + len; 924 const U8* x = first_variant; 925 926 while (x < send) { 927 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 928 if (UNLIKELY(! cur_len)) { 929 return FALSE; 930 } 931 x += cur_len; 932 } 933 } 934 935 return TRUE; 936 } 937 938 /* 939 940 =for apidoc is_utf8_string_loc 941 942 Like C<L</is_utf8_string>> but stores the location of the failure (in the 943 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 944 "utf8ness success") in the C<ep> pointer. 945 946 See also C<L</is_utf8_string_loclen>>. 947 948 =cut 949 */ 950 951 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) 952 953 /* 954 955 =for apidoc is_utf8_string_loclen 956 957 Like C<L</is_utf8_string>> but stores the location of the failure (in the 958 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 959 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 960 encoded characters in the C<el> pointer. 961 962 See also C<L</is_utf8_string_loc>>. 963 964 =cut 965 */ 966 967 PERL_STATIC_INLINE bool 968 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 969 { 970 const U8 * first_variant; 971 972 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; 973 974 if (len == 0) { 975 len = strlen((const char *) s); 976 } 977 978 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 979 if (el) 980 *el = len; 981 982 if (ep) { 983 *ep = s + len; 984 } 985 986 return TRUE; 987 } 988 989 { 990 const U8* const send = s + len; 991 const U8* x = first_variant; 992 STRLEN outlen = first_variant - s; 993 994 while (x < send) { 995 const STRLEN cur_len = isUTF8_CHAR(x, send); 996 if (UNLIKELY(! cur_len)) { 997 break; 998 } 999 x += cur_len; 1000 outlen++; 1001 } 1002 1003 if (el) 1004 *el = outlen; 1005 1006 if (ep) { 1007 *ep = x; 1008 } 1009 1010 return (x == send); 1011 } 1012 } 1013 1014 /* 1015 1016 =for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e 1017 1018 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1019 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 1020 that represents some code point; otherwise it evaluates to 0. If non-zero, the 1021 value gives how many bytes starting at C<s> comprise the code point's 1022 representation. Any bytes remaining before C<e>, but beyond the ones needed to 1023 form the first code point in C<s>, are not examined. 1024 1025 The code point can be any that will fit in an IV on this machine, using Perl's 1026 extension to official UTF-8 to represent those higher than the Unicode maximum 1027 of 0x10FFFF. That means that this macro is used to efficiently decide if the 1028 next few bytes in C<s> is legal UTF-8 for a single character. 1029 1030 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those 1031 defined by Unicode to be fully interchangeable across applications; 1032 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1033 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1034 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1035 1036 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and 1037 C<L</is_utf8_string_loclen>> to check entire strings. 1038 1039 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC 1040 machines) is a valid UTF-8 character. 1041 1042 =cut 1043 1044 This uses an adaptation of the table and algorithm given in 1045 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1046 documentation of the original version. A copyright notice for the original 1047 version is given at the beginning of this file. The Perl adapation is 1048 documented at the definition of PL_extended_utf8_dfa_tab[]. 1049 1050 */ 1051 1052 PERL_STATIC_INLINE Size_t 1053 S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) 1054 { 1055 const U8 * s = s0; 1056 UV state = 0; 1057 1058 PERL_ARGS_ASSERT_ISUTF8_CHAR; 1059 1060 /* This dfa is fast. If it accepts the input, it was for a well-formed, 1061 * code point, which can be returned immediately. Otherwise, it is either 1062 * malformed, or for the start byte FF which the dfa doesn't handle (except 1063 * on 32-bit ASCII platforms where it trivially is an error). Call a 1064 * helper function for the other platforms. */ 1065 1066 while (s < e && LIKELY(state != 1)) { 1067 state = PL_extended_utf8_dfa_tab[256 1068 + state 1069 + PL_extended_utf8_dfa_tab[*s]]; 1070 if (state != 0) { 1071 s++; 1072 continue; 1073 } 1074 1075 return s - s0 + 1; 1076 } 1077 1078 #if defined(UV_IS_QUAD) || defined(EBCDIC) 1079 1080 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) { 1081 return _is_utf8_char_helper(s0, e, 0); 1082 } 1083 1084 #endif 1085 1086 return 0; 1087 } 1088 1089 /* 1090 1091 =for apidoc isSTRICT_UTF8_CHAR 1092 1093 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1094 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1095 Unicode code point completely acceptable for open interchange between all 1096 applications; otherwise it evaluates to 0. If non-zero, the value gives how 1097 many bytes starting at C<s> comprise the code point's representation. Any 1098 bytes remaining before C<e>, but beyond the ones needed to form the first code 1099 point in C<s>, are not examined. 1100 1101 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not 1102 be a surrogate nor a non-character code point. Thus this excludes any code 1103 point from Perl's extended UTF-8. 1104 1105 This is used to efficiently decide if the next few bytes in C<s> is 1106 legal Unicode-acceptable UTF-8 for a single character. 1107 1108 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 1109 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 1110 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; 1111 and C<L</isUTF8_CHAR_flags>> for a more customized definition. 1112 1113 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and 1114 C<L</is_strict_utf8_string_loclen>> to check entire strings. 1115 1116 =cut 1117 1118 This uses an adaptation of the tables and algorithm given in 1119 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1120 documentation of the original version. A copyright notice for the original 1121 version is given at the beginning of this file. The Perl adapation is 1122 documented at the definition of strict_extended_utf8_dfa_tab[]. 1123 1124 */ 1125 1126 PERL_STATIC_INLINE Size_t 1127 S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1128 { 1129 const U8 * s = s0; 1130 UV state = 0; 1131 1132 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; 1133 1134 while (s < e && LIKELY(state != 1)) { 1135 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]]; 1136 1137 if (state != 0) { 1138 s++; 1139 continue; 1140 } 1141 1142 return s - s0 + 1; 1143 } 1144 1145 #ifndef EBCDIC 1146 1147 /* The dfa above drops out for certain Hanguls; handle them specially */ 1148 if (is_HANGUL_ED_utf8_safe(s0, e)) { 1149 return 3; 1150 } 1151 1152 #endif 1153 1154 return 0; 1155 } 1156 1157 /* 1158 1159 =for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e 1160 1161 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 1162 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 1163 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, 1164 the value gives how many bytes starting at C<s> comprise the code point's 1165 representation. Any bytes remaining before C<e>, but beyond the ones needed to 1166 form the first code point in C<s>, are not examined. 1167 1168 The largest acceptable code point is the Unicode maximum 0x10FFFF. This 1169 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character 1170 code points. This corresponds to 1171 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 1172 which said that non-character code points are merely discouraged rather than 1173 completely forbidden in open interchange. See 1174 L<perlunicode/Noncharacter code points>. 1175 1176 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and 1177 C<L</isUTF8_CHAR_flags>> for a more customized definition. 1178 1179 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and 1180 C<L</is_c9strict_utf8_string_loclen>> to check entire strings. 1181 1182 =cut 1183 1184 This uses an adaptation of the tables and algorithm given in 1185 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 1186 documentation of the original version. A copyright notice for the original 1187 version is given at the beginning of this file. The Perl adapation is 1188 documented at the definition of PL_c9_utf8_dfa_tab[]. 1189 1190 */ 1191 1192 PERL_STATIC_INLINE Size_t 1193 S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 1194 { 1195 const U8 * s = s0; 1196 UV state = 0; 1197 1198 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; 1199 1200 while (s < e && LIKELY(state != 1)) { 1201 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]]; 1202 1203 if (state != 0) { 1204 s++; 1205 continue; 1206 } 1207 1208 return s - s0 + 1; 1209 } 1210 1211 return 0; 1212 } 1213 1214 /* 1215 1216 =for apidoc is_strict_utf8_string_loc 1217 1218 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1219 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1220 "utf8ness success") in the C<ep> pointer. 1221 1222 See also C<L</is_strict_utf8_string_loclen>>. 1223 1224 =cut 1225 */ 1226 1227 #define is_strict_utf8_string_loc(s, len, ep) \ 1228 is_strict_utf8_string_loclen(s, len, ep, 0) 1229 1230 /* 1231 1232 =for apidoc is_strict_utf8_string_loclen 1233 1234 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 1235 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1236 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1237 encoded characters in the C<el> pointer. 1238 1239 See also C<L</is_strict_utf8_string_loc>>. 1240 1241 =cut 1242 */ 1243 1244 PERL_STATIC_INLINE bool 1245 S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1246 { 1247 const U8 * first_variant; 1248 1249 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; 1250 1251 if (len == 0) { 1252 len = strlen((const char *) s); 1253 } 1254 1255 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1256 if (el) 1257 *el = len; 1258 1259 if (ep) { 1260 *ep = s + len; 1261 } 1262 1263 return TRUE; 1264 } 1265 1266 { 1267 const U8* const send = s + len; 1268 const U8* x = first_variant; 1269 STRLEN outlen = first_variant - s; 1270 1271 while (x < send) { 1272 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); 1273 if (UNLIKELY(! cur_len)) { 1274 break; 1275 } 1276 x += cur_len; 1277 outlen++; 1278 } 1279 1280 if (el) 1281 *el = outlen; 1282 1283 if (ep) { 1284 *ep = x; 1285 } 1286 1287 return (x == send); 1288 } 1289 } 1290 1291 /* 1292 1293 =for apidoc is_c9strict_utf8_string_loc 1294 1295 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1296 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1297 "utf8ness success") in the C<ep> pointer. 1298 1299 See also C<L</is_c9strict_utf8_string_loclen>>. 1300 1301 =cut 1302 */ 1303 1304 #define is_c9strict_utf8_string_loc(s, len, ep) \ 1305 is_c9strict_utf8_string_loclen(s, len, ep, 0) 1306 1307 /* 1308 1309 =for apidoc is_c9strict_utf8_string_loclen 1310 1311 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 1312 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1313 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded 1314 characters in the C<el> pointer. 1315 1316 See also C<L</is_c9strict_utf8_string_loc>>. 1317 1318 =cut 1319 */ 1320 1321 PERL_STATIC_INLINE bool 1322 S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 1323 { 1324 const U8 * first_variant; 1325 1326 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; 1327 1328 if (len == 0) { 1329 len = strlen((const char *) s); 1330 } 1331 1332 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1333 if (el) 1334 *el = len; 1335 1336 if (ep) { 1337 *ep = s + len; 1338 } 1339 1340 return TRUE; 1341 } 1342 1343 { 1344 const U8* const send = s + len; 1345 const U8* x = first_variant; 1346 STRLEN outlen = first_variant - s; 1347 1348 while (x < send) { 1349 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); 1350 if (UNLIKELY(! cur_len)) { 1351 break; 1352 } 1353 x += cur_len; 1354 outlen++; 1355 } 1356 1357 if (el) 1358 *el = outlen; 1359 1360 if (ep) { 1361 *ep = x; 1362 } 1363 1364 return (x == send); 1365 } 1366 } 1367 1368 /* 1369 1370 =for apidoc is_utf8_string_loc_flags 1371 1372 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1373 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1374 "utf8ness success") in the C<ep> pointer. 1375 1376 See also C<L</is_utf8_string_loclen_flags>>. 1377 1378 =cut 1379 */ 1380 1381 #define is_utf8_string_loc_flags(s, len, ep, flags) \ 1382 is_utf8_string_loclen_flags(s, len, ep, 0, flags) 1383 1384 1385 /* The above 3 actual functions could have been moved into the more general one 1386 * just below, and made #defines that call it with the right 'flags'. They are 1387 * currently kept separate to increase their chances of getting inlined */ 1388 1389 /* 1390 1391 =for apidoc is_utf8_string_loclen_flags 1392 1393 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 1394 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 1395 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 1396 encoded characters in the C<el> pointer. 1397 1398 See also C<L</is_utf8_string_loc_flags>>. 1399 1400 =cut 1401 */ 1402 1403 PERL_STATIC_INLINE bool 1404 S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) 1405 { 1406 const U8 * first_variant; 1407 1408 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; 1409 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1410 |UTF8_DISALLOW_PERL_EXTENDED))); 1411 1412 if (len == 0) { 1413 len = strlen((const char *) s); 1414 } 1415 1416 if (flags == 0) { 1417 return is_utf8_string_loclen(s, len, ep, el); 1418 } 1419 1420 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1421 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 1422 { 1423 return is_strict_utf8_string_loclen(s, len, ep, el); 1424 } 1425 1426 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 1427 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 1428 { 1429 return is_c9strict_utf8_string_loclen(s, len, ep, el); 1430 } 1431 1432 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 1433 if (el) 1434 *el = len; 1435 1436 if (ep) { 1437 *ep = s + len; 1438 } 1439 1440 return TRUE; 1441 } 1442 1443 { 1444 const U8* send = s + len; 1445 const U8* x = first_variant; 1446 STRLEN outlen = first_variant - s; 1447 1448 while (x < send) { 1449 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 1450 if (UNLIKELY(! cur_len)) { 1451 break; 1452 } 1453 x += cur_len; 1454 outlen++; 1455 } 1456 1457 if (el) 1458 *el = outlen; 1459 1460 if (ep) { 1461 *ep = x; 1462 } 1463 1464 return (x == send); 1465 } 1466 } 1467 1468 /* 1469 =for apidoc utf8_distance 1470 1471 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 1472 and C<b>. 1473 1474 WARNING: use only if you *know* that the pointers point inside the 1475 same UTF-8 buffer. 1476 1477 =cut 1478 */ 1479 1480 PERL_STATIC_INLINE IV 1481 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 1482 { 1483 PERL_ARGS_ASSERT_UTF8_DISTANCE; 1484 1485 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 1486 } 1487 1488 /* 1489 =for apidoc utf8_hop 1490 1491 Return the UTF-8 pointer C<s> displaced by C<off> characters, either 1492 forward or backward. 1493 1494 WARNING: do not use the following unless you *know* C<off> is within 1495 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned 1496 on the first byte of character or just after the last byte of a character. 1497 1498 =cut 1499 */ 1500 1501 PERL_STATIC_INLINE U8 * 1502 Perl_utf8_hop(const U8 *s, SSize_t off) 1503 { 1504 PERL_ARGS_ASSERT_UTF8_HOP; 1505 1506 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 1507 * the bitops (especially ~) can create illegal UTF-8. 1508 * In other words: in Perl UTF-8 is not just for Unicode. */ 1509 1510 if (off >= 0) { 1511 while (off--) 1512 s += UTF8SKIP(s); 1513 } 1514 else { 1515 while (off++) { 1516 s--; 1517 while (UTF8_IS_CONTINUATION(*s)) 1518 s--; 1519 } 1520 } 1521 GCC_DIAG_IGNORE(-Wcast-qual) 1522 return (U8 *)s; 1523 GCC_DIAG_RESTORE 1524 } 1525 1526 /* 1527 =for apidoc utf8_hop_forward 1528 1529 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 1530 forward. 1531 1532 C<off> must be non-negative. 1533 1534 C<s> must be before or equal to C<end>. 1535 1536 When moving forward it will not move beyond C<end>. 1537 1538 Will not exceed this limit even if the string is not valid "UTF-8". 1539 1540 =cut 1541 */ 1542 1543 PERL_STATIC_INLINE U8 * 1544 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) 1545 { 1546 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD; 1547 1548 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 1549 * the bitops (especially ~) can create illegal UTF-8. 1550 * In other words: in Perl UTF-8 is not just for Unicode. */ 1551 1552 assert(s <= end); 1553 assert(off >= 0); 1554 1555 while (off--) { 1556 STRLEN skip = UTF8SKIP(s); 1557 if ((STRLEN)(end - s) <= skip) { 1558 GCC_DIAG_IGNORE(-Wcast-qual) 1559 return (U8 *)end; 1560 GCC_DIAG_RESTORE 1561 } 1562 s += skip; 1563 } 1564 1565 GCC_DIAG_IGNORE(-Wcast-qual) 1566 return (U8 *)s; 1567 GCC_DIAG_RESTORE 1568 } 1569 1570 /* 1571 =for apidoc utf8_hop_back 1572 1573 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 1574 backward. 1575 1576 C<off> must be non-positive. 1577 1578 C<s> must be after or equal to C<start>. 1579 1580 When moving backward it will not move before C<start>. 1581 1582 Will not exceed this limit even if the string is not valid "UTF-8". 1583 1584 =cut 1585 */ 1586 1587 PERL_STATIC_INLINE U8 * 1588 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) 1589 { 1590 PERL_ARGS_ASSERT_UTF8_HOP_BACK; 1591 1592 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 1593 * the bitops (especially ~) can create illegal UTF-8. 1594 * In other words: in Perl UTF-8 is not just for Unicode. */ 1595 1596 assert(start <= s); 1597 assert(off <= 0); 1598 1599 while (off++ && s > start) { 1600 do { 1601 s--; 1602 } while (UTF8_IS_CONTINUATION(*s) && s > start); 1603 } 1604 1605 GCC_DIAG_IGNORE(-Wcast-qual) 1606 return (U8 *)s; 1607 GCC_DIAG_RESTORE 1608 } 1609 1610 /* 1611 =for apidoc utf8_hop_safe 1612 1613 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 1614 either forward or backward. 1615 1616 When moving backward it will not move before C<start>. 1617 1618 When moving forward it will not move beyond C<end>. 1619 1620 Will not exceed those limits even if the string is not valid "UTF-8". 1621 1622 =cut 1623 */ 1624 1625 PERL_STATIC_INLINE U8 * 1626 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) 1627 { 1628 PERL_ARGS_ASSERT_UTF8_HOP_SAFE; 1629 1630 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 1631 * the bitops (especially ~) can create illegal UTF-8. 1632 * In other words: in Perl UTF-8 is not just for Unicode. */ 1633 1634 assert(start <= s && s <= end); 1635 1636 if (off >= 0) { 1637 return utf8_hop_forward(s, off, end); 1638 } 1639 else { 1640 return utf8_hop_back(s, off, start); 1641 } 1642 } 1643 1644 /* 1645 1646 =for apidoc is_utf8_valid_partial_char 1647 1648 Returns 0 if the sequence of bytes starting at C<s> and looking no further than 1649 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code 1650 points. Otherwise, it returns 1 if there exists at least one non-empty 1651 sequence of bytes that when appended to sequence C<s>, starting at position 1652 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point; 1653 otherwise returns 0. 1654 1655 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code 1656 point. 1657 1658 This is useful when a fixed-length buffer is being tested for being well-formed 1659 UTF-8, but the final few bytes in it don't comprise a full character; that is, 1660 it is split somewhere in the middle of the final code point's UTF-8 1661 representation. (Presumably when the buffer is refreshed with the next chunk 1662 of data, the new first bytes will complete the partial code point.) This 1663 function is used to verify that the final bytes in the current buffer are in 1664 fact the legal beginning of some code point, so that if they aren't, the 1665 failure can be signalled without having to wait for the next read. 1666 1667 =cut 1668 */ 1669 #define is_utf8_valid_partial_char(s, e) \ 1670 is_utf8_valid_partial_char_flags(s, e, 0) 1671 1672 /* 1673 1674 =for apidoc is_utf8_valid_partial_char_flags 1675 1676 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether 1677 or not the input is a valid UTF-8 encoded partial character, but it takes an 1678 extra parameter, C<flags>, which can further restrict which code points are 1679 considered valid. 1680 1681 If C<flags> is 0, this behaves identically to 1682 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination 1683 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If 1684 there is any sequence of bytes that can complete the input partial character in 1685 such a way that a non-prohibited character is formed, the function returns 1686 TRUE; otherwise FALSE. Non character code points cannot be determined based on 1687 partial character input. But many of the other possible excluded types can be 1688 determined from just the first one or two bytes. 1689 1690 =cut 1691 */ 1692 1693 PERL_STATIC_INLINE bool 1694 S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags) 1695 { 1696 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; 1697 1698 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1699 |UTF8_DISALLOW_PERL_EXTENDED))); 1700 1701 if (s >= e || s + UTF8SKIP(s) <= e) { 1702 return FALSE; 1703 } 1704 1705 return cBOOL(_is_utf8_char_helper(s, e, flags)); 1706 } 1707 1708 /* 1709 1710 =for apidoc is_utf8_fixed_width_buf_flags 1711 1712 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len> 1713 is entirely valid UTF-8, subject to the restrictions given by C<flags>; 1714 otherwise it returns FALSE. 1715 1716 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted 1717 without restriction. If the final few bytes of the buffer do not form a 1718 complete code point, this will return TRUE anyway, provided that 1719 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them. 1720 1721 If C<flags> in non-zero, it can be any combination of the 1722 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the 1723 same meanings. 1724 1725 This function differs from C<L</is_utf8_string_flags>> only in that the latter 1726 returns FALSE if the final few bytes of the string don't form a complete code 1727 point. 1728 1729 =cut 1730 */ 1731 #define is_utf8_fixed_width_buf_flags(s, len, flags) \ 1732 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags) 1733 1734 /* 1735 1736 =for apidoc is_utf8_fixed_width_buf_loc_flags 1737 1738 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the 1739 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point 1740 to the beginning of any partial character at the end of the buffer; if there is 1741 no partial character C<*ep> will contain C<s>+C<len>. 1742 1743 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>. 1744 1745 =cut 1746 */ 1747 1748 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \ 1749 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags) 1750 1751 /* 1752 1753 =for apidoc is_utf8_fixed_width_buf_loclen_flags 1754 1755 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of 1756 complete, valid characters found in the C<el> pointer. 1757 1758 =cut 1759 */ 1760 1761 PERL_STATIC_INLINE bool 1762 S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, 1763 STRLEN len, 1764 const U8 **ep, 1765 STRLEN *el, 1766 const U32 flags) 1767 { 1768 const U8 * maybe_partial; 1769 1770 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS; 1771 1772 if (! ep) { 1773 ep = &maybe_partial; 1774 } 1775 1776 /* If it's entirely valid, return that; otherwise see if the only error is 1777 * that the final few bytes are for a partial character */ 1778 return is_utf8_string_loclen_flags(s, len, ep, el, flags) 1779 || is_utf8_valid_partial_char_flags(*ep, s + len, flags); 1780 } 1781 1782 PERL_STATIC_INLINE UV 1783 S_utf8n_to_uvchr_msgs(const U8 *s, 1784 STRLEN curlen, 1785 STRLEN *retlen, 1786 const U32 flags, 1787 U32 * errors, 1788 AV ** msgs) 1789 { 1790 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the 1791 * simple cases, and, if necessary calls a helper function to deal with the 1792 * more complex ones. Almost all well-formed non-problematic code points 1793 * are considered simple, so that it's unlikely that the helper function 1794 * will need to be called. 1795 * 1796 * This is an adaptation of the tables and algorithm given in 1797 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides 1798 * comprehensive documentation of the original version. A copyright notice 1799 * for the original version is given at the beginning of this file. The 1800 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[]. 1801 */ 1802 1803 const U8 * const s0 = s; 1804 const U8 * send = s0 + curlen; 1805 UV uv = 0; /* The 0 silences some stupid compilers */ 1806 UV state = 0; 1807 1808 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; 1809 1810 /* This dfa is fast. If it accepts the input, it was for a well-formed, 1811 * non-problematic code point, which can be returned immediately. 1812 * Otherwise we call a helper function to figure out the more complicated 1813 * cases. */ 1814 1815 while (s < send && LIKELY(state != 1)) { 1816 UV type = PL_strict_utf8_dfa_tab[*s]; 1817 1818 uv = (state == 0) 1819 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s)) 1820 : UTF8_ACCUMULATE(uv, *s); 1821 state = PL_strict_utf8_dfa_tab[256 + state + type]; 1822 1823 if (state != 0) { 1824 s++; 1825 continue; 1826 } 1827 1828 if (retlen) { 1829 *retlen = s - s0 + 1; 1830 } 1831 if (errors) { 1832 *errors = 0; 1833 } 1834 if (msgs) { 1835 *msgs = NULL; 1836 } 1837 1838 return uv; 1839 } 1840 1841 /* Here is potentially problematic. Use the full mechanism */ 1842 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs); 1843 } 1844 1845 /* ------------------------------- perl.h ----------------------------- */ 1846 1847 /* 1848 =head1 Miscellaneous Functions 1849 1850 =for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name 1851 1852 Test that the given C<pv> doesn't contain any internal C<NUL> characters. 1853 If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE. 1854 1855 Return TRUE if the name is safe. 1856 1857 Used by the C<IS_SAFE_SYSCALL()> macro. 1858 1859 =cut 1860 */ 1861 1862 PERL_STATIC_INLINE bool 1863 S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) { 1864 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs 1865 * perl itself uses xce*() functions which accept 8-bit strings. 1866 */ 1867 1868 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL; 1869 1870 if (len > 1) { 1871 char *null_at; 1872 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) { 1873 SETERRNO(ENOENT, LIB_INVARG); 1874 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS), 1875 "Invalid \\0 character in %s for %s: %s\\0%s", 1876 what, op_name, pv, null_at+1); 1877 return FALSE; 1878 } 1879 } 1880 1881 return TRUE; 1882 } 1883 1884 /* 1885 1886 Return true if the supplied filename has a newline character 1887 immediately before the first (hopefully only) NUL. 1888 1889 My original look at this incorrectly used the len from SvPV(), but 1890 that's incorrect, since we allow for a NUL in pv[len-1]. 1891 1892 So instead, strlen() and work from there. 1893 1894 This allow for the user reading a filename, forgetting to chomp it, 1895 then calling: 1896 1897 open my $foo, "$file\0"; 1898 1899 */ 1900 1901 #ifdef PERL_CORE 1902 1903 PERL_STATIC_INLINE bool 1904 S_should_warn_nl(const char *pv) { 1905 STRLEN len; 1906 1907 PERL_ARGS_ASSERT_SHOULD_WARN_NL; 1908 1909 len = strlen(pv); 1910 1911 return len > 0 && pv[len-1] == '\n'; 1912 } 1913 1914 #endif 1915 1916 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ 1917 1918 #define MAX_CHARSET_NAME_LENGTH 2 1919 1920 PERL_STATIC_INLINE const char * 1921 get_regex_charset_name(const U32 flags, STRLEN* const lenp) 1922 { 1923 /* Returns a string that corresponds to the name of the regex character set 1924 * given by 'flags', and *lenp is set the length of that string, which 1925 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ 1926 1927 *lenp = 1; 1928 switch (get_regex_charset(flags)) { 1929 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS; 1930 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; 1931 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; 1932 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; 1933 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 1934 *lenp = 2; 1935 return ASCII_MORE_RESTRICT_PAT_MODS; 1936 } 1937 /* The NOT_REACHED; hides an assert() which has a rather complex 1938 * definition in perl.h. */ 1939 NOT_REACHED; /* NOTREACHED */ 1940 return "?"; /* Unknown */ 1941 } 1942 1943 /* 1944 1945 Return false if any get magic is on the SV other than taint magic. 1946 1947 */ 1948 1949 PERL_STATIC_INLINE bool 1950 S_sv_only_taint_gmagic(SV *sv) { 1951 MAGIC *mg = SvMAGIC(sv); 1952 1953 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC; 1954 1955 while (mg) { 1956 if (mg->mg_type != PERL_MAGIC_taint 1957 && !(mg->mg_flags & MGf_GSKIP) 1958 && mg->mg_virtual->svt_get) { 1959 return FALSE; 1960 } 1961 mg = mg->mg_moremagic; 1962 } 1963 1964 return TRUE; 1965 } 1966 1967 /* ------------------ cop.h ------------------------------------------- */ 1968 1969 1970 /* Enter a block. Push a new base context and return its address. */ 1971 1972 PERL_STATIC_INLINE PERL_CONTEXT * 1973 S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) 1974 { 1975 PERL_CONTEXT * cx; 1976 1977 PERL_ARGS_ASSERT_CX_PUSHBLOCK; 1978 1979 CXINC; 1980 cx = CX_CUR(); 1981 cx->cx_type = type; 1982 cx->blk_gimme = gimme; 1983 cx->blk_oldsaveix = saveix; 1984 cx->blk_oldsp = (I32)(sp - PL_stack_base); 1985 cx->blk_oldcop = PL_curcop; 1986 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack); 1987 cx->blk_oldscopesp = PL_scopestack_ix; 1988 cx->blk_oldpm = PL_curpm; 1989 cx->blk_old_tmpsfloor = PL_tmps_floor; 1990 1991 PL_tmps_floor = PL_tmps_ix; 1992 CX_DEBUG(cx, "PUSH"); 1993 return cx; 1994 } 1995 1996 1997 /* Exit a block (RETURN and LAST). */ 1998 1999 PERL_STATIC_INLINE void 2000 S_cx_popblock(pTHX_ PERL_CONTEXT *cx) 2001 { 2002 PERL_ARGS_ASSERT_CX_POPBLOCK; 2003 2004 CX_DEBUG(cx, "POP"); 2005 /* these 3 are common to cx_popblock and cx_topblock */ 2006 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2007 PL_scopestack_ix = cx->blk_oldscopesp; 2008 PL_curpm = cx->blk_oldpm; 2009 2010 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats 2011 * and leaves a CX entry lying around for repeated use, so 2012 * skip for multicall */ \ 2013 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx)) 2014 || PL_savestack_ix == cx->blk_oldsaveix); 2015 PL_curcop = cx->blk_oldcop; 2016 PL_tmps_floor = cx->blk_old_tmpsfloor; 2017 } 2018 2019 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO). 2020 * Whereas cx_popblock() restores the state to the point just before 2021 * cx_pushblock() was called, cx_topblock() restores it to the point just 2022 * *after* cx_pushblock() was called. */ 2023 2024 PERL_STATIC_INLINE void 2025 S_cx_topblock(pTHX_ PERL_CONTEXT *cx) 2026 { 2027 PERL_ARGS_ASSERT_CX_TOPBLOCK; 2028 2029 CX_DEBUG(cx, "TOP"); 2030 /* these 3 are common to cx_popblock and cx_topblock */ 2031 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 2032 PL_scopestack_ix = cx->blk_oldscopesp; 2033 PL_curpm = cx->blk_oldpm; 2034 2035 PL_stack_sp = PL_stack_base + cx->blk_oldsp; 2036 } 2037 2038 2039 PERL_STATIC_INLINE void 2040 S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) 2041 { 2042 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub); 2043 2044 PERL_ARGS_ASSERT_CX_PUSHSUB; 2045 2046 PERL_DTRACE_PROBE_ENTRY(cv); 2047 cx->blk_sub.cv = cv; 2048 cx->blk_sub.olddepth = CvDEPTH(cv); 2049 cx->blk_sub.prevcomppad = PL_comppad; 2050 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0; 2051 cx->blk_sub.retop = retop; 2052 SvREFCNT_inc_simple_void_NN(cv); 2053 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF); 2054 } 2055 2056 2057 /* subsets of cx_popsub() */ 2058 2059 PERL_STATIC_INLINE void 2060 S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) 2061 { 2062 CV *cv; 2063 2064 PERL_ARGS_ASSERT_CX_POPSUB_COMMON; 2065 assert(CxTYPE(cx) == CXt_SUB); 2066 2067 PL_comppad = cx->blk_sub.prevcomppad; 2068 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2069 cv = cx->blk_sub.cv; 2070 CvDEPTH(cv) = cx->blk_sub.olddepth; 2071 cx->blk_sub.cv = NULL; 2072 SvREFCNT_dec(cv); 2073 } 2074 2075 2076 /* handle the @_ part of leaving a sub */ 2077 2078 PERL_STATIC_INLINE void 2079 S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) 2080 { 2081 AV *av; 2082 2083 PERL_ARGS_ASSERT_CX_POPSUB_ARGS; 2084 assert(CxTYPE(cx) == CXt_SUB); 2085 assert(AvARRAY(MUTABLE_AV( 2086 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[ 2087 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad); 2088 2089 CX_POP_SAVEARRAY(cx); 2090 av = MUTABLE_AV(PAD_SVl(0)); 2091 if (UNLIKELY(AvREAL(av))) 2092 /* abandon @_ if it got reified */ 2093 clear_defarray(av, 0); 2094 else { 2095 CLEAR_ARGARRAY(av); 2096 } 2097 } 2098 2099 2100 PERL_STATIC_INLINE void 2101 S_cx_popsub(pTHX_ PERL_CONTEXT *cx) 2102 { 2103 PERL_ARGS_ASSERT_CX_POPSUB; 2104 assert(CxTYPE(cx) == CXt_SUB); 2105 2106 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv); 2107 2108 if (CxHASARGS(cx)) 2109 cx_popsub_args(cx); 2110 cx_popsub_common(cx); 2111 } 2112 2113 2114 PERL_STATIC_INLINE void 2115 S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) 2116 { 2117 PERL_ARGS_ASSERT_CX_PUSHFORMAT; 2118 2119 cx->blk_format.cv = cv; 2120 cx->blk_format.retop = retop; 2121 cx->blk_format.gv = gv; 2122 cx->blk_format.dfoutgv = PL_defoutgv; 2123 cx->blk_format.prevcomppad = PL_comppad; 2124 cx->blk_u16 = 0; 2125 2126 SvREFCNT_inc_simple_void_NN(cv); 2127 CvDEPTH(cv)++; 2128 SvREFCNT_inc_void(cx->blk_format.dfoutgv); 2129 } 2130 2131 2132 PERL_STATIC_INLINE void 2133 S_cx_popformat(pTHX_ PERL_CONTEXT *cx) 2134 { 2135 CV *cv; 2136 GV *dfout; 2137 2138 PERL_ARGS_ASSERT_CX_POPFORMAT; 2139 assert(CxTYPE(cx) == CXt_FORMAT); 2140 2141 dfout = cx->blk_format.dfoutgv; 2142 setdefout(dfout); 2143 cx->blk_format.dfoutgv = NULL; 2144 SvREFCNT_dec_NN(dfout); 2145 2146 PL_comppad = cx->blk_format.prevcomppad; 2147 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; 2148 cv = cx->blk_format.cv; 2149 cx->blk_format.cv = NULL; 2150 --CvDEPTH(cv); 2151 SvREFCNT_dec_NN(cv); 2152 } 2153 2154 2155 PERL_STATIC_INLINE void 2156 S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 2157 { 2158 PERL_ARGS_ASSERT_CX_PUSHEVAL; 2159 2160 cx->blk_eval.retop = retop; 2161 cx->blk_eval.old_namesv = namesv; 2162 cx->blk_eval.old_eval_root = PL_eval_root; 2163 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL; 2164 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */ 2165 cx->blk_eval.cur_top_env = PL_top_env; 2166 2167 assert(!(PL_in_eval & ~ 0x3F)); 2168 assert(!(PL_op->op_type & ~0x1FF)); 2169 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7); 2170 } 2171 2172 2173 PERL_STATIC_INLINE void 2174 S_cx_popeval(pTHX_ PERL_CONTEXT *cx) 2175 { 2176 SV *sv; 2177 2178 PERL_ARGS_ASSERT_CX_POPEVAL; 2179 assert(CxTYPE(cx) == CXt_EVAL); 2180 2181 PL_in_eval = CxOLD_IN_EVAL(cx); 2182 assert(!(PL_in_eval & 0xc0)); 2183 PL_eval_root = cx->blk_eval.old_eval_root; 2184 sv = cx->blk_eval.cur_text; 2185 if (sv && CxEVAL_TXT_REFCNTED(cx)) { 2186 cx->blk_eval.cur_text = NULL; 2187 SvREFCNT_dec_NN(sv); 2188 } 2189 2190 sv = cx->blk_eval.old_namesv; 2191 if (sv) { 2192 cx->blk_eval.old_namesv = NULL; 2193 SvREFCNT_dec_NN(sv); 2194 } 2195 } 2196 2197 2198 /* push a plain loop, i.e. 2199 * { block } 2200 * while (cond) { block } 2201 * for (init;cond;continue) { block } 2202 * This loop can be last/redo'ed etc. 2203 */ 2204 2205 PERL_STATIC_INLINE void 2206 S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) 2207 { 2208 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN; 2209 cx->blk_loop.my_op = cLOOP; 2210 } 2211 2212 2213 /* push a true for loop, i.e. 2214 * for var (list) { block } 2215 */ 2216 2217 PERL_STATIC_INLINE void 2218 S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) 2219 { 2220 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR; 2221 2222 /* this one line is common with cx_pushloop_plain */ 2223 cx->blk_loop.my_op = cLOOP; 2224 2225 cx->blk_loop.itervar_u.svp = (SV**)itervarp; 2226 cx->blk_loop.itersave = itersave; 2227 #ifdef USE_ITHREADS 2228 cx->blk_loop.oldcomppad = PL_comppad; 2229 #endif 2230 } 2231 2232 2233 /* pop all loop types, including plain */ 2234 2235 PERL_STATIC_INLINE void 2236 S_cx_poploop(pTHX_ PERL_CONTEXT *cx) 2237 { 2238 PERL_ARGS_ASSERT_CX_POPLOOP; 2239 2240 assert(CxTYPE_is_LOOP(cx)); 2241 if ( CxTYPE(cx) == CXt_LOOP_ARY 2242 || CxTYPE(cx) == CXt_LOOP_LAZYSV) 2243 { 2244 /* Free ary or cur. This assumes that state_u.ary.ary 2245 * aligns with state_u.lazysv.cur. See cx_dup() */ 2246 SV *sv = cx->blk_loop.state_u.lazysv.cur; 2247 cx->blk_loop.state_u.lazysv.cur = NULL; 2248 SvREFCNT_dec_NN(sv); 2249 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) { 2250 sv = cx->blk_loop.state_u.lazysv.end; 2251 cx->blk_loop.state_u.lazysv.end = NULL; 2252 SvREFCNT_dec_NN(sv); 2253 } 2254 } 2255 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) { 2256 SV *cursv; 2257 SV **svp = (cx)->blk_loop.itervar_u.svp; 2258 if ((cx->cx_type & CXp_FOR_GV)) 2259 svp = &GvSV((GV*)svp); 2260 cursv = *svp; 2261 *svp = cx->blk_loop.itersave; 2262 cx->blk_loop.itersave = NULL; 2263 SvREFCNT_dec(cursv); 2264 } 2265 } 2266 2267 2268 PERL_STATIC_INLINE void 2269 S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) 2270 { 2271 PERL_ARGS_ASSERT_CX_PUSHWHEN; 2272 2273 cx->blk_givwhen.leave_op = cLOGOP->op_other; 2274 } 2275 2276 2277 PERL_STATIC_INLINE void 2278 S_cx_popwhen(pTHX_ PERL_CONTEXT *cx) 2279 { 2280 PERL_ARGS_ASSERT_CX_POPWHEN; 2281 assert(CxTYPE(cx) == CXt_WHEN); 2282 2283 PERL_UNUSED_ARG(cx); 2284 PERL_UNUSED_CONTEXT; 2285 /* currently NOOP */ 2286 } 2287 2288 2289 PERL_STATIC_INLINE void 2290 S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) 2291 { 2292 PERL_ARGS_ASSERT_CX_PUSHGIVEN; 2293 2294 cx->blk_givwhen.leave_op = cLOGOP->op_other; 2295 cx->blk_givwhen.defsv_save = orig_defsv; 2296 } 2297 2298 2299 PERL_STATIC_INLINE void 2300 S_cx_popgiven(pTHX_ PERL_CONTEXT *cx) 2301 { 2302 SV *sv; 2303 2304 PERL_ARGS_ASSERT_CX_POPGIVEN; 2305 assert(CxTYPE(cx) == CXt_GIVEN); 2306 2307 sv = GvSV(PL_defgv); 2308 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save; 2309 cx->blk_givwhen.defsv_save = NULL; 2310 SvREFCNT_dec(sv); 2311 } 2312 2313 /* ------------------ util.h ------------------------------------------- */ 2314 2315 /* 2316 =head1 Miscellaneous Functions 2317 2318 =for apidoc foldEQ 2319 2320 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 2321 same 2322 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes 2323 match themselves and their opposite case counterparts. Non-cased and non-ASCII 2324 range bytes match only themselves. 2325 2326 =cut 2327 */ 2328 2329 PERL_STATIC_INLINE I32 2330 Perl_foldEQ(const char *s1, const char *s2, I32 len) 2331 { 2332 const U8 *a = (const U8 *)s1; 2333 const U8 *b = (const U8 *)s2; 2334 2335 PERL_ARGS_ASSERT_FOLDEQ; 2336 2337 assert(len >= 0); 2338 2339 while (len--) { 2340 if (*a != *b && *a != PL_fold[*b]) 2341 return 0; 2342 a++,b++; 2343 } 2344 return 1; 2345 } 2346 2347 PERL_STATIC_INLINE I32 2348 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) 2349 { 2350 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds 2351 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and 2352 * does not check for this. Nor does it check that the strings each have 2353 * at least 'len' characters. */ 2354 2355 const U8 *a = (const U8 *)s1; 2356 const U8 *b = (const U8 *)s2; 2357 2358 PERL_ARGS_ASSERT_FOLDEQ_LATIN1; 2359 2360 assert(len >= 0); 2361 2362 while (len--) { 2363 if (*a != *b && *a != PL_fold_latin1[*b]) { 2364 return 0; 2365 } 2366 a++, b++; 2367 } 2368 return 1; 2369 } 2370 2371 /* 2372 =for apidoc foldEQ_locale 2373 2374 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 2375 same case-insensitively in the current locale; false otherwise. 2376 2377 =cut 2378 */ 2379 2380 PERL_STATIC_INLINE I32 2381 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) 2382 { 2383 dVAR; 2384 const U8 *a = (const U8 *)s1; 2385 const U8 *b = (const U8 *)s2; 2386 2387 PERL_ARGS_ASSERT_FOLDEQ_LOCALE; 2388 2389 assert(len >= 0); 2390 2391 while (len--) { 2392 if (*a != *b && *a != PL_fold_locale[*b]) 2393 return 0; 2394 a++,b++; 2395 } 2396 return 1; 2397 } 2398 2399 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) 2400 2401 PERL_STATIC_INLINE void * 2402 S_my_memrchr(const char * s, const char c, const STRLEN len) 2403 { 2404 /* memrchr(), since many platforms lack it */ 2405 2406 const char * t = s + len - 1; 2407 2408 PERL_ARGS_ASSERT_MY_MEMRCHR; 2409 2410 while (t >= s) { 2411 if (*t == c) { 2412 return (void *) t; 2413 } 2414 t--; 2415 } 2416 2417 return NULL; 2418 } 2419 2420 #endif 2421 2422 /* 2423 * ex: set ts=8 sts=4 sw=4 et: 2424 */ 2425