1 /* inline.h 2 * 3 * Copyright (C) 2012 by Larry Wall and others 4 * 5 * You may distribute under the terms of either the GNU General Public 6 * License or the Artistic License, as specified in the README file. 7 * 8 * This file contains tables and code adapted from 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this 10 * copyright notice: 11 12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 13 14 Permission is hereby granted, free of charge, to any person obtaining a copy of 15 this software and associated documentation files (the "Software"), to deal in 16 the Software without restriction, including without limitation the rights to 17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 18 of the Software, and to permit persons to whom the Software is furnished to do 19 so, subject to the following conditions: 20 21 The above copyright notice and this permission notice shall be included in all 22 copies or substantial portions of the Software. 23 24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 SOFTWARE. 31 32 * 33 * This file is a home for static inline functions that cannot go in other 34 * header files, because they depend on proto.h (included after most other 35 * headers) or struct definitions. 36 * 37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but 38 * whose details should be exposed to the compiler, for such things as tail 39 * call optimization. 40 * 41 * Each section names the header file that the functions "belong" to. 42 */ 43 44 /* ------------------------------- av.h ------------------------------- */ 45 46 /* 47 =for apidoc_section $AV 48 =for apidoc av_count 49 Returns the number of elements in the array C<av>. This is the true length of 50 the array, including any undefined elements. It is always the same as 51 S<C<av_top_index(av) + 1>>. 52 53 =cut 54 */ 55 PERL_STATIC_INLINE Size_t 56 Perl_av_count(pTHX_ AV *av) 57 { 58 PERL_ARGS_ASSERT_AV_COUNT; 59 assert(SvTYPE(av) == SVt_PVAV); 60 61 return AvFILL(av) + 1; 62 } 63 64 /* ------------------------------- av.c ------------------------------- */ 65 66 /* 67 =for apidoc av_store_simple 68 69 This is a cut-down version of av_store that assumes that the array is 70 very straightforward - no magic, not readonly, and AvREAL - and that 71 C<key> is not negative. This function MUST NOT be used in situations 72 where any of those assumptions may not hold. 73 74 Stores an SV in an array. The array index is specified as C<key>. It 75 can be dereferenced to get the C<SV*> that was stored there (= C<val>)). 76 77 Note that the caller is responsible for suitably incrementing the reference 78 count of C<val> before the call. 79 80 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>. 81 82 =cut 83 */ 84 85 PERL_STATIC_INLINE SV** 86 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val) 87 { 88 SV** ary; 89 90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE; 91 assert(SvTYPE(av) == SVt_PVAV); 92 assert(!SvMAGICAL(av)); 93 assert(!SvREADONLY(av)); 94 assert(AvREAL(av)); 95 assert(key > -1); 96 97 ary = AvARRAY(av); 98 99 if (AvFILLp(av) < key) { 100 if (key > AvMAX(av)) { 101 av_extend(av,key); 102 ary = AvARRAY(av); 103 } 104 AvFILLp(av) = key; 105 } else 106 SvREFCNT_dec(ary[key]); 107 108 ary[key] = val; 109 return &ary[key]; 110 } 111 112 /* 113 =for apidoc av_fetch_simple 114 115 This is a cut-down version of av_fetch that assumes that the array is 116 very straightforward - no magic, not readonly, and AvREAL - and that 117 C<key> is not negative. This function MUST NOT be used in situations 118 where any of those assumptions may not hold. 119 120 Returns the SV at the specified index in the array. The C<key> is the 121 index. If lval is true, you are guaranteed to get a real SV back (in case 122 it wasn't real before), which you can then modify. Check that the return 123 value is non-null before dereferencing it to a C<SV*>. 124 125 The rough perl equivalent is C<$myarray[$key]>. 126 127 =cut 128 */ 129 130 PERL_STATIC_INLINE SV** 131 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval) 132 { 133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE; 134 assert(SvTYPE(av) == SVt_PVAV); 135 assert(!SvMAGICAL(av)); 136 assert(!SvREADONLY(av)); 137 assert(AvREAL(av)); 138 assert(key > -1); 139 140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) { 141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL; 142 } else { 143 return &AvARRAY(av)[key]; 144 } 145 } 146 147 /* 148 =for apidoc av_push_simple 149 150 This is a cut-down version of av_push that assumes that the array is very 151 straightforward - no magic, not readonly, and AvREAL - and that C<key> is 152 not less than -1. This function MUST NOT be used in situations where any 153 of those assumptions may not hold. 154 155 Pushes an SV (transferring control of one reference count) onto the end of the 156 array. The array will grow automatically to accommodate the addition. 157 158 Perl equivalent: C<push @myarray, $val;>. 159 160 =cut 161 */ 162 163 PERL_STATIC_INLINE void 164 Perl_av_push_simple(pTHX_ AV *av, SV *val) 165 { 166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE; 167 assert(SvTYPE(av) == SVt_PVAV); 168 assert(!SvMAGICAL(av)); 169 assert(!SvREADONLY(av)); 170 assert(AvREAL(av)); 171 assert(AvFILLp(av) > -2); 172 173 (void)av_store_simple(av,AvFILLp(av)+1,val); 174 } 175 176 /* 177 =for apidoc av_new_alloc 178 179 This implements L<perlapi/C<newAV_alloc_x>> 180 and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this 181 functionality. 182 183 Creates a new AV and allocates its SV* array. 184 185 This is similar to, but more efficient than doing: 186 187 AV *av = newAV(); 188 av_extend(av, key); 189 190 The size parameter is used to pre-allocate a SV* array large enough to 191 hold at least elements C<0..(size-1)>. C<size> must be at least 1. 192 193 The C<zeroflag> parameter controls whether or not the array is NULL 194 initialized. 195 196 =cut 197 */ 198 199 PERL_STATIC_INLINE AV * 200 Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag) 201 { 202 AV * const av = newAV(); 203 SV** ary; 204 PERL_ARGS_ASSERT_AV_NEW_ALLOC; 205 assert(size > 0); 206 207 Newx(ary, size, SV*); /* Newx performs the memwrap check */ 208 AvALLOC(av) = ary; 209 AvARRAY(av) = ary; 210 AvMAX(av) = size - 1; 211 212 if (zeroflag) 213 Zero(ary, size, SV*); 214 215 return av; 216 } 217 218 219 /* remove (AvARRAY(av) - AvALLOC(av)) offset from empty array */ 220 221 PERL_STATIC_INLINE void 222 Perl_av_remove_offset(pTHX_ AV *av) 223 { 224 PERL_ARGS_ASSERT_AV_REMOVE_OFFSET; 225 assert(AvFILLp(av) == -1); 226 SSize_t i = AvARRAY(av) - AvALLOC(av); 227 if (i) { 228 AvARRAY(av) = AvALLOC(av); 229 AvMAX(av) += i; 230 #ifdef PERL_RC_STACK 231 Zero(AvALLOC(av), i, SV*); 232 #endif 233 } 234 } 235 236 237 /* ------------------------------- cv.h ------------------------------- */ 238 239 /* 240 =for apidoc_section $CV 241 =for apidoc CvGV 242 Returns the GV associated with the CV C<sv>, reifying it if necessary. 243 244 =cut 245 */ 246 PERL_STATIC_INLINE GV * 247 Perl_CvGV(pTHX_ CV *sv) 248 { 249 PERL_ARGS_ASSERT_CVGV; 250 251 return CvNAMED(sv) 252 ? Perl_cvgv_from_hek(aTHX_ sv) 253 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; 254 } 255 256 /* 257 =for apidoc CvDEPTH 258 Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a 259 recursive call. 260 261 =cut 262 */ 263 PERL_STATIC_INLINE I32 * 264 Perl_CvDEPTH(const CV * const sv) 265 { 266 PERL_ARGS_ASSERT_CVDEPTH; 267 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM); 268 269 return &((XPVCV*)SvANY(sv))->xcv_depth; 270 } 271 272 /* 273 CvPROTO returns the prototype as stored, which is not necessarily what 274 the interpreter should be using. Specifically, the interpreter assumes 275 that spaces have been stripped, which has been the case if the prototype 276 was added by toke.c, but is generally not the case if it was added elsewhere. 277 Since we can't enforce the spacelessness at assignment time, this routine 278 provides a temporary copy at parse time with spaces removed. 279 I<orig> is the start of the original buffer, I<len> is the length of the 280 prototype and will be updated when this returns. 281 */ 282 283 #ifdef PERL_CORE 284 PERL_STATIC_INLINE char * 285 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len) 286 { 287 SV * tmpsv; 288 char * tmps; 289 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP); 290 tmps = SvPVX(tmpsv); 291 while ((*len)--) { 292 if (!isSPACE(*orig)) 293 *tmps++ = *orig; 294 orig++; 295 } 296 *tmps = '\0'; 297 *len = tmps - SvPVX(tmpsv); 298 return SvPVX(tmpsv); 299 } 300 #endif 301 302 /* ------------------------------- iperlsys.h ------------------------------- */ 303 #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS) 304 305 /* Otherwise this function is implemented as macros in iperlsys.h */ 306 307 PERL_STATIC_INLINE bool 308 S_PerlEnv_putenv(pTHX_ char * str) 309 { 310 PERL_ARGS_ASSERT_PERLENV_PUTENV; 311 312 ENV_LOCK; 313 bool retval = putenv(str); 314 ENV_UNLOCK; 315 316 return retval; 317 } 318 319 #endif 320 321 /* ------------------------------- mg.h ------------------------------- */ 322 323 #if defined(PERL_CORE) || defined(PERL_EXT) 324 /* assumes get-magic and stringification have already occurred */ 325 PERL_STATIC_INLINE STRLEN 326 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) 327 { 328 assert(mg->mg_type == PERL_MAGIC_regex_global); 329 assert(mg->mg_len != -1); 330 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv)) 331 return (STRLEN)mg->mg_len; 332 else { 333 const STRLEN pos = (STRLEN)mg->mg_len; 334 /* Without this check, we may read past the end of the buffer: */ 335 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1; 336 return sv_or_pv_pos_u2b(sv, s, pos, NULL); 337 } 338 } 339 #endif 340 341 /* ------------------------------- pad.h ------------------------------ */ 342 343 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C) 344 PERL_STATIC_INLINE bool 345 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) 346 { 347 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE; 348 349 /* is seq within the range _LOW to _HIGH ? 350 * This is complicated by the fact that PL_cop_seqmax 351 * may have wrapped around at some point */ 352 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO) 353 return FALSE; /* not yet introduced */ 354 355 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) { 356 /* in compiling scope */ 357 if ( 358 (seq > COP_SEQ_RANGE_LOW(pn)) 359 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1)) 360 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1)) 361 ) 362 return TRUE; 363 } 364 else if ( 365 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) 366 ? 367 ( seq > COP_SEQ_RANGE_LOW(pn) 368 || seq <= COP_SEQ_RANGE_HIGH(pn)) 369 370 : ( seq > COP_SEQ_RANGE_LOW(pn) 371 && seq <= COP_SEQ_RANGE_HIGH(pn)) 372 ) 373 return TRUE; 374 return FALSE; 375 } 376 #endif 377 378 /* ------------------------------- pp.h ------------------------------- */ 379 380 PERL_STATIC_INLINE Stack_off_t 381 Perl_TOPMARK(pTHX) 382 { 383 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 384 "MARK top %p %" IVdf "\n", 385 PL_markstack_ptr, 386 (IV)*PL_markstack_ptr))); 387 return *PL_markstack_ptr; 388 } 389 390 PERL_STATIC_INLINE Stack_off_t 391 Perl_POPMARK(pTHX) 392 { 393 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, 394 "MARK pop %p %" IVdf "\n", 395 (PL_markstack_ptr-1), 396 (IV)*(PL_markstack_ptr-1)))); 397 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow"); 398 return *PL_markstack_ptr--; 399 } 400 401 /* 402 =for apidoc_section $rpp 403 404 =for apidoc rpp_extend 405 Ensures that there is space on the stack to push C<n> items, extending it 406 if necessary. 407 408 =cut 409 */ 410 411 PERL_STATIC_INLINE void 412 Perl_rpp_extend(pTHX_ SSize_t n) 413 { 414 PERL_ARGS_ASSERT_RPP_EXTEND; 415 416 EXTEND_HWM_SET(PL_stack_sp, n); 417 #ifndef STRESS_REALLOC 418 if (UNLIKELY(_EXTEND_NEEDS_GROW(PL_stack_sp, n))) 419 #endif 420 { 421 (void)stack_grow(PL_stack_sp, PL_stack_sp, n); 422 } 423 } 424 425 426 /* 427 =for apidoc rpp_popfree_to 428 429 Pop and free all items on the argument stack above C<sp>. On return, 430 C<PL_stack_sp> will be equal to C<sp>. 431 432 =cut 433 */ 434 435 PERL_STATIC_INLINE void 436 Perl_rpp_popfree_to(pTHX_ SV **sp) 437 { 438 PERL_ARGS_ASSERT_RPP_POPFREE_TO; 439 440 assert(sp <= PL_stack_sp); 441 #ifdef PERL_RC_STACK 442 assert(rpp_stack_is_rc()); 443 while (PL_stack_sp > sp) { 444 SV *sv = *PL_stack_sp--; 445 SvREFCNT_dec(sv); 446 } 447 #else 448 PL_stack_sp = sp; 449 #endif 450 } 451 452 453 /* 454 =for apidoc rpp_popfree_to_NN 455 456 A variant of rpp_popfree_to() which assumes that all the pointers being 457 popped off the stack are non-NULL. 458 459 =cut 460 */ 461 462 PERL_STATIC_INLINE void 463 Perl_rpp_popfree_to_NN(pTHX_ SV **sp) 464 { 465 PERL_ARGS_ASSERT_RPP_POPFREE_TO_NN; 466 467 assert(sp <= PL_stack_sp); 468 #ifdef PERL_RC_STACK 469 assert(rpp_stack_is_rc()); 470 while (PL_stack_sp > sp) { 471 SV *sv = *PL_stack_sp--; 472 assert(sv); 473 SvREFCNT_dec_NN(sv); 474 } 475 #else 476 PL_stack_sp = sp; 477 #endif 478 } 479 480 481 /* 482 =for apidoc rpp_popfree_1 483 484 Pop and free the top item on the argument stack and update C<PL_stack_sp>. 485 486 =cut 487 */ 488 489 PERL_STATIC_INLINE void 490 Perl_rpp_popfree_1(pTHX) 491 { 492 PERL_ARGS_ASSERT_RPP_POPFREE_1; 493 494 #ifdef PERL_RC_STACK 495 assert(rpp_stack_is_rc()); 496 SV *sv = *PL_stack_sp--; 497 SvREFCNT_dec(sv); 498 #else 499 PL_stack_sp--; 500 #endif 501 } 502 503 504 /* 505 =for apidoc rpp_popfree_1_NN 506 507 A variant of rpp_popfree_1() which assumes that the pointer being popped 508 off the stack is non-NULL. 509 510 =cut 511 */ 512 513 PERL_STATIC_INLINE void 514 Perl_rpp_popfree_1_NN(pTHX) 515 { 516 PERL_ARGS_ASSERT_RPP_POPFREE_1_NN; 517 518 assert(*PL_stack_sp); 519 #ifdef PERL_RC_STACK 520 assert(rpp_stack_is_rc()); 521 SV *sv = *PL_stack_sp--; 522 SvREFCNT_dec_NN(sv); 523 #else 524 PL_stack_sp--; 525 #endif 526 } 527 528 529 /* 530 =for apidoc rpp_popfree_2 531 532 Pop and free the top two items on the argument stack and update 533 C<PL_stack_sp>. 534 535 =cut 536 */ 537 538 539 PERL_STATIC_INLINE void 540 Perl_rpp_popfree_2(pTHX) 541 { 542 PERL_ARGS_ASSERT_RPP_POPFREE_2; 543 544 #ifdef PERL_RC_STACK 545 assert(rpp_stack_is_rc()); 546 for (int i = 0; i < 2; i++) { 547 SV *sv = *PL_stack_sp--; 548 SvREFCNT_dec(sv); 549 } 550 #else 551 PL_stack_sp -= 2; 552 #endif 553 } 554 555 556 /* 557 =for apidoc rpp_popfree_2_NN 558 559 A variant of rpp_popfree_2() which assumes that the two pointers being 560 popped off the stack are non-NULL. 561 562 =cut 563 */ 564 565 566 PERL_STATIC_INLINE void 567 Perl_rpp_popfree_2_NN(pTHX) 568 { 569 PERL_ARGS_ASSERT_RPP_POPFREE_2_NN; 570 #ifdef PERL_RC_STACK 571 SV *sv2 = *PL_stack_sp--; 572 assert(sv2); 573 SV *sv1 = *PL_stack_sp; 574 assert(sv1); 575 576 assert(rpp_stack_is_rc()); 577 U32 rc1 = SvREFCNT(sv1); 578 U32 rc2 = SvREFCNT(sv2); 579 /* This expression is intended to be true if either of rc1 or rc2 has 580 * the value 0 or 1, but using only a single branch test, rather 581 * than the two branches that a compiler would plant for a boolean 582 * expression. We are working on the assumption that, most of the 583 * time, neither of the args to a binary function will need to be 584 * freed - they're likely to lex vars, or PADTMPs or whatever. 585 * So give the CPU a single branch that is rarely taken. */ 586 if (UNLIKELY( !(rc1>>1) + !(rc2>>1) )) 587 /* at least one of the old SVs needs freeing. Do it the long way */ 588 Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2); 589 else { 590 SvREFCNT(sv1) = rc1 - 1; 591 SvREFCNT(sv2) = rc2 - 1; 592 } 593 PL_stack_sp--; 594 #else 595 PL_stack_sp -= 2; 596 #endif 597 } 598 599 600 /* 601 =for apidoc rpp_pop_1_norc 602 603 Pop and return the top item off the argument stack and update 604 C<PL_stack_sp>. It's similar to rpp_popfree_1(), except that it actually 605 returns a value, and it I<doesn't> decrement the SV's reference count. 606 On non-C<PERL_RC_STACK> builds it actually increments the SV's reference 607 count. 608 609 This is useful in cases where the popped value is immediately embedded 610 somewhere e.g. via av_store(), allowing you skip decrementing and then 611 immediately incrementing the reference count again (and risk prematurely 612 freeing the SV if it had a RC of 1). On non-RC builds, the reference count 613 bookkeeping still works too, which is why it should be used rather than 614 a simple C<*PL_stack_sp-->. 615 616 =cut 617 */ 618 619 PERL_STATIC_INLINE SV* 620 Perl_rpp_pop_1_norc(pTHX) 621 { 622 PERL_ARGS_ASSERT_RPP_POP_1_NORC 623 624 SV *sv = *PL_stack_sp--; 625 626 #ifndef PERL_RC_STACK 627 SvREFCNT_inc(sv); 628 #else 629 assert(rpp_stack_is_rc()); 630 #endif 631 return sv; 632 } 633 634 635 636 /* 637 =for apidoc rpp_push_1 638 =for apidoc_item rpp_push_IMM 639 =for apidoc_item rpp_push_2 640 =for apidoc_item rpp_xpush_1 641 =for apidoc_item rpp_xpush_IMM 642 =for apidoc_item rpp_xpush_2 643 644 Push one or two SVs onto the stack, incrementing their reference counts 645 and updating C<PL_stack_sp>. With the C<x> variants, it extends the stack 646 first. The C<IMM> variants assume that the single argument is an immortal 647 such as <&PL_sv_undef> and, for efficiency, will skip incrementing its 648 reference count. 649 650 =cut 651 */ 652 653 PERL_STATIC_INLINE void 654 Perl_rpp_push_1(pTHX_ SV *sv) 655 { 656 PERL_ARGS_ASSERT_RPP_PUSH_1; 657 658 *++PL_stack_sp = sv; 659 #ifdef PERL_RC_STACK 660 assert(rpp_stack_is_rc()); 661 SvREFCNT_inc_simple_void_NN(sv); 662 #endif 663 } 664 665 PERL_STATIC_INLINE void 666 Perl_rpp_push_IMM(pTHX_ SV *sv) 667 { 668 PERL_ARGS_ASSERT_RPP_PUSH_IMM; 669 670 assert(SvIMMORTAL(sv)); 671 *++PL_stack_sp = sv; 672 #ifdef PERL_RC_STACK 673 assert(rpp_stack_is_rc()); 674 #endif 675 } 676 677 PERL_STATIC_INLINE void 678 Perl_rpp_push_2(pTHX_ SV *sv1, SV *sv2) 679 { 680 PERL_ARGS_ASSERT_RPP_PUSH_2; 681 682 *++PL_stack_sp = sv1; 683 *++PL_stack_sp = sv2; 684 #ifdef PERL_RC_STACK 685 assert(rpp_stack_is_rc()); 686 SvREFCNT_inc_simple_void_NN(sv1); 687 SvREFCNT_inc_simple_void_NN(sv2); 688 #endif 689 } 690 691 PERL_STATIC_INLINE void 692 Perl_rpp_xpush_1(pTHX_ SV *sv) 693 { 694 PERL_ARGS_ASSERT_RPP_XPUSH_1; 695 696 rpp_extend(1); 697 rpp_push_1(sv); 698 } 699 700 PERL_STATIC_INLINE void 701 Perl_rpp_xpush_IMM(pTHX_ SV *sv) 702 { 703 PERL_ARGS_ASSERT_RPP_XPUSH_IMM; 704 705 rpp_extend(1); 706 rpp_push_IMM(sv); 707 } 708 709 PERL_STATIC_INLINE void 710 Perl_rpp_xpush_2(pTHX_ SV *sv1, SV *sv2) 711 { 712 PERL_ARGS_ASSERT_RPP_XPUSH_2; 713 714 rpp_extend(2); 715 rpp_push_2(sv1, sv2); 716 } 717 718 719 /* 720 =for apidoc rpp_push_1_norc 721 722 Push C<sv> onto the stack without incrementing its reference count, and 723 update C<PL_stack_sp>. On non-PERL_RC_STACK builds, mortalise too. 724 725 This is most useful where an SV has just been created and already has a 726 reference count of 1, but has not yet been anchored anywhere. 727 728 =cut 729 */ 730 731 PERL_STATIC_INLINE void 732 Perl_rpp_push_1_norc(pTHX_ SV *sv) 733 { 734 PERL_ARGS_ASSERT_RPP_PUSH_1; 735 736 *++PL_stack_sp = sv; 737 #ifdef PERL_RC_STACK 738 assert(rpp_stack_is_rc()); 739 #else 740 sv_2mortal(sv); 741 #endif 742 } 743 744 745 /* 746 =for apidoc rpp_replace_1_1 747 =for apidoc_item rpp_replace_1_1_NN 748 =for apidoc_item rpp_replace_1_IMM_NN 749 750 Replace the current top stack item with C<sv>, while suitably adjusting 751 reference counts. Equivalent to rpp_popfree_1(); rpp_push_1(sv), but 752 is more efficient and handles both SVs being the same. 753 754 The C<_NN> variant assumes that the pointer on the stack to the SV being 755 freed is non-NULL. 756 757 The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes 758 that the single argument is an immortal such as <&PL_sv_undef> and, for 759 efficiency, will skip incrementing its reference count. 760 761 =cut 762 */ 763 764 PERL_STATIC_INLINE void 765 Perl_rpp_replace_1_1(pTHX_ SV *sv) 766 { 767 PERL_ARGS_ASSERT_RPP_REPLACE_1_1; 768 769 assert(sv); 770 #ifdef PERL_RC_STACK 771 assert(rpp_stack_is_rc()); 772 SV *oldsv = *PL_stack_sp; 773 *PL_stack_sp = sv; 774 SvREFCNT_inc_simple_void_NN(sv); 775 SvREFCNT_dec(oldsv); 776 #else 777 *PL_stack_sp = sv; 778 #endif 779 } 780 781 782 PERL_STATIC_INLINE void 783 Perl_rpp_replace_1_1_NN(pTHX_ SV *sv) 784 { 785 PERL_ARGS_ASSERT_RPP_REPLACE_1_1_NN; 786 787 assert(sv); 788 assert(*PL_stack_sp); 789 #ifdef PERL_RC_STACK 790 assert(rpp_stack_is_rc()); 791 SV *oldsv = *PL_stack_sp; 792 *PL_stack_sp = sv; 793 SvREFCNT_inc_simple_void_NN(sv); 794 SvREFCNT_dec_NN(oldsv); 795 #else 796 *PL_stack_sp = sv; 797 #endif 798 } 799 800 801 PERL_STATIC_INLINE void 802 Perl_rpp_replace_1_IMM_NN(pTHX_ SV *sv) 803 { 804 PERL_ARGS_ASSERT_RPP_REPLACE_1_IMM_NN; 805 806 assert(sv); 807 assert(SvIMMORTAL(sv)); 808 assert(*PL_stack_sp); 809 #ifdef PERL_RC_STACK 810 assert(rpp_stack_is_rc()); 811 SV *oldsv = *PL_stack_sp; 812 *PL_stack_sp = sv; 813 SvREFCNT_dec_NN(oldsv); 814 #else 815 *PL_stack_sp = sv; 816 #endif 817 } 818 819 820 /* 821 =for apidoc rpp_replace_2_1 822 =for apidoc_item rpp_replace_2_1_NN 823 =for apidoc_item rpp_replace_2_IMM_NN 824 825 Replace the current top to stacks item with C<sv>, while suitably 826 adjusting reference counts. Equivalent to rpp_popfree_2(); rpp_push_1(sv), 827 but is more efficient and handles SVs being the same. 828 829 The C<_NN> variant assumes that the pointers on the stack to the SVs being 830 freed are non-NULL. 831 832 The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes 833 that the single argument is an immortal such as <&PL_sv_undef> and, for 834 efficiency, will skip incrementing its reference count. 835 =cut 836 */ 837 838 PERL_STATIC_INLINE void 839 Perl_rpp_replace_2_1(pTHX_ SV *sv) 840 { 841 PERL_ARGS_ASSERT_RPP_REPLACE_2_1; 842 843 #ifdef PERL_RC_STACK 844 assert(rpp_stack_is_rc()); 845 /* replace PL_stack_sp[-1] first; leave PL_stack_sp[0] in place while 846 * we free [-1], so if an exception occurs, [0] will still be freed. 847 */ 848 SV *oldsv = PL_stack_sp[-1]; 849 PL_stack_sp[-1] = sv; 850 SvREFCNT_inc_simple_void_NN(sv); 851 SvREFCNT_dec(oldsv); 852 oldsv = *PL_stack_sp--; 853 SvREFCNT_dec(oldsv); 854 #else 855 *--PL_stack_sp = sv; 856 #endif 857 } 858 859 860 /* Private helper function for _NN and _IMM_NN variants. 861 * Assumes sv has already had its ref count incremented, 862 * ready for being put on the stack. 863 * Intended to be small and fast, since it's inlined into many hot parts of 864 * code. 865 */ 866 867 PERL_STATIC_INLINE void 868 Perl_rpp_replace_2_1_COMMON(pTHX_ SV *sv) 869 { 870 871 assert(sv); 872 #ifdef PERL_RC_STACK 873 SV *sv2 = *PL_stack_sp--; 874 assert(sv2); 875 SV *sv1 = *PL_stack_sp; 876 assert(sv1); 877 878 *PL_stack_sp = sv; 879 assert(rpp_stack_is_rc()); 880 U32 rc1 = SvREFCNT(sv1); 881 U32 rc2 = SvREFCNT(sv2); 882 /* This expression is intended to be true if either of rc1 or rc2 has 883 * the value 0 or 1, but using only a single branch test, rather 884 * than the two branches that a compiler would plant for a boolean 885 * expression. We are working on the assumption that, most of the 886 * time, neither of the args to a binary function will need to be 887 * freed - they're likely to lex vars, or PADTMPs or whatever. 888 * So give the CPU a single branch that is rarely taken. */ 889 if (UNLIKELY( !(rc1>>1) + !(rc2>>1) )) 890 /* at least one of the old SVs needs freeing. Do it the long way */ 891 Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2); 892 else { 893 SvREFCNT(sv1) = rc1 - 1; 894 SvREFCNT(sv2) = rc2 - 1; 895 } 896 #else 897 *--PL_stack_sp = sv; 898 #endif 899 } 900 901 902 PERL_STATIC_INLINE void 903 Perl_rpp_replace_2_1_NN(pTHX_ SV *sv) 904 { 905 PERL_ARGS_ASSERT_RPP_REPLACE_2_1_NN; 906 907 assert(sv); 908 #ifdef PERL_RC_STACK 909 SvREFCNT_inc_simple_void_NN(sv); 910 #endif 911 rpp_replace_2_1_COMMON(sv); 912 } 913 914 915 PERL_STATIC_INLINE void 916 Perl_rpp_replace_2_IMM_NN(pTHX_ SV *sv) 917 { 918 PERL_ARGS_ASSERT_RPP_REPLACE_2_IMM_NN; 919 920 assert(sv); 921 assert(SvIMMORTAL(sv)); 922 rpp_replace_2_1_COMMON(sv); 923 } 924 925 926 /* 927 =for apidoc rpp_replace_at 928 929 Replace the SV at address sp within the stack with C<sv>, while suitably 930 adjusting reference counts. Equivalent to C<*sp = sv>, except with proper 931 reference count handling. 932 933 =cut 934 */ 935 936 PERL_STATIC_INLINE void 937 Perl_rpp_replace_at(pTHX_ SV **sp, SV *sv) 938 { 939 PERL_ARGS_ASSERT_RPP_REPLACE_AT; 940 941 #ifdef PERL_RC_STACK 942 assert(rpp_stack_is_rc()); 943 SV *oldsv = *sp; 944 *sp = sv; 945 SvREFCNT_inc_simple_void_NN(sv); 946 SvREFCNT_dec(oldsv); 947 #else 948 *sp = sv; 949 #endif 950 } 951 952 953 /* 954 =for apidoc rpp_replace_at_NN 955 956 A variant of rpp_replace_at() which assumes that the SV pointer on the 957 stack is non-NULL. 958 959 =cut 960 */ 961 962 PERL_STATIC_INLINE void 963 Perl_rpp_replace_at_NN(pTHX_ SV **sp, SV *sv) 964 { 965 PERL_ARGS_ASSERT_RPP_REPLACE_AT_NN; 966 967 assert(sv); 968 assert(*sp); 969 #ifdef PERL_RC_STACK 970 assert(rpp_stack_is_rc()); 971 SV *oldsv = *sp; 972 *sp = sv; 973 SvREFCNT_inc_simple_void_NN(sv); 974 SvREFCNT_dec_NN(oldsv); 975 #else 976 *sp = sv; 977 #endif 978 } 979 980 981 /* 982 =for apidoc rpp_replace_at_norc 983 984 Replace the SV at address sp within the stack with C<sv>, while suitably 985 adjusting the reference count of the old SV. Equivalent to C<*sp = sv>, 986 except with proper reference count handling. 987 988 C<sv>'s reference count doesn't get incremented. On non-C<PERL_RC_STACK> 989 builds, it gets mortalised too. 990 991 This is most useful where an SV has just been created and already has a 992 reference count of 1, but has not yet been anchored anywhere. 993 994 =cut 995 */ 996 997 PERL_STATIC_INLINE void 998 Perl_rpp_replace_at_norc(pTHX_ SV **sp, SV *sv) 999 { 1000 PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC; 1001 1002 #ifdef PERL_RC_STACK 1003 assert(rpp_stack_is_rc()); 1004 SV *oldsv = *sp; 1005 *sp = sv; 1006 SvREFCNT_dec(oldsv); 1007 #else 1008 *sp = sv; 1009 sv_2mortal(sv); 1010 #endif 1011 } 1012 1013 1014 /* 1015 =for apidoc rpp_replace_at_norc_NN 1016 1017 A variant of rpp_replace_at_norc() which assumes that the SV pointer on the 1018 stack is non-NULL. 1019 1020 =cut 1021 */ 1022 1023 PERL_STATIC_INLINE void 1024 Perl_rpp_replace_at_norc_NN(pTHX_ SV **sp, SV *sv) 1025 { 1026 PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC_NN; 1027 1028 assert(*sp); 1029 #ifdef PERL_RC_STACK 1030 assert(rpp_stack_is_rc()); 1031 SV *oldsv = *sp; 1032 *sp = sv; 1033 SvREFCNT_dec_NN(oldsv); 1034 #else 1035 *sp = sv; 1036 sv_2mortal(sv); 1037 #endif 1038 } 1039 1040 1041 /* 1042 =for apidoc rpp_context 1043 1044 Impose void, scalar or list context on the stack. 1045 First, pop C<extra> items off the stack, then when C<gimme> is: 1046 C<G_LIST>: return as-is. 1047 C<G_VOID>: pop everything back to C<mark> 1048 C<G_SCALAR>: move the top stack item (or C<&PL_sv_undef> if none) to 1049 C<mark+1> and free everything above it. 1050 1051 =cut 1052 */ 1053 1054 PERL_STATIC_INLINE void 1055 Perl_rpp_context(pTHX_ SV **mark, U8 gimme, SSize_t extra) 1056 { 1057 PERL_ARGS_ASSERT_RPP_CONTEXT; 1058 assert(extra >= 0); 1059 assert(mark <= PL_stack_sp - extra); 1060 1061 if (gimme == G_LIST) 1062 mark = PL_stack_sp - extra; 1063 else if (gimme == G_SCALAR) { 1064 SV **svp = PL_stack_sp - extra; 1065 mark++; 1066 if (mark > svp) { 1067 /* empty list (plus extra) */ 1068 rpp_popfree_to(svp); 1069 rpp_extend(1); 1070 *++PL_stack_sp = &PL_sv_undef; 1071 return; 1072 } 1073 /* swap top and bottom list items */ 1074 SV *top = *svp; 1075 *svp = *mark; 1076 *mark = top; 1077 } 1078 rpp_popfree_to(mark); 1079 } 1080 1081 1082 1083 1084 /* 1085 =for apidoc rpp_try_AMAGIC_1 1086 =for apidoc_item rpp_try_AMAGIC_2 1087 1088 Check whether either of the one or two SVs at the top of the stack is 1089 magical or a ref, and in either case handle it specially: invoke get 1090 magic, call an overload method, or replace a ref with a temporary numeric 1091 value, as appropriate. If this function returns true, it indicates that 1092 the correct return value is already on the stack. Intended to be used at 1093 the beginning of the PP function for unary or binary ops. 1094 1095 =cut 1096 */ 1097 1098 PERL_STATIC_INLINE bool 1099 Perl_rpp_try_AMAGIC_1(pTHX_ int method, int flags) 1100 { 1101 return UNLIKELY((SvFLAGS(*PL_stack_sp) & (SVf_ROK|SVs_GMG))) 1102 && Perl_try_amagic_un(aTHX_ method, flags); 1103 } 1104 1105 PERL_STATIC_INLINE bool 1106 Perl_rpp_try_AMAGIC_2(pTHX_ int method, int flags) 1107 { 1108 return UNLIKELY(((SvFLAGS(PL_stack_sp[-1])|SvFLAGS(PL_stack_sp[0])) 1109 & (SVf_ROK|SVs_GMG))) 1110 && Perl_try_amagic_bin(aTHX_ method, flags); 1111 } 1112 1113 1114 /* 1115 =for apidoc rpp_stack_is_rc 1116 1117 Returns a boolean value indicating whether the stack is currently 1118 reference-counted. Note that if the stack is split (bottom half RC, top 1119 half non-RC), this function returns false, even if the top half currently 1120 contains zero items. 1121 1122 =cut 1123 */ 1124 1125 PERL_STATIC_INLINE bool 1126 Perl_rpp_stack_is_rc(pTHX) 1127 { 1128 #ifdef PERL_RC_STACK 1129 return AvREAL(PL_curstack) && !PL_curstackinfo->si_stack_nonrc_base; 1130 #else 1131 return 0; 1132 #endif 1133 1134 } 1135 1136 1137 /* 1138 =for apidoc rpp_is_lone 1139 1140 Indicates whether the stacked SV C<sv> (assumed to be not yet popped off 1141 the stack) is only kept alive due to a single reference from the argument 1142 stack and/or and the temps stack. 1143 1144 This can used for example to decide whether the copying of return values 1145 in rvalue context can be skipped, or whether it shouldn't be assigned to 1146 in lvalue context. 1147 1148 =cut 1149 */ 1150 1151 1152 PERL_STATIC_INLINE bool 1153 Perl_rpp_is_lone(pTHX_ SV *sv) 1154 { 1155 #ifdef PERL_RC_STACK 1156 /* note that rpp_is_lone() can be used in wrapped pp functions, 1157 * where technically the stack is no longer ref-counted; but because 1158 * the args are non-RC copies of RC args further down the stack, we 1159 * can't be in a *completely* non-ref stack. 1160 */ 1161 assert(AvREAL(PL_curstack)); 1162 #endif 1163 1164 return SvREFCNT(sv) <= cBOOL(SvTEMP(sv)) 1165 #ifdef PERL_RC_STACK 1166 + 1 1167 && !SvIMMORTAL(sv) /* PL_sv_undef etc are never stealable */ 1168 #endif 1169 ; 1170 } 1171 1172 1173 /* 1174 =for apidoc rpp_invoke_xs 1175 1176 Call the XS function associated with C<cv>. Wraps the call if necessary to 1177 handle XS functions which are not aware of reference-counted stacks. 1178 1179 =cut 1180 */ 1181 1182 1183 PERL_STATIC_INLINE void 1184 Perl_rpp_invoke_xs(pTHX_ CV *cv) 1185 { 1186 PERL_ARGS_ASSERT_RPP_INVOKE_XS; 1187 1188 #ifdef PERL_RC_STACK 1189 if (!CvXS_RCSTACK(cv)) 1190 Perl_xs_wrap(aTHX_ CvXSUB(cv), cv); 1191 else 1192 #endif 1193 CvXSUB(cv)(aTHX_ cv); 1194 } 1195 1196 1197 1198 1199 /* ----------------------------- regexp.h ----------------------------- */ 1200 1201 /* PVLVs need to act as a superset of all scalar types - they are basically 1202 * PVMGs with a few extra fields. 1203 * REGEXPs are first class scalars, but have many fields that can't be copied 1204 * into a PVLV body. 1205 * 1206 * Hence we take a different approach - instead of a copy, PVLVs store a pointer 1207 * back to the original body. To avoid increasing the size of PVLVs just for the 1208 * rare case of REGEXP assignment, this pointer is stored in the memory usually 1209 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to 1210 * read the pointer from the two possible locations. The macro SvLEN() wraps the 1211 * access to the union's member xpvlenu_len, but there is no equivalent macro 1212 * for wrapping the union's member xpvlenu_rx, hence the direct reference here. 1213 * 1214 * See commit df6b4bd56551f2d3 for more details. */ 1215 1216 PERL_STATIC_INLINE struct regexp * 1217 Perl_ReANY(const REGEXP * const re) 1218 { 1219 XPV* const p = (XPV*)SvANY(re); 1220 1221 PERL_ARGS_ASSERT_REANY; 1222 assert(isREGEXP(re)); 1223 1224 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx 1225 : (struct regexp *)p; 1226 } 1227 1228 /* ------------------------------- utf8.h ------------------------------- */ 1229 1230 /* 1231 =for apidoc_section $unicode 1232 */ 1233 1234 PERL_STATIC_INLINE void 1235 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest) 1236 { 1237 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 1238 * encoded string at '*dest', updating '*dest' to include it */ 1239 1240 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; 1241 1242 if (NATIVE_BYTE_IS_INVARIANT(byte)) 1243 *((*dest)++) = byte; 1244 else { 1245 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); 1246 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); 1247 } 1248 } 1249 1250 /* 1251 =for apidoc valid_utf8_to_uvchr 1252 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is 1253 known that the next character in the input UTF-8 string C<s> is well-formed 1254 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code 1255 points, and non-Unicode code points are allowed. 1256 1257 =cut 1258 1259 */ 1260 1261 PERL_STATIC_INLINE UV 1262 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) 1263 { 1264 const UV expectlen = UTF8SKIP(s); 1265 const U8* send = s + expectlen; 1266 UV uv = *s; 1267 1268 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; 1269 1270 if (retlen) { 1271 *retlen = expectlen; 1272 } 1273 1274 /* An invariant is trivially returned */ 1275 if (expectlen == 1) { 1276 return uv; 1277 } 1278 1279 /* Remove the leading bits that indicate the number of bytes, leaving just 1280 * the bits that are part of the value */ 1281 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 1282 1283 /* Now, loop through the remaining bytes, accumulating each into the 1284 * working total as we go. (I khw tried unrolling the loop for up to 4 1285 * bytes, but there was no performance improvement) */ 1286 for (++s; s < send; s++) { 1287 uv = UTF8_ACCUMULATE(uv, *s); 1288 } 1289 1290 return UNI_TO_NATIVE(uv); 1291 1292 } 1293 1294 /* 1295 =for apidoc is_utf8_invariant_string 1296 1297 Returns TRUE if the first C<len> bytes of the string C<s> are the same 1298 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on 1299 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they 1300 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only 1301 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range 1302 characters are invariant, but so also are the C1 controls. 1303 1304 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you 1305 use this option, that C<s> can't have embedded C<NUL> characters and has to 1306 have a terminating C<NUL> byte). 1307 1308 See also 1309 C<L</is_utf8_string>>, 1310 C<L</is_utf8_string_flags>>, 1311 C<L</is_utf8_string_loc>>, 1312 C<L</is_utf8_string_loc_flags>>, 1313 C<L</is_utf8_string_loclen>>, 1314 C<L</is_utf8_string_loclen_flags>>, 1315 C<L</is_utf8_fixed_width_buf_flags>>, 1316 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1317 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1318 C<L</is_strict_utf8_string>>, 1319 C<L</is_strict_utf8_string_loc>>, 1320 C<L</is_strict_utf8_string_loclen>>, 1321 C<L</is_c9strict_utf8_string>>, 1322 C<L</is_c9strict_utf8_string_loc>>, 1323 and 1324 C<L</is_c9strict_utf8_string_loclen>>. 1325 1326 =cut 1327 1328 */ 1329 1330 #define is_utf8_invariant_string(s, len) \ 1331 is_utf8_invariant_string_loc(s, len, NULL) 1332 1333 /* 1334 =for apidoc is_utf8_invariant_string_loc 1335 1336 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of 1337 the first UTF-8 variant character in the C<ep> pointer; if all characters are 1338 UTF-8 invariant, this function does not change the contents of C<*ep>. 1339 1340 =cut 1341 1342 */ 1343 1344 PERL_STATIC_INLINE bool 1345 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) 1346 { 1347 const U8* send; 1348 const U8* x = s; 1349 1350 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; 1351 1352 if (len == 0) { 1353 len = strlen((const char *)s); 1354 } 1355 1356 send = s + len; 1357 1358 /* This looks like 0x010101... */ 1359 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) 1360 1361 /* This looks like 0x808080... */ 1362 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) 1363 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) 1364 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) 1365 1366 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by 1367 * or'ing together the lowest bits of 'x'. Hopefully the final term gets 1368 * optimized out completely on a 32-bit system, and its mask gets optimized out 1369 * on a 64-bit system */ 1370 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ 1371 | ( PTR2nat(x) >> 1) \ 1372 | ( ( (PTR2nat(x) \ 1373 & PERL_WORD_BOUNDARY_MASK) >> 2)))) 1374 1375 #ifndef EBCDIC 1376 1377 /* Do the word-at-a-time iff there is at least one usable full word. That 1378 * means that after advancing to a word boundary, there still is at least a 1379 * full word left. The number of bytes needed to advance is 'wordsize - 1380 * offset' unless offset is 0. */ 1381 if ((STRLEN) (send - x) >= PERL_WORDSIZE 1382 1383 /* This term is wordsize if subword; 0 if not */ 1384 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 1385 1386 /* 'offset' */ 1387 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 1388 { 1389 1390 /* Process per-byte until reach word boundary. XXX This loop could be 1391 * eliminated if we knew that this platform had fast unaligned reads */ 1392 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 1393 if (! UTF8_IS_INVARIANT(*x)) { 1394 if (ep) { 1395 *ep = x; 1396 } 1397 1398 return FALSE; 1399 } 1400 x++; 1401 } 1402 1403 /* Here, we know we have at least one full word to process. Process 1404 * per-word as long as we have at least a full word left */ 1405 do { 1406 if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { 1407 1408 /* Found a variant. Just return if caller doesn't want its 1409 * exact position */ 1410 if (! ep) { 1411 return FALSE; 1412 } 1413 1414 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ 1415 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 1416 1417 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x); 1418 assert(*ep >= s && *ep < send); 1419 1420 return FALSE; 1421 1422 # else /* If weird byte order, drop into next loop to do byte-at-a-time 1423 checks. */ 1424 1425 break; 1426 # endif 1427 } 1428 1429 x += PERL_WORDSIZE; 1430 1431 } while (x + PERL_WORDSIZE <= send); 1432 } 1433 1434 #endif /* End of ! EBCDIC */ 1435 1436 /* Process per-byte */ 1437 while (x < send) { 1438 if (! UTF8_IS_INVARIANT(*x)) { 1439 if (ep) { 1440 *ep = x; 1441 } 1442 1443 return FALSE; 1444 } 1445 1446 x++; 1447 } 1448 1449 return TRUE; 1450 } 1451 1452 /* See if the platform has builtins for finding the most/least significant bit, 1453 * and which one is right for using on 32 and 64 bit operands */ 1454 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0)) 1455 # if U32SIZE == INTSIZE 1456 # define PERL_CLZ_32 __builtin_clz 1457 # endif 1458 # if defined(U64TYPE) && U64SIZE == INTSIZE 1459 # define PERL_CLZ_64 __builtin_clz 1460 # endif 1461 #endif 1462 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0)) 1463 # if U32SIZE == INTSIZE 1464 # define PERL_CTZ_32 __builtin_ctz 1465 # endif 1466 # if defined(U64TYPE) && U64SIZE == INTSIZE 1467 # define PERL_CTZ_64 __builtin_ctz 1468 # endif 1469 #endif 1470 1471 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0)) 1472 # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32) 1473 # define PERL_CLZ_32 __builtin_clzl 1474 # endif 1475 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64) 1476 # define PERL_CLZ_64 __builtin_clzl 1477 # endif 1478 #endif 1479 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0)) 1480 # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32) 1481 # define PERL_CTZ_32 __builtin_ctzl 1482 # endif 1483 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64) 1484 # define PERL_CTZ_64 __builtin_ctzl 1485 # endif 1486 #endif 1487 1488 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0)) 1489 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32) 1490 # define PERL_CLZ_32 __builtin_clzll 1491 # endif 1492 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64) 1493 # define PERL_CLZ_64 __builtin_clzll 1494 # endif 1495 #endif 1496 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0)) 1497 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32) 1498 # define PERL_CTZ_32 __builtin_ctzll 1499 # endif 1500 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64) 1501 # define PERL_CTZ_64 __builtin_ctzll 1502 # endif 1503 #endif 1504 1505 #if defined(WIN32) 1506 # include <intrin.h> 1507 /* MinGW warns that it ignores "pragma intrinsic". */ 1508 # if defined(_MSC_VER) 1509 # pragma intrinsic(_BitScanForward) 1510 # pragma intrinsic(_BitScanReverse) 1511 # if defined(_WIN64) 1512 # pragma intrinsic(_BitScanForward64) 1513 # pragma intrinsic(_BitScanReverse64) 1514 # endif 1515 # endif 1516 #endif 1517 1518 /* The reason there are not checks to see if ffs() and ffsl() are available for 1519 * determining the lsb, is because these don't improve on the deBruijn method 1520 * fallback, which is just a branchless integer multiply, array element 1521 * retrieval, and shift. The others, even if the function call overhead is 1522 * optimized out, have to cope with the possibility of the input being all 1523 * zeroes, and almost certainly will have conditionals for this eventuality. 1524 * khw, at the time of this commit, looked at the source for both gcc and clang 1525 * to verify this. (gcc used a method inferior to deBruijn.) */ 1526 1527 /* Below are functions to find the first, last, or only set bit in a word. On 1528 * platforms with 64-bit capability, there is a pair for each operation; the 1529 * first taking a 64 bit operand, and the second a 32 bit one. The logic is 1530 * the same in each pair, so the second is stripped of most comments. */ 1531 1532 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 1533 1534 PERL_STATIC_INLINE unsigned 1535 Perl_lsbit_pos64(U64 word) 1536 { 1537 /* Find the position (0..63) of the least significant set bit in the input 1538 * word */ 1539 1540 ASSUME(word != 0); 1541 1542 /* If we can determine that the platform has a usable fast method to get 1543 * this info, use that */ 1544 1545 # if defined(PERL_CTZ_64) 1546 # define PERL_HAS_FAST_GET_LSB_POS64 1547 1548 return (unsigned) PERL_CTZ_64(word); 1549 1550 # elif U64SIZE == 8 && defined(_WIN64) 1551 # define PERL_HAS_FAST_GET_LSB_POS64 1552 1553 { 1554 unsigned long index; 1555 _BitScanForward64(&index, word); 1556 return (unsigned)index; 1557 } 1558 1559 # else 1560 1561 /* Here, we didn't find a fast method for finding the lsb. Fall back to 1562 * making the lsb the only set bit in the word, and use our function that 1563 * works on words with a single bit set. 1564 * 1565 * Isolate the lsb; 1566 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set 1567 * 1568 * The word will look like this, with a rightmost set bit in position 's': 1569 * ('x's are don't cares, and 'y's are their complements) 1570 * s 1571 * x..x100..00 1572 * y..y011..11 Complement 1573 * y..y100..00 Add 1 1574 * 0..0100..00 And with the original 1575 * 1576 * (Yes, complementing and adding 1 is just taking the negative on 2's 1577 * complement machines, but not on 1's complement ones, and some compilers 1578 * complain about negating an unsigned.) 1579 */ 1580 return single_1bit_pos64(word & (~word + 1)); 1581 1582 # endif 1583 1584 } 1585 1586 # define lsbit_pos_uintmax_(word) lsbit_pos64(word) 1587 #else /* ! QUAD */ 1588 # define lsbit_pos_uintmax_(word) lsbit_pos32(word) 1589 #endif 1590 1591 PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */ 1592 Perl_lsbit_pos32(U32 word) 1593 { 1594 /* Find the position (0..31) of the least significant set bit in the input 1595 * word */ 1596 1597 ASSUME(word != 0); 1598 1599 #if defined(PERL_CTZ_32) 1600 # define PERL_HAS_FAST_GET_LSB_POS32 1601 1602 return (unsigned) PERL_CTZ_32(word); 1603 1604 #elif U32SIZE == 4 && defined(WIN32) 1605 # define PERL_HAS_FAST_GET_LSB_POS32 1606 1607 { 1608 unsigned long index; 1609 _BitScanForward(&index, word); 1610 return (unsigned)index; 1611 } 1612 1613 #elif defined(PERL_HAS_FAST_GET_LSB_POS64) 1614 # define PERL_HAS_FAST_GET_LSB_POS32 1615 1616 /* Unlikely, but possible for the platform to have a wider fast operation 1617 * but not a narrower one. But easy enough to handle the case by widening 1618 * the parameter size. */ 1619 return lsbit_pos64(word); 1620 1621 #else 1622 1623 return single_1bit_pos32(word & (~word + 1)); 1624 1625 #endif 1626 1627 } 1628 1629 1630 /* Convert the leading zeros count to the bit position of the first set bit. 1631 * This just subtracts from the highest position, 31 or 63. But some compilers 1632 * don't optimize this optimally, and so a bit of bit twiddling encourages them 1633 * to do the right thing. It turns out that subtracting a smaller non-negative 1634 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of 1635 * the two numbers. To see why, first note that the sum of any number, x, and 1636 * its complement, x', is all ones. So all ones minus x is x'. Then note that 1637 * the xor of x and all ones is x'. */ 1638 #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc)) 1639 1640 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 1641 1642 PERL_STATIC_INLINE unsigned 1643 Perl_msbit_pos64(U64 word) 1644 { 1645 /* Find the position (0..63) of the most significant set bit in the input 1646 * word */ 1647 1648 ASSUME(word != 0); 1649 1650 /* If we can determine that the platform has a usable fast method to get 1651 * this, use that */ 1652 1653 # if defined(PERL_CLZ_64) 1654 # define PERL_HAS_FAST_GET_MSB_POS64 1655 1656 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word)); 1657 1658 # elif U64SIZE == 8 && defined(_WIN64) 1659 # define PERL_HAS_FAST_GET_MSB_POS64 1660 1661 { 1662 unsigned long index; 1663 _BitScanReverse64(&index, word); 1664 return (unsigned)index; 1665 } 1666 1667 # else 1668 1669 /* Here, we didn't find a fast method for finding the msb. Fall back to 1670 * making the msb the only set bit in the word, and use our function that 1671 * works on words with a single bit set. 1672 * 1673 * Isolate the msb; http://codeforces.com/blog/entry/10330 1674 * 1675 * Only the most significant set bit matters. Or'ing word with its right 1676 * shift of 1 makes that bit and the next one to its right both 1. 1677 * Repeating that with the right shift of 2 makes for 4 1-bits in a row. 1678 * ... We end with the msb and all to the right being 1. */ 1679 word |= (word >> 1); 1680 word |= (word >> 2); 1681 word |= (word >> 4); 1682 word |= (word >> 8); 1683 word |= (word >> 16); 1684 word |= (word >> 32); 1685 1686 /* Then subtracting the right shift by 1 clears all but the left-most of 1687 * the 1 bits, which is our desired result */ 1688 word -= (word >> 1); 1689 1690 /* Now we have a single bit set */ 1691 return single_1bit_pos64(word); 1692 1693 # endif 1694 1695 } 1696 1697 # define msbit_pos_uintmax_(word) msbit_pos64(word) 1698 #else /* ! QUAD */ 1699 # define msbit_pos_uintmax_(word) msbit_pos32(word) 1700 #endif 1701 1702 PERL_STATIC_INLINE unsigned 1703 Perl_msbit_pos32(U32 word) 1704 { 1705 /* Find the position (0..31) of the most significant set bit in the input 1706 * word */ 1707 1708 ASSUME(word != 0); 1709 1710 #if defined(PERL_CLZ_32) 1711 # define PERL_HAS_FAST_GET_MSB_POS32 1712 1713 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word)); 1714 #elif U32SIZE == 4 && defined(WIN32) 1715 # define PERL_HAS_FAST_GET_MSB_POS32 1716 1717 { 1718 unsigned long index; 1719 _BitScanReverse(&index, word); 1720 return (unsigned)index; 1721 } 1722 1723 #elif defined(PERL_HAS_FAST_GET_MSB_POS64) 1724 # define PERL_HAS_FAST_GET_MSB_POS32 1725 1726 return msbit_pos64(word); /* Let compiler widen parameter */ 1727 1728 #else 1729 1730 word |= (word >> 1); 1731 word |= (word >> 2); 1732 word |= (word >> 4); 1733 word |= (word >> 8); 1734 word |= (word >> 16); 1735 word -= (word >> 1); 1736 return single_1bit_pos32(word); 1737 1738 #endif 1739 1740 } 1741 1742 /* Note that if you are working through all the 1 bits in a word, and don't 1743 * care which order you process them in, it is better to use lsbit_pos. This 1744 * is because some platforms have a fast way to find the msb but not the lsb, 1745 * and others vice versa. The code above falls back to use the single 1746 * available fast method when the desired one is missing, and it is cheaper to 1747 * fall back from lsb to msb than the other way around */ 1748 1749 #if UVSIZE == U64SIZE 1750 # define msbit_pos(word) msbit_pos64(word) 1751 # define lsbit_pos(word) lsbit_pos64(word) 1752 #elif UVSIZE == U32SIZE 1753 # define msbit_pos(word) msbit_pos32(word) 1754 # define lsbit_pos(word) lsbit_pos32(word) 1755 #endif 1756 1757 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */ 1758 1759 PERL_STATIC_INLINE unsigned 1760 Perl_single_1bit_pos64(U64 word) 1761 { 1762 /* Given a 64-bit word known to contain all zero bits except one 1 bit, 1763 * find and return the 1's position: 0..63 */ 1764 1765 # ifdef PERL_CORE /* macro not exported */ 1766 ASSUME(isPOWER_OF_2(word)); 1767 # else 1768 ASSUME(word && (word & (word-1)) == 0); 1769 # endif 1770 1771 /* The only set bit is both the most and least significant bit. If we have 1772 * a fast way of finding either one, use that. 1773 * 1774 * It may appear at first glance that those functions call this one, but 1775 * they don't if the corresponding #define is set */ 1776 1777 # ifdef PERL_HAS_FAST_GET_MSB_POS64 1778 1779 return msbit_pos64(word); 1780 1781 # elif defined(PERL_HAS_FAST_GET_LSB_POS64) 1782 1783 return lsbit_pos64(word); 1784 1785 # else 1786 1787 /* The position of the only set bit in a word can be quickly calculated 1788 * using deBruijn sequences. See for example 1789 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */ 1790 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_) 1791 >> PERL_deBruijnShift64_]; 1792 # endif 1793 1794 } 1795 1796 #endif 1797 1798 PERL_STATIC_INLINE unsigned 1799 Perl_single_1bit_pos32(U32 word) 1800 { 1801 /* Given a 32-bit word known to contain all zero bits except one 1 bit, 1802 * find and return the 1's position: 0..31 */ 1803 1804 #ifdef PERL_CORE /* macro not exported */ 1805 ASSUME(isPOWER_OF_2(word)); 1806 #else 1807 ASSUME(word && (word & (word-1)) == 0); 1808 #endif 1809 #ifdef PERL_HAS_FAST_GET_MSB_POS32 1810 1811 return msbit_pos32(word); 1812 1813 #elif defined(PERL_HAS_FAST_GET_LSB_POS32) 1814 1815 return lsbit_pos32(word); 1816 1817 #else 1818 1819 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_) 1820 >> PERL_deBruijnShift32_]; 1821 #endif 1822 1823 } 1824 1825 #ifndef EBCDIC 1826 1827 PERL_STATIC_INLINE unsigned int 1828 Perl_variant_byte_number(PERL_UINTMAX_T word) 1829 { 1830 /* This returns the position in a word (0..7) of the first variant byte in 1831 * it. This is a helper function. Note that there are no branches */ 1832 1833 /* Get just the msb bits of each byte */ 1834 word &= PERL_VARIANTS_WORD_MASK; 1835 1836 /* This should only be called if we know there is a variant byte in the 1837 * word */ 1838 assert(word); 1839 1840 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 1841 1842 /* Bytes are stored like 1843 * Byte8 ... Byte2 Byte1 1844 * 63..56...15...8 7...0 1845 * so getting the lsb of the whole modified word is getting the msb of the 1846 * first byte that has its msb set */ 1847 word = lsbit_pos_uintmax_(word); 1848 1849 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert 1850 * to 0..7 */ 1851 return (unsigned int) ((word + 1) >> 3) - 1; 1852 1853 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 1854 1855 /* Bytes are stored like 1856 * Byte1 Byte2 ... Byte8 1857 * 63..56 55..47 ... 7...0 1858 * so getting the msb of the whole modified word is getting the msb of the 1859 * first byte that has its msb set */ 1860 word = msbit_pos_uintmax_(word); 1861 1862 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert 1863 * to 0..7 */ 1864 word = ((word + 1) >> 3) - 1; 1865 1866 /* And invert the result because of the reversed byte order on this 1867 * platform */ 1868 word = CHARBITS - word - 1; 1869 1870 return (unsigned int) word; 1871 1872 # else 1873 # error Unexpected byte order 1874 # endif 1875 1876 } 1877 1878 #endif 1879 #if defined(PERL_CORE) || defined(PERL_EXT) 1880 1881 /* 1882 =for apidoc variant_under_utf8_count 1883 1884 This function looks at the sequence of bytes between C<s> and C<e>, which are 1885 assumed to be encoded in ASCII/Latin1, and returns how many of them would 1886 change should the string be translated into UTF-8. Due to the nature of UTF-8, 1887 each of these would occupy two bytes instead of the single one in the input 1888 string. Thus, this function returns the precise number of bytes the string 1889 would expand by when translated to UTF-8. 1890 1891 Unlike most of the other functions that have C<utf8> in their name, the input 1892 to this function is NOT a UTF-8-encoded string. The function name is slightly 1893 I<odd> to emphasize this. 1894 1895 This function is internal to Perl because khw thinks that any XS code that 1896 would want this is probably operating too close to the internals. Presenting a 1897 valid use case could change that. 1898 1899 See also 1900 C<L<perlapi/is_utf8_invariant_string>> 1901 and 1902 C<L<perlapi/is_utf8_invariant_string_loc>>, 1903 1904 =cut 1905 1906 */ 1907 1908 PERL_STATIC_INLINE Size_t 1909 S_variant_under_utf8_count(const U8* const s, const U8* const e) 1910 { 1911 const U8* x = s; 1912 Size_t count = 0; 1913 1914 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; 1915 1916 # ifndef EBCDIC 1917 1918 /* Test if the string is long enough to use word-at-a-time. (Logic is the 1919 * same as for is_utf8_invariant_string()) */ 1920 if ((STRLEN) (e - x) >= PERL_WORDSIZE 1921 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) 1922 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) 1923 { 1924 1925 /* Process per-byte until reach word boundary. XXX This loop could be 1926 * eliminated if we knew that this platform had fast unaligned reads */ 1927 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { 1928 count += ! UTF8_IS_INVARIANT(*x++); 1929 } 1930 1931 /* Process per-word as long as we have at least a full word left */ 1932 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an 1933 explanation of how this works */ 1934 PERL_UINTMAX_T increment 1935 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) 1936 * PERL_COUNT_MULTIPLIER) 1937 >> ((PERL_WORDSIZE - 1) * CHARBITS); 1938 count += (Size_t) increment; 1939 x += PERL_WORDSIZE; 1940 } while (x + PERL_WORDSIZE <= e); 1941 } 1942 1943 # endif 1944 1945 /* Process per-byte */ 1946 while (x < e) { 1947 if (! UTF8_IS_INVARIANT(*x)) { 1948 count++; 1949 } 1950 1951 x++; 1952 } 1953 1954 return count; 1955 } 1956 1957 #endif 1958 1959 /* Keep these around for these files */ 1960 #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C) 1961 # undef PERL_WORDSIZE 1962 # undef PERL_COUNT_MULTIPLIER 1963 # undef PERL_WORD_BOUNDARY_MASK 1964 # undef PERL_VARIANTS_WORD_MASK 1965 #endif 1966 1967 /* 1968 =for apidoc is_utf8_string 1969 1970 Returns TRUE if the first C<len> bytes of string C<s> form a valid 1971 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will 1972 be calculated using C<strlen(s)> (which means if you use this option, that C<s> 1973 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 1974 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 1975 1976 This function considers Perl's extended UTF-8 to be valid. That means that 1977 code points above Unicode, surrogates, and non-character code points are 1978 considered valid by this function. Use C<L</is_strict_utf8_string>>, 1979 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what 1980 code points are considered valid. 1981 1982 See also 1983 C<L</is_utf8_invariant_string>>, 1984 C<L</is_utf8_invariant_string_loc>>, 1985 C<L</is_utf8_string_loc>>, 1986 C<L</is_utf8_string_loclen>>, 1987 C<L</is_utf8_fixed_width_buf_flags>>, 1988 C<L</is_utf8_fixed_width_buf_loc_flags>>, 1989 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 1990 1991 =cut 1992 */ 1993 1994 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL) 1995 1996 #if defined(PERL_CORE) || defined (PERL_EXT) 1997 1998 /* 1999 =for apidoc is_utf8_non_invariant_string 2000 2001 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first 2002 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended 2003 UTF-8; otherwise returns FALSE. 2004 2005 A TRUE return means that at least one code point represented by the sequence 2006 either is a wide character not representable as a single byte, or the 2007 representation differs depending on whether the sequence is encoded in UTF-8 or 2008 not. 2009 2010 See also 2011 C<L<perlapi/is_utf8_invariant_string>>, 2012 C<L<perlapi/is_utf8_string>> 2013 2014 =cut 2015 2016 This is commonly used to determine if a SV's UTF-8 flag should be turned on. 2017 It generally needn't be if its string is entirely UTF-8 invariant, and it 2018 shouldn't be if it otherwise contains invalid UTF-8. 2019 2020 It is an internal function because khw thinks that XS code shouldn't be working 2021 at this low a level. A valid use case could change that. 2022 2023 */ 2024 2025 PERL_STATIC_INLINE bool 2026 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len) 2027 { 2028 const U8 * first_variant; 2029 2030 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING; 2031 2032 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 2033 return FALSE; 2034 } 2035 2036 return is_utf8_string(first_variant, len - (first_variant - s)); 2037 } 2038 2039 #endif 2040 2041 /* 2042 =for apidoc is_strict_utf8_string 2043 2044 Returns TRUE if the first C<len> bytes of string C<s> form a valid 2045 UTF-8-encoded string that is fully interchangeable by any application using 2046 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be 2047 calculated using C<strlen(s)> (which means if you use this option, that C<s> 2048 can't have embedded C<NUL> characters and has to have a terminating C<NUL> 2049 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. 2050 2051 This function returns FALSE for strings containing any 2052 code points above the Unicode max of 0x10FFFF, surrogate code points, or 2053 non-character code points. 2054 2055 See also 2056 C<L</is_utf8_invariant_string>>, 2057 C<L</is_utf8_invariant_string_loc>>, 2058 C<L</is_utf8_string>>, 2059 C<L</is_utf8_string_flags>>, 2060 C<L</is_utf8_string_loc>>, 2061 C<L</is_utf8_string_loc_flags>>, 2062 C<L</is_utf8_string_loclen>>, 2063 C<L</is_utf8_string_loclen_flags>>, 2064 C<L</is_utf8_fixed_width_buf_flags>>, 2065 C<L</is_utf8_fixed_width_buf_loc_flags>>, 2066 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 2067 C<L</is_strict_utf8_string_loc>>, 2068 C<L</is_strict_utf8_string_loclen>>, 2069 C<L</is_c9strict_utf8_string>>, 2070 C<L</is_c9strict_utf8_string_loc>>, 2071 and 2072 C<L</is_c9strict_utf8_string_loclen>>. 2073 2074 =cut 2075 */ 2076 2077 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL) 2078 2079 /* 2080 =for apidoc is_c9strict_utf8_string 2081 2082 Returns TRUE if the first C<len> bytes of string C<s> form a valid 2083 UTF-8-encoded string that conforms to 2084 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>; 2085 otherwise it returns FALSE. If C<len> is 0, it will be calculated using 2086 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded 2087 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all 2088 characters being ASCII constitute 'a valid UTF-8 string'. 2089 2090 This function returns FALSE for strings containing any code points above the 2091 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character 2092 code points per 2093 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 2094 2095 See also 2096 C<L</is_utf8_invariant_string>>, 2097 C<L</is_utf8_invariant_string_loc>>, 2098 C<L</is_utf8_string>>, 2099 C<L</is_utf8_string_flags>>, 2100 C<L</is_utf8_string_loc>>, 2101 C<L</is_utf8_string_loc_flags>>, 2102 C<L</is_utf8_string_loclen>>, 2103 C<L</is_utf8_string_loclen_flags>>, 2104 C<L</is_utf8_fixed_width_buf_flags>>, 2105 C<L</is_utf8_fixed_width_buf_loc_flags>>, 2106 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 2107 C<L</is_strict_utf8_string>>, 2108 C<L</is_strict_utf8_string_loc>>, 2109 C<L</is_strict_utf8_string_loclen>>, 2110 C<L</is_c9strict_utf8_string_loc>>, 2111 and 2112 C<L</is_c9strict_utf8_string_loclen>>. 2113 2114 =cut 2115 */ 2116 2117 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0) 2118 2119 /* 2120 =for apidoc is_utf8_string_flags 2121 2122 Returns TRUE if the first C<len> bytes of string C<s> form a valid 2123 UTF-8 string, subject to the restrictions imposed by C<flags>; 2124 returns FALSE otherwise. If C<len> is 0, it will be calculated 2125 using C<strlen(s)> (which means if you use this option, that C<s> can't have 2126 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note 2127 that all characters being ASCII constitute 'a valid UTF-8 string'. 2128 2129 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if 2130 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 2131 as C<L</is_strict_utf8_string>>; and if C<flags> is 2132 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as 2133 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any 2134 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by 2135 C<L</utf8n_to_uvchr>>, with the same meanings. 2136 2137 See also 2138 C<L</is_utf8_invariant_string>>, 2139 C<L</is_utf8_invariant_string_loc>>, 2140 C<L</is_utf8_string>>, 2141 C<L</is_utf8_string_loc>>, 2142 C<L</is_utf8_string_loc_flags>>, 2143 C<L</is_utf8_string_loclen>>, 2144 C<L</is_utf8_string_loclen_flags>>, 2145 C<L</is_utf8_fixed_width_buf_flags>>, 2146 C<L</is_utf8_fixed_width_buf_loc_flags>>, 2147 C<L</is_utf8_fixed_width_buf_loclen_flags>>, 2148 C<L</is_strict_utf8_string>>, 2149 C<L</is_strict_utf8_string_loc>>, 2150 C<L</is_strict_utf8_string_loclen>>, 2151 C<L</is_c9strict_utf8_string>>, 2152 C<L</is_c9strict_utf8_string_loc>>, 2153 and 2154 C<L</is_c9strict_utf8_string_loclen>>. 2155 2156 =cut 2157 */ 2158 2159 PERL_STATIC_INLINE bool 2160 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) 2161 { 2162 const U8 * first_variant; 2163 2164 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; 2165 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2166 |UTF8_DISALLOW_PERL_EXTENDED))); 2167 2168 if (len == 0) { 2169 len = strlen((const char *)s); 2170 } 2171 2172 if (flags == 0) { 2173 return is_utf8_string(s, len); 2174 } 2175 2176 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 2177 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 2178 { 2179 return is_strict_utf8_string(s, len); 2180 } 2181 2182 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 2183 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 2184 { 2185 return is_c9strict_utf8_string(s, len); 2186 } 2187 2188 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { 2189 const U8* const send = s + len; 2190 const U8* x = first_variant; 2191 2192 while (x < send) { 2193 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 2194 if (UNLIKELY(! cur_len)) { 2195 return FALSE; 2196 } 2197 x += cur_len; 2198 } 2199 } 2200 2201 return TRUE; 2202 } 2203 2204 /* 2205 2206 =for apidoc is_utf8_string_loc 2207 2208 Like C<L</is_utf8_string>> but stores the location of the failure (in the 2209 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2210 "utf8ness success") in the C<ep> pointer. 2211 2212 See also C<L</is_utf8_string_loclen>>. 2213 2214 =cut 2215 */ 2216 2217 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) 2218 2219 /* 2220 2221 =for apidoc is_utf8_string_loclen 2222 2223 Like C<L</is_utf8_string>> but stores the location of the failure (in the 2224 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2225 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 2226 encoded characters in the C<el> pointer. 2227 2228 See also C<L</is_utf8_string_loc>>. 2229 2230 =cut 2231 */ 2232 2233 PERL_STATIC_INLINE bool 2234 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 2235 { 2236 const U8 * first_variant; 2237 2238 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; 2239 2240 if (len == 0) { 2241 len = strlen((const char *) s); 2242 } 2243 2244 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 2245 if (el) 2246 *el = len; 2247 2248 if (ep) { 2249 *ep = s + len; 2250 } 2251 2252 return TRUE; 2253 } 2254 2255 { 2256 const U8* const send = s + len; 2257 const U8* x = first_variant; 2258 STRLEN outlen = first_variant - s; 2259 2260 while (x < send) { 2261 const STRLEN cur_len = isUTF8_CHAR(x, send); 2262 if (UNLIKELY(! cur_len)) { 2263 break; 2264 } 2265 x += cur_len; 2266 outlen++; 2267 } 2268 2269 if (el) 2270 *el = outlen; 2271 2272 if (ep) { 2273 *ep = x; 2274 } 2275 2276 return (x == send); 2277 } 2278 } 2279 2280 /* The perl core arranges to never call the DFA below without there being at 2281 * least one byte available to look at. This allows the DFA to use a do {} 2282 * while loop which means that calling it with a UTF-8 invariant has a single 2283 * conditional, same as the calling code checking for invariance ahead of time. 2284 * And having the calling code remove that conditional speeds up by that 2285 * conditional, the case where it wasn't invariant. So there's no reason to 2286 * check before caling this. 2287 * 2288 * But we don't know this for non-core calls, so have to retain the check for 2289 * them. */ 2290 #ifdef PERL_CORE 2291 # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s)) 2292 #else 2293 # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE 2294 #endif 2295 2296 /* 2297 * DFA for checking input is valid UTF-8 syntax. 2298 * 2299 * This uses adaptations of the table and algorithm given in 2300 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 2301 * documentation of the original version. A copyright notice for the original 2302 * version is given at the beginning of this file. The Perl adaptations are 2303 * documented at the definition of PL_extended_utf8_dfa_tab[]. 2304 * 2305 * This dfa is fast. There are three exit conditions: 2306 * 1) a well-formed code point, acceptable to the table 2307 * 2) the beginning bytes of an incomplete character, whose completion might 2308 * or might not be acceptable 2309 * 3) unacceptable to the table. Some of the adaptations have certain, 2310 * hopefully less likely to occur, legal inputs be unacceptable to the 2311 * table, so these must be sorted out afterwards. 2312 * 2313 * This macro is a complete implementation of the code executing the DFA. It 2314 * is passed the input sequence bounds and the table to use, and what to do 2315 * for each of the exit conditions. There are three canned actions, likely to 2316 * be the ones you want: 2317 * DFA_RETURN_SUCCESS_ 2318 * DFA_RETURN_FAILURE_ 2319 * DFA_GOTO_TEASE_APART_FF_ 2320 * 2321 * You pass a parameter giving the action to take for each of the three 2322 * possible exit conditions: 2323 * 2324 * 'accept_action' This is executed when the DFA accepts the input. 2325 * DFA_RETURN_SUCCESS_ is the most likely candidate. 2326 * 'reject_action' This is executed when the DFA rejects the input. 2327 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where 2328 * you have written code to distinguish the rejecting state 2329 * results. Because it happens in several places, and 2330 * involves #ifdefs, the special action 2331 * DFA_GOTO_TEASE_APART_FF_ is what you want with 2332 * PL_extended_utf8_dfa_tab. On platforms without 2333 * EXTRA_LONG_UTF8, there is no need to tease anything apart, 2334 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you 2335 * need to have a label 'tease_apart_FF' that it will transfer 2336 * to. 2337 * 'incomplete_char_action' This is executed when the DFA ran off the end 2338 * before accepting or rejecting the input. 2339 * DFA_RETURN_FAILURE_ is the likely action, but you could 2340 * have a 'goto', or NOOP. In the latter case the DFA drops 2341 * off the end, and you place your code to handle this case 2342 * immediately after it. 2343 */ 2344 2345 #define DFA_RETURN_SUCCESS_ return s - s0 2346 #define DFA_RETURN_FAILURE_ return 0 2347 #ifdef HAS_EXTRA_LONG_UTF8 2348 # define DFA_TEASE_APART_FF_ goto tease_apart_FF 2349 #else 2350 # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_ 2351 #endif 2352 2353 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \ 2354 accept_action, \ 2355 reject_action, \ 2356 incomplete_char_action) \ 2357 STMT_START { \ 2358 const U8 * s = s0; \ 2359 const U8 * e_ = e; \ 2360 UV state = 0; \ 2361 \ 2362 PERL_NON_CORE_CHECK_EMPTY(s, e_); \ 2363 \ 2364 do { \ 2365 state = dfa_tab[256 + state + dfa_tab[*s]]; \ 2366 s++; \ 2367 \ 2368 if (state == 0) { /* Accepting state */ \ 2369 accept_action; \ 2370 } \ 2371 \ 2372 if (UNLIKELY(state == 1)) { /* Rejecting state */ \ 2373 reject_action; \ 2374 } \ 2375 } while (s < e_); \ 2376 \ 2377 /* Here, dropped out of loop before end-of-char */ \ 2378 incomplete_char_action; \ 2379 } STMT_END 2380 2381 2382 /* 2383 2384 =for apidoc isUTF8_CHAR 2385 2386 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 2387 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 2388 that represents some code point; otherwise it evaluates to 0. If non-zero, the 2389 value gives how many bytes starting at C<s> comprise the code point's 2390 representation. Any bytes remaining before C<e>, but beyond the ones needed to 2391 form the first code point in C<s>, are not examined. 2392 2393 The code point can be any that will fit in an IV on this machine, using Perl's 2394 extension to official UTF-8 to represent those higher than the Unicode maximum 2395 of 0x10FFFF. That means that this macro is used to efficiently decide if the 2396 next few bytes in C<s> is legal UTF-8 for a single character. 2397 2398 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those 2399 defined by Unicode to be fully interchangeable across applications; 2400 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 2401 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 2402 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition. 2403 2404 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and 2405 C<L</is_utf8_string_loclen>> to check entire strings. 2406 2407 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC 2408 machines) is a valid UTF-8 character. 2409 2410 =cut 2411 2412 This uses an adaptation of the table and algorithm given in 2413 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 2414 documentation of the original version. A copyright notice for the original 2415 version is given at the beginning of this file. The Perl adaptation is 2416 documented at the definition of PL_extended_utf8_dfa_tab[]. 2417 */ 2418 2419 PERL_STATIC_INLINE Size_t 2420 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) 2421 { 2422 PERL_ARGS_ASSERT_ISUTF8_CHAR; 2423 2424 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 2425 DFA_RETURN_SUCCESS_, 2426 DFA_TEASE_APART_FF_, 2427 DFA_RETURN_FAILURE_); 2428 2429 /* Here, we didn't return success, but dropped out of the loop. In the 2430 * case of PL_extended_utf8_dfa_tab, this means the input is either 2431 * malformed, or the start byte was FF on a platform that the dfa doesn't 2432 * handle FF's. Call a helper function. */ 2433 2434 #ifdef HAS_EXTRA_LONG_UTF8 2435 2436 tease_apart_FF: 2437 2438 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 2439 * either malformed, or was for the largest possible start byte, which we 2440 * now check, not inline */ 2441 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) { 2442 return 0; 2443 } 2444 2445 return is_utf8_FF_helper_(s0, e, 2446 FALSE /* require full, not partial char */ 2447 ); 2448 #endif 2449 2450 } 2451 2452 /* 2453 2454 =for apidoc isSTRICT_UTF8_CHAR 2455 2456 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 2457 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 2458 Unicode code point completely acceptable for open interchange between all 2459 applications; otherwise it evaluates to 0. If non-zero, the value gives how 2460 many bytes starting at C<s> comprise the code point's representation. Any 2461 bytes remaining before C<e>, but beyond the ones needed to form the first code 2462 point in C<s>, are not examined. 2463 2464 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not 2465 be a surrogate nor a non-character code point. Thus this excludes any code 2466 point from Perl's extended UTF-8. 2467 2468 This is used to efficiently decide if the next few bytes in C<s> is 2469 legal Unicode-acceptable UTF-8 for a single character. 2470 2471 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum 2472 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable 2473 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; 2474 and C<L</isUTF8_CHAR_flags>> for a more customized definition. 2475 2476 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and 2477 C<L</is_strict_utf8_string_loclen>> to check entire strings. 2478 2479 =cut 2480 2481 This uses an adaptation of the tables and algorithm given in 2482 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 2483 documentation of the original version. A copyright notice for the original 2484 version is given at the beginning of this file. The Perl adaptation is 2485 documented at the definition of strict_extended_utf8_dfa_tab[]. 2486 2487 */ 2488 2489 PERL_STATIC_INLINE Size_t 2490 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 2491 { 2492 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; 2493 2494 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, 2495 DFA_RETURN_SUCCESS_, 2496 goto check_hanguls, 2497 DFA_RETURN_FAILURE_); 2498 check_hanguls: 2499 2500 /* Here, we didn't return success, but dropped out of the loop. In the 2501 * case of PL_strict_utf8_dfa_tab, this means the input is either 2502 * malformed, or was for certain Hanguls; handle them specially */ 2503 2504 /* The dfa above drops out for incomplete or illegal inputs, and certain 2505 * legal Hanguls; check and return accordingly */ 2506 return is_HANGUL_ED_utf8_safe(s0, e); 2507 } 2508 2509 /* 2510 2511 =for apidoc isC9_STRICT_UTF8_CHAR 2512 2513 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 2514 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some 2515 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, 2516 the value gives how many bytes starting at C<s> comprise the code point's 2517 representation. Any bytes remaining before C<e>, but beyond the ones needed to 2518 form the first code point in C<s>, are not examined. 2519 2520 The largest acceptable code point is the Unicode maximum 0x10FFFF. This 2521 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character 2522 code points. This corresponds to 2523 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. 2524 which said that non-character code points are merely discouraged rather than 2525 completely forbidden in open interchange. See 2526 L<perlunicode/Noncharacter code points>. 2527 2528 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and 2529 C<L</isUTF8_CHAR_flags>> for a more customized definition. 2530 2531 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and 2532 C<L</is_c9strict_utf8_string_loclen>> to check entire strings. 2533 2534 =cut 2535 2536 This uses an adaptation of the tables and algorithm given in 2537 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive 2538 documentation of the original version. A copyright notice for the original 2539 version is given at the beginning of this file. The Perl adaptation is 2540 documented at the definition of PL_c9_utf8_dfa_tab[]. 2541 2542 */ 2543 2544 PERL_STATIC_INLINE Size_t 2545 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) 2546 { 2547 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; 2548 2549 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab, 2550 DFA_RETURN_SUCCESS_, 2551 DFA_RETURN_FAILURE_, 2552 DFA_RETURN_FAILURE_); 2553 } 2554 2555 /* 2556 2557 =for apidoc is_strict_utf8_string_loc 2558 2559 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 2560 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2561 "utf8ness success") in the C<ep> pointer. 2562 2563 See also C<L</is_strict_utf8_string_loclen>>. 2564 2565 =cut 2566 */ 2567 2568 #define is_strict_utf8_string_loc(s, len, ep) \ 2569 is_strict_utf8_string_loclen(s, len, ep, 0) 2570 2571 /* 2572 2573 =for apidoc is_strict_utf8_string_loclen 2574 2575 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the 2576 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2577 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 2578 encoded characters in the C<el> pointer. 2579 2580 See also C<L</is_strict_utf8_string_loc>>. 2581 2582 =cut 2583 */ 2584 2585 PERL_STATIC_INLINE bool 2586 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 2587 { 2588 const U8 * first_variant; 2589 2590 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; 2591 2592 if (len == 0) { 2593 len = strlen((const char *) s); 2594 } 2595 2596 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 2597 if (el) 2598 *el = len; 2599 2600 if (ep) { 2601 *ep = s + len; 2602 } 2603 2604 return TRUE; 2605 } 2606 2607 { 2608 const U8* const send = s + len; 2609 const U8* x = first_variant; 2610 STRLEN outlen = first_variant - s; 2611 2612 while (x < send) { 2613 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); 2614 if (UNLIKELY(! cur_len)) { 2615 break; 2616 } 2617 x += cur_len; 2618 outlen++; 2619 } 2620 2621 if (el) 2622 *el = outlen; 2623 2624 if (ep) { 2625 *ep = x; 2626 } 2627 2628 return (x == send); 2629 } 2630 } 2631 2632 /* 2633 2634 =for apidoc is_c9strict_utf8_string_loc 2635 2636 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 2637 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2638 "utf8ness success") in the C<ep> pointer. 2639 2640 See also C<L</is_c9strict_utf8_string_loclen>>. 2641 2642 =cut 2643 */ 2644 2645 #define is_c9strict_utf8_string_loc(s, len, ep) \ 2646 is_c9strict_utf8_string_loclen(s, len, ep, 0) 2647 2648 /* 2649 2650 =for apidoc is_c9strict_utf8_string_loclen 2651 2652 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in 2653 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2654 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded 2655 characters in the C<el> pointer. 2656 2657 See also C<L</is_c9strict_utf8_string_loc>>. 2658 2659 =cut 2660 */ 2661 2662 PERL_STATIC_INLINE bool 2663 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 2664 { 2665 const U8 * first_variant; 2666 2667 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; 2668 2669 if (len == 0) { 2670 len = strlen((const char *) s); 2671 } 2672 2673 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 2674 if (el) 2675 *el = len; 2676 2677 if (ep) { 2678 *ep = s + len; 2679 } 2680 2681 return TRUE; 2682 } 2683 2684 { 2685 const U8* const send = s + len; 2686 const U8* x = first_variant; 2687 STRLEN outlen = first_variant - s; 2688 2689 while (x < send) { 2690 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); 2691 if (UNLIKELY(! cur_len)) { 2692 break; 2693 } 2694 x += cur_len; 2695 outlen++; 2696 } 2697 2698 if (el) 2699 *el = outlen; 2700 2701 if (ep) { 2702 *ep = x; 2703 } 2704 2705 return (x == send); 2706 } 2707 } 2708 2709 /* 2710 2711 =for apidoc is_utf8_string_loc_flags 2712 2713 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 2714 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2715 "utf8ness success") in the C<ep> pointer. 2716 2717 See also C<L</is_utf8_string_loclen_flags>>. 2718 2719 =cut 2720 */ 2721 2722 #define is_utf8_string_loc_flags(s, len, ep, flags) \ 2723 is_utf8_string_loclen_flags(s, len, ep, 0, flags) 2724 2725 2726 /* The above 3 actual functions could have been moved into the more general one 2727 * just below, and made #defines that call it with the right 'flags'. They are 2728 * currently kept separate to increase their chances of getting inlined */ 2729 2730 /* 2731 2732 =for apidoc is_utf8_string_loclen_flags 2733 2734 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the 2735 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 2736 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 2737 encoded characters in the C<el> pointer. 2738 2739 See also C<L</is_utf8_string_loc_flags>>. 2740 2741 =cut 2742 */ 2743 2744 PERL_STATIC_INLINE bool 2745 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) 2746 { 2747 const U8 * first_variant; 2748 2749 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; 2750 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 2751 |UTF8_DISALLOW_PERL_EXTENDED))); 2752 2753 if (flags == 0) { 2754 return is_utf8_string_loclen(s, len, ep, el); 2755 } 2756 2757 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 2758 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) 2759 { 2760 return is_strict_utf8_string_loclen(s, len, ep, el); 2761 } 2762 2763 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) 2764 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) 2765 { 2766 return is_c9strict_utf8_string_loclen(s, len, ep, el); 2767 } 2768 2769 if (len == 0) { 2770 len = strlen((const char *) s); 2771 } 2772 2773 if (is_utf8_invariant_string_loc(s, len, &first_variant)) { 2774 if (el) 2775 *el = len; 2776 2777 if (ep) { 2778 *ep = s + len; 2779 } 2780 2781 return TRUE; 2782 } 2783 2784 { 2785 const U8* send = s + len; 2786 const U8* x = first_variant; 2787 STRLEN outlen = first_variant - s; 2788 2789 while (x < send) { 2790 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); 2791 if (UNLIKELY(! cur_len)) { 2792 break; 2793 } 2794 x += cur_len; 2795 outlen++; 2796 } 2797 2798 if (el) 2799 *el = outlen; 2800 2801 if (ep) { 2802 *ep = x; 2803 } 2804 2805 return (x == send); 2806 } 2807 } 2808 2809 /* 2810 =for apidoc utf8_distance 2811 2812 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 2813 and C<b>. 2814 2815 WARNING: use only if you *know* that the pointers point inside the 2816 same UTF-8 buffer. 2817 2818 =cut 2819 */ 2820 2821 PERL_STATIC_INLINE IV 2822 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 2823 { 2824 PERL_ARGS_ASSERT_UTF8_DISTANCE; 2825 2826 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 2827 } 2828 2829 /* 2830 =for apidoc utf8_hop 2831 2832 Return the UTF-8 pointer C<s> displaced by C<off> characters, either 2833 forward (if C<off> is positive) or backward (if negative). C<s> does not need 2834 to be pointing to the starting byte of a character. If it isn't, one count of 2835 C<off> will be used up to get to the start of the next character for forward 2836 hops, and to the start of the current character for negative ones. 2837 2838 WARNING: Prefer L</utf8_hop_safe> to this one. 2839 2840 Do NOT use this function unless you B<know> C<off> is within 2841 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned 2842 on the first byte of a character or just after the last byte of a character. 2843 2844 =cut 2845 */ 2846 2847 PERL_STATIC_INLINE U8 * 2848 Perl_utf8_hop(const U8 *s, SSize_t off) 2849 { 2850 PERL_ARGS_ASSERT_UTF8_HOP; 2851 2852 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2853 * the XXX bitops (especially ~) can create illegal UTF-8. 2854 * In other words: in Perl UTF-8 is not just for Unicode. */ 2855 2856 if (off > 0) { 2857 2858 /* Get to next non-continuation byte */ 2859 if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) { 2860 do { 2861 s++; 2862 } 2863 while (UTF8_IS_CONTINUATION(*s)); 2864 off--; 2865 } 2866 2867 while (off--) 2868 s += UTF8SKIP(s); 2869 } 2870 else { 2871 while (off++) { 2872 s--; 2873 while (UTF8_IS_CONTINUATION(*s)) 2874 s--; 2875 } 2876 } 2877 2878 GCC_DIAG_IGNORE(-Wcast-qual) 2879 return (U8 *)s; 2880 GCC_DIAG_RESTORE 2881 } 2882 2883 /* 2884 =for apidoc utf8_hop_forward 2885 2886 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2887 forward. C<s> does not need to be pointing to the starting byte of a 2888 character. If it isn't, one count of C<off> will be used up to get to the 2889 start of the next character. 2890 2891 C<off> must be non-negative. 2892 2893 C<s> must be before or equal to C<end>. 2894 2895 When moving forward it will not move beyond C<end>. 2896 2897 Will not exceed this limit even if the string is not valid "UTF-8". 2898 2899 =cut 2900 */ 2901 2902 PERL_STATIC_INLINE U8 * 2903 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) 2904 { 2905 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD; 2906 2907 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2908 * the bitops (especially ~) can create illegal UTF-8. 2909 * In other words: in Perl UTF-8 is not just for Unicode. */ 2910 2911 assert(s <= end); 2912 assert(off >= 0); 2913 2914 if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) { 2915 /* Get to next non-continuation byte */ 2916 do { 2917 s++; 2918 } 2919 while (UTF8_IS_CONTINUATION(*s)); 2920 off--; 2921 } 2922 2923 while (off--) { 2924 STRLEN skip = UTF8SKIP(s); 2925 if ((STRLEN)(end - s) <= skip) { 2926 GCC_DIAG_IGNORE(-Wcast-qual) 2927 return (U8 *)end; 2928 GCC_DIAG_RESTORE 2929 } 2930 s += skip; 2931 } 2932 2933 GCC_DIAG_IGNORE(-Wcast-qual) 2934 return (U8 *)s; 2935 GCC_DIAG_RESTORE 2936 } 2937 2938 /* 2939 =for apidoc utf8_hop_back 2940 2941 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2942 backward. C<s> does not need to be pointing to the starting byte of a 2943 character. If it isn't, one count of C<off> will be used up to get to that 2944 start. 2945 2946 C<off> must be non-positive. 2947 2948 C<s> must be after or equal to C<start>. 2949 2950 When moving backward it will not move before C<start>. 2951 2952 Will not exceed this limit even if the string is not valid "UTF-8". 2953 2954 =cut 2955 */ 2956 2957 PERL_STATIC_INLINE U8 * 2958 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) 2959 { 2960 PERL_ARGS_ASSERT_UTF8_HOP_BACK; 2961 2962 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 2963 * the bitops (especially ~) can create illegal UTF-8. 2964 * In other words: in Perl UTF-8 is not just for Unicode. */ 2965 2966 assert(start <= s); 2967 assert(off <= 0); 2968 2969 /* Note: if we know that the input is well-formed, we can do per-word 2970 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented 2971 * that. But it was reverted because doing per-word has some 2972 * start-up/tear-down overhead, so only makes sense if the distance to be 2973 * moved is large, and core perl doesn't currently move more than a few 2974 * characters at a time. You can reinstate it if it does become 2975 * advantageous. */ 2976 while (off++ && s > start) { 2977 do { 2978 s--; 2979 } while (s > start && UTF8_IS_CONTINUATION(*s)); 2980 } 2981 2982 GCC_DIAG_IGNORE(-Wcast-qual) 2983 return (U8 *)s; 2984 GCC_DIAG_RESTORE 2985 } 2986 2987 /* 2988 =for apidoc utf8_hop_safe 2989 2990 Return the UTF-8 pointer C<s> displaced by up to C<off> characters, 2991 either forward or backward. C<s> does not need to be pointing to the starting 2992 byte of a character. If it isn't, one count of C<off> will be used up to get 2993 to the start of the next character for forward hops, and to the start of the 2994 current character for negative ones. 2995 2996 When moving backward it will not move before C<start>. 2997 2998 When moving forward it will not move beyond C<end>. 2999 3000 Will not exceed those limits even if the string is not valid "UTF-8". 3001 3002 =cut 3003 */ 3004 3005 PERL_STATIC_INLINE U8 * 3006 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) 3007 { 3008 PERL_ARGS_ASSERT_UTF8_HOP_SAFE; 3009 3010 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 3011 * the bitops (especially ~) can create illegal UTF-8. 3012 * In other words: in Perl UTF-8 is not just for Unicode. */ 3013 3014 assert(start <= s && s <= end); 3015 3016 if (off >= 0) { 3017 return utf8_hop_forward(s, off, end); 3018 } 3019 else { 3020 return utf8_hop_back(s, off, start); 3021 } 3022 } 3023 3024 /* 3025 3026 =for apidoc isUTF8_CHAR_flags 3027 3028 Evaluates to non-zero if the first few bytes of the string starting at C<s> and 3029 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, 3030 that represents some code point, subject to the restrictions given by C<flags>; 3031 otherwise it evaluates to 0. If non-zero, the value gives how many bytes 3032 starting at C<s> comprise the code point's representation. Any bytes remaining 3033 before C<e>, but beyond the ones needed to form the first code point in C<s>, 3034 are not examined. 3035 3036 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>; 3037 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results 3038 as C<L</isSTRICT_UTF8_CHAR>>; 3039 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives 3040 the same results as C<L</isC9_STRICT_UTF8_CHAR>>. 3041 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags 3042 understood by C<L</utf8n_to_uvchr>>, with the same meanings. 3043 3044 The three alternative macros are for the most commonly needed validations; they 3045 are likely to run somewhat faster than this more general one, as they can be 3046 inlined into your code. 3047 3048 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and 3049 L</is_utf8_string_loclen_flags> to check entire strings. 3050 3051 =cut 3052 */ 3053 3054 PERL_STATIC_INLINE STRLEN 3055 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags) 3056 { 3057 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS; 3058 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 3059 |UTF8_DISALLOW_PERL_EXTENDED))); 3060 3061 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 3062 goto check_success, 3063 DFA_TEASE_APART_FF_, 3064 DFA_RETURN_FAILURE_); 3065 3066 check_success: 3067 3068 return is_utf8_char_helper_(s0, e, flags); 3069 3070 #ifdef HAS_EXTRA_LONG_UTF8 3071 3072 tease_apart_FF: 3073 3074 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is 3075 * either malformed, or was for the largest possible start byte, which 3076 * indicates perl extended UTF-8, well above the Unicode maximum */ 3077 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 3078 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 3079 { 3080 return 0; 3081 } 3082 3083 /* Otherwise examine the sequence not inline */ 3084 return is_utf8_FF_helper_(s0, e, 3085 FALSE /* require full, not partial char */ 3086 ); 3087 #endif 3088 3089 } 3090 3091 /* 3092 3093 =for apidoc is_utf8_valid_partial_char 3094 3095 Returns 0 if the sequence of bytes starting at C<s> and looking no further than 3096 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code 3097 points. Otherwise, it returns 1 if there exists at least one non-empty 3098 sequence of bytes that when appended to sequence C<s>, starting at position 3099 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point; 3100 otherwise returns 0. 3101 3102 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code 3103 point. 3104 3105 This is useful when a fixed-length buffer is being tested for being well-formed 3106 UTF-8, but the final few bytes in it don't comprise a full character; that is, 3107 it is split somewhere in the middle of the final code point's UTF-8 3108 representation. (Presumably when the buffer is refreshed with the next chunk 3109 of data, the new first bytes will complete the partial code point.) This 3110 function is used to verify that the final bytes in the current buffer are in 3111 fact the legal beginning of some code point, so that if they aren't, the 3112 failure can be signalled without having to wait for the next read. 3113 3114 =cut 3115 */ 3116 #define is_utf8_valid_partial_char(s, e) \ 3117 is_utf8_valid_partial_char_flags(s, e, 0) 3118 3119 /* 3120 3121 =for apidoc is_utf8_valid_partial_char_flags 3122 3123 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether 3124 or not the input is a valid UTF-8 encoded partial character, but it takes an 3125 extra parameter, C<flags>, which can further restrict which code points are 3126 considered valid. 3127 3128 If C<flags> is 0, this behaves identically to 3129 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination 3130 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If 3131 there is any sequence of bytes that can complete the input partial character in 3132 such a way that a non-prohibited character is formed, the function returns 3133 TRUE; otherwise FALSE. Non character code points cannot be determined based on 3134 partial character input. But many of the other possible excluded types can be 3135 determined from just the first one or two bytes. 3136 3137 =cut 3138 */ 3139 3140 PERL_STATIC_INLINE bool 3141 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags) 3142 { 3143 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; 3144 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 3145 |UTF8_DISALLOW_PERL_EXTENDED))); 3146 3147 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, 3148 DFA_RETURN_FAILURE_, 3149 DFA_TEASE_APART_FF_, 3150 NOOP); 3151 3152 /* The NOOP above causes the DFA to drop down here iff the input was a 3153 * partial character. flags=0 => can return TRUE immediately; otherwise we 3154 * need to check (not inline) if the partial character is the beginning of 3155 * a disallowed one */ 3156 if (flags == 0) { 3157 return TRUE; 3158 } 3159 3160 return cBOOL(is_utf8_char_helper_(s0, e, flags)); 3161 3162 #ifdef HAS_EXTRA_LONG_UTF8 3163 3164 tease_apart_FF: 3165 3166 /* Getting here means the input is either malformed, or, in the case of 3167 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The 3168 * latter case has to be extended UTF-8, so can fail immediately if that is 3169 * forbidden */ 3170 3171 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) 3172 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) 3173 { 3174 return 0; 3175 } 3176 3177 return is_utf8_FF_helper_(s0, e, 3178 TRUE /* Require to be a partial character */ 3179 ); 3180 #endif 3181 3182 } 3183 3184 /* 3185 3186 =for apidoc is_utf8_fixed_width_buf_flags 3187 3188 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len> 3189 is entirely valid UTF-8, subject to the restrictions given by C<flags>; 3190 otherwise it returns FALSE. 3191 3192 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted 3193 without restriction. If the final few bytes of the buffer do not form a 3194 complete code point, this will return TRUE anyway, provided that 3195 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them. 3196 3197 If C<flags> in non-zero, it can be any combination of the 3198 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the 3199 same meanings. 3200 3201 This function differs from C<L</is_utf8_string_flags>> only in that the latter 3202 returns FALSE if the final few bytes of the string don't form a complete code 3203 point. 3204 3205 =cut 3206 */ 3207 #define is_utf8_fixed_width_buf_flags(s, len, flags) \ 3208 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags) 3209 3210 /* 3211 3212 =for apidoc is_utf8_fixed_width_buf_loc_flags 3213 3214 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the 3215 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point 3216 to the beginning of any partial character at the end of the buffer; if there is 3217 no partial character C<*ep> will contain C<s>+C<len>. 3218 3219 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>. 3220 3221 =cut 3222 */ 3223 3224 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \ 3225 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags) 3226 3227 /* 3228 3229 =for apidoc is_utf8_fixed_width_buf_loclen_flags 3230 3231 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of 3232 complete, valid characters found in the C<el> pointer. 3233 3234 =cut 3235 */ 3236 3237 PERL_STATIC_INLINE bool 3238 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, 3239 STRLEN len, 3240 const U8 **ep, 3241 STRLEN *el, 3242 const U32 flags) 3243 { 3244 const U8 * maybe_partial; 3245 3246 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS; 3247 3248 if (! ep) { 3249 ep = &maybe_partial; 3250 } 3251 3252 /* If it's entirely valid, return that; otherwise see if the only error is 3253 * that the final few bytes are for a partial character */ 3254 return is_utf8_string_loclen_flags(s, len, ep, el, flags) 3255 || is_utf8_valid_partial_char_flags(*ep, s + len, flags); 3256 } 3257 3258 PERL_STATIC_INLINE UV 3259 Perl_utf8n_to_uvchr_msgs(const U8 *s, 3260 STRLEN curlen, 3261 STRLEN *retlen, 3262 const U32 flags, 3263 U32 * errors, 3264 AV ** msgs) 3265 { 3266 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the 3267 * simple cases, and, if necessary calls a helper function to deal with the 3268 * more complex ones. Almost all well-formed non-problematic code points 3269 * are considered simple, so that it's unlikely that the helper function 3270 * will need to be called. 3271 * 3272 * This is an adaptation of the tables and algorithm given in 3273 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides 3274 * comprehensive documentation of the original version. A copyright notice 3275 * for the original version is given at the beginning of this file. The 3276 * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[]. 3277 */ 3278 3279 const U8 * const s0 = s; 3280 const U8 * send = s0 + curlen; 3281 UV type; 3282 UV uv; 3283 3284 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; 3285 3286 /* This dfa is fast. If it accepts the input, it was for a well-formed, 3287 * non-problematic code point, which can be returned immediately. 3288 * Otherwise we call a helper function to figure out the more complicated 3289 * cases. */ 3290 3291 /* No calls from core pass in an empty string; non-core need a check */ 3292 #ifdef PERL_CORE 3293 assert(curlen > 0); 3294 #else 3295 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen, 3296 flags, errors, msgs); 3297 #endif 3298 3299 type = PL_strict_utf8_dfa_tab[*s]; 3300 3301 /* The table is structured so that 'type' is 0 iff the input byte is 3302 * represented identically regardless of the UTF-8ness of the string */ 3303 if (type == 0) { /* UTF-8 invariants are returned unchanged */ 3304 uv = *s; 3305 } 3306 else { 3307 UV state = PL_strict_utf8_dfa_tab[256 + type]; 3308 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s); 3309 3310 while (LIKELY(state != 1) && ++s < send) { 3311 type = PL_strict_utf8_dfa_tab[*s]; 3312 state = PL_strict_utf8_dfa_tab[256 + state + type]; 3313 3314 uv = UTF8_ACCUMULATE(uv, *s); 3315 3316 if (state == 0) { 3317 #ifdef EBCDIC 3318 uv = UNI_TO_NATIVE(uv); 3319 #endif 3320 goto success; 3321 } 3322 } 3323 3324 /* Here is potentially problematic. Use the full mechanism */ 3325 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, 3326 errors, msgs); 3327 } 3328 3329 success: 3330 if (retlen) { 3331 *retlen = s - s0 + 1; 3332 } 3333 if (errors) { 3334 *errors = 0; 3335 } 3336 if (msgs) { 3337 *msgs = NULL; 3338 } 3339 3340 return uv; 3341 } 3342 3343 PERL_STATIC_INLINE UV 3344 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 3345 { 3346 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; 3347 3348 assert(s < send); 3349 3350 if (! ckWARN_d(WARN_UTF8)) { 3351 3352 /* EMPTY is not really allowed, and asserts on debugging builds. But 3353 * on non-debugging we have to deal with it, and this causes it to 3354 * return the REPLACEMENT CHARACTER, as the documentation indicates */ 3355 return utf8n_to_uvchr(s, send - s, retlen, 3356 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); 3357 } 3358 else { 3359 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); 3360 if (retlen && ret == 0 && (send <= s || *s != '\0')) { 3361 *retlen = (STRLEN) -1; 3362 } 3363 3364 return ret; 3365 } 3366 } 3367 3368 /* ------------------------------- perl.h ----------------------------- */ 3369 3370 /* 3371 =for apidoc_section $utility 3372 3373 =for apidoc is_safe_syscall 3374 3375 Test that the given C<pv> (with length C<len>) doesn't contain any internal 3376 C<NUL> characters. 3377 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls> 3378 category, and return FALSE. 3379 3380 Return TRUE if the name is safe. 3381 3382 C<what> and C<op_name> are used in any warning. 3383 3384 Used by the C<IS_SAFE_SYSCALL()> macro. 3385 3386 =cut 3387 */ 3388 3389 PERL_STATIC_INLINE bool 3390 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) 3391 { 3392 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs 3393 * perl itself uses xce*() functions which accept 8-bit strings. 3394 */ 3395 3396 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL; 3397 3398 if (len > 1) { 3399 char *null_at; 3400 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) { 3401 SETERRNO(ENOENT, LIB_INVARG); 3402 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS), 3403 "Invalid \\0 character in %s for %s: %s\\0%s", 3404 what, op_name, pv, null_at+1); 3405 return FALSE; 3406 } 3407 } 3408 3409 return TRUE; 3410 } 3411 3412 /* 3413 3414 Return true if the supplied filename has a newline character 3415 immediately before the first (hopefully only) NUL. 3416 3417 My original look at this incorrectly used the len from SvPV(), but 3418 that's incorrect, since we allow for a NUL in pv[len-1]. 3419 3420 So instead, strlen() and work from there. 3421 3422 This allow for the user reading a filename, forgetting to chomp it, 3423 then calling: 3424 3425 open my $foo, "$file\0"; 3426 3427 */ 3428 3429 #ifdef PERL_CORE 3430 3431 PERL_STATIC_INLINE bool 3432 S_should_warn_nl(const char *pv) 3433 { 3434 STRLEN len; 3435 3436 PERL_ARGS_ASSERT_SHOULD_WARN_NL; 3437 3438 len = strlen(pv); 3439 3440 return len > 0 && pv[len-1] == '\n'; 3441 } 3442 3443 #endif 3444 3445 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) 3446 3447 PERL_STATIC_INLINE bool 3448 S_lossless_NV_to_IV(const NV nv, IV *ivp) 3449 { 3450 /* This function determines if the input NV 'nv' may be converted without 3451 * loss of data to an IV. If not, it returns FALSE taking no other action. 3452 * But if it is possible, it does the conversion, returning TRUE, and 3453 * storing the converted result in '*ivp' */ 3454 3455 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; 3456 3457 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) 3458 /* Normally any comparison with a NaN returns false; if we can't rely 3459 * on that behaviour, check explicitly */ 3460 if (UNLIKELY(Perl_isnan(nv))) { 3461 return FALSE; 3462 } 3463 # endif 3464 3465 # ifndef NV_PRESERVES_UV 3466 STATIC_ASSERT_STMT(((UV)1 << NV_PRESERVES_UV_BITS) - 1 <= (UV)IV_MAX); 3467 # endif 3468 3469 /* Written this way so that with an always-false NaN comparison we 3470 * return false */ 3471 if ( 3472 # ifdef NV_PRESERVES_UV 3473 LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1) && 3474 # else 3475 /* If the condition below is not satisfied, lower bits of nv's 3476 * integral part is already lost and accurate conversion to integer 3477 * is impossible. 3478 * Note this should be consistent with S_sv_2iuv_common in sv.c. */ 3479 Perl_fabs(nv) < (NV) ((UV)1 << NV_PRESERVES_UV_BITS) && 3480 # endif 3481 (IV) nv == nv) { 3482 *ivp = (IV) nv; 3483 return TRUE; 3484 } 3485 return FALSE; 3486 } 3487 3488 #endif 3489 3490 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ 3491 3492 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) 3493 3494 #define MAX_CHARSET_NAME_LENGTH 2 3495 3496 PERL_STATIC_INLINE const char * 3497 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp) 3498 { 3499 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME; 3500 3501 /* Returns a string that corresponds to the name of the regex character set 3502 * given by 'flags', and *lenp is set the length of that string, which 3503 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ 3504 3505 *lenp = 1; 3506 switch (get_regex_charset(flags)) { 3507 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS; 3508 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; 3509 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; 3510 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; 3511 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 3512 *lenp = 2; 3513 return ASCII_MORE_RESTRICT_PAT_MODS; 3514 } 3515 /* The NOT_REACHED; hides an assert() which has a rather complex 3516 * definition in perl.h. */ 3517 NOT_REACHED; /* NOTREACHED */ 3518 return "?"; /* Unknown */ 3519 } 3520 3521 #endif 3522 3523 /* 3524 3525 Return false if any get magic is on the SV other than taint magic. 3526 3527 */ 3528 3529 PERL_STATIC_INLINE bool 3530 Perl_sv_only_taint_gmagic(SV *sv) 3531 { 3532 MAGIC *mg = SvMAGIC(sv); 3533 3534 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC; 3535 3536 while (mg) { 3537 if (mg->mg_type != PERL_MAGIC_taint 3538 && !(mg->mg_flags & MGf_GSKIP) 3539 && mg->mg_virtual->svt_get) { 3540 return FALSE; 3541 } 3542 mg = mg->mg_moremagic; 3543 } 3544 3545 return TRUE; 3546 } 3547 3548 /* ------------------ cop.h ------------------------------------------- */ 3549 3550 /* implement GIMME_V() macro */ 3551 3552 PERL_STATIC_INLINE U8 3553 Perl_gimme_V(pTHX) 3554 { 3555 I32 cxix; 3556 U8 gimme = (PL_op->op_flags & OPf_WANT); 3557 3558 if (gimme) 3559 return gimme; 3560 cxix = PL_curstackinfo->si_cxsubix; 3561 if (cxix < 0) 3562 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID; 3563 assert(cxstack[cxix].blk_gimme & G_WANT); 3564 return (cxstack[cxix].blk_gimme & G_WANT); 3565 } 3566 3567 3568 /* Enter a block. Push a new base context and return its address. */ 3569 3570 PERL_STATIC_INLINE PERL_CONTEXT * 3571 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) 3572 { 3573 PERL_CONTEXT * cx; 3574 3575 PERL_ARGS_ASSERT_CX_PUSHBLOCK; 3576 3577 CXINC; 3578 cx = CX_CUR(); 3579 cx->cx_type = type; 3580 cx->blk_gimme = gimme; 3581 cx->blk_oldsaveix = saveix; 3582 cx->blk_oldsp = (Stack_off_t)(sp - PL_stack_base); 3583 assert(cxstack_ix <= 0 3584 || CxTYPE(cx-1) == CXt_SUBST 3585 || cx->blk_oldsp >= (cx-1)->blk_oldsp); 3586 cx->blk_oldcop = PL_curcop; 3587 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack); 3588 cx->blk_oldscopesp = PL_scopestack_ix; 3589 cx->blk_oldpm = PL_curpm; 3590 cx->blk_old_tmpsfloor = PL_tmps_floor; 3591 3592 PL_tmps_floor = PL_tmps_ix; 3593 CX_DEBUG(cx, "PUSH"); 3594 return cx; 3595 } 3596 3597 3598 /* Exit a block (RETURN and LAST). */ 3599 3600 PERL_STATIC_INLINE void 3601 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx) 3602 { 3603 PERL_ARGS_ASSERT_CX_POPBLOCK; 3604 3605 CX_DEBUG(cx, "POP"); 3606 /* these 3 are common to cx_popblock and cx_topblock */ 3607 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 3608 PL_scopestack_ix = cx->blk_oldscopesp; 3609 PL_curpm = cx->blk_oldpm; 3610 3611 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats 3612 * and leaves a CX entry lying around for repeated use, so 3613 * skip for multicall */ \ 3614 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx)) 3615 || PL_savestack_ix == cx->blk_oldsaveix); 3616 PL_curcop = cx->blk_oldcop; 3617 PL_tmps_floor = cx->blk_old_tmpsfloor; 3618 } 3619 3620 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO). 3621 * Whereas cx_popblock() restores the state to the point just before 3622 * cx_pushblock() was called, cx_topblock() restores it to the point just 3623 * *after* cx_pushblock() was called. */ 3624 3625 PERL_STATIC_INLINE void 3626 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx) 3627 { 3628 PERL_ARGS_ASSERT_CX_TOPBLOCK; 3629 3630 CX_DEBUG(cx, "TOP"); 3631 /* these 3 are common to cx_popblock and cx_topblock */ 3632 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; 3633 PL_scopestack_ix = cx->blk_oldscopesp; 3634 PL_curpm = cx->blk_oldpm; 3635 Perl_rpp_popfree_to(aTHX_ PL_stack_base + cx->blk_oldsp); 3636 } 3637 3638 3639 PERL_STATIC_INLINE void 3640 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) 3641 { 3642 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub); 3643 3644 PERL_ARGS_ASSERT_CX_PUSHSUB; 3645 3646 PERL_DTRACE_PROBE_ENTRY(cv); 3647 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix; 3648 PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack); 3649 cx->blk_sub.cv = cv; 3650 cx->blk_sub.olddepth = CvDEPTH(cv); 3651 cx->blk_sub.prevcomppad = PL_comppad; 3652 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0; 3653 cx->blk_sub.retop = retop; 3654 SvREFCNT_inc_simple_void_NN(cv); 3655 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF); 3656 } 3657 3658 3659 /* subsets of cx_popsub() */ 3660 3661 PERL_STATIC_INLINE void 3662 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) 3663 { 3664 CV *cv; 3665 3666 PERL_ARGS_ASSERT_CX_POPSUB_COMMON; 3667 assert(CxTYPE(cx) == CXt_SUB); 3668 3669 PL_comppad = cx->blk_sub.prevcomppad; 3670 PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL; 3671 cv = cx->blk_sub.cv; 3672 CvDEPTH(cv) = cx->blk_sub.olddepth; 3673 cx->blk_sub.cv = NULL; 3674 SvREFCNT_dec(cv); 3675 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix; 3676 } 3677 3678 3679 /* handle the @_ part of leaving a sub */ 3680 3681 PERL_STATIC_INLINE void 3682 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) 3683 { 3684 AV *av; 3685 3686 PERL_ARGS_ASSERT_CX_POPSUB_ARGS; 3687 assert(CxTYPE(cx) == CXt_SUB); 3688 assert(AvARRAY(MUTABLE_AV( 3689 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[ 3690 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad); 3691 3692 CX_POP_SAVEARRAY(cx); 3693 av = MUTABLE_AV(PAD_SVl(0)); 3694 if (!SvMAGICAL(av) && SvREFCNT(av) == 1 3695 #ifndef PERL_RC_STACK 3696 && !AvREAL(av) 3697 #endif 3698 ) 3699 clear_defarray_simple(av); 3700 else 3701 /* abandon @_ if it got reified */ 3702 clear_defarray(av, 0); 3703 } 3704 3705 3706 PERL_STATIC_INLINE void 3707 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx) 3708 { 3709 PERL_ARGS_ASSERT_CX_POPSUB; 3710 assert(CxTYPE(cx) == CXt_SUB); 3711 3712 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv); 3713 3714 if (CxHASARGS(cx)) 3715 cx_popsub_args(cx); 3716 cx_popsub_common(cx); 3717 } 3718 3719 3720 PERL_STATIC_INLINE void 3721 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) 3722 { 3723 PERL_ARGS_ASSERT_CX_PUSHFORMAT; 3724 3725 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix; 3726 PL_curstackinfo->si_cxsubix= (I32)(cx - PL_curstackinfo->si_cxstack); 3727 cx->blk_format.cv = cv; 3728 cx->blk_format.retop = retop; 3729 cx->blk_format.gv = gv; 3730 cx->blk_format.dfoutgv = PL_defoutgv; 3731 cx->blk_format.prevcomppad = PL_comppad; 3732 cx->blk_u16 = 0; 3733 3734 SvREFCNT_inc_simple_void_NN(cv); 3735 CvDEPTH(cv)++; 3736 SvREFCNT_inc_void(cx->blk_format.dfoutgv); 3737 } 3738 3739 3740 PERL_STATIC_INLINE void 3741 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx) 3742 { 3743 CV *cv; 3744 GV *dfout; 3745 3746 PERL_ARGS_ASSERT_CX_POPFORMAT; 3747 assert(CxTYPE(cx) == CXt_FORMAT); 3748 3749 dfout = cx->blk_format.dfoutgv; 3750 setdefout(dfout); 3751 cx->blk_format.dfoutgv = NULL; 3752 SvREFCNT_dec_NN(dfout); 3753 3754 PL_comppad = cx->blk_format.prevcomppad; 3755 PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL; 3756 cv = cx->blk_format.cv; 3757 cx->blk_format.cv = NULL; 3758 --CvDEPTH(cv); 3759 SvREFCNT_dec_NN(cv); 3760 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix; 3761 } 3762 3763 3764 PERL_STATIC_INLINE void 3765 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 3766 { 3767 cx->blk_eval.retop = retop; 3768 cx->blk_eval.old_namesv = namesv; 3769 cx->blk_eval.old_eval_root = PL_eval_root; 3770 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL; 3771 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */ 3772 cx->blk_eval.cur_top_env = PL_top_env; 3773 3774 assert(!(PL_in_eval & ~ 0x3F)); 3775 assert(!(PL_op->op_type & ~0x1FF)); 3776 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7); 3777 } 3778 3779 PERL_STATIC_INLINE void 3780 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) 3781 { 3782 PERL_ARGS_ASSERT_CX_PUSHEVAL; 3783 3784 Perl_push_evalortry_common(aTHX_ cx, retop, namesv); 3785 3786 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 3787 PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack); 3788 } 3789 3790 PERL_STATIC_INLINE void 3791 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop) 3792 { 3793 PERL_ARGS_ASSERT_CX_PUSHTRY; 3794 3795 Perl_push_evalortry_common(aTHX_ cx, retop, NULL); 3796 3797 /* Don't actually change it, just store the current value so it's restored 3798 * by the common popeval */ 3799 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; 3800 } 3801 3802 3803 PERL_STATIC_INLINE void 3804 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx) 3805 { 3806 SV *sv; 3807 3808 PERL_ARGS_ASSERT_CX_POPEVAL; 3809 assert(CxTYPE(cx) == CXt_EVAL); 3810 3811 PL_in_eval = CxOLD_IN_EVAL(cx); 3812 assert(!(PL_in_eval & 0xc0)); 3813 PL_eval_root = cx->blk_eval.old_eval_root; 3814 sv = cx->blk_eval.cur_text; 3815 if (sv && CxEVAL_TXT_REFCNTED(cx)) { 3816 cx->blk_eval.cur_text = NULL; 3817 SvREFCNT_dec_NN(sv); 3818 } 3819 3820 sv = cx->blk_eval.old_namesv; 3821 if (sv) { 3822 cx->blk_eval.old_namesv = NULL; 3823 SvREFCNT_dec_NN(sv); 3824 } 3825 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix; 3826 } 3827 3828 3829 /* push a plain loop, i.e. 3830 * { block } 3831 * while (cond) { block } 3832 * for (init;cond;continue) { block } 3833 * This loop can be last/redo'ed etc. 3834 */ 3835 3836 PERL_STATIC_INLINE void 3837 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) 3838 { 3839 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN; 3840 cx->blk_loop.my_op = cLOOP; 3841 } 3842 3843 3844 /* push a true for loop, i.e. 3845 * for var (list) { block } 3846 */ 3847 3848 PERL_STATIC_INLINE void 3849 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) 3850 { 3851 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR; 3852 3853 /* this one line is common with cx_pushloop_plain */ 3854 cx->blk_loop.my_op = cLOOP; 3855 3856 cx->blk_loop.itervar_u.svp = (SV**)itervarp; 3857 cx->blk_loop.itersave = itersave; 3858 #ifdef USE_ITHREADS 3859 cx->blk_loop.oldcomppad = PL_comppad; 3860 #endif 3861 } 3862 3863 3864 /* pop all loop types, including plain */ 3865 3866 PERL_STATIC_INLINE void 3867 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx) 3868 { 3869 PERL_ARGS_ASSERT_CX_POPLOOP; 3870 3871 assert(CxTYPE_is_LOOP(cx)); 3872 if ( CxTYPE(cx) == CXt_LOOP_ARY 3873 || CxTYPE(cx) == CXt_LOOP_LAZYSV) 3874 { 3875 /* Free ary or cur. This assumes that state_u.ary.ary 3876 * aligns with state_u.lazysv.cur. See cx_dup() */ 3877 SV *sv = cx->blk_loop.state_u.lazysv.cur; 3878 cx->blk_loop.state_u.lazysv.cur = NULL; 3879 SvREFCNT_dec_NN(sv); 3880 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) { 3881 sv = cx->blk_loop.state_u.lazysv.end; 3882 cx->blk_loop.state_u.lazysv.end = NULL; 3883 SvREFCNT_dec_NN(sv); 3884 } 3885 } 3886 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) { 3887 SV *cursv; 3888 SV **svp = (cx)->blk_loop.itervar_u.svp; 3889 if ((cx->cx_type & CXp_FOR_GV)) 3890 svp = &GvSV((GV*)svp); 3891 cursv = *svp; 3892 *svp = cx->blk_loop.itersave; 3893 cx->blk_loop.itersave = NULL; 3894 SvREFCNT_dec(cursv); 3895 } 3896 if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF)) 3897 SvREFCNT_dec(cx->blk_loop.itervar_u.svp); 3898 } 3899 3900 3901 PERL_STATIC_INLINE void 3902 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) 3903 { 3904 PERL_ARGS_ASSERT_CX_PUSHWHEN; 3905 3906 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3907 } 3908 3909 3910 PERL_STATIC_INLINE void 3911 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx) 3912 { 3913 PERL_ARGS_ASSERT_CX_POPWHEN; 3914 assert(CxTYPE(cx) == CXt_WHEN); 3915 3916 PERL_UNUSED_ARG(cx); 3917 PERL_UNUSED_CONTEXT; 3918 /* currently NOOP */ 3919 } 3920 3921 3922 PERL_STATIC_INLINE void 3923 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) 3924 { 3925 PERL_ARGS_ASSERT_CX_PUSHGIVEN; 3926 3927 cx->blk_givwhen.leave_op = cLOGOP->op_other; 3928 cx->blk_givwhen.defsv_save = orig_defsv; 3929 } 3930 3931 3932 PERL_STATIC_INLINE void 3933 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx) 3934 { 3935 SV *sv; 3936 3937 PERL_ARGS_ASSERT_CX_POPGIVEN; 3938 assert(CxTYPE(cx) == CXt_GIVEN); 3939 3940 sv = GvSV(PL_defgv); 3941 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save; 3942 cx->blk_givwhen.defsv_save = NULL; 3943 SvREFCNT_dec(sv); 3944 } 3945 3946 3947 /* Make @_ empty in-place in simple cases: a cheap av_clear(). 3948 * See Perl_clear_defarray() for non-simple cases */ 3949 3950 3951 PERL_STATIC_INLINE void 3952 Perl_clear_defarray_simple(pTHX_ AV *av) 3953 { 3954 PERL_ARGS_ASSERT_CLEAR_DEFARRAY_SIMPLE; 3955 3956 assert(SvTYPE(av) == SVt_PVAV); 3957 assert(!SvREADONLY(av)); 3958 assert(!SvMAGICAL(av)); 3959 assert(SvREFCNT(av) == 1); 3960 3961 #ifdef PERL_RC_STACK 3962 assert(AvREAL(av)); 3963 /* this code assumes that destructors called here can't free av 3964 * itself, because pad[0] and/or CX pointers will keep it alive */ 3965 SSize_t i = AvFILLp(av); 3966 while (i >= 0) { 3967 SV *sv = AvARRAY(av)[i]; 3968 AvARRAY(av)[i--] = NULL; 3969 SvREFCNT_dec(sv); 3970 } 3971 #else 3972 assert(!AvREAL(av)); 3973 #endif 3974 AvFILLp(av) = -1; 3975 Perl_av_remove_offset(aTHX_ av); 3976 } 3977 3978 /* Switch to a different argument stack. 3979 * 3980 * Note that it doesn't update PL_curstackinfo->si_stack_nonrc_base, 3981 * so this should only be used as part of a general switching between 3982 * stackinfos. 3983 */ 3984 3985 PERL_STATIC_INLINE void 3986 Perl_switch_argstack(pTHX_ AV *to) 3987 { 3988 PERL_ARGS_ASSERT_SWITCH_ARGSTACK; 3989 3990 AvFILLp(PL_curstack) = PL_stack_sp - PL_stack_base; 3991 PL_stack_base = AvARRAY(to); 3992 PL_stack_max = PL_stack_base + AvMAX(to); 3993 PL_stack_sp = PL_stack_base + AvFILLp(to); 3994 PL_curstack = to; 3995 } 3996 3997 3998 /* Push, and switch to a new stackinfo, allocating one if none are spare, 3999 * to get a fresh set of stacks. 4000 * Update all the interpreter variables like PL_curstackinfo, 4001 * PL_stack_sp, etc. 4002 * current flag meanings: 4003 * 1 make the new arg stack AvREAL 4004 */ 4005 4006 4007 PERL_STATIC_INLINE void 4008 Perl_push_stackinfo(pTHX_ I32 type, UV flags) 4009 { 4010 PERL_ARGS_ASSERT_PUSH_STACKINFO; 4011 4012 PERL_SI *next = PL_curstackinfo->si_next; 4013 DEBUG_l({ 4014 int i = 0; PERL_SI *p = PL_curstackinfo; 4015 while (p) { i++; p = p->si_prev; } 4016 Perl_deb(aTHX_ "push STACKINFO %d in %s at %s:%d\n", 4017 i, SAFE_FUNCTION__, __FILE__, __LINE__); 4018 }) 4019 4020 if (!next) { 4021 next = new_stackinfo_flags(32, 2048/sizeof(PERL_CONTEXT) - 1, flags); 4022 next->si_prev = PL_curstackinfo; 4023 PL_curstackinfo->si_next = next; 4024 } 4025 next->si_type = type; 4026 next->si_cxix = -1; 4027 next->si_cxsubix = -1; 4028 PUSHSTACK_INIT_HWM(next); 4029 #ifdef PERL_RC_STACK 4030 next->si_stack_nonrc_base = 0; 4031 #endif 4032 if (flags & 1) 4033 AvREAL_on(next->si_stack); 4034 else 4035 AvREAL_off(next->si_stack); 4036 AvFILLp(next->si_stack) = 0; 4037 switch_argstack(next->si_stack); 4038 PL_curstackinfo = next; 4039 SET_MARK_OFFSET; 4040 } 4041 4042 4043 /* Pop, then switch to the previous stackinfo and set of stacks. 4044 * Update all the interpreter variables like PL_curstackinfo, 4045 * PL_stack_sp, etc. */ 4046 4047 PERL_STATIC_INLINE void 4048 Perl_pop_stackinfo(pTHX) 4049 { 4050 PERL_ARGS_ASSERT_POP_STACKINFO; 4051 4052 PERL_SI * const prev = PL_curstackinfo->si_prev; 4053 DEBUG_l({ 4054 int i = -1; PERL_SI *p = PL_curstackinfo; 4055 while (p) { i++; p = p->si_prev; } 4056 Perl_deb(aTHX_ "pop STACKINFO %d in %s at %s:%d\n", 4057 i, SAFE_FUNCTION__, __FILE__, __LINE__);}) 4058 if (!prev) { 4059 Perl_croak_popstack(); 4060 } 4061 4062 switch_argstack(prev->si_stack); 4063 /* don't free prev here, free them all at the END{} */ 4064 PL_curstackinfo = prev; 4065 } 4066 4067 4068 4069 /* 4070 =for apidoc newPADxVOP 4071 4072 Constructs, checks and returns an op containing a pad offset. C<type> is 4073 the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV> 4074 or C<OP_PADCV>. The returned op will have the C<op_targ> field set by 4075 the C<padix> argument. 4076 4077 This is convenient when constructing a large optree in nested function 4078 calls, as it avoids needing to store the pad op directly to set the 4079 C<op_targ> field as a side-effect. For example 4080 4081 o = op_append_elem(OP_LINESEQ, o, 4082 newPADxVOP(OP_PADSV, 0, padix)); 4083 4084 =cut 4085 */ 4086 4087 PERL_STATIC_INLINE OP * 4088 Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix) 4089 { 4090 PERL_ARGS_ASSERT_NEWPADXVOP; 4091 4092 assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV 4093 || type == OP_PADCV); 4094 OP *o = newOP(type, flags); 4095 o->op_targ = padix; 4096 return o; 4097 } 4098 4099 /* ------------------ util.h ------------------------------------------- */ 4100 4101 /* 4102 =for apidoc_section $string 4103 4104 =for apidoc foldEQ 4105 4106 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 4107 same 4108 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes 4109 match themselves and their opposite case counterparts. Non-cased and non-ASCII 4110 range bytes match only themselves. 4111 4112 =cut 4113 */ 4114 4115 PERL_STATIC_INLINE I32 4116 Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len) 4117 { 4118 PERL_UNUSED_CONTEXT; 4119 4120 const U8 *a = (const U8 *)s1; 4121 const U8 *b = (const U8 *)s2; 4122 4123 PERL_ARGS_ASSERT_FOLDEQ; 4124 4125 assert(len >= 0); 4126 4127 while (len--) { 4128 if (*a != *b && *a != PL_fold[*b]) 4129 return 0; 4130 a++,b++; 4131 } 4132 return 1; 4133 } 4134 4135 PERL_STATIC_INLINE I32 4136 Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len) 4137 { 4138 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds 4139 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and 4140 * does not check for this. Nor does it check that the strings each have 4141 * at least 'len' characters. */ 4142 4143 PERL_UNUSED_CONTEXT; 4144 4145 const U8 *a = (const U8 *)s1; 4146 const U8 *b = (const U8 *)s2; 4147 4148 PERL_ARGS_ASSERT_FOLDEQ_LATIN1; 4149 4150 assert(len >= 0); 4151 4152 while (len--) { 4153 if (*a != *b && *a != PL_fold_latin1[*b]) { 4154 return 0; 4155 } 4156 a++, b++; 4157 } 4158 return 1; 4159 } 4160 4161 /* 4162 =for apidoc_section $locale 4163 =for apidoc foldEQ_locale 4164 4165 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the 4166 same case-insensitively in the current locale; false otherwise. 4167 4168 =cut 4169 */ 4170 4171 PERL_STATIC_INLINE I32 4172 Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len) 4173 { 4174 const U8 *a = (const U8 *)s1; 4175 const U8 *b = (const U8 *)s2; 4176 4177 PERL_ARGS_ASSERT_FOLDEQ_LOCALE; 4178 4179 assert(len >= 0); 4180 4181 while (len--) { 4182 if (*a != *b && *a != PL_fold_locale[*b]) { 4183 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4184 "%s:%d: Our records indicate %02x is not a fold of %02x" 4185 " or its mate %02x\n", 4186 __FILE__, __LINE__, *a, *b, PL_fold_locale[*b])); 4187 4188 return 0; 4189 } 4190 a++,b++; 4191 } 4192 return 1; 4193 } 4194 4195 /* 4196 =for apidoc_section $string 4197 =for apidoc my_strnlen 4198 4199 The C library C<strnlen> if available, or a Perl implementation of it. 4200 4201 C<my_strnlen()> computes the length of the string, up to C<maxlen> 4202 bytes. It will never attempt to address more than C<maxlen> 4203 bytes, making it suitable for use with strings that are not 4204 guaranteed to be NUL-terminated. 4205 4206 =cut 4207 4208 Description stolen from http://man.openbsd.org/strnlen.3, 4209 implementation stolen from PostgreSQL. 4210 */ 4211 #ifndef HAS_STRNLEN 4212 4213 PERL_STATIC_INLINE Size_t 4214 Perl_my_strnlen(const char *str, Size_t maxlen) 4215 { 4216 const char *end = (char *) memchr(str, '\0', maxlen); 4217 4218 PERL_ARGS_ASSERT_MY_STRNLEN; 4219 4220 if (end == NULL) return maxlen; 4221 return end - str; 4222 } 4223 4224 #endif 4225 4226 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) 4227 4228 PERL_STATIC_INLINE void * 4229 S_my_memrchr(const char * s, const char c, const STRLEN len) 4230 { 4231 /* memrchr(), since many platforms lack it */ 4232 4233 const char * t = s + len - 1; 4234 4235 PERL_ARGS_ASSERT_MY_MEMRCHR; 4236 4237 while (t >= s) { 4238 if (*t == c) { 4239 return (void *) t; 4240 } 4241 t--; 4242 } 4243 4244 return NULL; 4245 } 4246 4247 #endif 4248 4249 PERL_STATIC_INLINE char * 4250 Perl_mortal_getenv(const char * str) 4251 { 4252 /* This implements a (mostly) thread-safe, sequential-call-safe getenv(). 4253 * 4254 * It's (mostly) thread-safe because it uses a mutex to prevent other 4255 * threads (that look at this mutex) from destroying the result before this 4256 * routine has a chance to copy the result to a place that won't be 4257 * destroyed before the caller gets a chance to handle it. That place is a 4258 * mortal SV. khw chose this over SAVEFREEPV because he is under the 4259 * impression that the SV will hang around longer under more circumstances 4260 * 4261 * The reason it isn't completely thread-safe is that other code could 4262 * simply not pay attention to the mutex. All of the Perl core uses the 4263 * mutex, but it is possible for code from, say XS, to not use this mutex, 4264 * defeating the safety. 4265 * 4266 * getenv() returns, in some implementations, a pointer to a spot in the 4267 * **environ array, which could be invalidated at any time by this or 4268 * another thread changing the environment. Other implementations copy the 4269 * **environ value to a static buffer, returning a pointer to that. That 4270 * buffer might or might not be invalidated by a getenv() call in another 4271 * thread. If it does get zapped, we need an exclusive lock. Otherwise, 4272 * many getenv() calls can safely be running simultaneously, so a 4273 * many-reader (but no simultaneous writers) lock is ok. There is a 4274 * Configure probe to see if another thread destroys the buffer, and the 4275 * mutex is defined accordingly. 4276 * 4277 * But in all cases, using the mutex prevents these problems, as long as 4278 * all code uses the same mutex. 4279 * 4280 * A complication is that this can be called during phases where the 4281 * mortalization process isn't available. These are in interpreter 4282 * destruction or early in construction. khw believes that at these times 4283 * there shouldn't be anything else going on, so plain getenv is safe AS 4284 * LONG AS the caller acts on the return before calling it again. */ 4285 4286 char * ret; 4287 dTHX; 4288 4289 PERL_ARGS_ASSERT_MORTAL_GETENV; 4290 4291 /* Can't mortalize without stacks. khw believes that no other threads 4292 * should be running, so no need to lock things, and this may be during a 4293 * phase when locking isn't even available */ 4294 if (UNLIKELY(PL_scopestack_ix == 0)) { 4295 return getenv(str); 4296 } 4297 4298 #ifdef PERL_MEM_LOG 4299 4300 /* A major complication arises under PERL_MEM_LOG. When that is active, 4301 * every memory allocation may result in logging, depending on the value of 4302 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for 4303 * saving ENV{foo}'s value (but before saving it), the logging code will 4304 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some 4305 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to 4306 * lock a boolean mutex recursively); 3) destroying the getenv() static 4307 * buffer; or 4) destroying the temporary created by this for the copy 4308 * causes a log entry to be made which could cause a new temporary to be 4309 * created, which will need to be destroyed at some point, leading to an 4310 * infinite loop. 4311 * 4312 * The solution adopted here (after some gnashing of teeth) is to detect 4313 * the recursive calls and calls from the logger, and treat them specially. 4314 * Let's say we want to do getenv("foo"). We first find 4315 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter 4316 * variable, so no temporary is required. Then we do getenv(foo), and in 4317 * the process of creating a temporary to save it, this function will be 4318 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call, 4319 * we detect that it is such a call and return our saved value instead of 4320 * locking and doing a new getenv(). This solves all of problems 1), 2), 4321 * and 3). Because all the getenv()s are done while the mutex is locked, 4322 * the state cannot have changed. To solve 4), we don't create a temporary 4323 * when this is called from the logging code. That code disposes of the 4324 * return value while the mutex is still locked. 4325 * 4326 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial 4327 * digits and 3 particular letters are significant; the rest are ignored by 4328 * the memory logging code. Thus the per-interpreter variable only needs 4329 * to be large enough to save the significant information, the size of 4330 * which is known at compile time. The first byte is extra, reserved for 4331 * flags for our use. To protect against overflowing, only the reserved 4332 * byte, as many digits as don't overflow, and the three letters are 4333 * stored. 4334 * 4335 * The reserved byte has two bits: 4336 * 0x1 if set indicates that if we get here, it is a recursive call of 4337 * getenv() 4338 * 0x2 if set indicates that the call is from the logging code. 4339 * 4340 * If the flag indicates this is a recursive call, just return the stored 4341 * value of PL_mem_log; An empty value gets turned into NULL. */ 4342 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) { 4343 if (PL_mem_log[1] == '\0') { 4344 return NULL; 4345 } else { 4346 return PL_mem_log + 1; 4347 } 4348 } 4349 4350 #endif 4351 4352 GETENV_LOCK; 4353 4354 #ifdef PERL_MEM_LOG 4355 4356 /* Here we are in a critical section. As explained above, we do our own 4357 * getenv(PERL_MEM_LOG), saving the result safely. */ 4358 ret = getenv("PERL_MEM_LOG"); 4359 if (ret == NULL) { /* No logging active */ 4360 4361 /* Return that immediately if called from the logging code */ 4362 if (PL_mem_log[0] & 0x2) { 4363 GETENV_UNLOCK; 4364 return NULL; 4365 } 4366 4367 PL_mem_log[1] = '\0'; 4368 } 4369 else { 4370 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */ 4371 4372 /* There is nothing to prevent the value of PERL_MEM_LOG from being an 4373 * extremely long string. But we want only a few characters from it. 4374 * PL_mem_log has been made large enough to hold just the ones we need. 4375 * First the file descriptor. */ 4376 if (isDIGIT(*ret)) { 4377 const char * s = ret; 4378 if (UNLIKELY(*s == '0')) { 4379 4380 /* Reduce multiple leading zeros to a single one. This is to 4381 * allow the caller to change what to do with leading zeros. */ 4382 *mem_log_meat++ = '0'; 4383 s++; 4384 while (*s == '0') { 4385 s++; 4386 } 4387 } 4388 4389 /* If the input overflows, copy just enough for the result to also 4390 * overflow, plus 1 to make sure */ 4391 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) { 4392 *mem_log_meat++ = *s++; 4393 } 4394 } 4395 4396 /* Then each of the four significant characters */ 4397 if (strchr(ret, 'm')) { 4398 *mem_log_meat++ = 'm'; 4399 } 4400 if (strchr(ret, 's')) { 4401 *mem_log_meat++ = 's'; 4402 } 4403 if (strchr(ret, 't')) { 4404 *mem_log_meat++ = 't'; 4405 } 4406 if (strchr(ret, 'c')) { 4407 *mem_log_meat++ = 'c'; 4408 } 4409 *mem_log_meat = '\0'; 4410 4411 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log)); 4412 } 4413 4414 /* If we are being called from the logger, it only needs the significant 4415 * portion of PERL_MEM_LOG, and doesn't need a safe copy */ 4416 if (PL_mem_log[0] & 0x2) { 4417 assert(strEQ(str, "PERL_MEM_LOG")); 4418 GETENV_UNLOCK; 4419 return PL_mem_log + 1; 4420 } 4421 4422 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that 4423 * is coming from other than the logging code, so it should be treated the 4424 * same as any other getenv(), returning the full value, not just the 4425 * significant part, and having its value saved. Set the flag that 4426 * indicates any call to this routine will be a recursion from here */ 4427 PL_mem_log[0] = 0x1; 4428 4429 #endif 4430 4431 /* Now get the value of the real desired variable, and save a copy */ 4432 ret = getenv(str); 4433 4434 if (ret != NULL) { 4435 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) ); 4436 } 4437 4438 GETENV_UNLOCK; 4439 4440 #ifdef PERL_MEM_LOG 4441 4442 /* Clear the buffer */ 4443 Zero(PL_mem_log, sizeof(PL_mem_log), char); 4444 4445 #endif 4446 4447 return ret; 4448 } 4449 4450 PERL_STATIC_INLINE bool 4451 Perl_sv_isbool(pTHX_ const SV *sv) 4452 { 4453 PERL_UNUSED_CONTEXT; 4454 return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv); 4455 } 4456 4457 #ifdef USE_ITHREADS 4458 4459 PERL_STATIC_INLINE AV * 4460 Perl_cop_file_avn(pTHX_ const COP *cop) { 4461 4462 PERL_ARGS_ASSERT_COP_FILE_AVN; 4463 4464 const char *file = CopFILE(cop); 4465 if (file) { 4466 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD); 4467 if (gv) { 4468 return GvAVn(gv); 4469 } 4470 else 4471 return NULL; 4472 } 4473 else 4474 return NULL; 4475 } 4476 4477 #endif 4478 4479 PERL_STATIC_INLINE PADNAME * 4480 Perl_padname_refcnt_inc(PADNAME *pn) 4481 { 4482 PadnameREFCNT(pn)++; 4483 return pn; 4484 } 4485 4486 PERL_STATIC_INLINE PADNAMELIST * 4487 Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl) 4488 { 4489 PadnamelistREFCNT(pnl)++; 4490 return pnl; 4491 } 4492 4493 /* copy a string to a safe spot */ 4494 4495 /* 4496 =for apidoc_section $string 4497 =for apidoc savepv 4498 4499 Perl's version of C<strdup()>. Returns a pointer to a newly allocated 4500 string which is a duplicate of C<pv>. The size of the string is 4501 determined by C<strlen()>, which means it may not contain embedded C<NUL> 4502 characters and must have a trailing C<NUL>. To prevent memory leaks, the 4503 memory allocated for the new string needs to be freed when no longer needed. 4504 This can be done with the C<L</Safefree>> function, or 4505 L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>. 4506 4507 On some platforms, Windows for example, all allocated memory owned by a thread 4508 is deallocated when that thread ends. So if you need that not to happen, you 4509 need to use the shared memory functions, such as C<L</savesharedpv>>. 4510 4511 =cut 4512 */ 4513 4514 PERL_STATIC_INLINE char * 4515 Perl_savepv(pTHX_ const char *pv) 4516 { 4517 PERL_UNUSED_CONTEXT; 4518 if (!pv) 4519 return NULL; 4520 else { 4521 char *newaddr; 4522 const STRLEN pvlen = strlen(pv)+1; 4523 Newx(newaddr, pvlen, char); 4524 return (char*)memcpy(newaddr, pv, pvlen); 4525 } 4526 } 4527 4528 /* same thing but with a known length */ 4529 4530 /* 4531 =for apidoc savepvn 4532 4533 Perl's version of what C<strndup()> would be if it existed. Returns a 4534 pointer to a newly allocated string which is a duplicate of the first 4535 C<len> bytes from C<pv>, plus a trailing 4536 C<NUL> byte. The memory allocated for 4537 the new string can be freed with the C<Safefree()> function. 4538 4539 On some platforms, Windows for example, all allocated memory owned by a thread 4540 is deallocated when that thread ends. So if you need that not to happen, you 4541 need to use the shared memory functions, such as C<L</savesharedpvn>>. 4542 4543 =cut 4544 */ 4545 4546 PERL_STATIC_INLINE char * 4547 Perl_savepvn(pTHX_ const char *pv, Size_t len) 4548 { 4549 char *newaddr; 4550 PERL_UNUSED_CONTEXT; 4551 4552 Newx(newaddr,len+1,char); 4553 /* Give a meaning to NULL pointer mainly for the use in sv_magic() */ 4554 if (pv) { 4555 /* might not be null terminated */ 4556 newaddr[len] = '\0'; 4557 return (char *) CopyD(pv,newaddr,len,char); 4558 } 4559 else { 4560 return (char *) ZeroD(newaddr,len+1,char); 4561 } 4562 } 4563 4564 /* 4565 =for apidoc savesvpv 4566 4567 A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from 4568 the passed in SV using C<SvPV()> 4569 4570 On some platforms, Windows for example, all allocated memory owned by a thread 4571 is deallocated when that thread ends. So if you need that not to happen, you 4572 need to use the shared memory functions, such as C<L</savesharedsvpv>>. 4573 4574 =cut 4575 */ 4576 4577 PERL_STATIC_INLINE char * 4578 Perl_savesvpv(pTHX_ SV *sv) 4579 { 4580 STRLEN len; 4581 const char * const pv = SvPV_const(sv, len); 4582 char *newaddr; 4583 4584 PERL_ARGS_ASSERT_SAVESVPV; 4585 4586 ++len; 4587 Newx(newaddr,len,char); 4588 return (char *) CopyD(pv,newaddr,len,char); 4589 } 4590 4591 /* 4592 =for apidoc savesharedsvpv 4593 4594 A version of C<savesharedpv()> which allocates the duplicate string in 4595 memory which is shared between threads. 4596 4597 =cut 4598 */ 4599 4600 PERL_STATIC_INLINE char * 4601 Perl_savesharedsvpv(pTHX_ SV *sv) 4602 { 4603 STRLEN len; 4604 const char * const pv = SvPV_const(sv, len); 4605 4606 PERL_ARGS_ASSERT_SAVESHAREDSVPV; 4607 4608 return savesharedpvn(pv, len); 4609 } 4610 4611 #ifndef PERL_GET_CONTEXT_DEFINED 4612 4613 /* 4614 =for apidoc_section $embedding 4615 =for apidoc get_context 4616 4617 Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead. 4618 4619 =cut 4620 */ 4621 4622 PERL_STATIC_INLINE void * 4623 Perl_get_context(void) 4624 { 4625 # if defined(USE_ITHREADS) 4626 # ifdef OLD_PTHREADS_API 4627 pthread_addr_t t; 4628 int error = pthread_getspecific(PL_thr_key, &t); 4629 if (error) 4630 Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error); 4631 return (void*)t; 4632 # elif defined(I_MACH_CTHREADS) 4633 return (void*)cthread_data(cthread_self()); 4634 # else 4635 return (void*)PTHREAD_GETSPECIFIC(PL_thr_key); 4636 # endif 4637 # else 4638 return (void*)NULL; 4639 # endif 4640 } 4641 4642 #endif 4643 4644 PERL_STATIC_INLINE MGVTBL* 4645 Perl_get_vtbl(pTHX_ int vtbl_id) 4646 { 4647 PERL_UNUSED_CONTEXT; 4648 4649 return (vtbl_id < 0 || vtbl_id >= magic_vtable_max) 4650 ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id; 4651 } 4652 4653 /* 4654 =for apidoc my_strlcat 4655 4656 The C library C<strlcat> if available, or a Perl implementation of it. 4657 This operates on C C<NUL>-terminated strings. 4658 4659 C<my_strlcat()> appends string C<src> to the end of C<dst>. It will append at 4660 most S<C<size - strlen(dst) - 1>> bytes. It will then C<NUL>-terminate, 4661 unless C<size> is 0 or the original C<dst> string was longer than C<size> (in 4662 practice this should not happen as it means that either C<size> is incorrect or 4663 that C<dst> is not a proper C<NUL>-terminated string). 4664 4665 Note that C<size> is the full size of the destination buffer and 4666 the result is guaranteed to be C<NUL>-terminated if there is room. Note that 4667 room for the C<NUL> should be included in C<size>. 4668 4669 The return value is the total length that C<dst> would have if C<size> is 4670 sufficiently large. Thus it is the initial length of C<dst> plus the length of 4671 C<src>. If C<size> is smaller than the return, the excess was not appended. 4672 4673 =cut 4674 4675 Description stolen from http://man.openbsd.org/strlcat.3 4676 */ 4677 #ifndef HAS_STRLCAT 4678 PERL_STATIC_INLINE Size_t 4679 Perl_my_strlcat(char *dst, const char *src, Size_t size) 4680 { 4681 Size_t used, length, copy; 4682 4683 used = strlen(dst); 4684 length = strlen(src); 4685 if (size > 0 && used < size - 1) { 4686 copy = (length >= size - used) ? size - used - 1 : length; 4687 memcpy(dst + used, src, copy); 4688 dst[used + copy] = '\0'; 4689 } 4690 return used + length; 4691 } 4692 #endif 4693 4694 4695 /* 4696 =for apidoc my_strlcpy 4697 4698 The C library C<strlcpy> if available, or a Perl implementation of it. 4699 This operates on C C<NUL>-terminated strings. 4700 4701 C<my_strlcpy()> copies up to S<C<size - 1>> bytes from the string C<src> 4702 to C<dst>, C<NUL>-terminating the result if C<size> is not 0. 4703 4704 The return value is the total length C<src> would be if the copy completely 4705 succeeded. If it is larger than C<size>, the excess was not copied. 4706 4707 =cut 4708 4709 Description stolen from http://man.openbsd.org/strlcpy.3 4710 */ 4711 #ifndef HAS_STRLCPY 4712 PERL_STATIC_INLINE Size_t 4713 Perl_my_strlcpy(char *dst, const char *src, Size_t size) 4714 { 4715 Size_t length, copy; 4716 4717 length = strlen(src); 4718 if (size > 0) { 4719 copy = (length >= size) ? size - 1 : length; 4720 memcpy(dst, src, copy); 4721 dst[copy] = '\0'; 4722 } 4723 return length; 4724 } 4725 #endif 4726 4727 /* 4728 * ex: set ts=8 sts=4 sw=4 et: 4729 */ 4730