xref: /openbsd-src/gnu/usr.bin/perl/inline.h (revision 1ad61ae0a79a724d2d3ec69e69c8e1d1ff6b53a0)
1 /*    inline.h
2  *
3  *    Copyright (C) 2012 by Larry Wall and others
4  *
5  *    You may distribute under the terms of either the GNU General Public
6  *    License or the Artistic License, as specified in the README file.
7  *
8  *    This file contains tables and code adapted from
9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10  *    copyright notice:
11 
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13 
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
20 
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
23 
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31 
32  *
33  * This file is a home for static inline functions that cannot go in other
34  * header files, because they depend on proto.h (included after most other
35  * headers) or struct definitions.
36  *
37  * Each section names the header file that the functions "belong" to.
38  */
39 
40 /* ------------------------------- av.h ------------------------------- */
41 
42 /*
43 =for apidoc_section $AV
44 =for apidoc av_count
45 Returns the number of elements in the array C<av>.  This is the true length of
46 the array, including any undefined elements.  It is always the same as
47 S<C<av_top_index(av) + 1>>.
48 
49 =cut
50 */
51 PERL_STATIC_INLINE Size_t
52 Perl_av_count(pTHX_ AV *av)
53 {
54     PERL_ARGS_ASSERT_AV_COUNT;
55     assert(SvTYPE(av) == SVt_PVAV);
56 
57     return AvFILL(av) + 1;
58 }
59 
60 /* ------------------------------- av.c ------------------------------- */
61 
62 /*
63 =for apidoc av_store_simple
64 
65 This is a cut-down version of av_store that assumes that the array is
66 very straightforward - no magic, not readonly, and AvREAL - and that
67 C<key> is not negative. This function MUST NOT be used in situations
68 where any of those assumptions may not hold.
69 
70 Stores an SV in an array.  The array index is specified as C<key>. It
71 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
72 
73 Note that the caller is responsible for suitably incrementing the reference
74 count of C<val> before the call.
75 
76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
77 
78 =cut
79 */
80 
81 PERL_STATIC_INLINE SV**
82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
83 {
84     SV** ary;
85 
86     PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
87     assert(SvTYPE(av) == SVt_PVAV);
88     assert(!SvMAGICAL(av));
89     assert(!SvREADONLY(av));
90     assert(AvREAL(av));
91     assert(key > -1);
92 
93     ary = AvARRAY(av);
94 
95     if (AvFILLp(av) < key) {
96         if (key > AvMAX(av)) {
97             av_extend(av,key);
98             ary = AvARRAY(av);
99         }
100         AvFILLp(av) = key;
101     } else
102         SvREFCNT_dec(ary[key]);
103 
104     ary[key] = val;
105     return &ary[key];
106 }
107 
108 /*
109 =for apidoc av_fetch_simple
110 
111 This is a cut-down version of av_fetch that assumes that the array is
112 very straightforward - no magic, not readonly, and AvREAL - and that
113 C<key> is not negative. This function MUST NOT be used in situations
114 where any of those assumptions may not hold.
115 
116 Returns the SV at the specified index in the array.  The C<key> is the
117 index.  If lval is true, you are guaranteed to get a real SV back (in case
118 it wasn't real before), which you can then modify.  Check that the return
119 value is non-null before dereferencing it to a C<SV*>.
120 
121 The rough perl equivalent is C<$myarray[$key]>.
122 
123 =cut
124 */
125 
126 PERL_STATIC_INLINE SV**
127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
128 {
129     PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
130     assert(SvTYPE(av) == SVt_PVAV);
131     assert(!SvMAGICAL(av));
132     assert(!SvREADONLY(av));
133     assert(AvREAL(av));
134     assert(key > -1);
135 
136     if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
137         return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
138     } else {
139         return &AvARRAY(av)[key];
140     }
141 }
142 
143 /* ------------------------------- cv.h ------------------------------- */
144 
145 /*
146 =for apidoc_section $CV
147 =for apidoc CvGV
148 Returns the GV associated with the CV C<sv>, reifying it if necessary.
149 
150 =cut
151 */
152 PERL_STATIC_INLINE GV *
153 Perl_CvGV(pTHX_ CV *sv)
154 {
155     PERL_ARGS_ASSERT_CVGV;
156 
157     return CvNAMED(sv)
158         ? Perl_cvgv_from_hek(aTHX_ sv)
159         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
160 }
161 
162 /*
163 =for apidoc CvDEPTH
164 Returns the recursion level of the CV C<sv>.  Hence >= 2 indicates we are in a
165 recursive call.
166 
167 =cut
168 */
169 PERL_STATIC_INLINE I32 *
170 Perl_CvDEPTH(const CV * const sv)
171 {
172     PERL_ARGS_ASSERT_CVDEPTH;
173     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
174 
175     return &((XPVCV*)SvANY(sv))->xcv_depth;
176 }
177 
178 /*
179  CvPROTO returns the prototype as stored, which is not necessarily what
180  the interpreter should be using. Specifically, the interpreter assumes
181  that spaces have been stripped, which has been the case if the prototype
182  was added by toke.c, but is generally not the case if it was added elsewhere.
183  Since we can't enforce the spacelessness at assignment time, this routine
184  provides a temporary copy at parse time with spaces removed.
185  I<orig> is the start of the original buffer, I<len> is the length of the
186  prototype and will be updated when this returns.
187  */
188 
189 #ifdef PERL_CORE
190 PERL_STATIC_INLINE char *
191 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
192 {
193     SV * tmpsv;
194     char * tmps;
195     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
196     tmps = SvPVX(tmpsv);
197     while ((*len)--) {
198         if (!isSPACE(*orig))
199             *tmps++ = *orig;
200         orig++;
201     }
202     *tmps = '\0';
203     *len = tmps - SvPVX(tmpsv);
204                 return SvPVX(tmpsv);
205 }
206 #endif
207 
208 /* ------------------------------- mg.h ------------------------------- */
209 
210 #if defined(PERL_CORE) || defined(PERL_EXT)
211 /* assumes get-magic and stringification have already occurred */
212 PERL_STATIC_INLINE STRLEN
213 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
214 {
215     assert(mg->mg_type == PERL_MAGIC_regex_global);
216     assert(mg->mg_len != -1);
217     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
218         return (STRLEN)mg->mg_len;
219     else {
220         const STRLEN pos = (STRLEN)mg->mg_len;
221         /* Without this check, we may read past the end of the buffer: */
222         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
223         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
224     }
225 }
226 #endif
227 
228 /* ------------------------------- pad.h ------------------------------ */
229 
230 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
231 PERL_STATIC_INLINE bool
232 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
233 {
234     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
235 
236     /* is seq within the range _LOW to _HIGH ?
237      * This is complicated by the fact that PL_cop_seqmax
238      * may have wrapped around at some point */
239     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
240         return FALSE; /* not yet introduced */
241 
242     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
243     /* in compiling scope */
244         if (
245             (seq >  COP_SEQ_RANGE_LOW(pn))
246             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
247             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
248         )
249             return TRUE;
250     }
251     else if (
252         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
253         ?
254             (  seq >  COP_SEQ_RANGE_LOW(pn)
255             || seq <= COP_SEQ_RANGE_HIGH(pn))
256 
257         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
258              && seq <= COP_SEQ_RANGE_HIGH(pn))
259     )
260         return TRUE;
261     return FALSE;
262 }
263 #endif
264 
265 /* ------------------------------- pp.h ------------------------------- */
266 
267 PERL_STATIC_INLINE I32
268 Perl_TOPMARK(pTHX)
269 {
270     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
271                                  "MARK top  %p %" IVdf "\n",
272                                   PL_markstack_ptr,
273                                   (IV)*PL_markstack_ptr)));
274     return *PL_markstack_ptr;
275 }
276 
277 PERL_STATIC_INLINE I32
278 Perl_POPMARK(pTHX)
279 {
280     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
281                                  "MARK pop  %p %" IVdf "\n",
282                                   (PL_markstack_ptr-1),
283                                   (IV)*(PL_markstack_ptr-1))));
284     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
285     return *PL_markstack_ptr--;
286 }
287 
288 /* ----------------------------- regexp.h ----------------------------- */
289 
290 /* PVLVs need to act as a superset of all scalar types - they are basically
291  * PVMGs with a few extra fields.
292  * REGEXPs are first class scalars, but have many fields that can't be copied
293  * into a PVLV body.
294  *
295  * Hence we take a different approach - instead of a copy, PVLVs store a pointer
296  * back to the original body. To avoid increasing the size of PVLVs just for the
297  * rare case of REGEXP assignment, this pointer is stored in the memory usually
298  * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
299  * read the pointer from the two possible locations. The macro SvLEN() wraps the
300  * access to the union's member xpvlenu_len, but there is no equivalent macro
301  * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
302  *
303  * See commit df6b4bd56551f2d3 for more details. */
304 
305 PERL_STATIC_INLINE struct regexp *
306 Perl_ReANY(const REGEXP * const re)
307 {
308     XPV* const p = (XPV*)SvANY(re);
309 
310     PERL_ARGS_ASSERT_REANY;
311     assert(isREGEXP(re));
312 
313     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
314                                    : (struct regexp *)p;
315 }
316 
317 /* ------------------------------- sv.h ------------------------------- */
318 
319 PERL_STATIC_INLINE bool
320 Perl_SvTRUE(pTHX_ SV *sv)
321 {
322     PERL_ARGS_ASSERT_SVTRUE;
323 
324     if (UNLIKELY(sv == NULL))
325         return FALSE;
326     SvGETMAGIC(sv);
327     return SvTRUE_nomg_NN(sv);
328 }
329 
330 PERL_STATIC_INLINE bool
331 Perl_SvTRUE_nomg(pTHX_ SV *sv)
332 {
333     PERL_ARGS_ASSERT_SVTRUE_NOMG;
334 
335     if (UNLIKELY(sv == NULL))
336         return FALSE;
337     return SvTRUE_nomg_NN(sv);
338 }
339 
340 PERL_STATIC_INLINE bool
341 Perl_SvTRUE_NN(pTHX_ SV *sv)
342 {
343     PERL_ARGS_ASSERT_SVTRUE_NN;
344 
345     SvGETMAGIC(sv);
346     return SvTRUE_nomg_NN(sv);
347 }
348 
349 PERL_STATIC_INLINE bool
350 Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback)
351 {
352     PERL_ARGS_ASSERT_SVTRUE_COMMON;
353 
354     if (UNLIKELY(SvIMMORTAL_INTERP(sv)))
355         return SvIMMORTAL_TRUE(sv);
356 
357     if (! SvOK(sv))
358         return FALSE;
359 
360     if (SvPOK(sv))
361         return SvPVXtrue(sv);
362 
363     if (SvIOK(sv))
364         return SvIVX(sv) != 0; /* casts to bool */
365 
366     if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv)))))
367         return TRUE;
368 
369     if (sv_2bool_is_fallback)
370         return sv_2bool_nomg(sv);
371 
372     return isGV_with_GP(sv);
373 }
374 
375 
376 PERL_STATIC_INLINE SV *
377 Perl_SvREFCNT_inc(SV *sv)
378 {
379     if (LIKELY(sv != NULL))
380         SvREFCNT(sv)++;
381     return sv;
382 }
383 PERL_STATIC_INLINE SV *
384 Perl_SvREFCNT_inc_NN(SV *sv)
385 {
386     PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
387 
388     SvREFCNT(sv)++;
389     return sv;
390 }
391 PERL_STATIC_INLINE void
392 Perl_SvREFCNT_inc_void(SV *sv)
393 {
394     if (LIKELY(sv != NULL))
395         SvREFCNT(sv)++;
396 }
397 PERL_STATIC_INLINE void
398 Perl_SvREFCNT_dec(pTHX_ SV *sv)
399 {
400     if (LIKELY(sv != NULL)) {
401         U32 rc = SvREFCNT(sv);
402         if (LIKELY(rc > 1))
403             SvREFCNT(sv) = rc - 1;
404         else
405             Perl_sv_free2(aTHX_ sv, rc);
406     }
407 }
408 
409 PERL_STATIC_INLINE void
410 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
411 {
412     U32 rc = SvREFCNT(sv);
413 
414     PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
415 
416     if (LIKELY(rc > 1))
417         SvREFCNT(sv) = rc - 1;
418     else
419         Perl_sv_free2(aTHX_ sv, rc);
420 }
421 
422 /*
423 =for apidoc SvAMAGIC_on
424 
425 Indicate that C<sv> has overloading (active magic) enabled.
426 
427 =cut
428 */
429 
430 PERL_STATIC_INLINE void
431 Perl_SvAMAGIC_on(SV *sv)
432 {
433     PERL_ARGS_ASSERT_SVAMAGIC_ON;
434     assert(SvROK(sv));
435 
436     if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
437 }
438 
439 /*
440 =for apidoc SvAMAGIC_off
441 
442 Indicate that C<sv> has overloading (active magic) disabled.
443 
444 =cut
445 */
446 
447 PERL_STATIC_INLINE void
448 Perl_SvAMAGIC_off(SV *sv)
449 {
450     PERL_ARGS_ASSERT_SVAMAGIC_OFF;
451 
452     if (SvROK(sv) && SvOBJECT(SvRV(sv)))
453         HvAMAGIC_off(SvSTASH(SvRV(sv)));
454 }
455 
456 PERL_STATIC_INLINE U32
457 Perl_SvPADSTALE_on(SV *sv)
458 {
459     assert(!(SvFLAGS(sv) & SVs_PADTMP));
460     return SvFLAGS(sv) |= SVs_PADSTALE;
461 }
462 PERL_STATIC_INLINE U32
463 Perl_SvPADSTALE_off(SV *sv)
464 {
465     assert(!(SvFLAGS(sv) & SVs_PADTMP));
466     return SvFLAGS(sv) &= ~SVs_PADSTALE;
467 }
468 #if defined(PERL_CORE) || defined (PERL_EXT)
469 PERL_STATIC_INLINE STRLEN
470 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
471 {
472     PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
473     if (SvGAMAGIC(sv)) {
474         U8 *hopped = utf8_hop((U8 *)pv, pos);
475         if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
476         return (STRLEN)(hopped - (U8 *)pv);
477     }
478     return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
479 }
480 #endif
481 
482 /* ------------------------------- utf8.h ------------------------------- */
483 
484 /*
485 =for apidoc_section $unicode
486 */
487 
488 PERL_STATIC_INLINE void
489 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
490 {
491     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
492      * encoded string at '*dest', updating '*dest' to include it */
493 
494     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
495 
496     if (NATIVE_BYTE_IS_INVARIANT(byte))
497         *((*dest)++) = byte;
498     else {
499         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
500         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
501     }
502 }
503 
504 /*
505 =for apidoc valid_utf8_to_uvchr
506 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
507 known that the next character in the input UTF-8 string C<s> is well-formed
508 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
509 points, and non-Unicode code points are allowed.
510 
511 =cut
512 
513  */
514 
515 PERL_STATIC_INLINE UV
516 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
517 {
518     const UV expectlen = UTF8SKIP(s);
519     const U8* send = s + expectlen;
520     UV uv = *s;
521 
522     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
523 
524     if (retlen) {
525         *retlen = expectlen;
526     }
527 
528     /* An invariant is trivially returned */
529     if (expectlen == 1) {
530         return uv;
531     }
532 
533     /* Remove the leading bits that indicate the number of bytes, leaving just
534      * the bits that are part of the value */
535     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
536 
537     /* Now, loop through the remaining bytes, accumulating each into the
538      * working total as we go.  (I khw tried unrolling the loop for up to 4
539      * bytes, but there was no performance improvement) */
540     for (++s; s < send; s++) {
541         uv = UTF8_ACCUMULATE(uv, *s);
542     }
543 
544     return UNI_TO_NATIVE(uv);
545 
546 }
547 
548 /*
549 =for apidoc is_utf8_invariant_string
550 
551 Returns TRUE if the first C<len> bytes of the string C<s> are the same
552 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
553 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
554 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
555 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
556 characters are invariant, but so also are the C1 controls.
557 
558 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
559 use this option, that C<s> can't have embedded C<NUL> characters and has to
560 have a terminating C<NUL> byte).
561 
562 See also
563 C<L</is_utf8_string>>,
564 C<L</is_utf8_string_flags>>,
565 C<L</is_utf8_string_loc>>,
566 C<L</is_utf8_string_loc_flags>>,
567 C<L</is_utf8_string_loclen>>,
568 C<L</is_utf8_string_loclen_flags>>,
569 C<L</is_utf8_fixed_width_buf_flags>>,
570 C<L</is_utf8_fixed_width_buf_loc_flags>>,
571 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
572 C<L</is_strict_utf8_string>>,
573 C<L</is_strict_utf8_string_loc>>,
574 C<L</is_strict_utf8_string_loclen>>,
575 C<L</is_c9strict_utf8_string>>,
576 C<L</is_c9strict_utf8_string_loc>>,
577 and
578 C<L</is_c9strict_utf8_string_loclen>>.
579 
580 =cut
581 
582 */
583 
584 #define is_utf8_invariant_string(s, len)                                    \
585                                 is_utf8_invariant_string_loc(s, len, NULL)
586 
587 /*
588 =for apidoc is_utf8_invariant_string_loc
589 
590 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
591 the first UTF-8 variant character in the C<ep> pointer; if all characters are
592 UTF-8 invariant, this function does not change the contents of C<*ep>.
593 
594 =cut
595 
596 */
597 
598 PERL_STATIC_INLINE bool
599 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
600 {
601     const U8* send;
602     const U8* x = s;
603 
604     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
605 
606     if (len == 0) {
607         len = strlen((const char *)s);
608     }
609 
610     send = s + len;
611 
612 /* This looks like 0x010101... */
613 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
614 
615 /* This looks like 0x808080... */
616 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
617 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
618 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
619 
620 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
621  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
622  * optimized out completely on a 32-bit system, and its mask gets optimized out
623  * on a 64-bit system */
624 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
625                                       |   (  PTR2nat(x) >> 1)                 \
626                                       | ( ( (PTR2nat(x)                       \
627                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
628 
629 #ifndef EBCDIC
630 
631     /* Do the word-at-a-time iff there is at least one usable full word.  That
632      * means that after advancing to a word boundary, there still is at least a
633      * full word left.  The number of bytes needed to advance is 'wordsize -
634      * offset' unless offset is 0. */
635     if ((STRLEN) (send - x) >= PERL_WORDSIZE
636 
637                             /* This term is wordsize if subword; 0 if not */
638                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
639 
640                             /* 'offset' */
641                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
642     {
643 
644         /* Process per-byte until reach word boundary.  XXX This loop could be
645          * eliminated if we knew that this platform had fast unaligned reads */
646         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
647             if (! UTF8_IS_INVARIANT(*x)) {
648                 if (ep) {
649                     *ep = x;
650                 }
651 
652                 return FALSE;
653             }
654             x++;
655         }
656 
657         /* Here, we know we have at least one full word to process.  Process
658          * per-word as long as we have at least a full word left */
659         do {
660             if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
661 
662                 /* Found a variant.  Just return if caller doesn't want its
663                  * exact position */
664                 if (! ep) {
665                     return FALSE;
666                 }
667 
668 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
669      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
670 
671                 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
672                 assert(*ep >= s && *ep < send);
673 
674                 return FALSE;
675 
676 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
677            checks. */
678 
679                 break;
680 #  endif
681             }
682 
683             x += PERL_WORDSIZE;
684 
685         } while (x + PERL_WORDSIZE <= send);
686     }
687 
688 #endif      /* End of ! EBCDIC */
689 
690     /* Process per-byte */
691     while (x < send) {
692         if (! UTF8_IS_INVARIANT(*x)) {
693             if (ep) {
694                 *ep = x;
695             }
696 
697             return FALSE;
698         }
699 
700         x++;
701     }
702 
703     return TRUE;
704 }
705 
706 /* See if the platform has builtins for finding the most/least significant bit,
707  * and which one is right for using on 32 and 64 bit operands */
708 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
709 #  if U32SIZE == INTSIZE
710 #    define PERL_CLZ_32 __builtin_clz
711 #  endif
712 #  if defined(U64TYPE) && U64SIZE == INTSIZE
713 #    define PERL_CLZ_64 __builtin_clz
714 #  endif
715 #endif
716 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
717 #  if U32SIZE == INTSIZE
718 #    define PERL_CTZ_32 __builtin_ctz
719 #  endif
720 #  if defined(U64TYPE) && U64SIZE == INTSIZE
721 #    define PERL_CTZ_64 __builtin_ctz
722 #  endif
723 #endif
724 
725 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
726 #  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
727 #    define PERL_CLZ_32 __builtin_clzl
728 #  endif
729 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
730 #    define PERL_CLZ_64 __builtin_clzl
731 #  endif
732 #endif
733 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
734 #  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
735 #    define PERL_CTZ_32 __builtin_ctzl
736 #  endif
737 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
738 #    define PERL_CTZ_64 __builtin_ctzl
739 #  endif
740 #endif
741 
742 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
743 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
744 #    define PERL_CLZ_32 __builtin_clzll
745 #  endif
746 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
747 #    define PERL_CLZ_64 __builtin_clzll
748 #  endif
749 #endif
750 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
751 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
752 #    define PERL_CTZ_32 __builtin_ctzll
753 #  endif
754 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
755 #    define PERL_CTZ_64 __builtin_ctzll
756 #  endif
757 #endif
758 
759 #if defined(_MSC_VER)
760 #  include <intrin.h>
761 #  pragma intrinsic(_BitScanForward)
762 #  pragma intrinsic(_BitScanReverse)
763 #  ifdef _WIN64
764 #    pragma intrinsic(_BitScanForward64)
765 #    pragma intrinsic(_BitScanReverse64)
766 #  endif
767 #endif
768 
769 /* The reason there are not checks to see if ffs() and ffsl() are available for
770  * determining the lsb, is because these don't improve on the deBruijn method
771  * fallback, which is just a branchless integer multiply, array element
772  * retrieval, and shift.  The others, even if the function call overhead is
773  * optimized out, have to cope with the possibility of the input being all
774  * zeroes, and almost certainly will have conditionals for this eventuality.
775  * khw, at the time of this commit, looked at the source for both gcc and clang
776  * to verify this.  (gcc used a method inferior to deBruijn.) */
777 
778 /* Below are functions to find the first, last, or only set bit in a word.  On
779  * platforms with 64-bit capability, there is a pair for each operation; the
780  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
781  * the same in each pair, so the second is stripped of most comments. */
782 
783 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
784 
785 PERL_STATIC_INLINE unsigned
786 Perl_lsbit_pos64(U64 word)
787 {
788     /* Find the position (0..63) of the least significant set bit in the input
789      * word */
790 
791     ASSUME(word != 0);
792 
793     /* If we can determine that the platform has a usable fast method to get
794      * this info, use that */
795 
796 #  if defined(PERL_CTZ_64)
797 #    define PERL_HAS_FAST_GET_LSB_POS64
798 
799     return (unsigned) PERL_CTZ_64(word);
800 
801 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
802 #    define PERL_HAS_FAST_GET_LSB_POS64
803 
804     {
805         unsigned long index;
806         _BitScanForward64(&index, word);
807         return (unsigned)index;
808     }
809 
810 #  else
811 
812     /* Here, we didn't find a fast method for finding the lsb.  Fall back to
813      * making the lsb the only set bit in the word, and use our function that
814      * works on words with a single bit set.
815      *
816      * Isolate the lsb;
817      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
818      *
819      * The word will look like this, with a rightmost set bit in position 's':
820      * ('x's are don't cares, and 'y's are their complements)
821      *      s
822      *  x..x100..00
823      *  y..y011..11      Complement
824      *  y..y100..00      Add 1
825      *  0..0100..00      And with the original
826      *
827      *  (Yes, complementing and adding 1 is just taking the negative on 2's
828      *  complement machines, but not on 1's complement ones, and some compilers
829      *  complain about negating an unsigned.)
830      */
831     return single_1bit_pos64(word & (~word + 1));
832 
833 #  endif
834 
835 }
836 
837 #  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
838 #else   /* ! QUAD */
839 #  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
840 #endif
841 
842 PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
843 Perl_lsbit_pos32(U32 word)
844 {
845     /* Find the position (0..31) of the least significant set bit in the input
846      * word */
847 
848     ASSUME(word != 0);
849 
850 #if defined(PERL_CTZ_32)
851 #  define PERL_HAS_FAST_GET_LSB_POS32
852 
853     return (unsigned) PERL_CTZ_32(word);
854 
855 #elif U32SIZE == 4 && defined(_MSC_VER)
856 #  define PERL_HAS_FAST_GET_LSB_POS32
857 
858     {
859         unsigned long index;
860         _BitScanForward(&index, word);
861         return (unsigned)index;
862     }
863 
864 #else
865 
866     return single_1bit_pos32(word & (~word + 1));
867 
868 #endif
869 
870 }
871 
872 
873 /* Convert the leading zeros count to the bit position of the first set bit.
874  * This just subtracts from the highest position, 31 or 63.  But some compilers
875  * don't optimize this optimally, and so a bit of bit twiddling encourages them
876  * to do the right thing.  It turns out that subtracting a smaller non-negative
877  * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
878  * the two numbers.  To see why, first note that the sum of any number, x, and
879  * its complement, x', is all ones.  So all ones minus x is x'.  Then note that
880  * the xor of x and all ones is x'. */
881 #define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
882 
883 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
884 
885 PERL_STATIC_INLINE unsigned
886 Perl_msbit_pos64(U64 word)
887 {
888     /* Find the position (0..63) of the most significant set bit in the input
889      * word */
890 
891     ASSUME(word != 0);
892 
893     /* If we can determine that the platform has a usable fast method to get
894      * this, use that */
895 
896 #  if defined(PERL_CLZ_64)
897 #    define PERL_HAS_FAST_GET_MSB_POS64
898 
899     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
900 
901 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
902 #    define PERL_HAS_FAST_GET_MSB_POS64
903 
904     {
905         unsigned long index;
906         _BitScanReverse64(&index, word);
907         return (unsigned)index;
908     }
909 
910 #  else
911 
912     /* Here, we didn't find a fast method for finding the msb.  Fall back to
913      * making the msb the only set bit in the word, and use our function that
914      * works on words with a single bit set.
915      *
916      * Isolate the msb; http://codeforces.com/blog/entry/10330
917      *
918      * Only the most significant set bit matters.  Or'ing word with its right
919      * shift of 1 makes that bit and the next one to its right both 1.
920      * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
921      * ...  We end with the msb and all to the right being 1. */
922     word |= (word >>  1);
923     word |= (word >>  2);
924     word |= (word >>  4);
925     word |= (word >>  8);
926     word |= (word >> 16);
927     word |= (word >> 32);
928 
929     /* Then subtracting the right shift by 1 clears all but the left-most of
930      * the 1 bits, which is our desired result */
931     word -= (word >> 1);
932 
933     /* Now we have a single bit set */
934     return single_1bit_pos64(word);
935 
936 #  endif
937 
938 }
939 
940 #  define msbit_pos_uintmax_(word) msbit_pos64(word)
941 #else   /* ! QUAD */
942 #  define msbit_pos_uintmax_(word) msbit_pos32(word)
943 #endif
944 
945 PERL_STATIC_INLINE unsigned
946 Perl_msbit_pos32(U32 word)
947 {
948     /* Find the position (0..31) of the most significant set bit in the input
949      * word */
950 
951     ASSUME(word != 0);
952 
953 #if defined(PERL_CLZ_32)
954 #  define PERL_HAS_FAST_GET_MSB_POS32
955 
956     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
957 
958 #elif U32SIZE == 4 && defined(_MSC_VER)
959 #  define PERL_HAS_FAST_GET_MSB_POS32
960 
961     {
962         unsigned long index;
963         _BitScanReverse(&index, word);
964         return (unsigned)index;
965     }
966 
967 #else
968 
969     word |= (word >>  1);
970     word |= (word >>  2);
971     word |= (word >>  4);
972     word |= (word >>  8);
973     word |= (word >> 16);
974     word -= (word >> 1);
975     return single_1bit_pos32(word);
976 
977 #endif
978 
979 }
980 
981 #if UVSIZE == U64SIZE
982 #  define msbit_pos(word)  msbit_pos64(word)
983 #  define lsbit_pos(word)  lsbit_pos64(word)
984 #elif UVSIZE == U32SIZE
985 #  define msbit_pos(word)  msbit_pos32(word)
986 #  define lsbit_pos(word)  lsbit_pos32(word)
987 #endif
988 
989 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
990 
991 PERL_STATIC_INLINE unsigned
992 Perl_single_1bit_pos64(U64 word)
993 {
994     /* Given a 64-bit word known to contain all zero bits except one 1 bit,
995      * find and return the 1's position: 0..63 */
996 
997 #  ifdef PERL_CORE    /* macro not exported */
998     ASSUME(isPOWER_OF_2(word));
999 #  else
1000     ASSUME(word && (word & (word-1)) == 0);
1001 #  endif
1002 
1003     /* The only set bit is both the most and least significant bit.  If we have
1004      * a fast way of finding either one, use that.
1005      *
1006      * It may appear at first glance that those functions call this one, but
1007      * they don't if the corresponding #define is set */
1008 
1009 #  ifdef PERL_HAS_FAST_GET_MSB_POS64
1010 
1011     return msbit_pos64(word);
1012 
1013 #  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1014 
1015     return lsbit_pos64(word);
1016 
1017 #  else
1018 
1019     /* The position of the only set bit in a word can be quickly calculated
1020      * using deBruijn sequences.  See for example
1021      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
1022     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
1023                                                     >> PERL_deBruijnShift64_];
1024 #  endif
1025 
1026 }
1027 
1028 #endif
1029 
1030 PERL_STATIC_INLINE unsigned
1031 Perl_single_1bit_pos32(U32 word)
1032 {
1033     /* Given a 32-bit word known to contain all zero bits except one 1 bit,
1034      * find and return the 1's position: 0..31 */
1035 
1036 #ifdef PERL_CORE    /* macro not exported */
1037     ASSUME(isPOWER_OF_2(word));
1038 #else
1039     ASSUME(word && (word & (word-1)) == 0);
1040 #endif
1041 #ifdef PERL_HAS_FAST_GET_MSB_POS32
1042 
1043     return msbit_pos32(word);
1044 
1045 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1046 
1047     return lsbit_pos32(word);
1048 
1049 /* Unlikely, but possible for the platform to have a wider fast operation but
1050  * not a narrower one.  But easy enough to handle the case by widening the
1051  * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
1052  * would be slower than the deBruijn method.) */
1053 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1054 
1055     return msbit_pos64(word);
1056 
1057 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1058 
1059     return lsbit_pos64(word);
1060 
1061 #else
1062 
1063     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1064                                                     >> PERL_deBruijnShift32_];
1065 #endif
1066 
1067 }
1068 
1069 #ifndef EBCDIC
1070 
1071 PERL_STATIC_INLINE unsigned int
1072 Perl_variant_byte_number(PERL_UINTMAX_T word)
1073 {
1074     /* This returns the position in a word (0..7) of the first variant byte in
1075      * it.  This is a helper function.  Note that there are no branches */
1076 
1077     /* Get just the msb bits of each byte */
1078     word &= PERL_VARIANTS_WORD_MASK;
1079 
1080     /* This should only be called if we know there is a variant byte in the
1081      * word */
1082     assert(word);
1083 
1084 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1085 
1086     /* Bytes are stored like
1087      *  Byte8 ... Byte2 Byte1
1088      *  63..56...15...8 7...0
1089      * so getting the lsb of the whole modified word is getting the msb of the
1090      * first byte that has its msb set */
1091     word = lsbit_pos_uintmax_(word);
1092 
1093     /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
1094      * to 0..7 */
1095     return (unsigned int) ((word + 1) >> 3) - 1;
1096 
1097 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1098 
1099     /* Bytes are stored like
1100      *  Byte1 Byte2  ... Byte8
1101      * 63..56 55..47 ... 7...0
1102      * so getting the msb of the whole modified word is getting the msb of the
1103      * first byte that has its msb set */
1104     word = msbit_pos_uintmax_(word);
1105 
1106     /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
1107      * to 0..7 */
1108     word = ((word + 1) >> 3) - 1;
1109 
1110     /* And invert the result because of the reversed byte order on this
1111      * platform */
1112     word = CHARBITS - word - 1;
1113 
1114     return (unsigned int) word;
1115 
1116 #  else
1117 #    error Unexpected byte order
1118 #  endif
1119 
1120 }
1121 
1122 #endif
1123 #if defined(PERL_CORE) || defined(PERL_EXT)
1124 
1125 /*
1126 =for apidoc variant_under_utf8_count
1127 
1128 This function looks at the sequence of bytes between C<s> and C<e>, which are
1129 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1130 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
1131 each of these would occupy two bytes instead of the single one in the input
1132 string.  Thus, this function returns the precise number of bytes the string
1133 would expand by when translated to UTF-8.
1134 
1135 Unlike most of the other functions that have C<utf8> in their name, the input
1136 to this function is NOT a UTF-8-encoded string.  The function name is slightly
1137 I<odd> to emphasize this.
1138 
1139 This function is internal to Perl because khw thinks that any XS code that
1140 would want this is probably operating too close to the internals.  Presenting a
1141 valid use case could change that.
1142 
1143 See also
1144 C<L<perlapi/is_utf8_invariant_string>>
1145 and
1146 C<L<perlapi/is_utf8_invariant_string_loc>>,
1147 
1148 =cut
1149 
1150 */
1151 
1152 PERL_STATIC_INLINE Size_t
1153 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1154 {
1155     const U8* x = s;
1156     Size_t count = 0;
1157 
1158     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1159 
1160 #  ifndef EBCDIC
1161 
1162     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1163      * same as for is_utf8_invariant_string()) */
1164     if ((STRLEN) (e - x) >= PERL_WORDSIZE
1165                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1166                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1167     {
1168 
1169         /* Process per-byte until reach word boundary.  XXX This loop could be
1170          * eliminated if we knew that this platform had fast unaligned reads */
1171         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1172             count += ! UTF8_IS_INVARIANT(*x++);
1173         }
1174 
1175         /* Process per-word as long as we have at least a full word left */
1176         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1177                    explanation of how this works */
1178             PERL_UINTMAX_T increment
1179                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1180                       * PERL_COUNT_MULTIPLIER)
1181                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
1182             count += (Size_t) increment;
1183             x += PERL_WORDSIZE;
1184         } while (x + PERL_WORDSIZE <= e);
1185     }
1186 
1187 #  endif
1188 
1189     /* Process per-byte */
1190     while (x < e) {
1191         if (! UTF8_IS_INVARIANT(*x)) {
1192             count++;
1193         }
1194 
1195         x++;
1196     }
1197 
1198     return count;
1199 }
1200 
1201 #endif
1202 
1203 #ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
1204 #  undef PERL_WORDSIZE
1205 #  undef PERL_COUNT_MULTIPLIER
1206 #  undef PERL_WORD_BOUNDARY_MASK
1207 #  undef PERL_VARIANTS_WORD_MASK
1208 #endif
1209 
1210 /*
1211 =for apidoc is_utf8_string
1212 
1213 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1214 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1215 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1216 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1217 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1218 
1219 This function considers Perl's extended UTF-8 to be valid.  That means that
1220 code points above Unicode, surrogates, and non-character code points are
1221 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1222 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1223 code points are considered valid.
1224 
1225 See also
1226 C<L</is_utf8_invariant_string>>,
1227 C<L</is_utf8_invariant_string_loc>>,
1228 C<L</is_utf8_string_loc>>,
1229 C<L</is_utf8_string_loclen>>,
1230 C<L</is_utf8_fixed_width_buf_flags>>,
1231 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1232 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1233 
1234 =cut
1235 */
1236 
1237 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1238 
1239 #if defined(PERL_CORE) || defined (PERL_EXT)
1240 
1241 /*
1242 =for apidoc is_utf8_non_invariant_string
1243 
1244 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1245 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1246 UTF-8; otherwise returns FALSE.
1247 
1248 A TRUE return means that at least one code point represented by the sequence
1249 either is a wide character not representable as a single byte, or the
1250 representation differs depending on whether the sequence is encoded in UTF-8 or
1251 not.
1252 
1253 See also
1254 C<L<perlapi/is_utf8_invariant_string>>,
1255 C<L<perlapi/is_utf8_string>>
1256 
1257 =cut
1258 
1259 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1260 It generally needn't be if its string is entirely UTF-8 invariant, and it
1261 shouldn't be if it otherwise contains invalid UTF-8.
1262 
1263 It is an internal function because khw thinks that XS code shouldn't be working
1264 at this low a level.  A valid use case could change that.
1265 
1266 */
1267 
1268 PERL_STATIC_INLINE bool
1269 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1270 {
1271     const U8 * first_variant;
1272 
1273     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1274 
1275     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1276         return FALSE;
1277     }
1278 
1279     return is_utf8_string(first_variant, len - (first_variant - s));
1280 }
1281 
1282 #endif
1283 
1284 /*
1285 =for apidoc is_strict_utf8_string
1286 
1287 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1288 UTF-8-encoded string that is fully interchangeable by any application using
1289 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
1290 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1291 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1292 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1293 
1294 This function returns FALSE for strings containing any
1295 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1296 non-character code points.
1297 
1298 See also
1299 C<L</is_utf8_invariant_string>>,
1300 C<L</is_utf8_invariant_string_loc>>,
1301 C<L</is_utf8_string>>,
1302 C<L</is_utf8_string_flags>>,
1303 C<L</is_utf8_string_loc>>,
1304 C<L</is_utf8_string_loc_flags>>,
1305 C<L</is_utf8_string_loclen>>,
1306 C<L</is_utf8_string_loclen_flags>>,
1307 C<L</is_utf8_fixed_width_buf_flags>>,
1308 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1309 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1310 C<L</is_strict_utf8_string_loc>>,
1311 C<L</is_strict_utf8_string_loclen>>,
1312 C<L</is_c9strict_utf8_string>>,
1313 C<L</is_c9strict_utf8_string_loc>>,
1314 and
1315 C<L</is_c9strict_utf8_string_loclen>>.
1316 
1317 =cut
1318 */
1319 
1320 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
1321 
1322 /*
1323 =for apidoc is_c9strict_utf8_string
1324 
1325 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1326 UTF-8-encoded string that conforms to
1327 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1328 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
1329 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1330 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
1331 characters being ASCII constitute 'a valid UTF-8 string'.
1332 
1333 This function returns FALSE for strings containing any code points above the
1334 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1335 code points per
1336 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1337 
1338 See also
1339 C<L</is_utf8_invariant_string>>,
1340 C<L</is_utf8_invariant_string_loc>>,
1341 C<L</is_utf8_string>>,
1342 C<L</is_utf8_string_flags>>,
1343 C<L</is_utf8_string_loc>>,
1344 C<L</is_utf8_string_loc_flags>>,
1345 C<L</is_utf8_string_loclen>>,
1346 C<L</is_utf8_string_loclen_flags>>,
1347 C<L</is_utf8_fixed_width_buf_flags>>,
1348 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1349 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1350 C<L</is_strict_utf8_string>>,
1351 C<L</is_strict_utf8_string_loc>>,
1352 C<L</is_strict_utf8_string_loclen>>,
1353 C<L</is_c9strict_utf8_string_loc>>,
1354 and
1355 C<L</is_c9strict_utf8_string_loclen>>.
1356 
1357 =cut
1358 */
1359 
1360 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1361 
1362 /*
1363 =for apidoc is_utf8_string_flags
1364 
1365 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1366 UTF-8 string, subject to the restrictions imposed by C<flags>;
1367 returns FALSE otherwise.  If C<len> is 0, it will be calculated
1368 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1369 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
1370 that all characters being ASCII constitute 'a valid UTF-8 string'.
1371 
1372 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1373 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1374 as C<L</is_strict_utf8_string>>; and if C<flags> is
1375 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1376 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
1377 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1378 C<L</utf8n_to_uvchr>>, with the same meanings.
1379 
1380 See also
1381 C<L</is_utf8_invariant_string>>,
1382 C<L</is_utf8_invariant_string_loc>>,
1383 C<L</is_utf8_string>>,
1384 C<L</is_utf8_string_loc>>,
1385 C<L</is_utf8_string_loc_flags>>,
1386 C<L</is_utf8_string_loclen>>,
1387 C<L</is_utf8_string_loclen_flags>>,
1388 C<L</is_utf8_fixed_width_buf_flags>>,
1389 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1390 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1391 C<L</is_strict_utf8_string>>,
1392 C<L</is_strict_utf8_string_loc>>,
1393 C<L</is_strict_utf8_string_loclen>>,
1394 C<L</is_c9strict_utf8_string>>,
1395 C<L</is_c9strict_utf8_string_loc>>,
1396 and
1397 C<L</is_c9strict_utf8_string_loclen>>.
1398 
1399 =cut
1400 */
1401 
1402 PERL_STATIC_INLINE bool
1403 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1404 {
1405     const U8 * first_variant;
1406 
1407     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1408     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1409                           |UTF8_DISALLOW_PERL_EXTENDED)));
1410 
1411     if (len == 0) {
1412         len = strlen((const char *)s);
1413     }
1414 
1415     if (flags == 0) {
1416         return is_utf8_string(s, len);
1417     }
1418 
1419     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1420                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1421     {
1422         return is_strict_utf8_string(s, len);
1423     }
1424 
1425     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1426                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1427     {
1428         return is_c9strict_utf8_string(s, len);
1429     }
1430 
1431     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1432         const U8* const send = s + len;
1433         const U8* x = first_variant;
1434 
1435         while (x < send) {
1436             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1437             if (UNLIKELY(! cur_len)) {
1438                 return FALSE;
1439             }
1440             x += cur_len;
1441         }
1442     }
1443 
1444     return TRUE;
1445 }
1446 
1447 /*
1448 
1449 =for apidoc is_utf8_string_loc
1450 
1451 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1452 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1453 "utf8ness success") in the C<ep> pointer.
1454 
1455 See also C<L</is_utf8_string_loclen>>.
1456 
1457 =cut
1458 */
1459 
1460 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
1461 
1462 /*
1463 
1464 =for apidoc is_utf8_string_loclen
1465 
1466 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1467 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1468 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1469 encoded characters in the C<el> pointer.
1470 
1471 See also C<L</is_utf8_string_loc>>.
1472 
1473 =cut
1474 */
1475 
1476 PERL_STATIC_INLINE bool
1477 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1478 {
1479     const U8 * first_variant;
1480 
1481     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1482 
1483     if (len == 0) {
1484         len = strlen((const char *) s);
1485     }
1486 
1487     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1488         if (el)
1489             *el = len;
1490 
1491         if (ep) {
1492             *ep = s + len;
1493         }
1494 
1495         return TRUE;
1496     }
1497 
1498     {
1499         const U8* const send = s + len;
1500         const U8* x = first_variant;
1501         STRLEN outlen = first_variant - s;
1502 
1503         while (x < send) {
1504             const STRLEN cur_len = isUTF8_CHAR(x, send);
1505             if (UNLIKELY(! cur_len)) {
1506                 break;
1507             }
1508             x += cur_len;
1509             outlen++;
1510         }
1511 
1512         if (el)
1513             *el = outlen;
1514 
1515         if (ep) {
1516             *ep = x;
1517         }
1518 
1519         return (x == send);
1520     }
1521 }
1522 
1523 /* The perl core arranges to never call the DFA below without there being at
1524  * least one byte available to look at.  This allows the DFA to use a do {}
1525  * while loop which means that calling it with a UTF-8 invariant has a single
1526  * conditional, same as the calling code checking for invariance ahead of time.
1527  * And having the calling code remove that conditional speeds up by that
1528  * conditional, the case where it wasn't invariant.  So there's no reason to
1529  * check before caling this.
1530  *
1531  * But we don't know this for non-core calls, so have to retain the check for
1532  * them. */
1533 #ifdef PERL_CORE
1534 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  assert((e) > (s))
1535 #else
1536 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  if ((e) <= (s)) return FALSE
1537 #endif
1538 
1539 /*
1540  * DFA for checking input is valid UTF-8 syntax.
1541  *
1542  * This uses adaptations of the table and algorithm given in
1543  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1544  * documentation of the original version.  A copyright notice for the original
1545  * version is given at the beginning of this file.  The Perl adapations are
1546  * documented at the definition of PL_extended_utf8_dfa_tab[].
1547  *
1548  * This dfa is fast.  There are three exit conditions:
1549  *  1) a well-formed code point, acceptable to the table
1550  *  2) the beginning bytes of an incomplete character, whose completion might
1551  *     or might not be acceptable
1552  *  3) unacceptable to the table.  Some of the adaptations have certain,
1553  *     hopefully less likely to occur, legal inputs be unacceptable to the
1554  *     table, so these must be sorted out afterwards.
1555  *
1556  * This macro is a complete implementation of the code executing the DFA.  It
1557  * is passed the input sequence bounds and the table to use, and what to do
1558  * for each of the exit conditions.  There are three canned actions, likely to
1559  * be the ones you want:
1560  *      DFA_RETURN_SUCCESS_
1561  *      DFA_RETURN_FAILURE_
1562  *      DFA_GOTO_TEASE_APART_FF_
1563  *
1564  * You pass a parameter giving the action to take for each of the three
1565  * possible exit conditions:
1566  *
1567  * 'accept_action'  This is executed when the DFA accepts the input.
1568  *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
1569  * 'reject_action'  This is executed when the DFA rejects the input.
1570  *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1571  *                  you have written code to distinguish the rejecting state
1572  *                  results.  Because it happens in several places, and
1573  *                  involves #ifdefs, the special action
1574  *                  DFA_GOTO_TEASE_APART_FF_ is what you want with
1575  *                  PL_extended_utf8_dfa_tab.  On platforms without
1576  *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
1577  *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1578  *                  need to have a label 'tease_apart_FF' that it will transfer
1579  *                  to.
1580  * 'incomplete_char_action'  This is executed when the DFA ran off the end
1581  *                  before accepting or rejecting the input.
1582  *                  DFA_RETURN_FAILURE_ is the likely action, but you could
1583  *                  have a 'goto', or NOOP.  In the latter case the DFA drops
1584  *                  off the end, and you place your code to handle this case
1585  *                  immediately after it.
1586  */
1587 
1588 #define DFA_RETURN_SUCCESS_      return s - s0
1589 #define DFA_RETURN_FAILURE_      return 0
1590 #ifdef HAS_EXTRA_LONG_UTF8
1591 #  define DFA_TEASE_APART_FF_  goto tease_apart_FF
1592 #else
1593 #  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
1594 #endif
1595 
1596 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
1597                               accept_action,                                \
1598                               reject_action,                                \
1599                               incomplete_char_action)                       \
1600     STMT_START {                                                            \
1601         const U8 * s = s0;                                                  \
1602         UV state = 0;                                                       \
1603                                                                             \
1604         PERL_NON_CORE_CHECK_EMPTY(s,e);                                     \
1605                                                                             \
1606         do {                                                                \
1607             state = dfa_tab[256 + state + dfa_tab[*s]];                     \
1608             s++;                                                            \
1609                                                                             \
1610             if (state == 0) {   /* Accepting state */                       \
1611                 accept_action;                                              \
1612             }                                                               \
1613                                                                             \
1614             if (UNLIKELY(state == 1)) { /* Rejecting state */               \
1615                 reject_action;                                              \
1616             }                                                               \
1617         } while (s < e);                                                    \
1618                                                                             \
1619         /* Here, dropped out of loop before end-of-char */                  \
1620         incomplete_char_action;                                             \
1621     } STMT_END
1622 
1623 
1624 /*
1625 
1626 =for apidoc isUTF8_CHAR
1627 
1628 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1629 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1630 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1631 value gives how many bytes starting at C<s> comprise the code point's
1632 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1633 form the first code point in C<s>, are not examined.
1634 
1635 The code point can be any that will fit in an IV on this machine, using Perl's
1636 extension to official UTF-8 to represent those higher than the Unicode maximum
1637 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1638 next few bytes in C<s> is legal UTF-8 for a single character.
1639 
1640 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1641 defined by Unicode to be fully interchangeable across applications;
1642 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1643 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1644 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1645 
1646 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1647 C<L</is_utf8_string_loclen>> to check entire strings.
1648 
1649 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1650 machines) is a valid UTF-8 character.
1651 
1652 =cut
1653 
1654 This uses an adaptation of the table and algorithm given in
1655 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1656 documentation of the original version.  A copyright notice for the original
1657 version is given at the beginning of this file.  The Perl adapation is
1658 documented at the definition of PL_extended_utf8_dfa_tab[].
1659 */
1660 
1661 PERL_STATIC_INLINE Size_t
1662 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1663 {
1664     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1665 
1666     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1667                           DFA_RETURN_SUCCESS_,
1668                           DFA_TEASE_APART_FF_,
1669                           DFA_RETURN_FAILURE_);
1670 
1671     /* Here, we didn't return success, but dropped out of the loop.  In the
1672      * case of PL_extended_utf8_dfa_tab, this means the input is either
1673      * malformed, or the start byte was FF on a platform that the dfa doesn't
1674      * handle FF's.  Call a helper function. */
1675 
1676 #ifdef HAS_EXTRA_LONG_UTF8
1677 
1678   tease_apart_FF:
1679 
1680     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1681      * either malformed, or was for the largest possible start byte, which we
1682      * now check, not inline */
1683     if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1684         return 0;
1685     }
1686 
1687     return is_utf8_FF_helper_(s0, e,
1688                               FALSE /* require full, not partial char */
1689                              );
1690 #endif
1691 
1692 }
1693 
1694 /*
1695 
1696 =for apidoc isSTRICT_UTF8_CHAR
1697 
1698 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1699 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1700 Unicode code point completely acceptable for open interchange between all
1701 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1702 many bytes starting at C<s> comprise the code point's representation.  Any
1703 bytes remaining before C<e>, but beyond the ones needed to form the first code
1704 point in C<s>, are not examined.
1705 
1706 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1707 be a surrogate nor a non-character code point.  Thus this excludes any code
1708 point from Perl's extended UTF-8.
1709 
1710 This is used to efficiently decide if the next few bytes in C<s> is
1711 legal Unicode-acceptable UTF-8 for a single character.
1712 
1713 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1714 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1715 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1716 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1717 
1718 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1719 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1720 
1721 =cut
1722 
1723 This uses an adaptation of the tables and algorithm given in
1724 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1725 documentation of the original version.  A copyright notice for the original
1726 version is given at the beginning of this file.  The Perl adapation is
1727 documented at the definition of strict_extended_utf8_dfa_tab[].
1728 
1729 */
1730 
1731 PERL_STATIC_INLINE Size_t
1732 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1733 {
1734     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1735 
1736     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1737                           DFA_RETURN_SUCCESS_,
1738                           goto check_hanguls,
1739                           DFA_RETURN_FAILURE_);
1740   check_hanguls:
1741 
1742     /* Here, we didn't return success, but dropped out of the loop.  In the
1743      * case of PL_strict_utf8_dfa_tab, this means the input is either
1744      * malformed, or was for certain Hanguls; handle them specially */
1745 
1746     /* The dfa above drops out for incomplete or illegal inputs, and certain
1747      * legal Hanguls; check and return accordingly */
1748     return is_HANGUL_ED_utf8_safe(s0, e);
1749 }
1750 
1751 /*
1752 
1753 =for apidoc isC9_STRICT_UTF8_CHAR
1754 
1755 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1756 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1757 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1758 the value gives how many bytes starting at C<s> comprise the code point's
1759 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1760 form the first code point in C<s>, are not examined.
1761 
1762 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1763 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1764 code points.  This corresponds to
1765 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1766 which said that non-character code points are merely discouraged rather than
1767 completely forbidden in open interchange.  See
1768 L<perlunicode/Noncharacter code points>.
1769 
1770 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1771 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1772 
1773 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1774 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1775 
1776 =cut
1777 
1778 This uses an adaptation of the tables and algorithm given in
1779 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1780 documentation of the original version.  A copyright notice for the original
1781 version is given at the beginning of this file.  The Perl adapation is
1782 documented at the definition of PL_c9_utf8_dfa_tab[].
1783 
1784 */
1785 
1786 PERL_STATIC_INLINE Size_t
1787 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1788 {
1789     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1790 
1791     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1792                           DFA_RETURN_SUCCESS_,
1793                           DFA_RETURN_FAILURE_,
1794                           DFA_RETURN_FAILURE_);
1795 }
1796 
1797 /*
1798 
1799 =for apidoc is_strict_utf8_string_loc
1800 
1801 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1802 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1803 "utf8ness success") in the C<ep> pointer.
1804 
1805 See also C<L</is_strict_utf8_string_loclen>>.
1806 
1807 =cut
1808 */
1809 
1810 #define is_strict_utf8_string_loc(s, len, ep)                               \
1811                                 is_strict_utf8_string_loclen(s, len, ep, 0)
1812 
1813 /*
1814 
1815 =for apidoc is_strict_utf8_string_loclen
1816 
1817 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1818 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1819 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1820 encoded characters in the C<el> pointer.
1821 
1822 See also C<L</is_strict_utf8_string_loc>>.
1823 
1824 =cut
1825 */
1826 
1827 PERL_STATIC_INLINE bool
1828 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1829 {
1830     const U8 * first_variant;
1831 
1832     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1833 
1834     if (len == 0) {
1835         len = strlen((const char *) s);
1836     }
1837 
1838     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1839         if (el)
1840             *el = len;
1841 
1842         if (ep) {
1843             *ep = s + len;
1844         }
1845 
1846         return TRUE;
1847     }
1848 
1849     {
1850         const U8* const send = s + len;
1851         const U8* x = first_variant;
1852         STRLEN outlen = first_variant - s;
1853 
1854         while (x < send) {
1855             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1856             if (UNLIKELY(! cur_len)) {
1857                 break;
1858             }
1859             x += cur_len;
1860             outlen++;
1861         }
1862 
1863         if (el)
1864             *el = outlen;
1865 
1866         if (ep) {
1867             *ep = x;
1868         }
1869 
1870         return (x == send);
1871     }
1872 }
1873 
1874 /*
1875 
1876 =for apidoc is_c9strict_utf8_string_loc
1877 
1878 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1879 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1880 "utf8ness success") in the C<ep> pointer.
1881 
1882 See also C<L</is_c9strict_utf8_string_loclen>>.
1883 
1884 =cut
1885 */
1886 
1887 #define is_c9strict_utf8_string_loc(s, len, ep)	                            \
1888                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
1889 
1890 /*
1891 
1892 =for apidoc is_c9strict_utf8_string_loclen
1893 
1894 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1895 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1896 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1897 characters in the C<el> pointer.
1898 
1899 See also C<L</is_c9strict_utf8_string_loc>>.
1900 
1901 =cut
1902 */
1903 
1904 PERL_STATIC_INLINE bool
1905 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1906 {
1907     const U8 * first_variant;
1908 
1909     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1910 
1911     if (len == 0) {
1912         len = strlen((const char *) s);
1913     }
1914 
1915     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1916         if (el)
1917             *el = len;
1918 
1919         if (ep) {
1920             *ep = s + len;
1921         }
1922 
1923         return TRUE;
1924     }
1925 
1926     {
1927         const U8* const send = s + len;
1928         const U8* x = first_variant;
1929         STRLEN outlen = first_variant - s;
1930 
1931         while (x < send) {
1932             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1933             if (UNLIKELY(! cur_len)) {
1934                 break;
1935             }
1936             x += cur_len;
1937             outlen++;
1938         }
1939 
1940         if (el)
1941             *el = outlen;
1942 
1943         if (ep) {
1944             *ep = x;
1945         }
1946 
1947         return (x == send);
1948     }
1949 }
1950 
1951 /*
1952 
1953 =for apidoc is_utf8_string_loc_flags
1954 
1955 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1956 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1957 "utf8ness success") in the C<ep> pointer.
1958 
1959 See also C<L</is_utf8_string_loclen_flags>>.
1960 
1961 =cut
1962 */
1963 
1964 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1965                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1966 
1967 
1968 /* The above 3 actual functions could have been moved into the more general one
1969  * just below, and made #defines that call it with the right 'flags'.  They are
1970  * currently kept separate to increase their chances of getting inlined */
1971 
1972 /*
1973 
1974 =for apidoc is_utf8_string_loclen_flags
1975 
1976 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1977 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1978 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1979 encoded characters in the C<el> pointer.
1980 
1981 See also C<L</is_utf8_string_loc_flags>>.
1982 
1983 =cut
1984 */
1985 
1986 PERL_STATIC_INLINE bool
1987 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1988 {
1989     const U8 * first_variant;
1990 
1991     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1992     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1993                           |UTF8_DISALLOW_PERL_EXTENDED)));
1994 
1995     if (len == 0) {
1996         len = strlen((const char *) s);
1997     }
1998 
1999     if (flags == 0) {
2000         return is_utf8_string_loclen(s, len, ep, el);
2001     }
2002 
2003     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2004                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2005     {
2006         return is_strict_utf8_string_loclen(s, len, ep, el);
2007     }
2008 
2009     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2010                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2011     {
2012         return is_c9strict_utf8_string_loclen(s, len, ep, el);
2013     }
2014 
2015     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2016         if (el)
2017             *el = len;
2018 
2019         if (ep) {
2020             *ep = s + len;
2021         }
2022 
2023         return TRUE;
2024     }
2025 
2026     {
2027         const U8* send = s + len;
2028         const U8* x = first_variant;
2029         STRLEN outlen = first_variant - s;
2030 
2031         while (x < send) {
2032             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2033             if (UNLIKELY(! cur_len)) {
2034                 break;
2035             }
2036             x += cur_len;
2037             outlen++;
2038         }
2039 
2040         if (el)
2041             *el = outlen;
2042 
2043         if (ep) {
2044             *ep = x;
2045         }
2046 
2047         return (x == send);
2048     }
2049 }
2050 
2051 /*
2052 =for apidoc utf8_distance
2053 
2054 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
2055 and C<b>.
2056 
2057 WARNING: use only if you *know* that the pointers point inside the
2058 same UTF-8 buffer.
2059 
2060 =cut
2061 */
2062 
2063 PERL_STATIC_INLINE IV
2064 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2065 {
2066     PERL_ARGS_ASSERT_UTF8_DISTANCE;
2067 
2068     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2069 }
2070 
2071 /*
2072 =for apidoc utf8_hop
2073 
2074 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2075 forward or backward.
2076 
2077 WARNING: do not use the following unless you *know* C<off> is within
2078 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
2079 on the first byte of character or just after the last byte of a character.
2080 
2081 =cut
2082 */
2083 
2084 PERL_STATIC_INLINE U8 *
2085 Perl_utf8_hop(const U8 *s, SSize_t off)
2086 {
2087     PERL_ARGS_ASSERT_UTF8_HOP;
2088 
2089     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2090      * the bitops (especially ~) can create illegal UTF-8.
2091      * In other words: in Perl UTF-8 is not just for Unicode. */
2092 
2093     if (off >= 0) {
2094         while (off--)
2095             s += UTF8SKIP(s);
2096     }
2097     else {
2098         while (off++) {
2099             s--;
2100             while (UTF8_IS_CONTINUATION(*s))
2101                 s--;
2102         }
2103     }
2104     GCC_DIAG_IGNORE(-Wcast-qual)
2105     return (U8 *)s;
2106     GCC_DIAG_RESTORE
2107 }
2108 
2109 /*
2110 =for apidoc utf8_hop_forward
2111 
2112 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2113 forward.
2114 
2115 C<off> must be non-negative.
2116 
2117 C<s> must be before or equal to C<end>.
2118 
2119 When moving forward it will not move beyond C<end>.
2120 
2121 Will not exceed this limit even if the string is not valid "UTF-8".
2122 
2123 =cut
2124 */
2125 
2126 PERL_STATIC_INLINE U8 *
2127 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2128 {
2129     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2130 
2131     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2132      * the bitops (especially ~) can create illegal UTF-8.
2133      * In other words: in Perl UTF-8 is not just for Unicode. */
2134 
2135     assert(s <= end);
2136     assert(off >= 0);
2137 
2138     while (off--) {
2139         STRLEN skip = UTF8SKIP(s);
2140         if ((STRLEN)(end - s) <= skip) {
2141             GCC_DIAG_IGNORE(-Wcast-qual)
2142             return (U8 *)end;
2143             GCC_DIAG_RESTORE
2144         }
2145         s += skip;
2146     }
2147 
2148     GCC_DIAG_IGNORE(-Wcast-qual)
2149     return (U8 *)s;
2150     GCC_DIAG_RESTORE
2151 }
2152 
2153 /*
2154 =for apidoc utf8_hop_back
2155 
2156 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2157 backward.
2158 
2159 C<off> must be non-positive.
2160 
2161 C<s> must be after or equal to C<start>.
2162 
2163 When moving backward it will not move before C<start>.
2164 
2165 Will not exceed this limit even if the string is not valid "UTF-8".
2166 
2167 =cut
2168 */
2169 
2170 PERL_STATIC_INLINE U8 *
2171 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2172 {
2173     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2174 
2175     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2176      * the bitops (especially ~) can create illegal UTF-8.
2177      * In other words: in Perl UTF-8 is not just for Unicode. */
2178 
2179     assert(start <= s);
2180     assert(off <= 0);
2181 
2182     while (off++ && s > start) {
2183         do {
2184             s--;
2185         } while (UTF8_IS_CONTINUATION(*s) && s > start);
2186     }
2187 
2188     GCC_DIAG_IGNORE(-Wcast-qual)
2189     return (U8 *)s;
2190     GCC_DIAG_RESTORE
2191 }
2192 
2193 /*
2194 =for apidoc utf8_hop_safe
2195 
2196 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2197 either forward or backward.
2198 
2199 When moving backward it will not move before C<start>.
2200 
2201 When moving forward it will not move beyond C<end>.
2202 
2203 Will not exceed those limits even if the string is not valid "UTF-8".
2204 
2205 =cut
2206 */
2207 
2208 PERL_STATIC_INLINE U8 *
2209 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2210 {
2211     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2212 
2213     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2214      * the bitops (especially ~) can create illegal UTF-8.
2215      * In other words: in Perl UTF-8 is not just for Unicode. */
2216 
2217     assert(start <= s && s <= end);
2218 
2219     if (off >= 0) {
2220         return utf8_hop_forward(s, off, end);
2221     }
2222     else {
2223         return utf8_hop_back(s, off, start);
2224     }
2225 }
2226 
2227 /*
2228 
2229 =for apidoc isUTF8_CHAR_flags
2230 
2231 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2232 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2233 that represents some code point, subject to the restrictions given by C<flags>;
2234 otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
2235 starting at C<s> comprise the code point's representation.  Any bytes remaining
2236 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2237 are not examined.
2238 
2239 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2240 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2241 as C<L</isSTRICT_UTF8_CHAR>>;
2242 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2243 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2244 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2245 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2246 
2247 The three alternative macros are for the most commonly needed validations; they
2248 are likely to run somewhat faster than this more general one, as they can be
2249 inlined into your code.
2250 
2251 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2252 L</is_utf8_string_loclen_flags> to check entire strings.
2253 
2254 =cut
2255 */
2256 
2257 PERL_STATIC_INLINE STRLEN
2258 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2259 {
2260     PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2261     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2262                           |UTF8_DISALLOW_PERL_EXTENDED)));
2263 
2264     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2265                           goto check_success,
2266                           DFA_TEASE_APART_FF_,
2267                           DFA_RETURN_FAILURE_);
2268 
2269   check_success:
2270 
2271     return is_utf8_char_helper_(s0, e, flags);
2272 
2273 #ifdef HAS_EXTRA_LONG_UTF8
2274 
2275   tease_apart_FF:
2276 
2277     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2278      * either malformed, or was for the largest possible start byte, which
2279      * indicates perl extended UTF-8, well above the Unicode maximum */
2280     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2281         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2282     {
2283         return 0;
2284     }
2285 
2286     /* Otherwise examine the sequence not inline */
2287     return is_utf8_FF_helper_(s0, e,
2288                               FALSE /* require full, not partial char */
2289                              );
2290 #endif
2291 
2292 }
2293 
2294 /*
2295 
2296 =for apidoc is_utf8_valid_partial_char
2297 
2298 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2299 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2300 points.  Otherwise, it returns 1 if there exists at least one non-empty
2301 sequence of bytes that when appended to sequence C<s>, starting at position
2302 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2303 otherwise returns 0.
2304 
2305 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2306 point.
2307 
2308 This is useful when a fixed-length buffer is being tested for being well-formed
2309 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2310 it is split somewhere in the middle of the final code point's UTF-8
2311 representation.  (Presumably when the buffer is refreshed with the next chunk
2312 of data, the new first bytes will complete the partial code point.)   This
2313 function is used to verify that the final bytes in the current buffer are in
2314 fact the legal beginning of some code point, so that if they aren't, the
2315 failure can be signalled without having to wait for the next read.
2316 
2317 =cut
2318 */
2319 #define is_utf8_valid_partial_char(s, e)                                    \
2320                                 is_utf8_valid_partial_char_flags(s, e, 0)
2321 
2322 /*
2323 
2324 =for apidoc is_utf8_valid_partial_char_flags
2325 
2326 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2327 or not the input is a valid UTF-8 encoded partial character, but it takes an
2328 extra parameter, C<flags>, which can further restrict which code points are
2329 considered valid.
2330 
2331 If C<flags> is 0, this behaves identically to
2332 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
2333 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
2334 there is any sequence of bytes that can complete the input partial character in
2335 such a way that a non-prohibited character is formed, the function returns
2336 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
2337 partial character input.  But many  of the other possible excluded types can be
2338 determined from just the first one or two bytes.
2339 
2340 =cut
2341  */
2342 
2343 PERL_STATIC_INLINE bool
2344 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2345 {
2346     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2347     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2348                           |UTF8_DISALLOW_PERL_EXTENDED)));
2349 
2350     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2351                           DFA_RETURN_FAILURE_,
2352                           DFA_TEASE_APART_FF_,
2353                           NOOP);
2354 
2355     /* The NOOP above causes the DFA to drop down here iff the input was a
2356      * partial character.  flags=0 => can return TRUE immediately; otherwise we
2357      * need to check (not inline) if the partial character is the beginning of
2358      * a disallowed one */
2359     if (flags == 0) {
2360         return TRUE;
2361     }
2362 
2363     return cBOOL(is_utf8_char_helper_(s0, e, flags));
2364 
2365 #ifdef HAS_EXTRA_LONG_UTF8
2366 
2367   tease_apart_FF:
2368 
2369     /* Getting here means the input is either malformed, or, in the case of
2370      * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
2371      * latter case has to be extended UTF-8, so can fail immediately if that is
2372      * forbidden */
2373 
2374     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2375         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2376     {
2377         return 0;
2378     }
2379 
2380     return is_utf8_FF_helper_(s0, e,
2381                               TRUE /* Require to be a partial character */
2382                              );
2383 #endif
2384 
2385 }
2386 
2387 /*
2388 
2389 =for apidoc is_utf8_fixed_width_buf_flags
2390 
2391 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2392 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2393 otherwise it returns FALSE.
2394 
2395 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2396 without restriction.  If the final few bytes of the buffer do not form a
2397 complete code point, this will return TRUE anyway, provided that
2398 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2399 
2400 If C<flags> in non-zero, it can be any combination of the
2401 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2402 same meanings.
2403 
2404 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2405 returns FALSE if the final few bytes of the string don't form a complete code
2406 point.
2407 
2408 =cut
2409  */
2410 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
2411                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2412 
2413 /*
2414 
2415 =for apidoc is_utf8_fixed_width_buf_loc_flags
2416 
2417 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2418 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
2419 to the beginning of any partial character at the end of the buffer; if there is
2420 no partial character C<*ep> will contain C<s>+C<len>.
2421 
2422 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2423 
2424 =cut
2425 */
2426 
2427 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
2428                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2429 
2430 /*
2431 
2432 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2433 
2434 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2435 complete, valid characters found in the C<el> pointer.
2436 
2437 =cut
2438 */
2439 
2440 PERL_STATIC_INLINE bool
2441 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2442                                        STRLEN len,
2443                                        const U8 **ep,
2444                                        STRLEN *el,
2445                                        const U32 flags)
2446 {
2447     const U8 * maybe_partial;
2448 
2449     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2450 
2451     if (! ep) {
2452         ep  = &maybe_partial;
2453     }
2454 
2455     /* If it's entirely valid, return that; otherwise see if the only error is
2456      * that the final few bytes are for a partial character */
2457     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
2458            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2459 }
2460 
2461 PERL_STATIC_INLINE UV
2462 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2463                       STRLEN curlen,
2464                       STRLEN *retlen,
2465                       const U32 flags,
2466                       U32 * errors,
2467                       AV ** msgs)
2468 {
2469     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
2470      * simple cases, and, if necessary calls a helper function to deal with the
2471      * more complex ones.  Almost all well-formed non-problematic code points
2472      * are considered simple, so that it's unlikely that the helper function
2473      * will need to be called.
2474      *
2475      * This is an adaptation of the tables and algorithm given in
2476      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2477      * comprehensive documentation of the original version.  A copyright notice
2478      * for the original version is given at the beginning of this file.  The
2479      * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
2480      */
2481 
2482     const U8 * const s0 = s;
2483     const U8 * send = s0 + curlen;
2484     UV type;
2485     UV uv;
2486 
2487     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2488 
2489     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
2490      * non-problematic code point, which can be returned immediately.
2491      * Otherwise we call a helper function to figure out the more complicated
2492      * cases. */
2493 
2494     /* No calls from core pass in an empty string; non-core need a check */
2495 #ifdef PERL_CORE
2496     assert(curlen > 0);
2497 #else
2498     if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2499                                                         flags, errors, msgs);
2500 #endif
2501 
2502     type = PL_strict_utf8_dfa_tab[*s];
2503 
2504     /* The table is structured so that 'type' is 0 iff the input byte is
2505      * represented identically regardless of the UTF-8ness of the string */
2506     if (type == 0) {   /* UTF-8 invariants are returned unchanged */
2507         uv = *s;
2508     }
2509     else {
2510         UV state = PL_strict_utf8_dfa_tab[256 + type];
2511         uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2512 
2513         while (++s < send) {
2514             type  = PL_strict_utf8_dfa_tab[*s];
2515             state = PL_strict_utf8_dfa_tab[256 + state + type];
2516 
2517             uv = UTF8_ACCUMULATE(uv, *s);
2518 
2519             if (state == 0) {
2520 #ifdef EBCDIC
2521                 uv = UNI_TO_NATIVE(uv);
2522 #endif
2523                 goto success;
2524             }
2525 
2526             if (UNLIKELY(state == 1)) {
2527                 break;
2528             }
2529         }
2530 
2531         /* Here is potentially problematic.  Use the full mechanism */
2532         return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2533                                            errors, msgs);
2534     }
2535 
2536   success:
2537     if (retlen) {
2538         *retlen = s - s0 + 1;
2539     }
2540     if (errors) {
2541         *errors = 0;
2542     }
2543     if (msgs) {
2544         *msgs = NULL;
2545     }
2546 
2547     return uv;
2548 }
2549 
2550 PERL_STATIC_INLINE UV
2551 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2552 {
2553     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2554 
2555     assert(s < send);
2556 
2557     if (! ckWARN_d(WARN_UTF8)) {
2558 
2559         /* EMPTY is not really allowed, and asserts on debugging builds.  But
2560          * on non-debugging we have to deal with it, and this causes it to
2561          * return the REPLACEMENT CHARACTER, as the documentation indicates */
2562         return utf8n_to_uvchr(s, send - s, retlen,
2563                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2564     }
2565     else {
2566         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2567         if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2568             *retlen = (STRLEN) -1;
2569         }
2570 
2571         return ret;
2572     }
2573 }
2574 
2575 /* ------------------------------- perl.h ----------------------------- */
2576 
2577 /*
2578 =for apidoc_section $utility
2579 
2580 =for apidoc is_safe_syscall
2581 
2582 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2583 C<NUL> characters.
2584 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2585 category, and return FALSE.
2586 
2587 Return TRUE if the name is safe.
2588 
2589 C<what> and C<op_name> are used in any warning.
2590 
2591 Used by the C<IS_SAFE_SYSCALL()> macro.
2592 
2593 =cut
2594 */
2595 
2596 PERL_STATIC_INLINE bool
2597 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2598 {
2599     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2600      * perl itself uses xce*() functions which accept 8-bit strings.
2601      */
2602 
2603     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2604 
2605     if (len > 1) {
2606         char *null_at;
2607         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2608                 SETERRNO(ENOENT, LIB_INVARG);
2609                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2610                                    "Invalid \\0 character in %s for %s: %s\\0%s",
2611                                    what, op_name, pv, null_at+1);
2612                 return FALSE;
2613         }
2614     }
2615 
2616     return TRUE;
2617 }
2618 
2619 /*
2620 
2621 Return true if the supplied filename has a newline character
2622 immediately before the first (hopefully only) NUL.
2623 
2624 My original look at this incorrectly used the len from SvPV(), but
2625 that's incorrect, since we allow for a NUL in pv[len-1].
2626 
2627 So instead, strlen() and work from there.
2628 
2629 This allow for the user reading a filename, forgetting to chomp it,
2630 then calling:
2631 
2632   open my $foo, "$file\0";
2633 
2634 */
2635 
2636 #ifdef PERL_CORE
2637 
2638 PERL_STATIC_INLINE bool
2639 S_should_warn_nl(const char *pv)
2640 {
2641     STRLEN len;
2642 
2643     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2644 
2645     len = strlen(pv);
2646 
2647     return len > 0 && pv[len-1] == '\n';
2648 }
2649 
2650 #endif
2651 
2652 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2653 
2654 PERL_STATIC_INLINE bool
2655 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2656 {
2657     /* This function determines if the input NV 'nv' may be converted without
2658      * loss of data to an IV.  If not, it returns FALSE taking no other action.
2659      * But if it is possible, it does the conversion, returning TRUE, and
2660      * storing the converted result in '*ivp' */
2661 
2662     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2663 
2664 #  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2665     /* Normally any comparison with a NaN returns false; if we can't rely
2666      * on that behaviour, check explicitly */
2667     if (UNLIKELY(Perl_isnan(nv))) {
2668         return FALSE;
2669     }
2670 #  endif
2671 
2672     /* Written this way so that with an always-false NaN comparison we
2673      * return false */
2674     if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2675         return FALSE;
2676     }
2677 
2678     if ((IV) nv != nv) {
2679         return FALSE;
2680     }
2681 
2682     *ivp = (IV) nv;
2683     return TRUE;
2684 }
2685 
2686 #endif
2687 
2688 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2689 
2690 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2691 
2692 #define MAX_CHARSET_NAME_LENGTH 2
2693 
2694 PERL_STATIC_INLINE const char *
2695 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2696 {
2697     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2698 
2699     /* Returns a string that corresponds to the name of the regex character set
2700      * given by 'flags', and *lenp is set the length of that string, which
2701      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2702 
2703     *lenp = 1;
2704     switch (get_regex_charset(flags)) {
2705         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2706         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2707         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2708         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2709         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2710             *lenp = 2;
2711             return ASCII_MORE_RESTRICT_PAT_MODS;
2712     }
2713     /* The NOT_REACHED; hides an assert() which has a rather complex
2714      * definition in perl.h. */
2715     NOT_REACHED; /* NOTREACHED */
2716     return "?";	    /* Unknown */
2717 }
2718 
2719 #endif
2720 
2721 /*
2722 
2723 Return false if any get magic is on the SV other than taint magic.
2724 
2725 */
2726 
2727 PERL_STATIC_INLINE bool
2728 Perl_sv_only_taint_gmagic(SV *sv)
2729 {
2730     MAGIC *mg = SvMAGIC(sv);
2731 
2732     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2733 
2734     while (mg) {
2735         if (mg->mg_type != PERL_MAGIC_taint
2736             && !(mg->mg_flags & MGf_GSKIP)
2737             && mg->mg_virtual->svt_get) {
2738             return FALSE;
2739         }
2740         mg = mg->mg_moremagic;
2741     }
2742 
2743     return TRUE;
2744 }
2745 
2746 /* ------------------ cop.h ------------------------------------------- */
2747 
2748 /* implement GIMME_V() macro */
2749 
2750 PERL_STATIC_INLINE U8
2751 Perl_gimme_V(pTHX)
2752 {
2753     I32 cxix;
2754     U8  gimme = (PL_op->op_flags & OPf_WANT);
2755 
2756     if (gimme)
2757         return gimme;
2758     cxix = PL_curstackinfo->si_cxsubix;
2759     if (cxix < 0)
2760         return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2761     assert(cxstack[cxix].blk_gimme & G_WANT);
2762     return (cxstack[cxix].blk_gimme & G_WANT);
2763 }
2764 
2765 
2766 /* Enter a block. Push a new base context and return its address. */
2767 
2768 PERL_STATIC_INLINE PERL_CONTEXT *
2769 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2770 {
2771     PERL_CONTEXT * cx;
2772 
2773     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2774 
2775     CXINC;
2776     cx = CX_CUR();
2777     cx->cx_type        = type;
2778     cx->blk_gimme      = gimme;
2779     cx->blk_oldsaveix  = saveix;
2780     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2781     cx->blk_oldcop     = PL_curcop;
2782     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2783     cx->blk_oldscopesp = PL_scopestack_ix;
2784     cx->blk_oldpm      = PL_curpm;
2785     cx->blk_old_tmpsfloor = PL_tmps_floor;
2786 
2787     PL_tmps_floor        = PL_tmps_ix;
2788     CX_DEBUG(cx, "PUSH");
2789     return cx;
2790 }
2791 
2792 
2793 /* Exit a block (RETURN and LAST). */
2794 
2795 PERL_STATIC_INLINE void
2796 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2797 {
2798     PERL_ARGS_ASSERT_CX_POPBLOCK;
2799 
2800     CX_DEBUG(cx, "POP");
2801     /* these 3 are common to cx_popblock and cx_topblock */
2802     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2803     PL_scopestack_ix = cx->blk_oldscopesp;
2804     PL_curpm         = cx->blk_oldpm;
2805 
2806     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2807      * and leaves a CX entry lying around for repeated use, so
2808      * skip for multicall */                  \
2809     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2810             || PL_savestack_ix == cx->blk_oldsaveix);
2811     PL_curcop     = cx->blk_oldcop;
2812     PL_tmps_floor = cx->blk_old_tmpsfloor;
2813 }
2814 
2815 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2816  * Whereas cx_popblock() restores the state to the point just before
2817  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2818  * *after* cx_pushblock() was called. */
2819 
2820 PERL_STATIC_INLINE void
2821 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2822 {
2823     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2824 
2825     CX_DEBUG(cx, "TOP");
2826     /* these 3 are common to cx_popblock and cx_topblock */
2827     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2828     PL_scopestack_ix = cx->blk_oldscopesp;
2829     PL_curpm         = cx->blk_oldpm;
2830 
2831     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2832 }
2833 
2834 
2835 PERL_STATIC_INLINE void
2836 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2837 {
2838     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2839 
2840     PERL_ARGS_ASSERT_CX_PUSHSUB;
2841 
2842     PERL_DTRACE_PROBE_ENTRY(cv);
2843     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2844     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2845     cx->blk_sub.cv = cv;
2846     cx->blk_sub.olddepth = CvDEPTH(cv);
2847     cx->blk_sub.prevcomppad = PL_comppad;
2848     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2849     cx->blk_sub.retop = retop;
2850     SvREFCNT_inc_simple_void_NN(cv);
2851     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2852 }
2853 
2854 
2855 /* subsets of cx_popsub() */
2856 
2857 PERL_STATIC_INLINE void
2858 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2859 {
2860     CV *cv;
2861 
2862     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2863     assert(CxTYPE(cx) == CXt_SUB);
2864 
2865     PL_comppad = cx->blk_sub.prevcomppad;
2866     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2867     cv = cx->blk_sub.cv;
2868     CvDEPTH(cv) = cx->blk_sub.olddepth;
2869     cx->blk_sub.cv = NULL;
2870     SvREFCNT_dec(cv);
2871     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2872 }
2873 
2874 
2875 /* handle the @_ part of leaving a sub */
2876 
2877 PERL_STATIC_INLINE void
2878 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2879 {
2880     AV *av;
2881 
2882     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2883     assert(CxTYPE(cx) == CXt_SUB);
2884     assert(AvARRAY(MUTABLE_AV(
2885         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2886                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2887 
2888     CX_POP_SAVEARRAY(cx);
2889     av = MUTABLE_AV(PAD_SVl(0));
2890     if (UNLIKELY(AvREAL(av)))
2891         /* abandon @_ if it got reified */
2892         clear_defarray(av, 0);
2893     else {
2894         CLEAR_ARGARRAY(av);
2895     }
2896 }
2897 
2898 
2899 PERL_STATIC_INLINE void
2900 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2901 {
2902     PERL_ARGS_ASSERT_CX_POPSUB;
2903     assert(CxTYPE(cx) == CXt_SUB);
2904 
2905     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2906 
2907     if (CxHASARGS(cx))
2908         cx_popsub_args(cx);
2909     cx_popsub_common(cx);
2910 }
2911 
2912 
2913 PERL_STATIC_INLINE void
2914 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2915 {
2916     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2917 
2918     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2919     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2920     cx->blk_format.cv          = cv;
2921     cx->blk_format.retop       = retop;
2922     cx->blk_format.gv          = gv;
2923     cx->blk_format.dfoutgv     = PL_defoutgv;
2924     cx->blk_format.prevcomppad = PL_comppad;
2925     cx->blk_u16                = 0;
2926 
2927     SvREFCNT_inc_simple_void_NN(cv);
2928     CvDEPTH(cv)++;
2929     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2930 }
2931 
2932 
2933 PERL_STATIC_INLINE void
2934 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2935 {
2936     CV *cv;
2937     GV *dfout;
2938 
2939     PERL_ARGS_ASSERT_CX_POPFORMAT;
2940     assert(CxTYPE(cx) == CXt_FORMAT);
2941 
2942     dfout = cx->blk_format.dfoutgv;
2943     setdefout(dfout);
2944     cx->blk_format.dfoutgv = NULL;
2945     SvREFCNT_dec_NN(dfout);
2946 
2947     PL_comppad = cx->blk_format.prevcomppad;
2948     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2949     cv = cx->blk_format.cv;
2950     cx->blk_format.cv = NULL;
2951     --CvDEPTH(cv);
2952     SvREFCNT_dec_NN(cv);
2953     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2954 }
2955 
2956 
2957 PERL_STATIC_INLINE void
2958 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2959 {
2960     cx->blk_eval.retop         = retop;
2961     cx->blk_eval.old_namesv    = namesv;
2962     cx->blk_eval.old_eval_root = PL_eval_root;
2963     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2964     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2965     cx->blk_eval.cur_top_env   = PL_top_env;
2966 
2967     assert(!(PL_in_eval     & ~ 0x3F));
2968     assert(!(PL_op->op_type & ~0x1FF));
2969     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2970 }
2971 
2972 PERL_STATIC_INLINE void
2973 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2974 {
2975     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2976 
2977     Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2978 
2979     cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
2980     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2981 }
2982 
2983 PERL_STATIC_INLINE void
2984 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2985 {
2986     PERL_ARGS_ASSERT_CX_PUSHTRY;
2987 
2988     Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2989 
2990     /* Don't actually change it, just store the current value so it's restored
2991      * by the common popeval */
2992     cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2993 }
2994 
2995 
2996 PERL_STATIC_INLINE void
2997 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2998 {
2999     SV *sv;
3000 
3001     PERL_ARGS_ASSERT_CX_POPEVAL;
3002     assert(CxTYPE(cx) == CXt_EVAL);
3003 
3004     PL_in_eval = CxOLD_IN_EVAL(cx);
3005     assert(!(PL_in_eval & 0xc0));
3006     PL_eval_root = cx->blk_eval.old_eval_root;
3007     sv = cx->blk_eval.cur_text;
3008     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
3009         cx->blk_eval.cur_text = NULL;
3010         SvREFCNT_dec_NN(sv);
3011     }
3012 
3013     sv = cx->blk_eval.old_namesv;
3014     if (sv) {
3015         cx->blk_eval.old_namesv = NULL;
3016         SvREFCNT_dec_NN(sv);
3017     }
3018     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
3019 }
3020 
3021 
3022 /* push a plain loop, i.e.
3023  *     { block }
3024  *     while (cond) { block }
3025  *     for (init;cond;continue) { block }
3026  * This loop can be last/redo'ed etc.
3027  */
3028 
3029 PERL_STATIC_INLINE void
3030 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3031 {
3032     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3033     cx->blk_loop.my_op = cLOOP;
3034 }
3035 
3036 
3037 /* push a true for loop, i.e.
3038  *     for var (list) { block }
3039  */
3040 
3041 PERL_STATIC_INLINE void
3042 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3043 {
3044     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3045 
3046     /* this one line is common with cx_pushloop_plain */
3047     cx->blk_loop.my_op = cLOOP;
3048 
3049     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3050     cx->blk_loop.itersave      = itersave;
3051 #ifdef USE_ITHREADS
3052     cx->blk_loop.oldcomppad = PL_comppad;
3053 #endif
3054 }
3055 
3056 
3057 /* pop all loop types, including plain */
3058 
3059 PERL_STATIC_INLINE void
3060 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3061 {
3062     PERL_ARGS_ASSERT_CX_POPLOOP;
3063 
3064     assert(CxTYPE_is_LOOP(cx));
3065     if (  CxTYPE(cx) == CXt_LOOP_ARY
3066        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3067     {
3068         /* Free ary or cur. This assumes that state_u.ary.ary
3069          * aligns with state_u.lazysv.cur. See cx_dup() */
3070         SV *sv = cx->blk_loop.state_u.lazysv.cur;
3071         cx->blk_loop.state_u.lazysv.cur = NULL;
3072         SvREFCNT_dec_NN(sv);
3073         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3074             sv = cx->blk_loop.state_u.lazysv.end;
3075             cx->blk_loop.state_u.lazysv.end = NULL;
3076             SvREFCNT_dec_NN(sv);
3077         }
3078     }
3079     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3080         SV *cursv;
3081         SV **svp = (cx)->blk_loop.itervar_u.svp;
3082         if ((cx->cx_type & CXp_FOR_GV))
3083             svp = &GvSV((GV*)svp);
3084         cursv = *svp;
3085         *svp = cx->blk_loop.itersave;
3086         cx->blk_loop.itersave = NULL;
3087         SvREFCNT_dec(cursv);
3088     }
3089 }
3090 
3091 
3092 PERL_STATIC_INLINE void
3093 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3094 {
3095     PERL_ARGS_ASSERT_CX_PUSHWHEN;
3096 
3097     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3098 }
3099 
3100 
3101 PERL_STATIC_INLINE void
3102 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3103 {
3104     PERL_ARGS_ASSERT_CX_POPWHEN;
3105     assert(CxTYPE(cx) == CXt_WHEN);
3106 
3107     PERL_UNUSED_ARG(cx);
3108     PERL_UNUSED_CONTEXT;
3109     /* currently NOOP */
3110 }
3111 
3112 
3113 PERL_STATIC_INLINE void
3114 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3115 {
3116     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3117 
3118     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3119     cx->blk_givwhen.defsv_save = orig_defsv;
3120 }
3121 
3122 
3123 PERL_STATIC_INLINE void
3124 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3125 {
3126     SV *sv;
3127 
3128     PERL_ARGS_ASSERT_CX_POPGIVEN;
3129     assert(CxTYPE(cx) == CXt_GIVEN);
3130 
3131     sv = GvSV(PL_defgv);
3132     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3133     cx->blk_givwhen.defsv_save = NULL;
3134     SvREFCNT_dec(sv);
3135 }
3136 
3137 /* ------------------ util.h ------------------------------------------- */
3138 
3139 /*
3140 =for apidoc_section $string
3141 
3142 =for apidoc foldEQ
3143 
3144 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3145 same
3146 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
3147 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
3148 range bytes match only themselves.
3149 
3150 =cut
3151 */
3152 
3153 PERL_STATIC_INLINE I32
3154 Perl_foldEQ(const char *s1, const char *s2, I32 len)
3155 {
3156     const U8 *a = (const U8 *)s1;
3157     const U8 *b = (const U8 *)s2;
3158 
3159     PERL_ARGS_ASSERT_FOLDEQ;
3160 
3161     assert(len >= 0);
3162 
3163     while (len--) {
3164         if (*a != *b && *a != PL_fold[*b])
3165             return 0;
3166         a++,b++;
3167     }
3168     return 1;
3169 }
3170 
3171 PERL_STATIC_INLINE I32
3172 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
3173 {
3174     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
3175      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3176      * does not check for this.  Nor does it check that the strings each have
3177      * at least 'len' characters. */
3178 
3179     const U8 *a = (const U8 *)s1;
3180     const U8 *b = (const U8 *)s2;
3181 
3182     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3183 
3184     assert(len >= 0);
3185 
3186     while (len--) {
3187         if (*a != *b && *a != PL_fold_latin1[*b]) {
3188             return 0;
3189         }
3190         a++, b++;
3191     }
3192     return 1;
3193 }
3194 
3195 /*
3196 =for apidoc_section $locale
3197 =for apidoc foldEQ_locale
3198 
3199 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3200 same case-insensitively in the current locale; false otherwise.
3201 
3202 =cut
3203 */
3204 
3205 PERL_STATIC_INLINE I32
3206 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
3207 {
3208     const U8 *a = (const U8 *)s1;
3209     const U8 *b = (const U8 *)s2;
3210 
3211     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3212 
3213     assert(len >= 0);
3214 
3215     while (len--) {
3216         if (*a != *b && *a != PL_fold_locale[*b])
3217             return 0;
3218         a++,b++;
3219     }
3220     return 1;
3221 }
3222 
3223 /*
3224 =for apidoc_section $string
3225 =for apidoc my_strnlen
3226 
3227 The C library C<strnlen> if available, or a Perl implementation of it.
3228 
3229 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3230 characters.  It will never attempt to address more than C<maxlen>
3231 characters, making it suitable for use with strings that are not
3232 guaranteed to be NUL-terminated.
3233 
3234 =cut
3235 
3236 Description stolen from http://man.openbsd.org/strnlen.3,
3237 implementation stolen from PostgreSQL.
3238 */
3239 #ifndef HAS_STRNLEN
3240 
3241 PERL_STATIC_INLINE Size_t
3242 Perl_my_strnlen(const char *str, Size_t maxlen)
3243 {
3244     const char *end = (char *) memchr(str, '\0', maxlen);
3245 
3246     PERL_ARGS_ASSERT_MY_STRNLEN;
3247 
3248     if (end == NULL) return maxlen;
3249     return end - str;
3250 }
3251 
3252 #endif
3253 
3254 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3255 
3256 PERL_STATIC_INLINE void *
3257 S_my_memrchr(const char * s, const char c, const STRLEN len)
3258 {
3259     /* memrchr(), since many platforms lack it */
3260 
3261     const char * t = s + len - 1;
3262 
3263     PERL_ARGS_ASSERT_MY_MEMRCHR;
3264 
3265     while (t >= s) {
3266         if (*t == c) {
3267             return (void *) t;
3268         }
3269         t--;
3270     }
3271 
3272     return NULL;
3273 }
3274 
3275 #endif
3276 
3277 PERL_STATIC_INLINE char *
3278 Perl_mortal_getenv(const char * str)
3279 {
3280     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3281      *
3282      * It's (mostly) thread-safe because it uses a mutex to prevent other
3283      * threads (that look at this mutex) from destroying the result before this
3284      * routine has a chance to copy the result to a place that won't be
3285      * destroyed before the caller gets a chance to handle it.  That place is a
3286      * mortal SV.  khw chose this over SAVEFREEPV because he is under the
3287      * impression that the SV will hang around longer under more circumstances
3288      *
3289      * The reason it isn't completely thread-safe is that other code could
3290      * simply not pay attention to the mutex.  All of the Perl core uses the
3291      * mutex, but it is possible for code from, say XS, to not use this mutex,
3292      * defeating the safety.
3293      *
3294      * getenv() returns, in some implementations, a pointer to a spot in the
3295      * **environ array, which could be invalidated at any time by this or
3296      * another thread changing the environment.  Other implementations copy the
3297      * **environ value to a static buffer, returning a pointer to that.  That
3298      * buffer might or might not be invalidated by a getenv() call in another
3299      * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
3300      * many getenv() calls can safely be running simultaneously, so a
3301      * many-reader (but no simultaneous writers) lock is ok.  There is a
3302      * Configure probe to see if another thread destroys the buffer, and the
3303      * mutex is defined accordingly.
3304      *
3305      * But in all cases, using the mutex prevents these problems, as long as
3306      * all code uses the same mutex.
3307      *
3308      * A complication is that this can be called during phases where the
3309      * mortalization process isn't available.  These are in interpreter
3310      * destruction or early in construction.  khw believes that at these times
3311      * there shouldn't be anything else going on, so plain getenv is safe AS
3312      * LONG AS the caller acts on the return before calling it again. */
3313 
3314     char * ret;
3315     dTHX;
3316 
3317     PERL_ARGS_ASSERT_MORTAL_GETENV;
3318 
3319     /* Can't mortalize without stacks.  khw believes that no other threads
3320      * should be running, so no need to lock things, and this may be during a
3321      * phase when locking isn't even available */
3322     if (UNLIKELY(PL_scopestack_ix == 0)) {
3323         return getenv(str);
3324     }
3325 
3326 #ifdef PERL_MEM_LOG
3327 
3328     /* A major complication arises under PERL_MEM_LOG.  When that is active,
3329      * every memory allocation may result in logging, depending on the value of
3330      * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
3331      * saving ENV{foo}'s value (but before saving it), the logging code will
3332      * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
3333      * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3334      * lock a boolean mutex recursively); 3) destroying the getenv() static
3335      * buffer; or 4) destroying the temporary created by this for the copy
3336      * causes a log entry to be made which could cause a new temporary to be
3337      * created, which will need to be destroyed at some point, leading to an
3338      * infinite loop.
3339      *
3340      * The solution adopted here (after some gnashing of teeth) is to detect
3341      * the recursive calls and calls from the logger, and treat them specially.
3342      * Let's say we want to do getenv("foo").  We first find
3343      * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3344      * variable, so no temporary is required.  Then we do getenv(foo}, and in
3345      * the process of creating a temporary to save it, this function will be
3346      * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
3347      * we detect that it is such a call and return our saved value instead of
3348      * locking and doing a new getenv().  This solves all of problems 1), 2),
3349      * and 3).  Because all the getenv()s are done while the mutex is locked,
3350      * the state cannot have changed.  To solve 4), we don't create a temporary
3351      * when this is called from the logging code.  That code disposes of the
3352      * return value while the mutex is still locked.
3353      *
3354      * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3355      * digits and 3 particular letters are significant; the rest are ignored by
3356      * the memory logging code.  Thus the per-interpreter variable only needs
3357      * to be large enough to save the significant information, the size of
3358      * which is known at compile time.  The first byte is extra, reserved for
3359      * flags for our use.  To protect against overflowing, only the reserved
3360      * byte, as many digits as don't overflow, and the three letters are
3361      * stored.
3362      *
3363      * The reserved byte has two bits:
3364      *      0x1 if set indicates that if we get here, it is a recursive call of
3365      *          getenv()
3366      *      0x2 if set indicates that the call is from the logging code.
3367      *
3368      * If the flag indicates this is a recursive call, just return the stored
3369      * value of PL_mem_log;  An empty value gets turned into NULL. */
3370     if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3371         if (PL_mem_log[1] == '\0') {
3372             return NULL;
3373         } else {
3374             return PL_mem_log + 1;
3375         }
3376     }
3377 
3378 #endif
3379 
3380     GETENV_LOCK;
3381 
3382 #ifdef PERL_MEM_LOG
3383 
3384     /* Here we are in a critical section.  As explained above, we do our own
3385      * getenv(PERL_MEM_LOG), saving the result safely. */
3386     ret = getenv("PERL_MEM_LOG");
3387     if (ret == NULL) {  /* No logging active */
3388 
3389         /* Return that immediately if called from the logging code */
3390         if (PL_mem_log[0] & 0x2) {
3391             GETENV_UNLOCK;
3392             return NULL;
3393         }
3394 
3395         PL_mem_log[1] = '\0';
3396     }
3397     else {
3398         char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
3399 
3400         /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3401          * extremely long string.  But we want only a few characters from it.
3402          * PL_mem_log has been made large enough to hold just the ones we need.
3403          * First the file descriptor. */
3404         if (isDIGIT(*ret)) {
3405             const char * s = ret;
3406             if (UNLIKELY(*s == '0')) {
3407 
3408                 /* Reduce multiple leading zeros to a single one.  This is to
3409                  * allow the caller to change what to do with leading zeros. */
3410                 *mem_log_meat++ = '0';
3411                 s++;
3412                 while (*s == '0') {
3413                     s++;
3414                 }
3415             }
3416 
3417             /* If the input overflows, copy just enough for the result to also
3418              * overflow, plus 1 to make sure */
3419             while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3420                 *mem_log_meat++ = *s++;
3421             }
3422         }
3423 
3424         /* Then each of the three significant characters */
3425         if (strchr(ret, 'm')) {
3426             *mem_log_meat++ = 'm';
3427         }
3428         if (strchr(ret, 's')) {
3429             *mem_log_meat++ = 's';
3430         }
3431         if (strchr(ret, 't')) {
3432             *mem_log_meat++ = 't';
3433         }
3434         *mem_log_meat = '\0';
3435 
3436         assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3437     }
3438 
3439     /* If we are being called from the logger, it only needs the significant
3440      * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3441     if (PL_mem_log[0] & 0x2) {
3442         assert(strEQ(str, "PERL_MEM_LOG"));
3443         GETENV_UNLOCK;
3444         return PL_mem_log + 1;
3445     }
3446 
3447     /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
3448      * is coming from other than the logging code, so it should be treated the
3449      * same as any other getenv(), returning the full value, not just the
3450      * significant part, and having its value saved.  Set the flag that
3451      * indicates any call to this routine will be a recursion from here */
3452     PL_mem_log[0] = 0x1;
3453 
3454 #endif
3455 
3456     /* Now get the value of the real desired variable, and save a copy */
3457     ret = getenv(str);
3458 
3459     if (ret != NULL) {
3460         ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3461     }
3462 
3463     GETENV_UNLOCK;
3464 
3465 #ifdef PERL_MEM_LOG
3466 
3467     /* Clear the buffer */
3468     Zero(PL_mem_log, sizeof(PL_mem_log), char);
3469 
3470 #endif
3471 
3472     return ret;
3473 }
3474 
3475 PERL_STATIC_INLINE bool
3476 Perl_sv_isbool(pTHX_ const SV *sv)
3477 {
3478     return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
3479         (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
3480 }
3481 
3482 #ifdef USE_ITHREADS
3483 
3484 PERL_STATIC_INLINE AV *
3485 Perl_cop_file_avn(pTHX_ const COP *cop) {
3486 
3487     PERL_ARGS_ASSERT_COP_FILE_AVN;
3488 
3489     const char *file = CopFILE(cop);
3490     if (file) {
3491         GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3492         if (gv) {
3493             return GvAVn(gv);
3494         }
3495         else
3496             return NULL;
3497      }
3498      else
3499          return NULL;
3500 }
3501 
3502 #endif
3503 
3504 /*
3505  * ex: set ts=8 sts=4 sw=4 et:
3506  */
3507