xref: /openbsd-src/gnu/usr.bin/perl/utf8.c (revision 1ad61ae0a79a724d2d3ec69e69c8e1d1ff6b53a0)
1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
13  *  heard of that we don't want to see any closer; and that's the one place
14  *  we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17  *
18  * 'Well do I understand your speech,' he answered in the same language;
19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
20  *  as is the custom in the West, if you wish to be answered?'
21  *                           --Gandalf, addressing Théoden's door wardens
22  *
23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24  *
25  * ...the travellers perceived that the floor was paved with stones of many
26  * hues; branching runes and strange devices intertwined beneath their feet.
27  *
28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29  */
30 
31 #include "EXTERN.h"
32 #define PERL_IN_UTF8_C
33 #include "perl.h"
34 #include "invlist_inline.h"
35 
36 static const char malformed_text[] = "Malformed UTF-8 character";
37 static const char unees[] =
38                         "Malformed UTF-8 character (unexpected end of string)";
39 
40 /* strlen() of a literal string constant.  We might want this more general,
41  * but using it in just this file for now.  A problem with more generality is
42  * the compiler warnings about comparing unlike signs */
43 #define STRLENs(s)  (sizeof("" s "") - 1)
44 
45 /*
46 These are various utility functions for manipulating UTF8-encoded
47 strings.  For the uninitiated, this is a method of representing arbitrary
48 Unicode characters as a variable number of bytes, in such a way that
49 characters in the ASCII range are unmodified, and a zero byte never appears
50 within non-zero characters.
51 */
52 
53 /* helper for Perl__force_out_malformed_utf8_message(). Like
54  * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
55  * PL_compiling */
56 
57 static void
58 S_restore_cop_warnings(pTHX_ void *p)
59 {
60     free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
61 }
62 
63 
64 void
65 Perl__force_out_malformed_utf8_message(pTHX_
66             const U8 *const p,      /* First byte in UTF-8 sequence */
67             const U8 * const e,     /* Final byte in sequence (may include
68                                        multiple chars */
69             const U32 flags,        /* Flags to pass to utf8n_to_uvchr(),
70                                        usually 0, or some DISALLOW flags */
71             const bool die_here)    /* If TRUE, this function does not return */
72 {
73     /* This core-only function is to be called when a malformed UTF-8 character
74      * is found, in order to output the detailed information about the
75      * malformation before dieing.  The reason it exists is for the occasions
76      * when such a malformation is fatal, but warnings might be turned off, so
77      * that normally they would not be actually output.  This ensures that they
78      * do get output.  Because a sequence may be malformed in more than one
79      * way, multiple messages may be generated, so we can't make them fatal, as
80      * that would cause the first one to die.
81      *
82      * Instead we pretend -W was passed to perl, then die afterwards.  The
83      * flexibility is here to return to the caller so they can finish up and
84      * die themselves */
85     U32 errors;
86 
87     PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
88 
89     ENTER;
90     SAVEI8(PL_dowarn);
91     SAVESPTR(PL_curcop);
92 
93     PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
94     if (PL_curcop) {
95         /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
96          * than PL_compiling */
97         SAVEDESTRUCTOR_X(S_restore_cop_warnings,
98                 (void*)PL_curcop->cop_warnings);
99         PL_curcop->cop_warnings = pWARN_ALL;
100     }
101 
102     (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
103 
104     LEAVE;
105 
106     if (! errors) {
107         Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
108                          " be called only when there are errors found");
109     }
110 
111     if (die_here) {
112         Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
113     }
114 }
115 
116 STATIC HV *
117 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
118                    U32 categories,  /* Packed warning categories */
119                    U32 flag)        /* Flag associated with this message */
120 {
121     /* Creates, populates, and returns an HV* that describes an error message
122      * for the translators between UTF8 and code point */
123 
124     SV* msg_sv = newSVpv(message, 0);
125     SV* category_sv = newSVuv(categories);
126     SV* flag_bit_sv = newSVuv(flag);
127 
128     HV* msg_hv = newHV();
129 
130     PERL_ARGS_ASSERT_NEW_MSG_HV;
131 
132     (void) hv_stores(msg_hv, "text", msg_sv);
133     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
134     (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
135 
136     return msg_hv;
137 }
138 
139 /*
140 =for apidoc uvoffuni_to_utf8_flags
141 
142 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
143 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
144 L<perlapi/uvchr_to_utf8_flags>>.
145 
146 This function is like them, but the input is a strict Unicode
147 (as opposed to native) code point.  Only in very rare circumstances should code
148 not be using the native code point.
149 
150 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
151 
152 =cut
153 */
154 
155 U8 *
156 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
157 {
158     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
159 
160     return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
161 }
162 
163 /* All these formats take a single UV code point argument */
164 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
165 const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
166                                    " is not recommended for open interchange";
167 const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
168                                    " may not be portable";
169 
170 /*  Use shorter names internally in this file */
171 #define SHIFT   UTF_ACCUMULATION_SHIFT
172 #undef  MARK
173 #define MARK    UTF_CONTINUATION_MARK
174 #define MASK    UTF_CONTINUATION_MASK
175 
176 /*
177 =for apidoc uvchr_to_utf8_flags_msgs
178 
179 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
180 
181 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
182 
183 This function is for code that wants any warning and/or error messages to be
184 returned to the caller rather than be displayed.  All messages that would have
185 been displayed if all lexical warnings are enabled will be returned.
186 
187 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
188 placed after all the others, C<msgs>.  If this parameter is 0, this function
189 behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
190 be a pointer to an C<HV *> variable, in which this function creates a new HV to
191 contain any appropriate messages.  The hash has three key-value pairs, as
192 follows:
193 
194 =over 4
195 
196 =item C<text>
197 
198 The text of the message as a C<SVpv>.
199 
200 =item C<warn_categories>
201 
202 The warning category (or categories) packed into a C<SVuv>.
203 
204 =item C<flag>
205 
206 A single flag bit associated with this message, in a C<SVuv>.
207 The bit corresponds to some bit in the C<*errors> return value,
208 such as C<UNICODE_GOT_SURROGATE>.
209 
210 =back
211 
212 It's important to note that specifying this parameter as non-null will cause
213 any warnings this function would otherwise generate to be suppressed, and
214 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
215 (or not) when choosing what to do with the returned messages.
216 
217 The caller, of course, is responsible for freeing any returned HV.
218 
219 =cut
220 */
221 
222 /* Undocumented; we don't want people using this.  Instead they should use
223  * uvchr_to_utf8_flags_msgs() */
224 U8 *
225 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
226 {
227     U8 *p;
228     UV shifted_uv = input_uv;
229     STRLEN utf8_skip = OFFUNISKIP(input_uv);
230 
231     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
232 
233     if (msgs) {
234         *msgs = NULL;
235     }
236 
237     switch (utf8_skip) {
238       case 1:
239         *d++ = LATIN1_TO_NATIVE(input_uv);
240         return d;
241 
242       default:
243         if (   UNLIKELY(input_uv > MAX_LEGAL_CP
244             && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
245         {
246             Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
247                                                          NULL, 0, input_uv));
248         }
249 
250         if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
251             U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
252             const char * format = PL_extended_cp_format;
253             if (msgs) {
254                 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
255                                    category,
256                                    UNICODE_GOT_PERL_EXTENDED);
257             }
258             else {
259                 Perl_ck_warner_d(aTHX_ category, format, input_uv);
260             }
261 
262             /* Don't output a 2nd msg */
263             flags &= ~UNICODE_WARN_SUPER;
264         }
265 
266         if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
267             return NULL;
268         }
269 
270         p = d + utf8_skip - 1;
271         while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
272             *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
273             shifted_uv >>= SHIFT;
274         }
275 
276         /* FALLTHROUGH */
277 
278       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
279         d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
280                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
281         shifted_uv >>= SHIFT;
282         /* FALLTHROUGH */
283 
284       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
285         d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
286                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
287         shifted_uv >>= SHIFT;
288         /* FALLTHROUGH */
289 
290       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
291         if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
292             if (flags & UNICODE_WARN_SUPER) {
293                 U32 category = packWARN(WARN_NON_UNICODE);
294                 const char * format = super_cp_format;
295 
296                 if (msgs) {
297                     *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
298                                        category,
299                                        UNICODE_GOT_SUPER);
300                 }
301                 else {
302                     Perl_ck_warner_d(aTHX_ category, format, input_uv);
303                 }
304 
305                 if (flags & UNICODE_DISALLOW_SUPER) {
306                     return NULL;
307                 }
308             }
309             if (       (flags & UNICODE_DISALLOW_SUPER)
310                 || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
311                     &&  UNICODE_IS_PERL_EXTENDED(input_uv)))
312             {
313                 return NULL;
314             }
315         }
316 
317         d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
318                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
319         shifted_uv >>= SHIFT;
320         /* FALLTHROUGH */
321 
322       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
323         if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
324             if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
325                 if (flags & UNICODE_WARN_NONCHAR) {
326                     U32 category = packWARN(WARN_NONCHAR);
327                     const char * format = nonchar_cp_format;
328                     if (msgs) {
329                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
330                                            category,
331                                            UNICODE_GOT_NONCHAR);
332                     }
333                     else {
334                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
335                     }
336                 }
337                 if (flags & UNICODE_DISALLOW_NONCHAR) {
338                     return NULL;
339                 }
340             }
341             else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
342                 if (flags & UNICODE_WARN_SURROGATE) {
343                     U32 category = packWARN(WARN_SURROGATE);
344                     const char * format = surrogate_cp_format;
345                     if (msgs) {
346                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
347                                            category,
348                                            UNICODE_GOT_SURROGATE);
349                     }
350                     else {
351                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
352                     }
353                 }
354                 if (flags & UNICODE_DISALLOW_SURROGATE) {
355                     return NULL;
356                 }
357             }
358         }
359 
360         d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
361                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
362         shifted_uv >>= SHIFT;
363         /* FALLTHROUGH */
364 
365 #ifdef EBCDIC
366 
367       case 3:
368         d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
369         shifted_uv >>= SHIFT;
370         /* FALLTHROUGH */
371 
372 #endif
373 
374         /* FALLTHROUGH */
375       case 2:
376         d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
377         shifted_uv >>= SHIFT;
378         d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
379                                              | UTF_START_MARK(utf8_skip));
380         break;
381     }
382 
383     return d + utf8_skip;
384 }
385 
386 /*
387 =for apidoc uvchr_to_utf8
388 
389 Adds the UTF-8 representation of the native code point C<uv> to the end
390 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
391 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
392 the byte after the end of the new character.  In other words,
393 
394     d = uvchr_to_utf8(d, uv);
395 
396 is the recommended wide native character-aware way of saying
397 
398     *(d++) = uv;
399 
400 This function accepts any code point from 0..C<IV_MAX> as input.
401 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
402 
403 It is possible to forbid or warn on non-Unicode code points, or those that may
404 be problematic by using L</uvchr_to_utf8_flags>.
405 
406 =cut
407 */
408 
409 /* This is also a macro */
410 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
411 
412 U8 *
413 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
414 {
415     return uvchr_to_utf8(d, uv);
416 }
417 
418 /*
419 =for apidoc uvchr_to_utf8_flags
420 
421 Adds the UTF-8 representation of the native code point C<uv> to the end
422 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
423 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
424 the byte after the end of the new character.  In other words,
425 
426     d = uvchr_to_utf8_flags(d, uv, flags);
427 
428 or, in most cases,
429 
430     d = uvchr_to_utf8_flags(d, uv, 0);
431 
432 This is the Unicode-aware way of saying
433 
434     *(d++) = uv;
435 
436 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
437 input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
438 
439 Specifying C<flags> can further restrict what is allowed and not warned on, as
440 follows:
441 
442 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
443 the function will raise a warning, provided UTF8 warnings are enabled.  If
444 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
445 NULL.  If both flags are set, the function will both warn and return NULL.
446 
447 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
448 affect how the function handles a Unicode non-character.
449 
450 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
451 affect the handling of code points that are above the Unicode maximum of
452 0x10FFFF.  Languages other than Perl may not be able to accept files that
453 contain these.
454 
455 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
456 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
457 three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
458 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
459 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
460 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
461 above-Unicode and surrogate flags, but not the non-character ones, as
462 defined in
463 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
464 See L<perlunicode/Noncharacter code points>.
465 
466 Extremely high code points were never specified in any standard, and require an
467 extension to UTF-8 to express, which Perl does.  It is likely that programs
468 written in something other than Perl would not be able to read files that
469 contain these; nor would Perl understand files written by something that uses a
470 different extension.  For these reasons, there is a separate set of flags that
471 can warn and/or disallow these extremely high code points, even if other
472 above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
473 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
474 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
475 treat all above-Unicode code points, including these, as malformations.  (Note
476 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
477 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
478 
479 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
480 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
481 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
482 C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
483 platforms,these flags can apply to code points that actually do fit in 31 bits.
484 The new names accurately describe the situation in all cases.
485 
486 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
487 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
488 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
489 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
490 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
491 =for apidoc Amnh||UNICODE_DISALLOW_SUPER
492 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
493 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
494 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
495 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
496 =for apidoc Amnh||UNICODE_WARN_NONCHAR
497 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
498 =for apidoc Amnh||UNICODE_WARN_SUPER
499 =for apidoc Amnh||UNICODE_WARN_SURROGATE
500 
501 =cut
502 */
503 
504 /* This is also a macro */
505 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
506 
507 U8 *
508 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
509 {
510     return uvchr_to_utf8_flags(d, uv, flags);
511 }
512 
513 PERL_STATIC_INLINE int
514 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
515 {
516     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
517      * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
518      * it isn't, and -1 if there isn't enough information to tell.  This last
519      * return value can happen if the sequence is incomplete, missing some
520      * trailing bytes that would form a complete character.  If there are
521      * enough bytes to make a definitive decision, this function does so.
522      * Usually 2 bytes are sufficient.
523      *
524      * Overlongs can occur whenever the number of continuation bytes changes.
525      * That means whenever the number of leading 1 bits in a start byte
526      * increases from the next lower start byte.  That happens for start bytes
527      * C0, E0, F0, F8, FC, FE, and FF.
528      */
529 
530     PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
531 
532     /* Each platform has overlongs after the start bytes given above (expressed
533      * in I8 for EBCDIC).  The values below were found by manually inspecting
534      * the UTF-8 patterns.  See the tables in utf8.h and utfebcdic.h. */
535 
536     switch (NATIVE_UTF8_TO_I8(s[0])) {
537       default:
538         assert(UTF8_IS_START(s[0]));
539         return 0;
540 
541       case 0xC0:
542       case 0xC1:
543         return 1;
544 
545 #ifdef EBCDIC
546 
547       case 0xC2:
548       case 0xC3:
549       case 0xC4:
550       case 0xE0:
551         return 1;
552 #else
553       case 0xE0:
554         return (len < 2) ? -1 : s[1] < 0xA0;
555 #endif
556 
557       case 0xF0:
558         return (len < 2)
559                ? -1
560                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
561       case 0xF8:
562         return (len < 2)
563                ? -1
564                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
565       case 0xFC:
566         return (len < 2)
567                ? -1
568                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
569       case 0xFE:
570         return (len < 2)
571                ? -1
572                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
573       case 0xFF:
574         return isFF_overlong(s, len);
575     }
576 }
577 
578 PERL_STATIC_INLINE int
579 S_isFF_overlong(const U8 * const s, const STRLEN len)
580 {
581     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
582      * 'e' - 1 is an overlong beginning with \xFF.  It returns 1 if it is; 0 if
583      * it isn't, and -1 if there isn't enough information to tell.  This last
584      * return value can happen if the sequence is incomplete, missing some
585      * trailing bytes that would form a complete character.  If there are
586      * enough bytes to make a definitive decision, this function does so. */
587 
588     PERL_ARGS_ASSERT_ISFF_OVERLONG;
589 
590 #ifdef EBCDIC
591     /* This works on all three EBCDIC code pages traditionally supported by
592      * perl */
593 #  define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
594 #else
595 #  define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
596 #endif
597 
598     /* To be an FF overlong, all the available bytes must match */
599     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
600                      MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
601     {
602         return 0;
603     }
604 
605     /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
606      * be there; what comes after them doesn't matter.  See tables in utf8.h,
607      * utfebcdic.h. */
608     if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
609         return 1;
610     }
611 
612     /* The missing bytes could cause the result to go one way or the other, so
613      * the result is indeterminate */
614     return -1;
615 }
616 
617 /* At some point we may want to allow core to use up to UV_MAX */
618 
619 #ifdef EBCDIC     /* Actually is I8 */
620 #  if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
621 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA7"
622                               /* UV_MAX "\xFF\xAF" */
623 #  else      /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
624 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
625                               /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
626 #  endif
627 #else
628 #  if defined(UV_IS_QUAD)
629 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\x80\x87"
630                               /* UV_MAX "\xFF\x80" */
631 #  else
632 #    define HIGHEST_REPRESENTABLE_UTF  "\xFD"
633                               /* UV_MAX "\xFE\x83" */
634 #  endif
635 #endif
636 
637 PERL_STATIC_INLINE int
638 S_does_utf8_overflow(const U8 * const s,
639                      const U8 * e,
640                      const bool consider_overlongs)
641 {
642     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
643      * 'e' - 1 would overflow an IV on this platform; that is if it represents
644      * a code point larger than the highest representable code point.  It
645      * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
646      * enough information to tell.  This last return value can happen if the
647      * sequence is incomplete, missing some trailing bytes that would form a
648      * complete character.  If there are enough bytes to make a definitive
649      * decision, this function does so.
650      *
651      * If 'consider_overlongs' is TRUE, the function checks for the possibility
652      * that the sequence is an overlong that doesn't overflow.  Otherwise, it
653      * assumes the sequence is not an overlong.  This can give different
654      * results only on ASCII 32-bit platforms.
655      *
656      * (For ASCII platforms, we could use memcmp() because we don't have to
657      * convert each byte to I8, but it's very rare input indeed that would
658      * approach overflow, so the loop below will likely only get executed once.)
659      *
660      */
661     const STRLEN len = e - s;
662     const U8 *x;
663     const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
664     int is_overlong = 0;
665 
666     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
667 
668     for (x = s; x < e; x++, y++) {
669 
670         /* 'y' is set up to not include the trailing bytes that are all the
671          * maximum possible continuation byte.  So when we reach the end of 'y'
672          * (known to be NUL terminated), it is impossible for 'x' to contain
673          * bytes larger than those omitted bytes, and therefore 'x' can't
674          * overflow */
675         if (*y == '\0') {
676             return 0;
677         }
678 
679         /* If this byte is less than the corresponding highest non-overflowing
680          * UTF-8, the sequence doesn't overflow */
681         if (NATIVE_UTF8_TO_I8(*x) < *y) {
682             return 0;
683         }
684 
685         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
686             goto overflows_if_not_overlong;
687         }
688     }
689 
690     /* Got to the end, and all bytes are the same.  If the input is a whole
691      * character, it doesn't overflow.  And if it is a partial character,
692      * there's not enough information to tell */
693     return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
694 
695   overflows_if_not_overlong:
696 
697     /* Here, a well-formed sequence overflows.  If we are assuming
698      * well-formedness, return that it overflows. */
699     if (! consider_overlongs) {
700         return 1;
701     }
702 
703     /* Here, it could be the overlong malformation, and might not actuallly
704      * overflow if you were to calculate it out.
705      *
706      * See if it actually is overlong */
707     is_overlong = is_utf8_overlong(s, len);
708 
709     /* If it isn't overlong, is well-formed, so overflows */
710     if (is_overlong == 0) {
711         return 1;
712     }
713 
714     /* Not long enough to determine */
715     if (is_overlong < 0) {
716         return -1;
717     }
718 
719     /* Here, it appears to overflow, but it is also overlong */
720 
721 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
722 
723     /* On many platforms, it is impossible for an overlong to overflow.  For
724      * these, no further work is necessary: we can return immediately that this
725      * overlong that is an apparent overflow actually isn't
726      *
727      * To see why, note that a length_N sequence can represent as overlongs all
728      * the code points representable by shorter length sequences, but no
729      * higher.  If it could represent a higher code point without being an
730      * overlong, we wouldn't have had to increase the sequence length!
731      *
732      * The highest possible start byte is FF; the next highest is FE.  The
733      * highest code point representable as an overlong on the platform is thus
734      * the highest code point representable by a non-overlong sequence whose
735      * start byte is FE.  If that value doesn't overflow the platform's word
736      * size, overlongs can't overflow.
737      *
738      * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
739      * information (the high 7 bits, all ones, say that the sequence is 7 bytes
740      * long, and the bottom, zero, bit is s placeholder. That leaves the 6
741      * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
742       If that number of bits doesn't exceed the word size, it can't overflow. */
743 
744     return 0;
745 
746 #else
747 
748     /* In practice, only a 32-bit ASCII box gets here.  The FE start byte can
749      * represent, as an overlong, the highest code point representable by an FD
750      * start byte, which is 5*6 continuation bytes of info plus one bit from
751      * the start byte, or 31 bits.  That doesn't overflow.  More explicitly:
752      * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
753      *
754      * That means only the FF start byte can have an overflowing overlong. */
755     if (*s < 0xFF) {
756         return 0;
757     }
758 
759     /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
760      * evaluates to 2**31, so overflows an IV.  For a UV it's
761      *              \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
762 #  define OVERFLOWS  "\xff\x80\x80\x80\x80\x80\x80\x82"
763 
764     if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) {   /* Not enough info */
765          return -1;
766     }
767 
768 #  define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
769 
770     return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
771 
772 #endif
773 
774 }
775 
776 STRLEN
777 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
778 {
779     SSize_t len, full_len;
780 
781     /* An internal helper function.
782      *
783      * On input:
784      *  's' is a string, which is known to be syntactically valid UTF-8 as far
785      *      as (e - 1); e > s must hold.
786      *  'e' This function is allowed to look at any byte from 's'...'e-1', but
787      *      nowhere else.  The function has to cope as best it can if that
788      *      sequence does not form a full character.
789      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
790      *      accepted by L</utf8n_to_uvchr>.  If non-zero, this function returns
791      *      0 if it determines the input will match something disallowed.
792      * On output:
793      *  The return is the number of bytes required to represent the code point
794      *  if it isn't disallowed by 'flags'; 0 otherwise.  Be aware that if the
795      *  input is for a partial character, a successful return will be larger
796      *  than 'e - s'.
797      *
798      *  If *s..*(e-1) is only for a partial character, the function will return
799      *  non-zero if there is any sequence of well-formed UTF-8 that, when
800      *  appended to the input sequence, could result in an allowed code point;
801      *  otherwise it returns 0.  Non characters cannot be determined based on
802      *  partial character input.  But many  of the other excluded types can be
803      *  determined with just the first one or two bytes.
804      *
805      */
806 
807     PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
808 
809     assert(e > s);
810     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
811                           |UTF8_DISALLOW_PERL_EXTENDED)));
812 
813     full_len = UTF8SKIP(s);
814 
815     len = e - s;
816     if (len > full_len) {
817         e = s + full_len;
818         len = full_len;
819     }
820 
821     switch (full_len) {
822         bool is_super;
823 
824       default: /* Extended */
825         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
826             return 0;
827         }
828 
829         /* FALLTHROUGH */
830 
831       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
832       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
833 
834         if (flags & UTF8_DISALLOW_SUPER) {
835             return 0;                       /* Above Unicode */
836         }
837 
838         return full_len;
839 
840       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
841         is_super = (   UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
842                     || (   len > 1
843                         && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
844                         && NATIVE_UTF8_TO_I8(s[1])
845                                                 >= UTF_FIRST_CONT_BYTE_110000_));
846         if (is_super) {
847             if (flags & UTF8_DISALLOW_SUPER) {
848                 return 0;
849             }
850         }
851         else if (   (flags & UTF8_DISALLOW_NONCHAR)
852                  && len == full_len
853                  && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
854         {
855             return 0;
856         }
857 
858         return full_len;
859 
860       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
861 
862         if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
863             return full_len;
864         }
865 
866         if (   (flags & UTF8_DISALLOW_SURROGATE)
867             &&  UNLIKELY(is_SURROGATE_utf8(s)))
868         {
869             return 0;       /* Surrogate */
870         }
871 
872         if (  (flags & UTF8_DISALLOW_NONCHAR)
873             && len == full_len
874             && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
875         {
876             return 0;
877         }
878 
879         return full_len;
880 
881       /* The lower code points don't have any disallowable characters */
882 #ifdef EBCDIC
883       case 3:
884         return full_len;
885 #endif
886 
887       case 2:
888       case 1:
889         return full_len;
890     }
891 }
892 
893 Size_t
894 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
895                         const bool require_partial)
896 {
897     /* This is called to determine if the UTF-8 sequence starting at s0 and
898      * continuing for up to one full character of bytes, but looking no further
899      * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
900      * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
901      * for the largest code points recognized by Perl, the ones that require
902      * the most UTF-8 bytes per character to represent (somewhat less than
903      * twice the size of the next longest kind).  This sequence will only ever
904      * be Perl extended UTF-8.
905      *
906      * The routine returns 0 if the sequence is not fully valid, syntactically
907      * or semantically.  That means it checks that everything following the
908      * start byte is a continuation byte, and that it doesn't overflow, nor is
909      * an overlong representation.
910      *
911      * If 'require_partial' is FALSE, the routine returns non-zero only if the
912      * input (as far as 'e-1') is a full character.  The return is the count of
913      * the bytes in the character.
914      *
915      * If 'require_partial' is TRUE, the routine returns non-zero only if the
916      * input as far as 'e-1' is a partial, not full character, with no
917      * malformations found before position 'e'.  The return is either just
918      * FALSE, or TRUE.  */
919 
920     const U8 *s = s0 + 1;
921     const U8 *send = e;
922 
923     PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
924 
925     assert(s0 < e);
926     assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
927 
928     send = s + MIN(UTF8_MAXBYTES - 1, e - s);
929     while (s < send) {
930         if (! UTF8_IS_CONTINUATION(*s)) {
931             return 0;
932         }
933 
934         s++;
935     }
936 
937     if (0 < does_utf8_overflow(s0, e,
938                                FALSE /* Don't consider_overlongs */
939     )) {
940         return 0;
941     }
942 
943     if (0 < isFF_overlong(s0, e - s0)) {
944         return 0;
945     }
946 
947     /* Here, the character is valid as far as it got.  Check if got a partial
948      * character */
949     if (s - s0 < UTF8_MAXBYTES) {
950         return (require_partial) ? 1 : 0;
951     }
952 
953     /* Here, got a full character */
954     return (require_partial) ? 0 : UTF8_MAXBYTES;
955 }
956 
957 char *
958 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
959 {
960     /* Returns a mortalized C string that is a displayable copy of the 'len'
961      * bytes starting at 'start'.  'format' gives how to display each byte.
962      * Currently, there are only two formats, so it is currently a bool:
963      *      0   \xab
964      *      1    ab         (that is a space between two hex digit bytes)
965      */
966 
967     const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
968                                                trailing NUL */
969     const U8 * s = start;
970     const U8 * const e = start + len;
971     char * output;
972     char * d;
973 
974     PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
975 
976     Newx(output, output_len, char);
977     SAVEFREEPV(output);
978 
979     d = output;
980     for (s = start; s < e; s++) {
981         const unsigned high_nibble = (*s & 0xF0) >> 4;
982         const unsigned low_nibble =  (*s & 0x0F);
983 
984         if (format) {
985             if (s > start) {
986                 *d++ = ' ';
987             }
988         }
989         else {
990             *d++ = '\\';
991             *d++ = 'x';
992         }
993 
994         if (high_nibble < 10) {
995             *d++ = high_nibble + '0';
996         }
997         else {
998             *d++ = high_nibble - 10 + 'a';
999         }
1000 
1001         if (low_nibble < 10) {
1002             *d++ = low_nibble + '0';
1003         }
1004         else {
1005             *d++ = low_nibble - 10 + 'a';
1006         }
1007     }
1008 
1009     *d = '\0';
1010     return output;
1011 }
1012 
1013 PERL_STATIC_INLINE char *
1014 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1015 
1016                                          /* Max number of bytes to print */
1017                                          STRLEN print_len,
1018 
1019                                          /* Which one is the non-continuation */
1020                                          const STRLEN non_cont_byte_pos,
1021 
1022                                          /* How many bytes should there be? */
1023                                          const STRLEN expect_len)
1024 {
1025     /* Return the malformation warning text for an unexpected continuation
1026      * byte. */
1027 
1028     const char * const where = (non_cont_byte_pos == 1)
1029                                ? "immediately"
1030                                : Perl_form(aTHX_ "%d bytes",
1031                                                  (int) non_cont_byte_pos);
1032     const U8 * x = s + non_cont_byte_pos;
1033     const U8 * e = s + print_len;
1034 
1035     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1036 
1037     /* We don't need to pass this parameter, but since it has already been
1038      * calculated, it's likely faster to pass it; verify under DEBUGGING */
1039     assert(expect_len == UTF8SKIP(s));
1040 
1041     /* As a defensive coding measure, don't output anything past a NUL.  Such
1042      * bytes shouldn't be in the middle of a malformation, and could mark the
1043      * end of the allocated string, and what comes after is undefined */
1044     for (; x < e; x++) {
1045         if (*x == '\0') {
1046             x++;            /* Output this particular NUL */
1047             break;
1048         }
1049     }
1050 
1051     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1052                            " %s after start byte 0x%02x; need %d bytes, got %d)",
1053                            malformed_text,
1054                            _byte_dump_string(s, x - s, 0),
1055                            *(s + non_cont_byte_pos),
1056                            where,
1057                            *s,
1058                            (int) expect_len,
1059                            (int) non_cont_byte_pos);
1060 }
1061 
1062 /*
1063 
1064 =for apidoc utf8n_to_uvchr
1065 
1066 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1067 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1068 directly.
1069 
1070 Bottom level UTF-8 decode routine.
1071 Returns the native code point value of the first character in the string C<s>,
1072 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1073 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1074 the length, in bytes, of that character.
1075 
1076 The value of C<flags> determines the behavior when C<s> does not point to a
1077 well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
1078 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1079 is the next possible position in C<s> that could begin a non-malformed
1080 character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
1081 is raised.  Some UTF-8 input sequences may contain multiple malformations.
1082 This function tries to find every possible one in each call, so multiple
1083 warnings can be raised for the same sequence.
1084 
1085 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1086 individual types of malformations, such as the sequence being overlong (that
1087 is, when there is a shorter sequence that can express the same code point;
1088 overlong sequences are expressly forbidden in the UTF-8 standard due to
1089 potential security issues).  Another malformation example is the first byte of
1090 a character not being a legal first byte.  See F<utf8.h> for the list of such
1091 flags.  Even if allowed, this function generally returns the Unicode
1092 REPLACEMENT CHARACTER when it encounters a malformation.  There are flags in
1093 F<utf8.h> to override this behavior for the overlong malformations, but don't
1094 do that except for very specialized purposes.
1095 
1096 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1097 flags) malformation is found.  If this flag is set, the routine assumes that
1098 the caller will raise a warning, and this function will silently just set
1099 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1100 
1101 Note that this API requires disambiguation between successful decoding a C<NUL>
1102 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1103 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1104 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
1105 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
1106 error.  Or you can use C<L</utf8n_to_uvchr_error>>.
1107 
1108 Certain code points are considered problematic.  These are Unicode surrogates,
1109 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1110 By default these are considered regular code points, but certain situations
1111 warrant special handling for them, which can be specified using the C<flags>
1112 parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1113 three classes are treated as malformations and handled as such.  The flags
1114 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1115 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1116 disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1117 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1118 Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1119 definition given by
1120 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1121 The difference between traditional strictness and C9 strictness is that the
1122 latter does not forbid non-character code points.  (They are still discouraged,
1123 however.)  For more discussion see L<perlunicode/Noncharacter code points>.
1124 
1125 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1126 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1127 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1128 raised for their respective categories, but otherwise the code points are
1129 considered valid (not malformations).  To get a category to both be treated as
1130 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1131 (But note that warnings are not raised if lexically disabled nor if
1132 C<UTF8_CHECK_ONLY> is also specified.)
1133 
1134 Extremely high code points were never specified in any standard, and require an
1135 extension to UTF-8 to express, which Perl does.  It is likely that programs
1136 written in something other than Perl would not be able to read files that
1137 contain these; nor would Perl understand files written by something that uses a
1138 different extension.  For these reasons, there is a separate set of flags that
1139 can warn and/or disallow these extremely high code points, even if other
1140 above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
1141 C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
1142 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
1143 above-Unicode code points, including these, as malformations.
1144 (Note that the Unicode standard considers anything above 0x10FFFF to be
1145 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1146 (2**31 -1))
1147 
1148 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1149 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
1150 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1151 C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
1152 can apply to code points that actually do fit in 31 bits.  This happens on
1153 EBCDIC platforms, and sometimes when the L<overlong
1154 malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
1155 describe the situation in all cases.
1156 
1157 
1158 All other code points corresponding to Unicode characters, including private
1159 use and those yet to be assigned, are never considered malformed and never
1160 warn.
1161 
1162 =for apidoc Amnh||UTF8_CHECK_ONLY
1163 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1164 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1165 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1166 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1167 =for apidoc Amnh||UTF8_DISALLOW_SUPER
1168 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1169 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1170 =for apidoc Amnh||UTF8_WARN_SURROGATE
1171 =for apidoc Amnh||UTF8_WARN_NONCHAR
1172 =for apidoc Amnh||UTF8_WARN_SUPER
1173 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1174 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1175 
1176 =cut
1177 
1178 Also implemented as a macro in utf8.h
1179 */
1180 
1181 UV
1182 Perl_utf8n_to_uvchr(const U8 *s,
1183                     STRLEN curlen,
1184                     STRLEN *retlen,
1185                     const U32 flags)
1186 {
1187     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1188 
1189     return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1190 }
1191 
1192 /*
1193 
1194 =for apidoc utf8n_to_uvchr_error
1195 
1196 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1197 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1198 directly.
1199 
1200 This function is for code that needs to know what the precise malformation(s)
1201 are when an error is found.  If you also need to know the generated warning
1202 messages, use L</utf8n_to_uvchr_msgs>() instead.
1203 
1204 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1205 all the others, C<errors>.  If this parameter is 0, this function behaves
1206 identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
1207 to a C<U32> variable, which this function sets to indicate any errors found.
1208 Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
1209 C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
1210 of these bits will be set if a malformation is found, even if the input
1211 C<flags> parameter indicates that the given malformation is allowed; those
1212 exceptions are noted:
1213 
1214 =over 4
1215 
1216 =item C<UTF8_GOT_PERL_EXTENDED>
1217 
1218 The input sequence is not standard UTF-8, but a Perl extension.  This bit is
1219 set only if the input C<flags> parameter contains either the
1220 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1221 
1222 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1223 and so some extension must be used to express them.  Perl uses a natural
1224 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1225 extension to represent even higher ones, so that any code point that fits in a
1226 64-bit word can be represented.  Text using these extensions is not likely to
1227 be portable to non-Perl code.  We lump both of these extensions together and
1228 refer to them as Perl extended UTF-8.  There exist other extensions that people
1229 have invented, incompatible with Perl's.
1230 
1231 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1232 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1233 than on ASCII.  Prior to that, code points 2**31 and higher were simply
1234 unrepresentable, and a different, incompatible method was used to represent
1235 code points between 2**30 and 2**31 - 1.
1236 
1237 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1238 Perl extended UTF-8 is used.
1239 
1240 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1241 may use for backward compatibility.  That name is misleading, as this flag may
1242 be set when the code point actually does fit in 31 bits.  This happens on
1243 EBCDIC platforms, and sometimes when the L<overlong
1244 malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
1245 describes the situation in all cases.
1246 
1247 =item C<UTF8_GOT_CONTINUATION>
1248 
1249 The input sequence was malformed in that the first byte was a UTF-8
1250 continuation byte.
1251 
1252 =item C<UTF8_GOT_EMPTY>
1253 
1254 The input C<curlen> parameter was 0.
1255 
1256 =item C<UTF8_GOT_LONG>
1257 
1258 The input sequence was malformed in that there is some other sequence that
1259 evaluates to the same code point, but that sequence is shorter than this one.
1260 
1261 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1262 it was discovered that this created security issues.
1263 
1264 =item C<UTF8_GOT_NONCHAR>
1265 
1266 The code point represented by the input UTF-8 sequence is for a Unicode
1267 non-character code point.
1268 This bit is set only if the input C<flags> parameter contains either the
1269 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1270 
1271 =item C<UTF8_GOT_NON_CONTINUATION>
1272 
1273 The input sequence was malformed in that a non-continuation type byte was found
1274 in a position where only a continuation type one should be.  See also
1275 C<L</UTF8_GOT_SHORT>>.
1276 
1277 =item C<UTF8_GOT_OVERFLOW>
1278 
1279 The input sequence was malformed in that it is for a code point that is not
1280 representable in the number of bits available in an IV on the current platform.
1281 
1282 =item C<UTF8_GOT_SHORT>
1283 
1284 The input sequence was malformed in that C<curlen> is smaller than required for
1285 a complete sequence.  In other words, the input is for a partial character
1286 sequence.
1287 
1288 
1289 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1290 sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1291 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1292 sequence was looked at.   If no other flags are present, it means that the
1293 sequence was valid as far as it went.  Depending on the application, this could
1294 mean one of three things:
1295 
1296 =over
1297 
1298 =item *
1299 
1300 The C<curlen> length parameter passed in was too small, and the function was
1301 prevented from examining all the necessary bytes.
1302 
1303 =item *
1304 
1305 The buffer being looked at is based on reading data, and the data received so
1306 far stopped in the middle of a character, so that the next read will
1307 read the remainder of this character.  (It is up to the caller to deal with the
1308 split bytes somehow.)
1309 
1310 =item *
1311 
1312 This is a real error, and the partial sequence is all we're going to get.
1313 
1314 =back
1315 
1316 =item C<UTF8_GOT_SUPER>
1317 
1318 The input sequence was malformed in that it is for a non-Unicode code point;
1319 that is, one above the legal Unicode maximum.
1320 This bit is set only if the input C<flags> parameter contains either the
1321 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1322 
1323 =item C<UTF8_GOT_SURROGATE>
1324 
1325 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1326 code point.
1327 This bit is set only if the input C<flags> parameter contains either the
1328 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1329 
1330 =back
1331 
1332 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1333 flag to suppress any warnings, and then examine the C<*errors> return.
1334 
1335 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1336 =for apidoc Amnh||UTF8_GOT_CONTINUATION
1337 =for apidoc Amnh||UTF8_GOT_EMPTY
1338 =for apidoc Amnh||UTF8_GOT_LONG
1339 =for apidoc Amnh||UTF8_GOT_NONCHAR
1340 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1341 =for apidoc Amnh||UTF8_GOT_OVERFLOW
1342 =for apidoc Amnh||UTF8_GOT_SHORT
1343 =for apidoc Amnh||UTF8_GOT_SUPER
1344 =for apidoc Amnh||UTF8_GOT_SURROGATE
1345 
1346 =cut
1347 
1348 Also implemented as a macro in utf8.h
1349 */
1350 
1351 UV
1352 Perl_utf8n_to_uvchr_error(const U8 *s,
1353                           STRLEN curlen,
1354                           STRLEN *retlen,
1355                           const U32 flags,
1356                           U32 * errors)
1357 {
1358     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1359 
1360     return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1361 }
1362 
1363 /*
1364 
1365 =for apidoc utf8n_to_uvchr_msgs
1366 
1367 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1368 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1369 directly.
1370 
1371 This function is for code that needs to know what the precise malformation(s)
1372 are when an error is found, and wants the corresponding warning and/or error
1373 messages to be returned to the caller rather than be displayed.  All messages
1374 that would have been displayed if all lexical warnings are enabled will be
1375 returned.
1376 
1377 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1378 placed after all the others, C<msgs>.  If this parameter is 0, this function
1379 behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
1380 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1381 contain any appropriate messages.  The elements of the array are ordered so
1382 that the first message that would have been displayed is in the 0th element,
1383 and so on.  Each element is a hash with three key-value pairs, as follows:
1384 
1385 =over 4
1386 
1387 =item C<text>
1388 
1389 The text of the message as a C<SVpv>.
1390 
1391 =item C<warn_categories>
1392 
1393 The warning category (or categories) packed into a C<SVuv>.
1394 
1395 =item C<flag>
1396 
1397 A single flag bit associated with this message, in a C<SVuv>.
1398 The bit corresponds to some bit in the C<*errors> return value,
1399 such as C<UTF8_GOT_LONG>.
1400 
1401 =back
1402 
1403 It's important to note that specifying this parameter as non-null will cause
1404 any warnings this function would otherwise generate to be suppressed, and
1405 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
1406 (or not) when choosing what to do with the returned messages.
1407 
1408 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1409 no AV is created.
1410 
1411 The caller, of course, is responsible for freeing any returned AV.
1412 
1413 =cut
1414 */
1415 
1416 UV
1417 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1418                                STRLEN curlen,
1419                                STRLEN *retlen,
1420                                const U32 flags,
1421                                U32 * errors,
1422                                AV ** msgs)
1423 {
1424     const U8 * const s0 = s;
1425     const U8 * send = s0 + curlen;
1426     U32 possible_problems;  /* A bit is set here for each potential problem
1427                                found as we go along */
1428     UV uv;
1429     STRLEN expectlen;     /* How long should this sequence be? */
1430     STRLEN avail_len;     /* When input is too short, gives what that is */
1431     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
1432                              gets set and discarded */
1433 
1434     /* The below are used only if there is both an overlong malformation and a
1435      * too short one.  Otherwise the first two are set to 's0' and 'send', and
1436      * the third not used at all */
1437     U8 * adjusted_s0;
1438     U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1439                                             routine; see [perl #130921] */
1440     UV uv_so_far;
1441     dTHX;
1442 
1443     PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1444 
1445     /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1446      * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1447      * syllables that the dfa doesn't properly handle.  Quickly dispose of the
1448      * final case. */
1449 
1450     /* Each of the affected Hanguls starts with \xED */
1451 
1452     if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1453         if (retlen) {
1454             *retlen = 3;
1455         }
1456         if (errors) {
1457             *errors = 0;
1458         }
1459         if (msgs) {
1460             *msgs = NULL;
1461         }
1462 
1463         return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1464              | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1465              |  (s0[2] & UTF_CONTINUATION_MASK);
1466     }
1467 
1468     /* In conjunction with the exhaustive tests that can be enabled in
1469      * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1470      * what it is intended to do, and that no flaws in it are masked by
1471      * dropping down and executing the code below
1472     assert(! isUTF8_CHAR(s0, send)
1473           || UTF8_IS_SURROGATE(s0, send)
1474           || UTF8_IS_SUPER(s0, send)
1475           || UTF8_IS_NONCHAR(s0,send));
1476     */
1477 
1478     s = s0;
1479     possible_problems = 0;
1480     expectlen = 0;
1481     avail_len = 0;
1482     discard_errors = 0;
1483     adjusted_s0 = (U8 *) s0;
1484     uv_so_far = 0;
1485 
1486     if (errors) {
1487         *errors = 0;
1488     }
1489     else {
1490         errors = &discard_errors;
1491     }
1492 
1493     /* The order of malformation tests here is important.  We should consume as
1494      * few bytes as possible in order to not skip any valid character.  This is
1495      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1496      * https://unicode.org/reports/tr36 for more discussion as to why.  For
1497      * example, once we've done a UTF8SKIP, we can tell the expected number of
1498      * bytes, and could fail right off the bat if the input parameters indicate
1499      * that there are too few available.  But it could be that just that first
1500      * byte is garbled, and the intended character occupies fewer bytes.  If we
1501      * blindly assumed that the first byte is correct, and skipped based on
1502      * that number, we could skip over a valid input character.  So instead, we
1503      * always examine the sequence byte-by-byte.
1504      *
1505      * We also should not consume too few bytes, otherwise someone could inject
1506      * things.  For example, an input could be deliberately designed to
1507      * overflow, and if this code bailed out immediately upon discovering that,
1508      * returning to the caller C<*retlen> pointing to the very next byte (one
1509      * which is actually part of the overflowing sequence), that could look
1510      * legitimate to the caller, which could discard the initial partial
1511      * sequence and process the rest, inappropriately.
1512      *
1513      * Some possible input sequences are malformed in more than one way.  This
1514      * function goes to lengths to try to find all of them.  This is necessary
1515      * for correctness, as the inputs may allow one malformation but not
1516      * another, and if we abandon searching for others after finding the
1517      * allowed one, we could allow in something that shouldn't have been.
1518      */
1519 
1520     if (UNLIKELY(curlen == 0)) {
1521         possible_problems |= UTF8_GOT_EMPTY;
1522         curlen = 0;
1523         uv = UNICODE_REPLACEMENT;
1524         goto ready_to_handle_errors;
1525     }
1526 
1527     /* We now know we can examine the first byte of the input */
1528     expectlen = UTF8SKIP(s);
1529     uv = *s;
1530 
1531     /* A well-formed UTF-8 character, as the vast majority of calls to this
1532      * function will be for, has this expected length.  For efficiency, set
1533      * things up here to return it.  It will be overriden only in those rare
1534      * cases where a malformation is found */
1535     if (retlen) {
1536         *retlen = expectlen;
1537     }
1538 
1539     /* A continuation character can't start a valid sequence */
1540     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1541         possible_problems |= UTF8_GOT_CONTINUATION;
1542         curlen = 1;
1543         uv = UNICODE_REPLACEMENT;
1544         goto ready_to_handle_errors;
1545     }
1546 
1547     /* Here is not a continuation byte, nor an invariant.  The only thing left
1548      * is a start byte (possibly for an overlong).  (We can't use UTF8_IS_START
1549      * because it excludes start bytes like \xC0 that always lead to
1550      * overlongs.) */
1551 
1552     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1553      * that indicate the number of bytes in the character's whole UTF-8
1554      * sequence, leaving just the bits that are part of the value.  */
1555     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1556 
1557     /* Setup the loop end point, making sure to not look past the end of the
1558      * input string, and flag it as too short if the size isn't big enough. */
1559     if (UNLIKELY(curlen < expectlen)) {
1560         possible_problems |= UTF8_GOT_SHORT;
1561         avail_len = curlen;
1562     }
1563     else {
1564         send = (U8*) s0 + expectlen;
1565     }
1566 
1567     /* Now, loop through the remaining bytes in the character's sequence,
1568      * accumulating each into the working value as we go. */
1569     for (s = s0 + 1; s < send; s++) {
1570         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1571             uv = UTF8_ACCUMULATE(uv, *s);
1572             continue;
1573         }
1574 
1575         /* Here, found a non-continuation before processing all expected bytes.
1576          * This byte indicates the beginning of a new character, so quit, even
1577          * if allowing this malformation. */
1578         possible_problems |= UTF8_GOT_NON_CONTINUATION;
1579         break;
1580     } /* End of loop through the character's bytes */
1581 
1582     /* Save how many bytes were actually in the character */
1583     curlen = s - s0;
1584 
1585     /* Note that there are two types of too-short malformation.  One is when
1586      * there is actual wrong data before the normal termination of the
1587      * sequence.  The other is that the sequence wasn't complete before the end
1588      * of the data we are allowed to look at, based on the input 'curlen'.
1589      * This means that we were passed data for a partial character, but it is
1590      * valid as far as we saw.  The other is definitely invalid.  This
1591      * distinction could be important to a caller, so the two types are kept
1592      * separate.
1593      *
1594      * A convenience macro that matches either of the too-short conditions.  */
1595 #   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1596 
1597     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1598         uv_so_far = uv;
1599         uv = UNICODE_REPLACEMENT;
1600     }
1601 
1602     /* Check for overflow.  The algorithm requires us to not look past the end
1603      * of the current character, even if partial, so the upper limit is 's' */
1604     if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1605                                          1 /* Do consider overlongs */
1606                                         )))
1607     {
1608         possible_problems |= UTF8_GOT_OVERFLOW;
1609         uv = UNICODE_REPLACEMENT;
1610     }
1611 
1612     /* Check for overlong.  If no problems so far, 'uv' is the correct code
1613      * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
1614      * we must look at the UTF-8 byte sequence itself to see if it is for an
1615      * overlong */
1616     if (     (   LIKELY(! possible_problems)
1617               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1618         || (       UNLIKELY(possible_problems)
1619             && (   UNLIKELY(! UTF8_IS_START(*s0))
1620                 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
1621     {
1622         possible_problems |= UTF8_GOT_LONG;
1623 
1624         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
1625 
1626                           /* The calculation in the 'true' branch of this 'if'
1627                            * below won't work if overflows, and isn't needed
1628                            * anyway.  Further below we handle all overflow
1629                            * cases */
1630             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1631         {
1632             UV min_uv = uv_so_far;
1633             STRLEN i;
1634 
1635             /* Here, the input is both overlong and is missing some trailing
1636              * bytes.  There is no single code point it could be for, but there
1637              * may be enough information present to determine if what we have
1638              * so far is for an unallowed code point, such as for a surrogate.
1639              * The code further below has the intelligence to determine this,
1640              * but just for non-overlong UTF-8 sequences.  What we do here is
1641              * calculate the smallest code point the input could represent if
1642              * there were no too short malformation.  Then we compute and save
1643              * the UTF-8 for that, which is what the code below looks at
1644              * instead of the raw input.  It turns out that the smallest such
1645              * code point is all we need. */
1646             for (i = curlen; i < expectlen; i++) {
1647                 min_uv = UTF8_ACCUMULATE(min_uv,
1648                                 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1649             }
1650 
1651             adjusted_s0 = temp_char_buf;
1652             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1653         }
1654     }
1655 
1656     /* Here, we have found all the possible problems, except for when the input
1657      * is for a problematic code point not allowed by the input parameters. */
1658 
1659                                 /* uv is valid for overlongs */
1660     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1661                    && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
1662             || (   UNLIKELY(possible_problems)
1663 
1664                           /* if overflow, we know without looking further
1665                            * precisely which of the problematic types it is,
1666                            * and we deal with those in the overflow handling
1667                            * code */
1668                 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1669                 && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1670                     || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1671         && ((flags & ( UTF8_DISALLOW_NONCHAR
1672                       |UTF8_DISALLOW_SURROGATE
1673                       |UTF8_DISALLOW_SUPER
1674                       |UTF8_DISALLOW_PERL_EXTENDED
1675                       |UTF8_WARN_NONCHAR
1676                       |UTF8_WARN_SURROGATE
1677                       |UTF8_WARN_SUPER
1678                       |UTF8_WARN_PERL_EXTENDED))))
1679     {
1680         /* If there were no malformations, or the only malformation is an
1681          * overlong, 'uv' is valid */
1682         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1683             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1684                 possible_problems |= UTF8_GOT_SURROGATE;
1685             }
1686             else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1687                 possible_problems |= UTF8_GOT_SUPER;
1688             }
1689             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1690                 possible_problems |= UTF8_GOT_NONCHAR;
1691             }
1692         }
1693         else {  /* Otherwise, need to look at the source UTF-8, possibly
1694                    adjusted to be non-overlong */
1695 
1696             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1697                                                     > UTF_START_BYTE_110000_))
1698             {
1699                 possible_problems |= UTF8_GOT_SUPER;
1700             }
1701             else if (curlen > 1) {
1702                 if (UNLIKELY(   NATIVE_UTF8_TO_I8(*adjusted_s0)
1703                                                 == UTF_START_BYTE_110000_
1704                              && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1705                                                 >= UTF_FIRST_CONT_BYTE_110000_))
1706                 {
1707                     possible_problems |= UTF8_GOT_SUPER;
1708                 }
1709                 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1710                     possible_problems |= UTF8_GOT_SURROGATE;
1711                 }
1712             }
1713 
1714             /* We need a complete well-formed UTF-8 character to discern
1715              * non-characters, so can't look for them here */
1716         }
1717     }
1718 
1719   ready_to_handle_errors:
1720 
1721     /* At this point:
1722      * curlen               contains the number of bytes in the sequence that
1723      *                      this call should advance the input by.
1724      * avail_len            gives the available number of bytes passed in, but
1725      *                      only if this is less than the expected number of
1726      *                      bytes, based on the code point's start byte.
1727      * possible_problems    is 0 if there weren't any problems; otherwise a bit
1728      *                      is set in it for each potential problem found.
1729      * uv                   contains the code point the input sequence
1730      *                      represents; or if there is a problem that prevents
1731      *                      a well-defined value from being computed, it is
1732      *                      some subsitute value, typically the REPLACEMENT
1733      *                      CHARACTER.
1734      * s0                   points to the first byte of the character
1735      * s                    points to just after were we left off processing
1736      *                      the character
1737      * send                 points to just after where that character should
1738      *                      end, based on how many bytes the start byte tells
1739      *                      us should be in it, but no further than s0 +
1740      *                      avail_len
1741      */
1742 
1743     if (UNLIKELY(possible_problems)) {
1744         bool disallowed = FALSE;
1745         const U32 orig_problems = possible_problems;
1746 
1747         if (msgs) {
1748             *msgs = NULL;
1749         }
1750 
1751         while (possible_problems) { /* Handle each possible problem */
1752             U32 pack_warn = 0;
1753             char * message = NULL;
1754             U32 this_flag_bit = 0;
1755 
1756             /* Each 'if' clause handles one problem.  They are ordered so that
1757              * the first ones' messages will be displayed before the later
1758              * ones; this is kinda in decreasing severity order.  But the
1759              * overlong must come last, as it changes 'uv' looked at by the
1760              * others */
1761             if (possible_problems & UTF8_GOT_OVERFLOW) {
1762 
1763                 /* Overflow means also got a super and are using Perl's
1764                  * extended UTF-8, but we handle all three cases here */
1765                 possible_problems
1766                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1767                 *errors |= UTF8_GOT_OVERFLOW;
1768 
1769                 /* But the API says we flag all errors found */
1770                 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1771                     *errors |= UTF8_GOT_SUPER;
1772                 }
1773                 if (flags
1774                         & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1775                 {
1776                     *errors |= UTF8_GOT_PERL_EXTENDED;
1777                 }
1778 
1779                 /* Disallow if any of the three categories say to */
1780                 if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
1781                     || (flags & ( UTF8_DISALLOW_SUPER
1782                                  |UTF8_DISALLOW_PERL_EXTENDED)))
1783                 {
1784                     disallowed = TRUE;
1785                 }
1786 
1787                 /* Likewise, warn if any say to */
1788                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)
1789                     ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1790                 {
1791 
1792                     /* The warnings code explicitly says it doesn't handle the
1793                      * case of packWARN2 and two categories which have
1794                      * parent-child relationship.  Even if it works now to
1795                      * raise the warning if either is enabled, it wouldn't
1796                      * necessarily do so in the future.  We output (only) the
1797                      * most dire warning */
1798                     if (! (flags & UTF8_CHECK_ONLY)) {
1799                         if (msgs || ckWARN_d(WARN_UTF8)) {
1800                             pack_warn = packWARN(WARN_UTF8);
1801                         }
1802                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1803                             pack_warn = packWARN(WARN_NON_UNICODE);
1804                         }
1805                         if (pack_warn) {
1806                             message = Perl_form(aTHX_ "%s: %s (overflows)",
1807                                             malformed_text,
1808                                             _byte_dump_string(s0, curlen, 0));
1809                             this_flag_bit = UTF8_GOT_OVERFLOW;
1810                         }
1811                     }
1812                 }
1813             }
1814             else if (possible_problems & UTF8_GOT_EMPTY) {
1815                 possible_problems &= ~UTF8_GOT_EMPTY;
1816                 *errors |= UTF8_GOT_EMPTY;
1817 
1818                 if (! (flags & UTF8_ALLOW_EMPTY)) {
1819 
1820                     /* This so-called malformation is now treated as a bug in
1821                      * the caller.  If you have nothing to decode, skip calling
1822                      * this function */
1823                     assert(0);
1824 
1825                     disallowed = TRUE;
1826                     if (  (msgs
1827                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1828                     {
1829                         pack_warn = packWARN(WARN_UTF8);
1830                         message = Perl_form(aTHX_ "%s (empty string)",
1831                                                    malformed_text);
1832                         this_flag_bit = UTF8_GOT_EMPTY;
1833                     }
1834                 }
1835             }
1836             else if (possible_problems & UTF8_GOT_CONTINUATION) {
1837                 possible_problems &= ~UTF8_GOT_CONTINUATION;
1838                 *errors |= UTF8_GOT_CONTINUATION;
1839 
1840                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1841                     disallowed = TRUE;
1842                     if ((   msgs
1843                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1844                     {
1845                         pack_warn = packWARN(WARN_UTF8);
1846                         message = Perl_form(aTHX_
1847                                 "%s: %s (unexpected continuation byte 0x%02x,"
1848                                 " with no preceding start byte)",
1849                                 malformed_text,
1850                                 _byte_dump_string(s0, 1, 0), *s0);
1851                         this_flag_bit = UTF8_GOT_CONTINUATION;
1852                     }
1853                 }
1854             }
1855             else if (possible_problems & UTF8_GOT_SHORT) {
1856                 possible_problems &= ~UTF8_GOT_SHORT;
1857                 *errors |= UTF8_GOT_SHORT;
1858 
1859                 if (! (flags & UTF8_ALLOW_SHORT)) {
1860                     disallowed = TRUE;
1861                     if ((   msgs
1862                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1863                     {
1864                         pack_warn = packWARN(WARN_UTF8);
1865                         message = Perl_form(aTHX_
1866                              "%s: %s (too short; %d byte%s available, need %d)",
1867                              malformed_text,
1868                              _byte_dump_string(s0, send - s0, 0),
1869                              (int)avail_len,
1870                              avail_len == 1 ? "" : "s",
1871                              (int)expectlen);
1872                         this_flag_bit = UTF8_GOT_SHORT;
1873                     }
1874                 }
1875 
1876             }
1877             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1878                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1879                 *errors |= UTF8_GOT_NON_CONTINUATION;
1880 
1881                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1882                     disallowed = TRUE;
1883                     if ((   msgs
1884                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1885                     {
1886 
1887                         /* If we don't know for sure that the input length is
1888                          * valid, avoid as much as possible reading past the
1889                          * end of the buffer */
1890                         int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1891                                        ? (int) (s - s0)
1892                                        : (int) (send - s0);
1893                         pack_warn = packWARN(WARN_UTF8);
1894                         message = Perl_form(aTHX_ "%s",
1895                             unexpected_non_continuation_text(s0,
1896                                                             printlen,
1897                                                             s - s0,
1898                                                             (int) expectlen));
1899                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
1900                     }
1901                 }
1902             }
1903             else if (possible_problems & UTF8_GOT_SURROGATE) {
1904                 possible_problems &= ~UTF8_GOT_SURROGATE;
1905 
1906                 if (flags & UTF8_WARN_SURROGATE) {
1907                     *errors |= UTF8_GOT_SURROGATE;
1908 
1909                     if (   ! (flags & UTF8_CHECK_ONLY)
1910                         && (msgs || ckWARN_d(WARN_SURROGATE)))
1911                     {
1912                         pack_warn = packWARN(WARN_SURROGATE);
1913 
1914                         /* These are the only errors that can occur with a
1915                         * surrogate when the 'uv' isn't valid */
1916                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1917                             message = Perl_form(aTHX_
1918                                     "UTF-16 surrogate (any UTF-8 sequence that"
1919                                     " starts with \"%s\" is for a surrogate)",
1920                                     _byte_dump_string(s0, curlen, 0));
1921                         }
1922                         else {
1923                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
1924                         }
1925                         this_flag_bit = UTF8_GOT_SURROGATE;
1926                     }
1927                 }
1928 
1929                 if (flags & UTF8_DISALLOW_SURROGATE) {
1930                     disallowed = TRUE;
1931                     *errors |= UTF8_GOT_SURROGATE;
1932                 }
1933             }
1934             else if (possible_problems & UTF8_GOT_SUPER) {
1935                 possible_problems &= ~UTF8_GOT_SUPER;
1936 
1937                 if (flags & UTF8_WARN_SUPER) {
1938                     *errors |= UTF8_GOT_SUPER;
1939 
1940                     if (   ! (flags & UTF8_CHECK_ONLY)
1941                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
1942                     {
1943                         pack_warn = packWARN(WARN_NON_UNICODE);
1944 
1945                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1946                             message = Perl_form(aTHX_
1947                                     "Any UTF-8 sequence that starts with"
1948                                     " \"%s\" is for a non-Unicode code point,"
1949                                     " may not be portable",
1950                                     _byte_dump_string(s0, curlen, 0));
1951                         }
1952                         else {
1953                             message = Perl_form(aTHX_ super_cp_format, uv);
1954                         }
1955                         this_flag_bit = UTF8_GOT_SUPER;
1956                     }
1957                 }
1958 
1959                 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1960                  * and before possibly bailing out, so that the more dire
1961                  * warning will override the regular one. */
1962                 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
1963                     if (  ! (flags & UTF8_CHECK_ONLY)
1964                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1965                         &&  (msgs || (   ckWARN_d(WARN_NON_UNICODE)
1966                                       || ckWARN(WARN_PORTABLE))))
1967                     {
1968                         pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
1969 
1970                         /* If it is an overlong that evaluates to a code point
1971                          * that doesn't have to use the Perl extended UTF-8, it
1972                          * still used it, and so we output a message that
1973                          * doesn't refer to the code point.  The same is true
1974                          * if there was a SHORT malformation where the code
1975                          * point is not valid.  In that case, 'uv' will have
1976                          * been set to the REPLACEMENT CHAR, and the message
1977                          * below without the code point in it will be selected
1978                          * */
1979                         if (UNICODE_IS_PERL_EXTENDED(uv)) {
1980                             message = Perl_form(aTHX_
1981                                             PL_extended_cp_format, uv);
1982                         }
1983                         else {
1984                             message = Perl_form(aTHX_
1985                                         "Any UTF-8 sequence that starts with"
1986                                         " \"%s\" is a Perl extension, and"
1987                                         " so is not portable",
1988                                         _byte_dump_string(s0, curlen, 0));
1989                         }
1990                         this_flag_bit = UTF8_GOT_PERL_EXTENDED;
1991                     }
1992 
1993                     if (flags & ( UTF8_WARN_PERL_EXTENDED
1994                                  |UTF8_DISALLOW_PERL_EXTENDED))
1995                     {
1996                         *errors |= UTF8_GOT_PERL_EXTENDED;
1997 
1998                         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1999                             disallowed = TRUE;
2000                         }
2001                     }
2002                 }
2003 
2004                 if (flags & UTF8_DISALLOW_SUPER) {
2005                     *errors |= UTF8_GOT_SUPER;
2006                     disallowed = TRUE;
2007                 }
2008             }
2009             else if (possible_problems & UTF8_GOT_NONCHAR) {
2010                 possible_problems &= ~UTF8_GOT_NONCHAR;
2011 
2012                 if (flags & UTF8_WARN_NONCHAR) {
2013                     *errors |= UTF8_GOT_NONCHAR;
2014 
2015                     if (  ! (flags & UTF8_CHECK_ONLY)
2016                         && (msgs || ckWARN_d(WARN_NONCHAR)))
2017                     {
2018                         /* The code above should have guaranteed that we don't
2019                          * get here with errors other than overlong */
2020                         assert (! (orig_problems
2021                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2022 
2023                         pack_warn = packWARN(WARN_NONCHAR);
2024                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
2025                         this_flag_bit = UTF8_GOT_NONCHAR;
2026                     }
2027                 }
2028 
2029                 if (flags & UTF8_DISALLOW_NONCHAR) {
2030                     disallowed = TRUE;
2031                     *errors |= UTF8_GOT_NONCHAR;
2032                 }
2033             }
2034             else if (possible_problems & UTF8_GOT_LONG) {
2035                 possible_problems &= ~UTF8_GOT_LONG;
2036                 *errors |= UTF8_GOT_LONG;
2037 
2038                 if (flags & UTF8_ALLOW_LONG) {
2039 
2040                     /* We don't allow the actual overlong value, unless the
2041                      * special extra bit is also set */
2042                     if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
2043                                     & ~UTF8_ALLOW_LONG)))
2044                     {
2045                         uv = UNICODE_REPLACEMENT;
2046                     }
2047                 }
2048                 else {
2049                     disallowed = TRUE;
2050 
2051                     if ((   msgs
2052                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2053                     {
2054                         pack_warn = packWARN(WARN_UTF8);
2055 
2056                         /* These error types cause 'uv' to be something that
2057                          * isn't what was intended, so can't use it in the
2058                          * message.  The other error types either can't
2059                          * generate an overlong, or else the 'uv' is valid */
2060                         if (orig_problems &
2061                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2062                         {
2063                             message = Perl_form(aTHX_
2064                                     "%s: %s (any UTF-8 sequence that starts"
2065                                     " with \"%s\" is overlong which can and"
2066                                     " should be represented with a"
2067                                     " different, shorter sequence)",
2068                                     malformed_text,
2069                                     _byte_dump_string(s0, send - s0, 0),
2070                                     _byte_dump_string(s0, curlen, 0));
2071                         }
2072                         else {
2073                             U8 tmpbuf[UTF8_MAXBYTES+1];
2074                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2075                                                                         uv, 0);
2076                             /* Don't use U+ for non-Unicode code points, which
2077                              * includes those in the Latin1 range */
2078                             const char * preface = (   UNICODE_IS_SUPER(uv)
2079 #ifdef EBCDIC
2080                                                     || uv <= 0xFF
2081 #endif
2082                                                    )
2083                                                    ? "0x"
2084                                                    : "U+";
2085                             message = Perl_form(aTHX_
2086                                 "%s: %s (overlong; instead use %s to represent"
2087                                 " %s%0*" UVXf ")",
2088                                 malformed_text,
2089                                 _byte_dump_string(s0, send - s0, 0),
2090                                 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2091                                 preface,
2092                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2093                                                          small code points */
2094                                 UNI_TO_NATIVE(uv));
2095                         }
2096                         this_flag_bit = UTF8_GOT_LONG;
2097                     }
2098                 }
2099             } /* End of looking through the possible flags */
2100 
2101             /* Display the message (if any) for the problem being handled in
2102              * this iteration of the loop */
2103             if (message) {
2104                 if (msgs) {
2105                     assert(this_flag_bit);
2106 
2107                     if (*msgs == NULL) {
2108                         *msgs = newAV();
2109                     }
2110 
2111                     av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2112                                                                 pack_warn,
2113                                                                 this_flag_bit)));
2114                 }
2115                 else if (PL_op)
2116                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2117                                                  OP_DESC(PL_op));
2118                 else
2119                     Perl_warner(aTHX_ pack_warn, "%s", message);
2120             }
2121         }   /* End of 'while (possible_problems)' */
2122 
2123         /* Since there was a possible problem, the returned length may need to
2124          * be changed from the one stored at the beginning of this function.
2125          * Instead of trying to figure out if that's needed, just do it. */
2126         if (retlen) {
2127             *retlen = curlen;
2128         }
2129 
2130         if (disallowed) {
2131             if (flags & UTF8_CHECK_ONLY && retlen) {
2132                 *retlen = ((STRLEN) -1);
2133             }
2134             return 0;
2135         }
2136     }
2137 
2138     return UNI_TO_NATIVE(uv);
2139 }
2140 
2141 /*
2142 =for apidoc utf8_to_uvchr_buf
2143 
2144 Returns the native code point of the first character in the string C<s> which
2145 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2146 C<*retlen> will be set to the length, in bytes, of that character.
2147 
2148 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2149 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2150 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
2151 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2152 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2153 the next possible position in C<s> that could begin a non-malformed character.
2154 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2155 returned.
2156 
2157 =cut
2158 
2159 Also implemented as a macro in utf8.h
2160 
2161 */
2162 
2163 
2164 UV
2165 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2166 {
2167     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2168 
2169     return utf8_to_uvchr_buf_helper(s, send, retlen);
2170 }
2171 
2172 /* This is marked as deprecated
2173  *
2174 =for apidoc utf8_to_uvuni_buf
2175 
2176 Only in very rare circumstances should code need to be dealing in Unicode
2177 (as opposed to native) code points.  In those few cases, use
2178 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead.
2179 If you are not absolutely sure this is one of those cases, then assume it isn't
2180 and use plain C<utf8_to_uvchr_buf> instead.
2181 
2182 Returns the Unicode (not-native) code point of the first character in the
2183 string C<s> which
2184 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2185 C<retlen> will be set to the length, in bytes, of that character.
2186 
2187 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2188 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2189 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
2190 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
2191 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
2192 next possible position in C<s> that could begin a non-malformed character.
2193 See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2194 returned.
2195 
2196 =cut
2197 */
2198 
2199 UV
2200 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2201 {
2202     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
2203 
2204     assert(send > s);
2205 
2206     return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
2207 }
2208 
2209 /*
2210 =for apidoc utf8_length
2211 
2212 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2213 at C<s> and ending at the byte just before C<e>.  If <s> and <e> point to the
2214 same place, it returns 0 with no warning raised.
2215 
2216 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2217 and returns the number of valid characters.
2218 
2219 =cut
2220 */
2221 
2222 STRLEN
2223 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
2224 {
2225     STRLEN len = 0;
2226 
2227     PERL_ARGS_ASSERT_UTF8_LENGTH;
2228 
2229     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2230      * the bitops (especially ~) can create illegal UTF-8.
2231      * In other words: in Perl UTF-8 is not just for Unicode. */
2232 
2233     while (s < e) {
2234         Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2235 
2236         if (UNLIKELY(e - s  < expected_byte_count)) {
2237             goto warn_and_return;
2238         }
2239 
2240         len++;
2241         s += expected_byte_count;
2242     }
2243 
2244     if (LIKELY(e == s)) {
2245         return len;
2246     }
2247 
2248     /* Here, s > e on entry */
2249 
2250   warn_and_return:
2251     if (PL_op)
2252         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2253                          "%s in %s", unees, OP_DESC(PL_op));
2254     else
2255         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2256 
2257     return len;
2258 }
2259 
2260 /*
2261 =for apidoc bytes_cmp_utf8
2262 
2263 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2264 sequence of characters (stored as UTF-8)
2265 in C<u>, C<ulen>.  Returns 0 if they are
2266 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2267 if the first string is greater than the second string.
2268 
2269 -1 or +1 is returned if the shorter string was identical to the start of the
2270 longer string.  -2 or +2 is returned if
2271 there was a difference between characters
2272 within the strings.
2273 
2274 =cut
2275 */
2276 
2277 int
2278 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2279 {
2280     const U8 *const bend = b + blen;
2281     const U8 *const uend = u + ulen;
2282 
2283     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2284 
2285     while (b < bend && u < uend) {
2286         U8 c = *u++;
2287         if (!UTF8_IS_INVARIANT(c)) {
2288             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2289                 if (u < uend) {
2290                     U8 c1 = *u++;
2291                     if (UTF8_IS_CONTINUATION(c1)) {
2292                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2293                     } else {
2294                         /* diag_listed_as: Malformed UTF-8 character%s */
2295                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2296                               "%s %s%s",
2297                               unexpected_non_continuation_text(u - 2, 2, 1, 2),
2298                               PL_op ? " in " : "",
2299                               PL_op ? OP_DESC(PL_op) : "");
2300                         return -2;
2301                     }
2302                 } else {
2303                     if (PL_op)
2304                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2305                                          "%s in %s", unees, OP_DESC(PL_op));
2306                     else
2307                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2308                     return -2; /* Really want to return undef :-)  */
2309                 }
2310             } else {
2311                 return -2;
2312             }
2313         }
2314         if (*b != c) {
2315             return *b < c ? -2 : +2;
2316         }
2317         ++b;
2318     }
2319 
2320     if (b == bend && u == uend)
2321         return 0;
2322 
2323     return b < bend ? +1 : -1;
2324 }
2325 
2326 /*
2327 =for apidoc utf8_to_bytes
2328 
2329 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2330 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2331 updates C<*lenp> to contain the new length.
2332 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2333 
2334 Upon successful return, the number of variants in the string can be computed by
2335 having saved the value of C<*lenp> before the call, and subtracting the
2336 after-call value of C<*lenp> from it.
2337 
2338 If you need a copy of the string, see L</bytes_from_utf8>.
2339 
2340 =cut
2341 */
2342 
2343 U8 *
2344 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2345 {
2346     U8 * first_variant;
2347 
2348     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2349     PERL_UNUSED_CONTEXT;
2350 
2351     /* This is a no-op if no variants at all in the input */
2352     if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2353         return s;
2354     }
2355 
2356     {
2357         U8 * const save = s;
2358         U8 * const send = s + *lenp;
2359         U8 * d;
2360 
2361         /* Nothing before the first variant needs to be changed, so start the real
2362          * work there */
2363         s = first_variant;
2364         while (s < send) {
2365             if (! UTF8_IS_INVARIANT(*s)) {
2366                 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2367                     *lenp = ((STRLEN) -1);
2368                     return 0;
2369                 }
2370                 s++;
2371             }
2372             s++;
2373         }
2374 
2375         /* Is downgradable, so do it */
2376         d = s = first_variant;
2377         while (s < send) {
2378             U8 c = *s++;
2379             if (! UVCHR_IS_INVARIANT(c)) {
2380                 /* Then it is two-byte encoded */
2381                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2382                 s++;
2383             }
2384             *d++ = c;
2385         }
2386         *d = '\0';
2387         *lenp = d - save;
2388 
2389         return save;
2390     }
2391 }
2392 
2393 /*
2394 =for apidoc bytes_from_utf8
2395 
2396 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2397 byte encoding.  On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2398 actually encoded in UTF-8.
2399 
2400 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2401 the input string.
2402 
2403 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2404 not expressible in native byte encoding.  In these cases, C<*is_utf8p> and
2405 C<*lenp> are unchanged, and the return value is the original C<s>.
2406 
2407 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2408 newly created string containing a downgraded copy of C<s>, and whose length is
2409 returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
2410 caller is responsible for arranging for the memory used by this string to get
2411 freed.
2412 
2413 Upon successful return, the number of variants in the string can be computed by
2414 having saved the value of C<*lenp> before the call, and subtracting the
2415 after-call value of C<*lenp> from it.
2416 
2417 =cut
2418 
2419 There is a macro that avoids this function call, but this is retained for
2420 anyone who calls it with the Perl_ prefix */
2421 
2422 U8 *
2423 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2424 {
2425     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2426     PERL_UNUSED_CONTEXT;
2427 
2428     return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2429 }
2430 
2431 /*
2432 =for apidoc bytes_from_utf8_loc
2433 
2434 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2435 to where to store the location of the first character in C<"s"> that cannot be
2436 converted to non-UTF8.
2437 
2438 If that parameter is C<NULL>, this function behaves identically to
2439 C<bytes_from_utf8>.
2440 
2441 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2442 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2443 
2444 Otherwise, the function returns a newly created C<NUL>-terminated string
2445 containing the non-UTF8 equivalent of the convertible first portion of
2446 C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
2447 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2448 and C<*first_non_downgradable> is set to C<NULL>.
2449 
2450 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
2451 first character in the original string that wasn't converted.  C<*is_utf8p> is
2452 unchanged.  Note that the new string may have length 0.
2453 
2454 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2455 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2456 converts as many characters in it as possible stopping at the first one it
2457 finds that can't be converted to non-UTF-8.  C<*first_non_downgradable> is
2458 set to point to that.  The function returns the portion that could be converted
2459 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2460 not including the terminating C<NUL>.  If the very first character in the
2461 original could not be converted, C<*lenp> will be 0, and the new string will
2462 contain just a single C<NUL>.  If the entire input string was converted,
2463 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2464 
2465 Upon successful return, the number of variants in the converted portion of the
2466 string can be computed by having saved the value of C<*lenp> before the call,
2467 and subtracting the after-call value of C<*lenp> from it.
2468 
2469 =cut
2470 
2471 
2472 */
2473 
2474 U8 *
2475 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2476 {
2477     U8 *d;
2478     const U8 *original = s;
2479     U8 *converted_start;
2480     const U8 *send = s + *lenp;
2481 
2482     PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2483 
2484     if (! *is_utf8p) {
2485         if (first_unconverted) {
2486             *first_unconverted = NULL;
2487         }
2488 
2489         return (U8 *) original;
2490     }
2491 
2492     Newx(d, (*lenp) + 1, U8);
2493 
2494     converted_start = d;
2495     while (s < send) {
2496         U8 c = *s++;
2497         if (! UTF8_IS_INVARIANT(c)) {
2498 
2499             /* Then it is multi-byte encoded.  If the code point is above 0xFF,
2500              * have to stop now */
2501             if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2502                 if (first_unconverted) {
2503                     *first_unconverted = s - 1;
2504                     goto finish_and_return;
2505                 }
2506                 else {
2507                     Safefree(converted_start);
2508                     return (U8 *) original;
2509                 }
2510             }
2511 
2512             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2513             s++;
2514         }
2515         *d++ = c;
2516     }
2517 
2518     /* Here, converted the whole of the input */
2519     *is_utf8p = FALSE;
2520     if (first_unconverted) {
2521         *first_unconverted = NULL;
2522     }
2523 
2524   finish_and_return:
2525     *d = '\0';
2526     *lenp = d - converted_start;
2527 
2528     /* Trim unused space */
2529     Renew(converted_start, *lenp + 1, U8);
2530 
2531     return converted_start;
2532 }
2533 
2534 /*
2535 =for apidoc bytes_to_utf8
2536 
2537 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2538 UTF-8.
2539 Returns a pointer to the newly-created string, and sets C<*lenp> to
2540 reflect the new length in bytes.  The caller is responsible for arranging for
2541 the memory used by this string to get freed.
2542 
2543 Upon successful return, the number of variants in the string can be computed by
2544 having saved the value of C<*lenp> before the call, and subtracting it from the
2545 after-call value of C<*lenp>.
2546 
2547 A C<NUL> character will be written after the end of the string.
2548 
2549 If you want to convert to UTF-8 from encodings other than
2550 the native (Latin1 or EBCDIC),
2551 see L</sv_recode_to_utf8>().
2552 
2553 =cut
2554 */
2555 
2556 U8*
2557 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2558 {
2559     const U8 * const send = s + (*lenp);
2560     U8 *d;
2561     U8 *dst;
2562 
2563     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2564     PERL_UNUSED_CONTEXT;
2565 
2566     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2567     Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2568     dst = d;
2569 
2570     while (s < send) {
2571         append_utf8_from_native_byte(*s, &d);
2572         s++;
2573     }
2574 
2575     *d = '\0';
2576     *lenp = d-dst;
2577 
2578     return dst;
2579 }
2580 
2581 /*
2582  * Convert native UTF-16 to UTF-8. Called via the more public functions
2583  * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2584  * little-endian,
2585  *
2586  * 'p' is the UTF-16 input string, passed as a pointer to U8.
2587  * 'bytelen' is its length (must be even)
2588  * 'd' is the pointer to the destination buffer.  The caller must ensure that
2589  *     the space is large enough.  The maximum expansion factor is 2 times
2590  *     'bytelen'.  1.5 if never going to run on an EBCDIC box.
2591  * '*newlen' will contain the number of bytes this function filled of 'd'.
2592  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2593  * 'low_byte' is 1  if UTF-16BE; 0 if UTF-16LE
2594  *
2595  * The expansion factor is because UTF-16 requires 2 bytes for every code point
2596  * below 0x10000; otherwise 4 bytes.  UTF-8 requires 1-3 bytes for every code
2597  * point below 0x1000; otherwise 4 bytes.  UTF-EBCDIC requires 1-4 bytes for
2598  * every code point below 0x1000; otherwise 4-5 bytes.
2599  *
2600  * The worst case is where every code point is below U+10000, hence requiring 2
2601  * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2602  * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2603  *
2604  * Do not use in-place. */
2605 
2606 U8*
2607 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2608                               const bool high_byte, /* Which of next two bytes is
2609                                                   high order */
2610                               const bool low_byte)
2611 {
2612     U8* pend;
2613     U8* dstart = d;
2614 
2615     PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
2616 
2617     if (bytelen & 1)
2618         Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2619                 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
2620     pend = p + bytelen;
2621 
2622     while (p < pend) {
2623 
2624         /* Next 16 bits is what we want.  (The bool is cast to U8 because on
2625          * platforms where a bool is implemented as a signed char, a compiler
2626          * warning may be generated) */
2627         U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2628         p += 2;
2629 
2630         /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2631          * */
2632         if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
2633 
2634 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2635 #define LAST_HIGH_SURROGATE  0xDBFF
2636 #define FIRST_LOW_SURROGATE  0xDC00
2637 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
2638 #define FIRST_IN_PLANE1      0x10000
2639 
2640             if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2641                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2642             }
2643             else {
2644                 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2645                 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2646                                                        LAST_LOW_SURROGATE)))
2647                 {
2648                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2649                 }
2650 
2651                 p += 2;
2652 
2653                 /* Here uv is the high surrogate.  Combine with low surrogate
2654                  * just computed to form the actual U32 code point.
2655                  *
2656                  * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2657                 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2658                                      + low_surrogate - FIRST_LOW_SURROGATE;
2659             }
2660         }
2661 
2662         /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
2663         d = uvchr_to_utf8(d, uv);
2664     }
2665 
2666     *newlen = d - dstart;
2667     return d;
2668 }
2669 
2670 U8*
2671 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2672 {
2673     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2674 
2675     return utf16_to_utf8(p, d, bytelen, newlen);
2676 }
2677 
2678 U8*
2679 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2680 {
2681     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2682 
2683     return utf16_to_utf8_reversed(p, d, bytelen, newlen);
2684 }
2685 
2686 /*
2687  * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2688  * big-endian and utf8_to_utf16_reversed() for little-endian,
2689  *
2690  * 's' is the UTF-8 input string, passed as a pointer to U8.
2691  * 'bytelen' is its length
2692  * 'd' is the pointer to the destination buffer, currently passed as U8 *.  The
2693  *     caller must ensure that the space is large enough.  The maximum
2694  *     expansion factor is 2 times 'bytelen'.  This happens when the input is
2695  *     entirely single-byte ASCII, expanding to two-byte UTF-16.
2696  * '*newlen' will contain the number of bytes this function filled of 'd'.
2697  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2698  * 'low_byte'  is 1 if UTF-16BE; 0 if UTF-16LE
2699  *
2700  * Do not use in-place. */
2701 U8*
2702 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2703                               const bool high_byte, /* Which of next two bytes
2704                                                        is high order */
2705                               const bool low_byte)
2706 {
2707     U8* send;
2708     U8* dstart = d;
2709 
2710     PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2711 
2712     send = s + bytelen;
2713 
2714     while (s < send) {
2715         STRLEN retlen;
2716         UV uv = utf8n_to_uvchr(s, send - s, &retlen,
2717                                /* No surrogates nor above-Unicode */
2718                                UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
2719 
2720         /* The modern method is to keep going with malformed input,
2721          * substituting the REPLACEMENT CHARACTER */
2722         if (UNLIKELY(uv == 0 && *s != '\0')) {
2723             uv = UNICODE_REPLACEMENT;
2724         }
2725 
2726         if (uv >= FIRST_IN_PLANE1) {    /* Requires a surrogate pair */
2727 
2728             /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2729             U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2730                                + FIRST_HIGH_SURROGATE;
2731 
2732             /* (The bool is cast to U8 because on platforms where a bool is
2733              * implemented as a signed char, a compiler warning may be
2734              * generated) */
2735             d[(U8) high_byte] = high_surrogate >> 8;
2736             d[(U8) low_byte]  = high_surrogate & nBIT_MASK(8);
2737             d += 2;
2738 
2739             /* The low surrogate is the lower 10 bits plus the offset */
2740             uv &= nBIT_MASK(10);
2741             uv += FIRST_LOW_SURROGATE;
2742 
2743             /* Drop down to output the low surrogate like it were a
2744              * non-surrogate */
2745         }
2746 
2747         d[(U8) high_byte] = uv >> 8;
2748         d[(U8) low_byte] = uv & nBIT_MASK(8);
2749         d += 2;
2750 
2751         s += retlen;
2752     }
2753 
2754     *newlen = d - dstart;
2755     return d;
2756 }
2757 
2758 bool
2759 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2760 {
2761     return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2762 }
2763 
2764 bool
2765 Perl__is_uni_perl_idcont(pTHX_ UV c)
2766 {
2767     return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2768 }
2769 
2770 bool
2771 Perl__is_uni_perl_idstart(pTHX_ UV c)
2772 {
2773     return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2774 }
2775 
2776 UV
2777 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2778                                   const char S_or_s)
2779 {
2780     /* We have the latin1-range values compiled into the core, so just use
2781      * those, converting the result to UTF-8.  The only difference between upper
2782      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2783      * either "SS" or "Ss".  Which one to use is passed into the routine in
2784      * 'S_or_s' to avoid a test */
2785 
2786     UV converted = toUPPER_LATIN1_MOD(c);
2787 
2788     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2789 
2790     assert(S_or_s == 'S' || S_or_s == 's');
2791 
2792     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
2793                                              characters in this range */
2794         *p = (U8) converted;
2795         *lenp = 1;
2796         return converted;
2797     }
2798 
2799     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2800      * which it maps to one of them, so as to only have to have one check for
2801      * it in the main case */
2802     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2803         switch (c) {
2804             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2805                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2806                 break;
2807             case MICRO_SIGN:
2808                 converted = GREEK_CAPITAL_LETTER_MU;
2809                 break;
2810 #if    UNICODE_MAJOR_VERSION > 2                                        \
2811    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
2812                                   && UNICODE_DOT_DOT_VERSION >= 8)
2813             case LATIN_SMALL_LETTER_SHARP_S:
2814                 *(p)++ = 'S';
2815                 *p = S_or_s;
2816                 *lenp = 2;
2817                 return 'S';
2818 #endif
2819             default:
2820                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2821                                  " '%c' to map to '%c'",
2822                                  c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
2823                 NOT_REACHED; /* NOTREACHED */
2824         }
2825     }
2826 
2827     *(p)++ = UTF8_TWO_BYTE_HI(converted);
2828     *p = UTF8_TWO_BYTE_LO(converted);
2829     *lenp = 2;
2830 
2831     return converted;
2832 }
2833 
2834 /* If compiled on an early Unicode version, there may not be auxiliary tables
2835  * */
2836 #ifndef HAS_UC_AUX_TABLES
2837 #  define UC_AUX_TABLE_ptrs     NULL
2838 #  define UC_AUX_TABLE_lengths  NULL
2839 #endif
2840 #ifndef HAS_TC_AUX_TABLES
2841 #  define TC_AUX_TABLE_ptrs     NULL
2842 #  define TC_AUX_TABLE_lengths  NULL
2843 #endif
2844 #ifndef HAS_LC_AUX_TABLES
2845 #  define LC_AUX_TABLE_ptrs     NULL
2846 #  define LC_AUX_TABLE_lengths  NULL
2847 #endif
2848 #ifndef HAS_CF_AUX_TABLES
2849 #  define CF_AUX_TABLE_ptrs     NULL
2850 #  define CF_AUX_TABLE_lengths  NULL
2851 #endif
2852 
2853 /* Call the function to convert a UTF-8 encoded character to the specified case.
2854  * Note that there may be more than one character in the result.
2855  * 's' is a pointer to the first byte of the input character
2856  * 'd' will be set to the first byte of the string of changed characters.  It
2857  *	needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2858  * 'lenp' will be set to the length in bytes of the string of changed characters
2859  *
2860  * The functions return the ordinal of the first character in the string of
2861  * 'd' */
2862 #define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
2863                 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
2864                                               Uppercase_Mapping_invmap,     \
2865                                               UC_AUX_TABLE_ptrs,            \
2866                                               UC_AUX_TABLE_lengths,         \
2867                                               "uppercase")
2868 #define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
2869                 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
2870                                               Titlecase_Mapping_invmap,     \
2871                                               TC_AUX_TABLE_ptrs,            \
2872                                               TC_AUX_TABLE_lengths,         \
2873                                               "titlecase")
2874 #define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
2875                 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
2876                                               Lowercase_Mapping_invmap,     \
2877                                               LC_AUX_TABLE_ptrs,            \
2878                                               LC_AUX_TABLE_lengths,         \
2879                                               "lowercase")
2880 
2881 
2882 /* This additionally has the input parameter 'specials', which if non-zero will
2883  * cause this to use the specials hash for folding (meaning get full case
2884  * folding); otherwise, when zero, this implies a simple case fold */
2885 #define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
2886         (specials)                                                          \
2887         ?  _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                    \
2888                                           Case_Folding_invmap,              \
2889                                           CF_AUX_TABLE_ptrs,                \
2890                                           CF_AUX_TABLE_lengths,             \
2891                                           "foldcase")                       \
2892         : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,               \
2893                                          Simple_Case_Folding_invmap,        \
2894                                          NULL, NULL,                        \
2895                                          "foldcase")
2896 
2897 UV
2898 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
2899 {
2900     /* Convert the Unicode character whose ordinal is <c> to its uppercase
2901      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2902      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2903      * the changed version may be longer than the original character.
2904      *
2905      * The ordinal of the first character of the changed version is returned
2906      * (but note, as explained above, that there may be more.) */
2907 
2908     PERL_ARGS_ASSERT_TO_UNI_UPPER;
2909 
2910     if (c < 256) {
2911         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2912     }
2913 
2914     return CALL_UPPER_CASE(c, NULL, p, lenp);
2915 }
2916 
2917 UV
2918 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
2919 {
2920     PERL_ARGS_ASSERT_TO_UNI_TITLE;
2921 
2922     if (c < 256) {
2923         return _to_upper_title_latin1((U8) c, p, lenp, 's');
2924     }
2925 
2926     return CALL_TITLE_CASE(c, NULL, p, lenp);
2927 }
2928 
2929 STATIC U8
2930 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
2931 {
2932     /* We have the latin1-range values compiled into the core, so just use
2933      * those, converting the result to UTF-8.  Since the result is always just
2934      * one character, we allow <p> to be NULL */
2935 
2936     U8 converted = toLOWER_LATIN1(c);
2937 
2938     PERL_UNUSED_ARG(dummy);
2939 
2940     if (p != NULL) {
2941         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
2942             *p = converted;
2943             *lenp = 1;
2944         }
2945         else {
2946             /* Result is known to always be < 256, so can use the EIGHT_BIT
2947              * macros */
2948             *p = UTF8_EIGHT_BIT_HI(converted);
2949             *(p+1) = UTF8_EIGHT_BIT_LO(converted);
2950             *lenp = 2;
2951         }
2952     }
2953     return converted;
2954 }
2955 
2956 UV
2957 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
2958 {
2959     PERL_ARGS_ASSERT_TO_UNI_LOWER;
2960 
2961     if (c < 256) {
2962         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
2963     }
2964 
2965     return CALL_LOWER_CASE(c, NULL, p, lenp);
2966 }
2967 
2968 UV
2969 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
2970 {
2971     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
2972      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2973      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
2974      *
2975      *	Not to be used for locale folds
2976      */
2977 
2978     UV converted;
2979 
2980     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
2981 
2982     assert (! (flags & FOLD_FLAGS_LOCALE));
2983 
2984     if (UNLIKELY(c == MICRO_SIGN)) {
2985         converted = GREEK_SMALL_LETTER_MU;
2986     }
2987 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
2988    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
2989                                       || UNICODE_DOT_DOT_VERSION > 0)
2990     else if (   (flags & FOLD_FLAGS_FULL)
2991              && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2992     {
2993         /* If can't cross 127/128 boundary, can't return "ss"; instead return
2994          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2995          * under those circumstances. */
2996         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2997             *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
2998             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2999                  p, *lenp, U8);
3000             return LATIN_SMALL_LETTER_LONG_S;
3001         }
3002         else {
3003             *(p)++ = 's';
3004             *p = 's';
3005             *lenp = 2;
3006             return 's';
3007         }
3008     }
3009 #endif
3010     else { /* In this range the fold of all other characters is their lower
3011               case */
3012         converted = toLOWER_LATIN1(c);
3013     }
3014 
3015     if (UVCHR_IS_INVARIANT(converted)) {
3016         *p = (U8) converted;
3017         *lenp = 1;
3018     }
3019     else {
3020         *(p)++ = UTF8_TWO_BYTE_HI(converted);
3021         *p = UTF8_TWO_BYTE_LO(converted);
3022         *lenp = 2;
3023     }
3024 
3025     return converted;
3026 }
3027 
3028 UV
3029 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
3030 {
3031 
3032     /* Not currently externally documented, and subject to change
3033      *  <flags> bits meanings:
3034      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
3035      *	    FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3036      *	                      locale are to be used.
3037      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3038      */
3039 
3040     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3041 
3042     if (flags & FOLD_FLAGS_LOCALE) {
3043         /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3044          * except for potentially warning */
3045         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3046         if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
3047             flags &= ~FOLD_FLAGS_LOCALE;
3048         }
3049         else {
3050             goto needs_full_generality;
3051         }
3052     }
3053 
3054     if (c < 256) {
3055         return _to_fold_latin1((U8) c, p, lenp,
3056                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3057     }
3058 
3059     /* Here, above 255.  If no special needs, just use the macro */
3060     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3061         return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3062     }
3063     else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3064                the special flags. */
3065         U8 utf8_c[UTF8_MAXBYTES + 1];
3066 
3067       needs_full_generality:
3068         uvchr_to_utf8(utf8_c, c);
3069         return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
3070                                   p, lenp, flags);
3071     }
3072 }
3073 
3074 PERL_STATIC_INLINE bool
3075 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3076                        SV* const invlist)
3077 {
3078     /* returns a boolean giving whether or not the UTF8-encoded character that
3079      * starts at <p>, and extending no further than <e - 1> is in the inversion
3080      * list <invlist>. */
3081 
3082     UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3083 
3084     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3085 
3086     if (cp == 0 && (p >= e || *p != '\0')) {
3087         _force_out_malformed_utf8_message(p, e, 0, 1);
3088         NOT_REACHED; /* NOTREACHED */
3089     }
3090 
3091     assert(invlist);
3092     return _invlist_contains_cp(invlist, cp);
3093 }
3094 
3095 #if 0	/* Not currently used, but may be needed in the future */
3096 PERLVAR(I, seen_deprecated_macro, HV *)
3097 
3098 STATIC void
3099 S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3100                                      const char * const alternative,
3101                                      const bool use_locale,
3102                                      const char * const file,
3103                                      const unsigned line)
3104 {
3105     const char * key;
3106 
3107     PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3108 
3109     if (ckWARN_d(WARN_DEPRECATED)) {
3110 
3111         key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3112         if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3113             if (! PL_seen_deprecated_macro) {
3114                 PL_seen_deprecated_macro = newHV();
3115             }
3116             if (! hv_store(PL_seen_deprecated_macro, key,
3117                            strlen(key), &PL_sv_undef, 0))
3118             {
3119                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3120             }
3121 
3122             if (instr(file, "mathoms.c")) {
3123                 Perl_warner(aTHX_ WARN_DEPRECATED,
3124                             "In %s, line %d, starting in Perl v5.32, %s()"
3125                             " will be removed.  Avoid this message by"
3126                             " converting to use %s().\n",
3127                             file, line, name, alternative);
3128             }
3129             else {
3130                 Perl_warner(aTHX_ WARN_DEPRECATED,
3131                             "In %s, line %d, starting in Perl v5.32, %s() will"
3132                             " require an additional parameter.  Avoid this"
3133                             " message by converting to use %s().\n",
3134                             file, line, name, alternative);
3135             }
3136         }
3137     }
3138 }
3139 #endif
3140 
3141 bool
3142 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3143 {
3144     PERL_ARGS_ASSERT__IS_UTF8_FOO;
3145 
3146     return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3147 }
3148 
3149 bool
3150 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3151 {
3152     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3153 
3154     return is_utf8_common(p, e, PL_utf8_perl_idstart);
3155 }
3156 
3157 bool
3158 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3159 {
3160     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3161 
3162     return is_utf8_common(p, e, PL_utf8_perl_idcont);
3163 }
3164 
3165 STATIC UV
3166 S_to_case_cp_list(pTHX_
3167                   const UV original,
3168                   const U32 ** const remaining_list,
3169                   Size_t * remaining_count,
3170                   SV *invlist, const I32 * const invmap,
3171                   const U32 * const * const aux_tables,
3172                   const U8 * const aux_table_lengths,
3173                   const char * const normal)
3174 {
3175     SSize_t index;
3176     I32 base;
3177 
3178     /* Calculate the changed case of code point 'original'.  The first code
3179      * point of the changed case is returned.
3180      *
3181      * If 'remaining_count' is not NULL, *remaining_count will be set to how
3182      * many *other* code points are in the changed case.  If non-zero and
3183      * 'remaining_list' is also not NULL, *remaining_list will be set to point
3184      * to a non-modifiable array containing the second and potentially third
3185      * code points in the changed case.  (Unicode guarantees a maximum of 3.)
3186      * Note that this means that *remaining_list is undefined unless there are
3187      * multiple code points, and the caller has chosen to find out how many by
3188      * making 'remaining_count' not NULL.
3189      *
3190      * 'normal' is a string to use to name the new case in any generated
3191      * messages, as a fallback if the operation being used is not available.
3192      *
3193      * The casing to use is given by the data structures in the remaining
3194      * arguments.
3195      */
3196 
3197     PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3198 
3199     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3200      * that covers all possible inputs.  See [perl #133365] */
3201     index = _invlist_search(invlist, original);
3202     base = invmap[index];
3203 
3204     /* Most likely, the case change will contain just a single code point */
3205     if (remaining_count) {
3206         *remaining_count = 0;
3207     }
3208 
3209     if (LIKELY(base == 0)) {    /* 0 => original was unchanged by casing */
3210 
3211         /* At this bottom level routine is where we warn about illegal code
3212          * points */
3213         if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
3214             if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3215                 if (ckWARN_d(WARN_SURROGATE)) {
3216                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3217                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3218                         "Operation \"%s\" returns its argument for"
3219                         " UTF-16 surrogate U+%04" UVXf, desc, original);
3220                 }
3221             }
3222             else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3223                 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3224                     Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3225                 }
3226                 if (ckWARN_d(WARN_NON_UNICODE)) {
3227                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3228                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3229                         "Operation \"%s\" returns its argument for"
3230                         " non-Unicode code point 0x%04" UVXf, desc, original);
3231                 }
3232             }
3233 
3234             /* Note that non-characters are perfectly legal, so no warning
3235              * should be given. */
3236         }
3237 
3238         return original;
3239     }
3240 
3241     if (LIKELY(base > 0)) {  /* means original mapped to a single code point,
3242                                 different from itself */
3243         return base + original - invlist_array(invlist)[index];
3244     }
3245 
3246     /* Here 'base' is negative.  That means the mapping is 1-to-many, and
3247      * requires an auxiliary table look up.  abs(base) gives the index into a
3248      * list of such tables which points to the proper aux table.  And a
3249      * parallel list gives the length of each corresponding aux table.  Skip
3250      * the first entry in the *remaining returns, as it is returned by the
3251      * function. */
3252     base = -base;
3253     if (remaining_count) {
3254         *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3255 
3256         if (remaining_list) {
3257             *remaining_list  = aux_tables[base] + 1;
3258         }
3259     }
3260 
3261     return (UV) aux_tables[base][0];
3262 }
3263 
3264 STATIC UV
3265 S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3266                       U8* ustrp, STRLEN *lenp,
3267                       SV *invlist, const I32 * const invmap,
3268                       const U32 * const * const aux_tables,
3269                       const U8 * const aux_table_lengths,
3270                       const char * const normal)
3271 {
3272     /* Change the case of code point 'original'.  If 'p' is non-NULL, it points to
3273      * the beginning of the (assumed to be valid) UTF-8 representation of
3274      * 'original'.  'normal' is a string to use to name the new case in any
3275      * generated messages, as a fallback if the operation being used is not
3276      * available.  The new case is given by the data structures in the
3277      * remaining arguments.
3278      *
3279      * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3280      * entire changed case string, and the return value is the first code point
3281      * in that string
3282      *
3283      * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3284      * since the changed version may be longer than the original character. */
3285 
3286     const U32 * remaining_list;
3287     Size_t remaining_count;
3288     UV first = to_case_cp_list(original,
3289                                &remaining_list, &remaining_count,
3290                                invlist, invmap,
3291                                aux_tables, aux_table_lengths,
3292                                normal);
3293 
3294     PERL_ARGS_ASSERT__TO_UTF8_CASE;
3295 
3296     /* If the code point maps to itself and we already have its representation,
3297      * copy it instead of recalculating */
3298     if (original == first && p) {
3299         *lenp = UTF8SKIP(p);
3300 
3301         if (p != ustrp) {   /* Don't copy onto itself */
3302             Copy(p, ustrp, *lenp, U8);
3303         }
3304     }
3305     else {
3306         U8 * d = ustrp;
3307         Size_t i;
3308 
3309         d = uvchr_to_utf8(d, first);
3310 
3311         for (i = 0; i < remaining_count; i++) {
3312             d = uvchr_to_utf8(d, remaining_list[i]);
3313         }
3314 
3315         *d = '\0';
3316         *lenp = d - ustrp;
3317     }
3318 
3319     return first;
3320 }
3321 
3322 Size_t
3323 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
3324                           const U32 ** remaining_folds_to)
3325 {
3326     /* Returns the count of the number of code points that fold to the input
3327      * 'cp' (besides itself).
3328      *
3329      * If the return is 0, there is nothing else that folds to it, and
3330      * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3331      *
3332      * If the return is 1, '*first_folds_to' is set to the single code point,
3333      * and '*remaining_folds_to' is set to NULL.
3334      *
3335      * Otherwise, '*first_folds_to' is set to a code point, and
3336      * '*remaining_fold_to' is set to an array that contains the others.  The
3337      * length of this array is the returned count minus 1.
3338      *
3339      * The reason for this convolution is to avoid having to deal with
3340      * allocating and freeing memory.  The lists are already constructed, so
3341      * the return can point to them, but single code points aren't, so would
3342      * need to be constructed if we didn't employ something like this API
3343      *
3344      * The code points returned by this function are all legal Unicode, which
3345      * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3346      * constructed with this size (to save space and memory), and we return
3347      * pointers, so they must be this size */
3348 
3349     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3350      * that covers all possible inputs.  See [perl #133365] */
3351     SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3352     I32 base = _Perl_IVCF_invmap[index];
3353 
3354     PERL_ARGS_ASSERT__INVERSE_FOLDS;
3355 
3356     if (base == 0) {            /* No fold */
3357         *first_folds_to = 0;
3358         *remaining_folds_to = NULL;
3359         return 0;
3360     }
3361 
3362 #ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
3363 
3364     assert(base > 0);
3365 
3366 #else
3367 
3368     if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
3369 
3370         /* The data structure is set up so that the absolute value of 'base' is
3371          * an index into a table of pointers to arrays, with the array
3372          * corresponding to the index being the list of code points that fold
3373          * to 'cp', and the parallel array containing the length of the list
3374          * array */
3375         *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3376         *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3377                                                 /* +1 excludes first_folds_to */
3378         return IVCF_AUX_TABLE_lengths[-base];
3379     }
3380 
3381 #endif
3382 
3383     /* Only the single code point.  This works like 'fc(G) = G - A + a' */
3384     *first_folds_to = (U32) (base + cp
3385                                   - invlist_array(PL_utf8_foldclosures)[index]);
3386     *remaining_folds_to = NULL;
3387     return 1;
3388 }
3389 
3390 STATIC UV
3391 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3392                                        U8* const ustrp, STRLEN *lenp)
3393 {
3394     /* This is called when changing the case of a UTF-8-encoded character above
3395      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
3396      * result contains a character that crosses the 255/256 boundary, disallow
3397      * the change, and return the original code point.  See L<perlfunc/lc> for
3398      * why;
3399      *
3400      * p	points to the original string whose case was changed; assumed
3401      *          by this routine to be well-formed
3402      * result	the code point of the first character in the changed-case string
3403      * ustrp	points to the changed-case string (<result> represents its
3404      *          first char)
3405      * lenp	points to the length of <ustrp> */
3406 
3407     UV original;    /* To store the first code point of <p> */
3408 
3409     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3410 
3411     assert(UTF8_IS_ABOVE_LATIN1(*p));
3412 
3413     /* We know immediately if the first character in the string crosses the
3414      * boundary, so can skip testing */
3415     if (result > 255) {
3416 
3417         /* Look at every character in the result; if any cross the
3418         * boundary, the whole thing is disallowed */
3419         U8* s = ustrp + UTF8SKIP(ustrp);
3420         U8* e = ustrp + *lenp;
3421         while (s < e) {
3422             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3423                 goto bad_crossing;
3424             }
3425             s += UTF8SKIP(s);
3426         }
3427 
3428         /* Here, no characters crossed, result is ok as-is, but we warn. */
3429         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3430         return result;
3431     }
3432 
3433   bad_crossing:
3434 
3435     /* Failed, have to return the original */
3436     original = valid_utf8_to_uvchr(p, lenp);
3437 
3438     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3439     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3440                            "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3441                            " locale; resolved to \"\\x{%" UVXf "}\".",
3442                            OP_DESC(PL_op),
3443                            original,
3444                            original);
3445     Copy(p, ustrp, *lenp, char);
3446     return original;
3447 }
3448 
3449 STATIC UV
3450 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3451                         U8 * ustrp, STRLEN *lenp)
3452 {
3453     /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3454      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3455      * Otherwise, it returns the first code point of the Turkic foldcased
3456      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3457      * contain *lenp bytes
3458      *
3459      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3460      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3461      * DOTLESS I */
3462 
3463     PERL_ARGS_ASSERT_TURKIC_FC;
3464     assert(e > p);
3465 
3466     if (UNLIKELY(*p == 'I')) {
3467         *lenp = 2;
3468         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3469         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3470         return LATIN_SMALL_LETTER_DOTLESS_I;
3471     }
3472 
3473     if (UNLIKELY(memBEGINs(p, e - p,
3474                            LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3475     {
3476         *lenp = 1;
3477         *ustrp = 'i';
3478         return 'i';
3479     }
3480 
3481     return 0;
3482 }
3483 
3484 STATIC UV
3485 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3486                         U8 * ustrp, STRLEN *lenp)
3487 {
3488     /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3489      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3490      * Otherwise, it returns the first code point of the Turkic lowercased
3491      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3492      * contain *lenp bytes */
3493 
3494     PERL_ARGS_ASSERT_TURKIC_LC;
3495     assert(e > p0);
3496 
3497     /* A 'I' requires context as to what to do */
3498     if (UNLIKELY(*p0 == 'I')) {
3499         const U8 * p = p0 + 1;
3500 
3501         /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3502          * modified by a dot above lowercases to 'i' even in turkic locales. */
3503         while (p < e) {
3504             UV cp;
3505 
3506             if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3507                 ustrp[0] = 'i';
3508                 *lenp = 1;
3509                 return 'i';
3510             }
3511 
3512             /* For the dot above to modify the 'I', it must be part of a
3513              * combining sequence immediately following the 'I', and no other
3514              * modifier with a ccc of 230 may intervene */
3515             cp = utf8_to_uvchr_buf(p, e, NULL);
3516             if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3517                 break;
3518             }
3519 
3520             /* Here the combining sequence continues */
3521             p += UTF8SKIP(p);
3522         }
3523     }
3524 
3525     /* In all other cases the lc is the same as the fold */
3526     return turkic_fc(p0, e, ustrp, lenp);
3527 }
3528 
3529 STATIC UV
3530 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3531                         U8 * ustrp, STRLEN *lenp)
3532 {
3533     /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3534      * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3535      * Otherwise, it returns the first code point of the Turkic upper or
3536      * title-cased sequence, and the entire sequence will be stored in *ustrp.
3537      * ustrp will contain *lenp bytes
3538      *
3539      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3540      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3541      * DOTLESS I */
3542 
3543     PERL_ARGS_ASSERT_TURKIC_UC;
3544     assert(e > p);
3545 
3546     if (*p == 'i') {
3547         *lenp = 2;
3548         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3549         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3550         return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3551     }
3552 
3553     if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3554         *lenp = 1;
3555         *ustrp = 'I';
3556         return 'I';
3557     }
3558 
3559     return 0;
3560 }
3561 
3562 /* The process for changing the case is essentially the same for the four case
3563  * change types, except there are complications for folding.  Otherwise the
3564  * difference is only which case to change to.  To make sure that they all do
3565  * the same thing, the bodies of the functions are extracted out into the
3566  * following two macros.  The functions are written with the same variable
3567  * names, and these are known and used inside these macros.  It would be
3568  * better, of course, to have inline functions to do it, but since different
3569  * macros are called, depending on which case is being changed to, this is not
3570  * feasible in C (to khw's knowledge).  Two macros are created so that the fold
3571  * function can start with the common start macro, then finish with its special
3572  * handling; while the other three cases can just use the common end macro.
3573  *
3574  * The algorithm is to use the proper (passed in) macro or function to change
3575  * the case for code points that are below 256.  The macro is used if using
3576  * locale rules for the case change; the function if not.  If the code point is
3577  * above 255, it is computed from the input UTF-8, and another macro is called
3578  * to do the conversion.  If necessary, the output is converted to UTF-8.  If
3579  * using a locale, we have to check that the change did not cross the 255/256
3580  * boundary, see check_locale_boundary_crossing() for further details.
3581  *
3582  * The macros are split with the correct case change for the below-256 case
3583  * stored into 'result', and in the middle of an else clause for the above-255
3584  * case.  At that point in the 'else', 'result' is not the final result, but is
3585  * the input code point calculated from the UTF-8.  The fold code needs to
3586  * realize all this and take it from there.
3587  *
3588  * To deal with Turkic locales, the function specified by the parameter
3589  * 'turkic' is called when appropriate.
3590  *
3591  * If you read the two macros as sequential, it's easier to understand what's
3592  * going on. */
3593 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func,  \
3594                                L1_func_extra_param, turkic)                  \
3595                                                                              \
3596     if (flags & (locale_flags)) {                                            \
3597         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                  \
3598         if (IN_UTF8_CTYPE_LOCALE) {                                          \
3599             if (UNLIKELY(PL_in_utf8_turkic_locale)) {                        \
3600                 UV ret = turkic(p, e, ustrp, lenp);                          \
3601                 if (ret) return ret;                                         \
3602             }                                                                \
3603                                                                              \
3604             /* Otherwise, treat a UTF-8 locale as not being in locale at     \
3605              * all */                                                        \
3606             flags &= ~(locale_flags);                                        \
3607         }                                                                    \
3608     }                                                                        \
3609                                                                              \
3610     if (UTF8_IS_INVARIANT(*p)) {                                             \
3611         if (flags & (locale_flags)) {                                        \
3612             result = libc_change_function(*p);                               \
3613         }                                                                    \
3614         else {                                                               \
3615             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
3616         }                                                                    \
3617     }                                                                        \
3618     else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
3619         U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                         \
3620         if (flags & (locale_flags)) {                                        \
3621             result = libc_change_function(c);                                \
3622         }                                                                    \
3623         else {                                                               \
3624             return L1_func(c, ustrp, lenp,  L1_func_extra_param);            \
3625         }                                                                    \
3626     }                                                                        \
3627     else {  /* malformed UTF-8 or ord above 255 */                           \
3628         STRLEN len_result;                                                   \
3629         result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
3630         if (len_result == (STRLEN) -1) {                                     \
3631             _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
3632         }
3633 
3634 #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
3635         result = change_macro(result, p, ustrp, lenp);                       \
3636                                                                              \
3637         if (flags & (locale_flags)) {                                        \
3638             result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3639         }                                                                    \
3640         return result;                                                       \
3641     }                                                                        \
3642                                                                              \
3643     /* Here, used locale rules.  Convert back to UTF-8 */                    \
3644     if (UTF8_IS_INVARIANT(result)) {                                         \
3645         *ustrp = (U8) result;                                                \
3646         *lenp = 1;                                                           \
3647     }                                                                        \
3648     else {                                                                   \
3649         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);                             \
3650         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);                       \
3651         *lenp = 2;                                                           \
3652     }                                                                        \
3653                                                                              \
3654     return result;
3655 
3656 /* Not currently externally documented, and subject to change:
3657  * <flags> is set iff the rules from the current underlying locale are to
3658  *         be used. */
3659 
3660 UV
3661 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3662                                 const U8 *e,
3663                                 U8* ustrp,
3664                                 STRLEN *lenp,
3665                                 bool flags)
3666 {
3667     UV result;
3668 
3669     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3670 
3671     /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3672     /* 2nd char of uc(U+DF) is 'S' */
3673     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
3674                                                                     turkic_uc);
3675     CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
3676 }
3677 
3678 /* Not currently externally documented, and subject to change:
3679  * <flags> is set iff the rules from the current underlying locale are to be
3680  *         used.  Since titlecase is not defined in POSIX, for other than a
3681  *         UTF-8 locale, uppercase is used instead for code points < 256.
3682  */
3683 
3684 UV
3685 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3686                                 const U8 *e,
3687                                 U8* ustrp,
3688                                 STRLEN *lenp,
3689                                 bool flags)
3690 {
3691     UV result;
3692 
3693     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3694 
3695     /* 2nd char of ucfirst(U+DF) is 's' */
3696     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
3697                                                                     turkic_uc);
3698     CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
3699 }
3700 
3701 /* Not currently externally documented, and subject to change:
3702  * <flags> is set iff the rules from the current underlying locale are to
3703  *         be used.
3704  */
3705 
3706 UV
3707 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3708                                 const U8 *e,
3709                                 U8* ustrp,
3710                                 STRLEN *lenp,
3711                                 bool flags)
3712 {
3713     UV result;
3714 
3715     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3716 
3717     CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
3718                                                                     turkic_lc);
3719     CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
3720 }
3721 
3722 /* Not currently externally documented, and subject to change,
3723  * in <flags>
3724  *	bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3725  *	                      locale are to be used.
3726  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
3727  *			      otherwise simple folds
3728  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3729  *			      prohibited
3730  */
3731 
3732 UV
3733 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3734                                const U8 *e,
3735                                U8* ustrp,
3736                                STRLEN *lenp,
3737                                U8 flags)
3738 {
3739     UV result;
3740 
3741     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3742 
3743     /* These are mutually exclusive */
3744     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3745 
3746     assert(p != ustrp); /* Otherwise overwrites */
3747 
3748     CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
3749                  ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3750                                                                     turkic_fc);
3751 
3752         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3753 
3754         if (flags & FOLD_FLAGS_LOCALE) {
3755 
3756 #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3757 #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3758 #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3759 
3760             /* Special case these two characters, as what normally gets
3761              * returned under locale doesn't work */
3762             if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3763             {
3764                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3765                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3766                               "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3767                               "resolved to \"\\x{17F}\\x{17F}\".");
3768                 goto return_long_s;
3769             }
3770             else
3771 #endif
3772                  if (memBEGINs((char *) p, e - p, LONG_S_T))
3773             {
3774                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3775                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3776                               "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3777                               "resolved to \"\\x{FB06}\".");
3778                 goto return_ligature_st;
3779             }
3780 
3781 #if    UNICODE_MAJOR_VERSION   == 3         \
3782     && UNICODE_DOT_VERSION     == 0         \
3783     && UNICODE_DOT_DOT_VERSION == 1
3784 #           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3785 
3786             /* And special case this on this Unicode version only, for the same
3787              * reaons the other two are special cased.  They would cross the
3788              * 255/256 boundary which is forbidden under /l, and so the code
3789              * wouldn't catch that they are equivalent (which they are only in
3790              * this release) */
3791             else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
3792                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3793                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3794                               "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3795                               "resolved to \"\\x{0131}\".");
3796                 goto return_dotless_i;
3797             }
3798 #endif
3799 
3800             return check_locale_boundary_crossing(p, result, ustrp, lenp);
3801         }
3802         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3803             return result;
3804         }
3805         else {
3806             /* This is called when changing the case of a UTF-8-encoded
3807              * character above the ASCII range, and the result should not
3808              * contain an ASCII character. */
3809 
3810             UV original;    /* To store the first code point of <p> */
3811 
3812             /* Look at every character in the result; if any cross the
3813             * boundary, the whole thing is disallowed */
3814             U8* s = ustrp;
3815             U8* send = ustrp + *lenp;
3816             while (s < send) {
3817                 if (isASCII(*s)) {
3818                     /* Crossed, have to return the original */
3819                     original = valid_utf8_to_uvchr(p, lenp);
3820 
3821                     /* But in these instances, there is an alternative we can
3822                      * return that is valid */
3823                     if (original == LATIN_SMALL_LETTER_SHARP_S
3824 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3825                         || original == LATIN_CAPITAL_LETTER_SHARP_S
3826 #endif
3827                     ) {
3828                         goto return_long_s;
3829                     }
3830                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3831                         goto return_ligature_st;
3832                     }
3833 #if    UNICODE_MAJOR_VERSION   == 3         \
3834     && UNICODE_DOT_VERSION     == 0         \
3835     && UNICODE_DOT_DOT_VERSION == 1
3836 
3837                     else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3838                         goto return_dotless_i;
3839                     }
3840 #endif
3841                     Copy(p, ustrp, *lenp, char);
3842                     return original;
3843                 }
3844                 s += UTF8SKIP(s);
3845             }
3846 
3847             /* Here, no characters crossed, result is ok as-is */
3848             return result;
3849         }
3850     }
3851 
3852     /* Here, used locale rules.  Convert back to UTF-8 */
3853     if (UTF8_IS_INVARIANT(result)) {
3854         *ustrp = (U8) result;
3855         *lenp = 1;
3856     }
3857     else {
3858         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3859         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3860         *lenp = 2;
3861     }
3862 
3863     return result;
3864 
3865   return_long_s:
3866     /* Certain folds to 'ss' are prohibited by the options, but they do allow
3867      * folds to a string of two of these characters.  By returning this
3868      * instead, then, e.g.,
3869      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3870      * works. */
3871 
3872     *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3873     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
3874         ustrp, *lenp, U8);
3875     return LATIN_SMALL_LETTER_LONG_S;
3876 
3877   return_ligature_st:
3878     /* Two folds to 'st' are prohibited by the options; instead we pick one and
3879      * have the other one fold to it */
3880 
3881     *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
3882     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3883     return LATIN_SMALL_LIGATURE_ST;
3884 
3885 #if    UNICODE_MAJOR_VERSION   == 3         \
3886     && UNICODE_DOT_VERSION     == 0         \
3887     && UNICODE_DOT_DOT_VERSION == 1
3888 
3889   return_dotless_i:
3890     *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
3891     Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3892     return LATIN_SMALL_LETTER_DOTLESS_I;
3893 
3894 #endif
3895 
3896 }
3897 
3898 bool
3899 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
3900 {
3901     /* May change: warns if surrogates, non-character code points, or
3902      * non-Unicode code points are in 's' which has length 'len' bytes.
3903      * Returns TRUE if none found; FALSE otherwise.  The only other validity
3904      * check is to make sure that this won't exceed the string's length nor
3905      * overflow */
3906 
3907     const U8* const e = s + len;
3908     bool ok = TRUE;
3909 
3910     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
3911 
3912     while (s < e) {
3913         if (UTF8SKIP(s) > len) {
3914             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
3915                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
3916             return FALSE;
3917         }
3918         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
3919             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
3920                 if (   ckWARN_d(WARN_NON_UNICODE)
3921                     || UNLIKELY(0 < does_utf8_overflow(s, s + len,
3922                                                0 /* Don't consider overlongs */
3923                                                )))
3924                 {
3925                     /* A side effect of this function will be to warn */
3926                     (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
3927                     ok = FALSE;
3928                 }
3929             }
3930             else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
3931                 if (ckWARN_d(WARN_SURROGATE)) {
3932                     /* This has a different warning than the one the called
3933                      * function would output, so can't just call it, unlike we
3934                      * do for the non-chars and above-unicodes */
3935                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
3936                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3937                         "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
3938                                              uv);
3939                     ok = FALSE;
3940                 }
3941             }
3942             else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
3943                      && (ckWARN_d(WARN_NONCHAR)))
3944             {
3945                 /* A side effect of this function will be to warn */
3946                 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
3947                 ok = FALSE;
3948             }
3949         }
3950         s += UTF8SKIP(s);
3951     }
3952 
3953     return ok;
3954 }
3955 
3956 /*
3957 =for apidoc pv_uni_display
3958 
3959 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
3960 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
3961 long (if longer, the rest is truncated and C<"..."> will be appended).
3962 
3963 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
3964 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
3965 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
3966 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
3967 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
3968 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
3969 
3970 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
3971 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
3972 
3973 The pointer to the PV of the C<dsv> is returned.
3974 
3975 See also L</sv_uni_display>.
3976 
3977 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH
3978 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE
3979 =for apidoc Amnh||UNI_DISPLAY_ISPRINT
3980 =for apidoc Amnh||UNI_DISPLAY_QQ
3981 =for apidoc Amnh||UNI_DISPLAY_REGEX
3982 =cut
3983 */
3984 char *
3985 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
3986                           UV flags)
3987 {
3988     int truncated = 0;
3989     const char *s, *e;
3990 
3991     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
3992 
3993     SvPVCLEAR(dsv);
3994     SvUTF8_off(dsv);
3995     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
3996          UV u;
3997          bool ok = 0;
3998 
3999          if (pvlim && SvCUR(dsv) >= pvlim) {
4000               truncated++;
4001               break;
4002          }
4003          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4004          if (u < 256) {
4005              const U8 c = (U8) u;
4006              if (flags & UNI_DISPLAY_BACKSLASH) {
4007                  if (    isMNEMONIC_CNTRL(c)
4008                      && (   c != '\b'
4009                          || (flags & UNI_DISPLAY_BACKSPACE)))
4010                  {
4011                     const char * mnemonic = cntrl_to_mnemonic(c);
4012                     sv_catpvn(dsv, mnemonic, strlen(mnemonic));
4013                     ok = 1;
4014                  }
4015                  else if (c == '\\') {
4016                     sv_catpvs(dsv, "\\\\");
4017                     ok = 1;
4018                  }
4019              }
4020              /* isPRINT() is the locale-blind version. */
4021              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4022                  const char string = c;
4023                  sv_catpvn(dsv, &string, 1);
4024                  ok = 1;
4025              }
4026          }
4027          if (!ok)
4028              Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
4029     }
4030     if (truncated)
4031          sv_catpvs(dsv, "...");
4032 
4033     return SvPVX(dsv);
4034 }
4035 
4036 /*
4037 =for apidoc sv_uni_display
4038 
4039 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4040 the displayable version being at most C<pvlim> bytes long
4041 (if longer, the rest is truncated and "..." will be appended).
4042 
4043 The C<flags> argument is as in L</pv_uni_display>().
4044 
4045 The pointer to the PV of the C<dsv> is returned.
4046 
4047 =cut
4048 */
4049 char *
4050 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4051 {
4052     const char * const ptr =
4053         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4054 
4055     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4056 
4057     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4058                                 SvCUR(ssv), pvlim, flags);
4059 }
4060 
4061 /*
4062 =for apidoc foldEQ_utf8
4063 
4064 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4065 both of which may be in UTF-8) are the same case-insensitively; false
4066 otherwise.  How far into the strings to compare is determined by other input
4067 parameters.
4068 
4069 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4070 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for
4071 C<u2> with respect to C<s2>.
4072 
4073 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4074 fold equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.
4075 The scan will not be considered to be a match unless the goal is reached, and
4076 scanning won't continue past that goal.  Correspondingly for C<l2> with respect
4077 to C<s2>.
4078 
4079 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4080 pointer is considered an end pointer to the position 1 byte past the maximum
4081 point in C<s1> beyond which scanning will not continue under any circumstances.
4082 (This routine assumes that UTF-8 encoded input strings are not malformed;
4083 malformed input can cause it to read past C<pe1>).  This means that if both
4084 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4085 will never be successful because it can never
4086 get as far as its goal (and in fact is asserted against).  Correspondingly for
4087 C<pe2> with respect to C<s2>.
4088 
4089 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4090 C<l2> must be non-zero), and if both do, both have to be
4091 reached for a successful match.   Also, if the fold of a character is multiple
4092 characters, all of them must be matched (see tr21 reference below for
4093 'folding').
4094 
4095 Upon a successful match, if C<pe1> is non-C<NULL>,
4096 it will be set to point to the beginning of the I<next> character of C<s1>
4097 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4098 
4099 For case-insensitiveness, the "casefolding" of Unicode is used
4100 instead of upper/lowercasing both the characters, see
4101 L<https://www.unicode.org/reports/tr21/> (Case Mappings).
4102 
4103 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4104 =for apidoc Cmnh||FOLDEQ_LOCALE
4105 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4106 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4107 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4108 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4109 
4110 =cut */
4111 
4112 /* A flags parameter has been added which may change, and hence isn't
4113  * externally documented.  Currently it is:
4114  *  0 for as-documented above
4115  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4116                             ASCII one, to not match
4117  *  FOLDEQ_LOCALE	    is set iff the rules from the current underlying
4118  *	                    locale are to be used.
4119  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4120  *                          routine.  This allows that step to be skipped.
4121  *                          Currently, this requires s1 to be encoded as UTF-8
4122  *                          (u1 must be true), which is asserted for.
4123  *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
4124  *                          cross certain boundaries.  Hence, the caller should
4125  *                          let this function do the folding instead of
4126  *                          pre-folding.  This code contains an assertion to
4127  *                          that effect.  However, if the caller knows what
4128  *                          it's doing, it can pass this flag to indicate that,
4129  *                          and the assertion is skipped.
4130  *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4131  *                          to s2, and s2 doesn't have to be UTF-8 encoded.
4132  *                          This introduces an asymmetry to save a few branches
4133  *                          in a loop.  Currently, this is not a problem, as
4134  *                          never are both inputs pre-folded.  Simply call this
4135  *                          function with the pre-folded one as the second
4136  *                          string.
4137  *  FOLDEQ_S2_FOLDS_SANE
4138  */
4139 
4140 I32
4141 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4142                              const char *s2, char **pe2, UV l2, bool u2,
4143                              U32 flags)
4144 {
4145     const U8 *p1  = (const U8*)s1; /* Point to current char */
4146     const U8 *p2  = (const U8*)s2;
4147     const U8 *g1 = NULL;       /* goal for s1 */
4148     const U8 *g2 = NULL;
4149     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4150     U8 *f1 = NULL;             /* Point to current folded */
4151     const U8 *e2 = NULL;
4152     U8 *f2 = NULL;
4153     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4154     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4155     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4156     U8 flags_for_folder = FOLD_FLAGS_FULL;
4157 
4158     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4159 
4160     assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4161                && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
4162                         && !(flags &  FOLDEQ_S1_FOLDS_SANE))
4163                     || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
4164                         && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
4165     /* The algorithm is to trial the folds without regard to the flags on
4166      * the first line of the above assert(), and then see if the result
4167      * violates them.  This means that the inputs can't be pre-folded to a
4168      * violating result, hence the assert.  This could be changed, with the
4169      * addition of extra tests here for the already-folded case, which would
4170      * slow it down.  That cost is more than any possible gain for when these
4171      * flags are specified, as the flags indicate /il or /iaa matching which
4172      * is less common than /iu, and I (khw) also believe that real-world /il
4173      * and /iaa matches are most likely to involve code points 0-255, and this
4174      * function only under rare conditions gets called for 0-255. */
4175 
4176     if (flags & FOLDEQ_LOCALE) {
4177         if (IN_UTF8_CTYPE_LOCALE) {
4178             if (UNLIKELY(PL_in_utf8_turkic_locale)) {
4179                 flags_for_folder |= FOLD_FLAGS_LOCALE;
4180             }
4181             else {
4182                 flags &= ~FOLDEQ_LOCALE;
4183             }
4184         }
4185         else {
4186             flags_for_folder |= FOLD_FLAGS_LOCALE;
4187         }
4188     }
4189     if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4190         flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4191     }
4192 
4193     if (pe1) {
4194         e1 = *(U8**)pe1;
4195     }
4196 
4197     if (l1) {
4198         g1 = (const U8*)s1 + l1;
4199     }
4200 
4201     if (pe2) {
4202         e2 = *(U8**)pe2;
4203     }
4204 
4205     if (l2) {
4206         g2 = (const U8*)s2 + l2;
4207     }
4208 
4209     /* Must have at least one goal */
4210     assert(g1 || g2);
4211 
4212     if (g1) {
4213 
4214         /* Will never match if goal is out-of-bounds */
4215         assert(! e1  || e1 >= g1);
4216 
4217         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4218         * only go as far as the goal */
4219         e1 = g1;
4220     }
4221     else {
4222         assert(e1);    /* Must have an end for looking at s1 */
4223     }
4224 
4225     /* Same for goal for s2 */
4226     if (g2) {
4227         assert(! e2  || e2 >= g2);
4228         e2 = g2;
4229     }
4230     else {
4231         assert(e2);
4232     }
4233 
4234     /* If both operands are already folded, we could just do a memEQ on the
4235      * whole strings at once, but it would be better if the caller realized
4236      * this and didn't even call us */
4237 
4238     /* Look through both strings, a character at a time */
4239     while (p1 < e1 && p2 < e2) {
4240 
4241         /* If at the beginning of a new character in s1, get its fold to use
4242          * and the length of the fold. */
4243         if (n1 == 0) {
4244             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4245                 f1 = (U8 *) p1;
4246                 assert(u1);
4247                 n1 = UTF8SKIP(f1);
4248             }
4249             else {
4250                 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4251 
4252                     /* We have to forbid mixing ASCII with non-ASCII if the
4253                      * flags so indicate.  And, we can short circuit having to
4254                      * call the general functions for this common ASCII case,
4255                      * all of whose non-locale folds are also ASCII, and hence
4256                      * UTF-8 invariants, so the UTF8ness of the strings is not
4257                      * relevant. */
4258                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4259                         return 0;
4260                     }
4261                     n1 = 1;
4262                     *foldbuf1 = toFOLD(*p1);
4263                 }
4264                 else if (u1) {
4265                     _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4266                 }
4267                 else {  /* Not UTF-8, get UTF-8 fold */
4268                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4269                 }
4270                 f1 = foldbuf1;
4271             }
4272         }
4273 
4274         if (n2 == 0) {    /* Same for s2 */
4275             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4276 
4277                 /* Point to the already-folded character.  But for non-UTF-8
4278                  * variants, convert to UTF-8 for the algorithm below */
4279                 if (UTF8_IS_INVARIANT(*p2)) {
4280                     f2 = (U8 *) p2;
4281                     n2 = 1;
4282                 }
4283                 else if (u2) {
4284                     f2 = (U8 *) p2;
4285                     n2 = UTF8SKIP(f2);
4286                 }
4287                 else {
4288                     foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4289                     foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4290                     f2 = foldbuf2;
4291                     n2 = 2;
4292                 }
4293             }
4294             else {
4295                 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4296                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4297                         return 0;
4298                     }
4299                     n2 = 1;
4300                     *foldbuf2 = toFOLD(*p2);
4301                 }
4302                 else if (u2) {
4303                     _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4304                 }
4305                 else {
4306                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4307                 }
4308                 f2 = foldbuf2;
4309             }
4310         }
4311 
4312         /* Here f1 and f2 point to the beginning of the strings to compare.
4313          * These strings are the folds of the next character from each input
4314          * string, stored in UTF-8. */
4315 
4316         /* While there is more to look for in both folds, see if they
4317         * continue to match */
4318         while (n1 && n2) {
4319             U8 fold_length = UTF8SKIP(f1);
4320             if (fold_length != UTF8SKIP(f2)
4321                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4322                                                        function call for single
4323                                                        byte */
4324                 || memNE((char*)f1, (char*)f2, fold_length))
4325             {
4326                 return 0; /* mismatch */
4327             }
4328 
4329             /* Here, they matched, advance past them */
4330             n1 -= fold_length;
4331             f1 += fold_length;
4332             n2 -= fold_length;
4333             f2 += fold_length;
4334         }
4335 
4336         /* When reach the end of any fold, advance the input past it */
4337         if (n1 == 0) {
4338             p1 += u1 ? UTF8SKIP(p1) : 1;
4339         }
4340         if (n2 == 0) {
4341             p2 += u2 ? UTF8SKIP(p2) : 1;
4342         }
4343     } /* End of loop through both strings */
4344 
4345     /* A match is defined by each scan that specified an explicit length
4346     * reaching its final goal, and the other not having matched a partial
4347     * character (which can happen when the fold of a character is more than one
4348     * character). */
4349     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4350         return 0;
4351     }
4352 
4353     /* Successful match.  Set output pointers */
4354     if (pe1) {
4355         *pe1 = (char*)p1;
4356     }
4357     if (pe2) {
4358         *pe2 = (char*)p2;
4359     }
4360     return 1;
4361 }
4362 
4363 /*
4364  * ex: set ts=8 sts=4 sw=4 et:
4365  */
4366