xref: /openbsd-src/gnu/usr.bin/perl/utf8.c (revision 0b7734b3d77bb9b21afec6f4621cae6c805dbd45)
1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
13  *  heard of that we don't want to see any closer; and that's the one place
14  *  we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17  *
18  * 'Well do I understand your speech,' he answered in the same language;
19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
20  *  as is the custom in the West, if you wish to be answered?'
21  *                           --Gandalf, addressing Théoden's door wardens
22  *
23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24  *
25  * ...the travellers perceived that the floor was paved with stones of many
26  * hues; branching runes and strange devices intertwined beneath their feet.
27  *
28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29  */
30 
31 #include "EXTERN.h"
32 #define PERL_IN_UTF8_C
33 #include "perl.h"
34 #include "inline_invlist.c"
35 #include "charclass_invlists.h"
36 
37 static const char unees[] =
38     "Malformed UTF-8 character (unexpected end of string)";
39 
40 /*
41 =head1 Unicode Support
42 
43 This file contains various utility functions for manipulating UTF8-encoded
44 strings.  For the uninitiated, this is a method of representing arbitrary
45 Unicode characters as a variable number of bytes, in such a way that
46 characters in the ASCII range are unmodified, and a zero byte never appears
47 within non-zero characters.
48 
49 =cut
50 */
51 
52 /*
53 =for apidoc is_ascii_string
54 
55 Returns true if the first C<len> bytes of the string C<s> are the same whether
56 or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
57 is, if they are invariant.  On ASCII-ish machines, only ASCII characters
58 fit this definition, hence the function's name.
59 
60 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
61 use this option, that C<s> can't have embedded C<NUL> characters and has to
62 have a terminating C<NUL> byte).
63 
64 See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
65 
66 =cut
67 */
68 
69 bool
70 Perl_is_ascii_string(const U8 *s, STRLEN len)
71 {
72     const U8* const send = s + (len ? len : strlen((const char *)s));
73     const U8* x = s;
74 
75     PERL_ARGS_ASSERT_IS_ASCII_STRING;
76 
77     for (; x < send; ++x) {
78 	if (!UTF8_IS_INVARIANT(*x))
79 	    break;
80     }
81 
82     return x == send;
83 }
84 
85 /*
86 =for apidoc uvoffuni_to_utf8_flags
87 
88 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
89 Instead, B<Almost all code should use L</uvchr_to_utf8> or
90 L</uvchr_to_utf8_flags>>.
91 
92 This function is like them, but the input is a strict Unicode
93 (as opposed to native) code point.  Only in very rare circumstances should code
94 not be using the native code point.
95 
96 For details, see the description for L</uvchr_to_utf8_flags>>.
97 
98 =cut
99 */
100 
101 U8 *
102 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
103 {
104     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
105 
106     if (UNI_IS_INVARIANT(uv)) {
107 	*d++ = (U8) LATIN1_TO_NATIVE(uv);
108 	return d;
109     }
110 
111     /* The first problematic code point is the first surrogate */
112     if (uv >= UNICODE_SURROGATE_FIRST
113         && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
114     {
115 	if (UNICODE_IS_SURROGATE(uv)) {
116 	    if (flags & UNICODE_WARN_SURROGATE) {
117 		Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
118 					    "UTF-16 surrogate U+%04"UVXf, uv);
119 	    }
120 	    if (flags & UNICODE_DISALLOW_SURROGATE) {
121 		return NULL;
122 	    }
123 	}
124 	else if (UNICODE_IS_SUPER(uv)) {
125 	    if (flags & UNICODE_WARN_SUPER
126 		|| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
127 	    {
128 		Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
129 			  "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
130 	    }
131 	    if (flags & UNICODE_DISALLOW_SUPER
132 		|| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
133 	    {
134 		return NULL;
135 	    }
136 	}
137 	else if (UNICODE_IS_NONCHAR(uv)) {
138 	    if (flags & UNICODE_WARN_NONCHAR) {
139 		Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
140 		 "Unicode non-character U+%04"UVXf" is illegal for open interchange",
141 		 uv);
142 	    }
143 	    if (flags & UNICODE_DISALLOW_NONCHAR) {
144 		return NULL;
145 	    }
146 	}
147     }
148 
149 #if defined(EBCDIC)
150     {
151 	STRLEN len  = OFFUNISKIP(uv);
152 	U8 *p = d+len-1;
153 	while (p > d) {
154 	    *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
155 	    uv >>= UTF_ACCUMULATION_SHIFT;
156 	}
157 	*p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
158 	return d+len;
159     }
160 #else /* Non loop style */
161     if (uv < 0x800) {
162 	*d++ = (U8)(( uv >>  6)         | 0xc0);
163 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
164 	return d;
165     }
166     if (uv < 0x10000) {
167 	*d++ = (U8)(( uv >> 12)         | 0xe0);
168 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
169 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
170 	return d;
171     }
172     if (uv < 0x200000) {
173 	*d++ = (U8)(( uv >> 18)         | 0xf0);
174 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
175 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
176 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
177 	return d;
178     }
179     if (uv < 0x4000000) {
180 	*d++ = (U8)(( uv >> 24)         | 0xf8);
181 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
182 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
183 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
184 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
185 	return d;
186     }
187     if (uv < 0x80000000) {
188 	*d++ = (U8)(( uv >> 30)         | 0xfc);
189 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
190 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
191 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
192 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
193 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
194 	return d;
195     }
196 #ifdef UTF8_QUAD_MAX
197     if (uv < UTF8_QUAD_MAX)
198 #endif
199     {
200 	*d++ =                            0xfe;	/* Can't match U+FEFF! */
201 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
202 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
203 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
204 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
205 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
206 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
207 	return d;
208     }
209 #ifdef UTF8_QUAD_MAX
210     {
211 	*d++ =                            0xff;		/* Can't match U+FFFE! */
212 	*d++ =                            0x80;		/* 6 Reserved bits */
213 	*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
214 	*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
215 	*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
216 	*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
217 	*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
218 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
219 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
220 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
221 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
222 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
223 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
224 	return d;
225     }
226 #endif
227 #endif /* Non loop style */
228 }
229 /*
230 =for apidoc uvchr_to_utf8
231 
232 Adds the UTF-8 representation of the native code point C<uv> to the end
233 of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
234 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
235 the byte after the end of the new character.  In other words,
236 
237     d = uvchr_to_utf8(d, uv);
238 
239 is the recommended wide native character-aware way of saying
240 
241     *(d++) = uv;
242 
243 This function accepts any UV as input.  To forbid or warn on non-Unicode code
244 points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
245 
246 =cut
247 */
248 
249 /* This is also a macro */
250 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
251 
252 U8 *
253 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
254 {
255     return uvchr_to_utf8(d, uv);
256 }
257 
258 /*
259 =for apidoc uvchr_to_utf8_flags
260 
261 Adds the UTF-8 representation of the native code point C<uv> to the end
262 of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
263 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
264 the byte after the end of the new character.  In other words,
265 
266     d = uvchr_to_utf8_flags(d, uv, flags);
267 
268 or, in most cases,
269 
270     d = uvchr_to_utf8_flags(d, uv, 0);
271 
272 This is the Unicode-aware way of saying
273 
274     *(d++) = uv;
275 
276 This function will convert to UTF-8 (and not warn) even code points that aren't
277 legal Unicode or are problematic, unless C<flags> contains one or more of the
278 following flags:
279 
280 If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
281 the function will raise a warning, provided UTF8 warnings are enabled.  If instead
282 UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
283 If both flags are set, the function will both warn and return NULL.
284 
285 The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
286 affect how the function handles a Unicode non-character.  And likewise, the
287 UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
288 code points that are
289 above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
290 even less portable) can be warned and/or disallowed even if other above-Unicode
291 code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
292 flags.
293 
294 And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
295 above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
296 DISALLOW flags.
297 
298 =cut
299 */
300 
301 /* This is also a macro */
302 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
303 
304 U8 *
305 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
306 {
307     return uvchr_to_utf8_flags(d, uv, flags);
308 }
309 
310 /*
311 
312 Tests if the first C<len> bytes of string C<s> form a valid UTF-8
313 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
314 valid UTF-8 character.  The number of bytes in the UTF-8 character
315 will be returned if it is valid, otherwise 0.
316 
317 This is the "slow" version as opposed to the "fast" version which is
318 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
319 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
320 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
321 you should use the _slow().  In practice this means that the _slow()
322 will be used very rarely, since the maximum Unicode code point (as of
323 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
324 the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into
325 five bytes or more.
326 
327 =cut */
328 PERL_STATIC_INLINE STRLEN
329 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
330 {
331     dTHX;   /* The function called below requires thread context */
332 
333     STRLEN actual_len;
334 
335     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
336 
337     utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
338 
339     return (actual_len == (STRLEN) -1) ? 0 : actual_len;
340 }
341 
342 /*
343 =for apidoc is_utf8_char_buf
344 
345 Returns the number of bytes that comprise the first UTF-8 encoded character in
346 buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
347 buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
348 encoded character.
349 
350 Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
351 machines) is a valid UTF-8 character.
352 
353 =cut */
354 
355 STRLEN
356 Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
357 {
358 
359     STRLEN len;
360 
361     PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
362 
363     if (buf_end <= buf) {
364 	return 0;
365     }
366 
367     len = buf_end - buf;
368     if (len > UTF8SKIP(buf)) {
369 	len = UTF8SKIP(buf);
370     }
371 
372     if (IS_UTF8_CHAR_FAST(len))
373         return IS_UTF8_CHAR(buf, len) ? len : 0;
374     return is_utf8_char_slow(buf, len);
375 }
376 
377 /*
378 =for apidoc is_utf8_char
379 
380 Tests if some arbitrary number of bytes begins in a valid UTF-8
381 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
382 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
383 character will be returned if it is valid, otherwise 0.
384 
385 This function is deprecated due to the possibility that malformed input could
386 cause reading beyond the end of the input buffer.  Use L</is_utf8_char_buf>
387 instead.
388 
389 =cut */
390 
391 STRLEN
392 Perl_is_utf8_char(const U8 *s)
393 {
394     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
395 
396     /* Assumes we have enough space, which is why this is deprecated */
397     return is_utf8_char_buf(s, s + UTF8SKIP(s));
398 }
399 
400 
401 /*
402 =for apidoc is_utf8_string
403 
404 Returns true if the first C<len> bytes of string C<s> form a valid
405 UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
406 using C<strlen(s)> (which means if you use this option, that C<s> can't have
407 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
408 that all characters being ASCII constitute 'a valid UTF-8 string'.
409 
410 See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
411 
412 =cut
413 */
414 
415 bool
416 Perl_is_utf8_string(const U8 *s, STRLEN len)
417 {
418     const U8* const send = s + (len ? len : strlen((const char *)s));
419     const U8* x = s;
420 
421     PERL_ARGS_ASSERT_IS_UTF8_STRING;
422 
423     while (x < send) {
424 	 /* Inline the easy bits of is_utf8_char() here for speed... */
425 	 if (UTF8_IS_INVARIANT(*x)) {
426 	    x++;
427 	 }
428 	 else {
429 	      /* ... and call is_utf8_char() only if really needed. */
430 	     const STRLEN c = UTF8SKIP(x);
431 	     const U8* const next_char_ptr = x + c;
432 
433 	     if (next_char_ptr > send) {
434 		 return FALSE;
435 	     }
436 
437 	     if (IS_UTF8_CHAR_FAST(c)) {
438 	         if (!IS_UTF8_CHAR(x, c))
439 		     return FALSE;
440 	     }
441 	     else if (! is_utf8_char_slow(x, c)) {
442 		 return FALSE;
443 	     }
444 	     x = next_char_ptr;
445 	 }
446     }
447 
448     return TRUE;
449 }
450 
451 /*
452 Implemented as a macro in utf8.h
453 
454 =for apidoc is_utf8_string_loc
455 
456 Like L</is_utf8_string> but stores the location of the failure (in the
457 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
458 "utf8ness success") in the C<ep>.
459 
460 See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
461 
462 =for apidoc is_utf8_string_loclen
463 
464 Like L</is_utf8_string>() but stores the location of the failure (in the
465 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
466 "utf8ness success") in the C<ep>, and the number of UTF-8
467 encoded characters in the C<el>.
468 
469 See also L</is_utf8_string_loc>() and L</is_utf8_string>().
470 
471 =cut
472 */
473 
474 bool
475 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
476 {
477     const U8* const send = s + (len ? len : strlen((const char *)s));
478     const U8* x = s;
479     STRLEN c;
480     STRLEN outlen = 0;
481 
482     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
483 
484     while (x < send) {
485 	 const U8* next_char_ptr;
486 
487 	 /* Inline the easy bits of is_utf8_char() here for speed... */
488 	 if (UTF8_IS_INVARIANT(*x))
489 	     next_char_ptr = x + 1;
490 	 else {
491 	     /* ... and call is_utf8_char() only if really needed. */
492 	     c = UTF8SKIP(x);
493 	     next_char_ptr = c + x;
494 	     if (next_char_ptr > send) {
495 		 goto out;
496 	     }
497 	     if (IS_UTF8_CHAR_FAST(c)) {
498 	         if (!IS_UTF8_CHAR(x, c))
499 		     c = 0;
500 	     } else
501 	         c = is_utf8_char_slow(x, c);
502 	     if (!c)
503 	         goto out;
504 	 }
505          x = next_char_ptr;
506 	 outlen++;
507     }
508 
509  out:
510     if (el)
511         *el = outlen;
512 
513     if (ep)
514         *ep = x;
515     return (x == send);
516 }
517 
518 /*
519 
520 =for apidoc utf8n_to_uvchr
521 
522 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
523 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
524 
525 Bottom level UTF-8 decode routine.
526 Returns the native code point value of the first character in the string C<s>,
527 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
528 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
529 the length, in bytes, of that character.
530 
531 The value of C<flags> determines the behavior when C<s> does not point to a
532 well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
533 zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
534 next possible position in C<s> that could begin a non-malformed character.
535 Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
536 
537 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
538 individual types of malformations, such as the sequence being overlong (that
539 is, when there is a shorter sequence that can express the same code point;
540 overlong sequences are expressly forbidden in the UTF-8 standard due to
541 potential security issues).  Another malformation example is the first byte of
542 a character not being a legal first byte.  See F<utf8.h> for the list of such
543 flags.  For allowed 0 length strings, this function returns 0; for allowed
544 overlong sequences, the computed code point is returned; for all other allowed
545 malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
546 determinable reasonable value.
547 
548 The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
549 flags) malformation is found.  If this flag is set, the routine assumes that
550 the caller will raise a warning, and this function will silently just set
551 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
552 
553 Note that this API requires disambiguation between successful decoding a C<NUL>
554 character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
555 in both cases, 0 is returned.  To disambiguate, upon a zero return, see if the
556 first byte of C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the
557 input had an error.
558 
559 Certain code points are considered problematic.  These are Unicode surrogates,
560 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
561 By default these are considered regular code points, but certain situations
562 warrant special handling for them.  If C<flags> contains
563 UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
564 malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
565 UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
566 maximum) can be set to disallow these categories individually.
567 
568 The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
569 UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
570 for their respective categories, but otherwise the code points are considered
571 valid (not malformations).  To get a category to both be treated as a
572 malformation and raise a warning, specify both the WARN and DISALLOW flags.
573 (But note that warnings are not raised if lexically disabled nor if
574 UTF8_CHECK_ONLY is also specified.)
575 
576 Very large code points (above 0x7FFF_FFFF) are considered more problematic than
577 the others that are above the Unicode legal maximum.  There are several
578 reasons: they requre at least 32 bits to represent them on ASCII platforms, are
579 not representable at all on EBCDIC platforms, and the original UTF-8
580 specification never went above this number (the current 0x10FFFF limit was
581 imposed later).  (The smaller ones, those that fit into 32 bits, are
582 representable by a UV on ASCII platforms, but not by an IV, which means that
583 the number of operations that can be performed on them is quite restricted.)
584 The UTF-8 encoding on ASCII platforms for these large code points begins with a
585 byte containing 0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to
586 be treated as malformations, while allowing smaller above-Unicode code points.
587 (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
588 including these, as malformations.)
589 Similarly, UTF8_WARN_FE_FF acts just like
590 the other WARN flags, but applies just to these code points.
591 
592 All other code points corresponding to Unicode characters, including private
593 use and those yet to be assigned, are never considered malformed and never
594 warn.
595 
596 =cut
597 */
598 
599 UV
600 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
601 {
602     dVAR;
603     const U8 * const s0 = s;
604     U8 overflow_byte = '\0';	/* Save byte in case of overflow */
605     U8 * send;
606     UV uv = *s;
607     STRLEN expectlen;
608     SV* sv = NULL;
609     UV outlier_ret = 0;	/* return value when input is in error or problematic
610 			 */
611     UV pack_warn = 0;	/* Save result of packWARN() for later */
612     bool unexpected_non_continuation = FALSE;
613     bool overflowed = FALSE;
614     bool do_overlong_test = TRUE;   /* May have to skip this test */
615 
616     const char* const malformed_text = "Malformed UTF-8 character";
617 
618     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
619 
620     /* The order of malformation tests here is important.  We should consume as
621      * few bytes as possible in order to not skip any valid character.  This is
622      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
623      * http://unicode.org/reports/tr36 for more discussion as to why.  For
624      * example, once we've done a UTF8SKIP, we can tell the expected number of
625      * bytes, and could fail right off the bat if the input parameters indicate
626      * that there are too few available.  But it could be that just that first
627      * byte is garbled, and the intended character occupies fewer bytes.  If we
628      * blindly assumed that the first byte is correct, and skipped based on
629      * that number, we could skip over a valid input character.  So instead, we
630      * always examine the sequence byte-by-byte.
631      *
632      * We also should not consume too few bytes, otherwise someone could inject
633      * things.  For example, an input could be deliberately designed to
634      * overflow, and if this code bailed out immediately upon discovering that,
635      * returning to the caller C<*retlen> pointing to the very next byte (one
636      * which is actually part of of the overflowing sequence), that could look
637      * legitimate to the caller, which could discard the initial partial
638      * sequence and process the rest, inappropriately */
639 
640     /* Zero length strings, if allowed, of necessity are zero */
641     if (UNLIKELY(curlen == 0)) {
642 	if (retlen) {
643 	    *retlen = 0;
644 	}
645 
646 	if (flags & UTF8_ALLOW_EMPTY) {
647 	    return 0;
648 	}
649 	if (! (flags & UTF8_CHECK_ONLY)) {
650 	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
651 	}
652 	goto malformed;
653     }
654 
655     expectlen = UTF8SKIP(s);
656 
657     /* A well-formed UTF-8 character, as the vast majority of calls to this
658      * function will be for, has this expected length.  For efficiency, set
659      * things up here to return it.  It will be overriden only in those rare
660      * cases where a malformation is found */
661     if (retlen) {
662 	*retlen = expectlen;
663     }
664 
665     /* An invariant is trivially well-formed */
666     if (UTF8_IS_INVARIANT(uv)) {
667 	return uv;
668     }
669 
670     /* A continuation character can't start a valid sequence */
671     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
672 	if (flags & UTF8_ALLOW_CONTINUATION) {
673 	    if (retlen) {
674 		*retlen = 1;
675 	    }
676 	    return UNICODE_REPLACEMENT;
677 	}
678 
679 	if (! (flags & UTF8_CHECK_ONLY)) {
680 	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
681 	}
682 	curlen = 1;
683 	goto malformed;
684     }
685 
686     /* Here is not a continuation byte, nor an invariant.  The only thing left
687      * is a start byte (possibly for an overlong) */
688 
689 #ifdef EBCDIC
690     uv = NATIVE_UTF8_TO_I8(uv);
691 #endif
692 
693     /* Remove the leading bits that indicate the number of bytes in the
694      * character's whole UTF-8 sequence, leaving just the bits that are part of
695      * the value */
696     uv &= UTF_START_MASK(expectlen);
697 
698     /* Now, loop through the remaining bytes in the character's sequence,
699      * accumulating each into the working value as we go.  Be sure to not look
700      * past the end of the input string */
701     send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
702 
703     for (s = s0 + 1; s < send; s++) {
704 	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
705 #ifndef EBCDIC	/* Can't overflow in EBCDIC */
706 	    if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
707 
708 		/* The original implementors viewed this malformation as more
709 		 * serious than the others (though I, khw, don't understand
710 		 * why, since other malformations also give very very wrong
711 		 * results), so there is no way to turn off checking for it.
712 		 * Set a flag, but keep going in the loop, so that we absorb
713 		 * the rest of the bytes that comprise the character. */
714 		overflowed = TRUE;
715 		overflow_byte = *s; /* Save for warning message's use */
716 	    }
717 #endif
718 	    uv = UTF8_ACCUMULATE(uv, *s);
719 	}
720 	else {
721 	    /* Here, found a non-continuation before processing all expected
722 	     * bytes.  This byte begins a new character, so quit, even if
723 	     * allowing this malformation. */
724 	    unexpected_non_continuation = TRUE;
725 	    break;
726 	}
727     } /* End of loop through the character's bytes */
728 
729     /* Save how many bytes were actually in the character */
730     curlen = s - s0;
731 
732     /* The loop above finds two types of malformations: non-continuation and/or
733      * overflow.  The non-continuation malformation is really a too-short
734      * malformation, as it means that the current character ended before it was
735      * expected to (being terminated prematurely by the beginning of the next
736      * character, whereas in the too-short malformation there just are too few
737      * bytes available to hold the character.  In both cases, the check below
738      * that we have found the expected number of bytes would fail if executed.)
739      * Thus the non-continuation malformation is really unnecessary, being a
740      * subset of the too-short malformation.  But there may be existing
741      * applications that are expecting the non-continuation type, so we retain
742      * it, and return it in preference to the too-short malformation.  (If this
743      * code were being written from scratch, the two types might be collapsed
744      * into one.)  I, khw, am also giving priority to returning the
745      * non-continuation and too-short malformations over overflow when multiple
746      * ones are present.  I don't know of any real reason to prefer one over
747      * the other, except that it seems to me that multiple-byte errors trumps
748      * errors from a single byte */
749     if (UNLIKELY(unexpected_non_continuation)) {
750 	if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
751 	    if (! (flags & UTF8_CHECK_ONLY)) {
752 		if (curlen == 1) {
753 		    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
754 		}
755 		else {
756 		    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
757 		}
758 	    }
759 	    goto malformed;
760 	}
761 	uv = UNICODE_REPLACEMENT;
762 
763 	/* Skip testing for overlongs, as the REPLACEMENT may not be the same
764 	 * as what the original expectations were. */
765 	do_overlong_test = FALSE;
766 	if (retlen) {
767 	    *retlen = curlen;
768 	}
769     }
770     else if (UNLIKELY(curlen < expectlen)) {
771 	if (! (flags & UTF8_ALLOW_SHORT)) {
772 	    if (! (flags & UTF8_CHECK_ONLY)) {
773 		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
774 	    }
775 	    goto malformed;
776 	}
777 	uv = UNICODE_REPLACEMENT;
778 	do_overlong_test = FALSE;
779 	if (retlen) {
780 	    *retlen = curlen;
781 	}
782     }
783 
784 #ifndef EBCDIC	/* EBCDIC can't overflow */
785     if (UNLIKELY(overflowed)) {
786 	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
787 	goto malformed;
788     }
789 #endif
790 
791     if (do_overlong_test
792 	&& expectlen > (STRLEN) OFFUNISKIP(uv)
793 	&& ! (flags & UTF8_ALLOW_LONG))
794     {
795 	/* The overlong malformation has lower precedence than the others.
796 	 * Note that if this malformation is allowed, we return the actual
797 	 * value, instead of the replacement character.  This is because this
798 	 * value is actually well-defined. */
799 	if (! (flags & UTF8_CHECK_ONLY)) {
800 	    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
801 	}
802 	goto malformed;
803     }
804 
805     /* Here, the input is considered to be well-formed, but it still could be a
806      * problematic code point that is not allowed by the input parameters. */
807     if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
808 	&& (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
809 		     |UTF8_WARN_ILLEGAL_INTERCHANGE)))
810     {
811 	if (UNICODE_IS_SURROGATE(uv)) {
812 
813             /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
814              * generation of the sv, since no warnings are raised under CHECK */
815 	    if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
816 		&& ckWARN_d(WARN_SURROGATE))
817 	    {
818 		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
819 		pack_warn = packWARN(WARN_SURROGATE);
820 	    }
821 	    if (flags & UTF8_DISALLOW_SURROGATE) {
822 		goto disallowed;
823 	    }
824 	}
825 	else if ((uv > PERL_UNICODE_MAX)) {
826 	    if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
827                 && ckWARN_d(WARN_NON_UNICODE))
828 	    {
829 		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
830 		pack_warn = packWARN(WARN_NON_UNICODE);
831 	    }
832 #ifndef EBCDIC	/* EBCDIC always allows FE, FF */
833 
834             /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
835              * points.  We test for these after the regular SUPER ones, and
836              * before possibly bailing out, so that the more dire warning
837              * overrides the regular one, if applicable */
838             if ((*s0 & 0xFE) == 0xFE	/* matches both FE, FF */
839                 && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
840             {
841                 if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
842                                                             == UTF8_WARN_FE_FF
843                     && ckWARN_d(WARN_UTF8))
844                 {
845                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
846                     pack_warn = packWARN(WARN_UTF8);
847                 }
848                 if (flags & UTF8_DISALLOW_FE_FF) {
849                     goto disallowed;
850                 }
851             }
852 #endif
853 	    if (flags & UTF8_DISALLOW_SUPER) {
854 		goto disallowed;
855 	    }
856 	}
857 	else if (UNICODE_IS_NONCHAR(uv)) {
858 	    if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
859 		&& ckWARN_d(WARN_NONCHAR))
860 	    {
861 		sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
862 		pack_warn = packWARN(WARN_NONCHAR);
863 	    }
864 	    if (flags & UTF8_DISALLOW_NONCHAR) {
865 		goto disallowed;
866 	    }
867 	}
868 
869 	if (sv) {
870             outlier_ret = uv;   /* Note we don't bother to convert to native,
871                                    as all the outlier code points are the same
872                                    in both ASCII and EBCDIC */
873 	    goto do_warn;
874 	}
875 
876 	/* Here, this is not considered a malformed character, so drop through
877 	 * to return it */
878     }
879 
880     return UNI_TO_NATIVE(uv);
881 
882     /* There are three cases which get to beyond this point.  In all 3 cases:
883      * <sv>	    if not null points to a string to print as a warning.
884      * <curlen>	    is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
885      *		    set.
886      * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
887      *		    This is done by initializing it to 0, and changing it only
888      *		    for case 1).
889      * The 3 cases are:
890      * 1)   The input is valid but problematic, and to be warned about.  The
891      *	    return value is the resultant code point; <*retlen> is set to
892      *	    <curlen>, the number of bytes that comprise the code point.
893      *	    <pack_warn> contains the result of packWARN() for the warning
894      *	    types.  The entry point for this case is the label <do_warn>;
895      * 2)   The input is a valid code point but disallowed by the parameters to
896      *	    this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
897      *	    <*relen> is -1; otherwise it is <curlen>, the number of bytes that
898      *	    comprise the code point.  <pack_warn> contains the result of
899      *	    packWARN() for the warning types.  The entry point for this case is
900      *	    the label <disallowed>.
901      * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
902      *	    is set, <*relen> is -1; otherwise it is <curlen>, the number of
903      *	    bytes that comprise the malformation.  All such malformations are
904      *	    assumed to be warning type <utf8>.  The entry point for this case
905      *	    is the label <malformed>.
906      */
907 
908 malformed:
909 
910     if (sv && ckWARN_d(WARN_UTF8)) {
911 	pack_warn = packWARN(WARN_UTF8);
912     }
913 
914 disallowed:
915 
916     if (flags & UTF8_CHECK_ONLY) {
917 	if (retlen)
918 	    *retlen = ((STRLEN) -1);
919 	return 0;
920     }
921 
922 do_warn:
923 
924     if (pack_warn) {	/* <pack_warn> was initialized to 0, and changed only
925 			   if warnings are to be raised. */
926 	const char * const string = SvPVX_const(sv);
927 
928 	if (PL_op)
929 	    Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
930 	else
931 	    Perl_warner(aTHX_ pack_warn, "%s", string);
932     }
933 
934     if (retlen) {
935 	*retlen = curlen;
936     }
937 
938     return outlier_ret;
939 }
940 
941 /*
942 =for apidoc utf8_to_uvchr_buf
943 
944 Returns the native code point of the first character in the string C<s> which
945 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
946 C<*retlen> will be set to the length, in bytes, of that character.
947 
948 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
949 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
950 NULL) to -1.  If those warnings are off, the computed value, if well-defined
951 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
952 C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
953 the next possible position in C<s> that could begin a non-malformed character.
954 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
955 returned.
956 
957 =cut
958 */
959 
960 
961 UV
962 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
963 {
964     assert(s < send);
965 
966     return utf8n_to_uvchr(s, send - s, retlen,
967 			  ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
968 }
969 
970 /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
971  * there are no malformations in the input UTF-8 string C<s>.  surrogates,
972  * non-character code points, and non-Unicode code points are allowed. */
973 
974 UV
975 Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
976 {
977     UV expectlen = UTF8SKIP(s);
978     const U8* send = s + expectlen;
979     UV uv = *s;
980 
981     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
982 
983     if (retlen) {
984         *retlen = expectlen;
985     }
986 
987     /* An invariant is trivially returned */
988     if (expectlen == 1) {
989 	return uv;
990     }
991 
992 #ifdef EBCDIC
993     uv = NATIVE_UTF8_TO_I8(uv);
994 #endif
995 
996     /* Remove the leading bits that indicate the number of bytes, leaving just
997      * the bits that are part of the value */
998     uv &= UTF_START_MASK(expectlen);
999 
1000     /* Now, loop through the remaining bytes, accumulating each into the
1001      * working total as we go.  (I khw tried unrolling the loop for up to 4
1002      * bytes, but there was no performance improvement) */
1003     for (++s; s < send; s++) {
1004         uv = UTF8_ACCUMULATE(uv, *s);
1005     }
1006 
1007     return UNI_TO_NATIVE(uv);
1008 
1009 }
1010 
1011 /*
1012 =for apidoc utf8_to_uvchr
1013 
1014 Returns the native code point of the first character in the string C<s>
1015 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1016 length, in bytes, of that character.
1017 
1018 Some, but not all, UTF-8 malformations are detected, and in fact, some
1019 malformed input could cause reading beyond the end of the input buffer, which
1020 is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
1021 
1022 If C<s> points to one of the detected malformations, and UTF8 warnings are
1023 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1024 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
1025 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1026 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1027 next possible position in C<s> that could begin a non-malformed character.
1028 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1029 
1030 =cut
1031 */
1032 
1033 UV
1034 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
1035 {
1036     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
1037 
1038     return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
1039 }
1040 
1041 /*
1042 =for apidoc utf8_to_uvuni_buf
1043 
1044 Only in very rare circumstances should code need to be dealing in Unicode
1045 (as opposed to native) code points.  In those few cases, use
1046 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
1047 
1048 Returns the Unicode (not-native) code point of the first character in the
1049 string C<s> which
1050 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1051 C<retlen> will be set to the length, in bytes, of that character.
1052 
1053 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1054 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1055 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
1056 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1057 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1058 next possible position in C<s> that could begin a non-malformed character.
1059 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1060 
1061 =cut
1062 */
1063 
1064 UV
1065 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1066 {
1067     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1068 
1069     assert(send > s);
1070 
1071     /* Call the low level routine asking for checks */
1072     return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
1073 			       ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
1074 }
1075 
1076 /* DEPRECATED!
1077  * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
1078  * there are no malformations in the input UTF-8 string C<s>.  Surrogates,
1079  * non-character code points, and non-Unicode code points are allowed */
1080 
1081 UV
1082 Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1083 {
1084     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
1085 
1086     return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1087 }
1088 
1089 /*
1090 =for apidoc utf8_to_uvuni
1091 
1092 Returns the Unicode code point of the first character in the string C<s>
1093 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1094 length, in bytes, of that character.
1095 
1096 Some, but not all, UTF-8 malformations are detected, and in fact, some
1097 malformed input could cause reading beyond the end of the input buffer, which
1098 is one reason why this function is deprecated.  The other is that only in
1099 extremely limited circumstances should the Unicode versus native code point be
1100 of any interest to you.  See L</utf8_to_uvuni_buf> for alternatives.
1101 
1102 If C<s> points to one of the detected malformations, and UTF8 warnings are
1103 enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
1104 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
1105 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1106 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1107 next possible position in C<s> that could begin a non-malformed character.
1108 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1109 
1110 =cut
1111 */
1112 
1113 UV
1114 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1115 {
1116     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
1117 
1118     return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
1119 }
1120 
1121 /*
1122 =for apidoc utf8_length
1123 
1124 Return the length of the UTF-8 char encoded string C<s> in characters.
1125 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
1126 up past C<e>, croaks.
1127 
1128 =cut
1129 */
1130 
1131 STRLEN
1132 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
1133 {
1134     dVAR;
1135     STRLEN len = 0;
1136 
1137     PERL_ARGS_ASSERT_UTF8_LENGTH;
1138 
1139     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1140      * the bitops (especially ~) can create illegal UTF-8.
1141      * In other words: in Perl UTF-8 is not just for Unicode. */
1142 
1143     if (e < s)
1144 	goto warn_and_return;
1145     while (s < e) {
1146         s += UTF8SKIP(s);
1147 	len++;
1148     }
1149 
1150     if (e != s) {
1151 	len--;
1152         warn_and_return:
1153 	if (PL_op)
1154 	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1155 			     "%s in %s", unees, OP_DESC(PL_op));
1156 	else
1157 	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1158     }
1159 
1160     return len;
1161 }
1162 
1163 /*
1164 =for apidoc utf8_distance
1165 
1166 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1167 and C<b>.
1168 
1169 WARNING: use only if you *know* that the pointers point inside the
1170 same UTF-8 buffer.
1171 
1172 =cut
1173 */
1174 
1175 IV
1176 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1177 {
1178     PERL_ARGS_ASSERT_UTF8_DISTANCE;
1179 
1180     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1181 }
1182 
1183 /*
1184 =for apidoc utf8_hop
1185 
1186 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1187 forward or backward.
1188 
1189 WARNING: do not use the following unless you *know* C<off> is within
1190 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1191 on the first byte of character or just after the last byte of a character.
1192 
1193 =cut
1194 */
1195 
1196 U8 *
1197 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
1198 {
1199     PERL_ARGS_ASSERT_UTF8_HOP;
1200 
1201     PERL_UNUSED_CONTEXT;
1202     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1203      * the bitops (especially ~) can create illegal UTF-8.
1204      * In other words: in Perl UTF-8 is not just for Unicode. */
1205 
1206     if (off >= 0) {
1207 	while (off--)
1208 	    s += UTF8SKIP(s);
1209     }
1210     else {
1211 	while (off++) {
1212 	    s--;
1213 	    while (UTF8_IS_CONTINUATION(*s))
1214 		s--;
1215 	}
1216     }
1217     return (U8 *)s;
1218 }
1219 
1220 /*
1221 =for apidoc bytes_cmp_utf8
1222 
1223 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1224 sequence of characters (stored as UTF-8)
1225 in C<u>, C<ulen>.  Returns 0 if they are
1226 equal, -1 or -2 if the first string is less than the second string, +1 or +2
1227 if the first string is greater than the second string.
1228 
1229 -1 or +1 is returned if the shorter string was identical to the start of the
1230 longer string.  -2 or +2 is returned if
1231 there was a difference between characters
1232 within the strings.
1233 
1234 =cut
1235 */
1236 
1237 int
1238 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1239 {
1240     const U8 *const bend = b + blen;
1241     const U8 *const uend = u + ulen;
1242 
1243     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1244 
1245     PERL_UNUSED_CONTEXT;
1246 
1247     while (b < bend && u < uend) {
1248         U8 c = *u++;
1249 	if (!UTF8_IS_INVARIANT(c)) {
1250 	    if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1251 		if (u < uend) {
1252 		    U8 c1 = *u++;
1253 		    if (UTF8_IS_CONTINUATION(c1)) {
1254 			c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
1255 		    } else {
1256 			Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1257 					 "Malformed UTF-8 character "
1258 					 "(unexpected non-continuation byte 0x%02x"
1259 					 ", immediately after start byte 0x%02x)"
1260 					 /* Dear diag.t, it's in the pod.  */
1261 					 "%s%s", c1, c,
1262 					 PL_op ? " in " : "",
1263 					 PL_op ? OP_DESC(PL_op) : "");
1264 			return -2;
1265 		    }
1266 		} else {
1267 		    if (PL_op)
1268 			Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1269 					 "%s in %s", unees, OP_DESC(PL_op));
1270 		    else
1271 			Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1272 		    return -2; /* Really want to return undef :-)  */
1273 		}
1274 	    } else {
1275 		return -2;
1276 	    }
1277 	}
1278 	if (*b != c) {
1279 	    return *b < c ? -2 : +2;
1280 	}
1281 	++b;
1282     }
1283 
1284     if (b == bend && u == uend)
1285 	return 0;
1286 
1287     return b < bend ? +1 : -1;
1288 }
1289 
1290 /*
1291 =for apidoc utf8_to_bytes
1292 
1293 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1294 Unlike L</bytes_to_utf8>, this over-writes the original string, and
1295 updates C<len> to contain the new length.
1296 Returns zero on failure, setting C<len> to -1.
1297 
1298 If you need a copy of the string, see L</bytes_from_utf8>.
1299 
1300 =cut
1301 */
1302 
1303 U8 *
1304 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
1305 {
1306     U8 * const save = s;
1307     U8 * const send = s + *len;
1308     U8 *d;
1309 
1310     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1311 
1312     /* ensure valid UTF-8 and chars < 256 before updating string */
1313     while (s < send) {
1314         if (! UTF8_IS_INVARIANT(*s)) {
1315             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1316                 *len = ((STRLEN) -1);
1317                 return 0;
1318             }
1319             s++;
1320         }
1321         s++;
1322     }
1323 
1324     d = s = save;
1325     while (s < send) {
1326 	U8 c = *s++;
1327 	if (! UTF8_IS_INVARIANT(c)) {
1328 	    /* Then it is two-byte encoded */
1329 	    c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1330             s++;
1331 	}
1332 	*d++ = c;
1333     }
1334     *d = '\0';
1335     *len = d - save;
1336     return save;
1337 }
1338 
1339 /*
1340 =for apidoc bytes_from_utf8
1341 
1342 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1343 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
1344 the newly-created string, and updates C<len> to contain the new
1345 length.  Returns the original string if no conversion occurs, C<len>
1346 is unchanged.  Do nothing if C<is_utf8> points to 0.  Sets C<is_utf8> to
1347 0 if C<s> is converted or consisted entirely of characters that are invariant
1348 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
1349 
1350 =cut
1351 */
1352 
1353 U8 *
1354 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
1355 {
1356     U8 *d;
1357     const U8 *start = s;
1358     const U8 *send;
1359     I32 count = 0;
1360 
1361     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1362 
1363     PERL_UNUSED_CONTEXT;
1364     if (!*is_utf8)
1365         return (U8 *)start;
1366 
1367     /* ensure valid UTF-8 and chars < 256 before converting string */
1368     for (send = s + *len; s < send;) {
1369         if (! UTF8_IS_INVARIANT(*s)) {
1370             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1371                 return (U8 *)start;
1372             }
1373             count++;
1374             s++;
1375 	}
1376         s++;
1377     }
1378 
1379     *is_utf8 = FALSE;
1380 
1381     Newx(d, (*len) - count + 1, U8);
1382     s = start; start = d;
1383     while (s < send) {
1384 	U8 c = *s++;
1385 	if (! UTF8_IS_INVARIANT(c)) {
1386 	    /* Then it is two-byte encoded */
1387 	    c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1388             s++;
1389 	}
1390 	*d++ = c;
1391     }
1392     *d = '\0';
1393     *len = d - start;
1394     return (U8 *)start;
1395 }
1396 
1397 /*
1398 =for apidoc bytes_to_utf8
1399 
1400 Converts a string C<s> of length C<len> bytes from the native encoding into
1401 UTF-8.
1402 Returns a pointer to the newly-created string, and sets C<len> to
1403 reflect the new length in bytes.
1404 
1405 A C<NUL> character will be written after the end of the string.
1406 
1407 If you want to convert to UTF-8 from encodings other than
1408 the native (Latin1 or EBCDIC),
1409 see L</sv_recode_to_utf8>().
1410 
1411 =cut
1412 */
1413 
1414 /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1415    likewise need duplication. */
1416 
1417 U8*
1418 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
1419 {
1420     const U8 * const send = s + (*len);
1421     U8 *d;
1422     U8 *dst;
1423 
1424     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1425     PERL_UNUSED_CONTEXT;
1426 
1427     Newx(d, (*len) * 2 + 1, U8);
1428     dst = d;
1429 
1430     while (s < send) {
1431         append_utf8_from_native_byte(*s, &d);
1432         s++;
1433     }
1434     *d = '\0';
1435     *len = d-dst;
1436     return dst;
1437 }
1438 
1439 /*
1440  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1441  *
1442  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
1443  * We optimize for native, for obvious reasons. */
1444 
1445 U8*
1446 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1447 {
1448     U8* pend;
1449     U8* dstart = d;
1450 
1451     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1452 
1453     if (bytelen & 1)
1454 	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1455 
1456     pend = p + bytelen;
1457 
1458     while (p < pend) {
1459 	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1460 	p += 2;
1461 	if (UNI_IS_INVARIANT(uv)) {
1462 	    *d++ = LATIN1_TO_NATIVE((U8) uv);
1463 	    continue;
1464 	}
1465 	if (uv <= MAX_UTF8_TWO_BYTE) {
1466 	    *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1467 	    *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
1468 	    continue;
1469 	}
1470 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1471 #define LAST_HIGH_SURROGATE  0xDBFF
1472 #define FIRST_LOW_SURROGATE  0xDC00
1473 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
1474 	if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
1475 	    if (p >= pend) {
1476 		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1477 	    } else {
1478 		UV low = (p[0] << 8) + p[1];
1479 		p += 2;
1480 		if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
1481 		    Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1482 		uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1483                                        + (low - FIRST_LOW_SURROGATE) + 0x10000;
1484 	    }
1485 	} else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
1486 	    Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1487 	}
1488 #ifdef EBCDIC
1489         d = uvoffuni_to_utf8_flags(d, uv, 0);
1490 #else
1491 	if (uv < 0x10000) {
1492 	    *d++ = (U8)(( uv >> 12)         | 0xe0);
1493 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1494 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
1495 	    continue;
1496 	}
1497 	else {
1498 	    *d++ = (U8)(( uv >> 18)         | 0xf0);
1499 	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1500 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1501 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
1502 	    continue;
1503 	}
1504 #endif
1505     }
1506     *newlen = d - dstart;
1507     return d;
1508 }
1509 
1510 /* Note: this one is slightly destructive of the source. */
1511 
1512 U8*
1513 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1514 {
1515     U8* s = (U8*)p;
1516     U8* const send = s + bytelen;
1517 
1518     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1519 
1520     if (bytelen & 1)
1521 	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1522 		   (UV)bytelen);
1523 
1524     while (s < send) {
1525 	const U8 tmp = s[0];
1526 	s[0] = s[1];
1527 	s[1] = tmp;
1528 	s += 2;
1529     }
1530     return utf16_to_utf8(p, d, bytelen, newlen);
1531 }
1532 
1533 bool
1534 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1535 {
1536     U8 tmpbuf[UTF8_MAXBYTES+1];
1537     uvchr_to_utf8(tmpbuf, c);
1538     return _is_utf8_FOO(classnum, tmpbuf);
1539 }
1540 
1541 /* Internal function so we can deprecate the external one, and call
1542    this one from other deprecated functions in this file */
1543 
1544 PERL_STATIC_INLINE bool
1545 S_is_utf8_idfirst(pTHX_ const U8 *p)
1546 {
1547     dVAR;
1548 
1549     if (*p == '_')
1550 	return TRUE;
1551     /* is_utf8_idstart would be more logical. */
1552     return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
1553 }
1554 
1555 bool
1556 Perl_is_uni_idfirst(pTHX_ UV c)
1557 {
1558     U8 tmpbuf[UTF8_MAXBYTES+1];
1559     uvchr_to_utf8(tmpbuf, c);
1560     return S_is_utf8_idfirst(aTHX_ tmpbuf);
1561 }
1562 
1563 bool
1564 Perl__is_uni_perl_idcont(pTHX_ UV c)
1565 {
1566     U8 tmpbuf[UTF8_MAXBYTES+1];
1567     uvchr_to_utf8(tmpbuf, c);
1568     return _is_utf8_perl_idcont(tmpbuf);
1569 }
1570 
1571 bool
1572 Perl__is_uni_perl_idstart(pTHX_ UV c)
1573 {
1574     U8 tmpbuf[UTF8_MAXBYTES+1];
1575     uvchr_to_utf8(tmpbuf, c);
1576     return _is_utf8_perl_idstart(tmpbuf);
1577 }
1578 
1579 UV
1580 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1581 {
1582     /* We have the latin1-range values compiled into the core, so just use
1583      * those, converting the result to utf8.  The only difference between upper
1584      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1585      * either "SS" or "Ss".  Which one to use is passed into the routine in
1586      * 'S_or_s' to avoid a test */
1587 
1588     UV converted = toUPPER_LATIN1_MOD(c);
1589 
1590     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1591 
1592     assert(S_or_s == 'S' || S_or_s == 's');
1593 
1594     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
1595 					     characters in this range */
1596 	*p = (U8) converted;
1597 	*lenp = 1;
1598 	return converted;
1599     }
1600 
1601     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1602      * which it maps to one of them, so as to only have to have one check for
1603      * it in the main case */
1604     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1605 	switch (c) {
1606 	    case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1607 		converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1608 		break;
1609 	    case MICRO_SIGN:
1610 		converted = GREEK_CAPITAL_LETTER_MU;
1611 		break;
1612 	    case LATIN_SMALL_LETTER_SHARP_S:
1613 		*(p)++ = 'S';
1614 		*p = S_or_s;
1615 		*lenp = 2;
1616 		return 'S';
1617 	    default:
1618 		Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1619 		assert(0); /* NOTREACHED */
1620 	}
1621     }
1622 
1623     *(p)++ = UTF8_TWO_BYTE_HI(converted);
1624     *p = UTF8_TWO_BYTE_LO(converted);
1625     *lenp = 2;
1626 
1627     return converted;
1628 }
1629 
1630 /* Call the function to convert a UTF-8 encoded character to the specified case.
1631  * Note that there may be more than one character in the result.
1632  * INP is a pointer to the first byte of the input character
1633  * OUTP will be set to the first byte of the string of changed characters.  It
1634  *	needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1635  * LENP will be set to the length in bytes of the string of changed characters
1636  *
1637  * The functions return the ordinal of the first character in the string of OUTP */
1638 #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
1639 #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
1640 #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")
1641 
1642 /* This additionally has the input parameter SPECIALS, which if non-zero will
1643  * cause this to use the SPECIALS hash for folding (meaning get full case
1644  * folding); otherwise, when zero, this implies a simple case fold */
1645 #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL)
1646 
1647 UV
1648 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1649 {
1650     dVAR;
1651 
1652     /* Convert the Unicode character whose ordinal is <c> to its uppercase
1653      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1654      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1655      * the changed version may be longer than the original character.
1656      *
1657      * The ordinal of the first character of the changed version is returned
1658      * (but note, as explained above, that there may be more.) */
1659 
1660     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1661 
1662     if (c < 256) {
1663 	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1664     }
1665 
1666     uvchr_to_utf8(p, c);
1667     return CALL_UPPER_CASE(p, p, lenp);
1668 }
1669 
1670 UV
1671 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1672 {
1673     dVAR;
1674 
1675     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1676 
1677     if (c < 256) {
1678 	return _to_upper_title_latin1((U8) c, p, lenp, 's');
1679     }
1680 
1681     uvchr_to_utf8(p, c);
1682     return CALL_TITLE_CASE(p, p, lenp);
1683 }
1684 
1685 STATIC U8
1686 S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1687 {
1688     /* We have the latin1-range values compiled into the core, so just use
1689      * those, converting the result to utf8.  Since the result is always just
1690      * one character, we allow <p> to be NULL */
1691 
1692     U8 converted = toLOWER_LATIN1(c);
1693 
1694     if (p != NULL) {
1695 	if (NATIVE_BYTE_IS_INVARIANT(converted)) {
1696 	    *p = converted;
1697 	    *lenp = 1;
1698 	}
1699 	else {
1700 	    *p = UTF8_TWO_BYTE_HI(converted);
1701 	    *(p+1) = UTF8_TWO_BYTE_LO(converted);
1702 	    *lenp = 2;
1703 	}
1704     }
1705     return converted;
1706 }
1707 
1708 UV
1709 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1710 {
1711     dVAR;
1712 
1713     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1714 
1715     if (c < 256) {
1716 	return to_lower_latin1((U8) c, p, lenp);
1717     }
1718 
1719     uvchr_to_utf8(p, c);
1720     return CALL_LOWER_CASE(p, p, lenp);
1721 }
1722 
1723 UV
1724 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
1725 {
1726     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1727      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1728      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
1729      *
1730      *	Not to be used for locale folds
1731      */
1732 
1733     UV converted;
1734 
1735     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1736 
1737     assert (! (flags & FOLD_FLAGS_LOCALE));
1738 
1739     if (c == MICRO_SIGN) {
1740 	converted = GREEK_SMALL_LETTER_MU;
1741     }
1742     else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1743 
1744         /* If can't cross 127/128 boundary, can't return "ss"; instead return
1745          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1746          * under those circumstances. */
1747         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1748             *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1749             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1750                  p, *lenp, U8);
1751             return LATIN_SMALL_LETTER_LONG_S;
1752         }
1753         else {
1754             *(p)++ = 's';
1755             *p = 's';
1756             *lenp = 2;
1757             return 's';
1758         }
1759     }
1760     else { /* In this range the fold of all other characters is their lower
1761               case */
1762 	converted = toLOWER_LATIN1(c);
1763     }
1764 
1765     if (UVCHR_IS_INVARIANT(converted)) {
1766 	*p = (U8) converted;
1767 	*lenp = 1;
1768     }
1769     else {
1770 	*(p)++ = UTF8_TWO_BYTE_HI(converted);
1771 	*p = UTF8_TWO_BYTE_LO(converted);
1772 	*lenp = 2;
1773     }
1774 
1775     return converted;
1776 }
1777 
1778 UV
1779 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
1780 {
1781 
1782     /* Not currently externally documented, and subject to change
1783      *  <flags> bits meanings:
1784      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
1785      *	    FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
1786      *	                      locale are to be used.
1787      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1788      */
1789 
1790     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1791 
1792     /* Tread a UTF-8 locale as not being in locale at all */
1793     if (IN_UTF8_CTYPE_LOCALE) {
1794         flags &= ~FOLD_FLAGS_LOCALE;
1795     }
1796 
1797     if (c < 256) {
1798 	UV result = _to_fold_latin1((U8) c, p, lenp,
1799 			    flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
1800 	/* It is illegal for the fold to cross the 255/256 boundary under
1801 	 * locale; in this case return the original */
1802 	return (result > 256 && flags & FOLD_FLAGS_LOCALE)
1803 	       ? c
1804 	       : result;
1805     }
1806 
1807     /* If no special needs, just use the macro */
1808     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1809 	uvchr_to_utf8(p, c);
1810 	return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1811     }
1812     else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1813 	       the special flags. */
1814 	U8 utf8_c[UTF8_MAXBYTES + 1];
1815 	uvchr_to_utf8(utf8_c, c);
1816 	return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
1817     }
1818 }
1819 
1820 PERL_STATIC_INLINE bool
1821 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1822 		 const char *const swashname, SV* const invlist)
1823 {
1824     /* returns a boolean giving whether or not the UTF8-encoded character that
1825      * starts at <p> is in the swash indicated by <swashname>.  <swash>
1826      * contains a pointer to where the swash indicated by <swashname>
1827      * is to be stored; which this routine will do, so that future calls will
1828      * look at <*swash> and only generate a swash if it is not null.  <invlist>
1829      * is NULL or an inversion list that defines the swash.  If not null, it
1830      * saves time during initialization of the swash.
1831      *
1832      * Note that it is assumed that the buffer length of <p> is enough to
1833      * contain all the bytes that comprise the character.  Thus, <*p> should
1834      * have been checked before this call for mal-formedness enough to assure
1835      * that. */
1836 
1837     dVAR;
1838 
1839     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1840 
1841     /* The API should have included a length for the UTF-8 character in <p>,
1842      * but it doesn't.  We therefore assume that p has been validated at least
1843      * as far as there being enough bytes available in it to accommodate the
1844      * character without reading beyond the end, and pass that number on to the
1845      * validating routine */
1846     if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
1847         if (ckWARN_d(WARN_UTF8)) {
1848             Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
1849 		    "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
1850             if (ckWARN(WARN_UTF8)) {    /* This will output details as to the
1851                                            what the malformation is */
1852                 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
1853             }
1854         }
1855         return FALSE;
1856     }
1857     if (!*swash) {
1858         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1859         *swash = _core_swash_init("utf8",
1860 
1861                                   /* Only use the name if there is no inversion
1862                                    * list; otherwise will go out to disk */
1863                                   (invlist) ? "" : swashname,
1864 
1865                                   &PL_sv_undef, 1, 0, invlist, &flags);
1866     }
1867 
1868     return swash_fetch(*swash, p, TRUE) != 0;
1869 }
1870 
1871 bool
1872 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
1873 {
1874     dVAR;
1875 
1876     PERL_ARGS_ASSERT__IS_UTF8_FOO;
1877 
1878     assert(classnum < _FIRST_NON_SWASH_CC);
1879 
1880     return is_utf8_common(p,
1881                           &PL_utf8_swash_ptrs[classnum],
1882                           swash_property_names[classnum],
1883                           PL_XPosix_ptrs[classnum]);
1884 }
1885 
1886 bool
1887 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1888 {
1889     dVAR;
1890 
1891     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1892 
1893     return S_is_utf8_idfirst(aTHX_ p);
1894 }
1895 
1896 bool
1897 Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
1898 {
1899     dVAR;
1900 
1901     PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
1902 
1903     if (*p == '_')
1904 	return TRUE;
1905     /* is_utf8_idstart would be more logical. */
1906     return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
1907 }
1908 
1909 bool
1910 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
1911 {
1912     dVAR;
1913     SV* invlist = NULL;
1914 
1915     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
1916 
1917     if (! PL_utf8_perl_idstart) {
1918         invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
1919     }
1920     return is_utf8_common(p, &PL_utf8_perl_idstart, "", invlist);
1921 }
1922 
1923 bool
1924 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
1925 {
1926     dVAR;
1927     SV* invlist = NULL;
1928 
1929     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
1930 
1931     if (! PL_utf8_perl_idcont) {
1932         invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
1933     }
1934     return is_utf8_common(p, &PL_utf8_perl_idcont, "", invlist);
1935 }
1936 
1937 
1938 bool
1939 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1940 {
1941     dVAR;
1942 
1943     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1944 
1945     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
1946 }
1947 
1948 bool
1949 Perl_is_utf8_xidcont(pTHX_ const U8 *p)
1950 {
1951     dVAR;
1952 
1953     PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
1954 
1955     return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
1956 }
1957 
1958 bool
1959 Perl__is_utf8_mark(pTHX_ const U8 *p)
1960 {
1961     dVAR;
1962 
1963     PERL_ARGS_ASSERT__IS_UTF8_MARK;
1964 
1965     return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
1966 }
1967 
1968 /*
1969 =for apidoc to_utf8_case
1970 
1971 C<p> contains the pointer to the UTF-8 string encoding
1972 the character that is being converted.  This routine assumes that the character
1973 at C<p> is well-formed.
1974 
1975 C<ustrp> is a pointer to the character buffer to put the
1976 conversion result to.  C<lenp> is a pointer to the length
1977 of the result.
1978 
1979 C<swashp> is a pointer to the swash to use.
1980 
1981 Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
1982 and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  C<special> (usually,
1983 but not always, a multicharacter mapping), is tried first.
1984 
1985 C<special> is a string, normally C<NULL> or C<"">.  C<NULL> means to not use
1986 any special mappings; C<""> means to use the special mappings.  Values other
1987 than these two are treated as the name of the hash containing the special
1988 mappings, like C<"utf8::ToSpecLower">.
1989 
1990 C<normal> is a string like "ToLower" which means the swash
1991 %utf8::ToLower.
1992 
1993 =cut */
1994 
1995 UV
1996 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1997 			SV **swashp, const char *normal, const char *special)
1998 {
1999     dVAR;
2000     STRLEN len = 0;
2001     const UV uv1 = valid_utf8_to_uvchr(p, NULL);
2002 
2003     PERL_ARGS_ASSERT_TO_UTF8_CASE;
2004 
2005     /* Note that swash_fetch() doesn't output warnings for these because it
2006      * assumes we will */
2007     if (uv1 >= UNICODE_SURROGATE_FIRST) {
2008 	if (uv1 <= UNICODE_SURROGATE_LAST) {
2009 	    if (ckWARN_d(WARN_SURROGATE)) {
2010 		const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2011 		Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2012 		    "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2013 	    }
2014 	}
2015 	else if (UNICODE_IS_SUPER(uv1)) {
2016 	    if (ckWARN_d(WARN_NON_UNICODE)) {
2017 		const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2018 		Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2019 		    "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2020 	    }
2021 	}
2022 
2023 	/* Note that non-characters are perfectly legal, so no warning should
2024 	 * be given */
2025     }
2026 
2027     if (!*swashp) /* load on-demand */
2028          *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
2029 
2030     if (special) {
2031          /* It might be "special" (sometimes, but not always,
2032 	  * a multicharacter mapping) */
2033          HV *hv = NULL;
2034 	 SV **svp;
2035 
2036 	 /* If passed in the specials name, use that; otherwise use any
2037 	  * given in the swash */
2038          if (*special != '\0') {
2039             hv = get_hv(special, 0);
2040         }
2041         else {
2042             svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2043             if (svp) {
2044                 hv = MUTABLE_HV(SvRV(*svp));
2045             }
2046         }
2047 
2048 	 if (hv
2049              && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
2050              && (*svp))
2051          {
2052 	     const char *s;
2053 
2054 	      s = SvPV_const(*svp, len);
2055 	      if (len == 1)
2056                   /* EIGHTBIT */
2057 		   len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2058 	      else {
2059 		   Copy(s, ustrp, len, U8);
2060 	      }
2061 	 }
2062     }
2063 
2064     if (!len && *swashp) {
2065 	const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
2066 
2067 	 if (uv2) {
2068 	      /* It was "normal" (a single character mapping). */
2069 	      len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2070 	 }
2071     }
2072 
2073     if (len) {
2074         if (lenp) {
2075             *lenp = len;
2076         }
2077         return valid_utf8_to_uvchr(ustrp, 0);
2078     }
2079 
2080     /* Here, there was no mapping defined, which means that the code point maps
2081      * to itself.  Return the inputs */
2082     len = UTF8SKIP(p);
2083     if (p != ustrp) {   /* Don't copy onto itself */
2084         Copy(p, ustrp, len, U8);
2085     }
2086 
2087     if (lenp)
2088 	 *lenp = len;
2089 
2090     return uv1;
2091 
2092 }
2093 
2094 STATIC UV
2095 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2096 {
2097     /* This is called when changing the case of a utf8-encoded character above
2098      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
2099      * result contains a character that crosses the 255/256 boundary, disallow
2100      * the change, and return the original code point.  See L<perlfunc/lc> for
2101      * why;
2102      *
2103      * p	points to the original string whose case was changed; assumed
2104      *          by this routine to be well-formed
2105      * result	the code point of the first character in the changed-case string
2106      * ustrp	points to the changed-case string (<result> represents its first char)
2107      * lenp	points to the length of <ustrp> */
2108 
2109     UV original;    /* To store the first code point of <p> */
2110 
2111     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2112 
2113     assert(UTF8_IS_ABOVE_LATIN1(*p));
2114 
2115     /* We know immediately if the first character in the string crosses the
2116      * boundary, so can skip */
2117     if (result > 255) {
2118 
2119 	/* Look at every character in the result; if any cross the
2120 	* boundary, the whole thing is disallowed */
2121 	U8* s = ustrp + UTF8SKIP(ustrp);
2122 	U8* e = ustrp + *lenp;
2123 	while (s < e) {
2124 	    if (! UTF8_IS_ABOVE_LATIN1(*s)) {
2125 		goto bad_crossing;
2126 	    }
2127 	    s += UTF8SKIP(s);
2128 	}
2129 
2130 	/* Here, no characters crossed, result is ok as-is */
2131 	return result;
2132     }
2133 
2134 bad_crossing:
2135 
2136     /* Failed, have to return the original */
2137     original = valid_utf8_to_uvchr(p, lenp);
2138     Copy(p, ustrp, *lenp, char);
2139     return original;
2140 }
2141 
2142 /*
2143 =for apidoc to_utf8_upper
2144 
2145 Instead use L</toUPPER_utf8>.
2146 
2147 =cut */
2148 
2149 /* Not currently externally documented, and subject to change:
2150  * <flags> is set iff iff the rules from the current underlying locale are to
2151  *         be used. */
2152 
2153 UV
2154 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2155 {
2156     dVAR;
2157 
2158     UV result;
2159 
2160     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
2161 
2162     if (flags && IN_UTF8_CTYPE_LOCALE) {
2163         flags = FALSE;
2164     }
2165 
2166     if (UTF8_IS_INVARIANT(*p)) {
2167 	if (flags) {
2168 	    result = toUPPER_LC(*p);
2169 	}
2170 	else {
2171 	    return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
2172 	}
2173     }
2174     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2175 	if (flags) {
2176             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
2177 	    result = toUPPER_LC(c);
2178 	}
2179 	else {
2180 	    return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2181 				          ustrp, lenp, 'S');
2182 	}
2183     }
2184     else {  /* utf8, ord above 255 */
2185 	result = CALL_UPPER_CASE(p, ustrp, lenp);
2186 
2187 	if (flags) {
2188 	    result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2189 	}
2190 	return result;
2191     }
2192 
2193     /* Here, used locale rules.  Convert back to utf8 */
2194     if (UTF8_IS_INVARIANT(result)) {
2195 	*ustrp = (U8) result;
2196 	*lenp = 1;
2197     }
2198     else {
2199 	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2200 	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2201 	*lenp = 2;
2202     }
2203 
2204     return result;
2205 }
2206 
2207 /*
2208 =for apidoc to_utf8_title
2209 
2210 Instead use L</toTITLE_utf8>.
2211 
2212 =cut */
2213 
2214 /* Not currently externally documented, and subject to change:
2215  * <flags> is set iff the rules from the current underlying locale are to be
2216  *         used.  Since titlecase is not defined in POSIX, for other than a
2217  *         UTF-8 locale, uppercase is used instead for code points < 256.
2218  */
2219 
2220 UV
2221 Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2222 {
2223     dVAR;
2224 
2225     UV result;
2226 
2227     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2228 
2229     if (flags && IN_UTF8_CTYPE_LOCALE) {
2230         flags = FALSE;
2231     }
2232 
2233     if (UTF8_IS_INVARIANT(*p)) {
2234 	if (flags) {
2235 	    result = toUPPER_LC(*p);
2236 	}
2237 	else {
2238 	    return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2239 	}
2240     }
2241     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2242 	if (flags) {
2243             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
2244 	    result = toUPPER_LC(c);
2245 	}
2246 	else {
2247 	    return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2248 				          ustrp, lenp, 's');
2249 	}
2250     }
2251     else {  /* utf8, ord above 255 */
2252 	result = CALL_TITLE_CASE(p, ustrp, lenp);
2253 
2254 	if (flags) {
2255 	    result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2256 	}
2257 	return result;
2258     }
2259 
2260     /* Here, used locale rules.  Convert back to utf8 */
2261     if (UTF8_IS_INVARIANT(result)) {
2262 	*ustrp = (U8) result;
2263 	*lenp = 1;
2264     }
2265     else {
2266 	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2267 	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2268 	*lenp = 2;
2269     }
2270 
2271     return result;
2272 }
2273 
2274 /*
2275 =for apidoc to_utf8_lower
2276 
2277 Instead use L</toLOWER_utf8>.
2278 
2279 =cut */
2280 
2281 /* Not currently externally documented, and subject to change:
2282  * <flags> is set iff iff the rules from the current underlying locale are to
2283  *         be used.
2284  */
2285 
2286 UV
2287 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2288 {
2289     UV result;
2290 
2291     dVAR;
2292 
2293     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2294 
2295     if (flags && IN_UTF8_CTYPE_LOCALE) {
2296         flags = FALSE;
2297     }
2298 
2299     if (UTF8_IS_INVARIANT(*p)) {
2300 	if (flags) {
2301 	    result = toLOWER_LC(*p);
2302 	}
2303 	else {
2304 	    return to_lower_latin1(*p, ustrp, lenp);
2305 	}
2306     }
2307     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2308 	if (flags) {
2309             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
2310 	    result = toLOWER_LC(c);
2311 	}
2312 	else {
2313 	    return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2314 		                   ustrp, lenp);
2315 	}
2316     }
2317     else {  /* utf8, ord above 255 */
2318 	result = CALL_LOWER_CASE(p, ustrp, lenp);
2319 
2320 	if (flags) {
2321 	    result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2322 	}
2323 
2324 	return result;
2325     }
2326 
2327     /* Here, used locale rules.  Convert back to utf8 */
2328     if (UTF8_IS_INVARIANT(result)) {
2329 	*ustrp = (U8) result;
2330 	*lenp = 1;
2331     }
2332     else {
2333 	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2334 	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2335 	*lenp = 2;
2336     }
2337 
2338     return result;
2339 }
2340 
2341 /*
2342 =for apidoc to_utf8_fold
2343 
2344 Instead use L</toFOLD_utf8>.
2345 
2346 =cut */
2347 
2348 /* Not currently externally documented, and subject to change,
2349  * in <flags>
2350  *	bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2351  *	                      locale are to be used.
2352  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
2353  *			      otherwise simple folds
2354  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2355  *			      prohibited
2356  */
2357 
2358 UV
2359 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
2360 {
2361     dVAR;
2362 
2363     UV result;
2364 
2365     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2366 
2367     /* These are mutually exclusive */
2368     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2369 
2370     assert(p != ustrp); /* Otherwise overwrites */
2371 
2372     if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) {
2373         flags &= ~FOLD_FLAGS_LOCALE;
2374     }
2375 
2376     if (UTF8_IS_INVARIANT(*p)) {
2377 	if (flags & FOLD_FLAGS_LOCALE) {
2378 	    result = toFOLD_LC(*p);
2379 	}
2380 	else {
2381 	    return _to_fold_latin1(*p, ustrp, lenp,
2382                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2383 	}
2384     }
2385     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2386 	if (flags & FOLD_FLAGS_LOCALE) {
2387             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
2388 	    result = toFOLD_LC(c);
2389 	}
2390 	else {
2391 	    return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2392                             ustrp, lenp,
2393                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2394 	}
2395     }
2396     else {  /* utf8, ord above 255 */
2397 	result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
2398 
2399 	if (flags & FOLD_FLAGS_LOCALE) {
2400 
2401             /* Special case these two characters, as what normally gets
2402              * returned under locale doesn't work */
2403             if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
2404                 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
2405                           sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
2406             {
2407                 goto return_long_s;
2408             }
2409             else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
2410                 && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
2411                           sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
2412             {
2413                 goto return_ligature_st;
2414             }
2415 	    return check_locale_boundary_crossing(p, result, ustrp, lenp);
2416 	}
2417 	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2418 	    return result;
2419 	}
2420 	else {
2421 	    /* This is called when changing the case of a utf8-encoded
2422              * character above the ASCII range, and the result should not
2423              * contain an ASCII character. */
2424 
2425 	    UV original;    /* To store the first code point of <p> */
2426 
2427 	    /* Look at every character in the result; if any cross the
2428 	    * boundary, the whole thing is disallowed */
2429 	    U8* s = ustrp;
2430 	    U8* e = ustrp + *lenp;
2431 	    while (s < e) {
2432 		if (isASCII(*s)) {
2433 		    /* Crossed, have to return the original */
2434 		    original = valid_utf8_to_uvchr(p, lenp);
2435 
2436                     /* But in these instances, there is an alternative we can
2437                      * return that is valid */
2438                     if (original == LATIN_CAPITAL_LETTER_SHARP_S
2439                         || original == LATIN_SMALL_LETTER_SHARP_S)
2440                     {
2441                         goto return_long_s;
2442                     }
2443                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2444                         goto return_ligature_st;
2445                     }
2446 		    Copy(p, ustrp, *lenp, char);
2447 		    return original;
2448 		}
2449 		s += UTF8SKIP(s);
2450 	    }
2451 
2452 	    /* Here, no characters crossed, result is ok as-is */
2453 	    return result;
2454 	}
2455     }
2456 
2457     /* Here, used locale rules.  Convert back to utf8 */
2458     if (UTF8_IS_INVARIANT(result)) {
2459 	*ustrp = (U8) result;
2460 	*lenp = 1;
2461     }
2462     else {
2463 	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2464 	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2465 	*lenp = 2;
2466     }
2467 
2468     return result;
2469 
2470   return_long_s:
2471     /* Certain folds to 'ss' are prohibited by the options, but they do allow
2472      * folds to a string of two of these characters.  By returning this
2473      * instead, then, e.g.,
2474      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2475      * works. */
2476 
2477     *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2478     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2479         ustrp, *lenp, U8);
2480     return LATIN_SMALL_LETTER_LONG_S;
2481 
2482   return_ligature_st:
2483     /* Two folds to 'st' are prohibited by the options; instead we pick one and
2484      * have the other one fold to it */
2485 
2486     *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2487     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2488     return LATIN_SMALL_LIGATURE_ST;
2489 }
2490 
2491 /* Note:
2492  * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2493  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2494  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2495  */
2496 
2497 SV*
2498 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2499 {
2500     PERL_ARGS_ASSERT_SWASH_INIT;
2501 
2502     /* Returns a copy of a swash initiated by the called function.  This is the
2503      * public interface, and returning a copy prevents others from doing
2504      * mischief on the original */
2505 
2506     return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
2507 }
2508 
2509 SV*
2510 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
2511 {
2512 
2513     /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
2514      * use the following define */
2515 
2516 #define CORE_SWASH_INIT_RETURN(x)   \
2517     PL_curpm= old_PL_curpm;         \
2518     return x
2519 
2520     /* Initialize and return a swash, creating it if necessary.  It does this
2521      * by calling utf8_heavy.pl in the general case.  The returned value may be
2522      * the swash's inversion list instead if the input parameters allow it.
2523      * Which is returned should be immaterial to callers, as the only
2524      * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2525      * and swash_to_invlist() handle both these transparently.
2526      *
2527      * This interface should only be used by functions that won't destroy or
2528      * adversely change the swash, as doing so affects all other uses of the
2529      * swash in the program; the general public should use 'Perl_swash_init'
2530      * instead.
2531      *
2532      * pkg  is the name of the package that <name> should be in.
2533      * name is the name of the swash to find.  Typically it is a Unicode
2534      *	    property name, including user-defined ones
2535      * listsv is a string to initialize the swash with.  It must be of the form
2536      *	    documented as the subroutine return value in
2537      *	    L<perlunicode/User-Defined Character Properties>
2538      * minbits is the number of bits required to represent each data element.
2539      *	    It is '1' for binary properties.
2540      * none I (khw) do not understand this one, but it is used only in tr///.
2541      * invlist is an inversion list to initialize the swash with (or NULL)
2542      * flags_p if non-NULL is the address of various input and output flag bits
2543      *      to the routine, as follows:  ('I' means is input to the routine;
2544      *      'O' means output from the routine.  Only flags marked O are
2545      *      meaningful on return.)
2546      *  _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2547      *      came from a user-defined property.  (I O)
2548      *  _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2549      *      when the swash cannot be located, to simply return NULL. (I)
2550      *  _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2551      *      return of an inversion list instead of a swash hash if this routine
2552      *      thinks that would result in faster execution of swash_fetch() later
2553      *      on. (I)
2554      *
2555      * Thus there are three possible inputs to find the swash: <name>,
2556      * <listsv>, and <invlist>.  At least one must be specified.  The result
2557      * will be the union of the specified ones, although <listsv>'s various
2558      * actions can intersect, etc. what <name> gives.  To avoid going out to
2559      * disk at all, <invlist> should specify completely what the swash should
2560      * have, and <listsv> should be &PL_sv_undef and <name> should be "".
2561      *
2562      * <invlist> is only valid for binary properties */
2563 
2564     dVAR;
2565     PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
2566 
2567     SV* retval = &PL_sv_undef;
2568     HV* swash_hv = NULL;
2569     const int invlist_swash_boundary =
2570         (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2571         ? 512    /* Based on some benchmarking, but not extensive, see commit
2572                     message */
2573         : -1;   /* Never return just an inversion list */
2574 
2575     assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2576     assert(! invlist || minbits == 1);
2577 
2578     PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
2579                        that triggered the swash init and the swash init perl logic itself.
2580                        See perl #122747 */
2581 
2582     /* If data was passed in to go out to utf8_heavy to find the swash of, do
2583      * so */
2584     if (listsv != &PL_sv_undef || strNE(name, "")) {
2585 	dSP;
2586 	const size_t pkg_len = strlen(pkg);
2587 	const size_t name_len = strlen(name);
2588 	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2589 	SV* errsv_save;
2590 	GV *method;
2591 
2592 	PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2593 
2594 	PUSHSTACKi(PERLSI_MAGIC);
2595 	ENTER;
2596 	SAVEHINTS();
2597 	save_re_context();
2598 	/* We might get here via a subroutine signature which uses a utf8
2599 	 * parameter name, at which point PL_subname will have been set
2600 	 * but not yet used. */
2601 	save_item(PL_subname);
2602 	if (PL_parser && PL_parser->error_count)
2603 	    SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2604 	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
2605 	if (!method) {	/* demand load utf8 */
2606 	    ENTER;
2607 	    if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2608 	    GvSV(PL_errgv) = NULL;
2609 	    /* It is assumed that callers of this routine are not passing in
2610 	     * any user derived data.  */
2611 	    /* Need to do this after save_re_context() as it will set
2612 	     * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2613 	     * in Perl_magic_get).  Even line to create errsv_save can turn on
2614 	     * PL_tainted.  */
2615 #ifndef NO_TAINT_SUPPORT
2616 	    SAVEBOOL(TAINT_get);
2617 	    TAINT_NOT;
2618 #endif
2619 	    Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2620 			     NULL);
2621 	    {
2622 		/* Not ERRSV, as there is no need to vivify a scalar we are
2623 		   about to discard. */
2624 		SV * const errsv = GvSV(PL_errgv);
2625 		if (!SvTRUE(errsv)) {
2626 		    GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2627 		    SvREFCNT_dec(errsv);
2628 		}
2629 	    }
2630 	    LEAVE;
2631 	}
2632 	SPAGAIN;
2633 	PUSHMARK(SP);
2634 	EXTEND(SP,5);
2635 	mPUSHp(pkg, pkg_len);
2636 	mPUSHp(name, name_len);
2637 	PUSHs(listsv);
2638 	mPUSHi(minbits);
2639 	mPUSHi(none);
2640 	PUTBACK;
2641 	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2642 	GvSV(PL_errgv) = NULL;
2643 	/* If we already have a pointer to the method, no need to use
2644 	 * call_method() to repeat the lookup.  */
2645 	if (method
2646             ? call_sv(MUTABLE_SV(method), G_SCALAR)
2647 	    : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2648 	{
2649 	    retval = *PL_stack_sp--;
2650 	    SvREFCNT_inc(retval);
2651 	}
2652 	{
2653 	    /* Not ERRSV.  See above. */
2654 	    SV * const errsv = GvSV(PL_errgv);
2655 	    if (!SvTRUE(errsv)) {
2656 		GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2657 		SvREFCNT_dec(errsv);
2658 	    }
2659 	}
2660 	LEAVE;
2661 	POPSTACK;
2662 	if (IN_PERL_COMPILETIME) {
2663 	    CopHINTS_set(PL_curcop, PL_hints);
2664 	}
2665 	if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2666 	    if (SvPOK(retval))
2667 
2668 		/* If caller wants to handle missing properties, let them */
2669 		if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2670                     CORE_SWASH_INIT_RETURN(NULL);
2671 		}
2672 		Perl_croak(aTHX_
2673 			   "Can't find Unicode property definition \"%"SVf"\"",
2674 			   SVfARG(retval));
2675 	    Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
2676 	}
2677     } /* End of calling the module to find the swash */
2678 
2679     /* If this operation fetched a swash, and we will need it later, get it */
2680     if (retval != &PL_sv_undef
2681         && (minbits == 1 || (flags_p
2682                             && ! (*flags_p
2683                                   & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
2684     {
2685         swash_hv = MUTABLE_HV(SvRV(retval));
2686 
2687         /* If we don't already know that there is a user-defined component to
2688          * this swash, and the user has indicated they wish to know if there is
2689          * one (by passing <flags_p>), find out */
2690         if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
2691             SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
2692             if (user_defined && SvUV(*user_defined)) {
2693                 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
2694             }
2695         }
2696     }
2697 
2698     /* Make sure there is an inversion list for binary properties */
2699     if (minbits == 1) {
2700 	SV** swash_invlistsvp = NULL;
2701 	SV* swash_invlist = NULL;
2702 	bool invlist_in_swash_is_valid = FALSE;
2703 	bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
2704 					    an unclaimed reference count */
2705 
2706         /* If this operation fetched a swash, get its already existing
2707          * inversion list, or create one for it */
2708 
2709         if (swash_hv) {
2710 	    swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
2711 	    if (swash_invlistsvp) {
2712 		swash_invlist = *swash_invlistsvp;
2713 		invlist_in_swash_is_valid = TRUE;
2714 	    }
2715 	    else {
2716 		swash_invlist = _swash_to_invlist(retval);
2717 		swash_invlist_unclaimed = TRUE;
2718 	    }
2719 	}
2720 
2721 	/* If an inversion list was passed in, have to include it */
2722 	if (invlist) {
2723 
2724             /* Any fetched swash will by now have an inversion list in it;
2725              * otherwise <swash_invlist>  will be NULL, indicating that we
2726              * didn't fetch a swash */
2727 	    if (swash_invlist) {
2728 
2729 		/* Add the passed-in inversion list, which invalidates the one
2730 		 * already stored in the swash */
2731 		invlist_in_swash_is_valid = FALSE;
2732 		_invlist_union(invlist, swash_invlist, &swash_invlist);
2733 	    }
2734 	    else {
2735 
2736                 /* Here, there is no swash already.  Set up a minimal one, if
2737                  * we are going to return a swash */
2738                 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
2739                     swash_hv = newHV();
2740                     retval = newRV_noinc(MUTABLE_SV(swash_hv));
2741                 }
2742 		swash_invlist = invlist;
2743 	    }
2744 	}
2745 
2746         /* Here, we have computed the union of all the passed-in data.  It may
2747          * be that there was an inversion list in the swash which didn't get
2748          * touched; otherwise save the computed one */
2749 	if (! invlist_in_swash_is_valid
2750             && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
2751         {
2752 	    if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
2753             {
2754 		Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2755 	    }
2756 	    /* We just stole a reference count. */
2757 	    if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
2758 	    else SvREFCNT_inc_simple_void_NN(swash_invlist);
2759 	}
2760 
2761         SvREADONLY_on(swash_invlist);
2762 
2763         /* Use the inversion list stand-alone if small enough */
2764         if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
2765 	    SvREFCNT_dec(retval);
2766 	    if (!swash_invlist_unclaimed)
2767 		SvREFCNT_inc_simple_void_NN(swash_invlist);
2768             retval = newRV_noinc(swash_invlist);
2769         }
2770     }
2771 
2772     CORE_SWASH_INIT_RETURN(retval);
2773 #undef CORE_SWASH_INIT_RETURN
2774 }
2775 
2776 
2777 /* This API is wrong for special case conversions since we may need to
2778  * return several Unicode characters for a single Unicode character
2779  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
2780  * the lower-level routine, and it is similarly broken for returning
2781  * multiple values.  --jhi
2782  * For those, you should use to_utf8_case() instead */
2783 /* Now SWASHGET is recasted into S_swatch_get in this file. */
2784 
2785 /* Note:
2786  * Returns the value of property/mapping C<swash> for the first character
2787  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
2788  * assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr>
2789  * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
2790  *
2791  * A "swash" is a hash which contains initially the keys/values set up by
2792  * SWASHNEW.  The purpose is to be able to completely represent a Unicode
2793  * property for all possible code points.  Things are stored in a compact form
2794  * (see utf8_heavy.pl) so that calculation is required to find the actual
2795  * property value for a given code point.  As code points are looked up, new
2796  * key/value pairs are added to the hash, so that the calculation doesn't have
2797  * to ever be re-done.  Further, each calculation is done, not just for the
2798  * desired one, but for a whole block of code points adjacent to that one.
2799  * For binary properties on ASCII machines, the block is usually for 64 code
2800  * points, starting with a code point evenly divisible by 64.  Thus if the
2801  * property value for code point 257 is requested, the code goes out and
2802  * calculates the property values for all 64 code points between 256 and 319,
2803  * and stores these as a single 64-bit long bit vector, called a "swatch",
2804  * under the key for code point 256.  The key is the UTF-8 encoding for code
2805  * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
2806  * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
2807  * for code point 258 is then requested, this code realizes that it would be
2808  * stored under the key for 256, and would find that value and extract the
2809  * relevant bit, offset from 256.
2810  *
2811  * Non-binary properties are stored in as many bits as necessary to represent
2812  * their values (32 currently, though the code is more general than that), not
2813  * as single bits, but the principal is the same: the value for each key is a
2814  * vector that encompasses the property values for all code points whose UTF-8
2815  * representations are represented by the key.  That is, for all code points
2816  * whose UTF-8 representations are length N bytes, and the key is the first N-1
2817  * bytes of that.
2818  */
2819 UV
2820 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
2821 {
2822     dVAR;
2823     HV *const hv = MUTABLE_HV(SvRV(swash));
2824     U32 klen;
2825     U32 off;
2826     STRLEN slen = 0;
2827     STRLEN needents;
2828     const U8 *tmps = NULL;
2829     U32 bit;
2830     SV *swatch;
2831     const U8 c = *ptr;
2832 
2833     PERL_ARGS_ASSERT_SWASH_FETCH;
2834 
2835     /* If it really isn't a hash, it isn't really swash; must be an inversion
2836      * list */
2837     if (SvTYPE(hv) != SVt_PVHV) {
2838         return _invlist_contains_cp((SV*)hv,
2839                                     (do_utf8)
2840                                      ? valid_utf8_to_uvchr(ptr, NULL)
2841                                      : c);
2842     }
2843 
2844     /* We store the values in a "swatch" which is a vec() value in a swash
2845      * hash.  Code points 0-255 are a single vec() stored with key length
2846      * (klen) 0.  All other code points have a UTF-8 representation
2847      * 0xAA..0xYY,0xZZ.  A vec() is constructed containing all of them which
2848      * share 0xAA..0xYY, which is the key in the hash to that vec.  So the key
2849      * length for them is the length of the encoded char - 1.  ptr[klen] is the
2850      * final byte in the sequence representing the character */
2851     if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
2852         klen = 0;
2853 	needents = 256;
2854         off = c;
2855     }
2856     else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2857         klen = 0;
2858 	needents = 256;
2859         off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
2860     }
2861     else {
2862         klen = UTF8SKIP(ptr) - 1;
2863 
2864         /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values.  The offset into
2865          * the vec is the final byte in the sequence.  (In EBCDIC this is
2866          * converted to I8 to get consecutive values.)  To help you visualize
2867          * all this:
2868          *                       Straight 1047   After final byte
2869          *             UTF-8      UTF-EBCDIC     I8 transform
2870          *  U+0400:  \xD0\x80    \xB8\x41\x41    \xB8\x41\xA0
2871          *  U+0401:  \xD0\x81    \xB8\x41\x42    \xB8\x41\xA1
2872          *    ...
2873          *  U+0409:  \xD0\x89    \xB8\x41\x4A    \xB8\x41\xA9
2874          *  U+040A:  \xD0\x8A    \xB8\x41\x51    \xB8\x41\xAA
2875          *    ...
2876          *  U+0412:  \xD0\x92    \xB8\x41\x59    \xB8\x41\xB2
2877          *  U+0413:  \xD0\x93    \xB8\x41\x62    \xB8\x41\xB3
2878          *    ...
2879          *  U+041B:  \xD0\x9B    \xB8\x41\x6A    \xB8\x41\xBB
2880          *  U+041C:  \xD0\x9C    \xB8\x41\x70    \xB8\x41\xBC
2881          *    ...
2882          *  U+041F:  \xD0\x9F    \xB8\x41\x73    \xB8\x41\xBF
2883          *  U+0420:  \xD0\xA0    \xB8\x42\x41    \xB8\x42\x41
2884          *
2885          * (There are no discontinuities in the elided (...) entries.)
2886          * The UTF-8 key for these 33 code points is '\xD0' (which also is the
2887          * key for the next 31, up through U+043F, whose UTF-8 final byte is
2888          * \xBF).  Thus in UTF-8, each key is for a vec() for 64 code points.
2889          * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
2890          * index into the vec() swatch (after subtracting 0x80, which we
2891          * actually do with an '&').
2892          * In UTF-EBCDIC, each key is for a 32 code point vec().  The first 32
2893          * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
2894          * dicontinuities which go away by transforming it into I8, and we
2895          * effectively subtract 0xA0 to get the index. */
2896 	needents = (1 << UTF_ACCUMULATION_SHIFT);
2897 	off      = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
2898     }
2899 
2900     /*
2901      * This single-entry cache saves about 1/3 of the utf8 overhead in test
2902      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
2903      * it's nothing to sniff at.)  Pity we usually come through at least
2904      * two function calls to get here...
2905      *
2906      * NB: this code assumes that swatches are never modified, once generated!
2907      */
2908 
2909     if (hv   == PL_last_swash_hv &&
2910 	klen == PL_last_swash_klen &&
2911 	(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
2912     {
2913 	tmps = PL_last_swash_tmps;
2914 	slen = PL_last_swash_slen;
2915     }
2916     else {
2917 	/* Try our second-level swatch cache, kept in a hash. */
2918 	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
2919 
2920 	/* If not cached, generate it via swatch_get */
2921 	if (!svp || !SvPOK(*svp)
2922 		 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
2923         {
2924             if (klen) {
2925                 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
2926                 swatch = swatch_get(swash,
2927                                     code_point & ~((UV)needents - 1),
2928 				    needents);
2929             }
2930             else {  /* For the first 256 code points, the swatch has a key of
2931                        length 0 */
2932                 swatch = swatch_get(swash, 0, needents);
2933             }
2934 
2935 	    if (IN_PERL_COMPILETIME)
2936 		CopHINTS_set(PL_curcop, PL_hints);
2937 
2938 	    svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
2939 
2940 	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
2941 		     || (slen << 3) < needents)
2942 		Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
2943 			   "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
2944 			   svp, tmps, (UV)slen, (UV)needents);
2945 	}
2946 
2947 	PL_last_swash_hv = hv;
2948 	assert(klen <= sizeof(PL_last_swash_key));
2949 	PL_last_swash_klen = (U8)klen;
2950 	/* FIXME change interpvar.h?  */
2951 	PL_last_swash_tmps = (U8 *) tmps;
2952 	PL_last_swash_slen = slen;
2953 	if (klen)
2954 	    Copy(ptr, PL_last_swash_key, klen, U8);
2955     }
2956 
2957     switch ((int)((slen << 3) / needents)) {
2958     case 1:
2959 	bit = 1 << (off & 7);
2960 	off >>= 3;
2961 	return (tmps[off] & bit) != 0;
2962     case 8:
2963 	return tmps[off];
2964     case 16:
2965 	off <<= 1;
2966 	return (tmps[off] << 8) + tmps[off + 1] ;
2967     case 32:
2968 	off <<= 2;
2969 	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
2970     }
2971     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
2972 	       "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
2973     NORETURN_FUNCTION_END;
2974 }
2975 
2976 /* Read a single line of the main body of the swash input text.  These are of
2977  * the form:
2978  * 0053	0056	0073
2979  * where each number is hex.  The first two numbers form the minimum and
2980  * maximum of a range, and the third is the value associated with the range.
2981  * Not all swashes should have a third number
2982  *
2983  * On input: l	  points to the beginning of the line to be examined; it points
2984  *		  to somewhere in the string of the whole input text, and is
2985  *		  terminated by a \n or the null string terminator.
2986  *	     lend   points to the null terminator of that string
2987  *	     wants_value    is non-zero if the swash expects a third number
2988  *	     typestr is the name of the swash's mapping, like 'ToLower'
2989  * On output: *min, *max, and *val are set to the values read from the line.
2990  *	      returns a pointer just beyond the line examined.  If there was no
2991  *	      valid min number on the line, returns lend+1
2992  */
2993 
2994 STATIC U8*
2995 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
2996 			     const bool wants_value, const U8* const typestr)
2997 {
2998     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
2999     STRLEN numlen;	    /* Length of the number */
3000     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3001 		| PERL_SCAN_DISALLOW_PREFIX
3002 		| PERL_SCAN_SILENT_NON_PORTABLE;
3003 
3004     /* nl points to the next \n in the scan */
3005     U8* const nl = (U8*)memchr(l, '\n', lend - l);
3006 
3007     /* Get the first number on the line: the range minimum */
3008     numlen = lend - l;
3009     *min = grok_hex((char *)l, &numlen, &flags, NULL);
3010     if (numlen)	    /* If found a hex number, position past it */
3011 	l += numlen;
3012     else if (nl) {	    /* Else, go handle next line, if any */
3013 	return nl + 1;	/* 1 is length of "\n" */
3014     }
3015     else {		/* Else, no next line */
3016 	return lend + 1;	/* to LIST's end at which \n is not found */
3017     }
3018 
3019     /* The max range value follows, separated by a BLANK */
3020     if (isBLANK(*l)) {
3021 	++l;
3022 	flags = PERL_SCAN_SILENT_ILLDIGIT
3023 		| PERL_SCAN_DISALLOW_PREFIX
3024 		| PERL_SCAN_SILENT_NON_PORTABLE;
3025 	numlen = lend - l;
3026 	*max = grok_hex((char *)l, &numlen, &flags, NULL);
3027 	if (numlen)
3028 	    l += numlen;
3029 	else    /* If no value here, it is a single element range */
3030 	    *max = *min;
3031 
3032 	/* Non-binary tables have a third entry: what the first element of the
3033 	 * range maps to.  The map for those currently read here is in hex */
3034 	if (wants_value) {
3035 	    if (isBLANK(*l)) {
3036 		++l;
3037                 flags = PERL_SCAN_SILENT_ILLDIGIT
3038                     | PERL_SCAN_DISALLOW_PREFIX
3039                     | PERL_SCAN_SILENT_NON_PORTABLE;
3040                 numlen = lend - l;
3041                 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3042                 if (numlen)
3043                     l += numlen;
3044                 else
3045                     *val = 0;
3046 	    }
3047 	    else {
3048 		*val = 0;
3049 		if (typeto) {
3050 		    /* diag_listed_as: To%s: illegal mapping '%s' */
3051 		    Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3052 				     typestr, l);
3053 		}
3054 	    }
3055 	}
3056 	else
3057 	    *val = 0; /* bits == 1, then any val should be ignored */
3058     }
3059     else { /* Nothing following range min, should be single element with no
3060 	      mapping expected */
3061 	*max = *min;
3062 	if (wants_value) {
3063 	    *val = 0;
3064 	    if (typeto) {
3065 		/* diag_listed_as: To%s: illegal mapping '%s' */
3066 		Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3067 	    }
3068 	}
3069 	else
3070 	    *val = 0; /* bits == 1, then val should be ignored */
3071     }
3072 
3073     /* Position to next line if any, or EOF */
3074     if (nl)
3075 	l = nl + 1;
3076     else
3077 	l = lend;
3078 
3079     return l;
3080 }
3081 
3082 /* Note:
3083  * Returns a swatch (a bit vector string) for a code point sequence
3084  * that starts from the value C<start> and comprises the number C<span>.
3085  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3086  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3087  */
3088 STATIC SV*
3089 S_swatch_get(pTHX_ SV* swash, UV start, UV span)
3090 {
3091     SV *swatch;
3092     U8 *l, *lend, *x, *xend, *s, *send;
3093     STRLEN lcur, xcur, scur;
3094     HV *const hv = MUTABLE_HV(SvRV(swash));
3095     SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
3096 
3097     SV** listsvp = NULL; /* The string containing the main body of the table */
3098     SV** extssvp = NULL;
3099     SV** invert_it_svp = NULL;
3100     U8* typestr = NULL;
3101     STRLEN bits;
3102     STRLEN octets; /* if bits == 1, then octets == 0 */
3103     UV  none;
3104     UV  end = start + span;
3105 
3106     if (invlistsvp == NULL) {
3107         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3108         SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3109         SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3110         extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3111         listsvp = hv_fetchs(hv, "LIST", FALSE);
3112         invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3113 
3114 	bits  = SvUV(*bitssvp);
3115 	none  = SvUV(*nonesvp);
3116 	typestr = (U8*)SvPV_nolen(*typesvp);
3117     }
3118     else {
3119 	bits = 1;
3120 	none = 0;
3121     }
3122     octets = bits >> 3; /* if bits == 1, then octets == 0 */
3123 
3124     PERL_ARGS_ASSERT_SWATCH_GET;
3125 
3126     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
3127 	Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
3128 						 (UV)bits);
3129     }
3130 
3131     /* If overflowed, use the max possible */
3132     if (end < start) {
3133 	end = UV_MAX;
3134 	span = end - start;
3135     }
3136 
3137     /* create and initialize $swatch */
3138     scur   = octets ? (span * octets) : (span + 7) / 8;
3139     swatch = newSV(scur);
3140     SvPOK_on(swatch);
3141     s = (U8*)SvPVX(swatch);
3142     if (octets && none) {
3143 	const U8* const e = s + scur;
3144 	while (s < e) {
3145 	    if (bits == 8)
3146 		*s++ = (U8)(none & 0xff);
3147 	    else if (bits == 16) {
3148 		*s++ = (U8)((none >>  8) & 0xff);
3149 		*s++ = (U8)( none        & 0xff);
3150 	    }
3151 	    else if (bits == 32) {
3152 		*s++ = (U8)((none >> 24) & 0xff);
3153 		*s++ = (U8)((none >> 16) & 0xff);
3154 		*s++ = (U8)((none >>  8) & 0xff);
3155 		*s++ = (U8)( none        & 0xff);
3156 	    }
3157 	}
3158 	*s = '\0';
3159     }
3160     else {
3161 	(void)memzero((U8*)s, scur + 1);
3162     }
3163     SvCUR_set(swatch, scur);
3164     s = (U8*)SvPVX(swatch);
3165 
3166     if (invlistsvp) {	/* If has an inversion list set up use that */
3167 	_invlist_populate_swatch(*invlistsvp, start, end, s);
3168         return swatch;
3169     }
3170 
3171     /* read $swash->{LIST} */
3172     l = (U8*)SvPV(*listsvp, lcur);
3173     lend = l + lcur;
3174     while (l < lend) {
3175 	UV min, max, val, upper;
3176 	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3177 					 cBOOL(octets), typestr);
3178 	if (l > lend) {
3179 	    break;
3180 	}
3181 
3182 	/* If looking for something beyond this range, go try the next one */
3183 	if (max < start)
3184 	    continue;
3185 
3186 	/* <end> is generally 1 beyond where we want to set things, but at the
3187 	 * platform's infinity, where we can't go any higher, we want to
3188 	 * include the code point at <end> */
3189         upper = (max < end)
3190                 ? max
3191                 : (max != UV_MAX || end != UV_MAX)
3192                   ? end - 1
3193                   : end;
3194 
3195 	if (octets) {
3196 	    UV key;
3197 	    if (min < start) {
3198 		if (!none || val < none) {
3199 		    val += start - min;
3200 		}
3201 		min = start;
3202 	    }
3203 	    for (key = min; key <= upper; key++) {
3204 		STRLEN offset;
3205 		/* offset must be non-negative (start <= min <= key < end) */
3206 		offset = octets * (key - start);
3207 		if (bits == 8)
3208 		    s[offset] = (U8)(val & 0xff);
3209 		else if (bits == 16) {
3210 		    s[offset    ] = (U8)((val >>  8) & 0xff);
3211 		    s[offset + 1] = (U8)( val        & 0xff);
3212 		}
3213 		else if (bits == 32) {
3214 		    s[offset    ] = (U8)((val >> 24) & 0xff);
3215 		    s[offset + 1] = (U8)((val >> 16) & 0xff);
3216 		    s[offset + 2] = (U8)((val >>  8) & 0xff);
3217 		    s[offset + 3] = (U8)( val        & 0xff);
3218 		}
3219 
3220 		if (!none || val < none)
3221 		    ++val;
3222 	    }
3223 	}
3224 	else { /* bits == 1, then val should be ignored */
3225 	    UV key;
3226 	    if (min < start)
3227 		min = start;
3228 
3229 	    for (key = min; key <= upper; key++) {
3230 		const STRLEN offset = (STRLEN)(key - start);
3231 		s[offset >> 3] |= 1 << (offset & 7);
3232 	    }
3233 	}
3234     } /* while */
3235 
3236     /* Invert if the data says it should be.  Assumes that bits == 1 */
3237     if (invert_it_svp && SvUV(*invert_it_svp)) {
3238 
3239 	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
3240 	 * be 0, and their inversion should also be 0, as we don't succeed any
3241 	 * Unicode property matches for non-Unicode code points */
3242 	if (start <= PERL_UNICODE_MAX) {
3243 
3244 	    /* The code below assumes that we never cross the
3245 	     * Unicode/above-Unicode boundary in a range, as otherwise we would
3246 	     * have to figure out where to stop flipping the bits.  Since this
3247 	     * boundary is divisible by a large power of 2, and swatches comes
3248 	     * in small powers of 2, this should be a valid assumption */
3249 	    assert(start + span - 1 <= PERL_UNICODE_MAX);
3250 
3251 	    send = s + scur;
3252 	    while (s < send) {
3253 		*s = ~(*s);
3254 		s++;
3255 	    }
3256 	}
3257     }
3258 
3259     /* read $swash->{EXTRAS}
3260      * This code also copied to swash_to_invlist() below */
3261     x = (U8*)SvPV(*extssvp, xcur);
3262     xend = x + xcur;
3263     while (x < xend) {
3264 	STRLEN namelen;
3265 	U8 *namestr;
3266 	SV** othersvp;
3267 	HV* otherhv;
3268 	STRLEN otherbits;
3269 	SV **otherbitssvp, *other;
3270 	U8 *s, *o, *nl;
3271 	STRLEN slen, olen;
3272 
3273 	const U8 opc = *x++;
3274 	if (opc == '\n')
3275 	    continue;
3276 
3277 	nl = (U8*)memchr(x, '\n', xend - x);
3278 
3279 	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3280 	    if (nl) {
3281 		x = nl + 1; /* 1 is length of "\n" */
3282 		continue;
3283 	    }
3284 	    else {
3285 		x = xend; /* to EXTRAS' end at which \n is not found */
3286 		break;
3287 	    }
3288 	}
3289 
3290 	namestr = x;
3291 	if (nl) {
3292 	    namelen = nl - namestr;
3293 	    x = nl + 1;
3294 	}
3295 	else {
3296 	    namelen = xend - namestr;
3297 	    x = xend;
3298 	}
3299 
3300 	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3301 	otherhv = MUTABLE_HV(SvRV(*othersvp));
3302 	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3303 	otherbits = (STRLEN)SvUV(*otherbitssvp);
3304 	if (bits < otherbits)
3305 	    Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3306 		       "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3307 
3308 	/* The "other" swatch must be destroyed after. */
3309 	other = swatch_get(*othersvp, start, span);
3310 	o = (U8*)SvPV(other, olen);
3311 
3312 	if (!olen)
3313 	    Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3314 
3315 	s = (U8*)SvPV(swatch, slen);
3316 	if (bits == 1 && otherbits == 1) {
3317 	    if (slen != olen)
3318 		Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3319 			   "mismatch, slen=%"UVuf", olen=%"UVuf,
3320 			   (UV)slen, (UV)olen);
3321 
3322 	    switch (opc) {
3323 	    case '+':
3324 		while (slen--)
3325 		    *s++ |= *o++;
3326 		break;
3327 	    case '!':
3328 		while (slen--)
3329 		    *s++ |= ~*o++;
3330 		break;
3331 	    case '-':
3332 		while (slen--)
3333 		    *s++ &= ~*o++;
3334 		break;
3335 	    case '&':
3336 		while (slen--)
3337 		    *s++ &= *o++;
3338 		break;
3339 	    default:
3340 		break;
3341 	    }
3342 	}
3343 	else {
3344 	    STRLEN otheroctets = otherbits >> 3;
3345 	    STRLEN offset = 0;
3346 	    U8* const send = s + slen;
3347 
3348 	    while (s < send) {
3349 		UV otherval = 0;
3350 
3351 		if (otherbits == 1) {
3352 		    otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3353 		    ++offset;
3354 		}
3355 		else {
3356 		    STRLEN vlen = otheroctets;
3357 		    otherval = *o++;
3358 		    while (--vlen) {
3359 			otherval <<= 8;
3360 			otherval |= *o++;
3361 		    }
3362 		}
3363 
3364 		if (opc == '+' && otherval)
3365 		    NOOP;   /* replace with otherval */
3366 		else if (opc == '!' && !otherval)
3367 		    otherval = 1;
3368 		else if (opc == '-' && otherval)
3369 		    otherval = 0;
3370 		else if (opc == '&' && !otherval)
3371 		    otherval = 0;
3372 		else {
3373 		    s += octets; /* no replacement */
3374 		    continue;
3375 		}
3376 
3377 		if (bits == 8)
3378 		    *s++ = (U8)( otherval & 0xff);
3379 		else if (bits == 16) {
3380 		    *s++ = (U8)((otherval >>  8) & 0xff);
3381 		    *s++ = (U8)( otherval        & 0xff);
3382 		}
3383 		else if (bits == 32) {
3384 		    *s++ = (U8)((otherval >> 24) & 0xff);
3385 		    *s++ = (U8)((otherval >> 16) & 0xff);
3386 		    *s++ = (U8)((otherval >>  8) & 0xff);
3387 		    *s++ = (U8)( otherval        & 0xff);
3388 		}
3389 	    }
3390 	}
3391 	sv_free(other); /* through with it! */
3392     } /* while */
3393     return swatch;
3394 }
3395 
3396 HV*
3397 Perl__swash_inversion_hash(pTHX_ SV* const swash)
3398 {
3399 
3400    /* Subject to change or removal.  For use only in regcomp.c and regexec.c
3401     * Can't be used on a property that is subject to user override, as it
3402     * relies on the value of SPECIALS in the swash which would be set by
3403     * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3404     * for overridden properties
3405     *
3406     * Returns a hash which is the inversion and closure of a swash mapping.
3407     * For example, consider the input lines:
3408     * 004B		006B
3409     * 004C		006C
3410     * 212A		006B
3411     *
3412     * The returned hash would have two keys, the utf8 for 006B and the utf8 for
3413     * 006C.  The value for each key is an array.  For 006C, the array would
3414     * have two elements, the utf8 for itself, and for 004C.  For 006B, there
3415     * would be three elements in its array, the utf8 for 006B, 004B and 212A.
3416     *
3417     * Note that there are no elements in the hash for 004B, 004C, 212A.  The
3418     * keys are only code points that are folded-to, so it isn't a full closure.
3419     *
3420     * Essentially, for any code point, it gives all the code points that map to
3421     * it, or the list of 'froms' for that point.
3422     *
3423     * Currently it ignores any additions or deletions from other swashes,
3424     * looking at just the main body of the swash, and if there are SPECIALS
3425     * in the swash, at that hash
3426     *
3427     * The specials hash can be extra code points, and most likely consists of
3428     * maps from single code points to multiple ones (each expressed as a string
3429     * of utf8 characters).   This function currently returns only 1-1 mappings.
3430     * However consider this possible input in the specials hash:
3431     * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
3432     * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
3433     *
3434     * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3435     * currently handle.  But it also means that FB05 and FB06 are equivalent in
3436     * a 1-1 mapping which we should handle, and this relationship may not be in
3437     * the main table.  Therefore this function examines all the multi-char
3438     * sequences and adds the 1-1 mappings that come out of that.  */
3439 
3440     U8 *l, *lend;
3441     STRLEN lcur;
3442     HV *const hv = MUTABLE_HV(SvRV(swash));
3443 
3444     /* The string containing the main body of the table.  This will have its
3445      * assertion fail if the swash has been converted to its inversion list */
3446     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3447 
3448     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3449     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3450     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3451     /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3452     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3453     const STRLEN bits  = SvUV(*bitssvp);
3454     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3455     const UV     none  = SvUV(*nonesvp);
3456     SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3457 
3458     HV* ret = newHV();
3459 
3460     PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3461 
3462     /* Must have at least 8 bits to get the mappings */
3463     if (bits != 8 && bits != 16 && bits != 32) {
3464 	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3465 						 (UV)bits);
3466     }
3467 
3468     if (specials_p) { /* It might be "special" (sometimes, but not always, a
3469 			mapping to more than one character */
3470 
3471 	/* Construct an inverse mapping hash for the specials */
3472 	HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3473 	HV * specials_inverse = newHV();
3474 	char *char_from; /* the lhs of the map */
3475 	I32 from_len;   /* its byte length */
3476 	char *char_to;  /* the rhs of the map */
3477 	I32 to_len;	/* its byte length */
3478 	SV *sv_to;	/* and in a sv */
3479 	AV* from_list;  /* list of things that map to each 'to' */
3480 
3481 	hv_iterinit(specials_hv);
3482 
3483 	/* The keys are the characters (in utf8) that map to the corresponding
3484 	 * utf8 string value.  Iterate through the list creating the inverse
3485 	 * list. */
3486 	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3487 	    SV** listp;
3488 	    if (! SvPOK(sv_to)) {
3489 		Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3490 			   "unexpectedly is not a string, flags=%lu",
3491 			   (unsigned long)SvFLAGS(sv_to));
3492 	    }
3493 	    /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3494 
3495 	    /* Each key in the inverse list is a mapped-to value, and the key's
3496 	     * hash value is a list of the strings (each in utf8) that map to
3497 	     * it.  Those strings are all one character long */
3498 	    if ((listp = hv_fetch(specials_inverse,
3499 				    SvPVX(sv_to),
3500 				    SvCUR(sv_to), 0)))
3501 	    {
3502 		from_list = (AV*) *listp;
3503 	    }
3504 	    else { /* No entry yet for it: create one */
3505 		from_list = newAV();
3506 		if (! hv_store(specials_inverse,
3507 				SvPVX(sv_to),
3508 				SvCUR(sv_to),
3509 				(SV*) from_list, 0))
3510 		{
3511 		    Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3512 		}
3513 	    }
3514 
3515 	    /* Here have the list associated with this 'to' (perhaps newly
3516 	     * created and empty).  Just add to it.  Note that we ASSUME that
3517 	     * the input is guaranteed to not have duplications, so we don't
3518 	     * check for that.  Duplications just slow down execution time. */
3519 	    av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3520 	}
3521 
3522 	/* Here, 'specials_inverse' contains the inverse mapping.  Go through
3523 	 * it looking for cases like the FB05/FB06 examples above.  There would
3524 	 * be an entry in the hash like
3525 	*	'st' => [ FB05, FB06 ]
3526 	* In this example we will create two lists that get stored in the
3527 	* returned hash, 'ret':
3528 	*	FB05 => [ FB05, FB06 ]
3529 	*	FB06 => [ FB05, FB06 ]
3530 	*
3531 	* Note that there is nothing to do if the array only has one element.
3532 	* (In the normal 1-1 case handled below, we don't have to worry about
3533 	* two lists, as everything gets tied to the single list that is
3534 	* generated for the single character 'to'.  But here, we are omitting
3535 	* that list, ('st' in the example), so must have multiple lists.) */
3536 	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3537 						 &char_to, &to_len)))
3538 	{
3539 	    if (av_tindex(from_list) > 0) {
3540 		SSize_t i;
3541 
3542 		/* We iterate over all combinations of i,j to place each code
3543 		 * point on each list */
3544 		for (i = 0; i <= av_tindex(from_list); i++) {
3545 		    SSize_t j;
3546 		    AV* i_list = newAV();
3547 		    SV** entryp = av_fetch(from_list, i, FALSE);
3548 		    if (entryp == NULL) {
3549 			Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3550 		    }
3551 		    if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3552 			Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3553 		    }
3554 		    if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3555 				   (SV*) i_list, FALSE))
3556 		    {
3557 			Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3558 		    }
3559 
3560 		    /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3561 		    for (j = 0; j <= av_tindex(from_list); j++) {
3562 			entryp = av_fetch(from_list, j, FALSE);
3563 			if (entryp == NULL) {
3564 			    Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3565 			}
3566 
3567 			/* When i==j this adds itself to the list */
3568 			av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3569 					(U8*) SvPVX(*entryp),
3570 					(U8*) SvPVX(*entryp) + SvCUR(*entryp),
3571 					0)));
3572 			/*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3573 		    }
3574 		}
3575 	    }
3576 	}
3577 	SvREFCNT_dec(specials_inverse); /* done with it */
3578     } /* End of specials */
3579 
3580     /* read $swash->{LIST} */
3581     l = (U8*)SvPV(*listsvp, lcur);
3582     lend = l + lcur;
3583 
3584     /* Go through each input line */
3585     while (l < lend) {
3586 	UV min, max, val;
3587 	UV inverse;
3588 	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3589 					 cBOOL(octets), typestr);
3590 	if (l > lend) {
3591 	    break;
3592 	}
3593 
3594 	/* Each element in the range is to be inverted */
3595 	for (inverse = min; inverse <= max; inverse++) {
3596 	    AV* list;
3597 	    SV** listp;
3598 	    IV i;
3599 	    bool found_key = FALSE;
3600 	    bool found_inverse = FALSE;
3601 
3602 	    /* The key is the inverse mapping */
3603 	    char key[UTF8_MAXBYTES+1];
3604 	    char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
3605 	    STRLEN key_len = key_end - key;
3606 
3607 	    /* Get the list for the map */
3608 	    if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
3609 		list = (AV*) *listp;
3610 	    }
3611 	    else { /* No entry yet for it: create one */
3612 		list = newAV();
3613 		if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
3614 		    Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3615 		}
3616 	    }
3617 
3618 	    /* Look through list to see if this inverse mapping already is
3619 	     * listed, or if there is a mapping to itself already */
3620 	    for (i = 0; i <= av_tindex(list); i++) {
3621 		SV** entryp = av_fetch(list, i, FALSE);
3622 		SV* entry;
3623 		if (entryp == NULL) {
3624 		    Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3625 		}
3626 		entry = *entryp;
3627 		/*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
3628 		if (SvUV(entry) == val) {
3629 		    found_key = TRUE;
3630 		}
3631 		if (SvUV(entry) == inverse) {
3632 		    found_inverse = TRUE;
3633 		}
3634 
3635 		/* No need to continue searching if found everything we are
3636 		 * looking for */
3637 		if (found_key && found_inverse) {
3638 		    break;
3639 		}
3640 	    }
3641 
3642 	    /* Make sure there is a mapping to itself on the list */
3643 	    if (! found_key) {
3644 		av_push(list, newSVuv(val));
3645 		/*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/
3646 	    }
3647 
3648 
3649 	    /* Simply add the value to the list */
3650 	    if (! found_inverse) {
3651 		av_push(list, newSVuv(inverse));
3652 		/*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/
3653 	    }
3654 
3655 	    /* swatch_get() increments the value of val for each element in the
3656 	     * range.  That makes more compact tables possible.  You can
3657 	     * express the capitalization, for example, of all consecutive
3658 	     * letters with a single line: 0061\t007A\t0041 This maps 0061 to
3659 	     * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
3660 	     * and it's not documented; it appears to be used only in
3661 	     * implementing tr//; I copied the semantics from swatch_get(), just
3662 	     * in case */
3663 	    if (!none || val < none) {
3664 		++val;
3665 	    }
3666 	}
3667     }
3668 
3669     return ret;
3670 }
3671 
3672 SV*
3673 Perl__swash_to_invlist(pTHX_ SV* const swash)
3674 {
3675 
3676    /* Subject to change or removal.  For use only in one place in regcomp.c.
3677     * Ownership is given to one reference count in the returned SV* */
3678 
3679     U8 *l, *lend;
3680     char *loc;
3681     STRLEN lcur;
3682     HV *const hv = MUTABLE_HV(SvRV(swash));
3683     UV elements = 0;    /* Number of elements in the inversion list */
3684     U8 empty[] = "";
3685     SV** listsvp;
3686     SV** typesvp;
3687     SV** bitssvp;
3688     SV** extssvp;
3689     SV** invert_it_svp;
3690 
3691     U8* typestr;
3692     STRLEN bits;
3693     STRLEN octets; /* if bits == 1, then octets == 0 */
3694     U8 *x, *xend;
3695     STRLEN xcur;
3696 
3697     SV* invlist;
3698 
3699     PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
3700 
3701     /* If not a hash, it must be the swash's inversion list instead */
3702     if (SvTYPE(hv) != SVt_PVHV) {
3703         return SvREFCNT_inc_simple_NN((SV*) hv);
3704     }
3705 
3706     /* The string containing the main body of the table */
3707     listsvp = hv_fetchs(hv, "LIST", FALSE);
3708     typesvp = hv_fetchs(hv, "TYPE", FALSE);
3709     bitssvp = hv_fetchs(hv, "BITS", FALSE);
3710     extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3711     invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3712 
3713     typestr = (U8*)SvPV_nolen(*typesvp);
3714     bits  = SvUV(*bitssvp);
3715     octets = bits >> 3; /* if bits == 1, then octets == 0 */
3716 
3717     /* read $swash->{LIST} */
3718     if (SvPOK(*listsvp)) {
3719 	l = (U8*)SvPV(*listsvp, lcur);
3720     }
3721     else {
3722 	/* LIST legitimately doesn't contain a string during compilation phases
3723 	 * of Perl itself, before the Unicode tables are generated.  In this
3724 	 * case, just fake things up by creating an empty list */
3725 	l = empty;
3726 	lcur = 0;
3727     }
3728     loc = (char *) l;
3729     lend = l + lcur;
3730 
3731     if (*l == 'V') {    /*  Inversion list format */
3732         char *after_strtol = (char *) lend;
3733         UV element0;
3734         UV* other_elements_ptr;
3735 
3736         /* The first number is a count of the rest */
3737         l++;
3738         elements = Strtoul((char *)l, &after_strtol, 10);
3739         if (elements == 0) {
3740             invlist = _new_invlist(0);
3741         }
3742         else {
3743             l = (U8 *) after_strtol;
3744 
3745             /* Get the 0th element, which is needed to setup the inversion list */
3746             element0 = (UV) Strtoul((char *)l, &after_strtol, 10);
3747             l = (U8 *) after_strtol;
3748             invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
3749             elements--;
3750 
3751             /* Then just populate the rest of the input */
3752             while (elements-- > 0) {
3753                 if (l > lend) {
3754                     Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
3755                 }
3756                 *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10);
3757                 l = (U8 *) after_strtol;
3758             }
3759         }
3760     }
3761     else {
3762 
3763         /* Scan the input to count the number of lines to preallocate array
3764          * size based on worst possible case, which is each line in the input
3765          * creates 2 elements in the inversion list: 1) the beginning of a
3766          * range in the list; 2) the beginning of a range not in the list.  */
3767         while ((loc = (strchr(loc, '\n'))) != NULL) {
3768             elements += 2;
3769             loc++;
3770         }
3771 
3772         /* If the ending is somehow corrupt and isn't a new line, add another
3773          * element for the final range that isn't in the inversion list */
3774         if (! (*lend == '\n'
3775             || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
3776         {
3777             elements++;
3778         }
3779 
3780         invlist = _new_invlist(elements);
3781 
3782         /* Now go through the input again, adding each range to the list */
3783         while (l < lend) {
3784             UV start, end;
3785             UV val;		/* Not used by this function */
3786 
3787             l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
3788                                             cBOOL(octets), typestr);
3789 
3790             if (l > lend) {
3791                 break;
3792             }
3793 
3794             invlist = _add_range_to_invlist(invlist, start, end);
3795         }
3796     }
3797 
3798     /* Invert if the data says it should be */
3799     if (invert_it_svp && SvUV(*invert_it_svp)) {
3800 	_invlist_invert(invlist);
3801     }
3802 
3803     /* This code is copied from swatch_get()
3804      * read $swash->{EXTRAS} */
3805     x = (U8*)SvPV(*extssvp, xcur);
3806     xend = x + xcur;
3807     while (x < xend) {
3808 	STRLEN namelen;
3809 	U8 *namestr;
3810 	SV** othersvp;
3811 	HV* otherhv;
3812 	STRLEN otherbits;
3813 	SV **otherbitssvp, *other;
3814 	U8 *nl;
3815 
3816 	const U8 opc = *x++;
3817 	if (opc == '\n')
3818 	    continue;
3819 
3820 	nl = (U8*)memchr(x, '\n', xend - x);
3821 
3822 	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3823 	    if (nl) {
3824 		x = nl + 1; /* 1 is length of "\n" */
3825 		continue;
3826 	    }
3827 	    else {
3828 		x = xend; /* to EXTRAS' end at which \n is not found */
3829 		break;
3830 	    }
3831 	}
3832 
3833 	namestr = x;
3834 	if (nl) {
3835 	    namelen = nl - namestr;
3836 	    x = nl + 1;
3837 	}
3838 	else {
3839 	    namelen = xend - namestr;
3840 	    x = xend;
3841 	}
3842 
3843 	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3844 	otherhv = MUTABLE_HV(SvRV(*othersvp));
3845 	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3846 	otherbits = (STRLEN)SvUV(*otherbitssvp);
3847 
3848 	if (bits != otherbits || bits != 1) {
3849 	    Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
3850 		       "properties, bits=%"UVuf", otherbits=%"UVuf,
3851 		       (UV)bits, (UV)otherbits);
3852 	}
3853 
3854 	/* The "other" swatch must be destroyed after. */
3855 	other = _swash_to_invlist((SV *)*othersvp);
3856 
3857 	/* End of code copied from swatch_get() */
3858 	switch (opc) {
3859 	case '+':
3860 	    _invlist_union(invlist, other, &invlist);
3861 	    break;
3862 	case '!':
3863             _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
3864 	    break;
3865 	case '-':
3866 	    _invlist_subtract(invlist, other, &invlist);
3867 	    break;
3868 	case '&':
3869 	    _invlist_intersection(invlist, other, &invlist);
3870 	    break;
3871 	default:
3872 	    break;
3873 	}
3874 	sv_free(other); /* through with it! */
3875     }
3876 
3877     SvREADONLY_on(invlist);
3878     return invlist;
3879 }
3880 
3881 SV*
3882 Perl__get_swash_invlist(pTHX_ SV* const swash)
3883 {
3884     SV** ptr;
3885 
3886     PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
3887 
3888     if (! SvROK(swash)) {
3889         return NULL;
3890     }
3891 
3892     /* If it really isn't a hash, it isn't really swash; must be an inversion
3893      * list */
3894     if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
3895         return SvRV(swash);
3896     }
3897 
3898     ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
3899     if (! ptr) {
3900         return NULL;
3901     }
3902 
3903     return *ptr;
3904 }
3905 
3906 bool
3907 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
3908 {
3909     /* May change: warns if surrogates, non-character code points, or
3910      * non-Unicode code points are in s which has length len bytes.  Returns
3911      * TRUE if none found; FALSE otherwise.  The only other validity check is
3912      * to make sure that this won't exceed the string's length */
3913 
3914     const U8* const e = s + len;
3915     bool ok = TRUE;
3916 
3917     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
3918 
3919     while (s < e) {
3920 	if (UTF8SKIP(s) > len) {
3921 	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
3922 			   "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
3923 	    return FALSE;
3924 	}
3925 	if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
3926 	    STRLEN char_len;
3927 	    if (UTF8_IS_SUPER(s)) {
3928 		if (ckWARN_d(WARN_NON_UNICODE)) {
3929 		    UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3930 		    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3931 			"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
3932 		    ok = FALSE;
3933 		}
3934 	    }
3935 	    else if (UTF8_IS_SURROGATE(s)) {
3936 		if (ckWARN_d(WARN_SURROGATE)) {
3937 		    UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3938 		    Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3939 			"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
3940 		    ok = FALSE;
3941 		}
3942 	    }
3943 	    else if
3944 		((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
3945 		 && (ckWARN_d(WARN_NONCHAR)))
3946 	    {
3947 		UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3948 		Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
3949 		    "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
3950 		ok = FALSE;
3951 	    }
3952 	}
3953 	s += UTF8SKIP(s);
3954     }
3955 
3956     return ok;
3957 }
3958 
3959 /*
3960 =for apidoc pv_uni_display
3961 
3962 Build to the scalar C<dsv> a displayable version of the string C<spv>,
3963 length C<len>, the displayable version being at most C<pvlim> bytes long
3964 (if longer, the rest is truncated and "..." will be appended).
3965 
3966 The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
3967 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
3968 to display the \\[nrfta\\] as the backslashed versions (like '\n')
3969 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
3970 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
3971 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
3972 
3973 The pointer to the PV of the C<dsv> is returned.
3974 
3975 =cut */
3976 char *
3977 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
3978 {
3979     int truncated = 0;
3980     const char *s, *e;
3981 
3982     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
3983 
3984     sv_setpvs(dsv, "");
3985     SvUTF8_off(dsv);
3986     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
3987 	 UV u;
3988 	  /* This serves double duty as a flag and a character to print after
3989 	     a \ when flags & UNI_DISPLAY_BACKSLASH is true.
3990 	  */
3991 	 char ok = 0;
3992 
3993 	 if (pvlim && SvCUR(dsv) >= pvlim) {
3994 	      truncated++;
3995 	      break;
3996 	 }
3997 	 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
3998 	 if (u < 256) {
3999 	     const unsigned char c = (unsigned char)u & 0xFF;
4000 	     if (flags & UNI_DISPLAY_BACKSLASH) {
4001 	         switch (c) {
4002 		 case '\n':
4003 		     ok = 'n'; break;
4004 		 case '\r':
4005 		     ok = 'r'; break;
4006 		 case '\t':
4007 		     ok = 't'; break;
4008 		 case '\f':
4009 		     ok = 'f'; break;
4010 		 case '\a':
4011 		     ok = 'a'; break;
4012 		 case '\\':
4013 		     ok = '\\'; break;
4014 		 default: break;
4015 		 }
4016 		 if (ok) {
4017 		     const char string = ok;
4018 		     sv_catpvs(dsv, "\\");
4019 		     sv_catpvn(dsv, &string, 1);
4020 		 }
4021 	     }
4022 	     /* isPRINT() is the locale-blind version. */
4023 	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4024 		 const char string = c;
4025 		 sv_catpvn(dsv, &string, 1);
4026 		 ok = 1;
4027 	     }
4028 	 }
4029 	 if (!ok)
4030 	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
4031     }
4032     if (truncated)
4033 	 sv_catpvs(dsv, "...");
4034 
4035     return SvPVX(dsv);
4036 }
4037 
4038 /*
4039 =for apidoc sv_uni_display
4040 
4041 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4042 the displayable version being at most C<pvlim> bytes long
4043 (if longer, the rest is truncated and "..." will be appended).
4044 
4045 The C<flags> argument is as in L</pv_uni_display>().
4046 
4047 The pointer to the PV of the C<dsv> is returned.
4048 
4049 =cut
4050 */
4051 char *
4052 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4053 {
4054     const char * const ptr =
4055         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4056 
4057     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4058 
4059     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4060 				SvCUR(ssv), pvlim, flags);
4061 }
4062 
4063 /*
4064 =for apidoc foldEQ_utf8
4065 
4066 Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
4067 of which may be in UTF-8) are the same case-insensitively; false otherwise.
4068 How far into the strings to compare is determined by other input parameters.
4069 
4070 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4071 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for C<u2>
4072 with respect to C<s2>.
4073 
4074 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
4075 equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.  The
4076 scan will not be considered to be a match unless the goal is reached, and
4077 scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
4078 C<s2>.
4079 
4080 If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
4081 considered an end pointer to the position 1 byte past the maximum point
4082 in C<s1> beyond which scanning will not continue under any circumstances.
4083 (This routine assumes that UTF-8 encoded input strings are not malformed;
4084 malformed input can cause it to read past C<pe1>).
4085 This means that if both C<l1> and C<pe1> are specified, and C<pe1>
4086 is less than C<s1>+C<l1>, the match will never be successful because it can
4087 never
4088 get as far as its goal (and in fact is asserted against).  Correspondingly for
4089 C<pe2> with respect to C<s2>.
4090 
4091 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4092 C<l2> must be non-zero), and if both do, both have to be
4093 reached for a successful match.   Also, if the fold of a character is multiple
4094 characters, all of them must be matched (see tr21 reference below for
4095 'folding').
4096 
4097 Upon a successful match, if C<pe1> is non-NULL,
4098 it will be set to point to the beginning of the I<next> character of C<s1>
4099 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4100 
4101 For case-insensitiveness, the "casefolding" of Unicode is used
4102 instead of upper/lowercasing both the characters, see
4103 L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
4104 
4105 =cut */
4106 
4107 /* A flags parameter has been added which may change, and hence isn't
4108  * externally documented.  Currently it is:
4109  *  0 for as-documented above
4110  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4111 			    ASCII one, to not match
4112  *  FOLDEQ_LOCALE	    is set iff the rules from the current underlying
4113  *	                    locale are to be used.
4114  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4115  *                            routine.  This allows that step to be skipped.
4116  *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
4117  */
4118 I32
4119 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
4120 {
4121     dVAR;
4122     const U8 *p1  = (const U8*)s1; /* Point to current char */
4123     const U8 *p2  = (const U8*)s2;
4124     const U8 *g1 = NULL;       /* goal for s1 */
4125     const U8 *g2 = NULL;
4126     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4127     U8 *f1 = NULL;             /* Point to current folded */
4128     const U8 *e2 = NULL;
4129     U8 *f2 = NULL;
4130     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4131     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4132     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4133 
4134     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4135 
4136     assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4137            && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
4138     /* The algorithm is to trial the folds without regard to the flags on
4139      * the first line of the above assert(), and then see if the result
4140      * violates them.  This means that the inputs can't be pre-folded to a
4141      * violating result, hence the assert.  This could be changed, with the
4142      * addition of extra tests here for the already-folded case, which would
4143      * slow it down.  That cost is more than any possible gain for when these
4144      * flags are specified, as the flags indicate /il or /iaa matching which
4145      * is less common than /iu, and I (khw) also believe that real-world /il
4146      * and /iaa matches are most likely to involve code points 0-255, and this
4147      * function only under rare conditions gets called for 0-255. */
4148 
4149     if (IN_UTF8_CTYPE_LOCALE) {
4150         flags &= ~FOLDEQ_LOCALE;
4151     }
4152 
4153     if (pe1) {
4154         e1 = *(U8**)pe1;
4155     }
4156 
4157     if (l1) {
4158         g1 = (const U8*)s1 + l1;
4159     }
4160 
4161     if (pe2) {
4162         e2 = *(U8**)pe2;
4163     }
4164 
4165     if (l2) {
4166         g2 = (const U8*)s2 + l2;
4167     }
4168 
4169     /* Must have at least one goal */
4170     assert(g1 || g2);
4171 
4172     if (g1) {
4173 
4174         /* Will never match if goal is out-of-bounds */
4175         assert(! e1  || e1 >= g1);
4176 
4177         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4178         * only go as far as the goal */
4179         e1 = g1;
4180     }
4181     else {
4182 	assert(e1);    /* Must have an end for looking at s1 */
4183     }
4184 
4185     /* Same for goal for s2 */
4186     if (g2) {
4187         assert(! e2  || e2 >= g2);
4188         e2 = g2;
4189     }
4190     else {
4191 	assert(e2);
4192     }
4193 
4194     /* If both operands are already folded, we could just do a memEQ on the
4195      * whole strings at once, but it would be better if the caller realized
4196      * this and didn't even call us */
4197 
4198     /* Look through both strings, a character at a time */
4199     while (p1 < e1 && p2 < e2) {
4200 
4201         /* If at the beginning of a new character in s1, get its fold to use
4202 	 * and the length of the fold.  (exception: locale rules just get the
4203 	 * character to a single byte) */
4204         if (n1 == 0) {
4205 	    if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4206 		f1 = (U8 *) p1;
4207 		n1 = UTF8SKIP(f1);
4208 	    }
4209 	    else {
4210 		/* If in locale matching, we use two sets of rules, depending
4211 		 * on if the code point is above or below 255.  Here, we test
4212 		 * for and handle locale rules */
4213 		if ((flags & FOLDEQ_LOCALE)
4214 		    && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1)))
4215 		{
4216 		    /* There is no mixing of code points above and below 255. */
4217 		    if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) {
4218 			return 0;
4219 		    }
4220 
4221 		    /* We handle locale rules by converting, if necessary, the
4222 		     * code point to a single byte. */
4223 		    if (! u1 || UTF8_IS_INVARIANT(*p1)) {
4224 			*foldbuf1 = *p1;
4225 		    }
4226 		    else {
4227 			*foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1));
4228 		    }
4229 		    n1 = 1;
4230 		}
4231 		else if (isASCII(*p1)) {    /* Note, that here won't be both
4232 					       ASCII and using locale rules */
4233 
4234 		    /* If trying to mix non- with ASCII, and not supposed to,
4235 		     * fail */
4236 		    if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4237 			return 0;
4238 		    }
4239 		    n1 = 1;
4240 		    *foldbuf1 = toFOLD(*p1);
4241 		}
4242 		else if (u1) {
4243 		    to_utf8_fold(p1, foldbuf1, &n1);
4244 		}
4245 		else {  /* Not utf8, get utf8 fold */
4246 		    to_uni_fold(*p1, foldbuf1, &n1);
4247 		}
4248 		f1 = foldbuf1;
4249 	    }
4250         }
4251 
4252         if (n2 == 0) {    /* Same for s2 */
4253 	    if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4254 		f2 = (U8 *) p2;
4255 		n2 = UTF8SKIP(f2);
4256 	    }
4257 	    else {
4258 		if ((flags & FOLDEQ_LOCALE)
4259 		    && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2)))
4260 		{
4261 		    /* Here, the next char in s2 is < 256.  We've already
4262 		     * worked on s1, and if it isn't also < 256, can't match */
4263 		    if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) {
4264 			return 0;
4265 		    }
4266 		    if (! u2 || UTF8_IS_INVARIANT(*p2)) {
4267 			*foldbuf2 = *p2;
4268 		    }
4269 		    else {
4270 			*foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1));
4271 		    }
4272 
4273 		    /* Use another function to handle locale rules.  We've made
4274 		     * sure that both characters to compare are single bytes */
4275 		    if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
4276 			return 0;
4277 		    }
4278 		    n1 = n2 = 0;
4279 		}
4280 		else if (isASCII(*p2)) {
4281 		    if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4282 			return 0;
4283 		    }
4284 		    n2 = 1;
4285 		    *foldbuf2 = toFOLD(*p2);
4286 		}
4287 		else if (u2) {
4288 		    to_utf8_fold(p2, foldbuf2, &n2);
4289 		}
4290 		else {
4291 		    to_uni_fold(*p2, foldbuf2, &n2);
4292 		}
4293 		f2 = foldbuf2;
4294 	    }
4295         }
4296 
4297 	/* Here f1 and f2 point to the beginning of the strings to compare.
4298 	 * These strings are the folds of the next character from each input
4299 	 * string, stored in utf8. */
4300 
4301         /* While there is more to look for in both folds, see if they
4302         * continue to match */
4303         while (n1 && n2) {
4304             U8 fold_length = UTF8SKIP(f1);
4305             if (fold_length != UTF8SKIP(f2)
4306                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4307                                                        function call for single
4308                                                        byte */
4309                 || memNE((char*)f1, (char*)f2, fold_length))
4310             {
4311                 return 0; /* mismatch */
4312             }
4313 
4314             /* Here, they matched, advance past them */
4315             n1 -= fold_length;
4316             f1 += fold_length;
4317             n2 -= fold_length;
4318             f2 += fold_length;
4319         }
4320 
4321         /* When reach the end of any fold, advance the input past it */
4322         if (n1 == 0) {
4323             p1 += u1 ? UTF8SKIP(p1) : 1;
4324         }
4325         if (n2 == 0) {
4326             p2 += u2 ? UTF8SKIP(p2) : 1;
4327         }
4328     } /* End of loop through both strings */
4329 
4330     /* A match is defined by each scan that specified an explicit length
4331     * reaching its final goal, and the other not having matched a partial
4332     * character (which can happen when the fold of a character is more than one
4333     * character). */
4334     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4335         return 0;
4336     }
4337 
4338     /* Successful match.  Set output pointers */
4339     if (pe1) {
4340         *pe1 = (char*)p1;
4341     }
4342     if (pe2) {
4343         *pe2 = (char*)p2;
4344     }
4345     return 1;
4346 }
4347 
4348 /* XXX The next four functions should likely be moved to mathoms.c once all
4349  * occurrences of them are removed from the core; some cpan-upstream modules
4350  * still use them */
4351 
4352 U8 *
4353 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
4354 {
4355     PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
4356 
4357     return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
4358 }
4359 
4360 UV
4361 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
4362 {
4363     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
4364 
4365     return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
4366 }
4367 
4368 /*
4369 =for apidoc uvuni_to_utf8_flags
4370 
4371 Instead you almost certainly want to use L</uvchr_to_utf8> or
4372 L</uvchr_to_utf8_flags>>.
4373 
4374 This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
4375 which itself, while not deprecated, should be used only in isolated
4376 circumstances.  These functions were useful for code that wanted to handle
4377 both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
4378 v5.20, the distinctions between the platforms have mostly been made invisible
4379 to most code, so this function is quite unlikely to be what you want.
4380 
4381 =cut
4382 */
4383 
4384 U8 *
4385 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
4386 {
4387     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
4388 
4389     return uvoffuni_to_utf8_flags(d, uv, flags);
4390 }
4391 
4392 /*
4393 =for apidoc utf8n_to_uvuni
4394 
4395 Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
4396 
4397 This function was useful for code that wanted to handle both EBCDIC and
4398 ASCII platforms with Unicode properties, but starting in Perl v5.20, the
4399 distinctions between the platforms have mostly been made invisible to most
4400 code, so this function is quite unlikely to be what you want.  If you do need
4401 this precise functionality, use instead
4402 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
4403 or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
4404 
4405 =cut
4406 */
4407 
4408 /*
4409  * Local variables:
4410  * c-indentation-style: bsd
4411  * c-basic-offset: 4
4412  * indent-tabs-mode: nil
4413  * End:
4414  *
4415  * ex: set ts=8 sts=4 sw=4 et:
4416  */
4417