xref: /openbsd-src/gnu/usr.bin/perl/utf8.c (revision 43003dfe3ad45d1698bed8a37f2b0f5b14f20d4f)
1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
13  *  heard of that we don't want to see any closer; and that's the one place
14  *  we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17  *
18  * 'Well do I understand your speech,' he answered in the same language;
19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
20  *  as is the custom in the West, if you wish to be answered?'
21  *                           --Gandalf, addressing Théoden's door wardens
22  *
23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24  *
25  * ...the travellers perceived that the floor was paved with stones of many
26  * hues; branching runes and strange devices intertwined beneath their feet.
27  *
28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29  */
30 
31 #include "EXTERN.h"
32 #define PERL_IN_UTF8_C
33 #include "perl.h"
34 
35 #ifndef EBCDIC
36 /* Separate prototypes are needed because on ASCII systems these are
37  * usually macros, but they are still compiled as code, too. */
38 PERL_CALLCONV UV	Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39 PERL_CALLCONV U8*	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40 #endif
41 
42 static const char unees[] =
43     "Malformed UTF-8 character (unexpected end of string)";
44 
45 /*
46 =head1 Unicode Support
47 
48 This file contains various utility functions for manipulating UTF8-encoded
49 strings. For the uninitiated, this is a method of representing arbitrary
50 Unicode characters as a variable number of bytes, in such a way that
51 characters in the ASCII range are unmodified, and a zero byte never appears
52 within non-zero characters.
53 
54 =for apidoc uvuni_to_utf8_flags
55 
56 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
57 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
58 bytes available. The return value is the pointer to the byte after the
59 end of the new character. In other words,
60 
61     d = uvuni_to_utf8_flags(d, uv, flags);
62 
63 or, in most cases,
64 
65     d = uvuni_to_utf8(d, uv);
66 
67 (which is equivalent to)
68 
69     d = uvuni_to_utf8_flags(d, uv, 0);
70 
71 is the recommended Unicode-aware way of saying
72 
73     *(d++) = uv;
74 
75 =cut
76 */
77 
78 U8 *
79 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
80 {
81     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
82 
83     if (ckWARN(WARN_UTF8)) {
84 	 if (UNICODE_IS_SURROGATE(uv) &&
85 	     !(flags & UNICODE_ALLOW_SURROGATE))
86 	      Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
87 	 else if (
88 		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
89 		    !(flags & UNICODE_ALLOW_FDD0))
90 		   ||
91 		   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
92 		    !(flags & UNICODE_ALLOW_FFFF))) &&
93 		  /* UNICODE_ALLOW_SUPER includes
94 		   * FFFEs and FFFFs beyond 0x10FFFF. */
95 		  ((uv <= PERL_UNICODE_MAX) ||
96 		   !(flags & UNICODE_ALLOW_SUPER))
97 		  )
98 	      Perl_warner(aTHX_ packWARN(WARN_UTF8),
99 			 "Unicode character 0x%04"UVxf" is illegal", uv);
100     }
101     if (UNI_IS_INVARIANT(uv)) {
102 	*d++ = (U8)UTF_TO_NATIVE(uv);
103 	return d;
104     }
105 #if defined(EBCDIC)
106     else {
107 	STRLEN len  = UNISKIP(uv);
108 	U8 *p = d+len-1;
109 	while (p > d) {
110 	    *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
111 	    uv >>= UTF_ACCUMULATION_SHIFT;
112 	}
113 	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
114 	return d+len;
115     }
116 #else /* Non loop style */
117     if (uv < 0x800) {
118 	*d++ = (U8)(( uv >>  6)         | 0xc0);
119 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
120 	return d;
121     }
122     if (uv < 0x10000) {
123 	*d++ = (U8)(( uv >> 12)         | 0xe0);
124 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
125 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
126 	return d;
127     }
128     if (uv < 0x200000) {
129 	*d++ = (U8)(( uv >> 18)         | 0xf0);
130 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
131 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
132 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
133 	return d;
134     }
135     if (uv < 0x4000000) {
136 	*d++ = (U8)(( uv >> 24)         | 0xf8);
137 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
140 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
141 	return d;
142     }
143     if (uv < 0x80000000) {
144 	*d++ = (U8)(( uv >> 30)         | 0xfc);
145 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
146 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
147 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
148 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
149 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
150 	return d;
151     }
152 #ifdef HAS_QUAD
153     if (uv < UTF8_QUAD_MAX)
154 #endif
155     {
156 	*d++ =                            0xfe;	/* Can't match U+FEFF! */
157 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
158 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
159 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
160 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
161 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
162 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
163 	return d;
164     }
165 #ifdef HAS_QUAD
166     {
167 	*d++ =                            0xff;		/* Can't match U+FFFE! */
168 	*d++ =                            0x80;		/* 6 Reserved bits */
169 	*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
170 	*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
171 	*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
172 	*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
173 	*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
174 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
175 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
176 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
177 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
178 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
179 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
180 	return d;
181     }
182 #endif
183 #endif /* Loop style */
184 }
185 
186 /*
187 
188 Tests if some arbitrary number of bytes begins in a valid UTF-8
189 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
190 UTF-8 character.  The actual number of bytes in the UTF-8 character
191 will be returned if it is valid, otherwise 0.
192 
193 This is the "slow" version as opposed to the "fast" version which is
194 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
195 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
196 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
197 you should use the _slow().  In practice this means that the _slow()
198 will be used very rarely, since the maximum Unicode code point (as of
199 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
200 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
201 five bytes or more.
202 
203 =cut */
204 STATIC STRLEN
205 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
206 {
207     U8 u = *s;
208     STRLEN slen;
209     UV uv, ouv;
210 
211     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
212 
213     if (UTF8_IS_INVARIANT(u))
214 	return 1;
215 
216     if (!UTF8_IS_START(u))
217 	return 0;
218 
219     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
220 	return 0;
221 
222     slen = len - 1;
223     s++;
224 #ifdef EBCDIC
225     u = NATIVE_TO_UTF(u);
226 #endif
227     u &= UTF_START_MASK(len);
228     uv  = u;
229     ouv = uv;
230     while (slen--) {
231 	if (!UTF8_IS_CONTINUATION(*s))
232 	    return 0;
233 	uv = UTF8_ACCUMULATE(uv, *s);
234 	if (uv < ouv)
235 	    return 0;
236 	ouv = uv;
237 	s++;
238     }
239 
240     if ((STRLEN)UNISKIP(uv) < len)
241 	return 0;
242 
243     return len;
244 }
245 
246 /*
247 =for apidoc is_utf8_char
248 
249 Tests if some arbitrary number of bytes begins in a valid UTF-8
250 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
251 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
252 character will be returned if it is valid, otherwise 0.
253 
254 =cut */
255 STRLEN
256 Perl_is_utf8_char(pTHX_ const U8 *s)
257 {
258     const STRLEN len = UTF8SKIP(s);
259 
260     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
261     PERL_UNUSED_CONTEXT;
262 #ifdef IS_UTF8_CHAR
263     if (IS_UTF8_CHAR_FAST(len))
264         return IS_UTF8_CHAR(s, len) ? len : 0;
265 #endif /* #ifdef IS_UTF8_CHAR */
266     return is_utf8_char_slow(s, len);
267 }
268 
269 /*
270 =for apidoc is_utf8_string
271 
272 Returns true if first C<len> bytes of the given string form a valid
273 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
274 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
275 because a valid ASCII string is a valid UTF-8 string.
276 
277 See also is_utf8_string_loclen() and is_utf8_string_loc().
278 
279 =cut
280 */
281 
282 bool
283 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
284 {
285     const U8* const send = s + (len ? len : strlen((const char *)s));
286     const U8* x = s;
287 
288     PERL_ARGS_ASSERT_IS_UTF8_STRING;
289     PERL_UNUSED_CONTEXT;
290 
291     while (x < send) {
292 	STRLEN c;
293 	 /* Inline the easy bits of is_utf8_char() here for speed... */
294 	 if (UTF8_IS_INVARIANT(*x))
295 	      c = 1;
296 	 else if (!UTF8_IS_START(*x))
297 	     goto out;
298 	 else {
299 	      /* ... and call is_utf8_char() only if really needed. */
300 #ifdef IS_UTF8_CHAR
301 	     c = UTF8SKIP(x);
302 	     if (IS_UTF8_CHAR_FAST(c)) {
303 	         if (!IS_UTF8_CHAR(x, c))
304 		     c = 0;
305 	     }
306 	     else
307 		c = is_utf8_char_slow(x, c);
308 #else
309 	     c = is_utf8_char(x);
310 #endif /* #ifdef IS_UTF8_CHAR */
311 	      if (!c)
312 		  goto out;
313 	 }
314         x += c;
315     }
316 
317  out:
318     if (x != send)
319 	return FALSE;
320 
321     return TRUE;
322 }
323 
324 /*
325 Implemented as a macro in utf8.h
326 
327 =for apidoc is_utf8_string_loc
328 
329 Like is_utf8_string() but stores the location of the failure (in the
330 case of "utf8ness failure") or the location s+len (in the case of
331 "utf8ness success") in the C<ep>.
332 
333 See also is_utf8_string_loclen() and is_utf8_string().
334 
335 =for apidoc is_utf8_string_loclen
336 
337 Like is_utf8_string() but stores the location of the failure (in the
338 case of "utf8ness failure") or the location s+len (in the case of
339 "utf8ness success") in the C<ep>, and the number of UTF-8
340 encoded characters in the C<el>.
341 
342 See also is_utf8_string_loc() and is_utf8_string().
343 
344 =cut
345 */
346 
347 bool
348 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
349 {
350     const U8* const send = s + (len ? len : strlen((const char *)s));
351     const U8* x = s;
352     STRLEN c;
353     STRLEN outlen = 0;
354 
355     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
356     PERL_UNUSED_CONTEXT;
357 
358     while (x < send) {
359 	 /* Inline the easy bits of is_utf8_char() here for speed... */
360 	 if (UTF8_IS_INVARIANT(*x))
361 	     c = 1;
362 	 else if (!UTF8_IS_START(*x))
363 	     goto out;
364 	 else {
365 	     /* ... and call is_utf8_char() only if really needed. */
366 #ifdef IS_UTF8_CHAR
367 	     c = UTF8SKIP(x);
368 	     if (IS_UTF8_CHAR_FAST(c)) {
369 	         if (!IS_UTF8_CHAR(x, c))
370 		     c = 0;
371 	     } else
372 	         c = is_utf8_char_slow(x, c);
373 #else
374 	     c = is_utf8_char(x);
375 #endif /* #ifdef IS_UTF8_CHAR */
376 	     if (!c)
377 	         goto out;
378 	 }
379          x += c;
380 	 outlen++;
381     }
382 
383  out:
384     if (el)
385         *el = outlen;
386 
387     if (ep)
388         *ep = x;
389     return (x == send);
390 }
391 
392 /*
393 
394 =for apidoc utf8n_to_uvuni
395 
396 Bottom level UTF-8 decode routine.
397 Returns the Unicode code point value of the first character in the string C<s>
398 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
399 C<retlen> will be set to the length, in bytes, of that character.
400 
401 If C<s> does not point to a well-formed UTF-8 character, the behaviour
402 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
403 it is assumed that the caller will raise a warning, and this function
404 will silently just set C<retlen> to C<-1> and return zero.  If the
405 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
406 malformations will be given, C<retlen> will be set to the expected
407 length of the UTF-8 character in bytes, and zero will be returned.
408 
409 The C<flags> can also contain various flags to allow deviations from
410 the strict UTF-8 encoding (see F<utf8.h>).
411 
412 Most code should use utf8_to_uvchr() rather than call this directly.
413 
414 =cut
415 */
416 
417 UV
418 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
419 {
420     dVAR;
421     const U8 * const s0 = s;
422     UV uv = *s, ouv = 0;
423     STRLEN len = 1;
424     const bool dowarn = ckWARN_d(WARN_UTF8);
425     const UV startbyte = *s;
426     STRLEN expectlen = 0;
427     U32 warning = 0;
428 
429     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
430 
431 /* This list is a superset of the UTF8_ALLOW_XXX. */
432 
433 #define UTF8_WARN_EMPTY				 1
434 #define UTF8_WARN_CONTINUATION			 2
435 #define UTF8_WARN_NON_CONTINUATION	 	 3
436 #define UTF8_WARN_FE_FF				 4
437 #define UTF8_WARN_SHORT				 5
438 #define UTF8_WARN_OVERFLOW			 6
439 #define UTF8_WARN_SURROGATE			 7
440 #define UTF8_WARN_LONG				 8
441 #define UTF8_WARN_FFFF				 9 /* Also FFFE. */
442 
443     if (curlen == 0 &&
444 	!(flags & UTF8_ALLOW_EMPTY)) {
445 	warning = UTF8_WARN_EMPTY;
446 	goto malformed;
447     }
448 
449     if (UTF8_IS_INVARIANT(uv)) {
450 	if (retlen)
451 	    *retlen = 1;
452 	return (UV) (NATIVE_TO_UTF(*s));
453     }
454 
455     if (UTF8_IS_CONTINUATION(uv) &&
456 	!(flags & UTF8_ALLOW_CONTINUATION)) {
457 	warning = UTF8_WARN_CONTINUATION;
458 	goto malformed;
459     }
460 
461     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
462 	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
463 	warning = UTF8_WARN_NON_CONTINUATION;
464 	goto malformed;
465     }
466 
467 #ifdef EBCDIC
468     uv = NATIVE_TO_UTF(uv);
469 #else
470     if ((uv == 0xfe || uv == 0xff) &&
471 	!(flags & UTF8_ALLOW_FE_FF)) {
472 	warning = UTF8_WARN_FE_FF;
473 	goto malformed;
474     }
475 #endif
476 
477     if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
478     else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
479     else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
480     else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
481 #ifdef EBCDIC
482     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
483     else			{ len =  7; uv &= 0x01; }
484 #else
485     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
486     else if (!(uv & 0x01))	{ len =  7; uv = 0; }
487     else			{ len = 13; uv = 0; } /* whoa! */
488 #endif
489 
490     if (retlen)
491 	*retlen = len;
492 
493     expectlen = len;
494 
495     if ((curlen < expectlen) &&
496 	!(flags & UTF8_ALLOW_SHORT)) {
497 	warning = UTF8_WARN_SHORT;
498 	goto malformed;
499     }
500 
501     len--;
502     s++;
503     ouv = uv;
504 
505     while (len--) {
506 	if (!UTF8_IS_CONTINUATION(*s) &&
507 	    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
508 	    s--;
509 	    warning = UTF8_WARN_NON_CONTINUATION;
510 	    goto malformed;
511 	}
512 	else
513 	    uv = UTF8_ACCUMULATE(uv, *s);
514 	if (!(uv > ouv)) {
515 	    /* These cannot be allowed. */
516 	    if (uv == ouv) {
517 		if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
518 		    warning = UTF8_WARN_LONG;
519 		    goto malformed;
520 		}
521 	    }
522 	    else { /* uv < ouv */
523 		/* This cannot be allowed. */
524 		warning = UTF8_WARN_OVERFLOW;
525 		goto malformed;
526 	    }
527 	}
528 	s++;
529 	ouv = uv;
530     }
531 
532     if (UNICODE_IS_SURROGATE(uv) &&
533 	!(flags & UTF8_ALLOW_SURROGATE)) {
534 	warning = UTF8_WARN_SURROGATE;
535 	goto malformed;
536     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
537 	       !(flags & UTF8_ALLOW_LONG)) {
538 	warning = UTF8_WARN_LONG;
539 	goto malformed;
540     } else if (UNICODE_IS_ILLEGAL(uv) &&
541 	       !(flags & UTF8_ALLOW_FFFF)) {
542 	warning = UTF8_WARN_FFFF;
543 	goto malformed;
544     }
545 
546     return uv;
547 
548 malformed:
549 
550     if (flags & UTF8_CHECK_ONLY) {
551 	if (retlen)
552 	    *retlen = ((STRLEN) -1);
553 	return 0;
554     }
555 
556     if (dowarn) {
557 	SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
558 
559 	switch (warning) {
560 	case 0: /* Intentionally empty. */ break;
561 	case UTF8_WARN_EMPTY:
562 	    sv_catpvs(sv, "(empty string)");
563 	    break;
564 	case UTF8_WARN_CONTINUATION:
565 	    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
566 	    break;
567 	case UTF8_WARN_NON_CONTINUATION:
568 	    if (s == s0)
569 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
570                            (UV)s[1], startbyte);
571 	    else {
572 		const int len = (int)(s-s0);
573 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
574                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
575 	    }
576 
577 	    break;
578 	case UTF8_WARN_FE_FF:
579 	    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
580 	    break;
581 	case UTF8_WARN_SHORT:
582 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
583                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
584 	    expectlen = curlen;		/* distance for caller to skip */
585 	    break;
586 	case UTF8_WARN_OVERFLOW:
587 	    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
588                            ouv, *s, startbyte);
589 	    break;
590 	case UTF8_WARN_SURROGATE:
591 	    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
592 	    break;
593 	case UTF8_WARN_LONG:
594 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
595 			   (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
596 	    break;
597 	case UTF8_WARN_FFFF:
598 	    Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
599 	    break;
600 	default:
601 	    sv_catpvs(sv, "(unknown reason)");
602 	    break;
603 	}
604 
605 	if (warning) {
606 	    const char * const s = SvPVX_const(sv);
607 
608 	    if (PL_op)
609 		Perl_warner(aTHX_ packWARN(WARN_UTF8),
610 			    "%s in %s", s,  OP_DESC(PL_op));
611 	    else
612 		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
613 	}
614     }
615 
616     if (retlen)
617 	*retlen = expectlen ? expectlen : len;
618 
619     return 0;
620 }
621 
622 /*
623 =for apidoc utf8_to_uvchr
624 
625 Returns the native character value of the first character in the string C<s>
626 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
627 length, in bytes, of that character.
628 
629 If C<s> does not point to a well-formed UTF-8 character, zero is
630 returned and retlen is set, if possible, to -1.
631 
632 =cut
633 */
634 
635 UV
636 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
637 {
638     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
639 
640     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
641 			  ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
642 }
643 
644 /*
645 =for apidoc utf8_to_uvuni
646 
647 Returns the Unicode code point of the first character in the string C<s>
648 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
649 length, in bytes, of that character.
650 
651 This function should only be used when the returned UV is considered
652 an index into the Unicode semantic tables (e.g. swashes).
653 
654 If C<s> does not point to a well-formed UTF-8 character, zero is
655 returned and retlen is set, if possible, to -1.
656 
657 =cut
658 */
659 
660 UV
661 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
662 {
663     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
664 
665     /* Call the low level routine asking for checks */
666     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
667 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
668 }
669 
670 /*
671 =for apidoc utf8_length
672 
673 Return the length of the UTF-8 char encoded string C<s> in characters.
674 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
675 up past C<e>, croaks.
676 
677 =cut
678 */
679 
680 STRLEN
681 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
682 {
683     dVAR;
684     STRLEN len = 0;
685 
686     PERL_ARGS_ASSERT_UTF8_LENGTH;
687 
688     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
689      * the bitops (especially ~) can create illegal UTF-8.
690      * In other words: in Perl UTF-8 is not just for Unicode. */
691 
692     if (e < s)
693 	goto warn_and_return;
694     while (s < e) {
695 	if (!UTF8_IS_INVARIANT(*s))
696 	    s += UTF8SKIP(s);
697 	else
698 	    s++;
699 	len++;
700     }
701 
702     if (e != s) {
703 	len--;
704         warn_and_return:
705 	if (ckWARN_d(WARN_UTF8)) {
706 	    if (PL_op)
707 		Perl_warner(aTHX_ packWARN(WARN_UTF8),
708 			    "%s in %s", unees, OP_DESC(PL_op));
709 	    else
710 		Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
711 	}
712     }
713 
714     return len;
715 }
716 
717 /*
718 =for apidoc utf8_distance
719 
720 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
721 and C<b>.
722 
723 WARNING: use only if you *know* that the pointers point inside the
724 same UTF-8 buffer.
725 
726 =cut
727 */
728 
729 IV
730 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
731 {
732     PERL_ARGS_ASSERT_UTF8_DISTANCE;
733 
734     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
735 }
736 
737 /*
738 =for apidoc utf8_hop
739 
740 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
741 forward or backward.
742 
743 WARNING: do not use the following unless you *know* C<off> is within
744 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
745 on the first byte of character or just after the last byte of a character.
746 
747 =cut
748 */
749 
750 U8 *
751 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
752 {
753     PERL_ARGS_ASSERT_UTF8_HOP;
754 
755     PERL_UNUSED_CONTEXT;
756     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
757      * the bitops (especially ~) can create illegal UTF-8.
758      * In other words: in Perl UTF-8 is not just for Unicode. */
759 
760     if (off >= 0) {
761 	while (off--)
762 	    s += UTF8SKIP(s);
763     }
764     else {
765 	while (off++) {
766 	    s--;
767 	    while (UTF8_IS_CONTINUATION(*s))
768 		s--;
769 	}
770     }
771     return (U8 *)s;
772 }
773 
774 /*
775 =for apidoc utf8_to_bytes
776 
777 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
778 Unlike C<bytes_to_utf8>, this over-writes the original string, and
779 updates len to contain the new length.
780 Returns zero on failure, setting C<len> to -1.
781 
782 If you need a copy of the string, see C<bytes_from_utf8>.
783 
784 =cut
785 */
786 
787 U8 *
788 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
789 {
790     U8 * const save = s;
791     U8 * const send = s + *len;
792     U8 *d;
793 
794     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
795 
796     /* ensure valid UTF-8 and chars < 256 before updating string */
797     while (s < send) {
798         U8 c = *s++;
799 
800         if (!UTF8_IS_INVARIANT(c) &&
801             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
802 	     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
803             *len = ((STRLEN) -1);
804             return 0;
805         }
806     }
807 
808     d = s = save;
809     while (s < send) {
810         STRLEN ulen;
811         *d++ = (U8)utf8_to_uvchr(s, &ulen);
812         s += ulen;
813     }
814     *d = '\0';
815     *len = d - save;
816     return save;
817 }
818 
819 /*
820 =for apidoc bytes_from_utf8
821 
822 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
823 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
824 the newly-created string, and updates C<len> to contain the new
825 length.  Returns the original string if no conversion occurs, C<len>
826 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
827 0 if C<s> is converted or consisted entirely of characters that are invariant
828 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
829 
830 =cut
831 */
832 
833 U8 *
834 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
835 {
836     U8 *d;
837     const U8 *start = s;
838     const U8 *send;
839     I32 count = 0;
840 
841     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
842 
843     PERL_UNUSED_CONTEXT;
844     if (!*is_utf8)
845         return (U8 *)start;
846 
847     /* ensure valid UTF-8 and chars < 256 before converting string */
848     for (send = s + *len; s < send;) {
849         U8 c = *s++;
850 	if (!UTF8_IS_INVARIANT(c)) {
851 	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
852                 (c = *s++) && UTF8_IS_CONTINUATION(c))
853 		count++;
854 	    else
855                 return (U8 *)start;
856 	}
857     }
858 
859     *is_utf8 = FALSE;
860 
861     Newx(d, (*len) - count + 1, U8);
862     s = start; start = d;
863     while (s < send) {
864 	U8 c = *s++;
865 	if (!UTF8_IS_INVARIANT(c)) {
866 	    /* Then it is two-byte encoded */
867 	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
868 	    c = ASCII_TO_NATIVE(c);
869 	}
870 	*d++ = c;
871     }
872     *d = '\0';
873     *len = d - start;
874     return (U8 *)start;
875 }
876 
877 /*
878 =for apidoc bytes_to_utf8
879 
880 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
881 Returns a pointer to the newly-created string, and sets C<len> to
882 reflect the new length.
883 
884 A NUL character will be written after the end of the string.
885 
886 If you want to convert to UTF-8 from encodings other than
887 the native (Latin1 or EBCDIC),
888 see sv_recode_to_utf8().
889 
890 =cut
891 */
892 
893 U8*
894 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
895 {
896     const U8 * const send = s + (*len);
897     U8 *d;
898     U8 *dst;
899 
900     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
901     PERL_UNUSED_CONTEXT;
902 
903     Newx(d, (*len) * 2 + 1, U8);
904     dst = d;
905 
906     while (s < send) {
907         const UV uv = NATIVE_TO_ASCII(*s++);
908         if (UNI_IS_INVARIANT(uv))
909             *d++ = (U8)UTF_TO_NATIVE(uv);
910         else {
911             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
912             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
913         }
914     }
915     *d = '\0';
916     *len = d-dst;
917     return dst;
918 }
919 
920 /*
921  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
922  *
923  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
924  * We optimize for native, for obvious reasons. */
925 
926 U8*
927 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
928 {
929     U8* pend;
930     U8* dstart = d;
931 
932     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
933 
934     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
935 	 d[0] = 0;
936 	 *newlen = 1;
937 	 return d;
938     }
939 
940     if (bytelen & 1)
941 	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
942 
943     pend = p + bytelen;
944 
945     while (p < pend) {
946 	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
947 	p += 2;
948 	if (uv < 0x80) {
949 #ifdef EBCDIC
950 	    *d++ = UNI_TO_NATIVE(uv);
951 #else
952 	    *d++ = (U8)uv;
953 #endif
954 	    continue;
955 	}
956 	if (uv < 0x800) {
957 	    *d++ = (U8)(( uv >>  6)         | 0xc0);
958 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
959 	    continue;
960 	}
961 	if (uv >= 0xd800 && uv < 0xdbff) {	/* surrogates */
962 	    UV low = (p[0] << 8) + p[1];
963 	    p += 2;
964 	    if (low < 0xdc00 || low >= 0xdfff)
965 		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
966 	    uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
967 	}
968 	if (uv < 0x10000) {
969 	    *d++ = (U8)(( uv >> 12)         | 0xe0);
970 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
971 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
972 	    continue;
973 	}
974 	else {
975 	    *d++ = (U8)(( uv >> 18)         | 0xf0);
976 	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
977 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
978 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
979 	    continue;
980 	}
981     }
982     *newlen = d - dstart;
983     return d;
984 }
985 
986 /* Note: this one is slightly destructive of the source. */
987 
988 U8*
989 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
990 {
991     U8* s = (U8*)p;
992     U8* const send = s + bytelen;
993 
994     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
995 
996     while (s < send) {
997 	const U8 tmp = s[0];
998 	s[0] = s[1];
999 	s[1] = tmp;
1000 	s += 2;
1001     }
1002     return utf16_to_utf8(p, d, bytelen, newlen);
1003 }
1004 
1005 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1006 
1007 bool
1008 Perl_is_uni_alnum(pTHX_ UV c)
1009 {
1010     U8 tmpbuf[UTF8_MAXBYTES+1];
1011     uvchr_to_utf8(tmpbuf, c);
1012     return is_utf8_alnum(tmpbuf);
1013 }
1014 
1015 bool
1016 Perl_is_uni_alnumc(pTHX_ UV c)
1017 {
1018     U8 tmpbuf[UTF8_MAXBYTES+1];
1019     uvchr_to_utf8(tmpbuf, c);
1020     return is_utf8_alnumc(tmpbuf);
1021 }
1022 
1023 bool
1024 Perl_is_uni_idfirst(pTHX_ UV c)
1025 {
1026     U8 tmpbuf[UTF8_MAXBYTES+1];
1027     uvchr_to_utf8(tmpbuf, c);
1028     return is_utf8_idfirst(tmpbuf);
1029 }
1030 
1031 bool
1032 Perl_is_uni_alpha(pTHX_ UV c)
1033 {
1034     U8 tmpbuf[UTF8_MAXBYTES+1];
1035     uvchr_to_utf8(tmpbuf, c);
1036     return is_utf8_alpha(tmpbuf);
1037 }
1038 
1039 bool
1040 Perl_is_uni_ascii(pTHX_ UV c)
1041 {
1042     U8 tmpbuf[UTF8_MAXBYTES+1];
1043     uvchr_to_utf8(tmpbuf, c);
1044     return is_utf8_ascii(tmpbuf);
1045 }
1046 
1047 bool
1048 Perl_is_uni_space(pTHX_ UV c)
1049 {
1050     U8 tmpbuf[UTF8_MAXBYTES+1];
1051     uvchr_to_utf8(tmpbuf, c);
1052     return is_utf8_space(tmpbuf);
1053 }
1054 
1055 bool
1056 Perl_is_uni_digit(pTHX_ UV c)
1057 {
1058     U8 tmpbuf[UTF8_MAXBYTES+1];
1059     uvchr_to_utf8(tmpbuf, c);
1060     return is_utf8_digit(tmpbuf);
1061 }
1062 
1063 bool
1064 Perl_is_uni_upper(pTHX_ UV c)
1065 {
1066     U8 tmpbuf[UTF8_MAXBYTES+1];
1067     uvchr_to_utf8(tmpbuf, c);
1068     return is_utf8_upper(tmpbuf);
1069 }
1070 
1071 bool
1072 Perl_is_uni_lower(pTHX_ UV c)
1073 {
1074     U8 tmpbuf[UTF8_MAXBYTES+1];
1075     uvchr_to_utf8(tmpbuf, c);
1076     return is_utf8_lower(tmpbuf);
1077 }
1078 
1079 bool
1080 Perl_is_uni_cntrl(pTHX_ UV c)
1081 {
1082     U8 tmpbuf[UTF8_MAXBYTES+1];
1083     uvchr_to_utf8(tmpbuf, c);
1084     return is_utf8_cntrl(tmpbuf);
1085 }
1086 
1087 bool
1088 Perl_is_uni_graph(pTHX_ UV c)
1089 {
1090     U8 tmpbuf[UTF8_MAXBYTES+1];
1091     uvchr_to_utf8(tmpbuf, c);
1092     return is_utf8_graph(tmpbuf);
1093 }
1094 
1095 bool
1096 Perl_is_uni_print(pTHX_ UV c)
1097 {
1098     U8 tmpbuf[UTF8_MAXBYTES+1];
1099     uvchr_to_utf8(tmpbuf, c);
1100     return is_utf8_print(tmpbuf);
1101 }
1102 
1103 bool
1104 Perl_is_uni_punct(pTHX_ UV c)
1105 {
1106     U8 tmpbuf[UTF8_MAXBYTES+1];
1107     uvchr_to_utf8(tmpbuf, c);
1108     return is_utf8_punct(tmpbuf);
1109 }
1110 
1111 bool
1112 Perl_is_uni_xdigit(pTHX_ UV c)
1113 {
1114     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1115     uvchr_to_utf8(tmpbuf, c);
1116     return is_utf8_xdigit(tmpbuf);
1117 }
1118 
1119 UV
1120 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1121 {
1122     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1123 
1124     uvchr_to_utf8(p, c);
1125     return to_utf8_upper(p, p, lenp);
1126 }
1127 
1128 UV
1129 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1130 {
1131     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1132 
1133     uvchr_to_utf8(p, c);
1134     return to_utf8_title(p, p, lenp);
1135 }
1136 
1137 UV
1138 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1139 {
1140     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1141 
1142     uvchr_to_utf8(p, c);
1143     return to_utf8_lower(p, p, lenp);
1144 }
1145 
1146 UV
1147 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1148 {
1149     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1150 
1151     uvchr_to_utf8(p, c);
1152     return to_utf8_fold(p, p, lenp);
1153 }
1154 
1155 /* for now these all assume no locale info available for Unicode > 255 */
1156 
1157 bool
1158 Perl_is_uni_alnum_lc(pTHX_ UV c)
1159 {
1160     return is_uni_alnum(c);	/* XXX no locale support yet */
1161 }
1162 
1163 bool
1164 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1165 {
1166     return is_uni_alnumc(c);	/* XXX no locale support yet */
1167 }
1168 
1169 bool
1170 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1171 {
1172     return is_uni_idfirst(c);	/* XXX no locale support yet */
1173 }
1174 
1175 bool
1176 Perl_is_uni_alpha_lc(pTHX_ UV c)
1177 {
1178     return is_uni_alpha(c);	/* XXX no locale support yet */
1179 }
1180 
1181 bool
1182 Perl_is_uni_ascii_lc(pTHX_ UV c)
1183 {
1184     return is_uni_ascii(c);	/* XXX no locale support yet */
1185 }
1186 
1187 bool
1188 Perl_is_uni_space_lc(pTHX_ UV c)
1189 {
1190     return is_uni_space(c);	/* XXX no locale support yet */
1191 }
1192 
1193 bool
1194 Perl_is_uni_digit_lc(pTHX_ UV c)
1195 {
1196     return is_uni_digit(c);	/* XXX no locale support yet */
1197 }
1198 
1199 bool
1200 Perl_is_uni_upper_lc(pTHX_ UV c)
1201 {
1202     return is_uni_upper(c);	/* XXX no locale support yet */
1203 }
1204 
1205 bool
1206 Perl_is_uni_lower_lc(pTHX_ UV c)
1207 {
1208     return is_uni_lower(c);	/* XXX no locale support yet */
1209 }
1210 
1211 bool
1212 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1213 {
1214     return is_uni_cntrl(c);	/* XXX no locale support yet */
1215 }
1216 
1217 bool
1218 Perl_is_uni_graph_lc(pTHX_ UV c)
1219 {
1220     return is_uni_graph(c);	/* XXX no locale support yet */
1221 }
1222 
1223 bool
1224 Perl_is_uni_print_lc(pTHX_ UV c)
1225 {
1226     return is_uni_print(c);	/* XXX no locale support yet */
1227 }
1228 
1229 bool
1230 Perl_is_uni_punct_lc(pTHX_ UV c)
1231 {
1232     return is_uni_punct(c);	/* XXX no locale support yet */
1233 }
1234 
1235 bool
1236 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1237 {
1238     return is_uni_xdigit(c);	/* XXX no locale support yet */
1239 }
1240 
1241 U32
1242 Perl_to_uni_upper_lc(pTHX_ U32 c)
1243 {
1244     /* XXX returns only the first character -- do not use XXX */
1245     /* XXX no locale support yet */
1246     STRLEN len;
1247     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1248     return (U32)to_uni_upper(c, tmpbuf, &len);
1249 }
1250 
1251 U32
1252 Perl_to_uni_title_lc(pTHX_ U32 c)
1253 {
1254     /* XXX returns only the first character XXX -- do not use XXX */
1255     /* XXX no locale support yet */
1256     STRLEN len;
1257     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1258     return (U32)to_uni_title(c, tmpbuf, &len);
1259 }
1260 
1261 U32
1262 Perl_to_uni_lower_lc(pTHX_ U32 c)
1263 {
1264     /* XXX returns only the first character -- do not use XXX */
1265     /* XXX no locale support yet */
1266     STRLEN len;
1267     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1268     return (U32)to_uni_lower(c, tmpbuf, &len);
1269 }
1270 
1271 static bool
1272 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1273 		 const char *const swashname)
1274 {
1275     dVAR;
1276 
1277     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1278 
1279     if (!is_utf8_char(p))
1280 	return FALSE;
1281     if (!*swash)
1282 	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1283     return swash_fetch(*swash, p, TRUE) != 0;
1284 }
1285 
1286 bool
1287 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1288 {
1289     dVAR;
1290 
1291     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1292 
1293     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1294      * descendant of isalnum(3), in other words, it doesn't
1295      * contain the '_'. --jhi */
1296     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1297 }
1298 
1299 bool
1300 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1301 {
1302     dVAR;
1303 
1304     PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1305 
1306     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1307 }
1308 
1309 bool
1310 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1311 {
1312     dVAR;
1313 
1314     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1315 
1316     if (*p == '_')
1317 	return TRUE;
1318     /* is_utf8_idstart would be more logical. */
1319     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1320 }
1321 
1322 bool
1323 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1324 {
1325     dVAR;
1326 
1327     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1328 
1329     if (*p == '_')
1330 	return TRUE;
1331     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1332 }
1333 
1334 bool
1335 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1336 {
1337     dVAR;
1338 
1339     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1340 
1341     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1342 }
1343 
1344 bool
1345 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1346 {
1347     dVAR;
1348 
1349     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1350 
1351     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1352 }
1353 
1354 bool
1355 Perl_is_utf8_space(pTHX_ const U8 *p)
1356 {
1357     dVAR;
1358 
1359     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1360 
1361     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1362 }
1363 
1364 bool
1365 Perl_is_utf8_digit(pTHX_ const U8 *p)
1366 {
1367     dVAR;
1368 
1369     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1370 
1371     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1372 }
1373 
1374 bool
1375 Perl_is_utf8_upper(pTHX_ const U8 *p)
1376 {
1377     dVAR;
1378 
1379     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1380 
1381     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1382 }
1383 
1384 bool
1385 Perl_is_utf8_lower(pTHX_ const U8 *p)
1386 {
1387     dVAR;
1388 
1389     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1390 
1391     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1392 }
1393 
1394 bool
1395 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1396 {
1397     dVAR;
1398 
1399     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1400 
1401     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1402 }
1403 
1404 bool
1405 Perl_is_utf8_graph(pTHX_ const U8 *p)
1406 {
1407     dVAR;
1408 
1409     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1410 
1411     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1412 }
1413 
1414 bool
1415 Perl_is_utf8_print(pTHX_ const U8 *p)
1416 {
1417     dVAR;
1418 
1419     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1420 
1421     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1422 }
1423 
1424 bool
1425 Perl_is_utf8_punct(pTHX_ const U8 *p)
1426 {
1427     dVAR;
1428 
1429     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1430 
1431     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1432 }
1433 
1434 bool
1435 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1436 {
1437     dVAR;
1438 
1439     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1440 
1441     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1442 }
1443 
1444 bool
1445 Perl_is_utf8_mark(pTHX_ const U8 *p)
1446 {
1447     dVAR;
1448 
1449     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1450 
1451     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1452 }
1453 
1454 /*
1455 =for apidoc to_utf8_case
1456 
1457 The "p" contains the pointer to the UTF-8 string encoding
1458 the character that is being converted.
1459 
1460 The "ustrp" is a pointer to the character buffer to put the
1461 conversion result to.  The "lenp" is a pointer to the length
1462 of the result.
1463 
1464 The "swashp" is a pointer to the swash to use.
1465 
Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
but not always, a multicharacter mapping), is tried first.
1469 
1470 The "special" is a string like "utf8::ToSpecLower", which means the
1471 hash %utf8::ToSpecLower.  The access to the hash is through
1472 Perl_to_utf8_case().
1473 
1474 The "normal" is a string like "ToLower" which means the swash
1475 %utf8::ToLower.
1476 
1477 =cut */
1478 
/* Generic case-mapping worker used by to_utf8_upper/title/lower/fold.
 *
 * p        the character to convert
 * ustrp    output buffer that receives the converted character(s)
 * lenp     if non-NULL, receives the number of bytes written to ustrp
 * swashp   address of the cached swash for the "normal" mapping; created on
 *          demand with swash_init("utf8", normal, ...)
 * normal   name of the single-character mapping swash
 * special  name of the hash holding special mappings; may be NULL, in which
 *          case the special lookup is skipped
 *
 * Lookup order: the "special" hash first (only for code point 0xDF or for
 * code points above 0xFF), then the "normal" swash, and finally a plain copy
 * of the input character.  Returns the first character of what was written
 * to ustrp, as decoded by utf8_to_uvchr(). */
UV
Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
			SV **swashp, const char *normal, const char *special)
{
    dVAR;
    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
    STRLEN len = 0;	/* stays 0 until some mapping has produced output */
    const UV uv0 = utf8_to_uvchr(p, NULL);
    /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
     * are necessary in EBCDIC, they are redundant no-ops
     * in ASCII-ish platforms, and hopefully optimized away. */
    const UV uv1 = NATIVE_TO_UNI(uv0);

    PERL_ARGS_ASSERT_TO_UTF8_CASE;

    /* tmpbuf holds the encoding of uv1; it is the key for both lookups */
    uvuni_to_utf8(tmpbuf, uv1);

    if (!*swashp) /* load on-demand */
         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);

    /* The 0xDF is the only special casing Unicode code point below 0x100. */
    if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
         /* It might be "special" (sometimes, but not always,
	  * a multicharacter mapping) */
	 HV * const hv = get_hv(special, 0);
	 SV **svp;

	 if (hv &&
	     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	     (*svp)) {
	     const char *s;

	      s = SvPV_const(*svp, len);
	      /* A one-byte value is re-encoded into ustrp; anything longer
	       * is copied (after remapping on EBCDIC). */
	      if (len == 1)
		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
	      else {
#ifdef EBCDIC
		   /* If we have EBCDIC we need to remap the characters
		    * since any characters in the low 256 are Unicode
		    * code points, not EBCDIC. */
		   U8 *t = (U8*)s, *tend = t + len, *d;

		   d = tmpbuf;
		   if (SvUTF8(*svp)) {
			STRLEN tlen = 0;

			while (t < tend) {
			     const UV c = utf8_to_uvchr(t, &tlen);
			     if (tlen > 0) {
				  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
				  t += tlen;
			     }
			     else
				  break;
			}
		   }
		   else {
			while (t < tend) {
			     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
			     t++;
			}
		   }
		   len = d - tmpbuf;
		   Copy(tmpbuf, ustrp, len, U8);
#else
		   Copy(s, ustrp, len, U8);
#endif
	      }
	 }
    }

    /* No special mapping produced output: try the normal swash. */
    if (!len && *swashp) {
	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);

	 if (uv2) {
	      /* It was "normal" (a single character mapping). */
	      const UV uv3 = UNI_TO_NATIVE(uv2);
	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	 }
    }

    if (!len) /* Neither: just copy. */
	 len = uvchr_to_utf8(ustrp, uv0) - ustrp;

    if (lenp)
	 *lenp = len;

    return len ? utf8_to_uvchr(ustrp, 0) : 0;
}
1568 
1569 /*
1570 =for apidoc to_utf8_upper
1571 
1572 Convert the UTF-8 encoded character at p to its uppercase version and
1573 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1574 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1575 the uppercase version may be longer than the original character.
1576 
1577 The first character of the uppercased version is returned
1578 (but note, as explained above, that there may be more.)
1579 
1580 =cut */
1581 
1582 UV
1583 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1584 {
1585     dVAR;
1586 
1587     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1588 
1589     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1590                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1591 }
1592 
1593 /*
1594 =for apidoc to_utf8_title
1595 
1596 Convert the UTF-8 encoded character at p to its titlecase version and
1597 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1598 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1599 titlecase version may be longer than the original character.
1600 
1601 The first character of the titlecased version is returned
1602 (but note, as explained above, that there may be more.)
1603 
1604 =cut */
1605 
1606 UV
1607 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1608 {
1609     dVAR;
1610 
1611     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1612 
1613     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1614                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1615 }
1616 
1617 /*
1618 =for apidoc to_utf8_lower
1619 
1620 Convert the UTF-8 encoded character at p to its lowercase version and
1621 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1622 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1623 lowercase version may be longer than the original character.
1624 
1625 The first character of the lowercased version is returned
1626 (but note, as explained above, that there may be more.)
1627 
1628 =cut */
1629 
1630 UV
1631 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1632 {
1633     dVAR;
1634 
1635     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1636 
1637     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1638                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1639 }
1640 
1641 /*
1642 =for apidoc to_utf8_fold
1643 
1644 Convert the UTF-8 encoded character at p to its foldcase version and
1645 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1646 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1647 foldcase version may be longer than the original character (up to
1648 three characters).
1649 
1650 The first character of the foldcased version is returned
1651 (but note, as explained above, that there may be more.)
1652 
1653 =cut */
1654 
1655 UV
1656 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1657 {
1658     dVAR;
1659 
1660     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1661 
1662     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1663                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1664 }
1665 
1666 /* Note:
1667  * A "swash" is a swatch hash.
1668  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1669  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1670  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1671  */
/* Creates a swash by calling the Perl-level method SWASHNEW with the
 * arguments (pkg, name, listsv, minbits, none).
 *
 * If the package named by pkg has no SWASHNEW method yet, the module is
 * loaded first via Perl_load_module().  The previous value of ERRSV is
 * restored after each Perl-level call unless that call left ERRSV true.
 *
 * Returns a new SV copied from SWASHNEW's result.  Croaks if that result is
 * not a reference to a hash: with "Can't find Unicode property definition"
 * when the result is a string, otherwise with "SWASHNEW didn't return an HV
 * ref". */
SV*
Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
{
    dVAR;
    SV* retval;
    dSP;
    const size_t pkg_len = strlen(pkg);
    const size_t name_len = strlen(name);
    HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
    SV* errsv_save;

    PERL_ARGS_ASSERT_SWASH_INIT;

    PUSHSTACKi(PERLSI_MAGIC);
    ENTER;
    /* PL_hints is saved and cleared for the duration of the Perl-level
     * calls below; the matching LEAVE restores it. */
    SAVEI32(PL_hints);
    PL_hints = 0;
    save_re_context();
    if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
	ENTER;
	errsv_save = newSVsv(ERRSV);
	/* It is assumed that callers of this routine are not passing in any
	   user derived data.  */
	/* Need to do this after save_re_context() as it will set PL_tainted to
	   1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	   Even line to create errsv_save can turn on PL_tainted.  */
	SAVEBOOL(PL_tainted);
	PL_tainted = 0;
	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
			 NULL);
	/* Keep the caller's ERRSV unless loading the module set a new one. */
	if (!SvTRUE(ERRSV))
	    sv_setsv(ERRSV, errsv_save);
	SvREFCNT_dec(errsv_save);
	LEAVE;
    }
    SPAGAIN;
    /* Push the five SWASHNEW arguments: pkg, name, listsv, minbits, none. */
    PUSHMARK(SP);
    EXTEND(SP,5);
    mPUSHp(pkg, pkg_len);
    mPUSHp(name, name_len);
    PUSHs(listsv);
    mPUSHi(minbits);
    mPUSHi(none);
    PUTBACK;
    errsv_save = newSVsv(ERRSV);
    /* Copy the result off the stack, or fall back to undef if SWASHNEW
     * returned nothing. */
    if (call_method("SWASHNEW", G_SCALAR))
	retval = newSVsv(*PL_stack_sp--);
    else
	retval = &PL_sv_undef;
    if (!SvTRUE(ERRSV))
	sv_setsv(ERRSV, errsv_save);
    SvREFCNT_dec(errsv_save);
    LEAVE;
    POPSTACK;
    if (IN_PERL_COMPILETIME) {
	CopHINTS_set(PL_curcop, PL_hints);
    }
    /* Anything other than a hash reference is a failure. */
    if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
        if (SvPOK(retval))
	    Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
		       SVfARG(retval));
	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
    }
    return retval;
}
1737 
1738 
/* This API is wrong for special case conversions since we may need to
 * return several Unicode characters for a single Unicode character
 * (see lib/unicore/SpecCase.txt).  The SWASHGET in lib/utf8_heavy.pl is
 * the lower-level routine, and it is similarly broken for returning
 * multiple values.  --jhi */
/* Now SWASHGET is recast into S_swash_get in this file. */
1745 
1746 /* Note:
1747  * Returns the value of property/mapping C<swash> for the first character
1748  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1749  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1750  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1751  */
1752 UV
1753 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1754 {
1755     dVAR;
1756     HV *const hv = MUTABLE_HV(SvRV(swash));
1757     U32 klen;
1758     U32 off;
1759     STRLEN slen;
1760     STRLEN needents;
1761     const U8 *tmps = NULL;
1762     U32 bit;
1763     SV *swatch;
1764     U8 tmputf8[2];
1765     const UV c = NATIVE_TO_ASCII(*ptr);
1766 
1767     PERL_ARGS_ASSERT_SWASH_FETCH;
1768 
1769     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1770 	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1771 	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1772 	ptr = tmputf8;
1773     }
1774     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1775      * then the "swatch" is a vec() for al the chars which start
1776      * with 0xAA..0xYY
1777      * So the key in the hash (klen) is length of encoded char -1
1778      */
1779     klen = UTF8SKIP(ptr) - 1;
1780     off  = ptr[klen];
1781 
1782     if (klen == 0) {
1783       /* If char in invariant then swatch is for all the invariant chars
1784        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1785        */
1786 	needents = UTF_CONTINUATION_MARK;
1787 	off      = NATIVE_TO_UTF(ptr[klen]);
1788     }
1789     else {
1790       /* If char is encoded then swatch is for the prefix */
1791 	needents = (1 << UTF_ACCUMULATION_SHIFT);
1792 	off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1793     }
1794 
1795     /*
1796      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1797      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1798      * it's nothing to sniff at.)  Pity we usually come through at least
1799      * two function calls to get here...
1800      *
1801      * NB: this code assumes that swatches are never modified, once generated!
1802      */
1803 
1804     if (hv   == PL_last_swash_hv &&
1805 	klen == PL_last_swash_klen &&
1806 	(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1807     {
1808 	tmps = PL_last_swash_tmps;
1809 	slen = PL_last_swash_slen;
1810     }
1811     else {
1812 	/* Try our second-level swatch cache, kept in a hash. */
1813 	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1814 
1815 	/* If not cached, generate it via swash_get */
1816 	if (!svp || !SvPOK(*svp)
1817 		 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1818 	    /* We use utf8n_to_uvuni() as we want an index into
1819 	       Unicode tables, not a native character number.
1820 	     */
1821 	    const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1822 					   ckWARN(WARN_UTF8) ?
1823 					   0 : UTF8_ALLOW_ANY);
1824 	    swatch = swash_get(swash,
1825 		    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1826 				(klen) ? (code_point & ~(needents - 1)) : 0,
1827 				needents);
1828 
1829 	    if (IN_PERL_COMPILETIME)
1830 		CopHINTS_set(PL_curcop, PL_hints);
1831 
1832 	    svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1833 
1834 	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1835 		     || (slen << 3) < needents)
1836 		Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1837 	}
1838 
1839 	PL_last_swash_hv = hv;
1840 	assert(klen <= sizeof(PL_last_swash_key));
1841 	PL_last_swash_klen = (U8)klen;
1842 	/* FIXME change interpvar.h?  */
1843 	PL_last_swash_tmps = (U8 *) tmps;
1844 	PL_last_swash_slen = slen;
1845 	if (klen)
1846 	    Copy(ptr, PL_last_swash_key, klen, U8);
1847     }
1848 
1849     switch ((int)((slen << 3) / needents)) {
1850     case 1:
1851 	bit = 1 << (off & 7);
1852 	off >>= 3;
1853 	return (tmps[off] & bit) != 0;
1854     case 8:
1855 	return tmps[off];
1856     case 16:
1857 	off <<= 1;
1858 	return (tmps[off] << 8) + tmps[off + 1] ;
1859     case 32:
1860 	off <<= 2;
1861 	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1862     }
1863     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1864     NORETURN_FUNCTION_END;
1865 }
1866 
1867 /* Note:
1868  * Returns a swatch (a bit vector string) for a code point sequence
1869  * that starts from the value C<start> and comprises the number C<span>.
1870  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1871  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1872  */
/* Builds and returns a new swatch SV covering the code points
 * start .. start+span-1 of swash.
 *
 * The swash hash entries LIST, TYPE, BITS, NONE and EXTRAS are read.  BITS
 * must be 1, 8, 16 or 32, otherwise the function croaks.  The swatch holds
 * one entry of BITS bits per code point: a bit vector when BITS is 1, else
 * big-endian values of BITS/8 octets, pre-filled with NONE when NONE is
 * non-zero and with zero otherwise.
 *
 * Each LIST line is parsed as hex "min", optionally followed by a blank and
 * hex "max", optionally followed by a blank and hex "val".  Each EXTRAS line
 * is an operator character ('+', '!', '-' or '&') followed by the name of
 * another swash stored in this hash; that swash's swatch for the same range
 * is computed recursively and merged into the result. */
STATIC SV*
S_swash_get(pTHX_ SV* swash, UV start, UV span)
{
    SV *swatch;
    U8 *l, *lend, *x, *xend, *s;
    STRLEN lcur, xcur, scur;
    HV *const hv = MUTABLE_HV(SvRV(swash));
    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
    SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
    /* NOTE(review): the fetched pointers are dereferenced without a NULL
     * check; this presumably relies on SWASHNEW always supplying these keys
     * -- confirm against lib/utf8_heavy.pl. */
    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
    /* true when TYPE begins with "To", i.e. a mapping rather than a property */
    const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
    const STRLEN bits  = SvUV(*bitssvp);
    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
    const UV     none  = SvUV(*nonesvp);
    const UV     end   = start + span;

    PERL_ARGS_ASSERT_SWASH_GET;

    if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
						 (UV)bits);
    }

    /* create and initialize $swatch */
    scur   = octets ? (span * octets) : (span + 7) / 8;
    swatch = newSV(scur);
    SvPOK_on(swatch);
    s = (U8*)SvPVX(swatch);
    if (octets && none) {
	/* Pre-fill every entry with the big-endian encoding of "none". */
	const U8* const e = s + scur;
	while (s < e) {
	    if (bits == 8)
		*s++ = (U8)(none & 0xff);
	    else if (bits == 16) {
		*s++ = (U8)((none >>  8) & 0xff);
		*s++ = (U8)( none        & 0xff);
	    }
	    else if (bits == 32) {
		*s++ = (U8)((none >> 24) & 0xff);
		*s++ = (U8)((none >> 16) & 0xff);
		*s++ = (U8)((none >>  8) & 0xff);
		*s++ = (U8)( none        & 0xff);
	    }
	}
	*s = '\0';
    }
    else {
	(void)memzero((U8*)s, scur + 1);
    }
    SvCUR_set(swatch, scur);
    /* s was advanced by the fill loop above; reset it to the buffer start */
    s = (U8*)SvPVX(swatch);

    /* read $swash->{LIST} */
    l = (U8*)SvPV(*listsvp, lcur);
    lend = l + lcur;
    while (l < lend) {
	UV min, max, val;
	STRLEN numlen;
	I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;

	U8* const nl = (U8*)memchr(l, '\n', lend - l);

	/* First field: hex "min".  A line that does not start with a hex
	 * number is skipped. */
	numlen = lend - l;
	min = grok_hex((char *)l, &numlen, &flags, NULL);
	if (numlen)
	    l += numlen;
	else if (nl) {
	    l = nl + 1; /* 1 is length of "\n" */
	    continue;
	}
	else {
	    l = lend; /* to LIST's end at which \n is not found */
	    break;
	}

	if (isBLANK(*l)) {
	    /* Second field: hex "max" (defaults to min when absent). */
	    ++l;
	    flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
	    numlen = lend - l;
	    max = grok_hex((char *)l, &numlen, &flags, NULL);
	    if (numlen)
		l += numlen;
	    else
		max = min;

	    if (octets) {
		if (isBLANK(*l)) {
		    /* Third field: hex "val" (0 when absent). */
		    ++l;
		    flags = PERL_SCAN_SILENT_ILLDIGIT |
			    PERL_SCAN_DISALLOW_PREFIX;
		    numlen = lend - l;
		    val = grok_hex((char *)l, &numlen, &flags, NULL);
		    if (numlen)
			l += numlen;
		    else
			val = 0;
		}
		else {
		    val = 0;
		    if (typeto) {
			Perl_croak(aTHX_ "%s: illegal mapping '%s'",
					 typestr, l);
		    }
		}
	    }
	    else
		val = 0; /* bits == 1, then val should be ignored */
	}
	else {
	    max = min;
	    if (octets) {
		val = 0;
		if (typeto) {
		    Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
		}
	    }
	    else
		val = 0; /* bits == 1, then val should be ignored */
	}

	/* Advance to the next line regardless of what else is on this one. */
	if (nl)
	    l = nl + 1;
	else
	    l = lend;

	/* Range lies entirely below the requested window. */
	if (max < start)
	    continue;

	if (octets) {
	    UV key;
	    if (min < start) {
		/* Clip the range to the window, advancing val by the same
		 * amount unless it is pinned at/above "none". */
		if (!none || val < none) {
		    val += start - min;
		}
		min = start;
	    }
	    for (key = min; key <= max; key++) {
		STRLEN offset;
		if (key >= end)
		    goto go_out_list;
		/* offset must be non-negative (start <= min <= key < end) */
		offset = octets * (key - start);
		if (bits == 8)
		    s[offset] = (U8)(val & 0xff);
		else if (bits == 16) {
		    s[offset    ] = (U8)((val >>  8) & 0xff);
		    s[offset + 1] = (U8)( val        & 0xff);
		}
		else if (bits == 32) {
		    s[offset    ] = (U8)((val >> 24) & 0xff);
		    s[offset + 1] = (U8)((val >> 16) & 0xff);
		    s[offset + 2] = (U8)((val >>  8) & 0xff);
		    s[offset + 3] = (U8)( val        & 0xff);
		}

		/* Successive keys in a range get successive values. */
		if (!none || val < none)
		    ++val;
	    }
	}
	else { /* bits == 1, then val should be ignored */
	    UV key;
	    if (min < start)
		min = start;
	    for (key = min; key <= max; key++) {
		const STRLEN offset = (STRLEN)(key - start);
		if (key >= end)
		    goto go_out_list;
		s[offset >> 3] |= 1 << (offset & 7);
	    }
	}
    } /* while */
  go_out_list:

    /* read $swash->{EXTRAS} */
    x = (U8*)SvPV(*extssvp, xcur);
    xend = x + xcur;
    while (x < xend) {
	STRLEN namelen;
	U8 *namestr;
	SV** othersvp;
	HV* otherhv;
	STRLEN otherbits;
	SV **otherbitssvp, *other;
	/* note: this s shadows the function-level s for the loop body */
	U8 *s, *o, *nl;
	STRLEN slen, olen;

	/* First character of the line is the operator. */
	const U8 opc = *x++;
	if (opc == '\n')
	    continue;

	nl = (U8*)memchr(x, '\n', xend - x);

	/* Lines with an unknown operator are skipped. */
	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	    if (nl) {
		x = nl + 1; /* 1 is length of "\n" */
		continue;
	    }
	    else {
		x = xend; /* to EXTRAS' end at which \n is not found */
		break;
	    }
	}

	/* The rest of the line is the key of the other swash in hv. */
	namestr = x;
	if (nl) {
	    namelen = nl - namestr;
	    x = nl + 1;
	}
	else {
	    namelen = xend - namestr;
	    x = xend;
	}

	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	otherhv = MUTABLE_HV(SvRV(*othersvp));
	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	otherbits = (STRLEN)SvUV(*otherbitssvp);
	if (bits < otherbits)
	    Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");

	/* The "other" swatch must be destroyed after. */
	other = swash_get(*othersvp, start, span);
	o = (U8*)SvPV(other, olen);

	if (!olen)
	    Perl_croak(aTHX_ "panic: swash_get got improper swatch");

	s = (U8*)SvPV(swatch, slen);
	if (bits == 1 && otherbits == 1) {
	    /* Both are bit vectors: combine them a byte at a time. */
	    if (slen != olen)
		Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");

	    switch (opc) {
	    case '+':
		while (slen--)
		    *s++ |= *o++;
		break;
	    case '!':
		while (slen--)
		    *s++ |= ~*o++;
		break;
	    case '-':
		while (slen--)
		    *s++ &= ~*o++;
		break;
	    case '&':
		while (slen--)
		    *s++ &= *o++;
		break;
	    default:
		break;
	    }
	}
	else {
	    /* Entry widths differ or are wider than one bit: walk the result
	     * entry by entry, reading the matching entry of the other
	     * swatch. */
	    STRLEN otheroctets = otherbits >> 3;
	    STRLEN offset = 0;
	    U8* const send = s + slen;

	    while (s < send) {
		UV otherval = 0;

		if (otherbits == 1) {
		    otherval = (o[offset >> 3] >> (offset & 7)) & 1;
		    ++offset;
		}
		else {
		    /* assemble a big-endian value of otheroctets bytes */
		    STRLEN vlen = otheroctets;
		    otherval = *o++;
		    while (--vlen) {
			otherval <<= 8;
			otherval |= *o++;
		    }
		}

		if (opc == '+' && otherval)
		    NOOP;   /* replace with otherval */
		else if (opc == '!' && !otherval)
		    otherval = 1;
		else if (opc == '-' && otherval)
		    otherval = 0;
		else if (opc == '&' && !otherval)
		    otherval = 0;
		else {
		    s += octets; /* no replacement */
		    continue;
		}

		if (bits == 8)
		    *s++ = (U8)( otherval & 0xff);
		else if (bits == 16) {
		    *s++ = (U8)((otherval >>  8) & 0xff);
		    *s++ = (U8)( otherval        & 0xff);
		}
		else if (bits == 32) {
		    *s++ = (U8)((otherval >> 24) & 0xff);
		    *s++ = (U8)((otherval >> 16) & 0xff);
		    *s++ = (U8)((otherval >>  8) & 0xff);
		    *s++ = (U8)( otherval        & 0xff);
		}
	    }
	}
	sv_free(other); /* through with it! */
    } /* while */
    return swatch;
}
2181 
2182 /*
2183 =for apidoc uvchr_to_utf8
2184 
Adds the UTF-8 representation of the Native codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
2189 
2190     d = uvchr_to_utf8(d, uv);
2191 
2192 is the recommended wide native character-aware way of saying
2193 
2194     *(d++) = uv;
2195 
2196 =cut
2197 */
2198 
2199 /* On ASCII machines this is normally a macro but we want a
2200    real function in case XS code wants it
2201 */
2202 U8 *
2203 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2204 {
2205     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2206 
2207     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2208 }
2209 
2210 U8 *
2211 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2212 {
2213     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2214 
2215     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2216 }
2217 
2218 /*
2219 =for apidoc utf8n_to_uvchr
2220 flags
2221 
Returns the native character value of the first character in the string
C<s>, which is assumed to be in UTF-8 encoding; C<retlen> will be set to
the length, in bytes, of that character.
2226 
2227 Allows length and flags to be passed to low level routine.
2228 
2229 =cut
2230 */
2231 /* On ASCII machines this is normally a macro but we want
2232    a real function in case XS code wants it
2233 */
2234 UV
2235 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2236 U32 flags)
2237 {
2238     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2239 
2240     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2241 
2242     return UNI_TO_NATIVE(uv);
2243 }
2244 
2245 /*
2246 =for apidoc pv_uni_display
2247 
2248 Build to the scalar dsv a displayable version of the string spv,
2249 length len, the displayable version being at most pvlim bytes long
2250 (if longer, the rest is truncated and "..." will be appended).
2251 
2252 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2253 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2254 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2255 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2256 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2257 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2258 
2259 The pointer to the PV of the dsv is returned.
2260 
2261 =cut */
2262 char *
2263 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2264 {
2265     int truncated = 0;
2266     const char *s, *e;
2267 
2268     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2269 
2270     sv_setpvs(dsv, "");
2271     SvUTF8_off(dsv);
2272     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2273 	 UV u;
2274 	  /* This serves double duty as a flag and a character to print after
2275 	     a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2276 	  */
2277 	 char ok = 0;
2278 
2279 	 if (pvlim && SvCUR(dsv) >= pvlim) {
2280 	      truncated++;
2281 	      break;
2282 	 }
2283 	 u = utf8_to_uvchr((U8*)s, 0);
2284 	 if (u < 256) {
2285 	     const unsigned char c = (unsigned char)u & 0xFF;
2286 	     if (flags & UNI_DISPLAY_BACKSLASH) {
2287 	         switch (c) {
2288 		 case '\n':
2289 		     ok = 'n'; break;
2290 		 case '\r':
2291 		     ok = 'r'; break;
2292 		 case '\t':
2293 		     ok = 't'; break;
2294 		 case '\f':
2295 		     ok = 'f'; break;
2296 		 case '\a':
2297 		     ok = 'a'; break;
2298 		 case '\\':
2299 		     ok = '\\'; break;
2300 		 default: break;
2301 		 }
2302 		 if (ok) {
2303 		     const char string = ok;
2304 		     sv_catpvs(dsv, "\\");
2305 		     sv_catpvn(dsv, &string, 1);
2306 		 }
2307 	     }
2308 	     /* isPRINT() is the locale-blind version. */
2309 	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2310 		 const char string = c;
2311 		 sv_catpvn(dsv, &string, 1);
2312 		 ok = 1;
2313 	     }
2314 	 }
2315 	 if (!ok)
2316 	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2317     }
2318     if (truncated)
2319 	 sv_catpvs(dsv, "...");
2320 
2321     return SvPVX(dsv);
2322 }
2323 
2324 /*
2325 =for apidoc sv_uni_display
2326 
2327 Build to the scalar dsv a displayable version of the scalar sv,
2328 the displayable version being at most pvlim bytes long
2329 (if longer, the rest is truncated and "..." will be appended).
2330 
2331 The flags argument is as in pv_uni_display().
2332 
2333 The pointer to the PV of the dsv is returned.
2334 
2335 =cut
2336 */
2337 char *
2338 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2339 {
2340     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2341 
2342      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2343 				SvCUR(ssv), pvlim, flags);
2344 }
2345 
2346 /*
2347 =for apidoc ibcmp_utf8
2348 
2349 Return true if the strings s1 and s2 differ case-insensitively, false
2350 if not (if they are equal case-insensitively).  If u1 is true, the
2351 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2352 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2353 are false, the respective string is assumed to be in native 8-bit
2354 encoding.
2355 
If pe1 and pe2 are non-NULL, then on a match the scanning pointers will
be copied in there (they will point at the beginning of the I<next>
character).  If the pointers behind pe1 or pe2 are non-NULL, they are the
end pointers beyond which scanning will not continue under any
circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
s2+l2 will be used as goal end pointers that will also stop the scan,
and which qualify towards defining a successful match: all the scans
that define an explicit length must reach their goal pointers for
a match to succeed.
2365 
2366 For case-insensitiveness, the "casefolding" of Unicode is used
2367 instead of upper/lowercasing both the characters, see
2368 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2369 
2370 =cut */
2371 I32
2372 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2373 {
2374      dVAR;
2375      register const U8 *p1  = (const U8*)s1;
2376      register const U8 *p2  = (const U8*)s2;
2377      register const U8 *f1 = NULL;
2378      register const U8 *f2 = NULL;
2379      register U8 *e1 = NULL;
2380      register U8 *q1 = NULL;
2381      register U8 *e2 = NULL;
2382      register U8 *q2 = NULL;
2383      STRLEN n1 = 0, n2 = 0;
2384      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2385      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2386      U8 natbuf[1+1];
2387      STRLEN foldlen1, foldlen2;
2388      bool match;
2389 
2390      PERL_ARGS_ASSERT_IBCMP_UTF8;
2391 
2392      if (pe1)
2393 	  e1 = *(U8**)pe1;
2394      /* assert(e1 || l1); */
2395      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2396 	  f1 = (const U8*)s1 + l1;
2397      if (pe2)
2398 	  e2 = *(U8**)pe2;
2399      /* assert(e2 || l2); */
2400      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2401 	  f2 = (const U8*)s2 + l2;
2402 
2403      /* This shouldn't happen. However, putting an assert() there makes some
2404       * tests fail. */
2405      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2406      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2407 	  return 1; /* mismatch; possible infinite loop or false positive */
2408 
2409      if (!u1 || !u2)
2410 	  natbuf[1] = 0; /* Need to terminate the buffer. */
2411 
2412      while ((e1 == 0 || p1 < e1) &&
2413 	    (f1 == 0 || p1 < f1) &&
2414 	    (e2 == 0 || p2 < e2) &&
2415 	    (f2 == 0 || p2 < f2)) {
2416 	  if (n1 == 0) {
2417 	       if (u1)
2418 		    to_utf8_fold(p1, foldbuf1, &foldlen1);
2419 	       else {
2420 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2421 		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2422 	       }
2423 	       q1 = foldbuf1;
2424 	       n1 = foldlen1;
2425 	  }
2426 	  if (n2 == 0) {
2427 	       if (u2)
2428 		    to_utf8_fold(p2, foldbuf2, &foldlen2);
2429 	       else {
2430 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2431 		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2432 	       }
2433 	       q2 = foldbuf2;
2434 	       n2 = foldlen2;
2435 	  }
2436 	  while (n1 && n2) {
2437 	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2438 		   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2439 		    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2440 		   return 1; /* mismatch */
2441 	       n1 -= UTF8SKIP(q1);
2442 	       q1 += UTF8SKIP(q1);
2443 	       n2 -= UTF8SKIP(q2);
2444 	       q2 += UTF8SKIP(q2);
2445 	  }
2446 	  if (n1 == 0)
2447 	       p1 += u1 ? UTF8SKIP(p1) : 1;
2448 	  if (n2 == 0)
2449 	       p2 += u2 ? UTF8SKIP(p2) : 1;
2450 
2451      }
2452 
2453      /* A match is defined by all the scans that specified
2454       * an explicit length reaching their final goals. */
2455      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2456 
2457      if (match) {
2458 	  if (pe1)
2459 	       *pe1 = (char*)p1;
2460 	  if (pe2)
2461 	       *pe2 = (char*)p2;
2462      }
2463 
2464      return match ? 0 : 1; /* 0 match, 1 mismatch */
2465 }
2466 
2467 /*
2468  * Local variables:
2469  * c-indentation-style: bsd
2470  * c-basic-offset: 4
2471  * indent-tabs-mode: t
2472  * End:
2473  *
2474  * ex: set ts=8 sts=4 sw=4 noet:
2475  */
2476