xref: /openbsd-src/gnu/usr.bin/perl/utf8.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13  * heard of that we don't want to see any closer; and that's the one place
14  * we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  * 'Well do I understand your speech,' he answered in the same language;
17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
18  * as is the custom in the West, if you wish to be answered?'
19  *
20  * ...the travellers perceived that the floor was paved with stones of many
21  * hues; branching runes and strange devices intertwined beneath their feet.
22  */
23 
24 #include "EXTERN.h"
25 #define PERL_IN_UTF8_C
26 #include "perl.h"
27 
28 #ifndef EBCDIC
29 /* Separate prototypes needed because in ASCII systems these
30  * usually macros but they still are compiled as code, too. */
31 PERL_CALLCONV UV	Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
32 PERL_CALLCONV U8*	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
33 #endif
34 
35 static const char unees[] =
36     "Malformed UTF-8 character (unexpected end of string)";
37 
38 /*
39 =head1 Unicode Support
40 
41 This file contains various utility functions for manipulating UTF8-encoded
42 strings. For the uninitiated, this is a method of representing arbitrary
43 Unicode characters as a variable number of bytes, in such a way that
44 characters in the ASCII range are unmodified, and a zero byte never appears
45 within non-zero characters.
46 
47 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
48 
49 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
50 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
51 bytes available. The return value is the pointer to the byte after the
52 end of the new character. In other words,
53 
54     d = uvuni_to_utf8_flags(d, uv, flags);
55 
56 or, in most cases,
57 
58     d = uvuni_to_utf8(d, uv);
59 
60 (which is equivalent to)
61 
62     d = uvuni_to_utf8_flags(d, uv, 0);
63 
64 is the recommended Unicode-aware way of saying
65 
66     *(d++) = uv;
67 
68 =cut
69 */
70 
71 U8 *
72 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
73 {
74     if (ckWARN(WARN_UTF8)) {
75 	 if (UNICODE_IS_SURROGATE(uv) &&
76 	     !(flags & UNICODE_ALLOW_SURROGATE))
77 	      Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
78 	 else if (
79 		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
80 		    !(flags & UNICODE_ALLOW_FDD0))
81 		   ||
82 		   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
83 		    !(flags & UNICODE_ALLOW_FFFF))) &&
84 		  /* UNICODE_ALLOW_SUPER includes
85 		   * FFFEs and FFFFs beyond 0x10FFFF. */
86 		  ((uv <= PERL_UNICODE_MAX) ||
87 		   !(flags & UNICODE_ALLOW_SUPER))
88 		  )
89 	      Perl_warner(aTHX_ packWARN(WARN_UTF8),
90 			 "Unicode character 0x%04"UVxf" is illegal", uv);
91     }
92     if (UNI_IS_INVARIANT(uv)) {
93 	*d++ = (U8)UTF_TO_NATIVE(uv);
94 	return d;
95     }
96 #if defined(EBCDIC)
97     else {
98 	STRLEN len  = UNISKIP(uv);
99 	U8 *p = d+len-1;
100 	while (p > d) {
101 	    *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
102 	    uv >>= UTF_ACCUMULATION_SHIFT;
103 	}
104 	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
105 	return d+len;
106     }
107 #else /* Non loop style */
108     if (uv < 0x800) {
109 	*d++ = (U8)(( uv >>  6)         | 0xc0);
110 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
111 	return d;
112     }
113     if (uv < 0x10000) {
114 	*d++ = (U8)(( uv >> 12)         | 0xe0);
115 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
116 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
117 	return d;
118     }
119     if (uv < 0x200000) {
120 	*d++ = (U8)(( uv >> 18)         | 0xf0);
121 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
122 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
123 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
124 	return d;
125     }
126     if (uv < 0x4000000) {
127 	*d++ = (U8)(( uv >> 24)         | 0xf8);
128 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
129 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
130 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
131 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
132 	return d;
133     }
134     if (uv < 0x80000000) {
135 	*d++ = (U8)(( uv >> 30)         | 0xfc);
136 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
137 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
140 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
141 	return d;
142     }
143 #ifdef HAS_QUAD
144     if (uv < UTF8_QUAD_MAX)
145 #endif
146     {
147 	*d++ =                            0xfe;	/* Can't match U+FEFF! */
148 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
149 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
150 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
151 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
152 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
153 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
154 	return d;
155     }
156 #ifdef HAS_QUAD
157     {
158 	*d++ =                            0xff;		/* Can't match U+FFFE! */
159 	*d++ =                            0x80;		/* 6 Reserved bits */
160 	*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
161 	*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
162 	*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
163 	*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
164 	*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
165 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
166 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
167 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
168 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
169 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
170 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
171 	return d;
172     }
173 #endif
174 #endif /* Loop style */
175 }
176 
177 /*
178 
179 Tests if some arbitrary number of bytes begins in a valid UTF-8
180 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
181 UTF-8 character.  The actual number of bytes in the UTF-8 character
182 will be returned if it is valid, otherwise 0.
183 
184 This is the "slow" version as opposed to the "fast" version which is
185 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
186 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
187 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
188 you should use the _slow().  In practice this means that the _slow()
189 will be used very rarely, since the maximum Unicode code point (as of
190 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
191 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
192 five bytes or more.
193 
194 =cut */
195 STATIC STRLEN
196 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
197 {
198     U8 u = *s;
199     STRLEN slen;
200     UV uv, ouv;
201 
202     if (UTF8_IS_INVARIANT(u))
203 	return 1;
204 
205     if (!UTF8_IS_START(u))
206 	return 0;
207 
208     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
209 	return 0;
210 
211     slen = len - 1;
212     s++;
213 #ifdef EBCDIC
214     u = NATIVE_TO_UTF(u);
215 #endif
216     u &= UTF_START_MASK(len);
217     uv  = u;
218     ouv = uv;
219     while (slen--) {
220 	if (!UTF8_IS_CONTINUATION(*s))
221 	    return 0;
222 	uv = UTF8_ACCUMULATE(uv, *s);
223 	if (uv < ouv)
224 	    return 0;
225 	ouv = uv;
226 	s++;
227     }
228 
229     if ((STRLEN)UNISKIP(uv) < len)
230 	return 0;
231 
232     return len;
233 }
234 
235 /*
236 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
237 
238 Tests if some arbitrary number of bytes begins in a valid UTF-8
239 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
240 UTF-8 character.  The actual number of bytes in the UTF-8 character
241 will be returned if it is valid, otherwise 0.
242 
243 =cut */
244 STRLEN
245 Perl_is_utf8_char(pTHX_ const U8 *s)
246 {
247     const STRLEN len = UTF8SKIP(s);
248     PERL_UNUSED_CONTEXT;
249 #ifdef IS_UTF8_CHAR
250     if (IS_UTF8_CHAR_FAST(len))
251         return IS_UTF8_CHAR(s, len) ? len : 0;
252 #endif /* #ifdef IS_UTF8_CHAR */
253     return is_utf8_char_slow(s, len);
254 }
255 
256 /*
257 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
258 
259 Returns true if first C<len> bytes of the given string form a valid
260 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
261 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
262 because a valid ASCII string is a valid UTF-8 string.
263 
264 See also is_utf8_string_loclen() and is_utf8_string_loc().
265 
266 =cut
267 */
268 
269 bool
270 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
271 {
272     const U8* const send = s + (len ? len : strlen((const char *)s));
273     const U8* x = s;
274 
275     PERL_UNUSED_CONTEXT;
276 
277     while (x < send) {
278 	STRLEN c;
279 	 /* Inline the easy bits of is_utf8_char() here for speed... */
280 	 if (UTF8_IS_INVARIANT(*x))
281 	      c = 1;
282 	 else if (!UTF8_IS_START(*x))
283 	     goto out;
284 	 else {
285 	      /* ... and call is_utf8_char() only if really needed. */
286 #ifdef IS_UTF8_CHAR
287 	     c = UTF8SKIP(x);
288 	     if (IS_UTF8_CHAR_FAST(c)) {
289 	         if (!IS_UTF8_CHAR(x, c))
290 		     c = 0;
291 	     }
292 	     else
293 		c = is_utf8_char_slow(x, c);
294 #else
295 	     c = is_utf8_char(x);
296 #endif /* #ifdef IS_UTF8_CHAR */
297 	      if (!c)
298 		  goto out;
299 	 }
300         x += c;
301     }
302 
303  out:
304     if (x != send)
305 	return FALSE;
306 
307     return TRUE;
308 }
309 
310 /*
311 Implemented as a macro in utf8.h
312 
313 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
314 
315 Like is_utf8_string() but stores the location of the failure (in the
316 case of "utf8ness failure") or the location s+len (in the case of
317 "utf8ness success") in the C<ep>.
318 
319 See also is_utf8_string_loclen() and is_utf8_string().
320 
321 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
322 
323 Like is_utf8_string() but stores the location of the failure (in the
324 case of "utf8ness failure") or the location s+len (in the case of
325 "utf8ness success") in the C<ep>, and the number of UTF-8
326 encoded characters in the C<el>.
327 
328 See also is_utf8_string_loc() and is_utf8_string().
329 
330 =cut
331 */
332 
333 bool
334 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
335 {
336     const U8* const send = s + (len ? len : strlen((const char *)s));
337     const U8* x = s;
338     STRLEN c;
339     STRLEN outlen = 0;
340     PERL_UNUSED_CONTEXT;
341 
342     while (x < send) {
343 	 /* Inline the easy bits of is_utf8_char() here for speed... */
344 	 if (UTF8_IS_INVARIANT(*x))
345 	     c = 1;
346 	 else if (!UTF8_IS_START(*x))
347 	     goto out;
348 	 else {
349 	     /* ... and call is_utf8_char() only if really needed. */
350 #ifdef IS_UTF8_CHAR
351 	     c = UTF8SKIP(x);
352 	     if (IS_UTF8_CHAR_FAST(c)) {
353 	         if (!IS_UTF8_CHAR(x, c))
354 		     c = 0;
355 	     } else
356 	         c = is_utf8_char_slow(x, c);
357 #else
358 	     c = is_utf8_char(x);
359 #endif /* #ifdef IS_UTF8_CHAR */
360 	     if (!c)
361 	         goto out;
362 	 }
363          x += c;
364 	 outlen++;
365     }
366 
367  out:
368     if (el)
369         *el = outlen;
370 
371     if (ep)
372         *ep = x;
373     return (x == send);
374 }
375 
376 /*
377 
378 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
379 
380 Bottom level UTF-8 decode routine.
381 Returns the Unicode code point value of the first character in the string C<s>
382 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
383 C<retlen> will be set to the length, in bytes, of that character.
384 
385 If C<s> does not point to a well-formed UTF-8 character, the behaviour
386 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
387 it is assumed that the caller will raise a warning, and this function
388 will silently just set C<retlen> to C<-1> and return zero.  If the
389 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
390 malformations will be given, C<retlen> will be set to the expected
391 length of the UTF-8 character in bytes, and zero will be returned.
392 
393 The C<flags> can also contain various flags to allow deviations from
394 the strict UTF-8 encoding (see F<utf8.h>).
395 
396 Most code should use utf8_to_uvchr() rather than call this directly.
397 
398 =cut
399 */
400 
401 UV
402 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
403 {
404     dVAR;
405     const U8 * const s0 = s;
406     UV uv = *s, ouv = 0;
407     STRLEN len = 1;
408     const bool dowarn = ckWARN_d(WARN_UTF8);
409     const UV startbyte = *s;
410     STRLEN expectlen = 0;
411     U32 warning = 0;
412 
413 /* This list is a superset of the UTF8_ALLOW_XXX. */
414 
415 #define UTF8_WARN_EMPTY				 1
416 #define UTF8_WARN_CONTINUATION			 2
417 #define UTF8_WARN_NON_CONTINUATION	 	 3
418 #define UTF8_WARN_FE_FF				 4
419 #define UTF8_WARN_SHORT				 5
420 #define UTF8_WARN_OVERFLOW			 6
421 #define UTF8_WARN_SURROGATE			 7
422 #define UTF8_WARN_LONG				 8
423 #define UTF8_WARN_FFFF				 9 /* Also FFFE. */
424 
425     if (curlen == 0 &&
426 	!(flags & UTF8_ALLOW_EMPTY)) {
427 	warning = UTF8_WARN_EMPTY;
428 	goto malformed;
429     }
430 
431     if (UTF8_IS_INVARIANT(uv)) {
432 	if (retlen)
433 	    *retlen = 1;
434 	return (UV) (NATIVE_TO_UTF(*s));
435     }
436 
437     if (UTF8_IS_CONTINUATION(uv) &&
438 	!(flags & UTF8_ALLOW_CONTINUATION)) {
439 	warning = UTF8_WARN_CONTINUATION;
440 	goto malformed;
441     }
442 
443     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
444 	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
445 	warning = UTF8_WARN_NON_CONTINUATION;
446 	goto malformed;
447     }
448 
449 #ifdef EBCDIC
450     uv = NATIVE_TO_UTF(uv);
451 #else
452     if ((uv == 0xfe || uv == 0xff) &&
453 	!(flags & UTF8_ALLOW_FE_FF)) {
454 	warning = UTF8_WARN_FE_FF;
455 	goto malformed;
456     }
457 #endif
458 
459     if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
460     else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
461     else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
462     else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
463 #ifdef EBCDIC
464     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
465     else			{ len =  7; uv &= 0x01; }
466 #else
467     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
468     else if (!(uv & 0x01))	{ len =  7; uv = 0; }
469     else			{ len = 13; uv = 0; } /* whoa! */
470 #endif
471 
472     if (retlen)
473 	*retlen = len;
474 
475     expectlen = len;
476 
477     if ((curlen < expectlen) &&
478 	!(flags & UTF8_ALLOW_SHORT)) {
479 	warning = UTF8_WARN_SHORT;
480 	goto malformed;
481     }
482 
483     len--;
484     s++;
485     ouv = uv;
486 
487     while (len--) {
488 	if (!UTF8_IS_CONTINUATION(*s) &&
489 	    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
490 	    s--;
491 	    warning = UTF8_WARN_NON_CONTINUATION;
492 	    goto malformed;
493 	}
494 	else
495 	    uv = UTF8_ACCUMULATE(uv, *s);
496 	if (!(uv > ouv)) {
497 	    /* These cannot be allowed. */
498 	    if (uv == ouv) {
499 		if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
500 		    warning = UTF8_WARN_LONG;
501 		    goto malformed;
502 		}
503 	    }
504 	    else { /* uv < ouv */
505 		/* This cannot be allowed. */
506 		warning = UTF8_WARN_OVERFLOW;
507 		goto malformed;
508 	    }
509 	}
510 	s++;
511 	ouv = uv;
512     }
513 
514     if (UNICODE_IS_SURROGATE(uv) &&
515 	!(flags & UTF8_ALLOW_SURROGATE)) {
516 	warning = UTF8_WARN_SURROGATE;
517 	goto malformed;
518     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
519 	       !(flags & UTF8_ALLOW_LONG)) {
520 	warning = UTF8_WARN_LONG;
521 	goto malformed;
522     } else if (UNICODE_IS_ILLEGAL(uv) &&
523 	       !(flags & UTF8_ALLOW_FFFF)) {
524 	warning = UTF8_WARN_FFFF;
525 	goto malformed;
526     }
527 
528     return uv;
529 
530 malformed:
531 
532     if (flags & UTF8_CHECK_ONLY) {
533 	if (retlen)
534 	    *retlen = ((STRLEN) -1);
535 	return 0;
536     }
537 
538     if (dowarn) {
539 	SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
540 
541 	switch (warning) {
542 	case 0: /* Intentionally empty. */ break;
543 	case UTF8_WARN_EMPTY:
544 	    sv_catpvs(sv, "(empty string)");
545 	    break;
546 	case UTF8_WARN_CONTINUATION:
547 	    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
548 	    break;
549 	case UTF8_WARN_NON_CONTINUATION:
550 	    if (s == s0)
551 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
552                            (UV)s[1], startbyte);
553 	    else {
554 		const int len = (int)(s-s0);
555 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
556                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
557 	    }
558 
559 	    break;
560 	case UTF8_WARN_FE_FF:
561 	    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
562 	    break;
563 	case UTF8_WARN_SHORT:
564 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
565                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
566 	    expectlen = curlen;		/* distance for caller to skip */
567 	    break;
568 	case UTF8_WARN_OVERFLOW:
569 	    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
570                            ouv, *s, startbyte);
571 	    break;
572 	case UTF8_WARN_SURROGATE:
573 	    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
574 	    break;
575 	case UTF8_WARN_LONG:
576 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
577 			   (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
578 	    break;
579 	case UTF8_WARN_FFFF:
580 	    Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
581 	    break;
582 	default:
583 	    sv_catpvs(sv, "(unknown reason)");
584 	    break;
585 	}
586 
587 	if (warning) {
588 	    const char * const s = SvPVX_const(sv);
589 
590 	    if (PL_op)
591 		Perl_warner(aTHX_ packWARN(WARN_UTF8),
592 			    "%s in %s", s,  OP_DESC(PL_op));
593 	    else
594 		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
595 	}
596     }
597 
598     if (retlen)
599 	*retlen = expectlen ? expectlen : len;
600 
601     return 0;
602 }
603 
604 /*
605 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
606 
607 Returns the native character value of the first character in the string C<s>
608 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
609 length, in bytes, of that character.
610 
611 If C<s> does not point to a well-formed UTF-8 character, zero is
612 returned and retlen is set, if possible, to -1.
613 
614 =cut
615 */
616 
617 UV
618 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
619 {
620     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
621 			  ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
622 }
623 
624 /*
625 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
626 
627 Returns the Unicode code point of the first character in the string C<s>
628 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
629 length, in bytes, of that character.
630 
631 This function should only be used when returned UV is considered
632 an index into the Unicode semantic tables (e.g. swashes).
633 
634 If C<s> does not point to a well-formed UTF-8 character, zero is
635 returned and retlen is set, if possible, to -1.
636 
637 =cut
638 */
639 
640 UV
641 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
642 {
643     /* Call the low level routine asking for checks */
644     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
645 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
646 }
647 
648 /*
649 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
650 
651 Return the length of the UTF-8 char encoded string C<s> in characters.
652 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
653 up past C<e>, croaks.
654 
655 =cut
656 */
657 
658 STRLEN
659 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
660 {
661     dVAR;
662     STRLEN len = 0;
663     U8 t = 0;
664 
665     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
666      * the bitops (especially ~) can create illegal UTF-8.
667      * In other words: in Perl UTF-8 is not just for Unicode. */
668 
669     if (e < s)
670 	goto warn_and_return;
671     while (s < e) {
672 	t = UTF8SKIP(s);
673 	if (e - s < t) {
674 	    warn_and_return:
675 	    if (ckWARN_d(WARN_UTF8)) {
676 	        if (PL_op)
677 		    Perl_warner(aTHX_ packWARN(WARN_UTF8),
678 			    "%s in %s", unees, OP_DESC(PL_op));
679 		else
680 		    Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
681 	    }
682 	    return len;
683 	}
684 	s += t;
685 	len++;
686     }
687 
688     return len;
689 }
690 
691 /*
692 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
693 
694 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
695 and C<b>.
696 
697 WARNING: use only if you *know* that the pointers point inside the
698 same UTF-8 buffer.
699 
700 =cut
701 */
702 
703 IV
704 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
705 {
706     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
707 }
708 
709 /*
710 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
711 
712 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
713 forward or backward.
714 
715 WARNING: do not use the following unless you *know* C<off> is within
716 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
717 on the first byte of character or just after the last byte of a character.
718 
719 =cut
720 */
721 
722 U8 *
723 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
724 {
725     PERL_UNUSED_CONTEXT;
726     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
727      * the bitops (especially ~) can create illegal UTF-8.
728      * In other words: in Perl UTF-8 is not just for Unicode. */
729 
730     if (off >= 0) {
731 	while (off--)
732 	    s += UTF8SKIP(s);
733     }
734     else {
735 	while (off++) {
736 	    s--;
737 	    while (UTF8_IS_CONTINUATION(*s))
738 		s--;
739 	}
740     }
741     return (U8 *)s;
742 }
743 
744 /*
745 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
746 
747 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
748 Unlike C<bytes_to_utf8>, this over-writes the original string, and
749 updates len to contain the new length.
750 Returns zero on failure, setting C<len> to -1.
751 
752 If you need a copy of the string, see C<bytes_from_utf8>.
753 
754 =cut
755 */
756 
757 U8 *
758 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
759 {
760     U8 * const save = s;
761     U8 * const send = s + *len;
762     U8 *d;
763 
764     /* ensure valid UTF-8 and chars < 256 before updating string */
765     while (s < send) {
766         U8 c = *s++;
767 
768         if (!UTF8_IS_INVARIANT(c) &&
769             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
770 	     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
771             *len = ((STRLEN) -1);
772             return 0;
773         }
774     }
775 
776     d = s = save;
777     while (s < send) {
778         STRLEN ulen;
779         *d++ = (U8)utf8_to_uvchr(s, &ulen);
780         s += ulen;
781     }
782     *d = '\0';
783     *len = d - save;
784     return save;
785 }
786 
787 /*
788 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
789 
790 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
791 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
792 the newly-created string, and updates C<len> to contain the new
793 length.  Returns the original string if no conversion occurs, C<len>
794 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
795 0 if C<s> is converted or contains all 7bit characters.
796 
797 =cut
798 */
799 
800 U8 *
801 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
802 {
803     U8 *d;
804     const U8 *start = s;
805     const U8 *send;
806     I32 count = 0;
807 
808     PERL_UNUSED_CONTEXT;
809     if (!*is_utf8)
810         return (U8 *)start;
811 
812     /* ensure valid UTF-8 and chars < 256 before converting string */
813     for (send = s + *len; s < send;) {
814         U8 c = *s++;
815 	if (!UTF8_IS_INVARIANT(c)) {
816 	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
817                 (c = *s++) && UTF8_IS_CONTINUATION(c))
818 		count++;
819 	    else
820                 return (U8 *)start;
821 	}
822     }
823 
824     *is_utf8 = FALSE;
825 
826     Newx(d, (*len) - count + 1, U8);
827     s = start; start = d;
828     while (s < send) {
829 	U8 c = *s++;
830 	if (!UTF8_IS_INVARIANT(c)) {
831 	    /* Then it is two-byte encoded */
832 	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
833 	    c = ASCII_TO_NATIVE(c);
834 	}
835 	*d++ = c;
836     }
837     *d = '\0';
838     *len = d - start;
839     return (U8 *)start;
840 }
841 
842 /*
843 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
844 
845 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
846 Returns a pointer to the newly-created string, and sets C<len> to
847 reflect the new length.
848 
849 If you want to convert to UTF-8 from other encodings than ASCII,
850 see sv_recode_to_utf8().
851 
852 =cut
853 */
854 
855 U8*
856 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
857 {
858     const U8 * const send = s + (*len);
859     U8 *d;
860     U8 *dst;
861     PERL_UNUSED_CONTEXT;
862 
863     Newx(d, (*len) * 2 + 1, U8);
864     dst = d;
865 
866     while (s < send) {
867         const UV uv = NATIVE_TO_ASCII(*s++);
868         if (UNI_IS_INVARIANT(uv))
869             *d++ = (U8)UTF_TO_NATIVE(uv);
870         else {
871             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
872             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
873         }
874     }
875     *d = '\0';
876     *len = d-dst;
877     return dst;
878 }
879 
880 /*
881  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
882  *
883  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
884  * We optimize for native, for obvious reasons. */
885 
886 U8*
887 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
888 {
889     U8* pend;
890     U8* dstart = d;
891 
892     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
893 	 d[0] = 0;
894 	 *newlen = 1;
895 	 return d;
896     }
897 
898     if (bytelen & 1)
899 	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
900 
901     pend = p + bytelen;
902 
903     while (p < pend) {
904 	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
905 	p += 2;
906 	if (uv < 0x80) {
907 #ifdef EBCDIC
908 	    *d++ = UNI_TO_NATIVE(uv);
909 #else
910 	    *d++ = (U8)uv;
911 #endif
912 	    continue;
913 	}
914 	if (uv < 0x800) {
915 	    *d++ = (U8)(( uv >>  6)         | 0xc0);
916 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
917 	    continue;
918 	}
919 	if (uv >= 0xd800 && uv < 0xdbff) {	/* surrogates */
920 	    UV low = (p[0] << 8) + p[1];
921 	    p += 2;
922 	    if (low < 0xdc00 || low >= 0xdfff)
923 		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
924 	    uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
925 	}
926 	if (uv < 0x10000) {
927 	    *d++ = (U8)(( uv >> 12)         | 0xe0);
928 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
929 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
930 	    continue;
931 	}
932 	else {
933 	    *d++ = (U8)(( uv >> 18)         | 0xf0);
934 	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
935 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
936 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
937 	    continue;
938 	}
939     }
940     *newlen = d - dstart;
941     return d;
942 }
943 
944 /* Note: this one is slightly destructive of the source. */
945 
946 U8*
947 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
948 {
949     U8* s = (U8*)p;
950     U8* const send = s + bytelen;
951     while (s < send) {
952 	const U8 tmp = s[0];
953 	s[0] = s[1];
954 	s[1] = tmp;
955 	s += 2;
956     }
957     return utf16_to_utf8(p, d, bytelen, newlen);
958 }
959 
960 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
961 
962 bool
963 Perl_is_uni_alnum(pTHX_ UV c)
964 {
965     U8 tmpbuf[UTF8_MAXBYTES+1];
966     uvchr_to_utf8(tmpbuf, c);
967     return is_utf8_alnum(tmpbuf);
968 }
969 
970 bool
971 Perl_is_uni_alnumc(pTHX_ UV c)
972 {
973     U8 tmpbuf[UTF8_MAXBYTES+1];
974     uvchr_to_utf8(tmpbuf, c);
975     return is_utf8_alnumc(tmpbuf);
976 }
977 
978 bool
979 Perl_is_uni_idfirst(pTHX_ UV c)
980 {
981     U8 tmpbuf[UTF8_MAXBYTES+1];
982     uvchr_to_utf8(tmpbuf, c);
983     return is_utf8_idfirst(tmpbuf);
984 }
985 
986 bool
987 Perl_is_uni_alpha(pTHX_ UV c)
988 {
989     U8 tmpbuf[UTF8_MAXBYTES+1];
990     uvchr_to_utf8(tmpbuf, c);
991     return is_utf8_alpha(tmpbuf);
992 }
993 
994 bool
995 Perl_is_uni_ascii(pTHX_ UV c)
996 {
997     U8 tmpbuf[UTF8_MAXBYTES+1];
998     uvchr_to_utf8(tmpbuf, c);
999     return is_utf8_ascii(tmpbuf);
1000 }
1001 
1002 bool
1003 Perl_is_uni_space(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_space(tmpbuf);
1008 }
1009 
1010 bool
1011 Perl_is_uni_digit(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_digit(tmpbuf);
1016 }
1017 
1018 bool
1019 Perl_is_uni_upper(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_upper(tmpbuf);
1024 }
1025 
1026 bool
1027 Perl_is_uni_lower(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_lower(tmpbuf);
1032 }
1033 
1034 bool
1035 Perl_is_uni_cntrl(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_cntrl(tmpbuf);
1040 }
1041 
1042 bool
1043 Perl_is_uni_graph(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_graph(tmpbuf);
1048 }
1049 
1050 bool
1051 Perl_is_uni_print(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_print(tmpbuf);
1056 }
1057 
1058 bool
1059 Perl_is_uni_punct(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_punct(tmpbuf);
1064 }
1065 
1066 bool
1067 Perl_is_uni_xdigit(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_xdigit(tmpbuf);
1072 }
1073 
1074 UV
1075 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1076 {
1077     uvchr_to_utf8(p, c);
1078     return to_utf8_upper(p, p, lenp);
1079 }
1080 
1081 UV
1082 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1083 {
1084     uvchr_to_utf8(p, c);
1085     return to_utf8_title(p, p, lenp);
1086 }
1087 
1088 UV
1089 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1090 {
1091     uvchr_to_utf8(p, c);
1092     return to_utf8_lower(p, p, lenp);
1093 }
1094 
1095 UV
1096 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1097 {
1098     uvchr_to_utf8(p, c);
1099     return to_utf8_fold(p, p, lenp);
1100 }
1101 
1102 /* for now these all assume no locale info available for Unicode > 255 */
1103 
1104 bool
1105 Perl_is_uni_alnum_lc(pTHX_ UV c)
1106 {
1107     return is_uni_alnum(c);	/* XXX no locale support yet */
1108 }
1109 
1110 bool
1111 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1112 {
1113     return is_uni_alnumc(c);	/* XXX no locale support yet */
1114 }
1115 
1116 bool
1117 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1118 {
1119     return is_uni_idfirst(c);	/* XXX no locale support yet */
1120 }
1121 
1122 bool
1123 Perl_is_uni_alpha_lc(pTHX_ UV c)
1124 {
1125     return is_uni_alpha(c);	/* XXX no locale support yet */
1126 }
1127 
1128 bool
1129 Perl_is_uni_ascii_lc(pTHX_ UV c)
1130 {
1131     return is_uni_ascii(c);	/* XXX no locale support yet */
1132 }
1133 
1134 bool
1135 Perl_is_uni_space_lc(pTHX_ UV c)
1136 {
1137     return is_uni_space(c);	/* XXX no locale support yet */
1138 }
1139 
1140 bool
1141 Perl_is_uni_digit_lc(pTHX_ UV c)
1142 {
1143     return is_uni_digit(c);	/* XXX no locale support yet */
1144 }
1145 
1146 bool
1147 Perl_is_uni_upper_lc(pTHX_ UV c)
1148 {
1149     return is_uni_upper(c);	/* XXX no locale support yet */
1150 }
1151 
1152 bool
1153 Perl_is_uni_lower_lc(pTHX_ UV c)
1154 {
1155     return is_uni_lower(c);	/* XXX no locale support yet */
1156 }
1157 
1158 bool
1159 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1160 {
1161     return is_uni_cntrl(c);	/* XXX no locale support yet */
1162 }
1163 
1164 bool
1165 Perl_is_uni_graph_lc(pTHX_ UV c)
1166 {
1167     return is_uni_graph(c);	/* XXX no locale support yet */
1168 }
1169 
1170 bool
1171 Perl_is_uni_print_lc(pTHX_ UV c)
1172 {
1173     return is_uni_print(c);	/* XXX no locale support yet */
1174 }
1175 
1176 bool
1177 Perl_is_uni_punct_lc(pTHX_ UV c)
1178 {
1179     return is_uni_punct(c);	/* XXX no locale support yet */
1180 }
1181 
1182 bool
1183 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1184 {
1185     return is_uni_xdigit(c);	/* XXX no locale support yet */
1186 }
1187 
1188 U32
1189 Perl_to_uni_upper_lc(pTHX_ U32 c)
1190 {
1191     /* XXX returns only the first character -- do not use XXX */
1192     /* XXX no locale support yet */
1193     STRLEN len;
1194     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1195     return (U32)to_uni_upper(c, tmpbuf, &len);
1196 }
1197 
1198 U32
1199 Perl_to_uni_title_lc(pTHX_ U32 c)
1200 {
1201     /* XXX returns only the first character XXX -- do not use XXX */
1202     /* XXX no locale support yet */
1203     STRLEN len;
1204     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1205     return (U32)to_uni_title(c, tmpbuf, &len);
1206 }
1207 
1208 U32
1209 Perl_to_uni_lower_lc(pTHX_ U32 c)
1210 {
1211     /* XXX returns only the first character -- do not use XXX */
1212     /* XXX no locale support yet */
1213     STRLEN len;
1214     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1215     return (U32)to_uni_lower(c, tmpbuf, &len);
1216 }
1217 
1218 static bool
1219 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1220 		 const char *const swashname)
1221 {
1222     dVAR;
1223     if (!is_utf8_char(p))
1224 	return FALSE;
1225     if (!*swash)
1226 	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1227     return swash_fetch(*swash, p, TRUE) != 0;
1228 }
1229 
1230 bool
1231 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1232 {
1233     dVAR;
1234     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1235      * descendant of isalnum(3), in other words, it doesn't
1236      * contain the '_'. --jhi */
1237     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1238 }
1239 
1240 bool
1241 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1242 {
1243     dVAR;
1244     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1245 }
1246 
1247 bool
1248 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1249 {
1250     dVAR;
1251     if (*p == '_')
1252 	return TRUE;
1253     /* is_utf8_idstart would be more logical. */
1254     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1255 }
1256 
1257 bool
1258 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1259 {
1260     dVAR;
1261     if (*p == '_')
1262 	return TRUE;
1263     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1264 }
1265 
1266 bool
1267 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1268 {
1269     dVAR;
1270     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1271 }
1272 
1273 bool
1274 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1275 {
1276     dVAR;
1277     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1278 }
1279 
1280 bool
1281 Perl_is_utf8_space(pTHX_ const U8 *p)
1282 {
1283     dVAR;
1284     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1285 }
1286 
1287 bool
1288 Perl_is_utf8_digit(pTHX_ const U8 *p)
1289 {
1290     dVAR;
1291     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1292 }
1293 
1294 bool
1295 Perl_is_utf8_upper(pTHX_ const U8 *p)
1296 {
1297     dVAR;
1298     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1299 }
1300 
1301 bool
1302 Perl_is_utf8_lower(pTHX_ const U8 *p)
1303 {
1304     dVAR;
1305     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1306 }
1307 
1308 bool
1309 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1310 {
1311     dVAR;
1312     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1313 }
1314 
1315 bool
1316 Perl_is_utf8_graph(pTHX_ const U8 *p)
1317 {
1318     dVAR;
1319     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1320 }
1321 
1322 bool
1323 Perl_is_utf8_print(pTHX_ const U8 *p)
1324 {
1325     dVAR;
1326     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1327 }
1328 
1329 bool
1330 Perl_is_utf8_punct(pTHX_ const U8 *p)
1331 {
1332     dVAR;
1333     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1334 }
1335 
1336 bool
1337 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1338 {
1339     dVAR;
1340     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1341 }
1342 
1343 bool
1344 Perl_is_utf8_mark(pTHX_ const U8 *p)
1345 {
1346     dVAR;
1347     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1348 }
1349 
1350 /*
1351 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1352 
1353 The "p" contains the pointer to the UTF-8 string encoding
1354 the character that is being converted.
1355 
1356 The "ustrp" is a pointer to the character buffer to put the
1357 conversion result to.  The "lenp" is a pointer to the length
1358 of the result.
1359 
1360 The "swashp" is a pointer to the swash to use.
1361 
1362 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1363 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1364 but not always, a multicharacter mapping), is tried first.
1365 
1366 The "special" is a string like "utf8::ToSpecLower", which means the
1367 hash %utf8::ToSpecLower.  The access to the hash is through
1368 Perl_to_utf8_case().
1369 
1370 The "normal" is a string like "ToLower" which means the swash
1371 %utf8::ToLower.
1372 
1373 =cut */
1374 
1375 UV
1376 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1377 			SV **swashp, const char *normal, const char *special)
1378 {
1379     dVAR;
1380     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1381     STRLEN len = 0;
1382 
1383     const UV uv0 = utf8_to_uvchr(p, NULL);
1384     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1385      * are necessary in EBCDIC, they are redundant no-ops
1386      * in ASCII-ish platforms, and hopefully optimized away. */
1387     const UV uv1 = NATIVE_TO_UNI(uv0);
1388     uvuni_to_utf8(tmpbuf, uv1);
1389 
1390     if (!*swashp) /* load on-demand */
1391          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1392 
1393     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1394     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1395          /* It might be "special" (sometimes, but not always,
1396 	  * a multicharacter mapping) */
1397 	 HV * const hv = get_hv(special, FALSE);
1398 	 SV **svp;
1399 
1400 	 if (hv &&
1401 	     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1402 	     (*svp)) {
1403 	     const char *s;
1404 
1405 	      s = SvPV_const(*svp, len);
1406 	      if (len == 1)
1407 		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1408 	      else {
1409 #ifdef EBCDIC
1410 		   /* If we have EBCDIC we need to remap the characters
1411 		    * since any characters in the low 256 are Unicode
1412 		    * code points, not EBCDIC. */
1413 		   U8 *t = (U8*)s, *tend = t + len, *d;
1414 
1415 		   d = tmpbuf;
1416 		   if (SvUTF8(*svp)) {
1417 			STRLEN tlen = 0;
1418 
1419 			while (t < tend) {
1420 			     const UV c = utf8_to_uvchr(t, &tlen);
1421 			     if (tlen > 0) {
1422 				  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1423 				  t += tlen;
1424 			     }
1425 			     else
1426 				  break;
1427 			}
1428 		   }
1429 		   else {
1430 			while (t < tend) {
1431 			     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1432 			     t++;
1433 			}
1434 		   }
1435 		   len = d - tmpbuf;
1436 		   Copy(tmpbuf, ustrp, len, U8);
1437 #else
1438 		   Copy(s, ustrp, len, U8);
1439 #endif
1440 	      }
1441 	 }
1442     }
1443 
1444     if (!len && *swashp) {
1445 	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1446 
1447 	 if (uv2) {
1448 	      /* It was "normal" (a single character mapping). */
1449 	      const UV uv3 = UNI_TO_NATIVE(uv2);
1450 	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1451 	 }
1452     }
1453 
1454     if (!len) /* Neither: just copy. */
1455 	 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1456 
1457     if (lenp)
1458 	 *lenp = len;
1459 
1460     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1461 }
1462 
1463 /*
1464 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1465 
1466 Convert the UTF-8 encoded character at p to its uppercase version and
1467 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1468 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1469 the uppercase version may be longer than the original character.
1470 
1471 The first character of the uppercased version is returned
1472 (but note, as explained above, that there may be more.)
1473 
1474 =cut */
1475 
1476 UV
1477 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1478 {
1479     dVAR;
1480     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1481                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1482 }
1483 
1484 /*
1485 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1486 
1487 Convert the UTF-8 encoded character at p to its titlecase version and
1488 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1489 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1490 titlecase version may be longer than the original character.
1491 
1492 The first character of the titlecased version is returned
1493 (but note, as explained above, that there may be more.)
1494 
1495 =cut */
1496 
1497 UV
1498 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1499 {
1500     dVAR;
1501     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1502                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1503 }
1504 
1505 /*
1506 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1507 
1508 Convert the UTF-8 encoded character at p to its lowercase version and
1509 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1510 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1511 lowercase version may be longer than the original character.
1512 
1513 The first character of the lowercased version is returned
1514 (but note, as explained above, that there may be more.)
1515 
1516 =cut */
1517 
1518 UV
1519 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1520 {
1521     dVAR;
1522     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1523                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1524 }
1525 
1526 /*
1527 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1528 
1529 Convert the UTF-8 encoded character at p to its foldcase version and
1530 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1531 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1532 foldcase version may be longer than the original character (up to
1533 three characters).
1534 
1535 The first character of the foldcased version is returned
1536 (but note, as explained above, that there may be more.)
1537 
1538 =cut */
1539 
1540 UV
1541 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1542 {
1543     dVAR;
1544     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1546 }
1547 
1548 /* Note:
1549  * A "swash" is a swatch hash.
1550  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1551  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1552  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1553  */
1554 SV*
1555 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1556 {
1557     dVAR;
1558     SV* retval;
1559     dSP;
1560     const size_t pkg_len = strlen(pkg);
1561     const size_t name_len = strlen(name);
1562     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1563     SV* errsv_save;
1564 
1565     PUSHSTACKi(PERLSI_MAGIC);
1566     ENTER;
1567     SAVEI32(PL_hints);
1568     PL_hints = 0;
1569     save_re_context();
1570     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
1571 	ENTER;
1572 	errsv_save = newSVsv(ERRSV);
1573 	/* It is assumed that callers of this routine are not passing in any
1574 	   user derived data.  */
1575 	/* Need to do this after save_re_context() as it will set PL_tainted to
1576 	   1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1577 	   Even line to create errsv_save can turn on PL_tainted.  */
1578 	SAVEBOOL(PL_tainted);
1579 	PL_tainted = 0;
1580 	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1581 			 NULL);
1582 	if (!SvTRUE(ERRSV))
1583 	    sv_setsv(ERRSV, errsv_save);
1584 	SvREFCNT_dec(errsv_save);
1585 	LEAVE;
1586     }
1587     SPAGAIN;
1588     PUSHMARK(SP);
1589     EXTEND(SP,5);
1590     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1591     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1592     PUSHs(listsv);
1593     PUSHs(sv_2mortal(newSViv(minbits)));
1594     PUSHs(sv_2mortal(newSViv(none)));
1595     PUTBACK;
1596     errsv_save = newSVsv(ERRSV);
1597     if (call_method("SWASHNEW", G_SCALAR))
1598 	retval = newSVsv(*PL_stack_sp--);
1599     else
1600 	retval = &PL_sv_undef;
1601     if (!SvTRUE(ERRSV))
1602 	sv_setsv(ERRSV, errsv_save);
1603     SvREFCNT_dec(errsv_save);
1604     LEAVE;
1605     POPSTACK;
1606     if (IN_PERL_COMPILETIME) {
1607 	CopHINTS_set(PL_curcop, PL_hints);
1608     }
1609     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1610         if (SvPOK(retval))
1611 	    Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1612 		       SVfARG(retval));
1613 	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1614     }
1615     return retval;
1616 }
1617 
1618 
1619 /* This API is wrong for special case conversions since we may need to
1620  * return several Unicode characters for a single Unicode character
1621  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1622  * the lower-level routine, and it is similarly broken for returning
1623  * multiple values.  --jhi */
1624 /* Now SWASHGET is recasted into S_swash_get in this file. */
1625 
1626 /* Note:
1627  * Returns the value of property/mapping C<swash> for the first character
1628  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1629  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1630  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1631  */
1632 UV
1633 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1634 {
1635     dVAR;
1636     HV* const hv = (HV*)SvRV(swash);
1637     U32 klen;
1638     U32 off;
1639     STRLEN slen;
1640     STRLEN needents;
1641     const U8 *tmps = NULL;
1642     U32 bit;
1643     SV *swatch;
1644     U8 tmputf8[2];
1645     const UV c = NATIVE_TO_ASCII(*ptr);
1646 
1647     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1648 	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1649 	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1650 	ptr = tmputf8;
1651     }
1652     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1653      * then the "swatch" is a vec() for al the chars which start
1654      * with 0xAA..0xYY
1655      * So the key in the hash (klen) is length of encoded char -1
1656      */
1657     klen = UTF8SKIP(ptr) - 1;
1658     off  = ptr[klen];
1659 
1660     if (klen == 0) {
1661       /* If char in invariant then swatch is for all the invariant chars
1662        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1663        */
1664 	needents = UTF_CONTINUATION_MARK;
1665 	off      = NATIVE_TO_UTF(ptr[klen]);
1666     }
1667     else {
1668       /* If char is encoded then swatch is for the prefix */
1669 	needents = (1 << UTF_ACCUMULATION_SHIFT);
1670 	off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1671     }
1672 
1673     /*
1674      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1675      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1676      * it's nothing to sniff at.)  Pity we usually come through at least
1677      * two function calls to get here...
1678      *
1679      * NB: this code assumes that swatches are never modified, once generated!
1680      */
1681 
1682     if (hv   == PL_last_swash_hv &&
1683 	klen == PL_last_swash_klen &&
1684 	(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1685     {
1686 	tmps = PL_last_swash_tmps;
1687 	slen = PL_last_swash_slen;
1688     }
1689     else {
1690 	/* Try our second-level swatch cache, kept in a hash. */
1691 	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1692 
1693 	/* If not cached, generate it via swash_get */
1694 	if (!svp || !SvPOK(*svp)
1695 		 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1696 	    /* We use utf8n_to_uvuni() as we want an index into
1697 	       Unicode tables, not a native character number.
1698 	     */
1699 	    const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1700 					   ckWARN(WARN_UTF8) ?
1701 					   0 : UTF8_ALLOW_ANY);
1702 	    swatch = swash_get(swash,
1703 		    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1704 				(klen) ? (code_point & ~(needents - 1)) : 0,
1705 				needents);
1706 
1707 	    if (IN_PERL_COMPILETIME)
1708 		CopHINTS_set(PL_curcop, PL_hints);
1709 
1710 	    svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1711 
1712 	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1713 		     || (slen << 3) < needents)
1714 		Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1715 	}
1716 
1717 	PL_last_swash_hv = hv;
1718 	assert(klen <= sizeof(PL_last_swash_key));
1719 	PL_last_swash_klen = (U8)klen;
1720 	/* FIXME change interpvar.h?  */
1721 	PL_last_swash_tmps = (U8 *) tmps;
1722 	PL_last_swash_slen = slen;
1723 	if (klen)
1724 	    Copy(ptr, PL_last_swash_key, klen, U8);
1725     }
1726 
1727     switch ((int)((slen << 3) / needents)) {
1728     case 1:
1729 	bit = 1 << (off & 7);
1730 	off >>= 3;
1731 	return (tmps[off] & bit) != 0;
1732     case 8:
1733 	return tmps[off];
1734     case 16:
1735 	off <<= 1;
1736 	return (tmps[off] << 8) + tmps[off + 1] ;
1737     case 32:
1738 	off <<= 2;
1739 	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1740     }
1741     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1742     NORETURN_FUNCTION_END;
1743 }
1744 
1745 /* Note:
1746  * Returns a swatch (a bit vector string) for a code point sequence
1747  * that starts from the value C<start> and comprises the number C<span>.
1748  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1749  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1750  */
1751 STATIC SV*
1752 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1753 {
1754     SV *swatch;
1755     U8 *l, *lend, *x, *xend, *s;
1756     STRLEN lcur, xcur, scur;
1757 
1758     HV* const hv = (HV*)SvRV(swash);
1759     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1760     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1761     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1762     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1763     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1764     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1765     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1766     const STRLEN bits  = SvUV(*bitssvp);
1767     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1768     const UV     none  = SvUV(*nonesvp);
1769     const UV     end   = start + span;
1770 
1771     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1772 	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1773 						 (UV)bits);
1774     }
1775 
1776     /* create and initialize $swatch */
1777     scur   = octets ? (span * octets) : (span + 7) / 8;
1778     swatch = newSV(scur);
1779     SvPOK_on(swatch);
1780     s = (U8*)SvPVX(swatch);
1781     if (octets && none) {
1782 	const U8* const e = s + scur;
1783 	while (s < e) {
1784 	    if (bits == 8)
1785 		*s++ = (U8)(none & 0xff);
1786 	    else if (bits == 16) {
1787 		*s++ = (U8)((none >>  8) & 0xff);
1788 		*s++ = (U8)( none        & 0xff);
1789 	    }
1790 	    else if (bits == 32) {
1791 		*s++ = (U8)((none >> 24) & 0xff);
1792 		*s++ = (U8)((none >> 16) & 0xff);
1793 		*s++ = (U8)((none >>  8) & 0xff);
1794 		*s++ = (U8)( none        & 0xff);
1795 	    }
1796 	}
1797 	*s = '\0';
1798     }
1799     else {
1800 	(void)memzero((U8*)s, scur + 1);
1801     }
1802     SvCUR_set(swatch, scur);
1803     s = (U8*)SvPVX(swatch);
1804 
1805     /* read $swash->{LIST} */
1806     l = (U8*)SvPV(*listsvp, lcur);
1807     lend = l + lcur;
1808     while (l < lend) {
1809 	UV min, max, val;
1810 	STRLEN numlen;
1811 	I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1812 
1813 	U8* const nl = (U8*)memchr(l, '\n', lend - l);
1814 
1815 	numlen = lend - l;
1816 	min = grok_hex((char *)l, &numlen, &flags, NULL);
1817 	if (numlen)
1818 	    l += numlen;
1819 	else if (nl) {
1820 	    l = nl + 1; /* 1 is length of "\n" */
1821 	    continue;
1822 	}
1823 	else {
1824 	    l = lend; /* to LIST's end at which \n is not found */
1825 	    break;
1826 	}
1827 
1828 	if (isBLANK(*l)) {
1829 	    ++l;
1830 	    flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1831 	    numlen = lend - l;
1832 	    max = grok_hex((char *)l, &numlen, &flags, NULL);
1833 	    if (numlen)
1834 		l += numlen;
1835 	    else
1836 		max = min;
1837 
1838 	    if (octets) {
1839 		if (isBLANK(*l)) {
1840 		    ++l;
1841 		    flags = PERL_SCAN_SILENT_ILLDIGIT |
1842 			    PERL_SCAN_DISALLOW_PREFIX;
1843 		    numlen = lend - l;
1844 		    val = grok_hex((char *)l, &numlen, &flags, NULL);
1845 		    if (numlen)
1846 			l += numlen;
1847 		    else
1848 			val = 0;
1849 		}
1850 		else {
1851 		    val = 0;
1852 		    if (typeto) {
1853 			Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1854 					 typestr, l);
1855 		    }
1856 		}
1857 	    }
1858 	    else
1859 		val = 0; /* bits == 1, then val should be ignored */
1860 	}
1861 	else {
1862 	    max = min;
1863 	    if (octets) {
1864 		val = 0;
1865 		if (typeto) {
1866 		    Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1867 		}
1868 	    }
1869 	    else
1870 		val = 0; /* bits == 1, then val should be ignored */
1871 	}
1872 
1873 	if (nl)
1874 	    l = nl + 1;
1875 	else
1876 	    l = lend;
1877 
1878 	if (max < start)
1879 	    continue;
1880 
1881 	if (octets) {
1882 	    UV key;
1883 	    if (min < start) {
1884 		if (!none || val < none) {
1885 		    val += start - min;
1886 		}
1887 		min = start;
1888 	    }
1889 	    for (key = min; key <= max; key++) {
1890 		STRLEN offset;
1891 		if (key >= end)
1892 		    goto go_out_list;
1893 		/* offset must be non-negative (start <= min <= key < end) */
1894 		offset = octets * (key - start);
1895 		if (bits == 8)
1896 		    s[offset] = (U8)(val & 0xff);
1897 		else if (bits == 16) {
1898 		    s[offset    ] = (U8)((val >>  8) & 0xff);
1899 		    s[offset + 1] = (U8)( val        & 0xff);
1900 		}
1901 		else if (bits == 32) {
1902 		    s[offset    ] = (U8)((val >> 24) & 0xff);
1903 		    s[offset + 1] = (U8)((val >> 16) & 0xff);
1904 		    s[offset + 2] = (U8)((val >>  8) & 0xff);
1905 		    s[offset + 3] = (U8)( val        & 0xff);
1906 		}
1907 
1908 		if (!none || val < none)
1909 		    ++val;
1910 	    }
1911 	}
1912 	else { /* bits == 1, then val should be ignored */
1913 	    UV key;
1914 	    if (min < start)
1915 		min = start;
1916 	    for (key = min; key <= max; key++) {
1917 		const STRLEN offset = (STRLEN)(key - start);
1918 		if (key >= end)
1919 		    goto go_out_list;
1920 		s[offset >> 3] |= 1 << (offset & 7);
1921 	    }
1922 	}
1923     } /* while */
1924   go_out_list:
1925 
1926     /* read $swash->{EXTRAS} */
1927     x = (U8*)SvPV(*extssvp, xcur);
1928     xend = x + xcur;
1929     while (x < xend) {
1930 	STRLEN namelen;
1931 	U8 *namestr;
1932 	SV** othersvp;
1933 	HV* otherhv;
1934 	STRLEN otherbits;
1935 	SV **otherbitssvp, *other;
1936 	U8 *s, *o, *nl;
1937 	STRLEN slen, olen;
1938 
1939 	const U8 opc = *x++;
1940 	if (opc == '\n')
1941 	    continue;
1942 
1943 	nl = (U8*)memchr(x, '\n', xend - x);
1944 
1945 	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1946 	    if (nl) {
1947 		x = nl + 1; /* 1 is length of "\n" */
1948 		continue;
1949 	    }
1950 	    else {
1951 		x = xend; /* to EXTRAS' end at which \n is not found */
1952 		break;
1953 	    }
1954 	}
1955 
1956 	namestr = x;
1957 	if (nl) {
1958 	    namelen = nl - namestr;
1959 	    x = nl + 1;
1960 	}
1961 	else {
1962 	    namelen = xend - namestr;
1963 	    x = xend;
1964 	}
1965 
1966 	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1967 	otherhv = (HV*)SvRV(*othersvp);
1968 	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1969 	otherbits = (STRLEN)SvUV(*otherbitssvp);
1970 	if (bits < otherbits)
1971 	    Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1972 
1973 	/* The "other" swatch must be destroyed after. */
1974 	other = swash_get(*othersvp, start, span);
1975 	o = (U8*)SvPV(other, olen);
1976 
1977 	if (!olen)
1978 	    Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1979 
1980 	s = (U8*)SvPV(swatch, slen);
1981 	if (bits == 1 && otherbits == 1) {
1982 	    if (slen != olen)
1983 		Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1984 
1985 	    switch (opc) {
1986 	    case '+':
1987 		while (slen--)
1988 		    *s++ |= *o++;
1989 		break;
1990 	    case '!':
1991 		while (slen--)
1992 		    *s++ |= ~*o++;
1993 		break;
1994 	    case '-':
1995 		while (slen--)
1996 		    *s++ &= ~*o++;
1997 		break;
1998 	    case '&':
1999 		while (slen--)
2000 		    *s++ &= *o++;
2001 		break;
2002 	    default:
2003 		break;
2004 	    }
2005 	}
2006 	else {
2007 	    STRLEN otheroctets = otherbits >> 3;
2008 	    STRLEN offset = 0;
2009 	    U8* const send = s + slen;
2010 
2011 	    while (s < send) {
2012 		UV otherval = 0;
2013 
2014 		if (otherbits == 1) {
2015 		    otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2016 		    ++offset;
2017 		}
2018 		else {
2019 		    STRLEN vlen = otheroctets;
2020 		    otherval = *o++;
2021 		    while (--vlen) {
2022 			otherval <<= 8;
2023 			otherval |= *o++;
2024 		    }
2025 		}
2026 
2027 		if (opc == '+' && otherval)
2028 		    NOOP;   /* replace with otherval */
2029 		else if (opc == '!' && !otherval)
2030 		    otherval = 1;
2031 		else if (opc == '-' && otherval)
2032 		    otherval = 0;
2033 		else if (opc == '&' && !otherval)
2034 		    otherval = 0;
2035 		else {
2036 		    s += octets; /* no replacement */
2037 		    continue;
2038 		}
2039 
2040 		if (bits == 8)
2041 		    *s++ = (U8)( otherval & 0xff);
2042 		else if (bits == 16) {
2043 		    *s++ = (U8)((otherval >>  8) & 0xff);
2044 		    *s++ = (U8)( otherval        & 0xff);
2045 		}
2046 		else if (bits == 32) {
2047 		    *s++ = (U8)((otherval >> 24) & 0xff);
2048 		    *s++ = (U8)((otherval >> 16) & 0xff);
2049 		    *s++ = (U8)((otherval >>  8) & 0xff);
2050 		    *s++ = (U8)( otherval        & 0xff);
2051 		}
2052 	    }
2053 	}
2054 	sv_free(other); /* through with it! */
2055     } /* while */
2056     return swatch;
2057 }
2058 
2059 /*
2060 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2061 
2062 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2063 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2064 bytes available. The return value is the pointer to the byte after the
2065 end of the new character. In other words,
2066 
2067     d = uvchr_to_utf8(d, uv);
2068 
2069 is the recommended wide native character-aware way of saying
2070 
2071     *(d++) = uv;
2072 
2073 =cut
2074 */
2075 
2076 /* On ASCII machines this is normally a macro but we want a
2077    real function in case XS code wants it
2078 */
2079 U8 *
2080 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2081 {
2082     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2083 }
2084 
2085 U8 *
2086 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2087 {
2088     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2089 }
2090 
2091 /*
2092 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2093 flags
2094 
2095 Returns the native character value of the first character in the string
2096 C<s>
2097 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2098 length, in bytes, of that character.
2099 
2100 Allows length and flags to be passed to low level routine.
2101 
2102 =cut
2103 */
2104 /* On ASCII machines this is normally a macro but we want
2105    a real function in case XS code wants it
2106 */
2107 UV
2108 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2109 U32 flags)
2110 {
2111     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2112     return UNI_TO_NATIVE(uv);
2113 }
2114 
2115 /*
2116 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2117 
2118 Build to the scalar dsv a displayable version of the string spv,
2119 length len, the displayable version being at most pvlim bytes long
2120 (if longer, the rest is truncated and "..." will be appended).
2121 
2122 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2123 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2124 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2125 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2126 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2127 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2128 
2129 The pointer to the PV of the dsv is returned.
2130 
2131 =cut */
2132 char *
2133 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2134 {
2135     int truncated = 0;
2136     const char *s, *e;
2137 
2138     sv_setpvn(dsv, "", 0);
2139     SvUTF8_off(dsv);
2140     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2141 	 UV u;
2142 	  /* This serves double duty as a flag and a character to print after
2143 	     a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2144 	  */
2145 	 char ok = 0;
2146 
2147 	 if (pvlim && SvCUR(dsv) >= pvlim) {
2148 	      truncated++;
2149 	      break;
2150 	 }
2151 	 u = utf8_to_uvchr((U8*)s, 0);
2152 	 if (u < 256) {
2153 	     const unsigned char c = (unsigned char)u & 0xFF;
2154 	     if (flags & UNI_DISPLAY_BACKSLASH) {
2155 	         switch (c) {
2156 		 case '\n':
2157 		     ok = 'n'; break;
2158 		 case '\r':
2159 		     ok = 'r'; break;
2160 		 case '\t':
2161 		     ok = 't'; break;
2162 		 case '\f':
2163 		     ok = 'f'; break;
2164 		 case '\a':
2165 		     ok = 'a'; break;
2166 		 case '\\':
2167 		     ok = '\\'; break;
2168 		 default: break;
2169 		 }
2170 		 if (ok) {
2171 		     const char string = ok;
2172 		     sv_catpvn(dsv, &string, 1);
2173 		 }
2174 	     }
2175 	     /* isPRINT() is the locale-blind version. */
2176 	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2177 		 const char string = c;
2178 		 sv_catpvn(dsv, &string, 1);
2179 		 ok = 1;
2180 	     }
2181 	 }
2182 	 if (!ok)
2183 	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2184     }
2185     if (truncated)
2186 	 sv_catpvs(dsv, "...");
2187 
2188     return SvPVX(dsv);
2189 }
2190 
2191 /*
2192 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2193 
2194 Build to the scalar dsv a displayable version of the scalar sv,
2195 the displayable version being at most pvlim bytes long
2196 (if longer, the rest is truncated and "..." will be appended).
2197 
2198 The flags argument is as in pv_uni_display().
2199 
2200 The pointer to the PV of the dsv is returned.
2201 
2202 =cut
2203 */
2204 char *
2205 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2206 {
2207      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2208 				SvCUR(ssv), pvlim, flags);
2209 }
2210 
2211 /*
2212 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2213 
2214 Return true if the strings s1 and s2 differ case-insensitively, false
2215 if not (if they are equal case-insensitively).  If u1 is true, the
2216 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2217 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2218 are false, the respective string is assumed to be in native 8-bit
2219 encoding.
2220 
2221 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2222 in there (they will point at the beginning of the I<next> character).
2223 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2224 pointers beyond which scanning will not continue under any
2225 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2226 s2+l2 will be used as goal end pointers that will also stop the scan,
2227 and which qualify towards defining a successful match: all the scans
2228 that define an explicit length must reach their goal pointers for
2229 a match to succeed).
2230 
2231 For case-insensitiveness, the "casefolding" of Unicode is used
2232 instead of upper/lowercasing both the characters, see
2233 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2234 
2235 =cut */
2236 I32
2237 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2238 {
2239      dVAR;
2240      register const U8 *p1  = (const U8*)s1;
2241      register const U8 *p2  = (const U8*)s2;
2242      register const U8 *f1 = NULL;
2243      register const U8 *f2 = NULL;
2244      register U8 *e1 = NULL;
2245      register U8 *q1 = NULL;
2246      register U8 *e2 = NULL;
2247      register U8 *q2 = NULL;
2248      STRLEN n1 = 0, n2 = 0;
2249      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2250      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2251      U8 natbuf[1+1];
2252      STRLEN foldlen1, foldlen2;
2253      bool match;
2254 
2255      if (pe1)
2256 	  e1 = *(U8**)pe1;
2257      /* assert(e1 || l1); */
2258      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2259 	  f1 = (const U8*)s1 + l1;
2260      if (pe2)
2261 	  e2 = *(U8**)pe2;
2262      /* assert(e2 || l2); */
2263      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2264 	  f2 = (const U8*)s2 + l2;
2265 
2266      /* This shouldn't happen. However, putting an assert() there makes some
2267       * tests fail. */
2268      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2269      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2270 	  return 1; /* mismatch; possible infinite loop or false positive */
2271 
2272      if (!u1 || !u2)
2273 	  natbuf[1] = 0; /* Need to terminate the buffer. */
2274 
2275      while ((e1 == 0 || p1 < e1) &&
2276 	    (f1 == 0 || p1 < f1) &&
2277 	    (e2 == 0 || p2 < e2) &&
2278 	    (f2 == 0 || p2 < f2)) {
2279 	  if (n1 == 0) {
2280 	       if (u1)
2281 		    to_utf8_fold(p1, foldbuf1, &foldlen1);
2282 	       else {
2283 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2284 		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2285 	       }
2286 	       q1 = foldbuf1;
2287 	       n1 = foldlen1;
2288 	  }
2289 	  if (n2 == 0) {
2290 	       if (u2)
2291 		    to_utf8_fold(p2, foldbuf2, &foldlen2);
2292 	       else {
2293 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2294 		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2295 	       }
2296 	       q2 = foldbuf2;
2297 	       n2 = foldlen2;
2298 	  }
2299 	  while (n1 && n2) {
2300 	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2301 		   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2302 		    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2303 		   return 1; /* mismatch */
2304 	       n1 -= UTF8SKIP(q1);
2305 	       q1 += UTF8SKIP(q1);
2306 	       n2 -= UTF8SKIP(q2);
2307 	       q2 += UTF8SKIP(q2);
2308 	  }
2309 	  if (n1 == 0)
2310 	       p1 += u1 ? UTF8SKIP(p1) : 1;
2311 	  if (n2 == 0)
2312 	       p2 += u2 ? UTF8SKIP(p2) : 1;
2313 
2314      }
2315 
2316      /* A match is defined by all the scans that specified
2317       * an explicit length reaching their final goals. */
2318      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2319 
2320      if (match) {
2321 	  if (pe1)
2322 	       *pe1 = (char*)p1;
2323 	  if (pe2)
2324 	       *pe2 = (char*)p2;
2325      }
2326 
2327      return match ? 0 : 1; /* 0 match, 1 mismatch */
2328 }
2329 
2330 /*
2331  * Local variables:
2332  * c-indentation-style: bsd
2333  * c-basic-offset: 4
2334  * indent-tabs-mode: t
2335  * End:
2336  *
2337  * ex: set ts=8 sts=4 sw=4 noet:
2338  */
2339