xref: /openbsd-src/gnu/usr.bin/perl/utf8.c (revision d13be5d47e4149db2549a9828e244d59dbc43f15)
1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
13  *  heard of that we don't want to see any closer; and that's the one place
14  *  we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sm�agol"]
17  *
18  * 'Well do I understand your speech,' he answered in the same language;
19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
20  *  as is the custom in the West, if you wish to be answered?'
21  *                           --Gandalf, addressing Th�oden's door wardens
22  *
23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24  *
25  * ...the travellers perceived that the floor was paved with stones of many
26  * hues; branching runes and strange devices intertwined beneath their feet.
27  *
28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29  */
30 
31 #include "EXTERN.h"
32 #define PERL_IN_UTF8_C
33 #include "perl.h"
34 
35 #ifndef EBCDIC
36 /* Separate prototypes needed because in ASCII systems these
37  * usually macros but they still are compiled as code, too. */
38 PERL_CALLCONV UV	Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39 PERL_CALLCONV U8*	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40 #endif
41 
42 static const char unees[] =
43     "Malformed UTF-8 character (unexpected end of string)";
44 
45 /*
46 =head1 Unicode Support
47 
48 This file contains various utility functions for manipulating UTF8-encoded
49 strings. For the uninitiated, this is a method of representing arbitrary
50 Unicode characters as a variable number of bytes, in such a way that
51 characters in the ASCII range are unmodified, and a zero byte never appears
52 within non-zero characters.
53 
54 =cut
55 */
56 
57 /*
58 =for apidoc is_ascii_string
59 
60 Returns true if first C<len> bytes of the given string are ASCII (i.e. none
61 of them even raise the question of UTF-8-ness).
62 
63 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
64 
65 =cut
66 */
67 
68 bool
69 Perl_is_ascii_string(const U8 *s, STRLEN len)
70 {
71     const U8* const send = s + (len ? len : strlen((const char *)s));
72     const U8* x = s;
73 
74     PERL_ARGS_ASSERT_IS_ASCII_STRING;
75 
76     for (; x < send; ++x) {
77 	if (!UTF8_IS_INVARIANT(*x))
78 	    break;
79     }
80 
81     return x == send;
82 }
83 
84 /*
85 =for apidoc uvuni_to_utf8_flags
86 
87 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
88 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
89 bytes available. The return value is the pointer to the byte after the
90 end of the new character. In other words,
91 
92     d = uvuni_to_utf8_flags(d, uv, flags);
93 
94 or, in most cases,
95 
96     d = uvuni_to_utf8(d, uv);
97 
98 (which is equivalent to)
99 
100     d = uvuni_to_utf8_flags(d, uv, 0);
101 
102 is the recommended Unicode-aware way of saying
103 
104     *(d++) = uv;
105 
106 =cut
107 */
108 
109 U8 *
110 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
111 {
112     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
113 
114     if (ckWARN(WARN_UTF8)) {
115 	 if (UNICODE_IS_SURROGATE(uv) &&
116 	     !(flags & UNICODE_ALLOW_SURROGATE))
117 	      Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
118 	 else if (
119 		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
120 		    !(flags & UNICODE_ALLOW_FDD0))
121 		   ||
122 		   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
123 		    !(flags & UNICODE_ALLOW_FFFF))) &&
124 		  /* UNICODE_ALLOW_SUPER includes
125 		   * FFFEs and FFFFs beyond 0x10FFFF. */
126 		  ((uv <= PERL_UNICODE_MAX) ||
127 		   !(flags & UNICODE_ALLOW_SUPER))
128 		  )
129 	      Perl_warner(aTHX_ packWARN(WARN_UTF8),
130 		      "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
131     }
132     if (UNI_IS_INVARIANT(uv)) {
133 	*d++ = (U8)UTF_TO_NATIVE(uv);
134 	return d;
135     }
136 #if defined(EBCDIC)
137     else {
138 	STRLEN len  = UNISKIP(uv);
139 	U8 *p = d+len-1;
140 	while (p > d) {
141 	    *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
142 	    uv >>= UTF_ACCUMULATION_SHIFT;
143 	}
144 	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
145 	return d+len;
146     }
147 #else /* Non loop style */
148     if (uv < 0x800) {
149 	*d++ = (U8)(( uv >>  6)         | 0xc0);
150 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
151 	return d;
152     }
153     if (uv < 0x10000) {
154 	*d++ = (U8)(( uv >> 12)         | 0xe0);
155 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
156 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
157 	return d;
158     }
159     if (uv < 0x200000) {
160 	*d++ = (U8)(( uv >> 18)         | 0xf0);
161 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
163 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
164 	return d;
165     }
166     if (uv < 0x4000000) {
167 	*d++ = (U8)(( uv >> 24)         | 0xf8);
168 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
169 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
170 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
171 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
172 	return d;
173     }
174     if (uv < 0x80000000) {
175 	*d++ = (U8)(( uv >> 30)         | 0xfc);
176 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
177 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
178 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
179 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
180 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
181 	return d;
182     }
183 #ifdef HAS_QUAD
184     if (uv < UTF8_QUAD_MAX)
185 #endif
186     {
187 	*d++ =                            0xfe;	/* Can't match U+FEFF! */
188 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
189 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
190 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
191 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
192 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
193 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
194 	return d;
195     }
196 #ifdef HAS_QUAD
197     {
198 	*d++ =                            0xff;		/* Can't match U+FFFE! */
199 	*d++ =                            0x80;		/* 6 Reserved bits */
200 	*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
201 	*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
202 	*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
203 	*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
204 	*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
205 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
206 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
207 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
208 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
209 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
210 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
211 	return d;
212     }
213 #endif
214 #endif /* Loop style */
215 }
216 
217 /*
218 
219 Tests if some arbitrary number of bytes begins in a valid UTF-8
220 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
221 UTF-8 character.  The actual number of bytes in the UTF-8 character
222 will be returned if it is valid, otherwise 0.
223 
224 This is the "slow" version as opposed to the "fast" version which is
225 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
226 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
227 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
228 you should use the _slow().  In practice this means that the _slow()
229 will be used very rarely, since the maximum Unicode code point (as of
230 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
231 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
232 five bytes or more.
233 
234 =cut */
235 STATIC STRLEN
236 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
237 {
238     U8 u = *s;
239     STRLEN slen;
240     UV uv, ouv;
241 
242     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
243 
244     if (UTF8_IS_INVARIANT(u))
245 	return 1;
246 
247     if (!UTF8_IS_START(u))
248 	return 0;
249 
250     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
251 	return 0;
252 
253     slen = len - 1;
254     s++;
255 #ifdef EBCDIC
256     u = NATIVE_TO_UTF(u);
257 #endif
258     u &= UTF_START_MASK(len);
259     uv  = u;
260     ouv = uv;
261     while (slen--) {
262 	if (!UTF8_IS_CONTINUATION(*s))
263 	    return 0;
264 	uv = UTF8_ACCUMULATE(uv, *s);
265 	if (uv < ouv)
266 	    return 0;
267 	ouv = uv;
268 	s++;
269     }
270 
271     if ((STRLEN)UNISKIP(uv) < len)
272 	return 0;
273 
274     return len;
275 }
276 
277 /*
278 =for apidoc is_utf8_char
279 
280 Tests if some arbitrary number of bytes begins in a valid UTF-8
281 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
282 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
283 character will be returned if it is valid, otherwise 0.
284 
285 =cut */
286 STRLEN
287 Perl_is_utf8_char(const U8 *s)
288 {
289     const STRLEN len = UTF8SKIP(s);
290 
291     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
292 #ifdef IS_UTF8_CHAR
293     if (IS_UTF8_CHAR_FAST(len))
294         return IS_UTF8_CHAR(s, len) ? len : 0;
295 #endif /* #ifdef IS_UTF8_CHAR */
296     return is_utf8_char_slow(s, len);
297 }
298 
299 
300 /*
301 =for apidoc is_utf8_string
302 
303 Returns true if first C<len> bytes of the given string form a valid
304 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
305 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
306 because a valid ASCII string is a valid UTF-8 string.
307 
308 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
309 
310 =cut
311 */
312 
313 bool
314 Perl_is_utf8_string(const U8 *s, STRLEN len)
315 {
316     const U8* const send = s + (len ? len : strlen((const char *)s));
317     const U8* x = s;
318 
319     PERL_ARGS_ASSERT_IS_UTF8_STRING;
320 
321     while (x < send) {
322 	STRLEN c;
323 	 /* Inline the easy bits of is_utf8_char() here for speed... */
324 	 if (UTF8_IS_INVARIANT(*x))
325 	      c = 1;
326 	 else if (!UTF8_IS_START(*x))
327 	     goto out;
328 	 else {
329 	      /* ... and call is_utf8_char() only if really needed. */
330 #ifdef IS_UTF8_CHAR
331 	     c = UTF8SKIP(x);
332 	     if (IS_UTF8_CHAR_FAST(c)) {
333 	         if (!IS_UTF8_CHAR(x, c))
334 		     c = 0;
335 	     }
336 	     else
337 		c = is_utf8_char_slow(x, c);
338 #else
339 	     c = is_utf8_char(x);
340 #endif /* #ifdef IS_UTF8_CHAR */
341 	      if (!c)
342 		  goto out;
343 	 }
344         x += c;
345     }
346 
347  out:
348     if (x != send)
349 	return FALSE;
350 
351     return TRUE;
352 }
353 
354 /*
355 Implemented as a macro in utf8.h
356 
357 =for apidoc is_utf8_string_loc
358 
359 Like is_utf8_string() but stores the location of the failure (in the
360 case of "utf8ness failure") or the location s+len (in the case of
361 "utf8ness success") in the C<ep>.
362 
363 See also is_utf8_string_loclen() and is_utf8_string().
364 
365 =for apidoc is_utf8_string_loclen
366 
367 Like is_utf8_string() but stores the location of the failure (in the
368 case of "utf8ness failure") or the location s+len (in the case of
369 "utf8ness success") in the C<ep>, and the number of UTF-8
370 encoded characters in the C<el>.
371 
372 See also is_utf8_string_loc() and is_utf8_string().
373 
374 =cut
375 */
376 
377 bool
378 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
379 {
380     const U8* const send = s + (len ? len : strlen((const char *)s));
381     const U8* x = s;
382     STRLEN c;
383     STRLEN outlen = 0;
384 
385     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
386 
387     while (x < send) {
388 	 /* Inline the easy bits of is_utf8_char() here for speed... */
389 	 if (UTF8_IS_INVARIANT(*x))
390 	     c = 1;
391 	 else if (!UTF8_IS_START(*x))
392 	     goto out;
393 	 else {
394 	     /* ... and call is_utf8_char() only if really needed. */
395 #ifdef IS_UTF8_CHAR
396 	     c = UTF8SKIP(x);
397 	     if (IS_UTF8_CHAR_FAST(c)) {
398 	         if (!IS_UTF8_CHAR(x, c))
399 		     c = 0;
400 	     } else
401 	         c = is_utf8_char_slow(x, c);
402 #else
403 	     c = is_utf8_char(x);
404 #endif /* #ifdef IS_UTF8_CHAR */
405 	     if (!c)
406 	         goto out;
407 	 }
408          x += c;
409 	 outlen++;
410     }
411 
412  out:
413     if (el)
414         *el = outlen;
415 
416     if (ep)
417         *ep = x;
418     return (x == send);
419 }
420 
421 /*
422 
423 =for apidoc utf8n_to_uvuni
424 
425 Bottom level UTF-8 decode routine.
426 Returns the Unicode code point value of the first character in the string C<s>
427 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
428 C<retlen> will be set to the length, in bytes, of that character.
429 
430 If C<s> does not point to a well-formed UTF-8 character, the behaviour
431 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
432 it is assumed that the caller will raise a warning, and this function
433 will silently just set C<retlen> to C<-1> and return zero.  If the
434 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
435 malformations will be given, C<retlen> will be set to the expected
436 length of the UTF-8 character in bytes, and zero will be returned.
437 
438 The C<flags> can also contain various flags to allow deviations from
439 the strict UTF-8 encoding (see F<utf8.h>).
440 
441 Most code should use utf8_to_uvchr() rather than call this directly.
442 
443 =cut
444 */
445 
446 UV
447 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
448 {
449     dVAR;
450     const U8 * const s0 = s;
451     UV uv = *s, ouv = 0;
452     STRLEN len = 1;
453     const bool dowarn = ckWARN_d(WARN_UTF8);
454     const UV startbyte = *s;
455     STRLEN expectlen = 0;
456     U32 warning = 0;
457     SV* sv;
458 
459     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
460 
461 /* This list is a superset of the UTF8_ALLOW_XXX.  BUT it isn't, eg SUPER missing XXX */
462 
463 #define UTF8_WARN_EMPTY				 1
464 #define UTF8_WARN_CONTINUATION			 2
465 #define UTF8_WARN_NON_CONTINUATION	 	 3
466 #define UTF8_WARN_FE_FF				 4
467 #define UTF8_WARN_SHORT				 5
468 #define UTF8_WARN_OVERFLOW			 6
469 #define UTF8_WARN_SURROGATE			 7
470 #define UTF8_WARN_LONG				 8
471 #define UTF8_WARN_FFFF				 9 /* Also FFFE. */
472 
473     if (curlen == 0 &&
474 	!(flags & UTF8_ALLOW_EMPTY)) {
475 	warning = UTF8_WARN_EMPTY;
476 	goto malformed;
477     }
478 
479     if (UTF8_IS_INVARIANT(uv)) {
480 	if (retlen)
481 	    *retlen = 1;
482 	return (UV) (NATIVE_TO_UTF(*s));
483     }
484 
485     if (UTF8_IS_CONTINUATION(uv) &&
486 	!(flags & UTF8_ALLOW_CONTINUATION)) {
487 	warning = UTF8_WARN_CONTINUATION;
488 	goto malformed;
489     }
490 
491     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
492 	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
493 	warning = UTF8_WARN_NON_CONTINUATION;
494 	goto malformed;
495     }
496 
497 #ifdef EBCDIC
498     uv = NATIVE_TO_UTF(uv);
499 #else
500     if ((uv == 0xfe || uv == 0xff) &&
501 	!(flags & UTF8_ALLOW_FE_FF)) {
502 	warning = UTF8_WARN_FE_FF;
503 	goto malformed;
504     }
505 #endif
506 
507     if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
508     else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
509     else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
510     else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
511 #ifdef EBCDIC
512     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
513     else			{ len =  7; uv &= 0x01; }
514 #else
515     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
516     else if (!(uv & 0x01))	{ len =  7; uv = 0; }
517     else			{ len = 13; uv = 0; } /* whoa! */
518 #endif
519 
520     if (retlen)
521 	*retlen = len;
522 
523     expectlen = len;
524 
525     if ((curlen < expectlen) &&
526 	!(flags & UTF8_ALLOW_SHORT)) {
527 	warning = UTF8_WARN_SHORT;
528 	goto malformed;
529     }
530 
531     len--;
532     s++;
533     ouv = uv;
534 
535     while (len--) {
536 	if (!UTF8_IS_CONTINUATION(*s) &&
537 	    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
538 	    s--;
539 	    warning = UTF8_WARN_NON_CONTINUATION;
540 	    goto malformed;
541 	}
542 	else
543 	    uv = UTF8_ACCUMULATE(uv, *s);
544 	if (!(uv > ouv)) {
545 	    /* These cannot be allowed. */
546 	    if (uv == ouv) {
547 		if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
548 		    warning = UTF8_WARN_LONG;
549 		    goto malformed;
550 		}
551 	    }
552 	    else { /* uv < ouv */
553 		/* This cannot be allowed. */
554 		warning = UTF8_WARN_OVERFLOW;
555 		goto malformed;
556 	    }
557 	}
558 	s++;
559 	ouv = uv;
560     }
561 
562     if (UNICODE_IS_SURROGATE(uv) &&
563 	!(flags & UTF8_ALLOW_SURROGATE)) {
564 	warning = UTF8_WARN_SURROGATE;
565 	goto malformed;
566     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
567 	       !(flags & UTF8_ALLOW_LONG)) {
568 	warning = UTF8_WARN_LONG;
569 	goto malformed;
570     } else if (UNICODE_IS_ILLEGAL(uv) &&
571 	       !(flags & UTF8_ALLOW_FFFF)) {
572 	warning = UTF8_WARN_FFFF;
573 	goto malformed;
574     }
575 
576     return uv;
577 
578 malformed:
579 
580     if (flags & UTF8_CHECK_ONLY) {
581 	if (retlen)
582 	    *retlen = ((STRLEN) -1);
583 	return 0;
584     }
585 
586     if (dowarn) {
587 	if (warning == UTF8_WARN_FFFF) {
588 	    sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
589 	    Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
590 	}
591 	else {
592 	    sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
593 
594 	    switch (warning) {
595 		case 0: /* Intentionally empty. */ break;
596 		case UTF8_WARN_EMPTY:
597 		    sv_catpvs(sv, "(empty string)");
598 		    break;
599 		case UTF8_WARN_CONTINUATION:
600 		    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
601 		    break;
602 		case UTF8_WARN_NON_CONTINUATION:
603 		    if (s == s0)
604 			Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
605 				   (UV)s[1], startbyte);
606 		    else {
607 			const int len = (int)(s-s0);
608 			Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
609 				   (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
610 		    }
611 
612 		    break;
613 		case UTF8_WARN_FE_FF:
614 		    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
615 		    break;
616 		case UTF8_WARN_SHORT:
617 		    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
618 				   (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
619 		    expectlen = curlen;		/* distance for caller to skip */
620 		    break;
621 		case UTF8_WARN_OVERFLOW:
622 		    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
623 				   ouv, *s, startbyte);
624 		    break;
625 		case UTF8_WARN_SURROGATE:
626 		    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
627 		    break;
628 		case UTF8_WARN_LONG:
629 		    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
630 				   (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
631 		    break;
632 		default:
633 		    sv_catpvs(sv, "(unknown reason)");
634 		    break;
635 	    }
636 	}
637 
638 	if (warning) {
639 	    const char * const s = SvPVX_const(sv);
640 
641 	    if (PL_op)
642 		Perl_warner(aTHX_ packWARN(WARN_UTF8),
643 			    "%s in %s", s,  OP_DESC(PL_op));
644 	    else
645 		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
646 	}
647     }
648 
649     if (retlen)
650 	*retlen = expectlen ? expectlen : len;
651 
652     return 0;
653 }
654 
655 /*
656 =for apidoc utf8_to_uvchr
657 
658 Returns the native character value of the first character in the string C<s>
659 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
660 length, in bytes, of that character.
661 
662 If C<s> does not point to a well-formed UTF-8 character, zero is
663 returned and retlen is set, if possible, to -1.
664 
665 =cut
666 */
667 
668 UV
669 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
670 {
671     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
672 
673     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
674 			  ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
675 }
676 
677 /*
678 =for apidoc utf8_to_uvuni
679 
680 Returns the Unicode code point of the first character in the string C<s>
681 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
682 length, in bytes, of that character.
683 
684 This function should only be used when the returned UV is considered
685 an index into the Unicode semantic tables (e.g. swashes).
686 
687 If C<s> does not point to a well-formed UTF-8 character, zero is
688 returned and retlen is set, if possible, to -1.
689 
690 =cut
691 */
692 
693 UV
694 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
695 {
696     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
697 
698     /* Call the low level routine asking for checks */
699     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
700 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
701 }
702 
703 /*
704 =for apidoc utf8_length
705 
706 Return the length of the UTF-8 char encoded string C<s> in characters.
707 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
708 up past C<e>, croaks.
709 
710 =cut
711 */
712 
713 STRLEN
714 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
715 {
716     dVAR;
717     STRLEN len = 0;
718 
719     PERL_ARGS_ASSERT_UTF8_LENGTH;
720 
721     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
722      * the bitops (especially ~) can create illegal UTF-8.
723      * In other words: in Perl UTF-8 is not just for Unicode. */
724 
725     if (e < s)
726 	goto warn_and_return;
727     while (s < e) {
728 	if (!UTF8_IS_INVARIANT(*s))
729 	    s += UTF8SKIP(s);
730 	else
731 	    s++;
732 	len++;
733     }
734 
735     if (e != s) {
736 	len--;
737         warn_and_return:
738 	if (PL_op)
739 	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
740 			     "%s in %s", unees, OP_DESC(PL_op));
741 	else
742 	    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
743     }
744 
745     return len;
746 }
747 
748 /*
749 =for apidoc utf8_distance
750 
751 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
752 and C<b>.
753 
754 WARNING: use only if you *know* that the pointers point inside the
755 same UTF-8 buffer.
756 
757 =cut
758 */
759 
760 IV
761 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
762 {
763     PERL_ARGS_ASSERT_UTF8_DISTANCE;
764 
765     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
766 }
767 
768 /*
769 =for apidoc utf8_hop
770 
771 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
772 forward or backward.
773 
774 WARNING: do not use the following unless you *know* C<off> is within
775 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
776 on the first byte of character or just after the last byte of a character.
777 
778 =cut
779 */
780 
781 U8 *
782 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
783 {
784     PERL_ARGS_ASSERT_UTF8_HOP;
785 
786     PERL_UNUSED_CONTEXT;
787     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
788      * the bitops (especially ~) can create illegal UTF-8.
789      * In other words: in Perl UTF-8 is not just for Unicode. */
790 
791     if (off >= 0) {
792 	while (off--)
793 	    s += UTF8SKIP(s);
794     }
795     else {
796 	while (off++) {
797 	    s--;
798 	    while (UTF8_IS_CONTINUATION(*s))
799 		s--;
800 	}
801     }
802     return (U8 *)s;
803 }
804 
805 /*
806 =for apidoc utf8_to_bytes
807 
808 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
809 Unlike C<bytes_to_utf8>, this over-writes the original string, and
810 updates len to contain the new length.
811 Returns zero on failure, setting C<len> to -1.
812 
813 If you need a copy of the string, see C<bytes_from_utf8>.
814 
815 =cut
816 */
817 
818 U8 *
819 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
820 {
821     U8 * const save = s;
822     U8 * const send = s + *len;
823     U8 *d;
824 
825     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
826 
827     /* ensure valid UTF-8 and chars < 256 before updating string */
828     while (s < send) {
829         U8 c = *s++;
830 
831         if (!UTF8_IS_INVARIANT(c) &&
832             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
833 	     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
834             *len = ((STRLEN) -1);
835             return 0;
836         }
837     }
838 
839     d = s = save;
840     while (s < send) {
841         STRLEN ulen;
842         *d++ = (U8)utf8_to_uvchr(s, &ulen);
843         s += ulen;
844     }
845     *d = '\0';
846     *len = d - save;
847     return save;
848 }
849 
850 /*
851 =for apidoc bytes_from_utf8
852 
853 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
854 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
855 the newly-created string, and updates C<len> to contain the new
856 length.  Returns the original string if no conversion occurs, C<len>
857 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
858 0 if C<s> is converted or consisted entirely of characters that are invariant
859 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
860 
861 =cut
862 */
863 
864 U8 *
865 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
866 {
867     U8 *d;
868     const U8 *start = s;
869     const U8 *send;
870     I32 count = 0;
871 
872     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
873 
874     PERL_UNUSED_CONTEXT;
875     if (!*is_utf8)
876         return (U8 *)start;
877 
878     /* ensure valid UTF-8 and chars < 256 before converting string */
879     for (send = s + *len; s < send;) {
880         U8 c = *s++;
881 	if (!UTF8_IS_INVARIANT(c)) {
882 	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
883                 (c = *s++) && UTF8_IS_CONTINUATION(c))
884 		count++;
885 	    else
886                 return (U8 *)start;
887 	}
888     }
889 
890     *is_utf8 = FALSE;
891 
892     Newx(d, (*len) - count + 1, U8);
893     s = start; start = d;
894     while (s < send) {
895 	U8 c = *s++;
896 	if (!UTF8_IS_INVARIANT(c)) {
897 	    /* Then it is two-byte encoded */
898 	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
899 	    c = ASCII_TO_NATIVE(c);
900 	}
901 	*d++ = c;
902     }
903     *d = '\0';
904     *len = d - start;
905     return (U8 *)start;
906 }
907 
908 /*
909 =for apidoc bytes_to_utf8
910 
911 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
912 Returns a pointer to the newly-created string, and sets C<len> to
913 reflect the new length.
914 
915 A NUL character will be written after the end of the string.
916 
917 If you want to convert to UTF-8 from encodings other than
918 the native (Latin1 or EBCDIC),
919 see sv_recode_to_utf8().
920 
921 =cut
922 */
923 
924 U8*
925 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
926 {
927     const U8 * const send = s + (*len);
928     U8 *d;
929     U8 *dst;
930 
931     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
932     PERL_UNUSED_CONTEXT;
933 
934     Newx(d, (*len) * 2 + 1, U8);
935     dst = d;
936 
937     while (s < send) {
938         const UV uv = NATIVE_TO_ASCII(*s++);
939         if (UNI_IS_INVARIANT(uv))
940             *d++ = (U8)UTF_TO_NATIVE(uv);
941         else {
942             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
943             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
944         }
945     }
946     *d = '\0';
947     *len = d-dst;
948     return dst;
949 }
950 
951 /*
952  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
953  *
954  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
955  * We optimize for native, for obvious reasons. */
956 
957 U8*
958 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
959 {
960     U8* pend;
961     U8* dstart = d;
962 
963     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
964 
965     if (bytelen & 1)
966 	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
967 
968     pend = p + bytelen;
969 
970     while (p < pend) {
971 	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
972 	p += 2;
973 	if (uv < 0x80) {
974 #ifdef EBCDIC
975 	    *d++ = UNI_TO_NATIVE(uv);
976 #else
977 	    *d++ = (U8)uv;
978 #endif
979 	    continue;
980 	}
981 	if (uv < 0x800) {
982 	    *d++ = (U8)(( uv >>  6)         | 0xc0);
983 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
984 	    continue;
985 	}
986 	if (uv >= 0xd800 && uv <= 0xdbff) {	/* surrogates */
987 	    if (p >= pend) {
988 		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
989 	    } else {
990 		UV low = (p[0] << 8) + p[1];
991 		p += 2;
992 		if (low < 0xdc00 || low > 0xdfff)
993 		    Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
994 		uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
995 	    }
996 	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
997 	    Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
998 	}
999 	if (uv < 0x10000) {
1000 	    *d++ = (U8)(( uv >> 12)         | 0xe0);
1001 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1002 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
1003 	    continue;
1004 	}
1005 	else {
1006 	    *d++ = (U8)(( uv >> 18)         | 0xf0);
1007 	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1008 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1009 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
1010 	    continue;
1011 	}
1012     }
1013     *newlen = d - dstart;
1014     return d;
1015 }
1016 
1017 /* Note: this one is slightly destructive of the source. */
1018 
1019 U8*
1020 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1021 {
1022     U8* s = (U8*)p;
1023     U8* const send = s + bytelen;
1024 
1025     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1026 
1027     if (bytelen & 1)
1028 	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1029 		   (UV)bytelen);
1030 
1031     while (s < send) {
1032 	const U8 tmp = s[0];
1033 	s[0] = s[1];
1034 	s[1] = tmp;
1035 	s += 2;
1036     }
1037     return utf16_to_utf8(p, d, bytelen, newlen);
1038 }
1039 
1040 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1041 
1042 bool
1043 Perl_is_uni_alnum(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_alnum(tmpbuf);
1048 }
1049 
1050 bool
1051 Perl_is_uni_idfirst(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_idfirst(tmpbuf);
1056 }
1057 
1058 bool
1059 Perl_is_uni_alpha(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_alpha(tmpbuf);
1064 }
1065 
1066 bool
1067 Perl_is_uni_ascii(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_ascii(tmpbuf);
1072 }
1073 
1074 bool
1075 Perl_is_uni_space(pTHX_ UV c)
1076 {
1077     U8 tmpbuf[UTF8_MAXBYTES+1];
1078     uvchr_to_utf8(tmpbuf, c);
1079     return is_utf8_space(tmpbuf);
1080 }
1081 
1082 bool
1083 Perl_is_uni_digit(pTHX_ UV c)
1084 {
1085     U8 tmpbuf[UTF8_MAXBYTES+1];
1086     uvchr_to_utf8(tmpbuf, c);
1087     return is_utf8_digit(tmpbuf);
1088 }
1089 
1090 bool
1091 Perl_is_uni_upper(pTHX_ UV c)
1092 {
1093     U8 tmpbuf[UTF8_MAXBYTES+1];
1094     uvchr_to_utf8(tmpbuf, c);
1095     return is_utf8_upper(tmpbuf);
1096 }
1097 
1098 bool
1099 Perl_is_uni_lower(pTHX_ UV c)
1100 {
1101     U8 tmpbuf[UTF8_MAXBYTES+1];
1102     uvchr_to_utf8(tmpbuf, c);
1103     return is_utf8_lower(tmpbuf);
1104 }
1105 
1106 bool
1107 Perl_is_uni_cntrl(pTHX_ UV c)
1108 {
1109     U8 tmpbuf[UTF8_MAXBYTES+1];
1110     uvchr_to_utf8(tmpbuf, c);
1111     return is_utf8_cntrl(tmpbuf);
1112 }
1113 
1114 bool
1115 Perl_is_uni_graph(pTHX_ UV c)
1116 {
1117     U8 tmpbuf[UTF8_MAXBYTES+1];
1118     uvchr_to_utf8(tmpbuf, c);
1119     return is_utf8_graph(tmpbuf);
1120 }
1121 
1122 bool
1123 Perl_is_uni_print(pTHX_ UV c)
1124 {
1125     U8 tmpbuf[UTF8_MAXBYTES+1];
1126     uvchr_to_utf8(tmpbuf, c);
1127     return is_utf8_print(tmpbuf);
1128 }
1129 
1130 bool
1131 Perl_is_uni_punct(pTHX_ UV c)
1132 {
1133     U8 tmpbuf[UTF8_MAXBYTES+1];
1134     uvchr_to_utf8(tmpbuf, c);
1135     return is_utf8_punct(tmpbuf);
1136 }
1137 
1138 bool
1139 Perl_is_uni_xdigit(pTHX_ UV c)
1140 {
1141     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1142     uvchr_to_utf8(tmpbuf, c);
1143     return is_utf8_xdigit(tmpbuf);
1144 }
1145 
1146 UV
1147 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1148 {
1149     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1150 
1151     uvchr_to_utf8(p, c);
1152     return to_utf8_upper(p, p, lenp);
1153 }
1154 
1155 UV
1156 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1157 {
1158     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1159 
1160     uvchr_to_utf8(p, c);
1161     return to_utf8_title(p, p, lenp);
1162 }
1163 
1164 UV
1165 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1166 {
1167     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1168 
1169     uvchr_to_utf8(p, c);
1170     return to_utf8_lower(p, p, lenp);
1171 }
1172 
1173 UV
1174 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1175 {
1176     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1177 
1178     uvchr_to_utf8(p, c);
1179     return to_utf8_fold(p, p, lenp);
1180 }
1181 
1182 /* for now these all assume no locale info available for Unicode > 255 */
1183 
1184 bool
1185 Perl_is_uni_alnum_lc(pTHX_ UV c)
1186 {
1187     return is_uni_alnum(c);	/* XXX no locale support yet */
1188 }
1189 
1190 bool
1191 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1192 {
1193     return is_uni_idfirst(c);	/* XXX no locale support yet */
1194 }
1195 
1196 bool
1197 Perl_is_uni_alpha_lc(pTHX_ UV c)
1198 {
1199     return is_uni_alpha(c);	/* XXX no locale support yet */
1200 }
1201 
1202 bool
1203 Perl_is_uni_ascii_lc(pTHX_ UV c)
1204 {
1205     return is_uni_ascii(c);	/* XXX no locale support yet */
1206 }
1207 
1208 bool
1209 Perl_is_uni_space_lc(pTHX_ UV c)
1210 {
1211     return is_uni_space(c);	/* XXX no locale support yet */
1212 }
1213 
1214 bool
1215 Perl_is_uni_digit_lc(pTHX_ UV c)
1216 {
1217     return is_uni_digit(c);	/* XXX no locale support yet */
1218 }
1219 
1220 bool
1221 Perl_is_uni_upper_lc(pTHX_ UV c)
1222 {
1223     return is_uni_upper(c);	/* XXX no locale support yet */
1224 }
1225 
1226 bool
1227 Perl_is_uni_lower_lc(pTHX_ UV c)
1228 {
1229     return is_uni_lower(c);	/* XXX no locale support yet */
1230 }
1231 
1232 bool
1233 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1234 {
1235     return is_uni_cntrl(c);	/* XXX no locale support yet */
1236 }
1237 
1238 bool
1239 Perl_is_uni_graph_lc(pTHX_ UV c)
1240 {
1241     return is_uni_graph(c);	/* XXX no locale support yet */
1242 }
1243 
1244 bool
1245 Perl_is_uni_print_lc(pTHX_ UV c)
1246 {
1247     return is_uni_print(c);	/* XXX no locale support yet */
1248 }
1249 
1250 bool
1251 Perl_is_uni_punct_lc(pTHX_ UV c)
1252 {
1253     return is_uni_punct(c);	/* XXX no locale support yet */
1254 }
1255 
1256 bool
1257 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1258 {
1259     return is_uni_xdigit(c);	/* XXX no locale support yet */
1260 }
1261 
1262 U32
1263 Perl_to_uni_upper_lc(pTHX_ U32 c)
1264 {
1265     /* XXX returns only the first character -- do not use XXX */
1266     /* XXX no locale support yet */
1267     STRLEN len;
1268     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1269     return (U32)to_uni_upper(c, tmpbuf, &len);
1270 }
1271 
1272 U32
1273 Perl_to_uni_title_lc(pTHX_ U32 c)
1274 {
1275     /* XXX returns only the first character XXX -- do not use XXX */
1276     /* XXX no locale support yet */
1277     STRLEN len;
1278     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1279     return (U32)to_uni_title(c, tmpbuf, &len);
1280 }
1281 
1282 U32
1283 Perl_to_uni_lower_lc(pTHX_ U32 c)
1284 {
1285     /* XXX returns only the first character -- do not use XXX */
1286     /* XXX no locale support yet */
1287     STRLEN len;
1288     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1289     return (U32)to_uni_lower(c, tmpbuf, &len);
1290 }
1291 
1292 static bool
1293 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1294 		 const char *const swashname)
1295 {
1296     dVAR;
1297 
1298     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1299 
1300     if (!is_utf8_char(p))
1301 	return FALSE;
1302     if (!*swash)
1303 	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1304     return swash_fetch(*swash, p, TRUE) != 0;
1305 }
1306 
1307 bool
1308 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1309 {
1310     dVAR;
1311 
1312     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1313 
1314     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1315      * descendant of isalnum(3), in other words, it doesn't
1316      * contain the '_'. --jhi */
1317     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1318 }
1319 
1320 bool
1321 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1322 {
1323     dVAR;
1324 
1325     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1326 
1327     if (*p == '_')
1328 	return TRUE;
1329     /* is_utf8_idstart would be more logical. */
1330     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1331 }
1332 
1333 bool
1334 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1335 {
1336     dVAR;
1337 
1338     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1339 
1340     if (*p == '_')
1341 	return TRUE;
1342     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1343 }
1344 
1345 bool
1346 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1347 {
1348     dVAR;
1349 
1350     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1351 
1352     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1353 }
1354 
1355 bool
1356 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1357 {
1358     dVAR;
1359 
1360     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1361 
1362     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1363 }
1364 
1365 bool
1366 Perl_is_utf8_space(pTHX_ const U8 *p)
1367 {
1368     dVAR;
1369 
1370     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1371 
1372     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1373 }
1374 
1375 bool
1376 Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1377 {
1378     dVAR;
1379 
1380     PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1381 
1382     return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
1383 }
1384 
1385 bool
1386 Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1387 {
1388     dVAR;
1389 
1390     PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1391 
1392     return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
1393 }
1394 
1395 bool
1396 Perl_is_utf8_digit(pTHX_ const U8 *p)
1397 {
1398     dVAR;
1399 
1400     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1401 
1402     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1403 }
1404 
1405 bool
1406 Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1407 {
1408     dVAR;
1409 
1410     PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1411 
1412     return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
1413 }
1414 
1415 bool
1416 Perl_is_utf8_upper(pTHX_ const U8 *p)
1417 {
1418     dVAR;
1419 
1420     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1421 
1422     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1423 }
1424 
1425 bool
1426 Perl_is_utf8_lower(pTHX_ const U8 *p)
1427 {
1428     dVAR;
1429 
1430     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1431 
1432     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1433 }
1434 
1435 bool
1436 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1437 {
1438     dVAR;
1439 
1440     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1441 
1442     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1443 }
1444 
1445 bool
1446 Perl_is_utf8_graph(pTHX_ const U8 *p)
1447 {
1448     dVAR;
1449 
1450     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1451 
1452     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1453 }
1454 
1455 bool
1456 Perl_is_utf8_print(pTHX_ const U8 *p)
1457 {
1458     dVAR;
1459 
1460     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1461 
1462     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1463 }
1464 
1465 bool
1466 Perl_is_utf8_punct(pTHX_ const U8 *p)
1467 {
1468     dVAR;
1469 
1470     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1471 
1472     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1473 }
1474 
1475 bool
1476 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1477 {
1478     dVAR;
1479 
1480     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1481 
1482     return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
1483 }
1484 
1485 bool
1486 Perl_is_utf8_mark(pTHX_ const U8 *p)
1487 {
1488     dVAR;
1489 
1490     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1491 
1492     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1493 }
1494 
1495 bool
1496 Perl_is_utf8_X_begin(pTHX_ const U8 *p)
1497 {
1498     dVAR;
1499 
1500     PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
1501 
1502     return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
1503 }
1504 
1505 bool
1506 Perl_is_utf8_X_extend(pTHX_ const U8 *p)
1507 {
1508     dVAR;
1509 
1510     PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
1511 
1512     return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
1513 }
1514 
1515 bool
1516 Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
1517 {
1518     dVAR;
1519 
1520     PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
1521 
1522     return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
1523 }
1524 
1525 bool
1526 Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
1527 {
1528     dVAR;
1529 
1530     PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
1531 
1532     return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
1533 }
1534 
1535 bool
1536 Perl_is_utf8_X_L(pTHX_ const U8 *p)
1537 {
1538     dVAR;
1539 
1540     PERL_ARGS_ASSERT_IS_UTF8_X_L;
1541 
1542     return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
1543 }
1544 
1545 bool
1546 Perl_is_utf8_X_LV(pTHX_ const U8 *p)
1547 {
1548     dVAR;
1549 
1550     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
1551 
1552     return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
1553 }
1554 
1555 bool
1556 Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
1557 {
1558     dVAR;
1559 
1560     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
1561 
1562     return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
1563 }
1564 
1565 bool
1566 Perl_is_utf8_X_T(pTHX_ const U8 *p)
1567 {
1568     dVAR;
1569 
1570     PERL_ARGS_ASSERT_IS_UTF8_X_T;
1571 
1572     return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
1573 }
1574 
1575 bool
1576 Perl_is_utf8_X_V(pTHX_ const U8 *p)
1577 {
1578     dVAR;
1579 
1580     PERL_ARGS_ASSERT_IS_UTF8_X_V;
1581 
1582     return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
1583 }
1584 
1585 bool
1586 Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
1587 {
1588     dVAR;
1589 
1590     PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
1591 
1592     return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
1593 }
1594 
1595 /*
1596 =for apidoc to_utf8_case
1597 
1598 The "p" contains the pointer to the UTF-8 string encoding
1599 the character that is being converted.
1600 
1601 The "ustrp" is a pointer to the character buffer to put the
1602 conversion result to.  The "lenp" is a pointer to the length
1603 of the result.
1604 
1605 The "swashp" is a pointer to the swash to use.
1606 
1607 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1608 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1609 but not always, a multicharacter mapping), is tried first.
1610 
1611 The "special" is a string like "utf8::ToSpecLower", which means the
1612 hash %utf8::ToSpecLower.  The access to the hash is through
1613 Perl_to_utf8_case().
1614 
1615 The "normal" is a string like "ToLower" which means the swash
1616 %utf8::ToLower.
1617 
1618 =cut */
1619 
1620 UV
1621 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1622 			SV **swashp, const char *normal, const char *special)
1623 {
1624     dVAR;
1625     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1626     STRLEN len = 0;
1627     const UV uv0 = utf8_to_uvchr(p, NULL);
1628     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1629      * are necessary in EBCDIC, they are redundant no-ops
1630      * in ASCII-ish platforms, and hopefully optimized away. */
1631     const UV uv1 = NATIVE_TO_UNI(uv0);
1632 
1633     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1634 
1635     uvuni_to_utf8(tmpbuf, uv1);
1636 
1637     if (!*swashp) /* load on-demand */
1638          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1639     /* This is the beginnings of a skeleton of code to read the info section
1640      * that is in all the swashes in case we ever want to do that, so one can
1641      * read things whose maps aren't code points, and whose default if missing
1642      * is not to the code point itself.  This was just to see if it actually
1643      * worked.  Details on what the possibilities are are in perluniprops.pod
1644 	HV * const hv = get_hv("utf8::SwashInfo", 0);
1645 	if (hv) {
1646 	 SV **svp;
1647 	 svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
1648 	     const char *s;
1649 
1650 	      HV * const this_hash = SvRV(*svp);
1651 		svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
1652 	      s = SvPV_const(*svp, len);
1653 	}
1654     }*/
1655 
1656     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1657     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1658          /* It might be "special" (sometimes, but not always,
1659 	  * a multicharacter mapping) */
1660 	 HV * const hv = get_hv(special, 0);
1661 	 SV **svp;
1662 
1663 	 if (hv &&
1664 	     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1665 	     (*svp)) {
1666 	     const char *s;
1667 
1668 	      s = SvPV_const(*svp, len);
1669 	      if (len == 1)
1670 		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1671 	      else {
1672 #ifdef EBCDIC
1673 		   /* If we have EBCDIC we need to remap the characters
1674 		    * since any characters in the low 256 are Unicode
1675 		    * code points, not EBCDIC. */
1676 		   U8 *t = (U8*)s, *tend = t + len, *d;
1677 
1678 		   d = tmpbuf;
1679 		   if (SvUTF8(*svp)) {
1680 			STRLEN tlen = 0;
1681 
1682 			while (t < tend) {
1683 			     const UV c = utf8_to_uvchr(t, &tlen);
1684 			     if (tlen > 0) {
1685 				  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1686 				  t += tlen;
1687 			     }
1688 			     else
1689 				  break;
1690 			}
1691 		   }
1692 		   else {
1693 			while (t < tend) {
1694 			     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1695 			     t++;
1696 			}
1697 		   }
1698 		   len = d - tmpbuf;
1699 		   Copy(tmpbuf, ustrp, len, U8);
1700 #else
1701 		   Copy(s, ustrp, len, U8);
1702 #endif
1703 	      }
1704 	 }
1705     }
1706 
1707     if (!len && *swashp) {
1708 	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1709 
1710 	 if (uv2) {
1711 	      /* It was "normal" (a single character mapping). */
1712 	      const UV uv3 = UNI_TO_NATIVE(uv2);
1713 	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1714 	 }
1715     }
1716 
1717     if (!len) /* Neither: just copy.  In other words, there was no mapping
1718 		 defined, which means that the code point maps to itself */
1719 	 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1720 
1721     if (lenp)
1722 	 *lenp = len;
1723 
1724     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1725 }
1726 
1727 /*
1728 =for apidoc to_utf8_upper
1729 
1730 Convert the UTF-8 encoded character at p to its uppercase version and
1731 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1732 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1733 the uppercase version may be longer than the original character.
1734 
1735 The first character of the uppercased version is returned
1736 (but note, as explained above, that there may be more.)
1737 
1738 =cut */
1739 
1740 UV
1741 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1742 {
1743     dVAR;
1744 
1745     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1746 
1747     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1748                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1749 }
1750 
1751 /*
1752 =for apidoc to_utf8_title
1753 
1754 Convert the UTF-8 encoded character at p to its titlecase version and
1755 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1756 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1757 titlecase version may be longer than the original character.
1758 
1759 The first character of the titlecased version is returned
1760 (but note, as explained above, that there may be more.)
1761 
1762 =cut */
1763 
1764 UV
1765 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1766 {
1767     dVAR;
1768 
1769     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1770 
1771     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1772                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1773 }
1774 
1775 /*
1776 =for apidoc to_utf8_lower
1777 
1778 Convert the UTF-8 encoded character at p to its lowercase version and
1779 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1780 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1781 lowercase version may be longer than the original character.
1782 
1783 The first character of the lowercased version is returned
1784 (but note, as explained above, that there may be more.)
1785 
1786 =cut */
1787 
1788 UV
1789 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1790 {
1791     dVAR;
1792 
1793     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1794 
1795     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1796                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1797 }
1798 
1799 /*
1800 =for apidoc to_utf8_fold
1801 
1802 Convert the UTF-8 encoded character at p to its foldcase version and
1803 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1804 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1805 foldcase version may be longer than the original character (up to
1806 three characters).
1807 
1808 The first character of the foldcased version is returned
1809 (but note, as explained above, that there may be more.)
1810 
1811 =cut */
1812 
1813 UV
1814 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1815 {
1816     dVAR;
1817 
1818     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1819 
1820     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1821                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1822 }
1823 
1824 /* Note:
1825  * A "swash" is a swatch hash.
1826  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1827  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1828  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1829  */
1830 SV*
1831 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1832 {
1833     dVAR;
1834     SV* retval;
1835     dSP;
1836     const size_t pkg_len = strlen(pkg);
1837     const size_t name_len = strlen(name);
1838     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1839     SV* errsv_save;
1840 
1841     PERL_ARGS_ASSERT_SWASH_INIT;
1842 
1843     PUSHSTACKi(PERLSI_MAGIC);
1844     ENTER;
1845     SAVEHINTS();
1846     save_re_context();
1847     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
1848 	ENTER;
1849 	errsv_save = newSVsv(ERRSV);
1850 	/* It is assumed that callers of this routine are not passing in any
1851 	   user derived data.  */
1852 	/* Need to do this after save_re_context() as it will set PL_tainted to
1853 	   1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1854 	   Even line to create errsv_save can turn on PL_tainted.  */
1855 	SAVEBOOL(PL_tainted);
1856 	PL_tainted = 0;
1857 	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1858 			 NULL);
1859 	if (!SvTRUE(ERRSV))
1860 	    sv_setsv(ERRSV, errsv_save);
1861 	SvREFCNT_dec(errsv_save);
1862 	LEAVE;
1863     }
1864     SPAGAIN;
1865     PUSHMARK(SP);
1866     EXTEND(SP,5);
1867     mPUSHp(pkg, pkg_len);
1868     mPUSHp(name, name_len);
1869     PUSHs(listsv);
1870     mPUSHi(minbits);
1871     mPUSHi(none);
1872     PUTBACK;
1873     errsv_save = newSVsv(ERRSV);
1874     if (call_method("SWASHNEW", G_SCALAR))
1875 	retval = newSVsv(*PL_stack_sp--);
1876     else
1877 	retval = &PL_sv_undef;
1878     if (!SvTRUE(ERRSV))
1879 	sv_setsv(ERRSV, errsv_save);
1880     SvREFCNT_dec(errsv_save);
1881     LEAVE;
1882     POPSTACK;
1883     if (IN_PERL_COMPILETIME) {
1884 	CopHINTS_set(PL_curcop, PL_hints);
1885     }
1886     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1887         if (SvPOK(retval))
1888 	    Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1889 		       SVfARG(retval));
1890 	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1891     }
1892     return retval;
1893 }
1894 
1895 
1896 /* This API is wrong for special case conversions since we may need to
1897  * return several Unicode characters for a single Unicode character
1898  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1899  * the lower-level routine, and it is similarly broken for returning
1900  * multiple values.  --jhi */
1901 /* Now SWASHGET is recasted into S_swash_get in this file. */
1902 
1903 /* Note:
1904  * Returns the value of property/mapping C<swash> for the first character
1905  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1906  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1907  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1908  */
1909 UV
1910 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1911 {
1912     dVAR;
1913     HV *const hv = MUTABLE_HV(SvRV(swash));
1914     U32 klen;
1915     U32 off;
1916     STRLEN slen;
1917     STRLEN needents;
1918     const U8 *tmps = NULL;
1919     U32 bit;
1920     SV *swatch;
1921     U8 tmputf8[2];
1922     const UV c = NATIVE_TO_ASCII(*ptr);
1923 
1924     PERL_ARGS_ASSERT_SWASH_FETCH;
1925 
1926     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1927 	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1928 	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1929 	ptr = tmputf8;
1930     }
1931     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1932      * then the "swatch" is a vec() for all the chars which start
1933      * with 0xAA..0xYY
1934      * So the key in the hash (klen) is length of encoded char -1
1935      */
1936     klen = UTF8SKIP(ptr) - 1;
1937     off  = ptr[klen];
1938 
1939     if (klen == 0) {
1940       /* If char is invariant then swatch is for all the invariant chars
1941        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1942        */
1943 	needents = UTF_CONTINUATION_MARK;
1944 	off      = NATIVE_TO_UTF(ptr[klen]);
1945     }
1946     else {
1947       /* If char is encoded then swatch is for the prefix */
1948 	needents = (1 << UTF_ACCUMULATION_SHIFT);
1949 	off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1950     }
1951 
1952     /*
1953      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1954      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1955      * it's nothing to sniff at.)  Pity we usually come through at least
1956      * two function calls to get here...
1957      *
1958      * NB: this code assumes that swatches are never modified, once generated!
1959      */
1960 
1961     if (hv   == PL_last_swash_hv &&
1962 	klen == PL_last_swash_klen &&
1963 	(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1964     {
1965 	tmps = PL_last_swash_tmps;
1966 	slen = PL_last_swash_slen;
1967     }
1968     else {
1969 	/* Try our second-level swatch cache, kept in a hash. */
1970 	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1971 
1972 	/* If not cached, generate it via swash_get */
1973 	if (!svp || !SvPOK(*svp)
1974 		 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1975 	    /* We use utf8n_to_uvuni() as we want an index into
1976 	       Unicode tables, not a native character number.
1977 	     */
1978 	    const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1979 					   ckWARN(WARN_UTF8) ?
1980 					   0 : UTF8_ALLOW_ANY);
1981 	    swatch = swash_get(swash,
1982 		    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1983 				(klen) ? (code_point & ~(needents - 1)) : 0,
1984 				needents);
1985 
1986 	    if (IN_PERL_COMPILETIME)
1987 		CopHINTS_set(PL_curcop, PL_hints);
1988 
1989 	    svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1990 
1991 	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1992 		     || (slen << 3) < needents)
1993 		Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1994 	}
1995 
1996 	PL_last_swash_hv = hv;
1997 	assert(klen <= sizeof(PL_last_swash_key));
1998 	PL_last_swash_klen = (U8)klen;
1999 	/* FIXME change interpvar.h?  */
2000 	PL_last_swash_tmps = (U8 *) tmps;
2001 	PL_last_swash_slen = slen;
2002 	if (klen)
2003 	    Copy(ptr, PL_last_swash_key, klen, U8);
2004     }
2005 
2006     switch ((int)((slen << 3) / needents)) {
2007     case 1:
2008 	bit = 1 << (off & 7);
2009 	off >>= 3;
2010 	return (tmps[off] & bit) != 0;
2011     case 8:
2012 	return tmps[off];
2013     case 16:
2014 	off <<= 1;
2015 	return (tmps[off] << 8) + tmps[off + 1] ;
2016     case 32:
2017 	off <<= 2;
2018 	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
2019     }
2020     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
2021     NORETURN_FUNCTION_END;
2022 }
2023 
2024 /* Note:
2025  * Returns a swatch (a bit vector string) for a code point sequence
2026  * that starts from the value C<start> and comprises the number C<span>.
2027  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
2028  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
2029  */
2030 STATIC SV*
2031 S_swash_get(pTHX_ SV* swash, UV start, UV span)
2032 {
2033     SV *swatch;
2034     U8 *l, *lend, *x, *xend, *s;
2035     STRLEN lcur, xcur, scur;
2036     HV *const hv = MUTABLE_HV(SvRV(swash));
2037     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
2038     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
2039     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
2040     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
2041     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
2042     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
2043     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
2044     const STRLEN bits  = SvUV(*bitssvp);
2045     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
2046     const UV     none  = SvUV(*nonesvp);
2047     const UV     end   = start + span;
2048 
2049     PERL_ARGS_ASSERT_SWASH_GET;
2050 
2051     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
2052 	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
2053 						 (UV)bits);
2054     }
2055 
2056     /* create and initialize $swatch */
2057     scur   = octets ? (span * octets) : (span + 7) / 8;
2058     swatch = newSV(scur);
2059     SvPOK_on(swatch);
2060     s = (U8*)SvPVX(swatch);
2061     if (octets && none) {
2062 	const U8* const e = s + scur;
2063 	while (s < e) {
2064 	    if (bits == 8)
2065 		*s++ = (U8)(none & 0xff);
2066 	    else if (bits == 16) {
2067 		*s++ = (U8)((none >>  8) & 0xff);
2068 		*s++ = (U8)( none        & 0xff);
2069 	    }
2070 	    else if (bits == 32) {
2071 		*s++ = (U8)((none >> 24) & 0xff);
2072 		*s++ = (U8)((none >> 16) & 0xff);
2073 		*s++ = (U8)((none >>  8) & 0xff);
2074 		*s++ = (U8)( none        & 0xff);
2075 	    }
2076 	}
2077 	*s = '\0';
2078     }
2079     else {
2080 	(void)memzero((U8*)s, scur + 1);
2081     }
2082     SvCUR_set(swatch, scur);
2083     s = (U8*)SvPVX(swatch);
2084 
2085     /* read $swash->{LIST} */
2086     l = (U8*)SvPV(*listsvp, lcur);
2087     lend = l + lcur;
2088     while (l < lend) {
2089 	UV min, max, val;
2090 	STRLEN numlen;
2091 	I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2092 
2093 	U8* const nl = (U8*)memchr(l, '\n', lend - l);
2094 
2095 	numlen = lend - l;
2096 	min = grok_hex((char *)l, &numlen, &flags, NULL);
2097 	if (numlen)
2098 	    l += numlen;
2099 	else if (nl) {
2100 	    l = nl + 1; /* 1 is length of "\n" */
2101 	    continue;
2102 	}
2103 	else {
2104 	    l = lend; /* to LIST's end at which \n is not found */
2105 	    break;
2106 	}
2107 
2108 	if (isBLANK(*l)) {
2109 	    ++l;
2110 	    flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2111 	    numlen = lend - l;
2112 	    max = grok_hex((char *)l, &numlen, &flags, NULL);
2113 	    if (numlen)
2114 		l += numlen;
2115 	    else
2116 		max = min;
2117 
2118 	    if (octets) {
2119 		if (isBLANK(*l)) {
2120 		    ++l;
2121 		    flags = PERL_SCAN_SILENT_ILLDIGIT |
2122 			    PERL_SCAN_DISALLOW_PREFIX;
2123 		    numlen = lend - l;
2124 		    val = grok_hex((char *)l, &numlen, &flags, NULL);
2125 		    if (numlen)
2126 			l += numlen;
2127 		    else
2128 			val = 0;
2129 		}
2130 		else {
2131 		    val = 0;
2132 		    if (typeto) {
2133 			Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2134 					 typestr, l);
2135 		    }
2136 		}
2137 	    }
2138 	    else
2139 		val = 0; /* bits == 1, then val should be ignored */
2140 	}
2141 	else {
2142 	    max = min;
2143 	    if (octets) {
2144 		val = 0;
2145 		if (typeto) {
2146 		    Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2147 		}
2148 	    }
2149 	    else
2150 		val = 0; /* bits == 1, then val should be ignored */
2151 	}
2152 
2153 	if (nl)
2154 	    l = nl + 1;
2155 	else
2156 	    l = lend;
2157 
2158 	if (max < start)
2159 	    continue;
2160 
2161 	if (octets) {
2162 	    UV key;
2163 	    if (min < start) {
2164 		if (!none || val < none) {
2165 		    val += start - min;
2166 		}
2167 		min = start;
2168 	    }
2169 	    for (key = min; key <= max; key++) {
2170 		STRLEN offset;
2171 		if (key >= end)
2172 		    goto go_out_list;
2173 		/* offset must be non-negative (start <= min <= key < end) */
2174 		offset = octets * (key - start);
2175 		if (bits == 8)
2176 		    s[offset] = (U8)(val & 0xff);
2177 		else if (bits == 16) {
2178 		    s[offset    ] = (U8)((val >>  8) & 0xff);
2179 		    s[offset + 1] = (U8)( val        & 0xff);
2180 		}
2181 		else if (bits == 32) {
2182 		    s[offset    ] = (U8)((val >> 24) & 0xff);
2183 		    s[offset + 1] = (U8)((val >> 16) & 0xff);
2184 		    s[offset + 2] = (U8)((val >>  8) & 0xff);
2185 		    s[offset + 3] = (U8)( val        & 0xff);
2186 		}
2187 
2188 		if (!none || val < none)
2189 		    ++val;
2190 	    }
2191 	}
2192 	else { /* bits == 1, then val should be ignored */
2193 	    UV key;
2194 	    if (min < start)
2195 		min = start;
2196 	    for (key = min; key <= max; key++) {
2197 		const STRLEN offset = (STRLEN)(key - start);
2198 		if (key >= end)
2199 		    goto go_out_list;
2200 		s[offset >> 3] |= 1 << (offset & 7);
2201 	    }
2202 	}
2203     } /* while */
2204   go_out_list:
2205 
2206     /* read $swash->{EXTRAS} */
2207     x = (U8*)SvPV(*extssvp, xcur);
2208     xend = x + xcur;
2209     while (x < xend) {
2210 	STRLEN namelen;
2211 	U8 *namestr;
2212 	SV** othersvp;
2213 	HV* otherhv;
2214 	STRLEN otherbits;
2215 	SV **otherbitssvp, *other;
2216 	U8 *s, *o, *nl;
2217 	STRLEN slen, olen;
2218 
2219 	const U8 opc = *x++;
2220 	if (opc == '\n')
2221 	    continue;
2222 
2223 	nl = (U8*)memchr(x, '\n', xend - x);
2224 
2225 	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2226 	    if (nl) {
2227 		x = nl + 1; /* 1 is length of "\n" */
2228 		continue;
2229 	    }
2230 	    else {
2231 		x = xend; /* to EXTRAS' end at which \n is not found */
2232 		break;
2233 	    }
2234 	}
2235 
2236 	namestr = x;
2237 	if (nl) {
2238 	    namelen = nl - namestr;
2239 	    x = nl + 1;
2240 	}
2241 	else {
2242 	    namelen = xend - namestr;
2243 	    x = xend;
2244 	}
2245 
2246 	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2247 	otherhv = MUTABLE_HV(SvRV(*othersvp));
2248 	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2249 	otherbits = (STRLEN)SvUV(*otherbitssvp);
2250 	if (bits < otherbits)
2251 	    Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2252 
2253 	/* The "other" swatch must be destroyed after. */
2254 	other = swash_get(*othersvp, start, span);
2255 	o = (U8*)SvPV(other, olen);
2256 
2257 	if (!olen)
2258 	    Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2259 
2260 	s = (U8*)SvPV(swatch, slen);
2261 	if (bits == 1 && otherbits == 1) {
2262 	    if (slen != olen)
2263 		Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2264 
2265 	    switch (opc) {
2266 	    case '+':
2267 		while (slen--)
2268 		    *s++ |= *o++;
2269 		break;
2270 	    case '!':
2271 		while (slen--)
2272 		    *s++ |= ~*o++;
2273 		break;
2274 	    case '-':
2275 		while (slen--)
2276 		    *s++ &= ~*o++;
2277 		break;
2278 	    case '&':
2279 		while (slen--)
2280 		    *s++ &= *o++;
2281 		break;
2282 	    default:
2283 		break;
2284 	    }
2285 	}
2286 	else {
2287 	    STRLEN otheroctets = otherbits >> 3;
2288 	    STRLEN offset = 0;
2289 	    U8* const send = s + slen;
2290 
2291 	    while (s < send) {
2292 		UV otherval = 0;
2293 
2294 		if (otherbits == 1) {
2295 		    otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2296 		    ++offset;
2297 		}
2298 		else {
2299 		    STRLEN vlen = otheroctets;
2300 		    otherval = *o++;
2301 		    while (--vlen) {
2302 			otherval <<= 8;
2303 			otherval |= *o++;
2304 		    }
2305 		}
2306 
2307 		if (opc == '+' && otherval)
2308 		    NOOP;   /* replace with otherval */
2309 		else if (opc == '!' && !otherval)
2310 		    otherval = 1;
2311 		else if (opc == '-' && otherval)
2312 		    otherval = 0;
2313 		else if (opc == '&' && !otherval)
2314 		    otherval = 0;
2315 		else {
2316 		    s += octets; /* no replacement */
2317 		    continue;
2318 		}
2319 
2320 		if (bits == 8)
2321 		    *s++ = (U8)( otherval & 0xff);
2322 		else if (bits == 16) {
2323 		    *s++ = (U8)((otherval >>  8) & 0xff);
2324 		    *s++ = (U8)( otherval        & 0xff);
2325 		}
2326 		else if (bits == 32) {
2327 		    *s++ = (U8)((otherval >> 24) & 0xff);
2328 		    *s++ = (U8)((otherval >> 16) & 0xff);
2329 		    *s++ = (U8)((otherval >>  8) & 0xff);
2330 		    *s++ = (U8)( otherval        & 0xff);
2331 		}
2332 	    }
2333 	}
2334 	sv_free(other); /* through with it! */
2335     } /* while */
2336     return swatch;
2337 }
2338 
2339 /*
2340 =for apidoc uvchr_to_utf8
2341 
2342 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2343 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2344 bytes available. The return value is the pointer to the byte after the
2345 end of the new character. In other words,
2346 
2347     d = uvchr_to_utf8(d, uv);
2348 
2349 is the recommended wide native character-aware way of saying
2350 
2351     *(d++) = uv;
2352 
2353 =cut
2354 */
2355 
2356 /* On ASCII machines this is normally a macro but we want a
2357    real function in case XS code wants it
2358 */
2359 U8 *
2360 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2361 {
2362     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2363 
2364     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2365 }
2366 
2367 U8 *
2368 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2369 {
2370     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2371 
2372     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2373 }
2374 
2375 /*
2376 =for apidoc utf8n_to_uvchr
2377 flags
2378 
2379 Returns the native character value of the first character in the string
2380 C<s>
2381 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2382 length, in bytes, of that character.
2383 
2384 Allows length and flags to be passed to low level routine.
2385 
2386 =cut
2387 */
2388 /* On ASCII machines this is normally a macro but we want
2389    a real function in case XS code wants it
2390 */
2391 UV
2392 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2393 U32 flags)
2394 {
2395     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2396 
2397     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2398 
2399     return UNI_TO_NATIVE(uv);
2400 }
2401 
2402 /*
2403 =for apidoc pv_uni_display
2404 
2405 Build to the scalar dsv a displayable version of the string spv,
2406 length len, the displayable version being at most pvlim bytes long
2407 (if longer, the rest is truncated and "..." will be appended).
2408 
2409 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2410 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2411 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2412 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2413 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2414 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2415 
2416 The pointer to the PV of the dsv is returned.
2417 
2418 =cut */
2419 char *
2420 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2421 {
2422     int truncated = 0;
2423     const char *s, *e;
2424 
2425     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2426 
2427     sv_setpvs(dsv, "");
2428     SvUTF8_off(dsv);
2429     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2430 	 UV u;
2431 	  /* This serves double duty as a flag and a character to print after
2432 	     a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2433 	  */
2434 	 char ok = 0;
2435 
2436 	 if (pvlim && SvCUR(dsv) >= pvlim) {
2437 	      truncated++;
2438 	      break;
2439 	 }
2440 	 u = utf8_to_uvchr((U8*)s, 0);
2441 	 if (u < 256) {
2442 	     const unsigned char c = (unsigned char)u & 0xFF;
2443 	     if (flags & UNI_DISPLAY_BACKSLASH) {
2444 	         switch (c) {
2445 		 case '\n':
2446 		     ok = 'n'; break;
2447 		 case '\r':
2448 		     ok = 'r'; break;
2449 		 case '\t':
2450 		     ok = 't'; break;
2451 		 case '\f':
2452 		     ok = 'f'; break;
2453 		 case '\a':
2454 		     ok = 'a'; break;
2455 		 case '\\':
2456 		     ok = '\\'; break;
2457 		 default: break;
2458 		 }
2459 		 if (ok) {
2460 		     const char string = ok;
2461 		     sv_catpvs(dsv, "\\");
2462 		     sv_catpvn(dsv, &string, 1);
2463 		 }
2464 	     }
2465 	     /* isPRINT() is the locale-blind version. */
2466 	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2467 		 const char string = c;
2468 		 sv_catpvn(dsv, &string, 1);
2469 		 ok = 1;
2470 	     }
2471 	 }
2472 	 if (!ok)
2473 	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2474     }
2475     if (truncated)
2476 	 sv_catpvs(dsv, "...");
2477 
2478     return SvPVX(dsv);
2479 }
2480 
2481 /*
2482 =for apidoc sv_uni_display
2483 
2484 Build to the scalar dsv a displayable version of the scalar sv,
2485 the displayable version being at most pvlim bytes long
2486 (if longer, the rest is truncated and "..." will be appended).
2487 
2488 The flags argument is as in pv_uni_display().
2489 
2490 The pointer to the PV of the dsv is returned.
2491 
2492 =cut
2493 */
2494 char *
2495 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2496 {
2497     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2498 
2499      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2500 				SvCUR(ssv), pvlim, flags);
2501 }
2502 
2503 /*
2504 =for apidoc ibcmp_utf8
2505 
2506 Return true if the strings s1 and s2 differ case-insensitively, false
2507 if not (if they are equal case-insensitively).  If u1 is true, the
2508 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2509 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2510 are false, the respective string is assumed to be in native 8-bit
2511 encoding.
2512 
2513 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2514 in there (they will point at the beginning of the I<next> character).
2515 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2516 pointers beyond which scanning will not continue under any
2517 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2518 s2+l2 will be used as goal end pointers that will also stop the scan,
2519 and which qualify towards defining a successful match: all the scans
2520 that define an explicit length must reach their goal pointers for
2521 a match to succeed).
2522 
2523 For case-insensitiveness, the "casefolding" of Unicode is used
2524 instead of upper/lowercasing both the characters, see
2525 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2526 
2527 =cut */
2528 I32
2529 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2530 {
2531      dVAR;
2532      register const U8 *p1  = (const U8*)s1;
2533      register const U8 *p2  = (const U8*)s2;
2534      register const U8 *f1 = NULL;
2535      register const U8 *f2 = NULL;
2536      register U8 *e1 = NULL;
2537      register U8 *q1 = NULL;
2538      register U8 *e2 = NULL;
2539      register U8 *q2 = NULL;
2540      STRLEN n1 = 0, n2 = 0;
2541      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2542      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2543      U8 natbuf[1+1];
2544      STRLEN foldlen1, foldlen2;
2545      bool match;
2546 
2547      PERL_ARGS_ASSERT_IBCMP_UTF8;
2548 
2549      if (pe1)
2550 	  e1 = *(U8**)pe1;
2551      /* assert(e1 || l1); */
2552      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2553 	  f1 = (const U8*)s1 + l1;
2554      if (pe2)
2555 	  e2 = *(U8**)pe2;
2556      /* assert(e2 || l2); */
2557      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2558 	  f2 = (const U8*)s2 + l2;
2559 
2560      /* This shouldn't happen. However, putting an assert() there makes some
2561       * tests fail. */
2562      /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
2563      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2564 	  return 1; /* mismatch; possible infinite loop or false positive */
2565 
2566      if (!u1 || !u2)
2567 	  natbuf[1] = 0; /* Need to terminate the buffer. */
2568 
2569      while ((e1 == 0 || p1 < e1) &&
2570 	    (f1 == 0 || p1 < f1) &&
2571 	    (e2 == 0 || p2 < e2) &&
2572 	    (f2 == 0 || p2 < f2)) {
2573 	  if (n1 == 0) {
2574 	       if (u1)
2575 		    to_utf8_fold(p1, foldbuf1, &foldlen1);
2576 	       else {
2577 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2578 		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2579 	       }
2580 	       q1 = foldbuf1;
2581 	       n1 = foldlen1;
2582 	  }
2583 	  if (n2 == 0) {
2584 	       if (u2)
2585 		    to_utf8_fold(p2, foldbuf2, &foldlen2);
2586 	       else {
2587 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2588 		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2589 	       }
2590 	       q2 = foldbuf2;
2591 	       n2 = foldlen2;
2592 	  }
2593 	  while (n1 && n2) {
2594 	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2595 		   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2596 		    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2597 		   return 1; /* mismatch */
2598 	       n1 -= UTF8SKIP(q1);
2599 	       q1 += UTF8SKIP(q1);
2600 	       n2 -= UTF8SKIP(q2);
2601 	       q2 += UTF8SKIP(q2);
2602 	  }
2603 	  if (n1 == 0)
2604 	       p1 += u1 ? UTF8SKIP(p1) : 1;
2605 	  if (n2 == 0)
2606 	       p2 += u2 ? UTF8SKIP(p2) : 1;
2607 
2608      }
2609 
2610      /* A match is defined by all the scans that specified
2611       * an explicit length reaching their final goals. */
2612      match = (n1 == 0 && n2 == 0    /* Must not match partial char; Bug #72998 */
2613 	     && (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2));
2614 
2615      if (match) {
2616 	  if (pe1)
2617 	       *pe1 = (char*)p1;
2618 	  if (pe2)
2619 	       *pe2 = (char*)p2;
2620      }
2621 
2622      return match ? 0 : 1; /* 0 match, 1 mismatch */
2623 }
2624 
2625 /*
2626  * Local variables:
2627  * c-indentation-style: bsd
2628  * c-basic-offset: 4
2629  * indent-tabs-mode: t
2630  * End:
2631  *
2632  * ex: set ts=8 sts=4 sw=4 noet:
2633  */
2634