xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/utf8.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*    utf8.c
2*0Sstevel@tonic-gate  *
3*0Sstevel@tonic-gate  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
4*0Sstevel@tonic-gate  *
5*0Sstevel@tonic-gate  *    You may distribute under the terms of either the GNU General Public
6*0Sstevel@tonic-gate  *    License or the Artistic License, as specified in the README file.
7*0Sstevel@tonic-gate  *
8*0Sstevel@tonic-gate  */
9*0Sstevel@tonic-gate 
10*0Sstevel@tonic-gate /*
11*0Sstevel@tonic-gate  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12*0Sstevel@tonic-gate  * heard of that we don't want to see any closer; and that's the one place
13*0Sstevel@tonic-gate  * we're trying to get to!  And that's just where we can't get, nohow.'
14*0Sstevel@tonic-gate  *
15*0Sstevel@tonic-gate  * 'Well do I understand your speech,' he answered in the same language;
16*0Sstevel@tonic-gate  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
17*0Sstevel@tonic-gate  * as is the custom in the West, if you wish to be answered?'
18*0Sstevel@tonic-gate  *
19*0Sstevel@tonic-gate  * ...the travellers perceived that the floor was paved with stones of many
20*0Sstevel@tonic-gate  * hues; branching runes and strange devices intertwined beneath their feet.
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate 
23*0Sstevel@tonic-gate #include "EXTERN.h"
24*0Sstevel@tonic-gate #define PERL_IN_UTF8_C
25*0Sstevel@tonic-gate #include "perl.h"
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28*0Sstevel@tonic-gate 
29*0Sstevel@tonic-gate /*
30*0Sstevel@tonic-gate =head1 Unicode Support
31*0Sstevel@tonic-gate 
32*0Sstevel@tonic-gate =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
33*0Sstevel@tonic-gate 
34*0Sstevel@tonic-gate Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
35*0Sstevel@tonic-gate of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36*0Sstevel@tonic-gate bytes available. The return value is the pointer to the byte after the
37*0Sstevel@tonic-gate end of the new character. In other words,
38*0Sstevel@tonic-gate 
39*0Sstevel@tonic-gate     d = uvuni_to_utf8_flags(d, uv, flags);
40*0Sstevel@tonic-gate 
41*0Sstevel@tonic-gate or, in most cases,
42*0Sstevel@tonic-gate 
43*0Sstevel@tonic-gate     d = uvuni_to_utf8(d, uv);
44*0Sstevel@tonic-gate 
45*0Sstevel@tonic-gate (which is equivalent to)
46*0Sstevel@tonic-gate 
47*0Sstevel@tonic-gate     d = uvuni_to_utf8_flags(d, uv, 0);
48*0Sstevel@tonic-gate 
49*0Sstevel@tonic-gate is the recommended Unicode-aware way of saying
50*0Sstevel@tonic-gate 
51*0Sstevel@tonic-gate     *(d++) = uv;
52*0Sstevel@tonic-gate 
53*0Sstevel@tonic-gate =cut
54*0Sstevel@tonic-gate */
55*0Sstevel@tonic-gate 
56*0Sstevel@tonic-gate U8 *
Perl_uvuni_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)57*0Sstevel@tonic-gate Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
58*0Sstevel@tonic-gate {
59*0Sstevel@tonic-gate     if (ckWARN(WARN_UTF8)) {
60*0Sstevel@tonic-gate 	 if (UNICODE_IS_SURROGATE(uv) &&
61*0Sstevel@tonic-gate 	     !(flags & UNICODE_ALLOW_SURROGATE))
62*0Sstevel@tonic-gate 	      Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
63*0Sstevel@tonic-gate 	 else if (
64*0Sstevel@tonic-gate 		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65*0Sstevel@tonic-gate 		    !(flags & UNICODE_ALLOW_FDD0))
66*0Sstevel@tonic-gate 		   ||
67*0Sstevel@tonic-gate 		   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
68*0Sstevel@tonic-gate 		    !(flags & UNICODE_ALLOW_FFFF))) &&
69*0Sstevel@tonic-gate 		  /* UNICODE_ALLOW_SUPER includes
70*0Sstevel@tonic-gate 		   * FFFEs and FFFFs beyond 0x10FFFF. */
71*0Sstevel@tonic-gate 		  ((uv <= PERL_UNICODE_MAX) ||
72*0Sstevel@tonic-gate 		   !(flags & UNICODE_ALLOW_SUPER))
73*0Sstevel@tonic-gate 		  )
74*0Sstevel@tonic-gate 	      Perl_warner(aTHX_ packWARN(WARN_UTF8),
75*0Sstevel@tonic-gate 			 "Unicode character 0x%04"UVxf" is illegal", uv);
76*0Sstevel@tonic-gate     }
77*0Sstevel@tonic-gate     if (UNI_IS_INVARIANT(uv)) {
78*0Sstevel@tonic-gate 	*d++ = (U8)UTF_TO_NATIVE(uv);
79*0Sstevel@tonic-gate 	return d;
80*0Sstevel@tonic-gate     }
81*0Sstevel@tonic-gate #if defined(EBCDIC)
82*0Sstevel@tonic-gate     else {
83*0Sstevel@tonic-gate 	STRLEN len  = UNISKIP(uv);
84*0Sstevel@tonic-gate 	U8 *p = d+len-1;
85*0Sstevel@tonic-gate 	while (p > d) {
86*0Sstevel@tonic-gate 	    *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
87*0Sstevel@tonic-gate 	    uv >>= UTF_ACCUMULATION_SHIFT;
88*0Sstevel@tonic-gate 	}
89*0Sstevel@tonic-gate 	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
90*0Sstevel@tonic-gate 	return d+len;
91*0Sstevel@tonic-gate     }
92*0Sstevel@tonic-gate #else /* Non loop style */
93*0Sstevel@tonic-gate     if (uv < 0x800) {
94*0Sstevel@tonic-gate 	*d++ = (U8)(( uv >>  6)         | 0xc0);
95*0Sstevel@tonic-gate 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
96*0Sstevel@tonic-gate 	return d;
97*0Sstevel@tonic-gate     }
98*0Sstevel@tonic-gate     if (uv < 0x10000) {
99*0Sstevel@tonic-gate 	*d++ = (U8)(( uv >> 12)         | 0xe0);
100*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
101*0Sstevel@tonic-gate 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
102*0Sstevel@tonic-gate 	return d;
103*0Sstevel@tonic-gate     }
104*0Sstevel@tonic-gate     if (uv < 0x200000) {
105*0Sstevel@tonic-gate 	*d++ = (U8)(( uv >> 18)         | 0xf0);
106*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
108*0Sstevel@tonic-gate 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
109*0Sstevel@tonic-gate 	return d;
110*0Sstevel@tonic-gate     }
111*0Sstevel@tonic-gate     if (uv < 0x4000000) {
112*0Sstevel@tonic-gate 	*d++ = (U8)(( uv >> 24)         | 0xf8);
113*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
116*0Sstevel@tonic-gate 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
117*0Sstevel@tonic-gate 	return d;
118*0Sstevel@tonic-gate     }
119*0Sstevel@tonic-gate     if (uv < 0x80000000) {
120*0Sstevel@tonic-gate 	*d++ = (U8)(( uv >> 30)         | 0xfc);
121*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
125*0Sstevel@tonic-gate 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
126*0Sstevel@tonic-gate 	return d;
127*0Sstevel@tonic-gate     }
128*0Sstevel@tonic-gate #ifdef HAS_QUAD
129*0Sstevel@tonic-gate     if (uv < UTF8_QUAD_MAX)
130*0Sstevel@tonic-gate #endif
131*0Sstevel@tonic-gate     {
132*0Sstevel@tonic-gate 	*d++ =                            0xfe;	/* Can't match U+FEFF! */
133*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
138*0Sstevel@tonic-gate 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
139*0Sstevel@tonic-gate 	return d;
140*0Sstevel@tonic-gate     }
141*0Sstevel@tonic-gate #ifdef HAS_QUAD
142*0Sstevel@tonic-gate     {
143*0Sstevel@tonic-gate 	*d++ =                            0xff;		/* Can't match U+FFFE! */
144*0Sstevel@tonic-gate 	*d++ =                            0x80;		/* 6 Reserved bits */
145*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
146*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154*0Sstevel@tonic-gate 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
155*0Sstevel@tonic-gate 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
156*0Sstevel@tonic-gate 	return d;
157*0Sstevel@tonic-gate     }
158*0Sstevel@tonic-gate #endif
159*0Sstevel@tonic-gate #endif /* Loop style */
160*0Sstevel@tonic-gate }
161*0Sstevel@tonic-gate 
162*0Sstevel@tonic-gate U8 *
Perl_uvuni_to_utf8(pTHX_ U8 * d,UV uv)163*0Sstevel@tonic-gate Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164*0Sstevel@tonic-gate {
165*0Sstevel@tonic-gate     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166*0Sstevel@tonic-gate }
167*0Sstevel@tonic-gate 
168*0Sstevel@tonic-gate 
169*0Sstevel@tonic-gate /*
170*0Sstevel@tonic-gate =for apidoc A|STRLEN|is_utf8_char|U8 *s
171*0Sstevel@tonic-gate 
172*0Sstevel@tonic-gate Tests if some arbitrary number of bytes begins in a valid UTF-8
173*0Sstevel@tonic-gate character.  Note that an INVARIANT (i.e. ASCII) character is a valid
174*0Sstevel@tonic-gate UTF-8 character.  The actual number of bytes in the UTF-8 character
175*0Sstevel@tonic-gate will be returned if it is valid, otherwise 0.
176*0Sstevel@tonic-gate 
177*0Sstevel@tonic-gate =cut */
178*0Sstevel@tonic-gate STRLEN
Perl_is_utf8_char(pTHX_ U8 * s)179*0Sstevel@tonic-gate Perl_is_utf8_char(pTHX_ U8 *s)
180*0Sstevel@tonic-gate {
181*0Sstevel@tonic-gate     U8 u = *s;
182*0Sstevel@tonic-gate     STRLEN slen, len;
183*0Sstevel@tonic-gate     UV uv, ouv;
184*0Sstevel@tonic-gate 
185*0Sstevel@tonic-gate     if (UTF8_IS_INVARIANT(u))
186*0Sstevel@tonic-gate 	return 1;
187*0Sstevel@tonic-gate 
188*0Sstevel@tonic-gate     if (!UTF8_IS_START(u))
189*0Sstevel@tonic-gate 	return 0;
190*0Sstevel@tonic-gate 
191*0Sstevel@tonic-gate     len = UTF8SKIP(s);
192*0Sstevel@tonic-gate 
193*0Sstevel@tonic-gate     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
194*0Sstevel@tonic-gate 	return 0;
195*0Sstevel@tonic-gate 
196*0Sstevel@tonic-gate     slen = len - 1;
197*0Sstevel@tonic-gate     s++;
198*0Sstevel@tonic-gate     u &= UTF_START_MASK(len);
199*0Sstevel@tonic-gate     uv  = u;
200*0Sstevel@tonic-gate     ouv = uv;
201*0Sstevel@tonic-gate     while (slen--) {
202*0Sstevel@tonic-gate 	if (!UTF8_IS_CONTINUATION(*s))
203*0Sstevel@tonic-gate 	    return 0;
204*0Sstevel@tonic-gate 	uv = UTF8_ACCUMULATE(uv, *s);
205*0Sstevel@tonic-gate 	if (uv < ouv)
206*0Sstevel@tonic-gate 	    return 0;
207*0Sstevel@tonic-gate 	ouv = uv;
208*0Sstevel@tonic-gate 	s++;
209*0Sstevel@tonic-gate     }
210*0Sstevel@tonic-gate 
211*0Sstevel@tonic-gate     if ((STRLEN)UNISKIP(uv) < len)
212*0Sstevel@tonic-gate 	return 0;
213*0Sstevel@tonic-gate 
214*0Sstevel@tonic-gate     return len;
215*0Sstevel@tonic-gate }
216*0Sstevel@tonic-gate 
217*0Sstevel@tonic-gate /*
218*0Sstevel@tonic-gate =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
219*0Sstevel@tonic-gate 
220*0Sstevel@tonic-gate Returns true if first C<len> bytes of the given string form a valid
221*0Sstevel@tonic-gate UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
222*0Sstevel@tonic-gate not mean 'a string that contains code points above 0x7F encoded in UTF-8'
223*0Sstevel@tonic-gate because a valid ASCII string is a valid UTF-8 string.
224*0Sstevel@tonic-gate 
225*0Sstevel@tonic-gate =cut
226*0Sstevel@tonic-gate */
227*0Sstevel@tonic-gate 
228*0Sstevel@tonic-gate bool
Perl_is_utf8_string(pTHX_ U8 * s,STRLEN len)229*0Sstevel@tonic-gate Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230*0Sstevel@tonic-gate {
231*0Sstevel@tonic-gate     U8* x = s;
232*0Sstevel@tonic-gate     U8* send;
233*0Sstevel@tonic-gate     STRLEN c;
234*0Sstevel@tonic-gate 
235*0Sstevel@tonic-gate     if (!len)
236*0Sstevel@tonic-gate 	len = strlen((char *)s);
237*0Sstevel@tonic-gate     send = s + len;
238*0Sstevel@tonic-gate 
239*0Sstevel@tonic-gate     while (x < send) {
240*0Sstevel@tonic-gate 	 /* Inline the easy bits of is_utf8_char() here for speed... */
241*0Sstevel@tonic-gate 	 if (UTF8_IS_INVARIANT(*x))
242*0Sstevel@tonic-gate 	      c = 1;
243*0Sstevel@tonic-gate 	 else if (!UTF8_IS_START(*x))
244*0Sstevel@tonic-gate 	      return FALSE;
245*0Sstevel@tonic-gate 	 else {
246*0Sstevel@tonic-gate 	      /* ... and call is_utf8_char() only if really needed. */
247*0Sstevel@tonic-gate 	      c = is_utf8_char(x);
248*0Sstevel@tonic-gate 	      if (!c)
249*0Sstevel@tonic-gate 		   return FALSE;
250*0Sstevel@tonic-gate 	 }
251*0Sstevel@tonic-gate         x += c;
252*0Sstevel@tonic-gate     }
253*0Sstevel@tonic-gate     if (x != send)
254*0Sstevel@tonic-gate 	return FALSE;
255*0Sstevel@tonic-gate 
256*0Sstevel@tonic-gate     return TRUE;
257*0Sstevel@tonic-gate }
258*0Sstevel@tonic-gate 
259*0Sstevel@tonic-gate /*
260*0Sstevel@tonic-gate =for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
261*0Sstevel@tonic-gate 
262*0Sstevel@tonic-gate Like is_ut8_string but store the location of the failure in
263*0Sstevel@tonic-gate the last argument.
264*0Sstevel@tonic-gate 
265*0Sstevel@tonic-gate =cut
266*0Sstevel@tonic-gate */
267*0Sstevel@tonic-gate 
268*0Sstevel@tonic-gate bool
Perl_is_utf8_string_loc(pTHX_ U8 * s,STRLEN len,U8 ** p)269*0Sstevel@tonic-gate Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
270*0Sstevel@tonic-gate {
271*0Sstevel@tonic-gate     U8* x = s;
272*0Sstevel@tonic-gate     U8* send;
273*0Sstevel@tonic-gate     STRLEN c;
274*0Sstevel@tonic-gate 
275*0Sstevel@tonic-gate     if (!len)
276*0Sstevel@tonic-gate 	len = strlen((char *)s);
277*0Sstevel@tonic-gate     send = s + len;
278*0Sstevel@tonic-gate 
279*0Sstevel@tonic-gate     while (x < send) {
280*0Sstevel@tonic-gate 	 /* Inline the easy bits of is_utf8_char() here for speed... */
281*0Sstevel@tonic-gate 	 if (UTF8_IS_INVARIANT(*x))
282*0Sstevel@tonic-gate 	      c = 1;
283*0Sstevel@tonic-gate 	 else if (!UTF8_IS_START(*x)) {
284*0Sstevel@tonic-gate 	      if (p)
285*0Sstevel@tonic-gate 		  *p = x;
286*0Sstevel@tonic-gate 	      return FALSE;
287*0Sstevel@tonic-gate 	 }
288*0Sstevel@tonic-gate 	 else {
289*0Sstevel@tonic-gate 	      /* ... and call is_utf8_char() only if really needed. */
290*0Sstevel@tonic-gate 	      c = is_utf8_char(x);
291*0Sstevel@tonic-gate 	      if (!c) {
292*0Sstevel@tonic-gate 		   if (p)
293*0Sstevel@tonic-gate 		      *p = x;
294*0Sstevel@tonic-gate 		   return FALSE;
295*0Sstevel@tonic-gate 	      }
296*0Sstevel@tonic-gate 	 }
297*0Sstevel@tonic-gate         x += c;
298*0Sstevel@tonic-gate     }
299*0Sstevel@tonic-gate     if (x != send) {
300*0Sstevel@tonic-gate        if (p)
301*0Sstevel@tonic-gate 	   *p = x;
302*0Sstevel@tonic-gate 	return FALSE;
303*0Sstevel@tonic-gate     }
304*0Sstevel@tonic-gate 
305*0Sstevel@tonic-gate     return TRUE;
306*0Sstevel@tonic-gate }
307*0Sstevel@tonic-gate 
308*0Sstevel@tonic-gate /*
309*0Sstevel@tonic-gate =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
310*0Sstevel@tonic-gate 
311*0Sstevel@tonic-gate Bottom level UTF-8 decode routine.
312*0Sstevel@tonic-gate Returns the unicode code point value of the first character in the string C<s>
313*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
314*0Sstevel@tonic-gate C<retlen> will be set to the length, in bytes, of that character.
315*0Sstevel@tonic-gate 
316*0Sstevel@tonic-gate If C<s> does not point to a well-formed UTF-8 character, the behaviour
317*0Sstevel@tonic-gate is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
318*0Sstevel@tonic-gate it is assumed that the caller will raise a warning, and this function
319*0Sstevel@tonic-gate will silently just set C<retlen> to C<-1> and return zero.  If the
320*0Sstevel@tonic-gate C<flags> does not contain UTF8_CHECK_ONLY, warnings about
321*0Sstevel@tonic-gate malformations will be given, C<retlen> will be set to the expected
322*0Sstevel@tonic-gate length of the UTF-8 character in bytes, and zero will be returned.
323*0Sstevel@tonic-gate 
324*0Sstevel@tonic-gate The C<flags> can also contain various flags to allow deviations from
325*0Sstevel@tonic-gate the strict UTF-8 encoding (see F<utf8.h>).
326*0Sstevel@tonic-gate 
327*0Sstevel@tonic-gate Most code should use utf8_to_uvchr() rather than call this directly.
328*0Sstevel@tonic-gate 
329*0Sstevel@tonic-gate =cut
330*0Sstevel@tonic-gate */
331*0Sstevel@tonic-gate 
332*0Sstevel@tonic-gate UV
Perl_utf8n_to_uvuni(pTHX_ U8 * s,STRLEN curlen,STRLEN * retlen,U32 flags)333*0Sstevel@tonic-gate Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
334*0Sstevel@tonic-gate {
335*0Sstevel@tonic-gate     U8 *s0 = s;
336*0Sstevel@tonic-gate     UV uv = *s, ouv = 0;
337*0Sstevel@tonic-gate     STRLEN len = 1;
338*0Sstevel@tonic-gate     bool dowarn = ckWARN_d(WARN_UTF8);
339*0Sstevel@tonic-gate     UV startbyte = *s;
340*0Sstevel@tonic-gate     STRLEN expectlen = 0;
341*0Sstevel@tonic-gate     U32 warning = 0;
342*0Sstevel@tonic-gate 
343*0Sstevel@tonic-gate /* This list is a superset of the UTF8_ALLOW_XXX. */
344*0Sstevel@tonic-gate 
345*0Sstevel@tonic-gate #define UTF8_WARN_EMPTY				 1
346*0Sstevel@tonic-gate #define UTF8_WARN_CONTINUATION			 2
347*0Sstevel@tonic-gate #define UTF8_WARN_NON_CONTINUATION	 	 3
348*0Sstevel@tonic-gate #define UTF8_WARN_FE_FF				 4
349*0Sstevel@tonic-gate #define UTF8_WARN_SHORT				 5
350*0Sstevel@tonic-gate #define UTF8_WARN_OVERFLOW			 6
351*0Sstevel@tonic-gate #define UTF8_WARN_SURROGATE			 7
352*0Sstevel@tonic-gate #define UTF8_WARN_LONG				 8
353*0Sstevel@tonic-gate #define UTF8_WARN_FFFF				 9 /* Also FFFE. */
354*0Sstevel@tonic-gate 
355*0Sstevel@tonic-gate     if (curlen == 0 &&
356*0Sstevel@tonic-gate 	!(flags & UTF8_ALLOW_EMPTY)) {
357*0Sstevel@tonic-gate 	warning = UTF8_WARN_EMPTY;
358*0Sstevel@tonic-gate 	goto malformed;
359*0Sstevel@tonic-gate     }
360*0Sstevel@tonic-gate 
361*0Sstevel@tonic-gate     if (UTF8_IS_INVARIANT(uv)) {
362*0Sstevel@tonic-gate 	if (retlen)
363*0Sstevel@tonic-gate 	    *retlen = 1;
364*0Sstevel@tonic-gate 	return (UV) (NATIVE_TO_UTF(*s));
365*0Sstevel@tonic-gate     }
366*0Sstevel@tonic-gate 
367*0Sstevel@tonic-gate     if (UTF8_IS_CONTINUATION(uv) &&
368*0Sstevel@tonic-gate 	!(flags & UTF8_ALLOW_CONTINUATION)) {
369*0Sstevel@tonic-gate 	warning = UTF8_WARN_CONTINUATION;
370*0Sstevel@tonic-gate 	goto malformed;
371*0Sstevel@tonic-gate     }
372*0Sstevel@tonic-gate 
373*0Sstevel@tonic-gate     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
374*0Sstevel@tonic-gate 	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
375*0Sstevel@tonic-gate 	warning = UTF8_WARN_NON_CONTINUATION;
376*0Sstevel@tonic-gate 	goto malformed;
377*0Sstevel@tonic-gate     }
378*0Sstevel@tonic-gate 
379*0Sstevel@tonic-gate #ifdef EBCDIC
380*0Sstevel@tonic-gate     uv = NATIVE_TO_UTF(uv);
381*0Sstevel@tonic-gate #else
382*0Sstevel@tonic-gate     if ((uv == 0xfe || uv == 0xff) &&
383*0Sstevel@tonic-gate 	!(flags & UTF8_ALLOW_FE_FF)) {
384*0Sstevel@tonic-gate 	warning = UTF8_WARN_FE_FF;
385*0Sstevel@tonic-gate 	goto malformed;
386*0Sstevel@tonic-gate     }
387*0Sstevel@tonic-gate #endif
388*0Sstevel@tonic-gate 
389*0Sstevel@tonic-gate     if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
390*0Sstevel@tonic-gate     else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
391*0Sstevel@tonic-gate     else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
392*0Sstevel@tonic-gate     else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
393*0Sstevel@tonic-gate #ifdef EBCDIC
394*0Sstevel@tonic-gate     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
395*0Sstevel@tonic-gate     else			{ len =  7; uv &= 0x01; }
396*0Sstevel@tonic-gate #else
397*0Sstevel@tonic-gate     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
398*0Sstevel@tonic-gate     else if (!(uv & 0x01))	{ len =  7; uv = 0; }
399*0Sstevel@tonic-gate     else			{ len = 13; uv = 0; } /* whoa! */
400*0Sstevel@tonic-gate #endif
401*0Sstevel@tonic-gate 
402*0Sstevel@tonic-gate     if (retlen)
403*0Sstevel@tonic-gate 	*retlen = len;
404*0Sstevel@tonic-gate 
405*0Sstevel@tonic-gate     expectlen = len;
406*0Sstevel@tonic-gate 
407*0Sstevel@tonic-gate     if ((curlen < expectlen) &&
408*0Sstevel@tonic-gate 	!(flags & UTF8_ALLOW_SHORT)) {
409*0Sstevel@tonic-gate 	warning = UTF8_WARN_SHORT;
410*0Sstevel@tonic-gate 	goto malformed;
411*0Sstevel@tonic-gate     }
412*0Sstevel@tonic-gate 
413*0Sstevel@tonic-gate     len--;
414*0Sstevel@tonic-gate     s++;
415*0Sstevel@tonic-gate     ouv = uv;
416*0Sstevel@tonic-gate 
417*0Sstevel@tonic-gate     while (len--) {
418*0Sstevel@tonic-gate 	if (!UTF8_IS_CONTINUATION(*s) &&
419*0Sstevel@tonic-gate 	    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
420*0Sstevel@tonic-gate 	    s--;
421*0Sstevel@tonic-gate 	    warning = UTF8_WARN_NON_CONTINUATION;
422*0Sstevel@tonic-gate 	    goto malformed;
423*0Sstevel@tonic-gate 	}
424*0Sstevel@tonic-gate 	else
425*0Sstevel@tonic-gate 	    uv = UTF8_ACCUMULATE(uv, *s);
426*0Sstevel@tonic-gate 	if (!(uv > ouv)) {
427*0Sstevel@tonic-gate 	    /* These cannot be allowed. */
428*0Sstevel@tonic-gate 	    if (uv == ouv) {
429*0Sstevel@tonic-gate 		if (!(flags & UTF8_ALLOW_LONG)) {
430*0Sstevel@tonic-gate 		    warning = UTF8_WARN_LONG;
431*0Sstevel@tonic-gate 		    goto malformed;
432*0Sstevel@tonic-gate 		}
433*0Sstevel@tonic-gate 	    }
434*0Sstevel@tonic-gate 	    else { /* uv < ouv */
435*0Sstevel@tonic-gate 		/* This cannot be allowed. */
436*0Sstevel@tonic-gate 		warning = UTF8_WARN_OVERFLOW;
437*0Sstevel@tonic-gate 		goto malformed;
438*0Sstevel@tonic-gate 	    }
439*0Sstevel@tonic-gate 	}
440*0Sstevel@tonic-gate 	s++;
441*0Sstevel@tonic-gate 	ouv = uv;
442*0Sstevel@tonic-gate     }
443*0Sstevel@tonic-gate 
444*0Sstevel@tonic-gate     if (UNICODE_IS_SURROGATE(uv) &&
445*0Sstevel@tonic-gate 	!(flags & UTF8_ALLOW_SURROGATE)) {
446*0Sstevel@tonic-gate 	warning = UTF8_WARN_SURROGATE;
447*0Sstevel@tonic-gate 	goto malformed;
448*0Sstevel@tonic-gate     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
449*0Sstevel@tonic-gate 	       !(flags & UTF8_ALLOW_LONG)) {
450*0Sstevel@tonic-gate 	warning = UTF8_WARN_LONG;
451*0Sstevel@tonic-gate 	goto malformed;
452*0Sstevel@tonic-gate     } else if (UNICODE_IS_ILLEGAL(uv) &&
453*0Sstevel@tonic-gate 	       !(flags & UTF8_ALLOW_FFFF)) {
454*0Sstevel@tonic-gate 	warning = UTF8_WARN_FFFF;
455*0Sstevel@tonic-gate 	goto malformed;
456*0Sstevel@tonic-gate     }
457*0Sstevel@tonic-gate 
458*0Sstevel@tonic-gate     return uv;
459*0Sstevel@tonic-gate 
460*0Sstevel@tonic-gate malformed:
461*0Sstevel@tonic-gate 
462*0Sstevel@tonic-gate     if (flags & UTF8_CHECK_ONLY) {
463*0Sstevel@tonic-gate 	if (retlen)
464*0Sstevel@tonic-gate 	    *retlen = -1;
465*0Sstevel@tonic-gate 	return 0;
466*0Sstevel@tonic-gate     }
467*0Sstevel@tonic-gate 
468*0Sstevel@tonic-gate     if (dowarn) {
469*0Sstevel@tonic-gate 	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
470*0Sstevel@tonic-gate 
471*0Sstevel@tonic-gate 	switch (warning) {
472*0Sstevel@tonic-gate 	case 0: /* Intentionally empty. */ break;
473*0Sstevel@tonic-gate 	case UTF8_WARN_EMPTY:
474*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(empty string)");
475*0Sstevel@tonic-gate 	    break;
476*0Sstevel@tonic-gate 	case UTF8_WARN_CONTINUATION:
477*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
478*0Sstevel@tonic-gate 	    break;
479*0Sstevel@tonic-gate 	case UTF8_WARN_NON_CONTINUATION:
480*0Sstevel@tonic-gate 	    if (s == s0)
481*0Sstevel@tonic-gate 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
482*0Sstevel@tonic-gate                            (UV)s[1], startbyte);
483*0Sstevel@tonic-gate 	    else
484*0Sstevel@tonic-gate 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
485*0Sstevel@tonic-gate                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
486*0Sstevel@tonic-gate 
487*0Sstevel@tonic-gate 	    break;
488*0Sstevel@tonic-gate 	case UTF8_WARN_FE_FF:
489*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
490*0Sstevel@tonic-gate 	    break;
491*0Sstevel@tonic-gate 	case UTF8_WARN_SHORT:
492*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
493*0Sstevel@tonic-gate                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
494*0Sstevel@tonic-gate 	    expectlen = curlen;		/* distance for caller to skip */
495*0Sstevel@tonic-gate 	    break;
496*0Sstevel@tonic-gate 	case UTF8_WARN_OVERFLOW:
497*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
498*0Sstevel@tonic-gate                            ouv, *s, startbyte);
499*0Sstevel@tonic-gate 	    break;
500*0Sstevel@tonic-gate 	case UTF8_WARN_SURROGATE:
501*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
502*0Sstevel@tonic-gate 	    break;
503*0Sstevel@tonic-gate 	case UTF8_WARN_LONG:
504*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
505*0Sstevel@tonic-gate 			   expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
506*0Sstevel@tonic-gate 	    break;
507*0Sstevel@tonic-gate 	case UTF8_WARN_FFFF:
508*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
509*0Sstevel@tonic-gate 	    break;
510*0Sstevel@tonic-gate 	default:
511*0Sstevel@tonic-gate 	    Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
512*0Sstevel@tonic-gate 	    break;
513*0Sstevel@tonic-gate 	}
514*0Sstevel@tonic-gate 
515*0Sstevel@tonic-gate 	if (warning) {
516*0Sstevel@tonic-gate 	    char *s = SvPVX(sv);
517*0Sstevel@tonic-gate 
518*0Sstevel@tonic-gate 	    if (PL_op)
519*0Sstevel@tonic-gate 		Perl_warner(aTHX_ packWARN(WARN_UTF8),
520*0Sstevel@tonic-gate 			    "%s in %s", s,  OP_DESC(PL_op));
521*0Sstevel@tonic-gate 	    else
522*0Sstevel@tonic-gate 		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
523*0Sstevel@tonic-gate 	}
524*0Sstevel@tonic-gate     }
525*0Sstevel@tonic-gate 
526*0Sstevel@tonic-gate     if (retlen)
527*0Sstevel@tonic-gate 	*retlen = expectlen ? expectlen : len;
528*0Sstevel@tonic-gate 
529*0Sstevel@tonic-gate     return 0;
530*0Sstevel@tonic-gate }
531*0Sstevel@tonic-gate 
532*0Sstevel@tonic-gate /*
533*0Sstevel@tonic-gate =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
534*0Sstevel@tonic-gate 
535*0Sstevel@tonic-gate Returns the native character value of the first character in the string C<s>
536*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
537*0Sstevel@tonic-gate length, in bytes, of that character.
538*0Sstevel@tonic-gate 
539*0Sstevel@tonic-gate If C<s> does not point to a well-formed UTF-8 character, zero is
540*0Sstevel@tonic-gate returned and retlen is set, if possible, to -1.
541*0Sstevel@tonic-gate 
542*0Sstevel@tonic-gate =cut
543*0Sstevel@tonic-gate */
544*0Sstevel@tonic-gate 
545*0Sstevel@tonic-gate UV
Perl_utf8_to_uvchr(pTHX_ U8 * s,STRLEN * retlen)546*0Sstevel@tonic-gate Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
547*0Sstevel@tonic-gate {
548*0Sstevel@tonic-gate     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
549*0Sstevel@tonic-gate 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
550*0Sstevel@tonic-gate }
551*0Sstevel@tonic-gate 
552*0Sstevel@tonic-gate /*
553*0Sstevel@tonic-gate =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
554*0Sstevel@tonic-gate 
555*0Sstevel@tonic-gate Returns the Unicode code point of the first character in the string C<s>
556*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
557*0Sstevel@tonic-gate length, in bytes, of that character.
558*0Sstevel@tonic-gate 
559*0Sstevel@tonic-gate This function should only be used when returned UV is considered
560*0Sstevel@tonic-gate an index into the Unicode semantic tables (e.g. swashes).
561*0Sstevel@tonic-gate 
562*0Sstevel@tonic-gate If C<s> does not point to a well-formed UTF-8 character, zero is
563*0Sstevel@tonic-gate returned and retlen is set, if possible, to -1.
564*0Sstevel@tonic-gate 
565*0Sstevel@tonic-gate =cut
566*0Sstevel@tonic-gate */
567*0Sstevel@tonic-gate 
568*0Sstevel@tonic-gate UV
Perl_utf8_to_uvuni(pTHX_ U8 * s,STRLEN * retlen)569*0Sstevel@tonic-gate Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
570*0Sstevel@tonic-gate {
571*0Sstevel@tonic-gate     /* Call the low level routine asking for checks */
572*0Sstevel@tonic-gate     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
573*0Sstevel@tonic-gate 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
574*0Sstevel@tonic-gate }
575*0Sstevel@tonic-gate 
576*0Sstevel@tonic-gate /*
577*0Sstevel@tonic-gate =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
578*0Sstevel@tonic-gate 
579*0Sstevel@tonic-gate Return the length of the UTF-8 char encoded string C<s> in characters.
580*0Sstevel@tonic-gate Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
581*0Sstevel@tonic-gate up past C<e>, croaks.
582*0Sstevel@tonic-gate 
583*0Sstevel@tonic-gate =cut
584*0Sstevel@tonic-gate */
585*0Sstevel@tonic-gate 
586*0Sstevel@tonic-gate STRLEN
Perl_utf8_length(pTHX_ U8 * s,U8 * e)587*0Sstevel@tonic-gate Perl_utf8_length(pTHX_ U8 *s, U8 *e)
588*0Sstevel@tonic-gate {
589*0Sstevel@tonic-gate     STRLEN len = 0;
590*0Sstevel@tonic-gate 
591*0Sstevel@tonic-gate     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
592*0Sstevel@tonic-gate      * the bitops (especially ~) can create illegal UTF-8.
593*0Sstevel@tonic-gate      * In other words: in Perl UTF-8 is not just for Unicode. */
594*0Sstevel@tonic-gate 
595*0Sstevel@tonic-gate     if (e < s) {
596*0Sstevel@tonic-gate         if (ckWARN_d(WARN_UTF8)) {
597*0Sstevel@tonic-gate 	    if (PL_op)
598*0Sstevel@tonic-gate 	        Perl_warner(aTHX_ packWARN(WARN_UTF8),
599*0Sstevel@tonic-gate 			    "%s in %s", unees, OP_DESC(PL_op));
600*0Sstevel@tonic-gate 	    else
601*0Sstevel@tonic-gate 	        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
602*0Sstevel@tonic-gate 	}
603*0Sstevel@tonic-gate 	return 0;
604*0Sstevel@tonic-gate     }
605*0Sstevel@tonic-gate     while (s < e) {
606*0Sstevel@tonic-gate 	U8 t = UTF8SKIP(s);
607*0Sstevel@tonic-gate 
608*0Sstevel@tonic-gate 	if (e - s < t) {
609*0Sstevel@tonic-gate 	    if (ckWARN_d(WARN_UTF8)) {
610*0Sstevel@tonic-gate 	        if (PL_op)
611*0Sstevel@tonic-gate 		    Perl_warner(aTHX_ packWARN(WARN_UTF8),
612*0Sstevel@tonic-gate 				unees, OP_DESC(PL_op));
613*0Sstevel@tonic-gate 		else
614*0Sstevel@tonic-gate 		    Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
615*0Sstevel@tonic-gate 	    }
616*0Sstevel@tonic-gate 	    return len;
617*0Sstevel@tonic-gate 	}
618*0Sstevel@tonic-gate 	s += t;
619*0Sstevel@tonic-gate 	len++;
620*0Sstevel@tonic-gate     }
621*0Sstevel@tonic-gate 
622*0Sstevel@tonic-gate     return len;
623*0Sstevel@tonic-gate }
624*0Sstevel@tonic-gate 
625*0Sstevel@tonic-gate /*
626*0Sstevel@tonic-gate =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
627*0Sstevel@tonic-gate 
628*0Sstevel@tonic-gate Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
629*0Sstevel@tonic-gate and C<b>.
630*0Sstevel@tonic-gate 
631*0Sstevel@tonic-gate WARNING: use only if you *know* that the pointers point inside the
632*0Sstevel@tonic-gate same UTF-8 buffer.
633*0Sstevel@tonic-gate 
634*0Sstevel@tonic-gate =cut
635*0Sstevel@tonic-gate */
636*0Sstevel@tonic-gate 
637*0Sstevel@tonic-gate IV
Perl_utf8_distance(pTHX_ U8 * a,U8 * b)638*0Sstevel@tonic-gate Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
639*0Sstevel@tonic-gate {
640*0Sstevel@tonic-gate     IV off = 0;
641*0Sstevel@tonic-gate 
642*0Sstevel@tonic-gate     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
643*0Sstevel@tonic-gate      * the bitops (especially ~) can create illegal UTF-8.
644*0Sstevel@tonic-gate      * In other words: in Perl UTF-8 is not just for Unicode. */
645*0Sstevel@tonic-gate 
646*0Sstevel@tonic-gate     if (a < b) {
647*0Sstevel@tonic-gate 	while (a < b) {
648*0Sstevel@tonic-gate 	    U8 c = UTF8SKIP(a);
649*0Sstevel@tonic-gate 
650*0Sstevel@tonic-gate 	    if (b - a < c) {
651*0Sstevel@tonic-gate 	        if (ckWARN_d(WARN_UTF8)) {
652*0Sstevel@tonic-gate 		    if (PL_op)
653*0Sstevel@tonic-gate 		        Perl_warner(aTHX_ packWARN(WARN_UTF8),
654*0Sstevel@tonic-gate 				    "%s in %s", unees, OP_DESC(PL_op));
655*0Sstevel@tonic-gate 		    else
656*0Sstevel@tonic-gate 		        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
657*0Sstevel@tonic-gate 		}
658*0Sstevel@tonic-gate 		return off;
659*0Sstevel@tonic-gate 	    }
660*0Sstevel@tonic-gate 	    a += c;
661*0Sstevel@tonic-gate 	    off--;
662*0Sstevel@tonic-gate 	}
663*0Sstevel@tonic-gate     }
664*0Sstevel@tonic-gate     else {
665*0Sstevel@tonic-gate 	while (b < a) {
666*0Sstevel@tonic-gate 	    U8 c = UTF8SKIP(b);
667*0Sstevel@tonic-gate 
668*0Sstevel@tonic-gate 	    if (a - b < c) {
669*0Sstevel@tonic-gate 	        if (ckWARN_d(WARN_UTF8)) {
670*0Sstevel@tonic-gate 		    if (PL_op)
671*0Sstevel@tonic-gate 		        Perl_warner(aTHX_ packWARN(WARN_UTF8),
672*0Sstevel@tonic-gate 				    "%s in %s", unees, OP_DESC(PL_op));
673*0Sstevel@tonic-gate 		    else
674*0Sstevel@tonic-gate 		        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
675*0Sstevel@tonic-gate 		}
676*0Sstevel@tonic-gate 		return off;
677*0Sstevel@tonic-gate 	    }
678*0Sstevel@tonic-gate 	    b += c;
679*0Sstevel@tonic-gate 	    off++;
680*0Sstevel@tonic-gate 	}
681*0Sstevel@tonic-gate     }
682*0Sstevel@tonic-gate 
683*0Sstevel@tonic-gate     return off;
684*0Sstevel@tonic-gate }
685*0Sstevel@tonic-gate 
686*0Sstevel@tonic-gate /*
687*0Sstevel@tonic-gate =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
688*0Sstevel@tonic-gate 
689*0Sstevel@tonic-gate Return the UTF-8 pointer C<s> displaced by C<off> characters, either
690*0Sstevel@tonic-gate forward or backward.
691*0Sstevel@tonic-gate 
692*0Sstevel@tonic-gate WARNING: do not use the following unless you *know* C<off> is within
693*0Sstevel@tonic-gate the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
694*0Sstevel@tonic-gate on the first byte of character or just after the last byte of a character.
695*0Sstevel@tonic-gate 
696*0Sstevel@tonic-gate =cut
697*0Sstevel@tonic-gate */
698*0Sstevel@tonic-gate 
699*0Sstevel@tonic-gate U8 *
Perl_utf8_hop(pTHX_ U8 * s,I32 off)700*0Sstevel@tonic-gate Perl_utf8_hop(pTHX_ U8 *s, I32 off)
701*0Sstevel@tonic-gate {
702*0Sstevel@tonic-gate     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
703*0Sstevel@tonic-gate      * the bitops (especially ~) can create illegal UTF-8.
704*0Sstevel@tonic-gate      * In other words: in Perl UTF-8 is not just for Unicode. */
705*0Sstevel@tonic-gate 
706*0Sstevel@tonic-gate     if (off >= 0) {
707*0Sstevel@tonic-gate 	while (off--)
708*0Sstevel@tonic-gate 	    s += UTF8SKIP(s);
709*0Sstevel@tonic-gate     }
710*0Sstevel@tonic-gate     else {
711*0Sstevel@tonic-gate 	while (off++) {
712*0Sstevel@tonic-gate 	    s--;
713*0Sstevel@tonic-gate 	    while (UTF8_IS_CONTINUATION(*s))
714*0Sstevel@tonic-gate 		s--;
715*0Sstevel@tonic-gate 	}
716*0Sstevel@tonic-gate     }
717*0Sstevel@tonic-gate     return s;
718*0Sstevel@tonic-gate }
719*0Sstevel@tonic-gate 
720*0Sstevel@tonic-gate /*
721*0Sstevel@tonic-gate =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
722*0Sstevel@tonic-gate 
723*0Sstevel@tonic-gate Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
724*0Sstevel@tonic-gate Unlike C<bytes_to_utf8>, this over-writes the original string, and
725*0Sstevel@tonic-gate updates len to contain the new length.
726*0Sstevel@tonic-gate Returns zero on failure, setting C<len> to -1.
727*0Sstevel@tonic-gate 
728*0Sstevel@tonic-gate =cut
729*0Sstevel@tonic-gate */
730*0Sstevel@tonic-gate 
731*0Sstevel@tonic-gate U8 *
Perl_utf8_to_bytes(pTHX_ U8 * s,STRLEN * len)732*0Sstevel@tonic-gate Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
733*0Sstevel@tonic-gate {
734*0Sstevel@tonic-gate     U8 *send;
735*0Sstevel@tonic-gate     U8 *d;
736*0Sstevel@tonic-gate     U8 *save = s;
737*0Sstevel@tonic-gate 
738*0Sstevel@tonic-gate     /* ensure valid UTF-8 and chars < 256 before updating string */
739*0Sstevel@tonic-gate     for (send = s + *len; s < send; ) {
740*0Sstevel@tonic-gate         U8 c = *s++;
741*0Sstevel@tonic-gate 
742*0Sstevel@tonic-gate         if (!UTF8_IS_INVARIANT(c) &&
743*0Sstevel@tonic-gate             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
744*0Sstevel@tonic-gate 	     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
745*0Sstevel@tonic-gate             *len = -1;
746*0Sstevel@tonic-gate             return 0;
747*0Sstevel@tonic-gate         }
748*0Sstevel@tonic-gate     }
749*0Sstevel@tonic-gate 
750*0Sstevel@tonic-gate     d = s = save;
751*0Sstevel@tonic-gate     while (s < send) {
752*0Sstevel@tonic-gate         STRLEN ulen;
753*0Sstevel@tonic-gate         *d++ = (U8)utf8_to_uvchr(s, &ulen);
754*0Sstevel@tonic-gate         s += ulen;
755*0Sstevel@tonic-gate     }
756*0Sstevel@tonic-gate     *d = '\0';
757*0Sstevel@tonic-gate     *len = d - save;
758*0Sstevel@tonic-gate     return save;
759*0Sstevel@tonic-gate }
760*0Sstevel@tonic-gate 
761*0Sstevel@tonic-gate /*
762*0Sstevel@tonic-gate =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
763*0Sstevel@tonic-gate 
764*0Sstevel@tonic-gate Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
765*0Sstevel@tonic-gate Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
766*0Sstevel@tonic-gate the newly-created string, and updates C<len> to contain the new
767*0Sstevel@tonic-gate length.  Returns the original string if no conversion occurs, C<len>
768*0Sstevel@tonic-gate is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
769*0Sstevel@tonic-gate 0 if C<s> is converted or contains all 7bit characters.
770*0Sstevel@tonic-gate 
771*0Sstevel@tonic-gate =cut
772*0Sstevel@tonic-gate */
773*0Sstevel@tonic-gate 
774*0Sstevel@tonic-gate U8 *
Perl_bytes_from_utf8(pTHX_ U8 * s,STRLEN * len,bool * is_utf8)775*0Sstevel@tonic-gate Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
776*0Sstevel@tonic-gate {
777*0Sstevel@tonic-gate     U8 *d;
778*0Sstevel@tonic-gate     U8 *start = s;
779*0Sstevel@tonic-gate     U8 *send;
780*0Sstevel@tonic-gate     I32 count = 0;
781*0Sstevel@tonic-gate 
782*0Sstevel@tonic-gate     if (!*is_utf8)
783*0Sstevel@tonic-gate 	return start;
784*0Sstevel@tonic-gate 
785*0Sstevel@tonic-gate     /* ensure valid UTF-8 and chars < 256 before converting string */
786*0Sstevel@tonic-gate     for (send = s + *len; s < send;) {
787*0Sstevel@tonic-gate 	U8 c = *s++;
788*0Sstevel@tonic-gate 	if (!UTF8_IS_INVARIANT(c)) {
789*0Sstevel@tonic-gate 	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
790*0Sstevel@tonic-gate                 (c = *s++) && UTF8_IS_CONTINUATION(c))
791*0Sstevel@tonic-gate 		count++;
792*0Sstevel@tonic-gate 	    else
793*0Sstevel@tonic-gate 		return start;
794*0Sstevel@tonic-gate 	}
795*0Sstevel@tonic-gate     }
796*0Sstevel@tonic-gate 
797*0Sstevel@tonic-gate     *is_utf8 = 0;
798*0Sstevel@tonic-gate 
799*0Sstevel@tonic-gate     Newz(801, d, (*len) - count + 1, U8);
800*0Sstevel@tonic-gate     s = start; start = d;
801*0Sstevel@tonic-gate     while (s < send) {
802*0Sstevel@tonic-gate 	U8 c = *s++;
803*0Sstevel@tonic-gate 	if (!UTF8_IS_INVARIANT(c)) {
804*0Sstevel@tonic-gate 	    /* Then it is two-byte encoded */
805*0Sstevel@tonic-gate 	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
806*0Sstevel@tonic-gate 	    c = ASCII_TO_NATIVE(c);
807*0Sstevel@tonic-gate 	}
808*0Sstevel@tonic-gate 	*d++ = c;
809*0Sstevel@tonic-gate     }
810*0Sstevel@tonic-gate     *d = '\0';
811*0Sstevel@tonic-gate     *len = d - start;
812*0Sstevel@tonic-gate     return start;
813*0Sstevel@tonic-gate }
814*0Sstevel@tonic-gate 
815*0Sstevel@tonic-gate /*
816*0Sstevel@tonic-gate =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
817*0Sstevel@tonic-gate 
818*0Sstevel@tonic-gate Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
819*0Sstevel@tonic-gate Returns a pointer to the newly-created string, and sets C<len> to
820*0Sstevel@tonic-gate reflect the new length.
821*0Sstevel@tonic-gate 
822*0Sstevel@tonic-gate If you want to convert to UTF-8 from other encodings than ASCII,
823*0Sstevel@tonic-gate see sv_recode_to_utf8().
824*0Sstevel@tonic-gate 
825*0Sstevel@tonic-gate =cut
826*0Sstevel@tonic-gate */
827*0Sstevel@tonic-gate 
828*0Sstevel@tonic-gate U8*
Perl_bytes_to_utf8(pTHX_ U8 * s,STRLEN * len)829*0Sstevel@tonic-gate Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
830*0Sstevel@tonic-gate {
831*0Sstevel@tonic-gate     U8 *send;
832*0Sstevel@tonic-gate     U8 *d;
833*0Sstevel@tonic-gate     U8 *dst;
834*0Sstevel@tonic-gate     send = s + (*len);
835*0Sstevel@tonic-gate 
836*0Sstevel@tonic-gate     Newz(801, d, (*len) * 2 + 1, U8);
837*0Sstevel@tonic-gate     dst = d;
838*0Sstevel@tonic-gate 
839*0Sstevel@tonic-gate     while (s < send) {
840*0Sstevel@tonic-gate         UV uv = NATIVE_TO_ASCII(*s++);
841*0Sstevel@tonic-gate         if (UNI_IS_INVARIANT(uv))
842*0Sstevel@tonic-gate             *d++ = (U8)UTF_TO_NATIVE(uv);
843*0Sstevel@tonic-gate         else {
844*0Sstevel@tonic-gate             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
845*0Sstevel@tonic-gate             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
846*0Sstevel@tonic-gate         }
847*0Sstevel@tonic-gate     }
848*0Sstevel@tonic-gate     *d = '\0';
849*0Sstevel@tonic-gate     *len = d-dst;
850*0Sstevel@tonic-gate     return dst;
851*0Sstevel@tonic-gate }
852*0Sstevel@tonic-gate 
853*0Sstevel@tonic-gate /*
854*0Sstevel@tonic-gate  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
855*0Sstevel@tonic-gate  *
856*0Sstevel@tonic-gate  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
857*0Sstevel@tonic-gate  * We optimize for native, for obvious reasons. */
858*0Sstevel@tonic-gate 
859*0Sstevel@tonic-gate U8*
Perl_utf16_to_utf8(pTHX_ U8 * p,U8 * d,I32 bytelen,I32 * newlen)860*0Sstevel@tonic-gate Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
861*0Sstevel@tonic-gate {
862*0Sstevel@tonic-gate     U8* pend;
863*0Sstevel@tonic-gate     U8* dstart = d;
864*0Sstevel@tonic-gate 
865*0Sstevel@tonic-gate     if (bytelen & 1)
866*0Sstevel@tonic-gate 	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
867*0Sstevel@tonic-gate 
868*0Sstevel@tonic-gate     pend = p + bytelen;
869*0Sstevel@tonic-gate 
870*0Sstevel@tonic-gate     while (p < pend) {
871*0Sstevel@tonic-gate 	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
872*0Sstevel@tonic-gate 	p += 2;
873*0Sstevel@tonic-gate 	if (uv < 0x80) {
874*0Sstevel@tonic-gate 	    *d++ = (U8)uv;
875*0Sstevel@tonic-gate 	    continue;
876*0Sstevel@tonic-gate 	}
877*0Sstevel@tonic-gate 	if (uv < 0x800) {
878*0Sstevel@tonic-gate 	    *d++ = (U8)(( uv >>  6)         | 0xc0);
879*0Sstevel@tonic-gate 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
880*0Sstevel@tonic-gate 	    continue;
881*0Sstevel@tonic-gate 	}
882*0Sstevel@tonic-gate 	if (uv >= 0xd800 && uv < 0xdbff) {	/* surrogates */
883*0Sstevel@tonic-gate 	    UV low = (p[0] << 8) + p[1];
884*0Sstevel@tonic-gate 	    p += 2;
885*0Sstevel@tonic-gate 	    if (low < 0xdc00 || low >= 0xdfff)
886*0Sstevel@tonic-gate 		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
887*0Sstevel@tonic-gate 	    uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
888*0Sstevel@tonic-gate 	}
889*0Sstevel@tonic-gate 	if (uv < 0x10000) {
890*0Sstevel@tonic-gate 	    *d++ = (U8)(( uv >> 12)         | 0xe0);
891*0Sstevel@tonic-gate 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
892*0Sstevel@tonic-gate 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
893*0Sstevel@tonic-gate 	    continue;
894*0Sstevel@tonic-gate 	}
895*0Sstevel@tonic-gate 	else {
896*0Sstevel@tonic-gate 	    *d++ = (U8)(( uv >> 18)         | 0xf0);
897*0Sstevel@tonic-gate 	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
898*0Sstevel@tonic-gate 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
899*0Sstevel@tonic-gate 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
900*0Sstevel@tonic-gate 	    continue;
901*0Sstevel@tonic-gate 	}
902*0Sstevel@tonic-gate     }
903*0Sstevel@tonic-gate     *newlen = d - dstart;
904*0Sstevel@tonic-gate     return d;
905*0Sstevel@tonic-gate }
906*0Sstevel@tonic-gate 
907*0Sstevel@tonic-gate /* Note: this one is slightly destructive of the source. */
908*0Sstevel@tonic-gate 
909*0Sstevel@tonic-gate U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8 * p,U8 * d,I32 bytelen,I32 * newlen)910*0Sstevel@tonic-gate Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
911*0Sstevel@tonic-gate {
912*0Sstevel@tonic-gate     U8* s = (U8*)p;
913*0Sstevel@tonic-gate     U8* send = s + bytelen;
914*0Sstevel@tonic-gate     while (s < send) {
915*0Sstevel@tonic-gate 	U8 tmp = s[0];
916*0Sstevel@tonic-gate 	s[0] = s[1];
917*0Sstevel@tonic-gate 	s[1] = tmp;
918*0Sstevel@tonic-gate 	s += 2;
919*0Sstevel@tonic-gate     }
920*0Sstevel@tonic-gate     return utf16_to_utf8(p, d, bytelen, newlen);
921*0Sstevel@tonic-gate }
922*0Sstevel@tonic-gate 
923*0Sstevel@tonic-gate /* for now these are all defined (inefficiently) in terms of the utf8 versions */
924*0Sstevel@tonic-gate 
925*0Sstevel@tonic-gate bool
Perl_is_uni_alnum(pTHX_ UV c)926*0Sstevel@tonic-gate Perl_is_uni_alnum(pTHX_ UV c)
927*0Sstevel@tonic-gate {
928*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
929*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
930*0Sstevel@tonic-gate     return is_utf8_alnum(tmpbuf);
931*0Sstevel@tonic-gate }
932*0Sstevel@tonic-gate 
933*0Sstevel@tonic-gate bool
Perl_is_uni_alnumc(pTHX_ UV c)934*0Sstevel@tonic-gate Perl_is_uni_alnumc(pTHX_ UV c)
935*0Sstevel@tonic-gate {
936*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
937*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
938*0Sstevel@tonic-gate     return is_utf8_alnumc(tmpbuf);
939*0Sstevel@tonic-gate }
940*0Sstevel@tonic-gate 
941*0Sstevel@tonic-gate bool
Perl_is_uni_idfirst(pTHX_ UV c)942*0Sstevel@tonic-gate Perl_is_uni_idfirst(pTHX_ UV c)
943*0Sstevel@tonic-gate {
944*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
945*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
946*0Sstevel@tonic-gate     return is_utf8_idfirst(tmpbuf);
947*0Sstevel@tonic-gate }
948*0Sstevel@tonic-gate 
949*0Sstevel@tonic-gate bool
Perl_is_uni_alpha(pTHX_ UV c)950*0Sstevel@tonic-gate Perl_is_uni_alpha(pTHX_ UV c)
951*0Sstevel@tonic-gate {
952*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
953*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
954*0Sstevel@tonic-gate     return is_utf8_alpha(tmpbuf);
955*0Sstevel@tonic-gate }
956*0Sstevel@tonic-gate 
957*0Sstevel@tonic-gate bool
Perl_is_uni_ascii(pTHX_ UV c)958*0Sstevel@tonic-gate Perl_is_uni_ascii(pTHX_ UV c)
959*0Sstevel@tonic-gate {
960*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
961*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
962*0Sstevel@tonic-gate     return is_utf8_ascii(tmpbuf);
963*0Sstevel@tonic-gate }
964*0Sstevel@tonic-gate 
965*0Sstevel@tonic-gate bool
Perl_is_uni_space(pTHX_ UV c)966*0Sstevel@tonic-gate Perl_is_uni_space(pTHX_ UV c)
967*0Sstevel@tonic-gate {
968*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
969*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
970*0Sstevel@tonic-gate     return is_utf8_space(tmpbuf);
971*0Sstevel@tonic-gate }
972*0Sstevel@tonic-gate 
973*0Sstevel@tonic-gate bool
Perl_is_uni_digit(pTHX_ UV c)974*0Sstevel@tonic-gate Perl_is_uni_digit(pTHX_ UV c)
975*0Sstevel@tonic-gate {
976*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
977*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
978*0Sstevel@tonic-gate     return is_utf8_digit(tmpbuf);
979*0Sstevel@tonic-gate }
980*0Sstevel@tonic-gate 
981*0Sstevel@tonic-gate bool
Perl_is_uni_upper(pTHX_ UV c)982*0Sstevel@tonic-gate Perl_is_uni_upper(pTHX_ UV c)
983*0Sstevel@tonic-gate {
984*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
985*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
986*0Sstevel@tonic-gate     return is_utf8_upper(tmpbuf);
987*0Sstevel@tonic-gate }
988*0Sstevel@tonic-gate 
989*0Sstevel@tonic-gate bool
Perl_is_uni_lower(pTHX_ UV c)990*0Sstevel@tonic-gate Perl_is_uni_lower(pTHX_ UV c)
991*0Sstevel@tonic-gate {
992*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
993*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
994*0Sstevel@tonic-gate     return is_utf8_lower(tmpbuf);
995*0Sstevel@tonic-gate }
996*0Sstevel@tonic-gate 
997*0Sstevel@tonic-gate bool
Perl_is_uni_cntrl(pTHX_ UV c)998*0Sstevel@tonic-gate Perl_is_uni_cntrl(pTHX_ UV c)
999*0Sstevel@tonic-gate {
1000*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
1001*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
1002*0Sstevel@tonic-gate     return is_utf8_cntrl(tmpbuf);
1003*0Sstevel@tonic-gate }
1004*0Sstevel@tonic-gate 
1005*0Sstevel@tonic-gate bool
Perl_is_uni_graph(pTHX_ UV c)1006*0Sstevel@tonic-gate Perl_is_uni_graph(pTHX_ UV c)
1007*0Sstevel@tonic-gate {
1008*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
1009*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
1010*0Sstevel@tonic-gate     return is_utf8_graph(tmpbuf);
1011*0Sstevel@tonic-gate }
1012*0Sstevel@tonic-gate 
1013*0Sstevel@tonic-gate bool
Perl_is_uni_print(pTHX_ UV c)1014*0Sstevel@tonic-gate Perl_is_uni_print(pTHX_ UV c)
1015*0Sstevel@tonic-gate {
1016*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
1017*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
1018*0Sstevel@tonic-gate     return is_utf8_print(tmpbuf);
1019*0Sstevel@tonic-gate }
1020*0Sstevel@tonic-gate 
1021*0Sstevel@tonic-gate bool
Perl_is_uni_punct(pTHX_ UV c)1022*0Sstevel@tonic-gate Perl_is_uni_punct(pTHX_ UV c)
1023*0Sstevel@tonic-gate {
1024*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN+1];
1025*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
1026*0Sstevel@tonic-gate     return is_utf8_punct(tmpbuf);
1027*0Sstevel@tonic-gate }
1028*0Sstevel@tonic-gate 
1029*0Sstevel@tonic-gate bool
Perl_is_uni_xdigit(pTHX_ UV c)1030*0Sstevel@tonic-gate Perl_is_uni_xdigit(pTHX_ UV c)
1031*0Sstevel@tonic-gate {
1032*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1033*0Sstevel@tonic-gate     uvchr_to_utf8(tmpbuf, c);
1034*0Sstevel@tonic-gate     return is_utf8_xdigit(tmpbuf);
1035*0Sstevel@tonic-gate }
1036*0Sstevel@tonic-gate 
1037*0Sstevel@tonic-gate UV
Perl_to_uni_upper(pTHX_ UV c,U8 * p,STRLEN * lenp)1038*0Sstevel@tonic-gate Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1039*0Sstevel@tonic-gate {
1040*0Sstevel@tonic-gate     uvchr_to_utf8(p, c);
1041*0Sstevel@tonic-gate     return to_utf8_upper(p, p, lenp);
1042*0Sstevel@tonic-gate }
1043*0Sstevel@tonic-gate 
1044*0Sstevel@tonic-gate UV
Perl_to_uni_title(pTHX_ UV c,U8 * p,STRLEN * lenp)1045*0Sstevel@tonic-gate Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1046*0Sstevel@tonic-gate {
1047*0Sstevel@tonic-gate     uvchr_to_utf8(p, c);
1048*0Sstevel@tonic-gate     return to_utf8_title(p, p, lenp);
1049*0Sstevel@tonic-gate }
1050*0Sstevel@tonic-gate 
1051*0Sstevel@tonic-gate UV
Perl_to_uni_lower(pTHX_ UV c,U8 * p,STRLEN * lenp)1052*0Sstevel@tonic-gate Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1053*0Sstevel@tonic-gate {
1054*0Sstevel@tonic-gate     uvchr_to_utf8(p, c);
1055*0Sstevel@tonic-gate     return to_utf8_lower(p, p, lenp);
1056*0Sstevel@tonic-gate }
1057*0Sstevel@tonic-gate 
1058*0Sstevel@tonic-gate UV
Perl_to_uni_fold(pTHX_ UV c,U8 * p,STRLEN * lenp)1059*0Sstevel@tonic-gate Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1060*0Sstevel@tonic-gate {
1061*0Sstevel@tonic-gate     uvchr_to_utf8(p, c);
1062*0Sstevel@tonic-gate     return to_utf8_fold(p, p, lenp);
1063*0Sstevel@tonic-gate }
1064*0Sstevel@tonic-gate 
1065*0Sstevel@tonic-gate /* for now these all assume no locale info available for Unicode > 255 */
1066*0Sstevel@tonic-gate 
1067*0Sstevel@tonic-gate bool
Perl_is_uni_alnum_lc(pTHX_ UV c)1068*0Sstevel@tonic-gate Perl_is_uni_alnum_lc(pTHX_ UV c)
1069*0Sstevel@tonic-gate {
1070*0Sstevel@tonic-gate     return is_uni_alnum(c);	/* XXX no locale support yet */
1071*0Sstevel@tonic-gate }
1072*0Sstevel@tonic-gate 
1073*0Sstevel@tonic-gate bool
Perl_is_uni_alnumc_lc(pTHX_ UV c)1074*0Sstevel@tonic-gate Perl_is_uni_alnumc_lc(pTHX_ UV c)
1075*0Sstevel@tonic-gate {
1076*0Sstevel@tonic-gate     return is_uni_alnumc(c);	/* XXX no locale support yet */
1077*0Sstevel@tonic-gate }
1078*0Sstevel@tonic-gate 
1079*0Sstevel@tonic-gate bool
Perl_is_uni_idfirst_lc(pTHX_ UV c)1080*0Sstevel@tonic-gate Perl_is_uni_idfirst_lc(pTHX_ UV c)
1081*0Sstevel@tonic-gate {
1082*0Sstevel@tonic-gate     return is_uni_idfirst(c);	/* XXX no locale support yet */
1083*0Sstevel@tonic-gate }
1084*0Sstevel@tonic-gate 
1085*0Sstevel@tonic-gate bool
Perl_is_uni_alpha_lc(pTHX_ UV c)1086*0Sstevel@tonic-gate Perl_is_uni_alpha_lc(pTHX_ UV c)
1087*0Sstevel@tonic-gate {
1088*0Sstevel@tonic-gate     return is_uni_alpha(c);	/* XXX no locale support yet */
1089*0Sstevel@tonic-gate }
1090*0Sstevel@tonic-gate 
1091*0Sstevel@tonic-gate bool
Perl_is_uni_ascii_lc(pTHX_ UV c)1092*0Sstevel@tonic-gate Perl_is_uni_ascii_lc(pTHX_ UV c)
1093*0Sstevel@tonic-gate {
1094*0Sstevel@tonic-gate     return is_uni_ascii(c);	/* XXX no locale support yet */
1095*0Sstevel@tonic-gate }
1096*0Sstevel@tonic-gate 
1097*0Sstevel@tonic-gate bool
Perl_is_uni_space_lc(pTHX_ UV c)1098*0Sstevel@tonic-gate Perl_is_uni_space_lc(pTHX_ UV c)
1099*0Sstevel@tonic-gate {
1100*0Sstevel@tonic-gate     return is_uni_space(c);	/* XXX no locale support yet */
1101*0Sstevel@tonic-gate }
1102*0Sstevel@tonic-gate 
1103*0Sstevel@tonic-gate bool
Perl_is_uni_digit_lc(pTHX_ UV c)1104*0Sstevel@tonic-gate Perl_is_uni_digit_lc(pTHX_ UV c)
1105*0Sstevel@tonic-gate {
1106*0Sstevel@tonic-gate     return is_uni_digit(c);	/* XXX no locale support yet */
1107*0Sstevel@tonic-gate }
1108*0Sstevel@tonic-gate 
1109*0Sstevel@tonic-gate bool
Perl_is_uni_upper_lc(pTHX_ UV c)1110*0Sstevel@tonic-gate Perl_is_uni_upper_lc(pTHX_ UV c)
1111*0Sstevel@tonic-gate {
1112*0Sstevel@tonic-gate     return is_uni_upper(c);	/* XXX no locale support yet */
1113*0Sstevel@tonic-gate }
1114*0Sstevel@tonic-gate 
1115*0Sstevel@tonic-gate bool
Perl_is_uni_lower_lc(pTHX_ UV c)1116*0Sstevel@tonic-gate Perl_is_uni_lower_lc(pTHX_ UV c)
1117*0Sstevel@tonic-gate {
1118*0Sstevel@tonic-gate     return is_uni_lower(c);	/* XXX no locale support yet */
1119*0Sstevel@tonic-gate }
1120*0Sstevel@tonic-gate 
1121*0Sstevel@tonic-gate bool
Perl_is_uni_cntrl_lc(pTHX_ UV c)1122*0Sstevel@tonic-gate Perl_is_uni_cntrl_lc(pTHX_ UV c)
1123*0Sstevel@tonic-gate {
1124*0Sstevel@tonic-gate     return is_uni_cntrl(c);	/* XXX no locale support yet */
1125*0Sstevel@tonic-gate }
1126*0Sstevel@tonic-gate 
1127*0Sstevel@tonic-gate bool
Perl_is_uni_graph_lc(pTHX_ UV c)1128*0Sstevel@tonic-gate Perl_is_uni_graph_lc(pTHX_ UV c)
1129*0Sstevel@tonic-gate {
1130*0Sstevel@tonic-gate     return is_uni_graph(c);	/* XXX no locale support yet */
1131*0Sstevel@tonic-gate }
1132*0Sstevel@tonic-gate 
1133*0Sstevel@tonic-gate bool
Perl_is_uni_print_lc(pTHX_ UV c)1134*0Sstevel@tonic-gate Perl_is_uni_print_lc(pTHX_ UV c)
1135*0Sstevel@tonic-gate {
1136*0Sstevel@tonic-gate     return is_uni_print(c);	/* XXX no locale support yet */
1137*0Sstevel@tonic-gate }
1138*0Sstevel@tonic-gate 
1139*0Sstevel@tonic-gate bool
Perl_is_uni_punct_lc(pTHX_ UV c)1140*0Sstevel@tonic-gate Perl_is_uni_punct_lc(pTHX_ UV c)
1141*0Sstevel@tonic-gate {
1142*0Sstevel@tonic-gate     return is_uni_punct(c);	/* XXX no locale support yet */
1143*0Sstevel@tonic-gate }
1144*0Sstevel@tonic-gate 
1145*0Sstevel@tonic-gate bool
Perl_is_uni_xdigit_lc(pTHX_ UV c)1146*0Sstevel@tonic-gate Perl_is_uni_xdigit_lc(pTHX_ UV c)
1147*0Sstevel@tonic-gate {
1148*0Sstevel@tonic-gate     return is_uni_xdigit(c);	/* XXX no locale support yet */
1149*0Sstevel@tonic-gate }
1150*0Sstevel@tonic-gate 
1151*0Sstevel@tonic-gate U32
Perl_to_uni_upper_lc(pTHX_ U32 c)1152*0Sstevel@tonic-gate Perl_to_uni_upper_lc(pTHX_ U32 c)
1153*0Sstevel@tonic-gate {
1154*0Sstevel@tonic-gate     /* XXX returns only the first character -- do not use XXX */
1155*0Sstevel@tonic-gate     /* XXX no locale support yet */
1156*0Sstevel@tonic-gate     STRLEN len;
1157*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1158*0Sstevel@tonic-gate     return (U32)to_uni_upper(c, tmpbuf, &len);
1159*0Sstevel@tonic-gate }
1160*0Sstevel@tonic-gate 
1161*0Sstevel@tonic-gate U32
Perl_to_uni_title_lc(pTHX_ U32 c)1162*0Sstevel@tonic-gate Perl_to_uni_title_lc(pTHX_ U32 c)
1163*0Sstevel@tonic-gate {
1164*0Sstevel@tonic-gate     /* XXX returns only the first character XXX -- do not use XXX */
1165*0Sstevel@tonic-gate     /* XXX no locale support yet */
1166*0Sstevel@tonic-gate     STRLEN len;
1167*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1168*0Sstevel@tonic-gate     return (U32)to_uni_title(c, tmpbuf, &len);
1169*0Sstevel@tonic-gate }
1170*0Sstevel@tonic-gate 
1171*0Sstevel@tonic-gate U32
Perl_to_uni_lower_lc(pTHX_ U32 c)1172*0Sstevel@tonic-gate Perl_to_uni_lower_lc(pTHX_ U32 c)
1173*0Sstevel@tonic-gate {
1174*0Sstevel@tonic-gate     /* XXX returns only the first character -- do not use XXX */
1175*0Sstevel@tonic-gate     /* XXX no locale support yet */
1176*0Sstevel@tonic-gate     STRLEN len;
1177*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1178*0Sstevel@tonic-gate     return (U32)to_uni_lower(c, tmpbuf, &len);
1179*0Sstevel@tonic-gate }
1180*0Sstevel@tonic-gate 
1181*0Sstevel@tonic-gate bool
Perl_is_utf8_alnum(pTHX_ U8 * p)1182*0Sstevel@tonic-gate Perl_is_utf8_alnum(pTHX_ U8 *p)
1183*0Sstevel@tonic-gate {
1184*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1185*0Sstevel@tonic-gate 	return FALSE;
1186*0Sstevel@tonic-gate     if (!PL_utf8_alnum)
1187*0Sstevel@tonic-gate 	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1188*0Sstevel@tonic-gate 	 * descendant of isalnum(3), in other words, it doesn't
1189*0Sstevel@tonic-gate 	 * contain the '_'. --jhi */
1190*0Sstevel@tonic-gate 	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1191*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1192*0Sstevel@tonic-gate /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1193*0Sstevel@tonic-gate #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1194*0Sstevel@tonic-gate     if (!PL_utf8_alnum)
1195*0Sstevel@tonic-gate 	PL_utf8_alnum = swash_init("utf8", "",
1196*0Sstevel@tonic-gate 	    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1197*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1198*0Sstevel@tonic-gate #endif
1199*0Sstevel@tonic-gate }
1200*0Sstevel@tonic-gate 
1201*0Sstevel@tonic-gate bool
Perl_is_utf8_alnumc(pTHX_ U8 * p)1202*0Sstevel@tonic-gate Perl_is_utf8_alnumc(pTHX_ U8 *p)
1203*0Sstevel@tonic-gate {
1204*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1205*0Sstevel@tonic-gate 	return FALSE;
1206*0Sstevel@tonic-gate     if (!PL_utf8_alnum)
1207*0Sstevel@tonic-gate 	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1208*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1209*0Sstevel@tonic-gate /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1210*0Sstevel@tonic-gate #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1211*0Sstevel@tonic-gate     if (!PL_utf8_alnum)
1212*0Sstevel@tonic-gate 	PL_utf8_alnum = swash_init("utf8", "",
1213*0Sstevel@tonic-gate 	    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1214*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1215*0Sstevel@tonic-gate #endif
1216*0Sstevel@tonic-gate }
1217*0Sstevel@tonic-gate 
1218*0Sstevel@tonic-gate bool
Perl_is_utf8_idfirst(pTHX_ U8 * p)1219*0Sstevel@tonic-gate Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1220*0Sstevel@tonic-gate {
1221*0Sstevel@tonic-gate     if (*p == '_')
1222*0Sstevel@tonic-gate 	return TRUE;
1223*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1224*0Sstevel@tonic-gate 	return FALSE;
1225*0Sstevel@tonic-gate     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1226*0Sstevel@tonic-gate 	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1227*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1228*0Sstevel@tonic-gate }
1229*0Sstevel@tonic-gate 
1230*0Sstevel@tonic-gate bool
Perl_is_utf8_idcont(pTHX_ U8 * p)1231*0Sstevel@tonic-gate Perl_is_utf8_idcont(pTHX_ U8 *p)
1232*0Sstevel@tonic-gate {
1233*0Sstevel@tonic-gate     if (*p == '_')
1234*0Sstevel@tonic-gate 	return TRUE;
1235*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1236*0Sstevel@tonic-gate 	return FALSE;
1237*0Sstevel@tonic-gate     if (!PL_utf8_idcont)
1238*0Sstevel@tonic-gate 	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1239*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1240*0Sstevel@tonic-gate }
1241*0Sstevel@tonic-gate 
1242*0Sstevel@tonic-gate bool
Perl_is_utf8_alpha(pTHX_ U8 * p)1243*0Sstevel@tonic-gate Perl_is_utf8_alpha(pTHX_ U8 *p)
1244*0Sstevel@tonic-gate {
1245*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1246*0Sstevel@tonic-gate 	return FALSE;
1247*0Sstevel@tonic-gate     if (!PL_utf8_alpha)
1248*0Sstevel@tonic-gate 	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1249*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1250*0Sstevel@tonic-gate }
1251*0Sstevel@tonic-gate 
1252*0Sstevel@tonic-gate bool
Perl_is_utf8_ascii(pTHX_ U8 * p)1253*0Sstevel@tonic-gate Perl_is_utf8_ascii(pTHX_ U8 *p)
1254*0Sstevel@tonic-gate {
1255*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1256*0Sstevel@tonic-gate 	return FALSE;
1257*0Sstevel@tonic-gate     if (!PL_utf8_ascii)
1258*0Sstevel@tonic-gate 	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1259*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1260*0Sstevel@tonic-gate }
1261*0Sstevel@tonic-gate 
1262*0Sstevel@tonic-gate bool
Perl_is_utf8_space(pTHX_ U8 * p)1263*0Sstevel@tonic-gate Perl_is_utf8_space(pTHX_ U8 *p)
1264*0Sstevel@tonic-gate {
1265*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1266*0Sstevel@tonic-gate 	return FALSE;
1267*0Sstevel@tonic-gate     if (!PL_utf8_space)
1268*0Sstevel@tonic-gate 	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1269*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1270*0Sstevel@tonic-gate }
1271*0Sstevel@tonic-gate 
1272*0Sstevel@tonic-gate bool
Perl_is_utf8_digit(pTHX_ U8 * p)1273*0Sstevel@tonic-gate Perl_is_utf8_digit(pTHX_ U8 *p)
1274*0Sstevel@tonic-gate {
1275*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1276*0Sstevel@tonic-gate 	return FALSE;
1277*0Sstevel@tonic-gate     if (!PL_utf8_digit)
1278*0Sstevel@tonic-gate 	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1279*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1280*0Sstevel@tonic-gate }
1281*0Sstevel@tonic-gate 
1282*0Sstevel@tonic-gate bool
Perl_is_utf8_upper(pTHX_ U8 * p)1283*0Sstevel@tonic-gate Perl_is_utf8_upper(pTHX_ U8 *p)
1284*0Sstevel@tonic-gate {
1285*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1286*0Sstevel@tonic-gate 	return FALSE;
1287*0Sstevel@tonic-gate     if (!PL_utf8_upper)
1288*0Sstevel@tonic-gate 	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1289*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1290*0Sstevel@tonic-gate }
1291*0Sstevel@tonic-gate 
1292*0Sstevel@tonic-gate bool
Perl_is_utf8_lower(pTHX_ U8 * p)1293*0Sstevel@tonic-gate Perl_is_utf8_lower(pTHX_ U8 *p)
1294*0Sstevel@tonic-gate {
1295*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1296*0Sstevel@tonic-gate 	return FALSE;
1297*0Sstevel@tonic-gate     if (!PL_utf8_lower)
1298*0Sstevel@tonic-gate 	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1299*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1300*0Sstevel@tonic-gate }
1301*0Sstevel@tonic-gate 
1302*0Sstevel@tonic-gate bool
Perl_is_utf8_cntrl(pTHX_ U8 * p)1303*0Sstevel@tonic-gate Perl_is_utf8_cntrl(pTHX_ U8 *p)
1304*0Sstevel@tonic-gate {
1305*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1306*0Sstevel@tonic-gate 	return FALSE;
1307*0Sstevel@tonic-gate     if (!PL_utf8_cntrl)
1308*0Sstevel@tonic-gate 	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1309*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1310*0Sstevel@tonic-gate }
1311*0Sstevel@tonic-gate 
1312*0Sstevel@tonic-gate bool
Perl_is_utf8_graph(pTHX_ U8 * p)1313*0Sstevel@tonic-gate Perl_is_utf8_graph(pTHX_ U8 *p)
1314*0Sstevel@tonic-gate {
1315*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1316*0Sstevel@tonic-gate 	return FALSE;
1317*0Sstevel@tonic-gate     if (!PL_utf8_graph)
1318*0Sstevel@tonic-gate 	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1319*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1320*0Sstevel@tonic-gate }
1321*0Sstevel@tonic-gate 
1322*0Sstevel@tonic-gate bool
Perl_is_utf8_print(pTHX_ U8 * p)1323*0Sstevel@tonic-gate Perl_is_utf8_print(pTHX_ U8 *p)
1324*0Sstevel@tonic-gate {
1325*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1326*0Sstevel@tonic-gate 	return FALSE;
1327*0Sstevel@tonic-gate     if (!PL_utf8_print)
1328*0Sstevel@tonic-gate 	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1329*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1330*0Sstevel@tonic-gate }
1331*0Sstevel@tonic-gate 
1332*0Sstevel@tonic-gate bool
Perl_is_utf8_punct(pTHX_ U8 * p)1333*0Sstevel@tonic-gate Perl_is_utf8_punct(pTHX_ U8 *p)
1334*0Sstevel@tonic-gate {
1335*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1336*0Sstevel@tonic-gate 	return FALSE;
1337*0Sstevel@tonic-gate     if (!PL_utf8_punct)
1338*0Sstevel@tonic-gate 	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1339*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1340*0Sstevel@tonic-gate }
1341*0Sstevel@tonic-gate 
1342*0Sstevel@tonic-gate bool
Perl_is_utf8_xdigit(pTHX_ U8 * p)1343*0Sstevel@tonic-gate Perl_is_utf8_xdigit(pTHX_ U8 *p)
1344*0Sstevel@tonic-gate {
1345*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1346*0Sstevel@tonic-gate 	return FALSE;
1347*0Sstevel@tonic-gate     if (!PL_utf8_xdigit)
1348*0Sstevel@tonic-gate 	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1349*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1350*0Sstevel@tonic-gate }
1351*0Sstevel@tonic-gate 
1352*0Sstevel@tonic-gate bool
Perl_is_utf8_mark(pTHX_ U8 * p)1353*0Sstevel@tonic-gate Perl_is_utf8_mark(pTHX_ U8 *p)
1354*0Sstevel@tonic-gate {
1355*0Sstevel@tonic-gate     if (!is_utf8_char(p))
1356*0Sstevel@tonic-gate 	return FALSE;
1357*0Sstevel@tonic-gate     if (!PL_utf8_mark)
1358*0Sstevel@tonic-gate 	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1359*0Sstevel@tonic-gate     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1360*0Sstevel@tonic-gate }
1361*0Sstevel@tonic-gate 
1362*0Sstevel@tonic-gate /*
1363*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1364*0Sstevel@tonic-gate 
1365*0Sstevel@tonic-gate The "p" contains the pointer to the UTF-8 string encoding
1366*0Sstevel@tonic-gate the character that is being converted.
1367*0Sstevel@tonic-gate 
1368*0Sstevel@tonic-gate The "ustrp" is a pointer to the character buffer to put the
1369*0Sstevel@tonic-gate conversion result to.  The "lenp" is a pointer to the length
1370*0Sstevel@tonic-gate of the result.
1371*0Sstevel@tonic-gate 
1372*0Sstevel@tonic-gate The "swashp" is a pointer to the swash to use.
1373*0Sstevel@tonic-gate 
1374*0Sstevel@tonic-gate Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1375*0Sstevel@tonic-gate and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1376*0Sstevel@tonic-gate but not always, a multicharacter mapping), is tried first.
1377*0Sstevel@tonic-gate 
1378*0Sstevel@tonic-gate The "special" is a string like "utf8::ToSpecLower", which means the
1379*0Sstevel@tonic-gate hash %utf8::ToSpecLower.  The access to the hash is through
1380*0Sstevel@tonic-gate Perl_to_utf8_case().
1381*0Sstevel@tonic-gate 
1382*0Sstevel@tonic-gate The "normal" is a string like "ToLower" which means the swash
1383*0Sstevel@tonic-gate %utf8::ToLower.
1384*0Sstevel@tonic-gate 
1385*0Sstevel@tonic-gate =cut */
1386*0Sstevel@tonic-gate 
1387*0Sstevel@tonic-gate UV
Perl_to_utf8_case(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp,SV ** swashp,char * normal,char * special)1388*0Sstevel@tonic-gate Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1389*0Sstevel@tonic-gate {
1390*0Sstevel@tonic-gate     UV uv0, uv1;
1391*0Sstevel@tonic-gate     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1392*0Sstevel@tonic-gate     STRLEN len = 0;
1393*0Sstevel@tonic-gate 
1394*0Sstevel@tonic-gate     uv0 = utf8_to_uvchr(p, 0);
1395*0Sstevel@tonic-gate     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1396*0Sstevel@tonic-gate      * are necessary in EBCDIC, they are redundant no-ops
1397*0Sstevel@tonic-gate      * in ASCII-ish platforms, and hopefully optimized away. */
1398*0Sstevel@tonic-gate     uv1 = NATIVE_TO_UNI(uv0);
1399*0Sstevel@tonic-gate     uvuni_to_utf8(tmpbuf, uv1);
1400*0Sstevel@tonic-gate 
1401*0Sstevel@tonic-gate     if (!*swashp) /* load on-demand */
1402*0Sstevel@tonic-gate          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1403*0Sstevel@tonic-gate 
1404*0Sstevel@tonic-gate     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1405*0Sstevel@tonic-gate     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1406*0Sstevel@tonic-gate          /* It might be "special" (sometimes, but not always,
1407*0Sstevel@tonic-gate 	  * a multicharacter mapping) */
1408*0Sstevel@tonic-gate 	 HV *hv;
1409*0Sstevel@tonic-gate 	 SV **svp;
1410*0Sstevel@tonic-gate 
1411*0Sstevel@tonic-gate 	 if ((hv  = get_hv(special, FALSE)) &&
1412*0Sstevel@tonic-gate 	     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1413*0Sstevel@tonic-gate 	     (*svp)) {
1414*0Sstevel@tonic-gate 	      char *s;
1415*0Sstevel@tonic-gate 
1416*0Sstevel@tonic-gate 	      s = SvPV(*svp, len);
1417*0Sstevel@tonic-gate 	      if (len == 1)
1418*0Sstevel@tonic-gate 		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1419*0Sstevel@tonic-gate 	      else {
1420*0Sstevel@tonic-gate #ifdef EBCDIC
1421*0Sstevel@tonic-gate 		   /* If we have EBCDIC we need to remap the characters
1422*0Sstevel@tonic-gate 		    * since any characters in the low 256 are Unicode
1423*0Sstevel@tonic-gate 		    * code points, not EBCDIC. */
1424*0Sstevel@tonic-gate 		   U8 *t = (U8*)s, *tend = t + len, *d;
1425*0Sstevel@tonic-gate 
1426*0Sstevel@tonic-gate 		   d = tmpbuf;
1427*0Sstevel@tonic-gate 		   if (SvUTF8(*svp)) {
1428*0Sstevel@tonic-gate 			STRLEN tlen = 0;
1429*0Sstevel@tonic-gate 
1430*0Sstevel@tonic-gate 			while (t < tend) {
1431*0Sstevel@tonic-gate 			     UV c = utf8_to_uvchr(t, &tlen);
1432*0Sstevel@tonic-gate 			     if (tlen > 0) {
1433*0Sstevel@tonic-gate 				  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1434*0Sstevel@tonic-gate 				  t += tlen;
1435*0Sstevel@tonic-gate 			     }
1436*0Sstevel@tonic-gate 			     else
1437*0Sstevel@tonic-gate 				  break;
1438*0Sstevel@tonic-gate 			}
1439*0Sstevel@tonic-gate 		   }
1440*0Sstevel@tonic-gate 		   else {
1441*0Sstevel@tonic-gate 			while (t < tend) {
1442*0Sstevel@tonic-gate 			     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1443*0Sstevel@tonic-gate 			     t++;
1444*0Sstevel@tonic-gate 			}
1445*0Sstevel@tonic-gate 		   }
1446*0Sstevel@tonic-gate 		   len = d - tmpbuf;
1447*0Sstevel@tonic-gate 		   Copy(tmpbuf, ustrp, len, U8);
1448*0Sstevel@tonic-gate #else
1449*0Sstevel@tonic-gate 		   Copy(s, ustrp, len, U8);
1450*0Sstevel@tonic-gate #endif
1451*0Sstevel@tonic-gate 	      }
1452*0Sstevel@tonic-gate 	 }
1453*0Sstevel@tonic-gate     }
1454*0Sstevel@tonic-gate 
1455*0Sstevel@tonic-gate     if (!len && *swashp) {
1456*0Sstevel@tonic-gate 	 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1457*0Sstevel@tonic-gate 
1458*0Sstevel@tonic-gate 	 if (uv2) {
1459*0Sstevel@tonic-gate 	      /* It was "normal" (a single character mapping). */
1460*0Sstevel@tonic-gate 	      UV uv3 = UNI_TO_NATIVE(uv2);
1461*0Sstevel@tonic-gate 
1462*0Sstevel@tonic-gate 	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1463*0Sstevel@tonic-gate 	 }
1464*0Sstevel@tonic-gate     }
1465*0Sstevel@tonic-gate 
1466*0Sstevel@tonic-gate     if (!len) /* Neither: just copy. */
1467*0Sstevel@tonic-gate 	 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1468*0Sstevel@tonic-gate 
1469*0Sstevel@tonic-gate     if (lenp)
1470*0Sstevel@tonic-gate 	 *lenp = len;
1471*0Sstevel@tonic-gate 
1472*0Sstevel@tonic-gate     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1473*0Sstevel@tonic-gate }
1474*0Sstevel@tonic-gate 
1475*0Sstevel@tonic-gate /*
1476*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1477*0Sstevel@tonic-gate 
1478*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its uppercase version and
1479*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1480*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1481*0Sstevel@tonic-gate uppercase version may be longer than the original character (up to two
1482*0Sstevel@tonic-gate characters).
1483*0Sstevel@tonic-gate 
1484*0Sstevel@tonic-gate The first character of the uppercased version is returned
1485*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1486*0Sstevel@tonic-gate 
1487*0Sstevel@tonic-gate =cut */
1488*0Sstevel@tonic-gate 
1489*0Sstevel@tonic-gate UV
Perl_to_utf8_upper(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1490*0Sstevel@tonic-gate Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1491*0Sstevel@tonic-gate {
1492*0Sstevel@tonic-gate     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1493*0Sstevel@tonic-gate                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1494*0Sstevel@tonic-gate }
1495*0Sstevel@tonic-gate 
1496*0Sstevel@tonic-gate /*
1497*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1498*0Sstevel@tonic-gate 
1499*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its titlecase version and
1500*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1501*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1502*0Sstevel@tonic-gate titlecase version may be longer than the original character (up to two
1503*0Sstevel@tonic-gate characters).
1504*0Sstevel@tonic-gate 
1505*0Sstevel@tonic-gate The first character of the titlecased version is returned
1506*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1507*0Sstevel@tonic-gate 
1508*0Sstevel@tonic-gate =cut */
1509*0Sstevel@tonic-gate 
1510*0Sstevel@tonic-gate UV
Perl_to_utf8_title(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1511*0Sstevel@tonic-gate Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1512*0Sstevel@tonic-gate {
1513*0Sstevel@tonic-gate     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1514*0Sstevel@tonic-gate                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1515*0Sstevel@tonic-gate }
1516*0Sstevel@tonic-gate 
1517*0Sstevel@tonic-gate /*
1518*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1519*0Sstevel@tonic-gate 
1520*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its lowercase version and
1521*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1522*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1523*0Sstevel@tonic-gate lowercase version may be longer than the original character (up to two
1524*0Sstevel@tonic-gate characters).
1525*0Sstevel@tonic-gate 
1526*0Sstevel@tonic-gate The first character of the lowercased version is returned
1527*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1528*0Sstevel@tonic-gate 
1529*0Sstevel@tonic-gate =cut */
1530*0Sstevel@tonic-gate 
1531*0Sstevel@tonic-gate UV
Perl_to_utf8_lower(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1532*0Sstevel@tonic-gate Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1533*0Sstevel@tonic-gate {
1534*0Sstevel@tonic-gate     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1535*0Sstevel@tonic-gate                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1536*0Sstevel@tonic-gate }
1537*0Sstevel@tonic-gate 
1538*0Sstevel@tonic-gate /*
1539*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1540*0Sstevel@tonic-gate 
1541*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its foldcase version and
1542*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1543*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1544*0Sstevel@tonic-gate foldcase version may be longer than the original character (up to
1545*0Sstevel@tonic-gate three characters).
1546*0Sstevel@tonic-gate 
1547*0Sstevel@tonic-gate The first character of the foldcased version is returned
1548*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1549*0Sstevel@tonic-gate 
1550*0Sstevel@tonic-gate =cut */
1551*0Sstevel@tonic-gate 
1552*0Sstevel@tonic-gate UV
Perl_to_utf8_fold(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1553*0Sstevel@tonic-gate Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1554*0Sstevel@tonic-gate {
1555*0Sstevel@tonic-gate     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1556*0Sstevel@tonic-gate                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1557*0Sstevel@tonic-gate }
1558*0Sstevel@tonic-gate 
1559*0Sstevel@tonic-gate /* a "swash" is a swatch hash */
1560*0Sstevel@tonic-gate 
1561*0Sstevel@tonic-gate SV*
Perl_swash_init(pTHX_ char * pkg,char * name,SV * listsv,I32 minbits,I32 none)1562*0Sstevel@tonic-gate Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1563*0Sstevel@tonic-gate {
1564*0Sstevel@tonic-gate     SV* retval;
1565*0Sstevel@tonic-gate     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1566*0Sstevel@tonic-gate     dSP;
1567*0Sstevel@tonic-gate     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1568*0Sstevel@tonic-gate     SV* errsv_save;
1569*0Sstevel@tonic-gate 
1570*0Sstevel@tonic-gate     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
1571*0Sstevel@tonic-gate 	ENTER;
1572*0Sstevel@tonic-gate 	errsv_save = newSVsv(ERRSV);
1573*0Sstevel@tonic-gate 	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1574*0Sstevel@tonic-gate 	if (!SvTRUE(ERRSV))
1575*0Sstevel@tonic-gate 	    sv_setsv(ERRSV, errsv_save);
1576*0Sstevel@tonic-gate 	SvREFCNT_dec(errsv_save);
1577*0Sstevel@tonic-gate 	LEAVE;
1578*0Sstevel@tonic-gate     }
1579*0Sstevel@tonic-gate     SPAGAIN;
1580*0Sstevel@tonic-gate     PUSHSTACKi(PERLSI_MAGIC);
1581*0Sstevel@tonic-gate     PUSHMARK(SP);
1582*0Sstevel@tonic-gate     EXTEND(SP,5);
1583*0Sstevel@tonic-gate     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1584*0Sstevel@tonic-gate     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1585*0Sstevel@tonic-gate     PUSHs(listsv);
1586*0Sstevel@tonic-gate     PUSHs(sv_2mortal(newSViv(minbits)));
1587*0Sstevel@tonic-gate     PUSHs(sv_2mortal(newSViv(none)));
1588*0Sstevel@tonic-gate     PUTBACK;
1589*0Sstevel@tonic-gate     ENTER;
1590*0Sstevel@tonic-gate     SAVEI32(PL_hints);
1591*0Sstevel@tonic-gate     PL_hints = 0;
1592*0Sstevel@tonic-gate     save_re_context();
1593*0Sstevel@tonic-gate     if (IN_PERL_COMPILETIME) {
1594*0Sstevel@tonic-gate 	/* XXX ought to be handled by lex_start */
1595*0Sstevel@tonic-gate 	SAVEI32(PL_in_my);
1596*0Sstevel@tonic-gate 	PL_in_my = 0;
1597*0Sstevel@tonic-gate 	sv_setpv(tokenbufsv, PL_tokenbuf);
1598*0Sstevel@tonic-gate     }
1599*0Sstevel@tonic-gate     errsv_save = newSVsv(ERRSV);
1600*0Sstevel@tonic-gate     if (call_method("SWASHNEW", G_SCALAR))
1601*0Sstevel@tonic-gate 	retval = newSVsv(*PL_stack_sp--);
1602*0Sstevel@tonic-gate     else
1603*0Sstevel@tonic-gate 	retval = &PL_sv_undef;
1604*0Sstevel@tonic-gate     if (!SvTRUE(ERRSV))
1605*0Sstevel@tonic-gate 	sv_setsv(ERRSV, errsv_save);
1606*0Sstevel@tonic-gate     SvREFCNT_dec(errsv_save);
1607*0Sstevel@tonic-gate     LEAVE;
1608*0Sstevel@tonic-gate     POPSTACK;
1609*0Sstevel@tonic-gate     if (IN_PERL_COMPILETIME) {
1610*0Sstevel@tonic-gate 	STRLEN len;
1611*0Sstevel@tonic-gate 	char* pv = SvPV(tokenbufsv, len);
1612*0Sstevel@tonic-gate 
1613*0Sstevel@tonic-gate 	Copy(pv, PL_tokenbuf, len+1, char);
1614*0Sstevel@tonic-gate 	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1615*0Sstevel@tonic-gate     }
1616*0Sstevel@tonic-gate     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1617*0Sstevel@tonic-gate         if (SvPOK(retval))
1618*0Sstevel@tonic-gate 	    Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1619*0Sstevel@tonic-gate 		       retval);
1620*0Sstevel@tonic-gate 	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1621*0Sstevel@tonic-gate     }
1622*0Sstevel@tonic-gate     return retval;
1623*0Sstevel@tonic-gate }
1624*0Sstevel@tonic-gate 
1625*0Sstevel@tonic-gate 
1626*0Sstevel@tonic-gate /* This API is wrong for special case conversions since we may need to
1627*0Sstevel@tonic-gate  * return several Unicode characters for a single Unicode character
1628*0Sstevel@tonic-gate  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1629*0Sstevel@tonic-gate  * the lower-level routine, and it is similarly broken for returning
1630*0Sstevel@tonic-gate  * multiple values.  --jhi */
1631*0Sstevel@tonic-gate UV
Perl_swash_fetch(pTHX_ SV * sv,U8 * ptr,bool do_utf8)1632*0Sstevel@tonic-gate Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1633*0Sstevel@tonic-gate {
1634*0Sstevel@tonic-gate     HV* hv = (HV*)SvRV(sv);
1635*0Sstevel@tonic-gate     U32 klen;
1636*0Sstevel@tonic-gate     U32 off;
1637*0Sstevel@tonic-gate     STRLEN slen;
1638*0Sstevel@tonic-gate     STRLEN needents;
1639*0Sstevel@tonic-gate     U8 *tmps = NULL;
1640*0Sstevel@tonic-gate     U32 bit;
1641*0Sstevel@tonic-gate     SV *retval;
1642*0Sstevel@tonic-gate     U8 tmputf8[2];
1643*0Sstevel@tonic-gate     UV c = NATIVE_TO_ASCII(*ptr);
1644*0Sstevel@tonic-gate 
1645*0Sstevel@tonic-gate     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1646*0Sstevel@tonic-gate         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1647*0Sstevel@tonic-gate         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1648*0Sstevel@tonic-gate         ptr = tmputf8;
1649*0Sstevel@tonic-gate     }
1650*0Sstevel@tonic-gate     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1651*0Sstevel@tonic-gate      * then the "swatch" is a vec() for al the chars which start
1652*0Sstevel@tonic-gate      * with 0xAA..0xYY
1653*0Sstevel@tonic-gate      * So the key in the hash (klen) is length of encoded char -1
1654*0Sstevel@tonic-gate      */
1655*0Sstevel@tonic-gate     klen = UTF8SKIP(ptr) - 1;
1656*0Sstevel@tonic-gate     off  = ptr[klen];
1657*0Sstevel@tonic-gate 
1658*0Sstevel@tonic-gate     if (klen == 0)
1659*0Sstevel@tonic-gate      {
1660*0Sstevel@tonic-gate       /* If char in invariant then swatch is for all the invariant chars
1661*0Sstevel@tonic-gate        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1662*0Sstevel@tonic-gate        */
1663*0Sstevel@tonic-gate       needents = UTF_CONTINUATION_MARK;
1664*0Sstevel@tonic-gate       off      = NATIVE_TO_UTF(ptr[klen]);
1665*0Sstevel@tonic-gate      }
1666*0Sstevel@tonic-gate     else
1667*0Sstevel@tonic-gate      {
1668*0Sstevel@tonic-gate       /* If char is encoded then swatch is for the prefix */
1669*0Sstevel@tonic-gate       needents = (1 << UTF_ACCUMULATION_SHIFT);
1670*0Sstevel@tonic-gate       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1671*0Sstevel@tonic-gate      }
1672*0Sstevel@tonic-gate 
1673*0Sstevel@tonic-gate     /*
1674*0Sstevel@tonic-gate      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1675*0Sstevel@tonic-gate      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1676*0Sstevel@tonic-gate      * it's nothing to sniff at.)  Pity we usually come through at least
1677*0Sstevel@tonic-gate      * two function calls to get here...
1678*0Sstevel@tonic-gate      *
1679*0Sstevel@tonic-gate      * NB: this code assumes that swatches are never modified, once generated!
1680*0Sstevel@tonic-gate      */
1681*0Sstevel@tonic-gate 
1682*0Sstevel@tonic-gate     if (hv   == PL_last_swash_hv &&
1683*0Sstevel@tonic-gate 	klen == PL_last_swash_klen &&
1684*0Sstevel@tonic-gate 	(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1685*0Sstevel@tonic-gate     {
1686*0Sstevel@tonic-gate 	tmps = PL_last_swash_tmps;
1687*0Sstevel@tonic-gate 	slen = PL_last_swash_slen;
1688*0Sstevel@tonic-gate     }
1689*0Sstevel@tonic-gate     else {
1690*0Sstevel@tonic-gate 	/* Try our second-level swatch cache, kept in a hash. */
1691*0Sstevel@tonic-gate 	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1692*0Sstevel@tonic-gate 
1693*0Sstevel@tonic-gate 	/* If not cached, generate it via utf8::SWASHGET */
1694*0Sstevel@tonic-gate 	if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1695*0Sstevel@tonic-gate 	    dSP;
1696*0Sstevel@tonic-gate 	    /* We use utf8n_to_uvuni() as we want an index into
1697*0Sstevel@tonic-gate 	       Unicode tables, not a native character number.
1698*0Sstevel@tonic-gate 	     */
1699*0Sstevel@tonic-gate 	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1700*0Sstevel@tonic-gate 					   ckWARN(WARN_UTF8) ?
1701*0Sstevel@tonic-gate 					   0 : UTF8_ALLOW_ANY);
1702*0Sstevel@tonic-gate 	    SV *errsv_save;
1703*0Sstevel@tonic-gate 	    ENTER;
1704*0Sstevel@tonic-gate 	    SAVETMPS;
1705*0Sstevel@tonic-gate 	    save_re_context();
1706*0Sstevel@tonic-gate 	    PUSHSTACKi(PERLSI_MAGIC);
1707*0Sstevel@tonic-gate 	    PUSHMARK(SP);
1708*0Sstevel@tonic-gate 	    EXTEND(SP,3);
1709*0Sstevel@tonic-gate 	    PUSHs((SV*)sv);
1710*0Sstevel@tonic-gate 	    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1711*0Sstevel@tonic-gate 	    PUSHs(sv_2mortal(newSViv((klen) ?
1712*0Sstevel@tonic-gate 				     (code_point & ~(needents - 1)) : 0)));
1713*0Sstevel@tonic-gate 	    PUSHs(sv_2mortal(newSViv(needents)));
1714*0Sstevel@tonic-gate 	    PUTBACK;
1715*0Sstevel@tonic-gate 	    errsv_save = newSVsv(ERRSV);
1716*0Sstevel@tonic-gate 	    if (call_method("SWASHGET", G_SCALAR))
1717*0Sstevel@tonic-gate 		retval = newSVsv(*PL_stack_sp--);
1718*0Sstevel@tonic-gate 	    else
1719*0Sstevel@tonic-gate 		retval = &PL_sv_undef;
1720*0Sstevel@tonic-gate 	    if (!SvTRUE(ERRSV))
1721*0Sstevel@tonic-gate 		sv_setsv(ERRSV, errsv_save);
1722*0Sstevel@tonic-gate 	    SvREFCNT_dec(errsv_save);
1723*0Sstevel@tonic-gate 	    POPSTACK;
1724*0Sstevel@tonic-gate 	    FREETMPS;
1725*0Sstevel@tonic-gate 	    LEAVE;
1726*0Sstevel@tonic-gate 	    if (IN_PERL_COMPILETIME)
1727*0Sstevel@tonic-gate 		PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1728*0Sstevel@tonic-gate 
1729*0Sstevel@tonic-gate 	    svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1730*0Sstevel@tonic-gate 
1731*0Sstevel@tonic-gate 	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1732*0Sstevel@tonic-gate 		Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1733*0Sstevel@tonic-gate 	}
1734*0Sstevel@tonic-gate 
1735*0Sstevel@tonic-gate 	PL_last_swash_hv = hv;
1736*0Sstevel@tonic-gate 	PL_last_swash_klen = klen;
1737*0Sstevel@tonic-gate 	PL_last_swash_tmps = tmps;
1738*0Sstevel@tonic-gate 	PL_last_swash_slen = slen;
1739*0Sstevel@tonic-gate 	if (klen)
1740*0Sstevel@tonic-gate 	    Copy(ptr, PL_last_swash_key, klen, U8);
1741*0Sstevel@tonic-gate     }
1742*0Sstevel@tonic-gate 
1743*0Sstevel@tonic-gate     switch ((int)((slen << 3) / needents)) {
1744*0Sstevel@tonic-gate     case 1:
1745*0Sstevel@tonic-gate 	bit = 1 << (off & 7);
1746*0Sstevel@tonic-gate 	off >>= 3;
1747*0Sstevel@tonic-gate 	return (tmps[off] & bit) != 0;
1748*0Sstevel@tonic-gate     case 8:
1749*0Sstevel@tonic-gate 	return tmps[off];
1750*0Sstevel@tonic-gate     case 16:
1751*0Sstevel@tonic-gate 	off <<= 1;
1752*0Sstevel@tonic-gate 	return (tmps[off] << 8) + tmps[off + 1] ;
1753*0Sstevel@tonic-gate     case 32:
1754*0Sstevel@tonic-gate 	off <<= 2;
1755*0Sstevel@tonic-gate 	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1756*0Sstevel@tonic-gate     }
1757*0Sstevel@tonic-gate     Perl_croak(aTHX_ "panic: swash_fetch");
1758*0Sstevel@tonic-gate     return 0;
1759*0Sstevel@tonic-gate }
1760*0Sstevel@tonic-gate 
1761*0Sstevel@tonic-gate 
1762*0Sstevel@tonic-gate /*
1763*0Sstevel@tonic-gate =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1764*0Sstevel@tonic-gate 
1765*0Sstevel@tonic-gate Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1766*0Sstevel@tonic-gate of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1767*0Sstevel@tonic-gate bytes available. The return value is the pointer to the byte after the
1768*0Sstevel@tonic-gate end of the new character. In other words,
1769*0Sstevel@tonic-gate 
1770*0Sstevel@tonic-gate     d = uvchr_to_utf8(d, uv);
1771*0Sstevel@tonic-gate 
1772*0Sstevel@tonic-gate is the recommended wide native character-aware way of saying
1773*0Sstevel@tonic-gate 
1774*0Sstevel@tonic-gate     *(d++) = uv;
1775*0Sstevel@tonic-gate 
1776*0Sstevel@tonic-gate =cut
1777*0Sstevel@tonic-gate */
1778*0Sstevel@tonic-gate 
1779*0Sstevel@tonic-gate /* On ASCII machines this is normally a macro but we want a
1780*0Sstevel@tonic-gate    real function in case XS code wants it
1781*0Sstevel@tonic-gate */
1782*0Sstevel@tonic-gate #undef Perl_uvchr_to_utf8
1783*0Sstevel@tonic-gate U8 *
Perl_uvchr_to_utf8(pTHX_ U8 * d,UV uv)1784*0Sstevel@tonic-gate Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1785*0Sstevel@tonic-gate {
1786*0Sstevel@tonic-gate     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1787*0Sstevel@tonic-gate }
1788*0Sstevel@tonic-gate 
1789*0Sstevel@tonic-gate U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)1790*0Sstevel@tonic-gate Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1791*0Sstevel@tonic-gate {
1792*0Sstevel@tonic-gate     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1793*0Sstevel@tonic-gate }
1794*0Sstevel@tonic-gate 
1795*0Sstevel@tonic-gate /*
1796*0Sstevel@tonic-gate =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1797*0Sstevel@tonic-gate 
1798*0Sstevel@tonic-gate Returns the native character value of the first character in the string C<s>
1799*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1800*0Sstevel@tonic-gate length, in bytes, of that character.
1801*0Sstevel@tonic-gate 
1802*0Sstevel@tonic-gate Allows length and flags to be passed to low level routine.
1803*0Sstevel@tonic-gate 
1804*0Sstevel@tonic-gate =cut
1805*0Sstevel@tonic-gate */
1806*0Sstevel@tonic-gate /* On ASCII machines this is normally a macro but we want
1807*0Sstevel@tonic-gate    a real function in case XS code wants it
1808*0Sstevel@tonic-gate */
1809*0Sstevel@tonic-gate #undef Perl_utf8n_to_uvchr
1810*0Sstevel@tonic-gate UV
Perl_utf8n_to_uvchr(pTHX_ U8 * s,STRLEN curlen,STRLEN * retlen,U32 flags)1811*0Sstevel@tonic-gate Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1812*0Sstevel@tonic-gate {
1813*0Sstevel@tonic-gate     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1814*0Sstevel@tonic-gate     return UNI_TO_NATIVE(uv);
1815*0Sstevel@tonic-gate }
1816*0Sstevel@tonic-gate 
1817*0Sstevel@tonic-gate /*
1818*0Sstevel@tonic-gate =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1819*0Sstevel@tonic-gate 
1820*0Sstevel@tonic-gate Build to the scalar dsv a displayable version of the string spv,
1821*0Sstevel@tonic-gate length len, the displayable version being at most pvlim bytes long
1822*0Sstevel@tonic-gate (if longer, the rest is truncated and "..." will be appended).
1823*0Sstevel@tonic-gate 
1824*0Sstevel@tonic-gate The flags argument can have UNI_DISPLAY_ISPRINT set to display
1825*0Sstevel@tonic-gate isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1826*0Sstevel@tonic-gate to display the \\[nrfta\\] as the backslashed versions (like '\n')
1827*0Sstevel@tonic-gate (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1828*0Sstevel@tonic-gate UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1829*0Sstevel@tonic-gate UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1830*0Sstevel@tonic-gate 
1831*0Sstevel@tonic-gate The pointer to the PV of the dsv is returned.
1832*0Sstevel@tonic-gate 
1833*0Sstevel@tonic-gate =cut */
1834*0Sstevel@tonic-gate char *
Perl_pv_uni_display(pTHX_ SV * dsv,U8 * spv,STRLEN len,STRLEN pvlim,UV flags)1835*0Sstevel@tonic-gate Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1836*0Sstevel@tonic-gate {
1837*0Sstevel@tonic-gate     int truncated = 0;
1838*0Sstevel@tonic-gate     char *s, *e;
1839*0Sstevel@tonic-gate 
1840*0Sstevel@tonic-gate     sv_setpvn(dsv, "", 0);
1841*0Sstevel@tonic-gate     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1842*0Sstevel@tonic-gate 	 UV u;
1843*0Sstevel@tonic-gate 	 bool ok = FALSE;
1844*0Sstevel@tonic-gate 
1845*0Sstevel@tonic-gate 	 if (pvlim && SvCUR(dsv) >= pvlim) {
1846*0Sstevel@tonic-gate 	      truncated++;
1847*0Sstevel@tonic-gate 	      break;
1848*0Sstevel@tonic-gate 	 }
1849*0Sstevel@tonic-gate 	 u = utf8_to_uvchr((U8*)s, 0);
1850*0Sstevel@tonic-gate 	 if (u < 256) {
1851*0Sstevel@tonic-gate 	     if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1852*0Sstevel@tonic-gate 	         switch (u & 0xFF) {
1853*0Sstevel@tonic-gate 		 case '\n':
1854*0Sstevel@tonic-gate 		     Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1855*0Sstevel@tonic-gate 		 case '\r':
1856*0Sstevel@tonic-gate 		     Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1857*0Sstevel@tonic-gate 		 case '\t':
1858*0Sstevel@tonic-gate 		     Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1859*0Sstevel@tonic-gate 		 case '\f':
1860*0Sstevel@tonic-gate 		     Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1861*0Sstevel@tonic-gate 		 case '\a':
1862*0Sstevel@tonic-gate 		     Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1863*0Sstevel@tonic-gate 		 case '\\':
1864*0Sstevel@tonic-gate 		     Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1865*0Sstevel@tonic-gate 		 default: break;
1866*0Sstevel@tonic-gate 		 }
1867*0Sstevel@tonic-gate 	     }
1868*0Sstevel@tonic-gate 	     /* isPRINT() is the locale-blind version. */
1869*0Sstevel@tonic-gate 	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1870*0Sstevel@tonic-gate 	         Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1871*0Sstevel@tonic-gate 		 ok = TRUE;
1872*0Sstevel@tonic-gate 	     }
1873*0Sstevel@tonic-gate 	 }
1874*0Sstevel@tonic-gate 	 if (!ok)
1875*0Sstevel@tonic-gate 	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1876*0Sstevel@tonic-gate     }
1877*0Sstevel@tonic-gate     if (truncated)
1878*0Sstevel@tonic-gate 	 sv_catpvn(dsv, "...", 3);
1879*0Sstevel@tonic-gate 
1880*0Sstevel@tonic-gate     return SvPVX(dsv);
1881*0Sstevel@tonic-gate }
1882*0Sstevel@tonic-gate 
1883*0Sstevel@tonic-gate /*
1884*0Sstevel@tonic-gate =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1885*0Sstevel@tonic-gate 
1886*0Sstevel@tonic-gate Build to the scalar dsv a displayable version of the scalar sv,
1887*0Sstevel@tonic-gate the displayable version being at most pvlim bytes long
1888*0Sstevel@tonic-gate (if longer, the rest is truncated and "..." will be appended).
1889*0Sstevel@tonic-gate 
1890*0Sstevel@tonic-gate The flags argument is as in pv_uni_display().
1891*0Sstevel@tonic-gate 
1892*0Sstevel@tonic-gate The pointer to the PV of the dsv is returned.
1893*0Sstevel@tonic-gate 
1894*0Sstevel@tonic-gate =cut */
1895*0Sstevel@tonic-gate char *
Perl_sv_uni_display(pTHX_ SV * dsv,SV * ssv,STRLEN pvlim,UV flags)1896*0Sstevel@tonic-gate Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1897*0Sstevel@tonic-gate {
1898*0Sstevel@tonic-gate      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1899*0Sstevel@tonic-gate 				pvlim, flags);
1900*0Sstevel@tonic-gate }
1901*0Sstevel@tonic-gate 
1902*0Sstevel@tonic-gate /*
1903*0Sstevel@tonic-gate =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1904*0Sstevel@tonic-gate 
1905*0Sstevel@tonic-gate Return true if the strings s1 and s2 differ case-insensitively, false
1906*0Sstevel@tonic-gate if not (if they are equal case-insensitively).  If u1 is true, the
1907*0Sstevel@tonic-gate string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1908*0Sstevel@tonic-gate the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1909*0Sstevel@tonic-gate are false, the respective string is assumed to be in native 8-bit
1910*0Sstevel@tonic-gate encoding.
1911*0Sstevel@tonic-gate 
1912*0Sstevel@tonic-gate If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1913*0Sstevel@tonic-gate in there (they will point at the beginning of the I<next> character).
1914*0Sstevel@tonic-gate If the pointers behind pe1 or pe2 are non-NULL, they are the end
1915*0Sstevel@tonic-gate pointers beyond which scanning will not continue under any
1916*0Sstevel@tonic-gate circustances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1917*0Sstevel@tonic-gate s2+l2 will be used as goal end pointers that will also stop the scan,
1918*0Sstevel@tonic-gate and which qualify towards defining a successful match: all the scans
1919*0Sstevel@tonic-gate that define an explicit length must reach their goal pointers for
1920*0Sstevel@tonic-gate a match to succeed).
1921*0Sstevel@tonic-gate 
1922*0Sstevel@tonic-gate For case-insensitiveness, the "casefolding" of Unicode is used
1923*0Sstevel@tonic-gate instead of upper/lowercasing both the characters, see
1924*0Sstevel@tonic-gate http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1925*0Sstevel@tonic-gate 
1926*0Sstevel@tonic-gate =cut */
1927*0Sstevel@tonic-gate I32
Perl_ibcmp_utf8(pTHX_ const char * s1,char ** pe1,register UV l1,bool u1,const char * s2,char ** pe2,register UV l2,bool u2)1928*0Sstevel@tonic-gate Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1929*0Sstevel@tonic-gate {
1930*0Sstevel@tonic-gate      register U8 *p1  = (U8*)s1;
1931*0Sstevel@tonic-gate      register U8 *p2  = (U8*)s2;
1932*0Sstevel@tonic-gate      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1933*0Sstevel@tonic-gate      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1934*0Sstevel@tonic-gate      STRLEN n1 = 0, n2 = 0;
1935*0Sstevel@tonic-gate      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1936*0Sstevel@tonic-gate      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1937*0Sstevel@tonic-gate      U8 natbuf[1+1];
1938*0Sstevel@tonic-gate      STRLEN foldlen1, foldlen2;
1939*0Sstevel@tonic-gate      bool match;
1940*0Sstevel@tonic-gate 
1941*0Sstevel@tonic-gate      if (pe1)
1942*0Sstevel@tonic-gate 	  e1 = *(U8**)pe1;
1943*0Sstevel@tonic-gate      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1944*0Sstevel@tonic-gate 	  f1 = (U8*)s1 + l1;
1945*0Sstevel@tonic-gate      if (pe2)
1946*0Sstevel@tonic-gate 	  e2 = *(U8**)pe2;
1947*0Sstevel@tonic-gate      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1948*0Sstevel@tonic-gate 	  f2 = (U8*)s2 + l2;
1949*0Sstevel@tonic-gate 
1950*0Sstevel@tonic-gate      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1951*0Sstevel@tonic-gate 	  return 1; /* mismatch; possible infinite loop or false positive */
1952*0Sstevel@tonic-gate 
1953*0Sstevel@tonic-gate      if (!u1 || !u2)
1954*0Sstevel@tonic-gate 	  natbuf[1] = 0; /* Need to terminate the buffer. */
1955*0Sstevel@tonic-gate 
1956*0Sstevel@tonic-gate      while ((e1 == 0 || p1 < e1) &&
1957*0Sstevel@tonic-gate 	    (f1 == 0 || p1 < f1) &&
1958*0Sstevel@tonic-gate 	    (e2 == 0 || p2 < e2) &&
1959*0Sstevel@tonic-gate 	    (f2 == 0 || p2 < f2)) {
1960*0Sstevel@tonic-gate 	  if (n1 == 0) {
1961*0Sstevel@tonic-gate 	       if (u1)
1962*0Sstevel@tonic-gate 		    to_utf8_fold(p1, foldbuf1, &foldlen1);
1963*0Sstevel@tonic-gate 	       else {
1964*0Sstevel@tonic-gate 		    natbuf[0] = *p1;
1965*0Sstevel@tonic-gate 		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1966*0Sstevel@tonic-gate 	       }
1967*0Sstevel@tonic-gate 	       q1 = foldbuf1;
1968*0Sstevel@tonic-gate 	       n1 = foldlen1;
1969*0Sstevel@tonic-gate 	  }
1970*0Sstevel@tonic-gate 	  if (n2 == 0) {
1971*0Sstevel@tonic-gate 	       if (u2)
1972*0Sstevel@tonic-gate 		    to_utf8_fold(p2, foldbuf2, &foldlen2);
1973*0Sstevel@tonic-gate 	       else {
1974*0Sstevel@tonic-gate 		    natbuf[0] = *p2;
1975*0Sstevel@tonic-gate 		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1976*0Sstevel@tonic-gate 	       }
1977*0Sstevel@tonic-gate 	       q2 = foldbuf2;
1978*0Sstevel@tonic-gate 	       n2 = foldlen2;
1979*0Sstevel@tonic-gate 	  }
1980*0Sstevel@tonic-gate 	  while (n1 && n2) {
1981*0Sstevel@tonic-gate 	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1982*0Sstevel@tonic-gate 		   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1983*0Sstevel@tonic-gate 		    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1984*0Sstevel@tonic-gate 		   return 1; /* mismatch */
1985*0Sstevel@tonic-gate 	       n1 -= UTF8SKIP(q1);
1986*0Sstevel@tonic-gate 	       q1 += UTF8SKIP(q1);
1987*0Sstevel@tonic-gate 	       n2 -= UTF8SKIP(q2);
1988*0Sstevel@tonic-gate 	       q2 += UTF8SKIP(q2);
1989*0Sstevel@tonic-gate 	  }
1990*0Sstevel@tonic-gate 	  if (n1 == 0)
1991*0Sstevel@tonic-gate 	       p1 += u1 ? UTF8SKIP(p1) : 1;
1992*0Sstevel@tonic-gate 	  if (n2 == 0)
1993*0Sstevel@tonic-gate 	       p2 += u2 ? UTF8SKIP(p2) : 1;
1994*0Sstevel@tonic-gate 
1995*0Sstevel@tonic-gate      }
1996*0Sstevel@tonic-gate 
1997*0Sstevel@tonic-gate      /* A match is defined by all the scans that specified
1998*0Sstevel@tonic-gate       * an explicit length reaching their final goals. */
1999*0Sstevel@tonic-gate      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2000*0Sstevel@tonic-gate 
2001*0Sstevel@tonic-gate      if (match) {
2002*0Sstevel@tonic-gate 	  if (pe1)
2003*0Sstevel@tonic-gate 	       *pe1 = (char*)p1;
2004*0Sstevel@tonic-gate 	  if (pe2)
2005*0Sstevel@tonic-gate 	       *pe2 = (char*)p2;
2006*0Sstevel@tonic-gate      }
2007*0Sstevel@tonic-gate 
2008*0Sstevel@tonic-gate      return match ? 0 : 1; /* 0 match, 1 mismatch */
2009*0Sstevel@tonic-gate }
2010*0Sstevel@tonic-gate 
2011