1*0Sstevel@tonic-gate /* utf8.c
2*0Sstevel@tonic-gate *
3*0Sstevel@tonic-gate * Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
4*0Sstevel@tonic-gate *
5*0Sstevel@tonic-gate * You may distribute under the terms of either the GNU General Public
6*0Sstevel@tonic-gate * License or the Artistic License, as specified in the README file.
7*0Sstevel@tonic-gate *
8*0Sstevel@tonic-gate */
9*0Sstevel@tonic-gate
10*0Sstevel@tonic-gate /*
11*0Sstevel@tonic-gate * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12*0Sstevel@tonic-gate * heard of that we don't want to see any closer; and that's the one place
13*0Sstevel@tonic-gate * we're trying to get to! And that's just where we can't get, nohow.'
14*0Sstevel@tonic-gate *
15*0Sstevel@tonic-gate * 'Well do I understand your speech,' he answered in the same language;
16*0Sstevel@tonic-gate * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17*0Sstevel@tonic-gate * as is the custom in the West, if you wish to be answered?'
18*0Sstevel@tonic-gate *
19*0Sstevel@tonic-gate * ...the travellers perceived that the floor was paved with stones of many
20*0Sstevel@tonic-gate * hues; branching runes and strange devices intertwined beneath their feet.
21*0Sstevel@tonic-gate */
22*0Sstevel@tonic-gate
23*0Sstevel@tonic-gate #include "EXTERN.h"
24*0Sstevel@tonic-gate #define PERL_IN_UTF8_C
25*0Sstevel@tonic-gate #include "perl.h"
26*0Sstevel@tonic-gate
27*0Sstevel@tonic-gate static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28*0Sstevel@tonic-gate
29*0Sstevel@tonic-gate /*
30*0Sstevel@tonic-gate =head1 Unicode Support
31*0Sstevel@tonic-gate
32*0Sstevel@tonic-gate =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
33*0Sstevel@tonic-gate
34*0Sstevel@tonic-gate Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
35*0Sstevel@tonic-gate of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36*0Sstevel@tonic-gate bytes available. The return value is the pointer to the byte after the
37*0Sstevel@tonic-gate end of the new character. In other words,
38*0Sstevel@tonic-gate
39*0Sstevel@tonic-gate d = uvuni_to_utf8_flags(d, uv, flags);
40*0Sstevel@tonic-gate
41*0Sstevel@tonic-gate or, in most cases,
42*0Sstevel@tonic-gate
43*0Sstevel@tonic-gate d = uvuni_to_utf8(d, uv);
44*0Sstevel@tonic-gate
45*0Sstevel@tonic-gate (which is equivalent to)
46*0Sstevel@tonic-gate
47*0Sstevel@tonic-gate d = uvuni_to_utf8_flags(d, uv, 0);
48*0Sstevel@tonic-gate
49*0Sstevel@tonic-gate is the recommended Unicode-aware way of saying
50*0Sstevel@tonic-gate
51*0Sstevel@tonic-gate *(d++) = uv;
52*0Sstevel@tonic-gate
53*0Sstevel@tonic-gate =cut
54*0Sstevel@tonic-gate */
55*0Sstevel@tonic-gate
56*0Sstevel@tonic-gate U8 *
Perl_uvuni_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)57*0Sstevel@tonic-gate Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
58*0Sstevel@tonic-gate {
59*0Sstevel@tonic-gate if (ckWARN(WARN_UTF8)) {
60*0Sstevel@tonic-gate if (UNICODE_IS_SURROGATE(uv) &&
61*0Sstevel@tonic-gate !(flags & UNICODE_ALLOW_SURROGATE))
62*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
63*0Sstevel@tonic-gate else if (
64*0Sstevel@tonic-gate ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65*0Sstevel@tonic-gate !(flags & UNICODE_ALLOW_FDD0))
66*0Sstevel@tonic-gate ||
67*0Sstevel@tonic-gate ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
68*0Sstevel@tonic-gate !(flags & UNICODE_ALLOW_FFFF))) &&
69*0Sstevel@tonic-gate /* UNICODE_ALLOW_SUPER includes
70*0Sstevel@tonic-gate * FFFEs and FFFFs beyond 0x10FFFF. */
71*0Sstevel@tonic-gate ((uv <= PERL_UNICODE_MAX) ||
72*0Sstevel@tonic-gate !(flags & UNICODE_ALLOW_SUPER))
73*0Sstevel@tonic-gate )
74*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8),
75*0Sstevel@tonic-gate "Unicode character 0x%04"UVxf" is illegal", uv);
76*0Sstevel@tonic-gate }
77*0Sstevel@tonic-gate if (UNI_IS_INVARIANT(uv)) {
78*0Sstevel@tonic-gate *d++ = (U8)UTF_TO_NATIVE(uv);
79*0Sstevel@tonic-gate return d;
80*0Sstevel@tonic-gate }
81*0Sstevel@tonic-gate #if defined(EBCDIC)
82*0Sstevel@tonic-gate else {
83*0Sstevel@tonic-gate STRLEN len = UNISKIP(uv);
84*0Sstevel@tonic-gate U8 *p = d+len-1;
85*0Sstevel@tonic-gate while (p > d) {
86*0Sstevel@tonic-gate *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
87*0Sstevel@tonic-gate uv >>= UTF_ACCUMULATION_SHIFT;
88*0Sstevel@tonic-gate }
89*0Sstevel@tonic-gate *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
90*0Sstevel@tonic-gate return d+len;
91*0Sstevel@tonic-gate }
92*0Sstevel@tonic-gate #else /* Non loop style */
93*0Sstevel@tonic-gate if (uv < 0x800) {
94*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 6) | 0xc0);
95*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
96*0Sstevel@tonic-gate return d;
97*0Sstevel@tonic-gate }
98*0Sstevel@tonic-gate if (uv < 0x10000) {
99*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 12) | 0xe0);
100*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
101*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
102*0Sstevel@tonic-gate return d;
103*0Sstevel@tonic-gate }
104*0Sstevel@tonic-gate if (uv < 0x200000) {
105*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 18) | 0xf0);
106*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
108*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
109*0Sstevel@tonic-gate return d;
110*0Sstevel@tonic-gate }
111*0Sstevel@tonic-gate if (uv < 0x4000000) {
112*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 24) | 0xf8);
113*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
117*0Sstevel@tonic-gate return d;
118*0Sstevel@tonic-gate }
119*0Sstevel@tonic-gate if (uv < 0x80000000) {
120*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 30) | 0xfc);
121*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
126*0Sstevel@tonic-gate return d;
127*0Sstevel@tonic-gate }
128*0Sstevel@tonic-gate #ifdef HAS_QUAD
129*0Sstevel@tonic-gate if (uv < UTF8_QUAD_MAX)
130*0Sstevel@tonic-gate #endif
131*0Sstevel@tonic-gate {
132*0Sstevel@tonic-gate *d++ = 0xfe; /* Can't match U+FEFF! */
133*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
138*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
139*0Sstevel@tonic-gate return d;
140*0Sstevel@tonic-gate }
141*0Sstevel@tonic-gate #ifdef HAS_QUAD
142*0Sstevel@tonic-gate {
143*0Sstevel@tonic-gate *d++ = 0xff; /* Can't match U+FFFE! */
144*0Sstevel@tonic-gate *d++ = 0x80; /* 6 Reserved bits */
145*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
156*0Sstevel@tonic-gate return d;
157*0Sstevel@tonic-gate }
158*0Sstevel@tonic-gate #endif
159*0Sstevel@tonic-gate #endif /* Loop style */
160*0Sstevel@tonic-gate }
161*0Sstevel@tonic-gate
162*0Sstevel@tonic-gate U8 *
Perl_uvuni_to_utf8(pTHX_ U8 * d,UV uv)163*0Sstevel@tonic-gate Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164*0Sstevel@tonic-gate {
165*0Sstevel@tonic-gate return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166*0Sstevel@tonic-gate }
167*0Sstevel@tonic-gate
168*0Sstevel@tonic-gate
169*0Sstevel@tonic-gate /*
170*0Sstevel@tonic-gate =for apidoc A|STRLEN|is_utf8_char|U8 *s
171*0Sstevel@tonic-gate
172*0Sstevel@tonic-gate Tests if some arbitrary number of bytes begins in a valid UTF-8
173*0Sstevel@tonic-gate character. Note that an INVARIANT (i.e. ASCII) character is a valid
174*0Sstevel@tonic-gate UTF-8 character. The actual number of bytes in the UTF-8 character
175*0Sstevel@tonic-gate will be returned if it is valid, otherwise 0.
176*0Sstevel@tonic-gate
177*0Sstevel@tonic-gate =cut */
178*0Sstevel@tonic-gate STRLEN
Perl_is_utf8_char(pTHX_ U8 * s)179*0Sstevel@tonic-gate Perl_is_utf8_char(pTHX_ U8 *s)
180*0Sstevel@tonic-gate {
181*0Sstevel@tonic-gate U8 u = *s;
182*0Sstevel@tonic-gate STRLEN slen, len;
183*0Sstevel@tonic-gate UV uv, ouv;
184*0Sstevel@tonic-gate
185*0Sstevel@tonic-gate if (UTF8_IS_INVARIANT(u))
186*0Sstevel@tonic-gate return 1;
187*0Sstevel@tonic-gate
188*0Sstevel@tonic-gate if (!UTF8_IS_START(u))
189*0Sstevel@tonic-gate return 0;
190*0Sstevel@tonic-gate
191*0Sstevel@tonic-gate len = UTF8SKIP(s);
192*0Sstevel@tonic-gate
193*0Sstevel@tonic-gate if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
194*0Sstevel@tonic-gate return 0;
195*0Sstevel@tonic-gate
196*0Sstevel@tonic-gate slen = len - 1;
197*0Sstevel@tonic-gate s++;
198*0Sstevel@tonic-gate u &= UTF_START_MASK(len);
199*0Sstevel@tonic-gate uv = u;
200*0Sstevel@tonic-gate ouv = uv;
201*0Sstevel@tonic-gate while (slen--) {
202*0Sstevel@tonic-gate if (!UTF8_IS_CONTINUATION(*s))
203*0Sstevel@tonic-gate return 0;
204*0Sstevel@tonic-gate uv = UTF8_ACCUMULATE(uv, *s);
205*0Sstevel@tonic-gate if (uv < ouv)
206*0Sstevel@tonic-gate return 0;
207*0Sstevel@tonic-gate ouv = uv;
208*0Sstevel@tonic-gate s++;
209*0Sstevel@tonic-gate }
210*0Sstevel@tonic-gate
211*0Sstevel@tonic-gate if ((STRLEN)UNISKIP(uv) < len)
212*0Sstevel@tonic-gate return 0;
213*0Sstevel@tonic-gate
214*0Sstevel@tonic-gate return len;
215*0Sstevel@tonic-gate }
216*0Sstevel@tonic-gate
217*0Sstevel@tonic-gate /*
218*0Sstevel@tonic-gate =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
219*0Sstevel@tonic-gate
220*0Sstevel@tonic-gate Returns true if first C<len> bytes of the given string form a valid
221*0Sstevel@tonic-gate UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
222*0Sstevel@tonic-gate not mean 'a string that contains code points above 0x7F encoded in UTF-8'
223*0Sstevel@tonic-gate because a valid ASCII string is a valid UTF-8 string.
224*0Sstevel@tonic-gate
225*0Sstevel@tonic-gate =cut
226*0Sstevel@tonic-gate */
227*0Sstevel@tonic-gate
228*0Sstevel@tonic-gate bool
Perl_is_utf8_string(pTHX_ U8 * s,STRLEN len)229*0Sstevel@tonic-gate Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230*0Sstevel@tonic-gate {
231*0Sstevel@tonic-gate U8* x = s;
232*0Sstevel@tonic-gate U8* send;
233*0Sstevel@tonic-gate STRLEN c;
234*0Sstevel@tonic-gate
235*0Sstevel@tonic-gate if (!len)
236*0Sstevel@tonic-gate len = strlen((char *)s);
237*0Sstevel@tonic-gate send = s + len;
238*0Sstevel@tonic-gate
239*0Sstevel@tonic-gate while (x < send) {
240*0Sstevel@tonic-gate /* Inline the easy bits of is_utf8_char() here for speed... */
241*0Sstevel@tonic-gate if (UTF8_IS_INVARIANT(*x))
242*0Sstevel@tonic-gate c = 1;
243*0Sstevel@tonic-gate else if (!UTF8_IS_START(*x))
244*0Sstevel@tonic-gate return FALSE;
245*0Sstevel@tonic-gate else {
246*0Sstevel@tonic-gate /* ... and call is_utf8_char() only if really needed. */
247*0Sstevel@tonic-gate c = is_utf8_char(x);
248*0Sstevel@tonic-gate if (!c)
249*0Sstevel@tonic-gate return FALSE;
250*0Sstevel@tonic-gate }
251*0Sstevel@tonic-gate x += c;
252*0Sstevel@tonic-gate }
253*0Sstevel@tonic-gate if (x != send)
254*0Sstevel@tonic-gate return FALSE;
255*0Sstevel@tonic-gate
256*0Sstevel@tonic-gate return TRUE;
257*0Sstevel@tonic-gate }
258*0Sstevel@tonic-gate
259*0Sstevel@tonic-gate /*
260*0Sstevel@tonic-gate =for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
261*0Sstevel@tonic-gate
262*0Sstevel@tonic-gate Like is_ut8_string but store the location of the failure in
263*0Sstevel@tonic-gate the last argument.
264*0Sstevel@tonic-gate
265*0Sstevel@tonic-gate =cut
266*0Sstevel@tonic-gate */
267*0Sstevel@tonic-gate
268*0Sstevel@tonic-gate bool
Perl_is_utf8_string_loc(pTHX_ U8 * s,STRLEN len,U8 ** p)269*0Sstevel@tonic-gate Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
270*0Sstevel@tonic-gate {
271*0Sstevel@tonic-gate U8* x = s;
272*0Sstevel@tonic-gate U8* send;
273*0Sstevel@tonic-gate STRLEN c;
274*0Sstevel@tonic-gate
275*0Sstevel@tonic-gate if (!len)
276*0Sstevel@tonic-gate len = strlen((char *)s);
277*0Sstevel@tonic-gate send = s + len;
278*0Sstevel@tonic-gate
279*0Sstevel@tonic-gate while (x < send) {
280*0Sstevel@tonic-gate /* Inline the easy bits of is_utf8_char() here for speed... */
281*0Sstevel@tonic-gate if (UTF8_IS_INVARIANT(*x))
282*0Sstevel@tonic-gate c = 1;
283*0Sstevel@tonic-gate else if (!UTF8_IS_START(*x)) {
284*0Sstevel@tonic-gate if (p)
285*0Sstevel@tonic-gate *p = x;
286*0Sstevel@tonic-gate return FALSE;
287*0Sstevel@tonic-gate }
288*0Sstevel@tonic-gate else {
289*0Sstevel@tonic-gate /* ... and call is_utf8_char() only if really needed. */
290*0Sstevel@tonic-gate c = is_utf8_char(x);
291*0Sstevel@tonic-gate if (!c) {
292*0Sstevel@tonic-gate if (p)
293*0Sstevel@tonic-gate *p = x;
294*0Sstevel@tonic-gate return FALSE;
295*0Sstevel@tonic-gate }
296*0Sstevel@tonic-gate }
297*0Sstevel@tonic-gate x += c;
298*0Sstevel@tonic-gate }
299*0Sstevel@tonic-gate if (x != send) {
300*0Sstevel@tonic-gate if (p)
301*0Sstevel@tonic-gate *p = x;
302*0Sstevel@tonic-gate return FALSE;
303*0Sstevel@tonic-gate }
304*0Sstevel@tonic-gate
305*0Sstevel@tonic-gate return TRUE;
306*0Sstevel@tonic-gate }
307*0Sstevel@tonic-gate
308*0Sstevel@tonic-gate /*
309*0Sstevel@tonic-gate =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
310*0Sstevel@tonic-gate
311*0Sstevel@tonic-gate Bottom level UTF-8 decode routine.
312*0Sstevel@tonic-gate Returns the unicode code point value of the first character in the string C<s>
313*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
314*0Sstevel@tonic-gate C<retlen> will be set to the length, in bytes, of that character.
315*0Sstevel@tonic-gate
316*0Sstevel@tonic-gate If C<s> does not point to a well-formed UTF-8 character, the behaviour
317*0Sstevel@tonic-gate is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
318*0Sstevel@tonic-gate it is assumed that the caller will raise a warning, and this function
319*0Sstevel@tonic-gate will silently just set C<retlen> to C<-1> and return zero. If the
320*0Sstevel@tonic-gate C<flags> does not contain UTF8_CHECK_ONLY, warnings about
321*0Sstevel@tonic-gate malformations will be given, C<retlen> will be set to the expected
322*0Sstevel@tonic-gate length of the UTF-8 character in bytes, and zero will be returned.
323*0Sstevel@tonic-gate
324*0Sstevel@tonic-gate The C<flags> can also contain various flags to allow deviations from
325*0Sstevel@tonic-gate the strict UTF-8 encoding (see F<utf8.h>).
326*0Sstevel@tonic-gate
327*0Sstevel@tonic-gate Most code should use utf8_to_uvchr() rather than call this directly.
328*0Sstevel@tonic-gate
329*0Sstevel@tonic-gate =cut
330*0Sstevel@tonic-gate */
331*0Sstevel@tonic-gate
332*0Sstevel@tonic-gate UV
Perl_utf8n_to_uvuni(pTHX_ U8 * s,STRLEN curlen,STRLEN * retlen,U32 flags)333*0Sstevel@tonic-gate Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
334*0Sstevel@tonic-gate {
335*0Sstevel@tonic-gate U8 *s0 = s;
336*0Sstevel@tonic-gate UV uv = *s, ouv = 0;
337*0Sstevel@tonic-gate STRLEN len = 1;
338*0Sstevel@tonic-gate bool dowarn = ckWARN_d(WARN_UTF8);
339*0Sstevel@tonic-gate UV startbyte = *s;
340*0Sstevel@tonic-gate STRLEN expectlen = 0;
341*0Sstevel@tonic-gate U32 warning = 0;
342*0Sstevel@tonic-gate
343*0Sstevel@tonic-gate /* This list is a superset of the UTF8_ALLOW_XXX. */
344*0Sstevel@tonic-gate
345*0Sstevel@tonic-gate #define UTF8_WARN_EMPTY 1
346*0Sstevel@tonic-gate #define UTF8_WARN_CONTINUATION 2
347*0Sstevel@tonic-gate #define UTF8_WARN_NON_CONTINUATION 3
348*0Sstevel@tonic-gate #define UTF8_WARN_FE_FF 4
349*0Sstevel@tonic-gate #define UTF8_WARN_SHORT 5
350*0Sstevel@tonic-gate #define UTF8_WARN_OVERFLOW 6
351*0Sstevel@tonic-gate #define UTF8_WARN_SURROGATE 7
352*0Sstevel@tonic-gate #define UTF8_WARN_LONG 8
353*0Sstevel@tonic-gate #define UTF8_WARN_FFFF 9 /* Also FFFE. */
354*0Sstevel@tonic-gate
355*0Sstevel@tonic-gate if (curlen == 0 &&
356*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_EMPTY)) {
357*0Sstevel@tonic-gate warning = UTF8_WARN_EMPTY;
358*0Sstevel@tonic-gate goto malformed;
359*0Sstevel@tonic-gate }
360*0Sstevel@tonic-gate
361*0Sstevel@tonic-gate if (UTF8_IS_INVARIANT(uv)) {
362*0Sstevel@tonic-gate if (retlen)
363*0Sstevel@tonic-gate *retlen = 1;
364*0Sstevel@tonic-gate return (UV) (NATIVE_TO_UTF(*s));
365*0Sstevel@tonic-gate }
366*0Sstevel@tonic-gate
367*0Sstevel@tonic-gate if (UTF8_IS_CONTINUATION(uv) &&
368*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_CONTINUATION)) {
369*0Sstevel@tonic-gate warning = UTF8_WARN_CONTINUATION;
370*0Sstevel@tonic-gate goto malformed;
371*0Sstevel@tonic-gate }
372*0Sstevel@tonic-gate
373*0Sstevel@tonic-gate if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
374*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
375*0Sstevel@tonic-gate warning = UTF8_WARN_NON_CONTINUATION;
376*0Sstevel@tonic-gate goto malformed;
377*0Sstevel@tonic-gate }
378*0Sstevel@tonic-gate
379*0Sstevel@tonic-gate #ifdef EBCDIC
380*0Sstevel@tonic-gate uv = NATIVE_TO_UTF(uv);
381*0Sstevel@tonic-gate #else
382*0Sstevel@tonic-gate if ((uv == 0xfe || uv == 0xff) &&
383*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_FE_FF)) {
384*0Sstevel@tonic-gate warning = UTF8_WARN_FE_FF;
385*0Sstevel@tonic-gate goto malformed;
386*0Sstevel@tonic-gate }
387*0Sstevel@tonic-gate #endif
388*0Sstevel@tonic-gate
389*0Sstevel@tonic-gate if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
390*0Sstevel@tonic-gate else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
391*0Sstevel@tonic-gate else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
392*0Sstevel@tonic-gate else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
393*0Sstevel@tonic-gate #ifdef EBCDIC
394*0Sstevel@tonic-gate else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
395*0Sstevel@tonic-gate else { len = 7; uv &= 0x01; }
396*0Sstevel@tonic-gate #else
397*0Sstevel@tonic-gate else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
398*0Sstevel@tonic-gate else if (!(uv & 0x01)) { len = 7; uv = 0; }
399*0Sstevel@tonic-gate else { len = 13; uv = 0; } /* whoa! */
400*0Sstevel@tonic-gate #endif
401*0Sstevel@tonic-gate
402*0Sstevel@tonic-gate if (retlen)
403*0Sstevel@tonic-gate *retlen = len;
404*0Sstevel@tonic-gate
405*0Sstevel@tonic-gate expectlen = len;
406*0Sstevel@tonic-gate
407*0Sstevel@tonic-gate if ((curlen < expectlen) &&
408*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_SHORT)) {
409*0Sstevel@tonic-gate warning = UTF8_WARN_SHORT;
410*0Sstevel@tonic-gate goto malformed;
411*0Sstevel@tonic-gate }
412*0Sstevel@tonic-gate
413*0Sstevel@tonic-gate len--;
414*0Sstevel@tonic-gate s++;
415*0Sstevel@tonic-gate ouv = uv;
416*0Sstevel@tonic-gate
417*0Sstevel@tonic-gate while (len--) {
418*0Sstevel@tonic-gate if (!UTF8_IS_CONTINUATION(*s) &&
419*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
420*0Sstevel@tonic-gate s--;
421*0Sstevel@tonic-gate warning = UTF8_WARN_NON_CONTINUATION;
422*0Sstevel@tonic-gate goto malformed;
423*0Sstevel@tonic-gate }
424*0Sstevel@tonic-gate else
425*0Sstevel@tonic-gate uv = UTF8_ACCUMULATE(uv, *s);
426*0Sstevel@tonic-gate if (!(uv > ouv)) {
427*0Sstevel@tonic-gate /* These cannot be allowed. */
428*0Sstevel@tonic-gate if (uv == ouv) {
429*0Sstevel@tonic-gate if (!(flags & UTF8_ALLOW_LONG)) {
430*0Sstevel@tonic-gate warning = UTF8_WARN_LONG;
431*0Sstevel@tonic-gate goto malformed;
432*0Sstevel@tonic-gate }
433*0Sstevel@tonic-gate }
434*0Sstevel@tonic-gate else { /* uv < ouv */
435*0Sstevel@tonic-gate /* This cannot be allowed. */
436*0Sstevel@tonic-gate warning = UTF8_WARN_OVERFLOW;
437*0Sstevel@tonic-gate goto malformed;
438*0Sstevel@tonic-gate }
439*0Sstevel@tonic-gate }
440*0Sstevel@tonic-gate s++;
441*0Sstevel@tonic-gate ouv = uv;
442*0Sstevel@tonic-gate }
443*0Sstevel@tonic-gate
444*0Sstevel@tonic-gate if (UNICODE_IS_SURROGATE(uv) &&
445*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_SURROGATE)) {
446*0Sstevel@tonic-gate warning = UTF8_WARN_SURROGATE;
447*0Sstevel@tonic-gate goto malformed;
448*0Sstevel@tonic-gate } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
449*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_LONG)) {
450*0Sstevel@tonic-gate warning = UTF8_WARN_LONG;
451*0Sstevel@tonic-gate goto malformed;
452*0Sstevel@tonic-gate } else if (UNICODE_IS_ILLEGAL(uv) &&
453*0Sstevel@tonic-gate !(flags & UTF8_ALLOW_FFFF)) {
454*0Sstevel@tonic-gate warning = UTF8_WARN_FFFF;
455*0Sstevel@tonic-gate goto malformed;
456*0Sstevel@tonic-gate }
457*0Sstevel@tonic-gate
458*0Sstevel@tonic-gate return uv;
459*0Sstevel@tonic-gate
460*0Sstevel@tonic-gate malformed:
461*0Sstevel@tonic-gate
462*0Sstevel@tonic-gate if (flags & UTF8_CHECK_ONLY) {
463*0Sstevel@tonic-gate if (retlen)
464*0Sstevel@tonic-gate *retlen = -1;
465*0Sstevel@tonic-gate return 0;
466*0Sstevel@tonic-gate }
467*0Sstevel@tonic-gate
468*0Sstevel@tonic-gate if (dowarn) {
469*0Sstevel@tonic-gate SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
470*0Sstevel@tonic-gate
471*0Sstevel@tonic-gate switch (warning) {
472*0Sstevel@tonic-gate case 0: /* Intentionally empty. */ break;
473*0Sstevel@tonic-gate case UTF8_WARN_EMPTY:
474*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(empty string)");
475*0Sstevel@tonic-gate break;
476*0Sstevel@tonic-gate case UTF8_WARN_CONTINUATION:
477*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
478*0Sstevel@tonic-gate break;
479*0Sstevel@tonic-gate case UTF8_WARN_NON_CONTINUATION:
480*0Sstevel@tonic-gate if (s == s0)
481*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
482*0Sstevel@tonic-gate (UV)s[1], startbyte);
483*0Sstevel@tonic-gate else
484*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
485*0Sstevel@tonic-gate (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
486*0Sstevel@tonic-gate
487*0Sstevel@tonic-gate break;
488*0Sstevel@tonic-gate case UTF8_WARN_FE_FF:
489*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
490*0Sstevel@tonic-gate break;
491*0Sstevel@tonic-gate case UTF8_WARN_SHORT:
492*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
493*0Sstevel@tonic-gate curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
494*0Sstevel@tonic-gate expectlen = curlen; /* distance for caller to skip */
495*0Sstevel@tonic-gate break;
496*0Sstevel@tonic-gate case UTF8_WARN_OVERFLOW:
497*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
498*0Sstevel@tonic-gate ouv, *s, startbyte);
499*0Sstevel@tonic-gate break;
500*0Sstevel@tonic-gate case UTF8_WARN_SURROGATE:
501*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
502*0Sstevel@tonic-gate break;
503*0Sstevel@tonic-gate case UTF8_WARN_LONG:
504*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
505*0Sstevel@tonic-gate expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
506*0Sstevel@tonic-gate break;
507*0Sstevel@tonic-gate case UTF8_WARN_FFFF:
508*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
509*0Sstevel@tonic-gate break;
510*0Sstevel@tonic-gate default:
511*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
512*0Sstevel@tonic-gate break;
513*0Sstevel@tonic-gate }
514*0Sstevel@tonic-gate
515*0Sstevel@tonic-gate if (warning) {
516*0Sstevel@tonic-gate char *s = SvPVX(sv);
517*0Sstevel@tonic-gate
518*0Sstevel@tonic-gate if (PL_op)
519*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8),
520*0Sstevel@tonic-gate "%s in %s", s, OP_DESC(PL_op));
521*0Sstevel@tonic-gate else
522*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
523*0Sstevel@tonic-gate }
524*0Sstevel@tonic-gate }
525*0Sstevel@tonic-gate
526*0Sstevel@tonic-gate if (retlen)
527*0Sstevel@tonic-gate *retlen = expectlen ? expectlen : len;
528*0Sstevel@tonic-gate
529*0Sstevel@tonic-gate return 0;
530*0Sstevel@tonic-gate }
531*0Sstevel@tonic-gate
532*0Sstevel@tonic-gate /*
533*0Sstevel@tonic-gate =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
534*0Sstevel@tonic-gate
535*0Sstevel@tonic-gate Returns the native character value of the first character in the string C<s>
536*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
537*0Sstevel@tonic-gate length, in bytes, of that character.
538*0Sstevel@tonic-gate
539*0Sstevel@tonic-gate If C<s> does not point to a well-formed UTF-8 character, zero is
540*0Sstevel@tonic-gate returned and retlen is set, if possible, to -1.
541*0Sstevel@tonic-gate
542*0Sstevel@tonic-gate =cut
543*0Sstevel@tonic-gate */
544*0Sstevel@tonic-gate
545*0Sstevel@tonic-gate UV
Perl_utf8_to_uvchr(pTHX_ U8 * s,STRLEN * retlen)546*0Sstevel@tonic-gate Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
547*0Sstevel@tonic-gate {
548*0Sstevel@tonic-gate return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
549*0Sstevel@tonic-gate ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
550*0Sstevel@tonic-gate }
551*0Sstevel@tonic-gate
552*0Sstevel@tonic-gate /*
553*0Sstevel@tonic-gate =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
554*0Sstevel@tonic-gate
555*0Sstevel@tonic-gate Returns the Unicode code point of the first character in the string C<s>
556*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
557*0Sstevel@tonic-gate length, in bytes, of that character.
558*0Sstevel@tonic-gate
559*0Sstevel@tonic-gate This function should only be used when returned UV is considered
560*0Sstevel@tonic-gate an index into the Unicode semantic tables (e.g. swashes).
561*0Sstevel@tonic-gate
562*0Sstevel@tonic-gate If C<s> does not point to a well-formed UTF-8 character, zero is
563*0Sstevel@tonic-gate returned and retlen is set, if possible, to -1.
564*0Sstevel@tonic-gate
565*0Sstevel@tonic-gate =cut
566*0Sstevel@tonic-gate */
567*0Sstevel@tonic-gate
568*0Sstevel@tonic-gate UV
Perl_utf8_to_uvuni(pTHX_ U8 * s,STRLEN * retlen)569*0Sstevel@tonic-gate Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
570*0Sstevel@tonic-gate {
571*0Sstevel@tonic-gate /* Call the low level routine asking for checks */
572*0Sstevel@tonic-gate return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
573*0Sstevel@tonic-gate ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
574*0Sstevel@tonic-gate }
575*0Sstevel@tonic-gate
576*0Sstevel@tonic-gate /*
577*0Sstevel@tonic-gate =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
578*0Sstevel@tonic-gate
579*0Sstevel@tonic-gate Return the length of the UTF-8 char encoded string C<s> in characters.
580*0Sstevel@tonic-gate Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
581*0Sstevel@tonic-gate up past C<e>, croaks.
582*0Sstevel@tonic-gate
583*0Sstevel@tonic-gate =cut
584*0Sstevel@tonic-gate */
585*0Sstevel@tonic-gate
586*0Sstevel@tonic-gate STRLEN
Perl_utf8_length(pTHX_ U8 * s,U8 * e)587*0Sstevel@tonic-gate Perl_utf8_length(pTHX_ U8 *s, U8 *e)
588*0Sstevel@tonic-gate {
589*0Sstevel@tonic-gate STRLEN len = 0;
590*0Sstevel@tonic-gate
591*0Sstevel@tonic-gate /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
592*0Sstevel@tonic-gate * the bitops (especially ~) can create illegal UTF-8.
593*0Sstevel@tonic-gate * In other words: in Perl UTF-8 is not just for Unicode. */
594*0Sstevel@tonic-gate
595*0Sstevel@tonic-gate if (e < s) {
596*0Sstevel@tonic-gate if (ckWARN_d(WARN_UTF8)) {
597*0Sstevel@tonic-gate if (PL_op)
598*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8),
599*0Sstevel@tonic-gate "%s in %s", unees, OP_DESC(PL_op));
600*0Sstevel@tonic-gate else
601*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
602*0Sstevel@tonic-gate }
603*0Sstevel@tonic-gate return 0;
604*0Sstevel@tonic-gate }
605*0Sstevel@tonic-gate while (s < e) {
606*0Sstevel@tonic-gate U8 t = UTF8SKIP(s);
607*0Sstevel@tonic-gate
608*0Sstevel@tonic-gate if (e - s < t) {
609*0Sstevel@tonic-gate if (ckWARN_d(WARN_UTF8)) {
610*0Sstevel@tonic-gate if (PL_op)
611*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8),
612*0Sstevel@tonic-gate unees, OP_DESC(PL_op));
613*0Sstevel@tonic-gate else
614*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
615*0Sstevel@tonic-gate }
616*0Sstevel@tonic-gate return len;
617*0Sstevel@tonic-gate }
618*0Sstevel@tonic-gate s += t;
619*0Sstevel@tonic-gate len++;
620*0Sstevel@tonic-gate }
621*0Sstevel@tonic-gate
622*0Sstevel@tonic-gate return len;
623*0Sstevel@tonic-gate }
624*0Sstevel@tonic-gate
625*0Sstevel@tonic-gate /*
626*0Sstevel@tonic-gate =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
627*0Sstevel@tonic-gate
628*0Sstevel@tonic-gate Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
629*0Sstevel@tonic-gate and C<b>.
630*0Sstevel@tonic-gate
631*0Sstevel@tonic-gate WARNING: use only if you *know* that the pointers point inside the
632*0Sstevel@tonic-gate same UTF-8 buffer.
633*0Sstevel@tonic-gate
634*0Sstevel@tonic-gate =cut
635*0Sstevel@tonic-gate */
636*0Sstevel@tonic-gate
637*0Sstevel@tonic-gate IV
Perl_utf8_distance(pTHX_ U8 * a,U8 * b)638*0Sstevel@tonic-gate Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
639*0Sstevel@tonic-gate {
640*0Sstevel@tonic-gate IV off = 0;
641*0Sstevel@tonic-gate
642*0Sstevel@tonic-gate /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
643*0Sstevel@tonic-gate * the bitops (especially ~) can create illegal UTF-8.
644*0Sstevel@tonic-gate * In other words: in Perl UTF-8 is not just for Unicode. */
645*0Sstevel@tonic-gate
646*0Sstevel@tonic-gate if (a < b) {
647*0Sstevel@tonic-gate while (a < b) {
648*0Sstevel@tonic-gate U8 c = UTF8SKIP(a);
649*0Sstevel@tonic-gate
650*0Sstevel@tonic-gate if (b - a < c) {
651*0Sstevel@tonic-gate if (ckWARN_d(WARN_UTF8)) {
652*0Sstevel@tonic-gate if (PL_op)
653*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8),
654*0Sstevel@tonic-gate "%s in %s", unees, OP_DESC(PL_op));
655*0Sstevel@tonic-gate else
656*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
657*0Sstevel@tonic-gate }
658*0Sstevel@tonic-gate return off;
659*0Sstevel@tonic-gate }
660*0Sstevel@tonic-gate a += c;
661*0Sstevel@tonic-gate off--;
662*0Sstevel@tonic-gate }
663*0Sstevel@tonic-gate }
664*0Sstevel@tonic-gate else {
665*0Sstevel@tonic-gate while (b < a) {
666*0Sstevel@tonic-gate U8 c = UTF8SKIP(b);
667*0Sstevel@tonic-gate
668*0Sstevel@tonic-gate if (a - b < c) {
669*0Sstevel@tonic-gate if (ckWARN_d(WARN_UTF8)) {
670*0Sstevel@tonic-gate if (PL_op)
671*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8),
672*0Sstevel@tonic-gate "%s in %s", unees, OP_DESC(PL_op));
673*0Sstevel@tonic-gate else
674*0Sstevel@tonic-gate Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
675*0Sstevel@tonic-gate }
676*0Sstevel@tonic-gate return off;
677*0Sstevel@tonic-gate }
678*0Sstevel@tonic-gate b += c;
679*0Sstevel@tonic-gate off++;
680*0Sstevel@tonic-gate }
681*0Sstevel@tonic-gate }
682*0Sstevel@tonic-gate
683*0Sstevel@tonic-gate return off;
684*0Sstevel@tonic-gate }
685*0Sstevel@tonic-gate
686*0Sstevel@tonic-gate /*
687*0Sstevel@tonic-gate =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
688*0Sstevel@tonic-gate
689*0Sstevel@tonic-gate Return the UTF-8 pointer C<s> displaced by C<off> characters, either
690*0Sstevel@tonic-gate forward or backward.
691*0Sstevel@tonic-gate
692*0Sstevel@tonic-gate WARNING: do not use the following unless you *know* C<off> is within
693*0Sstevel@tonic-gate the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
694*0Sstevel@tonic-gate on the first byte of character or just after the last byte of a character.
695*0Sstevel@tonic-gate
696*0Sstevel@tonic-gate =cut
697*0Sstevel@tonic-gate */
698*0Sstevel@tonic-gate
699*0Sstevel@tonic-gate U8 *
Perl_utf8_hop(pTHX_ U8 * s,I32 off)700*0Sstevel@tonic-gate Perl_utf8_hop(pTHX_ U8 *s, I32 off)
701*0Sstevel@tonic-gate {
702*0Sstevel@tonic-gate /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
703*0Sstevel@tonic-gate * the bitops (especially ~) can create illegal UTF-8.
704*0Sstevel@tonic-gate * In other words: in Perl UTF-8 is not just for Unicode. */
705*0Sstevel@tonic-gate
706*0Sstevel@tonic-gate if (off >= 0) {
707*0Sstevel@tonic-gate while (off--)
708*0Sstevel@tonic-gate s += UTF8SKIP(s);
709*0Sstevel@tonic-gate }
710*0Sstevel@tonic-gate else {
711*0Sstevel@tonic-gate while (off++) {
712*0Sstevel@tonic-gate s--;
713*0Sstevel@tonic-gate while (UTF8_IS_CONTINUATION(*s))
714*0Sstevel@tonic-gate s--;
715*0Sstevel@tonic-gate }
716*0Sstevel@tonic-gate }
717*0Sstevel@tonic-gate return s;
718*0Sstevel@tonic-gate }
719*0Sstevel@tonic-gate
720*0Sstevel@tonic-gate /*
721*0Sstevel@tonic-gate =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
722*0Sstevel@tonic-gate
723*0Sstevel@tonic-gate Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
724*0Sstevel@tonic-gate Unlike C<bytes_to_utf8>, this over-writes the original string, and
725*0Sstevel@tonic-gate updates len to contain the new length.
726*0Sstevel@tonic-gate Returns zero on failure, setting C<len> to -1.
727*0Sstevel@tonic-gate
728*0Sstevel@tonic-gate =cut
729*0Sstevel@tonic-gate */
730*0Sstevel@tonic-gate
731*0Sstevel@tonic-gate U8 *
Perl_utf8_to_bytes(pTHX_ U8 * s,STRLEN * len)732*0Sstevel@tonic-gate Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
733*0Sstevel@tonic-gate {
734*0Sstevel@tonic-gate U8 *send;
735*0Sstevel@tonic-gate U8 *d;
736*0Sstevel@tonic-gate U8 *save = s;
737*0Sstevel@tonic-gate
738*0Sstevel@tonic-gate /* ensure valid UTF-8 and chars < 256 before updating string */
739*0Sstevel@tonic-gate for (send = s + *len; s < send; ) {
740*0Sstevel@tonic-gate U8 c = *s++;
741*0Sstevel@tonic-gate
742*0Sstevel@tonic-gate if (!UTF8_IS_INVARIANT(c) &&
743*0Sstevel@tonic-gate (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
744*0Sstevel@tonic-gate || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
745*0Sstevel@tonic-gate *len = -1;
746*0Sstevel@tonic-gate return 0;
747*0Sstevel@tonic-gate }
748*0Sstevel@tonic-gate }
749*0Sstevel@tonic-gate
750*0Sstevel@tonic-gate d = s = save;
751*0Sstevel@tonic-gate while (s < send) {
752*0Sstevel@tonic-gate STRLEN ulen;
753*0Sstevel@tonic-gate *d++ = (U8)utf8_to_uvchr(s, &ulen);
754*0Sstevel@tonic-gate s += ulen;
755*0Sstevel@tonic-gate }
756*0Sstevel@tonic-gate *d = '\0';
757*0Sstevel@tonic-gate *len = d - save;
758*0Sstevel@tonic-gate return save;
759*0Sstevel@tonic-gate }
760*0Sstevel@tonic-gate
761*0Sstevel@tonic-gate /*
762*0Sstevel@tonic-gate =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
763*0Sstevel@tonic-gate
764*0Sstevel@tonic-gate Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
765*0Sstevel@tonic-gate Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
766*0Sstevel@tonic-gate the newly-created string, and updates C<len> to contain the new
767*0Sstevel@tonic-gate length. Returns the original string if no conversion occurs, C<len>
768*0Sstevel@tonic-gate is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
769*0Sstevel@tonic-gate 0 if C<s> is converted or contains all 7bit characters.
770*0Sstevel@tonic-gate
771*0Sstevel@tonic-gate =cut
772*0Sstevel@tonic-gate */
773*0Sstevel@tonic-gate
774*0Sstevel@tonic-gate U8 *
Perl_bytes_from_utf8(pTHX_ U8 * s,STRLEN * len,bool * is_utf8)775*0Sstevel@tonic-gate Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
776*0Sstevel@tonic-gate {
777*0Sstevel@tonic-gate U8 *d;
778*0Sstevel@tonic-gate U8 *start = s;
779*0Sstevel@tonic-gate U8 *send;
780*0Sstevel@tonic-gate I32 count = 0;
781*0Sstevel@tonic-gate
782*0Sstevel@tonic-gate if (!*is_utf8)
783*0Sstevel@tonic-gate return start;
784*0Sstevel@tonic-gate
785*0Sstevel@tonic-gate /* ensure valid UTF-8 and chars < 256 before converting string */
786*0Sstevel@tonic-gate for (send = s + *len; s < send;) {
787*0Sstevel@tonic-gate U8 c = *s++;
788*0Sstevel@tonic-gate if (!UTF8_IS_INVARIANT(c)) {
789*0Sstevel@tonic-gate if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
790*0Sstevel@tonic-gate (c = *s++) && UTF8_IS_CONTINUATION(c))
791*0Sstevel@tonic-gate count++;
792*0Sstevel@tonic-gate else
793*0Sstevel@tonic-gate return start;
794*0Sstevel@tonic-gate }
795*0Sstevel@tonic-gate }
796*0Sstevel@tonic-gate
797*0Sstevel@tonic-gate *is_utf8 = 0;
798*0Sstevel@tonic-gate
799*0Sstevel@tonic-gate Newz(801, d, (*len) - count + 1, U8);
800*0Sstevel@tonic-gate s = start; start = d;
801*0Sstevel@tonic-gate while (s < send) {
802*0Sstevel@tonic-gate U8 c = *s++;
803*0Sstevel@tonic-gate if (!UTF8_IS_INVARIANT(c)) {
804*0Sstevel@tonic-gate /* Then it is two-byte encoded */
805*0Sstevel@tonic-gate c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
806*0Sstevel@tonic-gate c = ASCII_TO_NATIVE(c);
807*0Sstevel@tonic-gate }
808*0Sstevel@tonic-gate *d++ = c;
809*0Sstevel@tonic-gate }
810*0Sstevel@tonic-gate *d = '\0';
811*0Sstevel@tonic-gate *len = d - start;
812*0Sstevel@tonic-gate return start;
813*0Sstevel@tonic-gate }
814*0Sstevel@tonic-gate
815*0Sstevel@tonic-gate /*
816*0Sstevel@tonic-gate =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
817*0Sstevel@tonic-gate
818*0Sstevel@tonic-gate Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
819*0Sstevel@tonic-gate Returns a pointer to the newly-created string, and sets C<len> to
820*0Sstevel@tonic-gate reflect the new length.
821*0Sstevel@tonic-gate
822*0Sstevel@tonic-gate If you want to convert to UTF-8 from other encodings than ASCII,
823*0Sstevel@tonic-gate see sv_recode_to_utf8().
824*0Sstevel@tonic-gate
825*0Sstevel@tonic-gate =cut
826*0Sstevel@tonic-gate */
827*0Sstevel@tonic-gate
828*0Sstevel@tonic-gate U8*
Perl_bytes_to_utf8(pTHX_ U8 * s,STRLEN * len)829*0Sstevel@tonic-gate Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
830*0Sstevel@tonic-gate {
831*0Sstevel@tonic-gate U8 *send;
832*0Sstevel@tonic-gate U8 *d;
833*0Sstevel@tonic-gate U8 *dst;
834*0Sstevel@tonic-gate send = s + (*len);
835*0Sstevel@tonic-gate
836*0Sstevel@tonic-gate Newz(801, d, (*len) * 2 + 1, U8);
837*0Sstevel@tonic-gate dst = d;
838*0Sstevel@tonic-gate
839*0Sstevel@tonic-gate while (s < send) {
840*0Sstevel@tonic-gate UV uv = NATIVE_TO_ASCII(*s++);
841*0Sstevel@tonic-gate if (UNI_IS_INVARIANT(uv))
842*0Sstevel@tonic-gate *d++ = (U8)UTF_TO_NATIVE(uv);
843*0Sstevel@tonic-gate else {
844*0Sstevel@tonic-gate *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
845*0Sstevel@tonic-gate *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
846*0Sstevel@tonic-gate }
847*0Sstevel@tonic-gate }
848*0Sstevel@tonic-gate *d = '\0';
849*0Sstevel@tonic-gate *len = d-dst;
850*0Sstevel@tonic-gate return dst;
851*0Sstevel@tonic-gate }
852*0Sstevel@tonic-gate
853*0Sstevel@tonic-gate /*
854*0Sstevel@tonic-gate * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
855*0Sstevel@tonic-gate *
856*0Sstevel@tonic-gate * Destination must be pre-extended to 3/2 source. Do not use in-place.
857*0Sstevel@tonic-gate * We optimize for native, for obvious reasons. */
858*0Sstevel@tonic-gate
859*0Sstevel@tonic-gate U8*
Perl_utf16_to_utf8(pTHX_ U8 * p,U8 * d,I32 bytelen,I32 * newlen)860*0Sstevel@tonic-gate Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
861*0Sstevel@tonic-gate {
862*0Sstevel@tonic-gate U8* pend;
863*0Sstevel@tonic-gate U8* dstart = d;
864*0Sstevel@tonic-gate
865*0Sstevel@tonic-gate if (bytelen & 1)
866*0Sstevel@tonic-gate Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
867*0Sstevel@tonic-gate
868*0Sstevel@tonic-gate pend = p + bytelen;
869*0Sstevel@tonic-gate
870*0Sstevel@tonic-gate while (p < pend) {
871*0Sstevel@tonic-gate UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
872*0Sstevel@tonic-gate p += 2;
873*0Sstevel@tonic-gate if (uv < 0x80) {
874*0Sstevel@tonic-gate *d++ = (U8)uv;
875*0Sstevel@tonic-gate continue;
876*0Sstevel@tonic-gate }
877*0Sstevel@tonic-gate if (uv < 0x800) {
878*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 6) | 0xc0);
879*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
880*0Sstevel@tonic-gate continue;
881*0Sstevel@tonic-gate }
882*0Sstevel@tonic-gate if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
883*0Sstevel@tonic-gate UV low = (p[0] << 8) + p[1];
884*0Sstevel@tonic-gate p += 2;
885*0Sstevel@tonic-gate if (low < 0xdc00 || low >= 0xdfff)
886*0Sstevel@tonic-gate Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
887*0Sstevel@tonic-gate uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
888*0Sstevel@tonic-gate }
889*0Sstevel@tonic-gate if (uv < 0x10000) {
890*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 12) | 0xe0);
891*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
892*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
893*0Sstevel@tonic-gate continue;
894*0Sstevel@tonic-gate }
895*0Sstevel@tonic-gate else {
896*0Sstevel@tonic-gate *d++ = (U8)(( uv >> 18) | 0xf0);
897*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
898*0Sstevel@tonic-gate *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
899*0Sstevel@tonic-gate *d++ = (U8)(( uv & 0x3f) | 0x80);
900*0Sstevel@tonic-gate continue;
901*0Sstevel@tonic-gate }
902*0Sstevel@tonic-gate }
903*0Sstevel@tonic-gate *newlen = d - dstart;
904*0Sstevel@tonic-gate return d;
905*0Sstevel@tonic-gate }
906*0Sstevel@tonic-gate
907*0Sstevel@tonic-gate /* Note: this one is slightly destructive of the source. */
908*0Sstevel@tonic-gate
909*0Sstevel@tonic-gate U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8 * p,U8 * d,I32 bytelen,I32 * newlen)910*0Sstevel@tonic-gate Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
911*0Sstevel@tonic-gate {
912*0Sstevel@tonic-gate U8* s = (U8*)p;
913*0Sstevel@tonic-gate U8* send = s + bytelen;
914*0Sstevel@tonic-gate while (s < send) {
915*0Sstevel@tonic-gate U8 tmp = s[0];
916*0Sstevel@tonic-gate s[0] = s[1];
917*0Sstevel@tonic-gate s[1] = tmp;
918*0Sstevel@tonic-gate s += 2;
919*0Sstevel@tonic-gate }
920*0Sstevel@tonic-gate return utf16_to_utf8(p, d, bytelen, newlen);
921*0Sstevel@tonic-gate }
922*0Sstevel@tonic-gate
923*0Sstevel@tonic-gate /* for now these are all defined (inefficiently) in terms of the utf8 versions */
924*0Sstevel@tonic-gate
925*0Sstevel@tonic-gate bool
Perl_is_uni_alnum(pTHX_ UV c)926*0Sstevel@tonic-gate Perl_is_uni_alnum(pTHX_ UV c)
927*0Sstevel@tonic-gate {
928*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
929*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
930*0Sstevel@tonic-gate return is_utf8_alnum(tmpbuf);
931*0Sstevel@tonic-gate }
932*0Sstevel@tonic-gate
933*0Sstevel@tonic-gate bool
Perl_is_uni_alnumc(pTHX_ UV c)934*0Sstevel@tonic-gate Perl_is_uni_alnumc(pTHX_ UV c)
935*0Sstevel@tonic-gate {
936*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
937*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
938*0Sstevel@tonic-gate return is_utf8_alnumc(tmpbuf);
939*0Sstevel@tonic-gate }
940*0Sstevel@tonic-gate
941*0Sstevel@tonic-gate bool
Perl_is_uni_idfirst(pTHX_ UV c)942*0Sstevel@tonic-gate Perl_is_uni_idfirst(pTHX_ UV c)
943*0Sstevel@tonic-gate {
944*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
945*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
946*0Sstevel@tonic-gate return is_utf8_idfirst(tmpbuf);
947*0Sstevel@tonic-gate }
948*0Sstevel@tonic-gate
949*0Sstevel@tonic-gate bool
Perl_is_uni_alpha(pTHX_ UV c)950*0Sstevel@tonic-gate Perl_is_uni_alpha(pTHX_ UV c)
951*0Sstevel@tonic-gate {
952*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
953*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
954*0Sstevel@tonic-gate return is_utf8_alpha(tmpbuf);
955*0Sstevel@tonic-gate }
956*0Sstevel@tonic-gate
957*0Sstevel@tonic-gate bool
Perl_is_uni_ascii(pTHX_ UV c)958*0Sstevel@tonic-gate Perl_is_uni_ascii(pTHX_ UV c)
959*0Sstevel@tonic-gate {
960*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
961*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
962*0Sstevel@tonic-gate return is_utf8_ascii(tmpbuf);
963*0Sstevel@tonic-gate }
964*0Sstevel@tonic-gate
965*0Sstevel@tonic-gate bool
Perl_is_uni_space(pTHX_ UV c)966*0Sstevel@tonic-gate Perl_is_uni_space(pTHX_ UV c)
967*0Sstevel@tonic-gate {
968*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
969*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
970*0Sstevel@tonic-gate return is_utf8_space(tmpbuf);
971*0Sstevel@tonic-gate }
972*0Sstevel@tonic-gate
973*0Sstevel@tonic-gate bool
Perl_is_uni_digit(pTHX_ UV c)974*0Sstevel@tonic-gate Perl_is_uni_digit(pTHX_ UV c)
975*0Sstevel@tonic-gate {
976*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
977*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
978*0Sstevel@tonic-gate return is_utf8_digit(tmpbuf);
979*0Sstevel@tonic-gate }
980*0Sstevel@tonic-gate
981*0Sstevel@tonic-gate bool
Perl_is_uni_upper(pTHX_ UV c)982*0Sstevel@tonic-gate Perl_is_uni_upper(pTHX_ UV c)
983*0Sstevel@tonic-gate {
984*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
985*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
986*0Sstevel@tonic-gate return is_utf8_upper(tmpbuf);
987*0Sstevel@tonic-gate }
988*0Sstevel@tonic-gate
989*0Sstevel@tonic-gate bool
Perl_is_uni_lower(pTHX_ UV c)990*0Sstevel@tonic-gate Perl_is_uni_lower(pTHX_ UV c)
991*0Sstevel@tonic-gate {
992*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
993*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
994*0Sstevel@tonic-gate return is_utf8_lower(tmpbuf);
995*0Sstevel@tonic-gate }
996*0Sstevel@tonic-gate
997*0Sstevel@tonic-gate bool
Perl_is_uni_cntrl(pTHX_ UV c)998*0Sstevel@tonic-gate Perl_is_uni_cntrl(pTHX_ UV c)
999*0Sstevel@tonic-gate {
1000*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
1001*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
1002*0Sstevel@tonic-gate return is_utf8_cntrl(tmpbuf);
1003*0Sstevel@tonic-gate }
1004*0Sstevel@tonic-gate
1005*0Sstevel@tonic-gate bool
Perl_is_uni_graph(pTHX_ UV c)1006*0Sstevel@tonic-gate Perl_is_uni_graph(pTHX_ UV c)
1007*0Sstevel@tonic-gate {
1008*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
1009*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
1010*0Sstevel@tonic-gate return is_utf8_graph(tmpbuf);
1011*0Sstevel@tonic-gate }
1012*0Sstevel@tonic-gate
1013*0Sstevel@tonic-gate bool
Perl_is_uni_print(pTHX_ UV c)1014*0Sstevel@tonic-gate Perl_is_uni_print(pTHX_ UV c)
1015*0Sstevel@tonic-gate {
1016*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
1017*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
1018*0Sstevel@tonic-gate return is_utf8_print(tmpbuf);
1019*0Sstevel@tonic-gate }
1020*0Sstevel@tonic-gate
1021*0Sstevel@tonic-gate bool
Perl_is_uni_punct(pTHX_ UV c)1022*0Sstevel@tonic-gate Perl_is_uni_punct(pTHX_ UV c)
1023*0Sstevel@tonic-gate {
1024*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN+1];
1025*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
1026*0Sstevel@tonic-gate return is_utf8_punct(tmpbuf);
1027*0Sstevel@tonic-gate }
1028*0Sstevel@tonic-gate
1029*0Sstevel@tonic-gate bool
Perl_is_uni_xdigit(pTHX_ UV c)1030*0Sstevel@tonic-gate Perl_is_uni_xdigit(pTHX_ UV c)
1031*0Sstevel@tonic-gate {
1032*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1033*0Sstevel@tonic-gate uvchr_to_utf8(tmpbuf, c);
1034*0Sstevel@tonic-gate return is_utf8_xdigit(tmpbuf);
1035*0Sstevel@tonic-gate }
1036*0Sstevel@tonic-gate
1037*0Sstevel@tonic-gate UV
Perl_to_uni_upper(pTHX_ UV c,U8 * p,STRLEN * lenp)1038*0Sstevel@tonic-gate Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1039*0Sstevel@tonic-gate {
1040*0Sstevel@tonic-gate uvchr_to_utf8(p, c);
1041*0Sstevel@tonic-gate return to_utf8_upper(p, p, lenp);
1042*0Sstevel@tonic-gate }
1043*0Sstevel@tonic-gate
1044*0Sstevel@tonic-gate UV
Perl_to_uni_title(pTHX_ UV c,U8 * p,STRLEN * lenp)1045*0Sstevel@tonic-gate Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1046*0Sstevel@tonic-gate {
1047*0Sstevel@tonic-gate uvchr_to_utf8(p, c);
1048*0Sstevel@tonic-gate return to_utf8_title(p, p, lenp);
1049*0Sstevel@tonic-gate }
1050*0Sstevel@tonic-gate
1051*0Sstevel@tonic-gate UV
Perl_to_uni_lower(pTHX_ UV c,U8 * p,STRLEN * lenp)1052*0Sstevel@tonic-gate Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1053*0Sstevel@tonic-gate {
1054*0Sstevel@tonic-gate uvchr_to_utf8(p, c);
1055*0Sstevel@tonic-gate return to_utf8_lower(p, p, lenp);
1056*0Sstevel@tonic-gate }
1057*0Sstevel@tonic-gate
1058*0Sstevel@tonic-gate UV
Perl_to_uni_fold(pTHX_ UV c,U8 * p,STRLEN * lenp)1059*0Sstevel@tonic-gate Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1060*0Sstevel@tonic-gate {
1061*0Sstevel@tonic-gate uvchr_to_utf8(p, c);
1062*0Sstevel@tonic-gate return to_utf8_fold(p, p, lenp);
1063*0Sstevel@tonic-gate }
1064*0Sstevel@tonic-gate
1065*0Sstevel@tonic-gate /* for now these all assume no locale info available for Unicode > 255 */
1066*0Sstevel@tonic-gate
1067*0Sstevel@tonic-gate bool
Perl_is_uni_alnum_lc(pTHX_ UV c)1068*0Sstevel@tonic-gate Perl_is_uni_alnum_lc(pTHX_ UV c)
1069*0Sstevel@tonic-gate {
1070*0Sstevel@tonic-gate return is_uni_alnum(c); /* XXX no locale support yet */
1071*0Sstevel@tonic-gate }
1072*0Sstevel@tonic-gate
1073*0Sstevel@tonic-gate bool
Perl_is_uni_alnumc_lc(pTHX_ UV c)1074*0Sstevel@tonic-gate Perl_is_uni_alnumc_lc(pTHX_ UV c)
1075*0Sstevel@tonic-gate {
1076*0Sstevel@tonic-gate return is_uni_alnumc(c); /* XXX no locale support yet */
1077*0Sstevel@tonic-gate }
1078*0Sstevel@tonic-gate
1079*0Sstevel@tonic-gate bool
Perl_is_uni_idfirst_lc(pTHX_ UV c)1080*0Sstevel@tonic-gate Perl_is_uni_idfirst_lc(pTHX_ UV c)
1081*0Sstevel@tonic-gate {
1082*0Sstevel@tonic-gate return is_uni_idfirst(c); /* XXX no locale support yet */
1083*0Sstevel@tonic-gate }
1084*0Sstevel@tonic-gate
1085*0Sstevel@tonic-gate bool
Perl_is_uni_alpha_lc(pTHX_ UV c)1086*0Sstevel@tonic-gate Perl_is_uni_alpha_lc(pTHX_ UV c)
1087*0Sstevel@tonic-gate {
1088*0Sstevel@tonic-gate return is_uni_alpha(c); /* XXX no locale support yet */
1089*0Sstevel@tonic-gate }
1090*0Sstevel@tonic-gate
1091*0Sstevel@tonic-gate bool
Perl_is_uni_ascii_lc(pTHX_ UV c)1092*0Sstevel@tonic-gate Perl_is_uni_ascii_lc(pTHX_ UV c)
1093*0Sstevel@tonic-gate {
1094*0Sstevel@tonic-gate return is_uni_ascii(c); /* XXX no locale support yet */
1095*0Sstevel@tonic-gate }
1096*0Sstevel@tonic-gate
1097*0Sstevel@tonic-gate bool
Perl_is_uni_space_lc(pTHX_ UV c)1098*0Sstevel@tonic-gate Perl_is_uni_space_lc(pTHX_ UV c)
1099*0Sstevel@tonic-gate {
1100*0Sstevel@tonic-gate return is_uni_space(c); /* XXX no locale support yet */
1101*0Sstevel@tonic-gate }
1102*0Sstevel@tonic-gate
1103*0Sstevel@tonic-gate bool
Perl_is_uni_digit_lc(pTHX_ UV c)1104*0Sstevel@tonic-gate Perl_is_uni_digit_lc(pTHX_ UV c)
1105*0Sstevel@tonic-gate {
1106*0Sstevel@tonic-gate return is_uni_digit(c); /* XXX no locale support yet */
1107*0Sstevel@tonic-gate }
1108*0Sstevel@tonic-gate
1109*0Sstevel@tonic-gate bool
Perl_is_uni_upper_lc(pTHX_ UV c)1110*0Sstevel@tonic-gate Perl_is_uni_upper_lc(pTHX_ UV c)
1111*0Sstevel@tonic-gate {
1112*0Sstevel@tonic-gate return is_uni_upper(c); /* XXX no locale support yet */
1113*0Sstevel@tonic-gate }
1114*0Sstevel@tonic-gate
1115*0Sstevel@tonic-gate bool
Perl_is_uni_lower_lc(pTHX_ UV c)1116*0Sstevel@tonic-gate Perl_is_uni_lower_lc(pTHX_ UV c)
1117*0Sstevel@tonic-gate {
1118*0Sstevel@tonic-gate return is_uni_lower(c); /* XXX no locale support yet */
1119*0Sstevel@tonic-gate }
1120*0Sstevel@tonic-gate
1121*0Sstevel@tonic-gate bool
Perl_is_uni_cntrl_lc(pTHX_ UV c)1122*0Sstevel@tonic-gate Perl_is_uni_cntrl_lc(pTHX_ UV c)
1123*0Sstevel@tonic-gate {
1124*0Sstevel@tonic-gate return is_uni_cntrl(c); /* XXX no locale support yet */
1125*0Sstevel@tonic-gate }
1126*0Sstevel@tonic-gate
1127*0Sstevel@tonic-gate bool
Perl_is_uni_graph_lc(pTHX_ UV c)1128*0Sstevel@tonic-gate Perl_is_uni_graph_lc(pTHX_ UV c)
1129*0Sstevel@tonic-gate {
1130*0Sstevel@tonic-gate return is_uni_graph(c); /* XXX no locale support yet */
1131*0Sstevel@tonic-gate }
1132*0Sstevel@tonic-gate
1133*0Sstevel@tonic-gate bool
Perl_is_uni_print_lc(pTHX_ UV c)1134*0Sstevel@tonic-gate Perl_is_uni_print_lc(pTHX_ UV c)
1135*0Sstevel@tonic-gate {
1136*0Sstevel@tonic-gate return is_uni_print(c); /* XXX no locale support yet */
1137*0Sstevel@tonic-gate }
1138*0Sstevel@tonic-gate
1139*0Sstevel@tonic-gate bool
Perl_is_uni_punct_lc(pTHX_ UV c)1140*0Sstevel@tonic-gate Perl_is_uni_punct_lc(pTHX_ UV c)
1141*0Sstevel@tonic-gate {
1142*0Sstevel@tonic-gate return is_uni_punct(c); /* XXX no locale support yet */
1143*0Sstevel@tonic-gate }
1144*0Sstevel@tonic-gate
1145*0Sstevel@tonic-gate bool
Perl_is_uni_xdigit_lc(pTHX_ UV c)1146*0Sstevel@tonic-gate Perl_is_uni_xdigit_lc(pTHX_ UV c)
1147*0Sstevel@tonic-gate {
1148*0Sstevel@tonic-gate return is_uni_xdigit(c); /* XXX no locale support yet */
1149*0Sstevel@tonic-gate }
1150*0Sstevel@tonic-gate
1151*0Sstevel@tonic-gate U32
Perl_to_uni_upper_lc(pTHX_ U32 c)1152*0Sstevel@tonic-gate Perl_to_uni_upper_lc(pTHX_ U32 c)
1153*0Sstevel@tonic-gate {
1154*0Sstevel@tonic-gate /* XXX returns only the first character -- do not use XXX */
1155*0Sstevel@tonic-gate /* XXX no locale support yet */
1156*0Sstevel@tonic-gate STRLEN len;
1157*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1158*0Sstevel@tonic-gate return (U32)to_uni_upper(c, tmpbuf, &len);
1159*0Sstevel@tonic-gate }
1160*0Sstevel@tonic-gate
1161*0Sstevel@tonic-gate U32
Perl_to_uni_title_lc(pTHX_ U32 c)1162*0Sstevel@tonic-gate Perl_to_uni_title_lc(pTHX_ U32 c)
1163*0Sstevel@tonic-gate {
1164*0Sstevel@tonic-gate /* XXX returns only the first character XXX -- do not use XXX */
1165*0Sstevel@tonic-gate /* XXX no locale support yet */
1166*0Sstevel@tonic-gate STRLEN len;
1167*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1168*0Sstevel@tonic-gate return (U32)to_uni_title(c, tmpbuf, &len);
1169*0Sstevel@tonic-gate }
1170*0Sstevel@tonic-gate
1171*0Sstevel@tonic-gate U32
Perl_to_uni_lower_lc(pTHX_ U32 c)1172*0Sstevel@tonic-gate Perl_to_uni_lower_lc(pTHX_ U32 c)
1173*0Sstevel@tonic-gate {
1174*0Sstevel@tonic-gate /* XXX returns only the first character -- do not use XXX */
1175*0Sstevel@tonic-gate /* XXX no locale support yet */
1176*0Sstevel@tonic-gate STRLEN len;
1177*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1178*0Sstevel@tonic-gate return (U32)to_uni_lower(c, tmpbuf, &len);
1179*0Sstevel@tonic-gate }
1180*0Sstevel@tonic-gate
1181*0Sstevel@tonic-gate bool
Perl_is_utf8_alnum(pTHX_ U8 * p)1182*0Sstevel@tonic-gate Perl_is_utf8_alnum(pTHX_ U8 *p)
1183*0Sstevel@tonic-gate {
1184*0Sstevel@tonic-gate if (!is_utf8_char(p))
1185*0Sstevel@tonic-gate return FALSE;
1186*0Sstevel@tonic-gate if (!PL_utf8_alnum)
1187*0Sstevel@tonic-gate /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1188*0Sstevel@tonic-gate * descendant of isalnum(3), in other words, it doesn't
1189*0Sstevel@tonic-gate * contain the '_'. --jhi */
1190*0Sstevel@tonic-gate PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1191*0Sstevel@tonic-gate return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1192*0Sstevel@tonic-gate /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1193*0Sstevel@tonic-gate #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1194*0Sstevel@tonic-gate if (!PL_utf8_alnum)
1195*0Sstevel@tonic-gate PL_utf8_alnum = swash_init("utf8", "",
1196*0Sstevel@tonic-gate sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1197*0Sstevel@tonic-gate return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1198*0Sstevel@tonic-gate #endif
1199*0Sstevel@tonic-gate }
1200*0Sstevel@tonic-gate
1201*0Sstevel@tonic-gate bool
Perl_is_utf8_alnumc(pTHX_ U8 * p)1202*0Sstevel@tonic-gate Perl_is_utf8_alnumc(pTHX_ U8 *p)
1203*0Sstevel@tonic-gate {
1204*0Sstevel@tonic-gate if (!is_utf8_char(p))
1205*0Sstevel@tonic-gate return FALSE;
1206*0Sstevel@tonic-gate if (!PL_utf8_alnum)
1207*0Sstevel@tonic-gate PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1208*0Sstevel@tonic-gate return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1209*0Sstevel@tonic-gate /* return is_utf8_alpha(p) || is_utf8_digit(p); */
1210*0Sstevel@tonic-gate #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1211*0Sstevel@tonic-gate if (!PL_utf8_alnum)
1212*0Sstevel@tonic-gate PL_utf8_alnum = swash_init("utf8", "",
1213*0Sstevel@tonic-gate sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1214*0Sstevel@tonic-gate return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1215*0Sstevel@tonic-gate #endif
1216*0Sstevel@tonic-gate }
1217*0Sstevel@tonic-gate
1218*0Sstevel@tonic-gate bool
Perl_is_utf8_idfirst(pTHX_ U8 * p)1219*0Sstevel@tonic-gate Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1220*0Sstevel@tonic-gate {
1221*0Sstevel@tonic-gate if (*p == '_')
1222*0Sstevel@tonic-gate return TRUE;
1223*0Sstevel@tonic-gate if (!is_utf8_char(p))
1224*0Sstevel@tonic-gate return FALSE;
1225*0Sstevel@tonic-gate if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1226*0Sstevel@tonic-gate PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1227*0Sstevel@tonic-gate return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1228*0Sstevel@tonic-gate }
1229*0Sstevel@tonic-gate
1230*0Sstevel@tonic-gate bool
Perl_is_utf8_idcont(pTHX_ U8 * p)1231*0Sstevel@tonic-gate Perl_is_utf8_idcont(pTHX_ U8 *p)
1232*0Sstevel@tonic-gate {
1233*0Sstevel@tonic-gate if (*p == '_')
1234*0Sstevel@tonic-gate return TRUE;
1235*0Sstevel@tonic-gate if (!is_utf8_char(p))
1236*0Sstevel@tonic-gate return FALSE;
1237*0Sstevel@tonic-gate if (!PL_utf8_idcont)
1238*0Sstevel@tonic-gate PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1239*0Sstevel@tonic-gate return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1240*0Sstevel@tonic-gate }
1241*0Sstevel@tonic-gate
1242*0Sstevel@tonic-gate bool
Perl_is_utf8_alpha(pTHX_ U8 * p)1243*0Sstevel@tonic-gate Perl_is_utf8_alpha(pTHX_ U8 *p)
1244*0Sstevel@tonic-gate {
1245*0Sstevel@tonic-gate if (!is_utf8_char(p))
1246*0Sstevel@tonic-gate return FALSE;
1247*0Sstevel@tonic-gate if (!PL_utf8_alpha)
1248*0Sstevel@tonic-gate PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1249*0Sstevel@tonic-gate return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1250*0Sstevel@tonic-gate }
1251*0Sstevel@tonic-gate
1252*0Sstevel@tonic-gate bool
Perl_is_utf8_ascii(pTHX_ U8 * p)1253*0Sstevel@tonic-gate Perl_is_utf8_ascii(pTHX_ U8 *p)
1254*0Sstevel@tonic-gate {
1255*0Sstevel@tonic-gate if (!is_utf8_char(p))
1256*0Sstevel@tonic-gate return FALSE;
1257*0Sstevel@tonic-gate if (!PL_utf8_ascii)
1258*0Sstevel@tonic-gate PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1259*0Sstevel@tonic-gate return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1260*0Sstevel@tonic-gate }
1261*0Sstevel@tonic-gate
1262*0Sstevel@tonic-gate bool
Perl_is_utf8_space(pTHX_ U8 * p)1263*0Sstevel@tonic-gate Perl_is_utf8_space(pTHX_ U8 *p)
1264*0Sstevel@tonic-gate {
1265*0Sstevel@tonic-gate if (!is_utf8_char(p))
1266*0Sstevel@tonic-gate return FALSE;
1267*0Sstevel@tonic-gate if (!PL_utf8_space)
1268*0Sstevel@tonic-gate PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1269*0Sstevel@tonic-gate return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1270*0Sstevel@tonic-gate }
1271*0Sstevel@tonic-gate
1272*0Sstevel@tonic-gate bool
Perl_is_utf8_digit(pTHX_ U8 * p)1273*0Sstevel@tonic-gate Perl_is_utf8_digit(pTHX_ U8 *p)
1274*0Sstevel@tonic-gate {
1275*0Sstevel@tonic-gate if (!is_utf8_char(p))
1276*0Sstevel@tonic-gate return FALSE;
1277*0Sstevel@tonic-gate if (!PL_utf8_digit)
1278*0Sstevel@tonic-gate PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1279*0Sstevel@tonic-gate return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1280*0Sstevel@tonic-gate }
1281*0Sstevel@tonic-gate
1282*0Sstevel@tonic-gate bool
Perl_is_utf8_upper(pTHX_ U8 * p)1283*0Sstevel@tonic-gate Perl_is_utf8_upper(pTHX_ U8 *p)
1284*0Sstevel@tonic-gate {
1285*0Sstevel@tonic-gate if (!is_utf8_char(p))
1286*0Sstevel@tonic-gate return FALSE;
1287*0Sstevel@tonic-gate if (!PL_utf8_upper)
1288*0Sstevel@tonic-gate PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1289*0Sstevel@tonic-gate return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1290*0Sstevel@tonic-gate }
1291*0Sstevel@tonic-gate
1292*0Sstevel@tonic-gate bool
Perl_is_utf8_lower(pTHX_ U8 * p)1293*0Sstevel@tonic-gate Perl_is_utf8_lower(pTHX_ U8 *p)
1294*0Sstevel@tonic-gate {
1295*0Sstevel@tonic-gate if (!is_utf8_char(p))
1296*0Sstevel@tonic-gate return FALSE;
1297*0Sstevel@tonic-gate if (!PL_utf8_lower)
1298*0Sstevel@tonic-gate PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1299*0Sstevel@tonic-gate return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1300*0Sstevel@tonic-gate }
1301*0Sstevel@tonic-gate
1302*0Sstevel@tonic-gate bool
Perl_is_utf8_cntrl(pTHX_ U8 * p)1303*0Sstevel@tonic-gate Perl_is_utf8_cntrl(pTHX_ U8 *p)
1304*0Sstevel@tonic-gate {
1305*0Sstevel@tonic-gate if (!is_utf8_char(p))
1306*0Sstevel@tonic-gate return FALSE;
1307*0Sstevel@tonic-gate if (!PL_utf8_cntrl)
1308*0Sstevel@tonic-gate PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1309*0Sstevel@tonic-gate return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1310*0Sstevel@tonic-gate }
1311*0Sstevel@tonic-gate
1312*0Sstevel@tonic-gate bool
Perl_is_utf8_graph(pTHX_ U8 * p)1313*0Sstevel@tonic-gate Perl_is_utf8_graph(pTHX_ U8 *p)
1314*0Sstevel@tonic-gate {
1315*0Sstevel@tonic-gate if (!is_utf8_char(p))
1316*0Sstevel@tonic-gate return FALSE;
1317*0Sstevel@tonic-gate if (!PL_utf8_graph)
1318*0Sstevel@tonic-gate PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1319*0Sstevel@tonic-gate return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1320*0Sstevel@tonic-gate }
1321*0Sstevel@tonic-gate
1322*0Sstevel@tonic-gate bool
Perl_is_utf8_print(pTHX_ U8 * p)1323*0Sstevel@tonic-gate Perl_is_utf8_print(pTHX_ U8 *p)
1324*0Sstevel@tonic-gate {
1325*0Sstevel@tonic-gate if (!is_utf8_char(p))
1326*0Sstevel@tonic-gate return FALSE;
1327*0Sstevel@tonic-gate if (!PL_utf8_print)
1328*0Sstevel@tonic-gate PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1329*0Sstevel@tonic-gate return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1330*0Sstevel@tonic-gate }
1331*0Sstevel@tonic-gate
1332*0Sstevel@tonic-gate bool
Perl_is_utf8_punct(pTHX_ U8 * p)1333*0Sstevel@tonic-gate Perl_is_utf8_punct(pTHX_ U8 *p)
1334*0Sstevel@tonic-gate {
1335*0Sstevel@tonic-gate if (!is_utf8_char(p))
1336*0Sstevel@tonic-gate return FALSE;
1337*0Sstevel@tonic-gate if (!PL_utf8_punct)
1338*0Sstevel@tonic-gate PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1339*0Sstevel@tonic-gate return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1340*0Sstevel@tonic-gate }
1341*0Sstevel@tonic-gate
1342*0Sstevel@tonic-gate bool
Perl_is_utf8_xdigit(pTHX_ U8 * p)1343*0Sstevel@tonic-gate Perl_is_utf8_xdigit(pTHX_ U8 *p)
1344*0Sstevel@tonic-gate {
1345*0Sstevel@tonic-gate if (!is_utf8_char(p))
1346*0Sstevel@tonic-gate return FALSE;
1347*0Sstevel@tonic-gate if (!PL_utf8_xdigit)
1348*0Sstevel@tonic-gate PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1349*0Sstevel@tonic-gate return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1350*0Sstevel@tonic-gate }
1351*0Sstevel@tonic-gate
1352*0Sstevel@tonic-gate bool
Perl_is_utf8_mark(pTHX_ U8 * p)1353*0Sstevel@tonic-gate Perl_is_utf8_mark(pTHX_ U8 *p)
1354*0Sstevel@tonic-gate {
1355*0Sstevel@tonic-gate if (!is_utf8_char(p))
1356*0Sstevel@tonic-gate return FALSE;
1357*0Sstevel@tonic-gate if (!PL_utf8_mark)
1358*0Sstevel@tonic-gate PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1359*0Sstevel@tonic-gate return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1360*0Sstevel@tonic-gate }
1361*0Sstevel@tonic-gate
1362*0Sstevel@tonic-gate /*
1363*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1364*0Sstevel@tonic-gate
1365*0Sstevel@tonic-gate The "p" contains the pointer to the UTF-8 string encoding
1366*0Sstevel@tonic-gate the character that is being converted.
1367*0Sstevel@tonic-gate
1368*0Sstevel@tonic-gate The "ustrp" is a pointer to the character buffer to put the
1369*0Sstevel@tonic-gate conversion result to. The "lenp" is a pointer to the length
1370*0Sstevel@tonic-gate of the result.
1371*0Sstevel@tonic-gate
1372*0Sstevel@tonic-gate The "swashp" is a pointer to the swash to use.
1373*0Sstevel@tonic-gate
1374*0Sstevel@tonic-gate Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1375*0Sstevel@tonic-gate and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1376*0Sstevel@tonic-gate but not always, a multicharacter mapping), is tried first.
1377*0Sstevel@tonic-gate
1378*0Sstevel@tonic-gate The "special" is a string like "utf8::ToSpecLower", which means the
1379*0Sstevel@tonic-gate hash %utf8::ToSpecLower. The access to the hash is through
1380*0Sstevel@tonic-gate Perl_to_utf8_case().
1381*0Sstevel@tonic-gate
1382*0Sstevel@tonic-gate The "normal" is a string like "ToLower" which means the swash
1383*0Sstevel@tonic-gate %utf8::ToLower.
1384*0Sstevel@tonic-gate
1385*0Sstevel@tonic-gate =cut */
1386*0Sstevel@tonic-gate
1387*0Sstevel@tonic-gate UV
Perl_to_utf8_case(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp,SV ** swashp,char * normal,char * special)1388*0Sstevel@tonic-gate Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1389*0Sstevel@tonic-gate {
1390*0Sstevel@tonic-gate UV uv0, uv1;
1391*0Sstevel@tonic-gate U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1392*0Sstevel@tonic-gate STRLEN len = 0;
1393*0Sstevel@tonic-gate
1394*0Sstevel@tonic-gate uv0 = utf8_to_uvchr(p, 0);
1395*0Sstevel@tonic-gate /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1396*0Sstevel@tonic-gate * are necessary in EBCDIC, they are redundant no-ops
1397*0Sstevel@tonic-gate * in ASCII-ish platforms, and hopefully optimized away. */
1398*0Sstevel@tonic-gate uv1 = NATIVE_TO_UNI(uv0);
1399*0Sstevel@tonic-gate uvuni_to_utf8(tmpbuf, uv1);
1400*0Sstevel@tonic-gate
1401*0Sstevel@tonic-gate if (!*swashp) /* load on-demand */
1402*0Sstevel@tonic-gate *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1403*0Sstevel@tonic-gate
1404*0Sstevel@tonic-gate /* The 0xDF is the only special casing Unicode code point below 0x100. */
1405*0Sstevel@tonic-gate if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1406*0Sstevel@tonic-gate /* It might be "special" (sometimes, but not always,
1407*0Sstevel@tonic-gate * a multicharacter mapping) */
1408*0Sstevel@tonic-gate HV *hv;
1409*0Sstevel@tonic-gate SV **svp;
1410*0Sstevel@tonic-gate
1411*0Sstevel@tonic-gate if ((hv = get_hv(special, FALSE)) &&
1412*0Sstevel@tonic-gate (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1413*0Sstevel@tonic-gate (*svp)) {
1414*0Sstevel@tonic-gate char *s;
1415*0Sstevel@tonic-gate
1416*0Sstevel@tonic-gate s = SvPV(*svp, len);
1417*0Sstevel@tonic-gate if (len == 1)
1418*0Sstevel@tonic-gate len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1419*0Sstevel@tonic-gate else {
1420*0Sstevel@tonic-gate #ifdef EBCDIC
1421*0Sstevel@tonic-gate /* If we have EBCDIC we need to remap the characters
1422*0Sstevel@tonic-gate * since any characters in the low 256 are Unicode
1423*0Sstevel@tonic-gate * code points, not EBCDIC. */
1424*0Sstevel@tonic-gate U8 *t = (U8*)s, *tend = t + len, *d;
1425*0Sstevel@tonic-gate
1426*0Sstevel@tonic-gate d = tmpbuf;
1427*0Sstevel@tonic-gate if (SvUTF8(*svp)) {
1428*0Sstevel@tonic-gate STRLEN tlen = 0;
1429*0Sstevel@tonic-gate
1430*0Sstevel@tonic-gate while (t < tend) {
1431*0Sstevel@tonic-gate UV c = utf8_to_uvchr(t, &tlen);
1432*0Sstevel@tonic-gate if (tlen > 0) {
1433*0Sstevel@tonic-gate d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1434*0Sstevel@tonic-gate t += tlen;
1435*0Sstevel@tonic-gate }
1436*0Sstevel@tonic-gate else
1437*0Sstevel@tonic-gate break;
1438*0Sstevel@tonic-gate }
1439*0Sstevel@tonic-gate }
1440*0Sstevel@tonic-gate else {
1441*0Sstevel@tonic-gate while (t < tend) {
1442*0Sstevel@tonic-gate d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1443*0Sstevel@tonic-gate t++;
1444*0Sstevel@tonic-gate }
1445*0Sstevel@tonic-gate }
1446*0Sstevel@tonic-gate len = d - tmpbuf;
1447*0Sstevel@tonic-gate Copy(tmpbuf, ustrp, len, U8);
1448*0Sstevel@tonic-gate #else
1449*0Sstevel@tonic-gate Copy(s, ustrp, len, U8);
1450*0Sstevel@tonic-gate #endif
1451*0Sstevel@tonic-gate }
1452*0Sstevel@tonic-gate }
1453*0Sstevel@tonic-gate }
1454*0Sstevel@tonic-gate
1455*0Sstevel@tonic-gate if (!len && *swashp) {
1456*0Sstevel@tonic-gate UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1457*0Sstevel@tonic-gate
1458*0Sstevel@tonic-gate if (uv2) {
1459*0Sstevel@tonic-gate /* It was "normal" (a single character mapping). */
1460*0Sstevel@tonic-gate UV uv3 = UNI_TO_NATIVE(uv2);
1461*0Sstevel@tonic-gate
1462*0Sstevel@tonic-gate len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1463*0Sstevel@tonic-gate }
1464*0Sstevel@tonic-gate }
1465*0Sstevel@tonic-gate
1466*0Sstevel@tonic-gate if (!len) /* Neither: just copy. */
1467*0Sstevel@tonic-gate len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1468*0Sstevel@tonic-gate
1469*0Sstevel@tonic-gate if (lenp)
1470*0Sstevel@tonic-gate *lenp = len;
1471*0Sstevel@tonic-gate
1472*0Sstevel@tonic-gate return len ? utf8_to_uvchr(ustrp, 0) : 0;
1473*0Sstevel@tonic-gate }
1474*0Sstevel@tonic-gate
1475*0Sstevel@tonic-gate /*
1476*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1477*0Sstevel@tonic-gate
1478*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its uppercase version and
1479*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1480*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1481*0Sstevel@tonic-gate uppercase version may be longer than the original character (up to two
1482*0Sstevel@tonic-gate characters).
1483*0Sstevel@tonic-gate
1484*0Sstevel@tonic-gate The first character of the uppercased version is returned
1485*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1486*0Sstevel@tonic-gate
1487*0Sstevel@tonic-gate =cut */
1488*0Sstevel@tonic-gate
1489*0Sstevel@tonic-gate UV
Perl_to_utf8_upper(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1490*0Sstevel@tonic-gate Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1491*0Sstevel@tonic-gate {
1492*0Sstevel@tonic-gate return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1493*0Sstevel@tonic-gate &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1494*0Sstevel@tonic-gate }
1495*0Sstevel@tonic-gate
1496*0Sstevel@tonic-gate /*
1497*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1498*0Sstevel@tonic-gate
1499*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its titlecase version and
1500*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1501*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1502*0Sstevel@tonic-gate titlecase version may be longer than the original character (up to two
1503*0Sstevel@tonic-gate characters).
1504*0Sstevel@tonic-gate
1505*0Sstevel@tonic-gate The first character of the titlecased version is returned
1506*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1507*0Sstevel@tonic-gate
1508*0Sstevel@tonic-gate =cut */
1509*0Sstevel@tonic-gate
1510*0Sstevel@tonic-gate UV
Perl_to_utf8_title(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1511*0Sstevel@tonic-gate Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1512*0Sstevel@tonic-gate {
1513*0Sstevel@tonic-gate return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1514*0Sstevel@tonic-gate &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1515*0Sstevel@tonic-gate }
1516*0Sstevel@tonic-gate
1517*0Sstevel@tonic-gate /*
1518*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1519*0Sstevel@tonic-gate
1520*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its lowercase version and
1521*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1522*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1523*0Sstevel@tonic-gate lowercase version may be longer than the original character (up to two
1524*0Sstevel@tonic-gate characters).
1525*0Sstevel@tonic-gate
1526*0Sstevel@tonic-gate The first character of the lowercased version is returned
1527*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1528*0Sstevel@tonic-gate
1529*0Sstevel@tonic-gate =cut */
1530*0Sstevel@tonic-gate
1531*0Sstevel@tonic-gate UV
Perl_to_utf8_lower(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1532*0Sstevel@tonic-gate Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1533*0Sstevel@tonic-gate {
1534*0Sstevel@tonic-gate return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1535*0Sstevel@tonic-gate &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1536*0Sstevel@tonic-gate }
1537*0Sstevel@tonic-gate
1538*0Sstevel@tonic-gate /*
1539*0Sstevel@tonic-gate =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1540*0Sstevel@tonic-gate
1541*0Sstevel@tonic-gate Convert the UTF-8 encoded character at p to its foldcase version and
1542*0Sstevel@tonic-gate store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1543*0Sstevel@tonic-gate that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1544*0Sstevel@tonic-gate foldcase version may be longer than the original character (up to
1545*0Sstevel@tonic-gate three characters).
1546*0Sstevel@tonic-gate
1547*0Sstevel@tonic-gate The first character of the foldcased version is returned
1548*0Sstevel@tonic-gate (but note, as explained above, that there may be more.)
1549*0Sstevel@tonic-gate
1550*0Sstevel@tonic-gate =cut */
1551*0Sstevel@tonic-gate
1552*0Sstevel@tonic-gate UV
Perl_to_utf8_fold(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1553*0Sstevel@tonic-gate Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1554*0Sstevel@tonic-gate {
1555*0Sstevel@tonic-gate return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1556*0Sstevel@tonic-gate &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1557*0Sstevel@tonic-gate }
1558*0Sstevel@tonic-gate
1559*0Sstevel@tonic-gate /* a "swash" is a swatch hash */
1560*0Sstevel@tonic-gate
1561*0Sstevel@tonic-gate SV*
Perl_swash_init(pTHX_ char * pkg,char * name,SV * listsv,I32 minbits,I32 none)1562*0Sstevel@tonic-gate Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1563*0Sstevel@tonic-gate {
1564*0Sstevel@tonic-gate SV* retval;
1565*0Sstevel@tonic-gate SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1566*0Sstevel@tonic-gate dSP;
1567*0Sstevel@tonic-gate HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1568*0Sstevel@tonic-gate SV* errsv_save;
1569*0Sstevel@tonic-gate
1570*0Sstevel@tonic-gate if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1571*0Sstevel@tonic-gate ENTER;
1572*0Sstevel@tonic-gate errsv_save = newSVsv(ERRSV);
1573*0Sstevel@tonic-gate Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1574*0Sstevel@tonic-gate if (!SvTRUE(ERRSV))
1575*0Sstevel@tonic-gate sv_setsv(ERRSV, errsv_save);
1576*0Sstevel@tonic-gate SvREFCNT_dec(errsv_save);
1577*0Sstevel@tonic-gate LEAVE;
1578*0Sstevel@tonic-gate }
1579*0Sstevel@tonic-gate SPAGAIN;
1580*0Sstevel@tonic-gate PUSHSTACKi(PERLSI_MAGIC);
1581*0Sstevel@tonic-gate PUSHMARK(SP);
1582*0Sstevel@tonic-gate EXTEND(SP,5);
1583*0Sstevel@tonic-gate PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1584*0Sstevel@tonic-gate PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1585*0Sstevel@tonic-gate PUSHs(listsv);
1586*0Sstevel@tonic-gate PUSHs(sv_2mortal(newSViv(minbits)));
1587*0Sstevel@tonic-gate PUSHs(sv_2mortal(newSViv(none)));
1588*0Sstevel@tonic-gate PUTBACK;
1589*0Sstevel@tonic-gate ENTER;
1590*0Sstevel@tonic-gate SAVEI32(PL_hints);
1591*0Sstevel@tonic-gate PL_hints = 0;
1592*0Sstevel@tonic-gate save_re_context();
1593*0Sstevel@tonic-gate if (IN_PERL_COMPILETIME) {
1594*0Sstevel@tonic-gate /* XXX ought to be handled by lex_start */
1595*0Sstevel@tonic-gate SAVEI32(PL_in_my);
1596*0Sstevel@tonic-gate PL_in_my = 0;
1597*0Sstevel@tonic-gate sv_setpv(tokenbufsv, PL_tokenbuf);
1598*0Sstevel@tonic-gate }
1599*0Sstevel@tonic-gate errsv_save = newSVsv(ERRSV);
1600*0Sstevel@tonic-gate if (call_method("SWASHNEW", G_SCALAR))
1601*0Sstevel@tonic-gate retval = newSVsv(*PL_stack_sp--);
1602*0Sstevel@tonic-gate else
1603*0Sstevel@tonic-gate retval = &PL_sv_undef;
1604*0Sstevel@tonic-gate if (!SvTRUE(ERRSV))
1605*0Sstevel@tonic-gate sv_setsv(ERRSV, errsv_save);
1606*0Sstevel@tonic-gate SvREFCNT_dec(errsv_save);
1607*0Sstevel@tonic-gate LEAVE;
1608*0Sstevel@tonic-gate POPSTACK;
1609*0Sstevel@tonic-gate if (IN_PERL_COMPILETIME) {
1610*0Sstevel@tonic-gate STRLEN len;
1611*0Sstevel@tonic-gate char* pv = SvPV(tokenbufsv, len);
1612*0Sstevel@tonic-gate
1613*0Sstevel@tonic-gate Copy(pv, PL_tokenbuf, len+1, char);
1614*0Sstevel@tonic-gate PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1615*0Sstevel@tonic-gate }
1616*0Sstevel@tonic-gate if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1617*0Sstevel@tonic-gate if (SvPOK(retval))
1618*0Sstevel@tonic-gate Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1619*0Sstevel@tonic-gate retval);
1620*0Sstevel@tonic-gate Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1621*0Sstevel@tonic-gate }
1622*0Sstevel@tonic-gate return retval;
1623*0Sstevel@tonic-gate }
1624*0Sstevel@tonic-gate
1625*0Sstevel@tonic-gate
1626*0Sstevel@tonic-gate /* This API is wrong for special case conversions since we may need to
1627*0Sstevel@tonic-gate * return several Unicode characters for a single Unicode character
1628*0Sstevel@tonic-gate * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1629*0Sstevel@tonic-gate * the lower-level routine, and it is similarly broken for returning
1630*0Sstevel@tonic-gate * multiple values. --jhi */
1631*0Sstevel@tonic-gate UV
Perl_swash_fetch(pTHX_ SV * sv,U8 * ptr,bool do_utf8)1632*0Sstevel@tonic-gate Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1633*0Sstevel@tonic-gate {
1634*0Sstevel@tonic-gate HV* hv = (HV*)SvRV(sv);
1635*0Sstevel@tonic-gate U32 klen;
1636*0Sstevel@tonic-gate U32 off;
1637*0Sstevel@tonic-gate STRLEN slen;
1638*0Sstevel@tonic-gate STRLEN needents;
1639*0Sstevel@tonic-gate U8 *tmps = NULL;
1640*0Sstevel@tonic-gate U32 bit;
1641*0Sstevel@tonic-gate SV *retval;
1642*0Sstevel@tonic-gate U8 tmputf8[2];
1643*0Sstevel@tonic-gate UV c = NATIVE_TO_ASCII(*ptr);
1644*0Sstevel@tonic-gate
1645*0Sstevel@tonic-gate if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1646*0Sstevel@tonic-gate tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1647*0Sstevel@tonic-gate tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1648*0Sstevel@tonic-gate ptr = tmputf8;
1649*0Sstevel@tonic-gate }
1650*0Sstevel@tonic-gate /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1651*0Sstevel@tonic-gate * then the "swatch" is a vec() for al the chars which start
1652*0Sstevel@tonic-gate * with 0xAA..0xYY
1653*0Sstevel@tonic-gate * So the key in the hash (klen) is length of encoded char -1
1654*0Sstevel@tonic-gate */
1655*0Sstevel@tonic-gate klen = UTF8SKIP(ptr) - 1;
1656*0Sstevel@tonic-gate off = ptr[klen];
1657*0Sstevel@tonic-gate
1658*0Sstevel@tonic-gate if (klen == 0)
1659*0Sstevel@tonic-gate {
1660*0Sstevel@tonic-gate /* If char in invariant then swatch is for all the invariant chars
1661*0Sstevel@tonic-gate * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1662*0Sstevel@tonic-gate */
1663*0Sstevel@tonic-gate needents = UTF_CONTINUATION_MARK;
1664*0Sstevel@tonic-gate off = NATIVE_TO_UTF(ptr[klen]);
1665*0Sstevel@tonic-gate }
1666*0Sstevel@tonic-gate else
1667*0Sstevel@tonic-gate {
1668*0Sstevel@tonic-gate /* If char is encoded then swatch is for the prefix */
1669*0Sstevel@tonic-gate needents = (1 << UTF_ACCUMULATION_SHIFT);
1670*0Sstevel@tonic-gate off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1671*0Sstevel@tonic-gate }
1672*0Sstevel@tonic-gate
1673*0Sstevel@tonic-gate /*
1674*0Sstevel@tonic-gate * This single-entry cache saves about 1/3 of the utf8 overhead in test
1675*0Sstevel@tonic-gate * suite. (That is, only 7-8% overall over just a hash cache. Still,
1676*0Sstevel@tonic-gate * it's nothing to sniff at.) Pity we usually come through at least
1677*0Sstevel@tonic-gate * two function calls to get here...
1678*0Sstevel@tonic-gate *
1679*0Sstevel@tonic-gate * NB: this code assumes that swatches are never modified, once generated!
1680*0Sstevel@tonic-gate */
1681*0Sstevel@tonic-gate
1682*0Sstevel@tonic-gate if (hv == PL_last_swash_hv &&
1683*0Sstevel@tonic-gate klen == PL_last_swash_klen &&
1684*0Sstevel@tonic-gate (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1685*0Sstevel@tonic-gate {
1686*0Sstevel@tonic-gate tmps = PL_last_swash_tmps;
1687*0Sstevel@tonic-gate slen = PL_last_swash_slen;
1688*0Sstevel@tonic-gate }
1689*0Sstevel@tonic-gate else {
1690*0Sstevel@tonic-gate /* Try our second-level swatch cache, kept in a hash. */
1691*0Sstevel@tonic-gate SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1692*0Sstevel@tonic-gate
1693*0Sstevel@tonic-gate /* If not cached, generate it via utf8::SWASHGET */
1694*0Sstevel@tonic-gate if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1695*0Sstevel@tonic-gate dSP;
1696*0Sstevel@tonic-gate /* We use utf8n_to_uvuni() as we want an index into
1697*0Sstevel@tonic-gate Unicode tables, not a native character number.
1698*0Sstevel@tonic-gate */
1699*0Sstevel@tonic-gate UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1700*0Sstevel@tonic-gate ckWARN(WARN_UTF8) ?
1701*0Sstevel@tonic-gate 0 : UTF8_ALLOW_ANY);
1702*0Sstevel@tonic-gate SV *errsv_save;
1703*0Sstevel@tonic-gate ENTER;
1704*0Sstevel@tonic-gate SAVETMPS;
1705*0Sstevel@tonic-gate save_re_context();
1706*0Sstevel@tonic-gate PUSHSTACKi(PERLSI_MAGIC);
1707*0Sstevel@tonic-gate PUSHMARK(SP);
1708*0Sstevel@tonic-gate EXTEND(SP,3);
1709*0Sstevel@tonic-gate PUSHs((SV*)sv);
1710*0Sstevel@tonic-gate /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1711*0Sstevel@tonic-gate PUSHs(sv_2mortal(newSViv((klen) ?
1712*0Sstevel@tonic-gate (code_point & ~(needents - 1)) : 0)));
1713*0Sstevel@tonic-gate PUSHs(sv_2mortal(newSViv(needents)));
1714*0Sstevel@tonic-gate PUTBACK;
1715*0Sstevel@tonic-gate errsv_save = newSVsv(ERRSV);
1716*0Sstevel@tonic-gate if (call_method("SWASHGET", G_SCALAR))
1717*0Sstevel@tonic-gate retval = newSVsv(*PL_stack_sp--);
1718*0Sstevel@tonic-gate else
1719*0Sstevel@tonic-gate retval = &PL_sv_undef;
1720*0Sstevel@tonic-gate if (!SvTRUE(ERRSV))
1721*0Sstevel@tonic-gate sv_setsv(ERRSV, errsv_save);
1722*0Sstevel@tonic-gate SvREFCNT_dec(errsv_save);
1723*0Sstevel@tonic-gate POPSTACK;
1724*0Sstevel@tonic-gate FREETMPS;
1725*0Sstevel@tonic-gate LEAVE;
1726*0Sstevel@tonic-gate if (IN_PERL_COMPILETIME)
1727*0Sstevel@tonic-gate PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1728*0Sstevel@tonic-gate
1729*0Sstevel@tonic-gate svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1730*0Sstevel@tonic-gate
1731*0Sstevel@tonic-gate if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1732*0Sstevel@tonic-gate Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1733*0Sstevel@tonic-gate }
1734*0Sstevel@tonic-gate
1735*0Sstevel@tonic-gate PL_last_swash_hv = hv;
1736*0Sstevel@tonic-gate PL_last_swash_klen = klen;
1737*0Sstevel@tonic-gate PL_last_swash_tmps = tmps;
1738*0Sstevel@tonic-gate PL_last_swash_slen = slen;
1739*0Sstevel@tonic-gate if (klen)
1740*0Sstevel@tonic-gate Copy(ptr, PL_last_swash_key, klen, U8);
1741*0Sstevel@tonic-gate }
1742*0Sstevel@tonic-gate
1743*0Sstevel@tonic-gate switch ((int)((slen << 3) / needents)) {
1744*0Sstevel@tonic-gate case 1:
1745*0Sstevel@tonic-gate bit = 1 << (off & 7);
1746*0Sstevel@tonic-gate off >>= 3;
1747*0Sstevel@tonic-gate return (tmps[off] & bit) != 0;
1748*0Sstevel@tonic-gate case 8:
1749*0Sstevel@tonic-gate return tmps[off];
1750*0Sstevel@tonic-gate case 16:
1751*0Sstevel@tonic-gate off <<= 1;
1752*0Sstevel@tonic-gate return (tmps[off] << 8) + tmps[off + 1] ;
1753*0Sstevel@tonic-gate case 32:
1754*0Sstevel@tonic-gate off <<= 2;
1755*0Sstevel@tonic-gate return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1756*0Sstevel@tonic-gate }
1757*0Sstevel@tonic-gate Perl_croak(aTHX_ "panic: swash_fetch");
1758*0Sstevel@tonic-gate return 0;
1759*0Sstevel@tonic-gate }
1760*0Sstevel@tonic-gate
1761*0Sstevel@tonic-gate
1762*0Sstevel@tonic-gate /*
1763*0Sstevel@tonic-gate =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1764*0Sstevel@tonic-gate
1765*0Sstevel@tonic-gate Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1766*0Sstevel@tonic-gate of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1767*0Sstevel@tonic-gate bytes available. The return value is the pointer to the byte after the
1768*0Sstevel@tonic-gate end of the new character. In other words,
1769*0Sstevel@tonic-gate
1770*0Sstevel@tonic-gate d = uvchr_to_utf8(d, uv);
1771*0Sstevel@tonic-gate
1772*0Sstevel@tonic-gate is the recommended wide native character-aware way of saying
1773*0Sstevel@tonic-gate
1774*0Sstevel@tonic-gate *(d++) = uv;
1775*0Sstevel@tonic-gate
1776*0Sstevel@tonic-gate =cut
1777*0Sstevel@tonic-gate */
1778*0Sstevel@tonic-gate
1779*0Sstevel@tonic-gate /* On ASCII machines this is normally a macro but we want a
1780*0Sstevel@tonic-gate real function in case XS code wants it
1781*0Sstevel@tonic-gate */
1782*0Sstevel@tonic-gate #undef Perl_uvchr_to_utf8
1783*0Sstevel@tonic-gate U8 *
Perl_uvchr_to_utf8(pTHX_ U8 * d,UV uv)1784*0Sstevel@tonic-gate Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1785*0Sstevel@tonic-gate {
1786*0Sstevel@tonic-gate return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1787*0Sstevel@tonic-gate }
1788*0Sstevel@tonic-gate
1789*0Sstevel@tonic-gate U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)1790*0Sstevel@tonic-gate Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1791*0Sstevel@tonic-gate {
1792*0Sstevel@tonic-gate return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1793*0Sstevel@tonic-gate }
1794*0Sstevel@tonic-gate
1795*0Sstevel@tonic-gate /*
1796*0Sstevel@tonic-gate =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1797*0Sstevel@tonic-gate
1798*0Sstevel@tonic-gate Returns the native character value of the first character in the string C<s>
1799*0Sstevel@tonic-gate which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1800*0Sstevel@tonic-gate length, in bytes, of that character.
1801*0Sstevel@tonic-gate
1802*0Sstevel@tonic-gate Allows length and flags to be passed to low level routine.
1803*0Sstevel@tonic-gate
1804*0Sstevel@tonic-gate =cut
1805*0Sstevel@tonic-gate */
1806*0Sstevel@tonic-gate /* On ASCII machines this is normally a macro but we want
1807*0Sstevel@tonic-gate a real function in case XS code wants it
1808*0Sstevel@tonic-gate */
1809*0Sstevel@tonic-gate #undef Perl_utf8n_to_uvchr
1810*0Sstevel@tonic-gate UV
Perl_utf8n_to_uvchr(pTHX_ U8 * s,STRLEN curlen,STRLEN * retlen,U32 flags)1811*0Sstevel@tonic-gate Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1812*0Sstevel@tonic-gate {
1813*0Sstevel@tonic-gate UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1814*0Sstevel@tonic-gate return UNI_TO_NATIVE(uv);
1815*0Sstevel@tonic-gate }
1816*0Sstevel@tonic-gate
1817*0Sstevel@tonic-gate /*
1818*0Sstevel@tonic-gate =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1819*0Sstevel@tonic-gate
1820*0Sstevel@tonic-gate Build to the scalar dsv a displayable version of the string spv,
1821*0Sstevel@tonic-gate length len, the displayable version being at most pvlim bytes long
1822*0Sstevel@tonic-gate (if longer, the rest is truncated and "..." will be appended).
1823*0Sstevel@tonic-gate
1824*0Sstevel@tonic-gate The flags argument can have UNI_DISPLAY_ISPRINT set to display
1825*0Sstevel@tonic-gate isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1826*0Sstevel@tonic-gate to display the \\[nrfta\\] as the backslashed versions (like '\n')
1827*0Sstevel@tonic-gate (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1828*0Sstevel@tonic-gate UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1829*0Sstevel@tonic-gate UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1830*0Sstevel@tonic-gate
1831*0Sstevel@tonic-gate The pointer to the PV of the dsv is returned.
1832*0Sstevel@tonic-gate
1833*0Sstevel@tonic-gate =cut */
1834*0Sstevel@tonic-gate char *
Perl_pv_uni_display(pTHX_ SV * dsv,U8 * spv,STRLEN len,STRLEN pvlim,UV flags)1835*0Sstevel@tonic-gate Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1836*0Sstevel@tonic-gate {
1837*0Sstevel@tonic-gate int truncated = 0;
1838*0Sstevel@tonic-gate char *s, *e;
1839*0Sstevel@tonic-gate
1840*0Sstevel@tonic-gate sv_setpvn(dsv, "", 0);
1841*0Sstevel@tonic-gate for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1842*0Sstevel@tonic-gate UV u;
1843*0Sstevel@tonic-gate bool ok = FALSE;
1844*0Sstevel@tonic-gate
1845*0Sstevel@tonic-gate if (pvlim && SvCUR(dsv) >= pvlim) {
1846*0Sstevel@tonic-gate truncated++;
1847*0Sstevel@tonic-gate break;
1848*0Sstevel@tonic-gate }
1849*0Sstevel@tonic-gate u = utf8_to_uvchr((U8*)s, 0);
1850*0Sstevel@tonic-gate if (u < 256) {
1851*0Sstevel@tonic-gate if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1852*0Sstevel@tonic-gate switch (u & 0xFF) {
1853*0Sstevel@tonic-gate case '\n':
1854*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1855*0Sstevel@tonic-gate case '\r':
1856*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1857*0Sstevel@tonic-gate case '\t':
1858*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1859*0Sstevel@tonic-gate case '\f':
1860*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1861*0Sstevel@tonic-gate case '\a':
1862*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1863*0Sstevel@tonic-gate case '\\':
1864*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1865*0Sstevel@tonic-gate default: break;
1866*0Sstevel@tonic-gate }
1867*0Sstevel@tonic-gate }
1868*0Sstevel@tonic-gate /* isPRINT() is the locale-blind version. */
1869*0Sstevel@tonic-gate if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1870*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1871*0Sstevel@tonic-gate ok = TRUE;
1872*0Sstevel@tonic-gate }
1873*0Sstevel@tonic-gate }
1874*0Sstevel@tonic-gate if (!ok)
1875*0Sstevel@tonic-gate Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1876*0Sstevel@tonic-gate }
1877*0Sstevel@tonic-gate if (truncated)
1878*0Sstevel@tonic-gate sv_catpvn(dsv, "...", 3);
1879*0Sstevel@tonic-gate
1880*0Sstevel@tonic-gate return SvPVX(dsv);
1881*0Sstevel@tonic-gate }
1882*0Sstevel@tonic-gate
1883*0Sstevel@tonic-gate /*
1884*0Sstevel@tonic-gate =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1885*0Sstevel@tonic-gate
1886*0Sstevel@tonic-gate Build to the scalar dsv a displayable version of the scalar sv,
1887*0Sstevel@tonic-gate the displayable version being at most pvlim bytes long
1888*0Sstevel@tonic-gate (if longer, the rest is truncated and "..." will be appended).
1889*0Sstevel@tonic-gate
1890*0Sstevel@tonic-gate The flags argument is as in pv_uni_display().
1891*0Sstevel@tonic-gate
1892*0Sstevel@tonic-gate The pointer to the PV of the dsv is returned.
1893*0Sstevel@tonic-gate
1894*0Sstevel@tonic-gate =cut */
1895*0Sstevel@tonic-gate char *
Perl_sv_uni_display(pTHX_ SV * dsv,SV * ssv,STRLEN pvlim,UV flags)1896*0Sstevel@tonic-gate Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1897*0Sstevel@tonic-gate {
1898*0Sstevel@tonic-gate return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1899*0Sstevel@tonic-gate pvlim, flags);
1900*0Sstevel@tonic-gate }
1901*0Sstevel@tonic-gate
1902*0Sstevel@tonic-gate /*
1903*0Sstevel@tonic-gate =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1904*0Sstevel@tonic-gate
1905*0Sstevel@tonic-gate Return true if the strings s1 and s2 differ case-insensitively, false
1906*0Sstevel@tonic-gate if not (if they are equal case-insensitively). If u1 is true, the
1907*0Sstevel@tonic-gate string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1908*0Sstevel@tonic-gate the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1909*0Sstevel@tonic-gate are false, the respective string is assumed to be in native 8-bit
1910*0Sstevel@tonic-gate encoding.
1911*0Sstevel@tonic-gate
1912*0Sstevel@tonic-gate If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1913*0Sstevel@tonic-gate in there (they will point at the beginning of the I<next> character).
1914*0Sstevel@tonic-gate If the pointers behind pe1 or pe2 are non-NULL, they are the end
1915*0Sstevel@tonic-gate pointers beyond which scanning will not continue under any
1916*0Sstevel@tonic-gate circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1917*0Sstevel@tonic-gate s2+l2 will be used as goal end pointers that will also stop the scan,
1918*0Sstevel@tonic-gate and which qualify towards defining a successful match: all the scans
1919*0Sstevel@tonic-gate that define an explicit length must reach their goal pointers for
1920*0Sstevel@tonic-gate a match to succeed).
1921*0Sstevel@tonic-gate
1922*0Sstevel@tonic-gate For case-insensitiveness, the "casefolding" of Unicode is used
1923*0Sstevel@tonic-gate instead of upper/lowercasing both the characters, see
1924*0Sstevel@tonic-gate http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1925*0Sstevel@tonic-gate
1926*0Sstevel@tonic-gate =cut */
1927*0Sstevel@tonic-gate I32
Perl_ibcmp_utf8(pTHX_ const char * s1,char ** pe1,register UV l1,bool u1,const char * s2,char ** pe2,register UV l2,bool u2)1928*0Sstevel@tonic-gate Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1929*0Sstevel@tonic-gate {
1930*0Sstevel@tonic-gate register U8 *p1 = (U8*)s1;
1931*0Sstevel@tonic-gate register U8 *p2 = (U8*)s2;
1932*0Sstevel@tonic-gate register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1933*0Sstevel@tonic-gate register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1934*0Sstevel@tonic-gate STRLEN n1 = 0, n2 = 0;
1935*0Sstevel@tonic-gate U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1936*0Sstevel@tonic-gate U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1937*0Sstevel@tonic-gate U8 natbuf[1+1];
1938*0Sstevel@tonic-gate STRLEN foldlen1, foldlen2;
1939*0Sstevel@tonic-gate bool match;
1940*0Sstevel@tonic-gate
1941*0Sstevel@tonic-gate if (pe1)
1942*0Sstevel@tonic-gate e1 = *(U8**)pe1;
1943*0Sstevel@tonic-gate if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1944*0Sstevel@tonic-gate f1 = (U8*)s1 + l1;
1945*0Sstevel@tonic-gate if (pe2)
1946*0Sstevel@tonic-gate e2 = *(U8**)pe2;
1947*0Sstevel@tonic-gate if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1948*0Sstevel@tonic-gate f2 = (U8*)s2 + l2;
1949*0Sstevel@tonic-gate
1950*0Sstevel@tonic-gate if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1951*0Sstevel@tonic-gate return 1; /* mismatch; possible infinite loop or false positive */
1952*0Sstevel@tonic-gate
1953*0Sstevel@tonic-gate if (!u1 || !u2)
1954*0Sstevel@tonic-gate natbuf[1] = 0; /* Need to terminate the buffer. */
1955*0Sstevel@tonic-gate
1956*0Sstevel@tonic-gate while ((e1 == 0 || p1 < e1) &&
1957*0Sstevel@tonic-gate (f1 == 0 || p1 < f1) &&
1958*0Sstevel@tonic-gate (e2 == 0 || p2 < e2) &&
1959*0Sstevel@tonic-gate (f2 == 0 || p2 < f2)) {
1960*0Sstevel@tonic-gate if (n1 == 0) {
1961*0Sstevel@tonic-gate if (u1)
1962*0Sstevel@tonic-gate to_utf8_fold(p1, foldbuf1, &foldlen1);
1963*0Sstevel@tonic-gate else {
1964*0Sstevel@tonic-gate natbuf[0] = *p1;
1965*0Sstevel@tonic-gate to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1966*0Sstevel@tonic-gate }
1967*0Sstevel@tonic-gate q1 = foldbuf1;
1968*0Sstevel@tonic-gate n1 = foldlen1;
1969*0Sstevel@tonic-gate }
1970*0Sstevel@tonic-gate if (n2 == 0) {
1971*0Sstevel@tonic-gate if (u2)
1972*0Sstevel@tonic-gate to_utf8_fold(p2, foldbuf2, &foldlen2);
1973*0Sstevel@tonic-gate else {
1974*0Sstevel@tonic-gate natbuf[0] = *p2;
1975*0Sstevel@tonic-gate to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1976*0Sstevel@tonic-gate }
1977*0Sstevel@tonic-gate q2 = foldbuf2;
1978*0Sstevel@tonic-gate n2 = foldlen2;
1979*0Sstevel@tonic-gate }
1980*0Sstevel@tonic-gate while (n1 && n2) {
1981*0Sstevel@tonic-gate if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1982*0Sstevel@tonic-gate (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1983*0Sstevel@tonic-gate memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1984*0Sstevel@tonic-gate return 1; /* mismatch */
1985*0Sstevel@tonic-gate n1 -= UTF8SKIP(q1);
1986*0Sstevel@tonic-gate q1 += UTF8SKIP(q1);
1987*0Sstevel@tonic-gate n2 -= UTF8SKIP(q2);
1988*0Sstevel@tonic-gate q2 += UTF8SKIP(q2);
1989*0Sstevel@tonic-gate }
1990*0Sstevel@tonic-gate if (n1 == 0)
1991*0Sstevel@tonic-gate p1 += u1 ? UTF8SKIP(p1) : 1;
1992*0Sstevel@tonic-gate if (n2 == 0)
1993*0Sstevel@tonic-gate p2 += u2 ? UTF8SKIP(p2) : 1;
1994*0Sstevel@tonic-gate
1995*0Sstevel@tonic-gate }
1996*0Sstevel@tonic-gate
1997*0Sstevel@tonic-gate /* A match is defined by all the scans that specified
1998*0Sstevel@tonic-gate * an explicit length reaching their final goals. */
1999*0Sstevel@tonic-gate match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2000*0Sstevel@tonic-gate
2001*0Sstevel@tonic-gate if (match) {
2002*0Sstevel@tonic-gate if (pe1)
2003*0Sstevel@tonic-gate *pe1 = (char*)p1;
2004*0Sstevel@tonic-gate if (pe2)
2005*0Sstevel@tonic-gate *pe2 = (char*)p2;
2006*0Sstevel@tonic-gate }
2007*0Sstevel@tonic-gate
2008*0Sstevel@tonic-gate return match ? 0 : 1; /* 0 match, 1 mismatch */
2009*0Sstevel@tonic-gate }
2010*0Sstevel@tonic-gate
2011