xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Unicode/Unicode.xs (revision aa5e9e10509ffd51558f081f01cd78bfa3c4f2a5)
1 /*
2  $Id: Unicode.xs,v 2.8 2011/08/09 07:49:44 dankogai Exp dankogai $
3  */
4 
5 #define PERL_NO_GET_CONTEXT
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 #define U8 U8
10 #include "../Encode/encode.h"
11 
12 #define FBCHAR			0xFFFd
13 #define BOM_BE			0xFeFF
14 #define BOM16LE			0xFFFe
15 #define BOM32LE			0xFFFe0000
16 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
17 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
18 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
19 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
20 
21 /* For pre-5.14 source compatibility */
22 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
23 #   define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
24 #   define UTF8_DISALLOW_SURROGATE 0
25 #   define UTF8_WARN_SURROGATE 0
26 #   define UTF8_DISALLOW_FE_FF 0
27 #   define UTF8_WARN_FE_FF 0
28 #   define UTF8_WARN_NONCHAR 0
29 #endif
30 
31 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
32 
33 /* Avoid wasting too much space in the result buffer */
34 /* static void */
35 /* shrink_buffer(SV *result) */
36 /* { */
37 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
38 /* 	char *buf; */
39 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
40 /* 	New(0, buf, len, char); */
41 /* 	Copy(SvPVX(result), buf, len, char); */
42 /* 	Safefree(SvPVX(result)); */
43 /* 	SvPV_set(result, buf); */
44 /* 	SvLEN_set(result, len); */
45 /*     } */
46 /* } */
47 
48 #define shrink_buffer(result) { \
49     if (SvLEN(result) > 42 + SvCUR(result)) { \
50 	char *newpv; \
51 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
52 	New(0, newpv, newlen, char); \
53 	Copy(SvPVX(result), newpv, newlen, char); \
54 	Safefree(SvPVX(result)); \
55 	SvPV_set(result, newpv); \
56 	SvLEN_set(result, newlen); \
57     } \
58 }
59 
60 static UV
61 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
62 {
63     U8 *s = *sp;
64     UV v = 0;
65     if (s+size > e) {
66 	croak("Partial character %c",(char) endian);
67     }
68     switch(endian) {
69     case 'N':
70 	v = *s++;
71 	v = (v << 8) | *s++;
72     case 'n':
73 	v = (v << 8) | *s++;
74 	v = (v << 8) | *s++;
75 	break;
76     case 'V':
77     case 'v':
78 	v |= *s++;
79 	v |= (*s++ << 8);
80 	if (endian == 'v')
81 	    break;
82 	v |= (*s++ << 16);
83 	v |= (*s++ << 24);
84 	break;
85     default:
86 	croak("Unknown endian %c",(char) endian);
87 	break;
88     }
89     *sp = s;
90     return v;
91 }
92 
93 void
94 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
95 {
96     U8 *d = (U8 *) SvPV_nolen(result);
97 
98     switch(endian) {
99     case 'v':
100     case 'V':
101 	d += SvCUR(result);
102 	SvCUR_set(result,SvCUR(result)+size);
103 	while (size--) {
104 	    *d++ = (U8)(value & 0xFF);
105 	    value >>= 8;
106 	}
107 	break;
108     case 'n':
109     case 'N':
110 	SvCUR_set(result,SvCUR(result)+size);
111 	d += SvCUR(result);
112 	while (size--) {
113 	    *--d = (U8)(value & 0xFF);
114 	    value >>= 8;
115 	}
116 	break;
117     default:
118 	croak("Unknown endian %c",(char) endian);
119 	break;
120     }
121 }
122 
123 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
124 
125 PROTOTYPES: DISABLE
126 
127 #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
128     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
129 
130 void
131 decode_xs(obj, str, check = 0)
132 SV *	obj
133 SV *	str
134 IV	check
135 CODE:
136 {
137     U8 endian    = *((U8 *)SvPV_nolen(attr("endian", 6)));
138     int size     = SvIV(attr("size", 4));
139     int ucs2     = -1; /* only needed in the event of surrogate pairs */
140     SV *result   = newSVpvn("",0);
141     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
142     STRLEN ulen;
143     STRLEN resultbuflen;
144     U8 *resultbuf;
145     U8 *s = (U8 *)SvPVbyte(str,ulen);
146     U8 *e = (U8 *)SvEND(str);
147     /* Optimise for the common case of being called from PerlIOEncode_fill()
148        with a standard length buffer. In this case the result SV's buffer is
149        only used temporarily, so we can afford to allocate the maximum needed
150        and not care about unused space. */
151     const bool temp_result = (ulen == PERLIO_BUFSIZ);
152 
153     ST(0) = sv_2mortal(result);
154     SvUTF8_on(result);
155 
156     if (!endian && s+size <= e) {
157 	UV bom;
158 	endian = (size == 4) ? 'N' : 'n';
159 	bom = enc_unpack(aTHX_ &s,e,size,endian);
160 	if (bom != BOM_BE) {
161 	    if (bom == BOM16LE) {
162 		endian = 'v';
163 	    }
164 	    else if (bom == BOM32LE) {
165 		endian = 'V';
166 	    }
167 	    else {
168 		croak("%"SVf":Unrecognised BOM %"UVxf,
169 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
170 		      bom);
171 	    }
172 	}
173 #if 1
174 	/* Update endian for next sequence */
175 	if (SvTRUE(attr("renewed", 7))) {
176 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
177 	}
178 #endif
179     }
180 
181     if (temp_result) {
182 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
183     } else {
184 	/* Preallocate the buffer to the minimum possible space required. */
185 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
186     }
187     resultbuf = (U8 *) SvGROW(result, resultbuflen);
188 
189     while (s < e && s+size <= e) {
190 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
191 	U8 *d;
192 	if (issurrogate(ord)) {
193 	    if (ucs2 == -1) {
194 		ucs2 = SvTRUE(attr("ucs2", 4));
195 	    }
196 	    if (ucs2 || size == 4) {
197 		if (check) {
198 		    croak("%"SVf":no surrogates allowed %"UVxf,
199 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
200 			  ord);
201 		}
202 		if (s+size <= e) {
203 		    /* skip the next one as well */
204 		    enc_unpack(aTHX_ &s,e,size,endian);
205 		}
206 		ord = FBCHAR;
207 	    }
208 	    else {
209 		UV lo;
210 		if (!isHiSurrogate(ord)) {
211 		    if (check) {
212 			croak("%"SVf":Malformed HI surrogate %"UVxf,
213 			      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
214 			      ord);
215 		    }
216 		    else {
217 			ord = FBCHAR;
218 		    }
219 		}
220 		else {
221 		    if (s+size > e) {
222 			/* Partial character */
223 			s -= size;   /* back up to 1st half */
224 			break;       /* And exit loop */
225 		    }
226 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
227 		    if (!isLoSurrogate(lo)) {
228 			if (check) {
229 			    croak("%"SVf":Malformed LO surrogate %"UVxf,
230 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
231 				  ord);
232 			}
233 			else {
234 			    ord = FBCHAR;
235 			}
236 		    }
237 		    else {
238 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
239 		    }
240 		}
241 	    }
242 	}
243 
244 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
245 	    if (check) {
246 		croak("%"SVf":Unicode character %"UVxf" is illegal",
247 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
248 		      ord);
249 	    } else {
250 		ord = FBCHAR;
251 	    }
252 	}
253 
254 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
255 	    /* Do not allocate >8Mb more than the minimum needed.
256 	       This prevents allocating too much in the rogue case of a large
257 	       input consisting initially of long sequence uft8-byte unicode
258 	       chars followed by single utf8-byte chars. */
259             /* +1
260                fixes  Unicode.xs!decode_xs n-byte heap-overflow
261               */
262 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
263 	    STRLEN max_alloc = remaining + (8*1024*1024);
264 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
265 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
266 		(est_alloc > max_alloc ? max_alloc : est_alloc);
267 	    resultbuf = (U8 *) SvGROW(result, newlen);
268 	    resultbuflen = SvLEN(result);
269 	}
270 
271 	d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
272                                             UNICODE_WARN_ILLEGAL_INTERCHANGE);
273 	SvCUR_set(result, d - (U8 *)SvPVX(result));
274     }
275 
276     if (s < e) {
277 	/* unlikely to happen because it's fixed-length -- dankogai */
278 	if (check & ENCODE_WARN_ON_ERR) {
279 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
280 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
281 	}
282     }
283     if (check && !(check & ENCODE_LEAVE_SRC)) {
284 	if (s < e) {
285 	    Move(s,SvPVX(str),e-s,U8);
286 	    SvCUR_set(str,(e-s));
287 	}
288 	else {
289 	    SvCUR_set(str,0);
290 	}
291 	*SvEND(str) = '\0';
292     }
293 
294     if (!temp_result)
295 	shrink_buffer(result);
296 
297     XSRETURN(1);
298 }
299 
300 void
301 encode_xs(obj, utf8, check = 0)
302 SV *	obj
303 SV *	utf8
304 IV	check
305 CODE:
306 {
307     U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
308     const int size = SvIV(attr("size", 4));
309     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
310     const STRLEN usize = (size > 0 ? size : 1);
311     SV *result = newSVpvn("", 0);
312     STRLEN ulen;
313     U8 *s = (U8 *) SvPVutf8(utf8, ulen);
314     const U8 *e = (U8 *) SvEND(utf8);
315     /* Optimise for the common case of being called from PerlIOEncode_flush()
316        with a standard length buffer. In this case the result SV's buffer is
317        only used temporarily, so we can afford to allocate the maximum needed
318        and not care about unused space. */
319     const bool temp_result = (ulen == PERLIO_BUFSIZ);
320 
321     ST(0) = sv_2mortal(result);
322 
323     /* Preallocate the result buffer to the maximum possible size.
324        ie. assume each UTF8 byte is 1 character.
325        Then shrink the result's buffer if necesary at the end. */
326     SvGROW(result, ((ulen+1) * usize));
327 
328     if (!endian) {
329 	endian = (size == 4) ? 'N' : 'n';
330 	enc_pack(aTHX_ result,size,endian,BOM_BE);
331 #if 1
332 	/* Update endian for next sequence */
333 	if (SvTRUE(attr("renewed", 7))) {
334 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
335 	}
336 #endif
337     }
338     while (s < e && s+UTF8SKIP(s) <= e) {
339 	STRLEN len;
340 	UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
341                                                |UTF8_WARN_SURROGATE
342                                                |UTF8_DISALLOW_FE_FF
343                                                |UTF8_WARN_FE_FF
344                                                |UTF8_WARN_NONCHAR));
345 	s += len;
346 	if (size != 4 && invalid_ucs2(ord)) {
347 	    if (!issurrogate(ord)) {
348 		if (ucs2 == -1) {
349 		    ucs2 = SvTRUE(attr("ucs2", 4));
350 		}
351 		if (ucs2) {
352 		    if (check) {
353 			croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
354 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
355 		    }
356 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
357 		} else {
358 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
359 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
360 		    enc_pack(aTHX_ result,size,endian,hi);
361 		    enc_pack(aTHX_ result,size,endian,lo);
362 		}
363 	    }
364 	    else {
365 		/* not supposed to happen */
366 		enc_pack(aTHX_ result,size,endian,FBCHAR);
367 	    }
368 	}
369 	else {
370 	    enc_pack(aTHX_ result,size,endian,ord);
371 	}
372     }
373     if (s < e) {
374 	/* UTF-8 partial char happens often on PerlIO.
375 	   Since this is okay and normal, we do not warn.
376 	   But this is critical when you choose to LEAVE_SRC
377 	   in which case we die */
378 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
379 	    Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
380 		       "when CHECK = 0x%" UVuf,
381 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
382 	}
383     }
384     if (check && !(check & ENCODE_LEAVE_SRC)) {
385 	if (s < e) {
386 	    Move(s,SvPVX(utf8),e-s,U8);
387 	    SvCUR_set(utf8,(e-s));
388 	}
389 	else {
390 	    SvCUR_set(utf8,0);
391 	}
392 	*SvEND(utf8) = '\0';
393     }
394 
395     if (!temp_result)
396 	shrink_buffer(result);
397 
398     SvSETMAGIC(utf8);
399 
400     XSRETURN(1);
401 }
402