xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Unicode/Unicode.xs (revision e9ce384231aabe5c5a622aa68cef46f2c5bfdb4a)
1 /*
2  $Id: Unicode.xs,v 2.9 2012/08/05 23:08:49 dankogai Exp $
3  */
4 
5 #define PERL_NO_GET_CONTEXT
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 #define U8 U8
10 #include "../Encode/encode.h"
11 
12 #define FBCHAR			0xFFFd
13 #define BOM_BE			0xFeFF
14 #define BOM16LE			0xFFFe
15 #define BOM32LE			0xFFFe0000
16 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
17 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
18 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
19 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
20 
21 /* For pre-5.14 source compatibility */
22 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
23 #   define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
24 #   define UTF8_DISALLOW_SURROGATE 0
25 #   define UTF8_WARN_SURROGATE 0
26 #   define UTF8_DISALLOW_FE_FF 0
27 #   define UTF8_WARN_FE_FF 0
28 #   define UTF8_WARN_NONCHAR 0
29 #endif
30 
31 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
32 
33 /* Avoid wasting too much space in the result buffer */
34 /* static void */
35 /* shrink_buffer(SV *result) */
36 /* { */
37 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
38 /* 	char *buf; */
39 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
40 /* 	New(0, buf, len, char); */
41 /* 	Copy(SvPVX(result), buf, len, char); */
42 /* 	Safefree(SvPVX(result)); */
43 /* 	SvPV_set(result, buf); */
44 /* 	SvLEN_set(result, len); */
45 /*     } */
46 /* } */
47 
48 #define shrink_buffer(result) { \
49     if (SvLEN(result) > 42 + SvCUR(result)) { \
50 	char *newpv; \
51 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
52 	New(0, newpv, newlen, char); \
53 	Copy(SvPVX(result), newpv, newlen, char); \
54 	Safefree(SvPVX(result)); \
55 	SvPV_set(result, newpv); \
56 	SvLEN_set(result, newlen); \
57     } \
58 }
59 
60 static UV
61 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
62 {
63     U8 *s = *sp;
64     UV v = 0;
65     if (s+size > e) {
66 	croak("Partial character %c",(char) endian);
67     }
68     switch(endian) {
69     case 'N':
70 	v = *s++;
71 	v = (v << 8) | *s++;
72     case 'n':
73 	v = (v << 8) | *s++;
74 	v = (v << 8) | *s++;
75 	break;
76     case 'V':
77     case 'v':
78 	v |= *s++;
79 	v |= (*s++ << 8);
80 	if (endian == 'v')
81 	    break;
82 	v |= (*s++ << 16);
83 	v |= (*s++ << 24);
84 	break;
85     default:
86 	croak("Unknown endian %c",(char) endian);
87 	break;
88     }
89     *sp = s;
90     return v;
91 }
92 
93 void
94 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
95 {
96     U8 *d = (U8 *) SvPV_nolen(result);
97 
98     switch(endian) {
99     case 'v':
100     case 'V':
101 	d += SvCUR(result);
102 	SvCUR_set(result,SvCUR(result)+size);
103 	while (size--) {
104 	    *d++ = (U8)(value & 0xFF);
105 	    value >>= 8;
106 	}
107 	break;
108     case 'n':
109     case 'N':
110 	SvCUR_set(result,SvCUR(result)+size);
111 	d += SvCUR(result);
112 	while (size--) {
113 	    *--d = (U8)(value & 0xFF);
114 	    value >>= 8;
115 	}
116 	break;
117     default:
118 	croak("Unknown endian %c",(char) endian);
119 	break;
120     }
121 }
122 
123 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
124 
125 PROTOTYPES: DISABLE
126 
127 #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
128     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
129 
130 void
131 decode_xs(obj, str, check = 0)
132 SV *	obj
133 SV *	str
134 IV	check
135 CODE:
136 {
137     U8 endian    = *((U8 *)SvPV_nolen(attr("endian", 6)));
138     int size     = SvIV(attr("size", 4));
139     int ucs2     = -1; /* only needed in the event of surrogate pairs */
140     SV *result   = newSVpvn("",0);
141     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
142     STRLEN ulen;
143     STRLEN resultbuflen;
144     U8 *resultbuf;
145     U8 *s = (U8 *)SvPVbyte(str,ulen);
146     U8 *e = (U8 *)SvEND(str);
147     /* Optimise for the common case of being called from PerlIOEncode_fill()
148        with a standard length buffer. In this case the result SV's buffer is
149        only used temporarily, so we can afford to allocate the maximum needed
150        and not care about unused space. */
151     const bool temp_result = (ulen == PERLIO_BUFSIZ);
152 
153     ST(0) = sv_2mortal(result);
154     SvUTF8_on(result);
155 
156     if (!endian && s+size <= e) {
157 	UV bom;
158 	endian = (size == 4) ? 'N' : 'n';
159 	bom = enc_unpack(aTHX_ &s,e,size,endian);
160 	if (bom != BOM_BE) {
161 	    if (bom == BOM16LE) {
162 		endian = 'v';
163 	    }
164 	    else if (bom == BOM32LE) {
165 		endian = 'V';
166 	    }
167 	    else {
168 		croak("%"SVf":Unrecognised BOM %"UVxf,
169 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
170 		      bom);
171 	    }
172 	}
173 #if 1
174 	/* Update endian for next sequence */
175 	if (SvTRUE(attr("renewed", 7))) {
176 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
177 	}
178 #endif
179     }
180 
181     if (temp_result) {
182 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
183     } else {
184 	/* Preallocate the buffer to the minimum possible space required. */
185 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
186     }
187     resultbuf = (U8 *) SvGROW(result, resultbuflen);
188 
189     while (s < e && s+size <= e) {
190 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
191 	U8 *d;
192 	if (issurrogate(ord)) {
193 	    if (ucs2 == -1) {
194 		ucs2 = SvTRUE(attr("ucs2", 4));
195 	    }
196 	    if (ucs2 || size == 4) {
197 		if (check) {
198 		    croak("%"SVf":no surrogates allowed %"UVxf,
199 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
200 			  ord);
201 		}
202 		ord = FBCHAR;
203 	    }
204 	    else {
205 		UV lo;
206 		if (!isHiSurrogate(ord)) {
207 		    if (check) {
208 			croak("%"SVf":Malformed HI surrogate %"UVxf,
209 			      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
210 			      ord);
211 		    }
212 		    else {
213 			ord = FBCHAR;
214 		    }
215 		}
216 		else if (s+size > e) {
217 		    if (check) {
218 		        if (check & ENCODE_STOP_AT_PARTIAL) {
219 		             s -= size;
220 		             break;
221 		        }
222 		        else {
223 		             croak("%"SVf":Malformed HI surrogate %"UVxf,
224 				   *hv_fetch((HV *)SvRV(obj),"Name",4,0),
225 				   ord);
226 		        }
227 		    }
228 		    else {
229 		        ord = FBCHAR;
230 		    }
231 		}
232 		else {
233 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
234 		    if (!isLoSurrogate(lo)) {
235 			if (check) {
236 			    croak("%"SVf":Malformed LO surrogate %"UVxf,
237 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
238 				  ord);
239 			}
240 			else {
241 			    s -= size;
242 			    ord = FBCHAR;
243 			}
244 		    }
245 		    else {
246 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
247 		    }
248 		}
249 	    }
250 	}
251 
252 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
253 	    if (check) {
254 		croak("%"SVf":Unicode character %"UVxf" is illegal",
255 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
256 		      ord);
257 	    } else {
258 		ord = FBCHAR;
259 	    }
260 	}
261 
262 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
263 	    /* Do not allocate >8Mb more than the minimum needed.
264 	       This prevents allocating too much in the rogue case of a large
265 	       input consisting initially of long sequence uft8-byte unicode
266 	       chars followed by single utf8-byte chars. */
267             /* +1
268                fixes  Unicode.xs!decode_xs n-byte heap-overflow
269               */
270 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
271 	    STRLEN max_alloc = remaining + (8*1024*1024);
272 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
273 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
274 		(est_alloc > max_alloc ? max_alloc : est_alloc);
275 	    resultbuf = (U8 *) SvGROW(result, newlen);
276 	    resultbuflen = SvLEN(result);
277 	}
278 
279 	d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
280                                             UNICODE_WARN_ILLEGAL_INTERCHANGE);
281 	SvCUR_set(result, d - (U8 *)SvPVX(result));
282     }
283 
284     if (s < e) {
285 	/* unlikely to happen because it's fixed-length -- dankogai */
286 	if (check & ENCODE_WARN_ON_ERR) {
287 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
288 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
289 	}
290     }
291     if (check && !(check & ENCODE_LEAVE_SRC)) {
292 	if (s < e) {
293 	    Move(s,SvPVX(str),e-s,U8);
294 	    SvCUR_set(str,(e-s));
295 	}
296 	else {
297 	    SvCUR_set(str,0);
298 	}
299 	*SvEND(str) = '\0';
300     }
301 
302     if (!temp_result)
303 	shrink_buffer(result);
304 
305     XSRETURN(1);
306 }
307 
308 void
309 encode_xs(obj, utf8, check = 0)
310 SV *	obj
311 SV *	utf8
312 IV	check
313 CODE:
314 {
315     U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
316     const int size = SvIV(attr("size", 4));
317     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
318     const STRLEN usize = (size > 0 ? size : 1);
319     SV *result = newSVpvn("", 0);
320     STRLEN ulen;
321     U8 *s = (U8 *) SvPVutf8(utf8, ulen);
322     const U8 *e = (U8 *) SvEND(utf8);
323     /* Optimise for the common case of being called from PerlIOEncode_flush()
324        with a standard length buffer. In this case the result SV's buffer is
325        only used temporarily, so we can afford to allocate the maximum needed
326        and not care about unused space. */
327     const bool temp_result = (ulen == PERLIO_BUFSIZ);
328 
329     ST(0) = sv_2mortal(result);
330 
331     /* Preallocate the result buffer to the maximum possible size.
332        ie. assume each UTF8 byte is 1 character.
333        Then shrink the result's buffer if necesary at the end. */
334     SvGROW(result, ((ulen+1) * usize));
335 
336     if (!endian) {
337 	endian = (size == 4) ? 'N' : 'n';
338 	enc_pack(aTHX_ result,size,endian,BOM_BE);
339 #if 1
340 	/* Update endian for next sequence */
341 	if (SvTRUE(attr("renewed", 7))) {
342 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
343 	}
344 #endif
345     }
346     while (s < e && s+UTF8SKIP(s) <= e) {
347 	STRLEN len;
348 	UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
349                                                |UTF8_WARN_SURROGATE
350                                                |UTF8_DISALLOW_FE_FF
351                                                |UTF8_WARN_FE_FF
352                                                |UTF8_WARN_NONCHAR));
353 	s += len;
354 	if (size != 4 && invalid_ucs2(ord)) {
355 	    if (!issurrogate(ord)) {
356 		if (ucs2 == -1) {
357 		    ucs2 = SvTRUE(attr("ucs2", 4));
358 		}
359 		if (ucs2 || ord > 0x10FFFF) {
360 		    if (check) {
361 			croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
362 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
363 		    }
364 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
365 		} else {
366 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
367 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
368 		    enc_pack(aTHX_ result,size,endian,hi);
369 		    enc_pack(aTHX_ result,size,endian,lo);
370 		}
371 	    }
372 	    else {
373 		/* not supposed to happen */
374 		enc_pack(aTHX_ result,size,endian,FBCHAR);
375 	    }
376 	}
377 	else {
378 	    enc_pack(aTHX_ result,size,endian,ord);
379 	}
380     }
381     if (s < e) {
382 	/* UTF-8 partial char happens often on PerlIO.
383 	   Since this is okay and normal, we do not warn.
384 	   But this is critical when you choose to LEAVE_SRC
385 	   in which case we die */
386 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
387 	    Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
388 		       "when CHECK = 0x%" UVuf,
389 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
390 	}
391     }
392     if (check && !(check & ENCODE_LEAVE_SRC)) {
393 	if (s < e) {
394 	    Move(s,SvPVX(utf8),e-s,U8);
395 	    SvCUR_set(utf8,(e-s));
396 	}
397 	else {
398 	    SvCUR_set(utf8,0);
399 	}
400 	*SvEND(utf8) = '\0';
401     }
402 
403     if (!temp_result)
404 	shrink_buffer(result);
405 
406     SvSETMAGIC(utf8);
407 
408     XSRETURN(1);
409 }
410