xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Unicode/Unicode.xs (revision ae3cb403620ab940fbaabb3055fac045a63d56b7)
1 /*
2  $Id: Unicode.xs,v 2.14 2016/01/22 06:33:07 dankogai Exp $
3  */
4 
5 #define PERL_NO_GET_CONTEXT
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 #include "../Encode/encode.h"
10 
11 #define FBCHAR			0xFFFd
12 #define BOM_BE			0xFeFF
13 #define BOM16LE			0xFFFe
14 #define BOM32LE			0xFFFe0000
15 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
16 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
17 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
18 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
19 
20 /* For pre-5.14 source compatibility */
21 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
22 #   define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
23 #   define UTF8_DISALLOW_SURROGATE 0
24 #   define UTF8_WARN_SURROGATE 0
25 #   define UTF8_DISALLOW_FE_FF 0
26 #   define UTF8_WARN_FE_FF 0
27 #   define UTF8_WARN_NONCHAR 0
28 #endif
29 
30 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
31 
32 /* Avoid wasting too much space in the result buffer */
33 /* static void */
34 /* shrink_buffer(SV *result) */
35 /* { */
36 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
37 /* 	char *buf; */
38 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
39 /* 	New(0, buf, len, char); */
40 /* 	Copy(SvPVX(result), buf, len, char); */
41 /* 	Safefree(SvPVX(result)); */
42 /* 	SvPV_set(result, buf); */
43 /* 	SvLEN_set(result, len); */
44 /*     } */
45 /* } */
46 
47 #define shrink_buffer(result) { \
48     if (SvLEN(result) > 42 + SvCUR(result)) { \
49 	char *newpv; \
50 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
51 	New(0, newpv, newlen, char); \
52 	Copy(SvPVX(result), newpv, newlen, char); \
53 	Safefree(SvPVX(result)); \
54 	SvPV_set(result, newpv); \
55 	SvLEN_set(result, newlen); \
56     } \
57 }
58 
59 static UV
60 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
61 {
62     U8 *s = *sp;
63     UV v = 0;
64     if (s+size > e) {
65 	croak("Partial character %c",(char) endian);
66     }
67     switch(endian) {
68     case 'N':
69 	v = *s++;
70 	v = (v << 8) | *s++;
71     case 'n':
72 	v = (v << 8) | *s++;
73 	v = (v << 8) | *s++;
74 	break;
75     case 'V':
76     case 'v':
77 	v |= *s++;
78 	v |= (*s++ << 8);
79 	if (endian == 'v')
80 	    break;
81 	v |= (*s++ << 16);
82 	v |= ((UV)*s++ << 24);
83 	break;
84     default:
85 	croak("Unknown endian %c",(char) endian);
86 	break;
87     }
88     *sp = s;
89     return v;
90 }
91 
92 static void
93 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
94 {
95     U8 *d = (U8 *) SvPV_nolen(result);
96 
97     switch(endian) {
98     case 'v':
99     case 'V':
100 	d += SvCUR(result);
101 	SvCUR_set(result,SvCUR(result)+size);
102 	while (size--) {
103 	    *d++ = (U8)(value & 0xFF);
104 	    value >>= 8;
105 	}
106 	break;
107     case 'n':
108     case 'N':
109 	SvCUR_set(result,SvCUR(result)+size);
110 	d += SvCUR(result);
111 	while (size--) {
112 	    *--d = (U8)(value & 0xFF);
113 	    value >>= 8;
114 	}
115 	break;
116     default:
117 	croak("Unknown endian %c",(char) endian);
118 	break;
119     }
120 }
121 
122 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
123 
124 PROTOTYPES: DISABLE
125 
126 #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
127     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
128 #define attr_true(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
129     SvTRUE(*hv_fetch((HV *)SvRV(obj),k,l,0)) : FALSE)
130 
131 void
132 decode_xs(obj, str, check = 0)
133 SV *	obj
134 SV *	str
135 IV	check
136 CODE:
137 {
138     U8 endian    = *((U8 *)SvPV_nolen(attr("endian", 6)));
139     int size     = SvIV(attr("size", 4));
140     int ucs2     = -1; /* only needed in the event of surrogate pairs */
141     SV *result   = newSVpvn("",0);
142     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
143     STRLEN ulen;
144     STRLEN resultbuflen;
145     U8 *resultbuf;
146     U8 *s = (U8 *)SvPVbyte(str,ulen);
147     U8 *e = (U8 *)SvEND(str);
148     /* Optimise for the common case of being called from PerlIOEncode_fill()
149        with a standard length buffer. In this case the result SV's buffer is
150        only used temporarily, so we can afford to allocate the maximum needed
151        and not care about unused space. */
152     const bool temp_result = (ulen == PERLIO_BUFSIZ);
153 
154     ST(0) = sv_2mortal(result);
155     SvUTF8_on(result);
156 
157     if (!endian && s+size <= e) {
158 	UV bom;
159 	endian = (size == 4) ? 'N' : 'n';
160 	bom = enc_unpack(aTHX_ &s,e,size,endian);
161 	if (bom != BOM_BE) {
162 	    if (bom == BOM16LE) {
163 		endian = 'v';
164 	    }
165 	    else if (bom == BOM32LE) {
166 		endian = 'V';
167 	    }
168 	    else {
169                /* No BOM found, use big-endian fallback as specified in
170                 * RFC2781 and the Unicode Standard version 8.0:
171                 *
172                 *  The UTF-16 encoding scheme may or may not begin with
173                 *  a BOM. However, when there is no BOM, and in the
174                 *  absence of a higher-level protocol, the byte order
175                 *  of the UTF-16 encoding scheme is big-endian.
176                 *
177                 *  If the first two octets of the text is not 0xFE
178                 *  followed by 0xFF, and is not 0xFF followed by 0xFE,
179                 *  then the text SHOULD be interpreted as big-endian.
180                 */
181                 s -= size;
182 	    }
183 	}
184 #if 1
185 	/* Update endian for next sequence */
186 	if (attr_true("renewed", 7)) {
187 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
188 	}
189 #endif
190     }
191 
192     if (temp_result) {
193 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
194     } else {
195 	/* Preallocate the buffer to the minimum possible space required. */
196 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
197     }
198     resultbuf = (U8 *) SvGROW(result, resultbuflen);
199 
200     while (s < e && s+size <= e) {
201 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
202 	U8 *d;
203 	if (issurrogate(ord)) {
204 	    if (ucs2 == -1) {
205 		ucs2 = attr_true("ucs2", 4);
206 	    }
207 	    if (ucs2 || size == 4) {
208 		if (check) {
209 		    croak("%"SVf":no surrogates allowed %"UVxf,
210 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
211 			  ord);
212 		}
213 		ord = FBCHAR;
214 	    }
215 	    else {
216 		UV lo;
217 		if (!isHiSurrogate(ord)) {
218 		    if (check) {
219 			croak("%"SVf":Malformed HI surrogate %"UVxf,
220 			      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
221 			      ord);
222 		    }
223 		    else {
224 			ord = FBCHAR;
225 		    }
226 		}
227 		else if (s+size > e) {
228 		    if (check) {
229 		        if (check & ENCODE_STOP_AT_PARTIAL) {
230 		             s -= size;
231 		             break;
232 		        }
233 		        else {
234 		             croak("%"SVf":Malformed HI surrogate %"UVxf,
235 				   *hv_fetch((HV *)SvRV(obj),"Name",4,0),
236 				   ord);
237 		        }
238 		    }
239 		    else {
240 		        ord = FBCHAR;
241 		    }
242 		}
243 		else {
244 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
245 		    if (!isLoSurrogate(lo)) {
246 			if (check) {
247 			    croak("%"SVf":Malformed LO surrogate %"UVxf,
248 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
249 				  ord);
250 			}
251 			else {
252 			    s -= size;
253 			    ord = FBCHAR;
254 			}
255 		    }
256 		    else {
257 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
258 		    }
259 		}
260 	    }
261 	}
262 
263 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
264 	    if (check) {
265 		croak("%"SVf":Unicode character %"UVxf" is illegal",
266 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
267 		      ord);
268 	    } else {
269 		ord = FBCHAR;
270 	    }
271 	}
272 
273 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
274 	    /* Do not allocate >8Mb more than the minimum needed.
275 	       This prevents allocating too much in the rogue case of a large
276 	       input consisting initially of long sequence uft8-byte unicode
277 	       chars followed by single utf8-byte chars. */
278             /* +1
279                fixes  Unicode.xs!decode_xs n-byte heap-overflow
280               */
281 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
282 	    STRLEN max_alloc = remaining + (8*1024*1024);
283 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
284 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
285 		(est_alloc > max_alloc ? max_alloc : est_alloc);
286 	    resultbuf = (U8 *) SvGROW(result, newlen);
287 	    resultbuflen = SvLEN(result);
288 	}
289 
290 	d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
291                                             UNICODE_WARN_ILLEGAL_INTERCHANGE);
292 	SvCUR_set(result, d - (U8 *)SvPVX(result));
293     }
294 
295     if (s < e) {
296 	/* unlikely to happen because it's fixed-length -- dankogai */
297 	if (check & ENCODE_WARN_ON_ERR) {
298 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
299 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
300 	}
301     }
302     if (check && !(check & ENCODE_LEAVE_SRC)) {
303 	if (s < e) {
304 	    Move(s,SvPVX(str),e-s,U8);
305 	    SvCUR_set(str,(e-s));
306 	}
307 	else {
308 	    SvCUR_set(str,0);
309 	}
310 	*SvEND(str) = '\0';
311     }
312 
313     if (!temp_result) shrink_buffer(result);
314     if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
315     XSRETURN(1);
316 }
317 
318 void
319 encode_xs(obj, utf8, check = 0)
320 SV *	obj
321 SV *	utf8
322 IV	check
323 CODE:
324 {
325     U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
326     const int size = SvIV(attr("size", 4));
327     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
328     const STRLEN usize = (size > 0 ? size : 1);
329     SV *result = newSVpvn("", 0);
330     STRLEN ulen;
331     U8 *s = (U8 *) SvPVutf8(utf8, ulen);
332     const U8 *e = (U8 *) SvEND(utf8);
333     /* Optimise for the common case of being called from PerlIOEncode_flush()
334        with a standard length buffer. In this case the result SV's buffer is
335        only used temporarily, so we can afford to allocate the maximum needed
336        and not care about unused space. */
337     const bool temp_result = (ulen == PERLIO_BUFSIZ);
338 
339     ST(0) = sv_2mortal(result);
340 
341     /* Preallocate the result buffer to the maximum possible size.
342        ie. assume each UTF8 byte is 1 character.
343        Then shrink the result's buffer if necesary at the end. */
344     SvGROW(result, ((ulen+1) * usize));
345 
346     if (!endian) {
347 	endian = (size == 4) ? 'N' : 'n';
348 	enc_pack(aTHX_ result,size,endian,BOM_BE);
349 #if 1
350 	/* Update endian for next sequence */
351 	if (attr_true("renewed", 7)) {
352 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
353 	}
354 #endif
355     }
356     while (s < e && s+UTF8SKIP(s) <= e) {
357 	STRLEN len;
358 	UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
359                                                |UTF8_WARN_SURROGATE
360                                                |UTF8_DISALLOW_FE_FF
361                                                |UTF8_WARN_FE_FF
362                                                |UTF8_WARN_NONCHAR));
363 	s += len;
364 	if (size != 4 && invalid_ucs2(ord)) {
365 	    if (!issurrogate(ord)) {
366 		if (ucs2 == -1) {
367 		    ucs2 = attr_true("ucs2", 4);
368 		}
369 		if (ucs2 || ord > 0x10FFFF) {
370 		    if (check) {
371 			croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
372 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
373 		    }
374 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
375 		} else {
376 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
377 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
378 		    enc_pack(aTHX_ result,size,endian,hi);
379 		    enc_pack(aTHX_ result,size,endian,lo);
380 		}
381 	    }
382 	    else {
383 		/* not supposed to happen */
384 		enc_pack(aTHX_ result,size,endian,FBCHAR);
385 	    }
386 	}
387 	else {
388 	    enc_pack(aTHX_ result,size,endian,ord);
389 	}
390     }
391     if (s < e) {
392 	/* UTF-8 partial char happens often on PerlIO.
393 	   Since this is okay and normal, we do not warn.
394 	   But this is critical when you choose to LEAVE_SRC
395 	   in which case we die */
396 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
397 	    Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
398 		       "when CHECK = 0x%" UVuf,
399 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
400 	}
401     }
402     if (check && !(check & ENCODE_LEAVE_SRC)) {
403 	if (s < e) {
404 	    Move(s,SvPVX(utf8),e-s,U8);
405 	    SvCUR_set(utf8,(e-s));
406 	}
407 	else {
408 	    SvCUR_set(utf8,0);
409 	}
410 	*SvEND(utf8) = '\0';
411     }
412 
413     if (!temp_result) shrink_buffer(result);
414     if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
415 
416     SvSETMAGIC(utf8);
417 
418     XSRETURN(1);
419 }
420