xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Unicode/Unicode.xs (revision 2584ca0b0c079044b412124fefd2e9be6e9a2447)
1 /*
2  $Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $
3  */
4 
5 #define PERL_NO_GET_CONTEXT
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 #include "../Encode/encode.h"
10 
11 #define FBCHAR			0xFFFd
12 #define BOM_BE			0xFeFF
13 #define BOM16LE			0xFFFe
14 #define BOM32LE			0xFFFe0000
15 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
16 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
17 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
18 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
19 
20 /* For pre-5.14 source compatibility */
21 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
22 #   define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
23 #   define UTF8_DISALLOW_SURROGATE 0
24 #   define UTF8_WARN_SURROGATE 0
25 #   define UTF8_DISALLOW_FE_FF 0
26 #   define UTF8_WARN_FE_FF 0
27 #   define UTF8_WARN_NONCHAR 0
28 #endif
29 
30 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
31 
32 /* Avoid wasting too much space in the result buffer */
33 /* static void */
34 /* shrink_buffer(SV *result) */
35 /* { */
36 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
37 /* 	char *buf; */
38 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
39 /* 	New(0, buf, len, char); */
40 /* 	Copy(SvPVX(result), buf, len, char); */
41 /* 	Safefree(SvPVX(result)); */
42 /* 	SvPV_set(result, buf); */
43 /* 	SvLEN_set(result, len); */
44 /*     } */
45 /* } */
46 
47 #define shrink_buffer(result) { \
48     if (SvLEN(result) > 42 + SvCUR(result)) { \
49 	char *newpv; \
50 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
51 	New(0, newpv, newlen, char); \
52 	Copy(SvPVX(result), newpv, newlen, char); \
53 	Safefree(SvPVX(result)); \
54 	SvPV_set(result, newpv); \
55 	SvLEN_set(result, newlen); \
56     } \
57 }
58 
59 static UV
60 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
61 {
62     U8 *s = *sp;
63     UV v = 0;
64     if (s+size > e) {
65 	croak("Partial character %c",(char) endian);
66     }
67     switch(endian) {
68     case 'N':
69 	v = *s++;
70 	v = (v << 8) | *s++;
71     case 'n':
72 	v = (v << 8) | *s++;
73 	v = (v << 8) | *s++;
74 	break;
75     case 'V':
76     case 'v':
77 	v |= *s++;
78 	v |= (*s++ << 8);
79 	if (endian == 'v')
80 	    break;
81 	v |= (*s++ << 16);
82 	v |= ((UV)*s++ << 24);
83 	break;
84     default:
85 	croak("Unknown endian %c",(char) endian);
86 	break;
87     }
88     *sp = s;
89     return v;
90 }
91 
92 static void
93 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
94 {
95     U8 *d = (U8 *) SvPV_nolen(result);
96 
97     switch(endian) {
98     case 'v':
99     case 'V':
100 	d += SvCUR(result);
101 	SvCUR_set(result,SvCUR(result)+size);
102 	while (size--) {
103 	    *d++ = (U8)(value & 0xFF);
104 	    value >>= 8;
105 	}
106 	break;
107     case 'n':
108     case 'N':
109 	SvCUR_set(result,SvCUR(result)+size);
110 	d += SvCUR(result);
111 	while (size--) {
112 	    *--d = (U8)(value & 0xFF);
113 	    value >>= 8;
114 	}
115 	break;
116     default:
117 	croak("Unknown endian %c",(char) endian);
118 	break;
119     }
120 }
121 
122 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
123 
124 PROTOTYPES: DISABLE
125 
126 #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
127     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
128 
129 void
130 decode(obj, str, check = 0)
131 SV *	obj
132 SV *	str
133 IV	check
134 CODE:
135 {
136     SV *sve      = attr("endian", 6);
137     U8 endian    = *((U8 *)SvPV_nolen(sve));
138     SV *svs      = attr("size", 4);
139     int size     = SvIV(svs);
140     int ucs2     = -1; /* only needed in the event of surrogate pairs */
141     SV *result   = newSVpvn("",0);
142     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
143     STRLEN ulen;
144     STRLEN resultbuflen;
145     U8 *resultbuf;
146     U8 *s;
147     U8 *e;
148     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
149     bool temp_result;
150 
151     SvGETMAGIC(str);
152     if (!SvOK(str))
153         XSRETURN_UNDEF;
154     s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen);
155     if (SvUTF8(str)) {
156         if (!modify) {
157             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
158             SvUTF8_on(tmp);
159             if (SvTAINTED(str))
160                 SvTAINTED_on(tmp);
161             str = tmp;
162             s = (U8 *)SvPVX(str);
163         }
164         if (ulen) {
165             if (!utf8_to_bytes(s, &ulen))
166                 croak("Wide character");
167             SvCUR_set(str, ulen);
168         }
169         SvUTF8_off(str);
170     }
171     e = s+ulen;
172 
173     /* Optimise for the common case of being called from PerlIOEncode_fill()
174        with a standard length buffer. In this case the result SV's buffer is
175        only used temporarily, so we can afford to allocate the maximum needed
176        and not care about unused space. */
177     temp_result = (ulen == PERLIO_BUFSIZ);
178 
179     ST(0) = sv_2mortal(result);
180     SvUTF8_on(result);
181 
182     if (!endian && s+size <= e) {
183 	SV *sv;
184 	UV bom;
185 	endian = (size == 4) ? 'N' : 'n';
186 	bom = enc_unpack(aTHX_ &s,e,size,endian);
187 	if (bom != BOM_BE) {
188 	    if (bom == BOM16LE) {
189 		endian = 'v';
190 	    }
191 	    else if (bom == BOM32LE) {
192 		endian = 'V';
193 	    }
194 	    else {
195                /* No BOM found, use big-endian fallback as specified in
196                 * RFC2781 and the Unicode Standard version 8.0:
197                 *
198                 *  The UTF-16 encoding scheme may or may not begin with
199                 *  a BOM. However, when there is no BOM, and in the
200                 *  absence of a higher-level protocol, the byte order
201                 *  of the UTF-16 encoding scheme is big-endian.
202                 *
203                 *  If the first two octets of the text is not 0xFE
204                 *  followed by 0xFF, and is not 0xFF followed by 0xFE,
205                 *  then the text SHOULD be interpreted as big-endian.
206                 */
207                 s -= size;
208 	    }
209 	}
210 #if 1
211 	/* Update endian for next sequence */
212 	sv = attr("renewed", 7);
213 	if (SvTRUE(sv)) {
214 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
215 	}
216 #endif
217     }
218 
219     if (temp_result) {
220 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
221     } else {
222 	/* Preallocate the buffer to the minimum possible space required. */
223 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
224     }
225     resultbuf = (U8 *) SvGROW(result, resultbuflen);
226 
227     while (s < e && s+size <= e) {
228 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
229 	U8 *d;
230 	if (issurrogate(ord)) {
231 	    if (ucs2 == -1) {
232 		SV *sv = attr("ucs2", 4);
233 		ucs2 = SvTRUE(sv);
234 	    }
235 	    if (ucs2 || size == 4) {
236 		if (check) {
237 		    croak("%" SVf ":no surrogates allowed %" UVxf,
238 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
239 			  ord);
240 		}
241 		ord = FBCHAR;
242 	    }
243 	    else {
244 		UV lo;
245 		if (!isHiSurrogate(ord)) {
246 		    if (check) {
247 			croak("%" SVf ":Malformed HI surrogate %" UVxf,
248 			      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
249 			      ord);
250 		    }
251 		    else {
252 			ord = FBCHAR;
253 		    }
254 		}
255 		else if (s+size > e) {
256 		    if (check) {
257 		        if (check & ENCODE_STOP_AT_PARTIAL) {
258 		             s -= size;
259 		             break;
260 		        }
261 		        else {
262 		             croak("%" SVf ":Malformed HI surrogate %" UVxf,
263 				   *hv_fetch((HV *)SvRV(obj),"Name",4,0),
264 				   ord);
265 		        }
266 		    }
267 		    else {
268 		        ord = FBCHAR;
269 		    }
270 		}
271 		else {
272 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
273 		    if (!isLoSurrogate(lo)) {
274 			if (check) {
275 			    croak("%" SVf ":Malformed LO surrogate %" UVxf,
276 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
277 				  ord);
278 			}
279 			else {
280 			    s -= size;
281 			    ord = FBCHAR;
282 			}
283 		    }
284 		    else {
285 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
286 		    }
287 		}
288 	    }
289 	}
290 
291 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
292 	    if (check) {
293 		croak("%" SVf ":Unicode character %" UVxf " is illegal",
294 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
295 		      ord);
296 	    } else {
297 		ord = FBCHAR;
298 	    }
299 	}
300 
301 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
302 	    /* Do not allocate >8Mb more than the minimum needed.
303 	       This prevents allocating too much in the rogue case of a large
304 	       input consisting initially of long sequence uft8-byte unicode
305 	       chars followed by single utf8-byte chars. */
306             /* +1
307                fixes  Unicode.xs!decode_xs n-byte heap-overflow
308               */
309 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
310 	    STRLEN max_alloc = remaining + (8*1024*1024);
311 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
312 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
313 		(est_alloc > max_alloc ? max_alloc : est_alloc);
314 	    resultbuf = (U8 *) SvGROW(result, newlen);
315 	    resultbuflen = SvLEN(result);
316 	}
317 
318 	d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord,
319                                             UNICODE_WARN_ILLEGAL_INTERCHANGE);
320 	SvCUR_set(result, d - (U8 *)SvPVX(result));
321     }
322 
323     if (s < e) {
324 	/* unlikely to happen because it's fixed-length -- dankogai */
325 	if (check & ENCODE_WARN_ON_ERR) {
326 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%" SVf ":Partial character",
327 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
328 	}
329     }
330     if (check && !(check & ENCODE_LEAVE_SRC)) {
331 	if (s < e) {
332 	    Move(s,SvPVX(str),e-s,U8);
333 	    SvCUR_set(str,(e-s));
334 	}
335 	else {
336 	    SvCUR_set(str,0);
337 	}
338 	*SvEND(str) = '\0';
339 	SvSETMAGIC(str);
340     }
341 
342     if (!temp_result) shrink_buffer(result);
343     if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
344     XSRETURN(1);
345 }
346 
347 void
348 encode(obj, utf8, check = 0)
349 SV *	obj
350 SV *	utf8
351 IV	check
352 CODE:
353 {
354     SV *sve = attr("endian", 6);
355     U8 endian = *((U8 *)SvPV_nolen(sve));
356     SV *svs = attr("size", 4);
357     const int size = SvIV(svs);
358     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
359     const STRLEN usize = (size > 0 ? size : 1);
360     SV *result = newSVpvn("", 0);
361     STRLEN ulen;
362     U8 *s;
363     U8 *e;
364     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
365     bool temp_result;
366 
367     SvGETMAGIC(utf8);
368     if (!SvOK(utf8))
369         XSRETURN_UNDEF;
370     s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen);
371     if (!SvUTF8(utf8)) {
372         if (!modify) {
373             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
374             if (SvTAINTED(utf8))
375                 SvTAINTED_on(tmp);
376             utf8 = tmp;
377         }
378         sv_utf8_upgrade_nomg(utf8);
379         s = (U8 *)SvPV_nomg(utf8, ulen);
380     }
381     e = s+ulen;
382 
383     /* Optimise for the common case of being called from PerlIOEncode_flush()
384        with a standard length buffer. In this case the result SV's buffer is
385        only used temporarily, so we can afford to allocate the maximum needed
386        and not care about unused space. */
387     temp_result = (ulen == PERLIO_BUFSIZ);
388 
389     ST(0) = sv_2mortal(result);
390 
391     /* Preallocate the result buffer to the maximum possible size.
392        ie. assume each UTF8 byte is 1 character.
393        Then shrink the result's buffer if necesary at the end. */
394     SvGROW(result, ((ulen+1) * usize));
395 
396     if (!endian) {
397 	SV *sv;
398 	endian = (size == 4) ? 'N' : 'n';
399 	enc_pack(aTHX_ result,size,endian,BOM_BE);
400 #if 1
401 	/* Update endian for next sequence */
402 	sv = attr("renewed", 7);
403 	if (SvTRUE(sv)) {
404 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
405 	}
406 #endif
407     }
408     while (s < e && s+UTF8SKIP(s) <= e) {
409 	STRLEN len;
410 	UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
411                                                |UTF8_WARN_SURROGATE
412                                                |UTF8_DISALLOW_FE_FF
413                                                |UTF8_WARN_FE_FF
414                                                |UTF8_WARN_NONCHAR));
415 	s += len;
416 	if (size != 4 && invalid_ucs2(ord)) {
417 	    if (!issurrogate(ord)) {
418 		if (ucs2 == -1) {
419 		    SV *sv = attr("ucs2", 4);
420 		    ucs2 = SvTRUE(sv);
421 		}
422 		if (ucs2 || ord > 0x10FFFF) {
423 		    if (check) {
424 			croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
425 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
426 		    }
427 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
428 		} else {
429 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
430 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
431 		    enc_pack(aTHX_ result,size,endian,hi);
432 		    enc_pack(aTHX_ result,size,endian,lo);
433 		}
434 	    }
435 	    else {
436 		/* not supposed to happen */
437 		enc_pack(aTHX_ result,size,endian,FBCHAR);
438 	    }
439 	}
440 	else {
441 	    enc_pack(aTHX_ result,size,endian,ord);
442 	}
443     }
444     if (s < e) {
445 	/* UTF-8 partial char happens often on PerlIO.
446 	   Since this is okay and normal, we do not warn.
447 	   But this is critical when you choose to LEAVE_SRC
448 	   in which case we die */
449 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
450 	    Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
451 		       "when CHECK = 0x%" UVuf,
452 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
453 	}
454     }
455     if (check && !(check & ENCODE_LEAVE_SRC)) {
456 	if (s < e) {
457 	    Move(s,SvPVX(utf8),e-s,U8);
458 	    SvCUR_set(utf8,(e-s));
459 	}
460 	else {
461 	    SvCUR_set(utf8,0);
462 	}
463 	*SvEND(utf8) = '\0';
464 	SvSETMAGIC(utf8);
465     }
466 
467     if (!temp_result) shrink_buffer(result);
468     if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
469 
470     XSRETURN(1);
471 }
472