xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Unicode/Unicode.xs (revision 46035553bfdd96e63c94e32da0210227ec2e3cf1)
1 /*
2  $Id: Unicode.xs,v 2.19 2019/01/21 03:09:59 dankogai Exp $
3  */
4 
5 #define IN_UNICODE_XS
6 
7 #define PERL_NO_GET_CONTEXT
8 #include "EXTERN.h"
9 #include "perl.h"
10 #include "XSUB.h"
11 #include "../Encode/encode.h"
12 
13 #define FBCHAR			0xFFFd
14 #define BOM_BE			0xFeFF
15 #define BOM16LE			0xFFFe
16 #define BOM32LE			0xFFFe0000
17 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
18 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
19 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
20 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
21 
22 #ifndef SVfARG
23 #define SVfARG(p) ((void*)(p))
24 #endif
25 
26 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
27 
28 /* Avoid wasting too much space in the result buffer */
29 /* static void */
30 /* shrink_buffer(SV *result) */
31 /* { */
32 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
33 /* 	char *buf; */
34 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
35 /* 	New(0, buf, len, char); */
36 /* 	Copy(SvPVX(result), buf, len, char); */
37 /* 	Safefree(SvPVX(result)); */
38 /* 	SvPV_set(result, buf); */
39 /* 	SvLEN_set(result, len); */
40 /*     } */
41 /* } */
42 
43 #define shrink_buffer(result) { \
44     if (SvLEN(result) > 42 + SvCUR(result)) { \
45 	char *newpv; \
46 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
47 	New(0, newpv, newlen, char); \
48 	Copy(SvPVX(result), newpv, newlen, char); \
49 	Safefree(SvPVX(result)); \
50 	SvPV_set(result, newpv); \
51 	SvLEN_set(result, newlen); \
52     } \
53 }
54 
55 static UV
56 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
57 {
58     U8 *s = *sp;
59     UV v = 0;
60     if (s+size > e) {
61 	croak("Partial character %c",(char) endian);
62     }
63     switch(endian) {
64     case 'N':
65 	v = *s++;
66 	v = (v << 8) | *s++;
67         /* FALLTHROUGH */
68     case 'n':
69 	v = (v << 8) | *s++;
70 	v = (v << 8) | *s++;
71 	break;
72     case 'V':
73     case 'v':
74 	v |= *s++;
75 	v |= (*s++ << 8);
76 	if (endian == 'v')
77 	    break;
78 	v |= (*s++ << 16);
79 	v |= ((UV)*s++ << 24);
80 	break;
81     default:
82 	croak("Unknown endian %c",(char) endian);
83 	break;
84     }
85     *sp = s;
86     return v;
87 }
88 
89 static void
90 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
91 {
92     U8 *d = (U8 *) SvPV_nolen(result);
93 
94     switch(endian) {
95     case 'v':
96     case 'V':
97 	d += SvCUR(result);
98 	SvCUR_set(result,SvCUR(result)+size);
99 	while (size--) {
100 	    *d++ = (U8)(value & 0xFF);
101 	    value >>= 8;
102 	}
103 	break;
104     case 'n':
105     case 'N':
106 	SvCUR_set(result,SvCUR(result)+size);
107 	d += SvCUR(result);
108 	while (size--) {
109 	    *--d = (U8)(value & 0xFF);
110 	    value >>= 8;
111 	}
112 	break;
113     default:
114 	croak("Unknown endian %c",(char) endian);
115 	break;
116     }
117 }
118 
119 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
120 
121 PROTOTYPES: DISABLE
122 
123 #define attr(k)  (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \
124     *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef)
125 
126 void
127 decode(obj, str, check = 0)
128 SV *	obj
129 SV *	str
130 IV	check
131 CODE:
132 {
133     SV *name     = attr("Name");
134     SV *sve      = attr("endian");
135     U8 endian    = *((U8 *)SvPV_nolen(sve));
136     SV *svs      = attr("size");
137     int size     = SvIV(svs);
138     int ucs2     = -1; /* only needed in the event of surrogate pairs */
139     SV *result   = newSVpvn("",0);
140     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
141     STRLEN ulen;
142     STRLEN resultbuflen;
143     U8 *resultbuf;
144     U8 *s;
145     U8 *e;
146     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
147     bool temp_result;
148 
149     SvGETMAGIC(str);
150     if (!SvOK(str))
151         XSRETURN_UNDEF;
152     s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen);
153     if (SvUTF8(str)) {
154         if (!modify) {
155             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
156             SvUTF8_on(tmp);
157             if (SvTAINTED(str))
158                 SvTAINTED_on(tmp);
159             str = tmp;
160             s = (U8 *)SvPVX(str);
161         }
162         if (ulen) {
163             if (!utf8_to_bytes(s, &ulen))
164                 croak("Wide character");
165             SvCUR_set(str, ulen);
166         }
167         SvUTF8_off(str);
168     }
169     e = s+ulen;
170 
171     /* Optimise for the common case of being called from PerlIOEncode_fill()
172        with a standard length buffer. In this case the result SV's buffer is
173        only used temporarily, so we can afford to allocate the maximum needed
174        and not care about unused space. */
175     temp_result = (ulen == PERLIO_BUFSIZ);
176 
177     ST(0) = sv_2mortal(result);
178     SvUTF8_on(result);
179 
180     if (!endian && s+size <= e) {
181 	SV *sv;
182 	UV bom;
183 	endian = (size == 4) ? 'N' : 'n';
184 	bom = enc_unpack(aTHX_ &s,e,size,endian);
185 	if (bom != BOM_BE) {
186 	    if (bom == BOM16LE) {
187 		endian = 'v';
188 	    }
189 	    else if (bom == BOM32LE) {
190 		endian = 'V';
191 	    }
192 	    else {
193                /* No BOM found, use big-endian fallback as specified in
194                 * RFC2781 and the Unicode Standard version 8.0:
195                 *
196                 *  The UTF-16 encoding scheme may or may not begin with
197                 *  a BOM. However, when there is no BOM, and in the
198                 *  absence of a higher-level protocol, the byte order
199                 *  of the UTF-16 encoding scheme is big-endian.
200                 *
201                 *  If the first two octets of the text is not 0xFE
202                 *  followed by 0xFF, and is not 0xFF followed by 0xFE,
203                 *  then the text SHOULD be interpreted as big-endian.
204                 */
205                 s -= size;
206 	    }
207 	}
208 #if 1
209 	/* Update endian for next sequence */
210 	sv = attr("renewed");
211 	if (SvTRUE(sv)) {
212 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
213 	}
214 #endif
215     }
216 
217     if (temp_result) {
218 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
219     } else {
220 	/* Preallocate the buffer to the minimum possible space required. */
221 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
222     }
223     resultbuf = (U8 *) SvGROW(result, resultbuflen);
224 
225     while (s < e && s+size <= e) {
226 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
227 	U8 *d;
228 	HV *hv = NULL;
229 	if (issurrogate(ord)) {
230 	    if (ucs2 == -1) {
231 		SV *sv = attr("ucs2");
232 		ucs2 = SvTRUE(sv);
233 	    }
234 	    if (ucs2 || size == 4) {
235 		if (check & ENCODE_DIE_ON_ERR) {
236 		    croak("%" SVf ":no surrogates allowed %" UVxf,
237 			  SVfARG(name), ord);
238 		}
239 		if (encode_ckWARN(check, WARN_SURROGATE)) {
240 		    warner(packWARN(WARN_SURROGATE),
241 			  "%" SVf ":no surrogates allowed %" UVxf,
242 			  SVfARG(name), ord);
243 		}
244 		ord = FBCHAR;
245 	    }
246 	    else {
247 		UV lo;
248 		if (!isHiSurrogate(ord)) {
249 		    if (check & ENCODE_DIE_ON_ERR) {
250 			croak("%" SVf ":Malformed HI surrogate %" UVxf,
251 			      SVfARG(name), ord);
252 		    }
253 		    if (encode_ckWARN(check, WARN_SURROGATE)) {
254 			warner(packWARN(WARN_SURROGATE),
255 			      "%" SVf ":Malformed HI surrogate %" UVxf,
256 			      SVfARG(name), ord);
257 		    }
258 		    ord = FBCHAR;
259 		}
260 		else if (s+size > e) {
261 		    if (check & ENCODE_STOP_AT_PARTIAL) {
262 		        s -= size;
263 		        break;
264 		    }
265 		    if (check & ENCODE_DIE_ON_ERR) {
266 			croak("%" SVf ":Malformed HI surrogate %" UVxf,
267 			      SVfARG(name), ord);
268 		    }
269 		    if (encode_ckWARN(check, WARN_SURROGATE)) {
270 			warner(packWARN(WARN_SURROGATE),
271 			      "%" SVf ":Malformed HI surrogate %" UVxf,
272 			      SVfARG(name), ord);
273 		    }
274 		    ord = FBCHAR;
275 		}
276 		else {
277 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
278 		    if (!isLoSurrogate(lo)) {
279 			if (check & ENCODE_DIE_ON_ERR) {
280 			    croak("%" SVf ":Malformed LO surrogate %" UVxf,
281 				  SVfARG(name), ord);
282 			}
283 			if (encode_ckWARN(check, WARN_SURROGATE)) {
284 			    warner(packWARN(WARN_SURROGATE),
285 				  "%" SVf ":Malformed LO surrogate %" UVxf,
286 				  SVfARG(name), ord);
287 			}
288 			s -= size;
289 			ord = FBCHAR;
290 		    }
291 		    else {
292 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
293 		    }
294 		}
295 	    }
296 	}
297 
298 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
299 	    if (check & ENCODE_DIE_ON_ERR) {
300 		croak("%" SVf ":Unicode character %" UVxf " is illegal",
301 		      SVfARG(name), ord);
302 	    }
303 	    if (encode_ckWARN(check, WARN_NONCHAR)) {
304 	        warner(packWARN(WARN_NONCHAR),
305 		      "%" SVf ":Unicode character %" UVxf " is illegal",
306 		      SVfARG(name), ord);
307 	    }
308 	    ord = FBCHAR;
309 	}
310 
311 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
312 	    /* Do not allocate >8Mb more than the minimum needed.
313 	       This prevents allocating too much in the rogue case of a large
314 	       input consisting initially of long sequence uft8-byte unicode
315 	       chars followed by single utf8-byte chars. */
316             /* +1
317                fixes  Unicode.xs!decode_xs n-byte heap-overflow
318               */
319 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
320 	    STRLEN max_alloc = remaining + (8*1024*1024);
321 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
322 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
323 		(est_alloc > max_alloc ? max_alloc : est_alloc);
324 	    resultbuf = (U8 *) SvGROW(result, newlen);
325 	    resultbuflen = SvLEN(result);
326 	}
327 
328         d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv);
329         if (hv) {
330             SV *message = *hv_fetch(hv, "text", 4, 0);
331             U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
332             sv_2mortal((SV *)hv);
333             if (check & ENCODE_DIE_ON_ERR)
334                 croak("%" SVf, SVfARG(message));
335             if (encode_ckWARN_packed(check, categories))
336                 warner(categories, "%" SVf, SVfARG(message));
337             d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0);
338         }
339 
340 	SvCUR_set(result, d - (U8 *)SvPVX(result));
341     }
342 
343     if (s < e) {
344 	/* unlikely to happen because it's fixed-length -- dankogai */
345         if (check & ENCODE_DIE_ON_ERR)
346             croak("%" SVf ":Partial character", SVfARG(name));
347         if (encode_ckWARN(check, WARN_UTF8)) {
348             warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name));
349 	}
350     }
351     if (check && !(check & ENCODE_LEAVE_SRC)) {
352 	if (s < e) {
353 	    Move(s,SvPVX(str),e-s,U8);
354 	    SvCUR_set(str,(e-s));
355 	}
356 	else {
357 	    SvCUR_set(str,0);
358 	}
359 	*SvEND(str) = '\0';
360 	SvSETMAGIC(str);
361     }
362 
363     if (!temp_result) shrink_buffer(result);
364     if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
365     XSRETURN(1);
366 }
367 
368 void
369 encode(obj, utf8, check = 0)
370 SV *	obj
371 SV *	utf8
372 IV	check
373 CODE:
374 {
375     SV *name = attr("Name");
376     SV *sve = attr("endian");
377     U8 endian = *((U8 *)SvPV_nolen(sve));
378     SV *svs = attr("size");
379     const int size = SvIV(svs);
380     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
381     const STRLEN usize = (size > 0 ? size : 1);
382     SV *result = newSVpvn("", 0);
383     STRLEN ulen;
384     U8 *s;
385     U8 *e;
386     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
387     bool temp_result;
388 
389     SvGETMAGIC(utf8);
390     if (!SvOK(utf8))
391         XSRETURN_UNDEF;
392     s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen);
393     if (!SvUTF8(utf8)) {
394         if (!modify) {
395             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
396             if (SvTAINTED(utf8))
397                 SvTAINTED_on(tmp);
398             utf8 = tmp;
399         }
400         sv_utf8_upgrade_nomg(utf8);
401         s = (U8 *)SvPV_nomg(utf8, ulen);
402     }
403     e = s+ulen;
404 
405     /* Optimise for the common case of being called from PerlIOEncode_flush()
406        with a standard length buffer. In this case the result SV's buffer is
407        only used temporarily, so we can afford to allocate the maximum needed
408        and not care about unused space. */
409     temp_result = (ulen == PERLIO_BUFSIZ);
410 
411     ST(0) = sv_2mortal(result);
412 
413     /* Preallocate the result buffer to the maximum possible size.
414        ie. assume each UTF8 byte is 1 character.
415        Then shrink the result's buffer if necesary at the end. */
416     SvGROW(result, ((ulen+1) * usize));
417 
418     if (!endian) {
419 	SV *sv;
420 	endian = (size == 4) ? 'N' : 'n';
421 	enc_pack(aTHX_ result,size,endian,BOM_BE);
422 #if 1
423 	/* Update endian for next sequence */
424 	sv = attr("renewed");
425 	if (SvTRUE(sv)) {
426 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
427 	}
428 #endif
429     }
430     while (s < e && s+UTF8SKIP(s) <= e) {
431         STRLEN len;
432         AV *msgs = NULL;
433         UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs);
434         if (msgs) {
435             SSize_t i;
436             SSize_t len = av_len(msgs)+1;
437             sv_2mortal((SV *)msgs);
438             for (i = 0; i < len; ++i) {
439                 SV *sv = *av_fetch(msgs, i, 0);
440                 HV *hv = (HV *)SvRV(sv);
441                 SV *message = *hv_fetch(hv, "text", 4, 0);
442                 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
443                 if (check & ENCODE_DIE_ON_ERR)
444                     croak("%" SVf, SVfARG(message));
445                 if (encode_ckWARN_packed(check, categories))
446                     warner(categories, "%" SVf, SVfARG(message));
447             }
448         }
449 	if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) {
450 	    if (!issurrogate(ord)) {
451 		if (ucs2 == -1) {
452 		    SV *sv = attr("ucs2");
453 		    ucs2 = SvTRUE(sv);
454 		}
455 		if (ucs2 || ord > 0x10FFFF) {
456 		    if (check & ENCODE_DIE_ON_ERR) {
457 			croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
458 				  SVfARG(name),ord);
459 		    }
460 		    if (encode_ckWARN(check, WARN_NON_UNICODE)) {
461 			warner(packWARN(WARN_NON_UNICODE),
462 				  "%" SVf ":code point \"\\x{%" UVxf "}\" too high",
463 				  SVfARG(name),ord);
464 		    }
465 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
466 		} else if (ord == 0) {
467 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
468 		} else {
469 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
470 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
471 		    enc_pack(aTHX_ result,size,endian,hi);
472 		    enc_pack(aTHX_ result,size,endian,lo);
473 		}
474 	    }
475 	    else {
476 		/* not supposed to happen */
477 		enc_pack(aTHX_ result,size,endian,FBCHAR);
478 	    }
479 	}
480 	else {
481 	    enc_pack(aTHX_ result,size,endian,ord);
482 	}
483 	s += len;
484     }
485     if (s < e) {
486 	/* UTF-8 partial char happens often on PerlIO.
487 	   Since this is okay and normal, we do not warn.
488 	   But this is critical when you choose to LEAVE_SRC
489 	   in which case we die */
490 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
491 	    Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
492 		       "when CHECK = 0x%" UVuf,
493 		       SVfARG(name), check);
494 	}
495     }
496     if (check && !(check & ENCODE_LEAVE_SRC)) {
497 	if (s < e) {
498 	    Move(s,SvPVX(utf8),e-s,U8);
499 	    SvCUR_set(utf8,(e-s));
500 	}
501 	else {
502 	    SvCUR_set(utf8,0);
503 	}
504 	*SvEND(utf8) = '\0';
505 	SvSETMAGIC(utf8);
506     }
507 
508     if (!temp_result) shrink_buffer(result);
509     if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
510 
511     XSRETURN(1);
512 }
513