xref: /openbsd-src/gnu/usr.bin/perl/cpan/Encode/Unicode/Unicode.xs (revision eac174f2741a08d8deb8aae59a7f778ef9b5d770)
1b39c5158Smillert /*
2*eac174f2Safresh1  $Id: Unicode.xs,v 2.20 2021/07/23 02:26:54 dankogai Exp $
3b39c5158Smillert  */
4b39c5158Smillert 
5b46d8ef2Safresh1 #define IN_UNICODE_XS
6b46d8ef2Safresh1 
7b39c5158Smillert #define PERL_NO_GET_CONTEXT
8b39c5158Smillert #include "EXTERN.h"
9b39c5158Smillert #include "perl.h"
10b39c5158Smillert #include "XSUB.h"
11b39c5158Smillert #include "../Encode/encode.h"
12b39c5158Smillert 
13b39c5158Smillert #define FBCHAR			0xFFFd
14b39c5158Smillert #define BOM_BE			0xFeFF
15b39c5158Smillert #define BOM16LE			0xFFFe
16b39c5158Smillert #define BOM32LE			0xFFFe0000
17b39c5158Smillert #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
18b39c5158Smillert #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
19b39c5158Smillert #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
20b39c5158Smillert #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
21b39c5158Smillert 
22b46d8ef2Safresh1 #ifndef SVfARG
23b46d8ef2Safresh1 #define SVfARG(p) ((void*)(p))
2448950c12Ssthen #endif
2548950c12Ssthen 
26b39c5158Smillert #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
27b39c5158Smillert 
28b39c5158Smillert /* Avoid wasting too much space in the result buffer */
29b39c5158Smillert /* static void */
30b39c5158Smillert /* shrink_buffer(SV *result) */
31b39c5158Smillert /* { */
32b39c5158Smillert /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
33b39c5158Smillert /* 	char *buf; */
34b39c5158Smillert /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
35b39c5158Smillert /* 	New(0, buf, len, char); */
36b39c5158Smillert /* 	Copy(SvPVX(result), buf, len, char); */
37b39c5158Smillert /* 	Safefree(SvPVX(result)); */
38b39c5158Smillert /* 	SvPV_set(result, buf); */
39b39c5158Smillert /* 	SvLEN_set(result, len); */
40b39c5158Smillert /*     } */
41b39c5158Smillert /* } */
42b39c5158Smillert 
43b39c5158Smillert #define shrink_buffer(result) { \
44b39c5158Smillert     if (SvLEN(result) > 42 + SvCUR(result)) { \
45b39c5158Smillert 	char *newpv; \
46b39c5158Smillert 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
47b39c5158Smillert 	New(0, newpv, newlen, char); \
48b39c5158Smillert 	Copy(SvPVX(result), newpv, newlen, char); \
49b39c5158Smillert 	Safefree(SvPVX(result)); \
50b39c5158Smillert 	SvPV_set(result, newpv); \
51b39c5158Smillert 	SvLEN_set(result, newlen); \
52b39c5158Smillert     } \
53b39c5158Smillert }
54b39c5158Smillert 
55b39c5158Smillert static UV
enc_unpack(pTHX_ U8 ** sp,U8 * e,STRLEN size,U8 endian)56b39c5158Smillert enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
57b39c5158Smillert {
58b39c5158Smillert     U8 *s = *sp;
59b39c5158Smillert     UV v = 0;
60b39c5158Smillert     if (s+size > e) {
61b39c5158Smillert 	croak("Partial character %c",(char) endian);
62b39c5158Smillert     }
63b39c5158Smillert     switch(endian) {
64b39c5158Smillert     case 'N':
65b39c5158Smillert 	v = *s++;
66b39c5158Smillert 	v = (v << 8) | *s++;
67b46d8ef2Safresh1         /* FALLTHROUGH */
68b39c5158Smillert     case 'n':
69b39c5158Smillert 	v = (v << 8) | *s++;
70b39c5158Smillert 	v = (v << 8) | *s++;
71b39c5158Smillert 	break;
72b39c5158Smillert     case 'V':
73b39c5158Smillert     case 'v':
74b39c5158Smillert 	v |= *s++;
75b39c5158Smillert 	v |= (*s++ << 8);
76b39c5158Smillert 	if (endian == 'v')
77b39c5158Smillert 	    break;
78b39c5158Smillert 	v |= (*s++ << 16);
79e5157e49Safresh1 	v |= ((UV)*s++ << 24);
80b39c5158Smillert 	break;
81b39c5158Smillert     default:
82b39c5158Smillert 	croak("Unknown endian %c",(char) endian);
83b39c5158Smillert 	break;
84b39c5158Smillert     }
85b39c5158Smillert     *sp = s;
86b39c5158Smillert     return v;
87b39c5158Smillert }
88b39c5158Smillert 
89b8851fccSafresh1 static void
enc_pack(pTHX_ SV * result,STRLEN size,U8 endian,UV value)90b39c5158Smillert enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
91b39c5158Smillert {
92b39c5158Smillert     U8 *d = (U8 *) SvPV_nolen(result);
93b39c5158Smillert 
94b39c5158Smillert     switch(endian) {
95b39c5158Smillert     case 'v':
96b39c5158Smillert     case 'V':
97b39c5158Smillert 	d += SvCUR(result);
98b39c5158Smillert 	SvCUR_set(result,SvCUR(result)+size);
99b39c5158Smillert 	while (size--) {
100b39c5158Smillert 	    *d++ = (U8)(value & 0xFF);
101b39c5158Smillert 	    value >>= 8;
102b39c5158Smillert 	}
103b39c5158Smillert 	break;
104b39c5158Smillert     case 'n':
105b39c5158Smillert     case 'N':
106b39c5158Smillert 	SvCUR_set(result,SvCUR(result)+size);
107b39c5158Smillert 	d += SvCUR(result);
108b39c5158Smillert 	while (size--) {
109b39c5158Smillert 	    *--d = (U8)(value & 0xFF);
110b39c5158Smillert 	    value >>= 8;
111b39c5158Smillert 	}
112b39c5158Smillert 	break;
113b39c5158Smillert     default:
114b39c5158Smillert 	croak("Unknown endian %c",(char) endian);
115b39c5158Smillert 	break;
116b39c5158Smillert     }
117b39c5158Smillert }
118b39c5158Smillert 
119b39c5158Smillert MODULE = Encode::Unicode PACKAGE = Encode::Unicode
120b39c5158Smillert 
121b39c5158Smillert PROTOTYPES: DISABLE
122b39c5158Smillert 
123b46d8ef2Safresh1 #define attr(k)  (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \
124b46d8ef2Safresh1     *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef)
125b39c5158Smillert 
126b39c5158Smillert void
1279f11ffb7Safresh1 decode(obj, str, check = 0)
128b39c5158Smillert SV *	obj
129b39c5158Smillert SV *	str
130b39c5158Smillert IV	check
131b39c5158Smillert CODE:
132b39c5158Smillert {
133b46d8ef2Safresh1     SV *name     = attr("Name");
134b46d8ef2Safresh1     SV *sve      = attr("endian");
1359f11ffb7Safresh1     U8 endian    = *((U8 *)SvPV_nolen(sve));
136b46d8ef2Safresh1     SV *svs      = attr("size");
1379f11ffb7Safresh1     int size     = SvIV(svs);
138b39c5158Smillert     int ucs2     = -1; /* only needed in the event of surrogate pairs */
139b39c5158Smillert     SV *result   = newSVpvn("",0);
140b39c5158Smillert     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
141b39c5158Smillert     STRLEN ulen;
142b39c5158Smillert     STRLEN resultbuflen;
143b39c5158Smillert     U8 *resultbuf;
1449f11ffb7Safresh1     U8 *s;
1459f11ffb7Safresh1     U8 *e;
1469f11ffb7Safresh1     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
1479f11ffb7Safresh1     bool temp_result;
1489f11ffb7Safresh1 
1499f11ffb7Safresh1     SvGETMAGIC(str);
1509f11ffb7Safresh1     if (!SvOK(str))
1519f11ffb7Safresh1         XSRETURN_UNDEF;
1529f11ffb7Safresh1     s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen);
1539f11ffb7Safresh1     if (SvUTF8(str)) {
1549f11ffb7Safresh1         if (!modify) {
1559f11ffb7Safresh1             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
1569f11ffb7Safresh1             SvUTF8_on(tmp);
1579f11ffb7Safresh1             if (SvTAINTED(str))
1589f11ffb7Safresh1                 SvTAINTED_on(tmp);
1599f11ffb7Safresh1             str = tmp;
1609f11ffb7Safresh1             s = (U8 *)SvPVX(str);
1619f11ffb7Safresh1         }
1629f11ffb7Safresh1         if (ulen) {
1639f11ffb7Safresh1             if (!utf8_to_bytes(s, &ulen))
1649f11ffb7Safresh1                 croak("Wide character");
1659f11ffb7Safresh1             SvCUR_set(str, ulen);
1669f11ffb7Safresh1         }
1679f11ffb7Safresh1         SvUTF8_off(str);
1689f11ffb7Safresh1     }
1699f11ffb7Safresh1     e = s+ulen;
1709f11ffb7Safresh1 
171b39c5158Smillert     /* Optimise for the common case of being called from PerlIOEncode_fill()
172b39c5158Smillert        with a standard length buffer. In this case the result SV's buffer is
173b39c5158Smillert        only used temporarily, so we can afford to allocate the maximum needed
174b39c5158Smillert        and not care about unused space. */
1759f11ffb7Safresh1     temp_result = (ulen == PERLIO_BUFSIZ);
176b39c5158Smillert 
177b39c5158Smillert     ST(0) = sv_2mortal(result);
178b39c5158Smillert     SvUTF8_on(result);
179b39c5158Smillert 
180b39c5158Smillert     if (!endian && s+size <= e) {
1819f11ffb7Safresh1 	SV *sv;
182b39c5158Smillert 	UV bom;
183b39c5158Smillert 	endian = (size == 4) ? 'N' : 'n';
184b39c5158Smillert 	bom = enc_unpack(aTHX_ &s,e,size,endian);
185b39c5158Smillert 	if (bom != BOM_BE) {
186b39c5158Smillert 	    if (bom == BOM16LE) {
187b39c5158Smillert 		endian = 'v';
188b39c5158Smillert 	    }
189b39c5158Smillert 	    else if (bom == BOM32LE) {
190b39c5158Smillert 		endian = 'V';
191b39c5158Smillert 	    }
192b39c5158Smillert 	    else {
193b8851fccSafresh1                /* No BOM found, use big-endian fallback as specified in
194b8851fccSafresh1                 * RFC2781 and the Unicode Standard version 8.0:
195b8851fccSafresh1                 *
196b8851fccSafresh1                 *  The UTF-16 encoding scheme may or may not begin with
197b8851fccSafresh1                 *  a BOM. However, when there is no BOM, and in the
198b8851fccSafresh1                 *  absence of a higher-level protocol, the byte order
199b8851fccSafresh1                 *  of the UTF-16 encoding scheme is big-endian.
200b8851fccSafresh1                 *
201b8851fccSafresh1                 *  If the first two octets of the text is not 0xFE
202b8851fccSafresh1                 *  followed by 0xFF, and is not 0xFF followed by 0xFE,
203b8851fccSafresh1                 *  then the text SHOULD be interpreted as big-endian.
204b8851fccSafresh1                 */
205b8851fccSafresh1                 s -= size;
206b39c5158Smillert 	    }
207b39c5158Smillert 	}
208b39c5158Smillert #if 1
209b39c5158Smillert 	/* Update endian for next sequence */
210b46d8ef2Safresh1 	sv = attr("renewed");
2119f11ffb7Safresh1 	if (SvTRUE(sv)) {
2129f11ffb7Safresh1 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
213b39c5158Smillert 	}
214b39c5158Smillert #endif
215b39c5158Smillert     }
216b39c5158Smillert 
217b39c5158Smillert     if (temp_result) {
218b39c5158Smillert 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
219b39c5158Smillert     } else {
220b39c5158Smillert 	/* Preallocate the buffer to the minimum possible space required. */
221b39c5158Smillert 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
222b39c5158Smillert     }
223b39c5158Smillert     resultbuf = (U8 *) SvGROW(result, resultbuflen);
224b39c5158Smillert 
225b39c5158Smillert     while (s < e && s+size <= e) {
226b39c5158Smillert 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
227b39c5158Smillert 	U8 *d;
228b46d8ef2Safresh1 	HV *hv = NULL;
229b39c5158Smillert 	if (issurrogate(ord)) {
230b39c5158Smillert 	    if (ucs2 == -1) {
231b46d8ef2Safresh1 		SV *sv = attr("ucs2");
2329f11ffb7Safresh1 		ucs2 = SvTRUE(sv);
233b39c5158Smillert 	    }
234b39c5158Smillert 	    if (ucs2 || size == 4) {
235b46d8ef2Safresh1 		if (check & ENCODE_DIE_ON_ERR) {
236b39c5158Smillert 		    croak("%" SVf ":no surrogates allowed %" UVxf,
237b46d8ef2Safresh1 			  SVfARG(name), ord);
238b46d8ef2Safresh1 		}
239b46d8ef2Safresh1 		if (encode_ckWARN(check, WARN_SURROGATE)) {
240b46d8ef2Safresh1 		    warner(packWARN(WARN_SURROGATE),
241b46d8ef2Safresh1 			  "%" SVf ":no surrogates allowed %" UVxf,
242b46d8ef2Safresh1 			  SVfARG(name), ord);
243b39c5158Smillert 		}
244b39c5158Smillert 		ord = FBCHAR;
245b39c5158Smillert 	    }
246b39c5158Smillert 	    else {
247b39c5158Smillert 		UV lo;
248b39c5158Smillert 		if (!isHiSurrogate(ord)) {
249b46d8ef2Safresh1 		    if (check & ENCODE_DIE_ON_ERR) {
250b39c5158Smillert 			croak("%" SVf ":Malformed HI surrogate %" UVxf,
251b46d8ef2Safresh1 			      SVfARG(name), ord);
252b39c5158Smillert 		    }
253b46d8ef2Safresh1 		    if (encode_ckWARN(check, WARN_SURROGATE)) {
254b46d8ef2Safresh1 			warner(packWARN(WARN_SURROGATE),
255b46d8ef2Safresh1 			      "%" SVf ":Malformed HI surrogate %" UVxf,
256b46d8ef2Safresh1 			      SVfARG(name), ord);
257b46d8ef2Safresh1 		    }
258b39c5158Smillert 		    ord = FBCHAR;
259b39c5158Smillert 		}
260e9ce3842Safresh1 		else if (s+size > e) {
261e9ce3842Safresh1 		    if (check & ENCODE_STOP_AT_PARTIAL) {
262e9ce3842Safresh1 		        s -= size;
263e9ce3842Safresh1 		        break;
264b39c5158Smillert 		    }
265b46d8ef2Safresh1 		    if (check & ENCODE_DIE_ON_ERR) {
266e9ce3842Safresh1 			croak("%" SVf ":Malformed HI surrogate %" UVxf,
267b46d8ef2Safresh1 			      SVfARG(name), ord);
268e9ce3842Safresh1 		    }
269b46d8ef2Safresh1 		    if (encode_ckWARN(check, WARN_SURROGATE)) {
270b46d8ef2Safresh1 			warner(packWARN(WARN_SURROGATE),
271b46d8ef2Safresh1 			      "%" SVf ":Malformed HI surrogate %" UVxf,
272b46d8ef2Safresh1 			      SVfARG(name), ord);
273e9ce3842Safresh1 		    }
274e9ce3842Safresh1 		    ord = FBCHAR;
275e9ce3842Safresh1 		}
276e9ce3842Safresh1 		else {
277b39c5158Smillert 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
278b39c5158Smillert 		    if (!isLoSurrogate(lo)) {
279b46d8ef2Safresh1 			if (check & ENCODE_DIE_ON_ERR) {
280b39c5158Smillert 			    croak("%" SVf ":Malformed LO surrogate %" UVxf,
281b46d8ef2Safresh1 				  SVfARG(name), ord);
282b39c5158Smillert 			}
283b46d8ef2Safresh1 			if (encode_ckWARN(check, WARN_SURROGATE)) {
284b46d8ef2Safresh1 			    warner(packWARN(WARN_SURROGATE),
285b46d8ef2Safresh1 				  "%" SVf ":Malformed LO surrogate %" UVxf,
286b46d8ef2Safresh1 				  SVfARG(name), ord);
287b46d8ef2Safresh1 			}
288e9ce3842Safresh1 			s -= size;
289b39c5158Smillert 			ord = FBCHAR;
290b39c5158Smillert 		    }
291b39c5158Smillert 		    else {
292b39c5158Smillert 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
293b39c5158Smillert 		    }
294b39c5158Smillert 		}
295b39c5158Smillert 	    }
296b39c5158Smillert 	}
297b39c5158Smillert 
298b39c5158Smillert 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
299b46d8ef2Safresh1 	    if (check & ENCODE_DIE_ON_ERR) {
300b39c5158Smillert 		croak("%" SVf ":Unicode character %" UVxf " is illegal",
301b46d8ef2Safresh1 		      SVfARG(name), ord);
302b39c5158Smillert 	    }
303b46d8ef2Safresh1 	    if (encode_ckWARN(check, WARN_NONCHAR)) {
304b46d8ef2Safresh1 	        warner(packWARN(WARN_NONCHAR),
305b46d8ef2Safresh1 		      "%" SVf ":Unicode character %" UVxf " is illegal",
306b46d8ef2Safresh1 		      SVfARG(name), ord);
307b46d8ef2Safresh1 	    }
308b46d8ef2Safresh1 	    ord = FBCHAR;
309b39c5158Smillert 	}
310b39c5158Smillert 
311b39c5158Smillert 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
312b39c5158Smillert 	    /* Do not allocate >8Mb more than the minimum needed.
313b39c5158Smillert 	       This prevents allocating too much in the rogue case of a large
314b39c5158Smillert 	       input consisting initially of long sequence uft8-byte unicode
315b39c5158Smillert 	       chars followed by single utf8-byte chars. */
316945a9a58Sjasper             /* +1
317945a9a58Sjasper                fixes  Unicode.xs!decode_xs n-byte heap-overflow
318945a9a58Sjasper               */
319945a9a58Sjasper 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
320b39c5158Smillert 	    STRLEN max_alloc = remaining + (8*1024*1024);
321b39c5158Smillert 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
322b39c5158Smillert 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
323b39c5158Smillert 		(est_alloc > max_alloc ? max_alloc : est_alloc);
324b39c5158Smillert 	    resultbuf = (U8 *) SvGROW(result, newlen);
325b39c5158Smillert 	    resultbuflen = SvLEN(result);
326b39c5158Smillert 	}
327b39c5158Smillert 
328b46d8ef2Safresh1         d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv);
329b46d8ef2Safresh1         if (hv) {
330b46d8ef2Safresh1             SV *message = *hv_fetch(hv, "text", 4, 0);
331b46d8ef2Safresh1             U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
332b46d8ef2Safresh1             sv_2mortal((SV *)hv);
333b46d8ef2Safresh1             if (check & ENCODE_DIE_ON_ERR)
334b46d8ef2Safresh1                 croak("%" SVf, SVfARG(message));
335b46d8ef2Safresh1             if (encode_ckWARN_packed(check, categories))
336b46d8ef2Safresh1                 warner(categories, "%" SVf, SVfARG(message));
337b46d8ef2Safresh1             d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0);
338b46d8ef2Safresh1         }
339b46d8ef2Safresh1 
340b39c5158Smillert 	SvCUR_set(result, d - (U8 *)SvPVX(result));
341b39c5158Smillert     }
342b39c5158Smillert 
343b39c5158Smillert     if (s < e) {
344b39c5158Smillert 	/* unlikely to happen because it's fixed-length -- dankogai */
345b46d8ef2Safresh1         if (check & ENCODE_DIE_ON_ERR)
346b46d8ef2Safresh1             croak("%" SVf ":Partial character", SVfARG(name));
347b46d8ef2Safresh1         if (encode_ckWARN(check, WARN_UTF8)) {
348b46d8ef2Safresh1             warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name));
349b39c5158Smillert 	}
350b39c5158Smillert     }
351b39c5158Smillert     if (check && !(check & ENCODE_LEAVE_SRC)) {
352b39c5158Smillert 	if (s < e) {
353b39c5158Smillert 	    Move(s,SvPVX(str),e-s,U8);
354b39c5158Smillert 	    SvCUR_set(str,(e-s));
355b39c5158Smillert 	}
356b39c5158Smillert 	else {
357b39c5158Smillert 	    SvCUR_set(str,0);
358b39c5158Smillert 	}
359b39c5158Smillert 	*SvEND(str) = '\0';
3609f11ffb7Safresh1 	SvSETMAGIC(str);
361b39c5158Smillert     }
362b39c5158Smillert 
363e5157e49Safresh1     if (!temp_result) shrink_buffer(result);
364*eac174f2Safresh1 
365*eac174f2Safresh1     /* Make sure we have a trailing NUL: */
366*eac174f2Safresh1     *SvEND(result) = '\0';
367*eac174f2Safresh1 
368e5157e49Safresh1     if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
369b39c5158Smillert     XSRETURN(1);
370b39c5158Smillert }
371b39c5158Smillert 
372b39c5158Smillert void
3739f11ffb7Safresh1 encode(obj, utf8, check = 0)
374b39c5158Smillert SV *	obj
375b39c5158Smillert SV *	utf8
376b39c5158Smillert IV	check
377b39c5158Smillert CODE:
378b39c5158Smillert {
379b46d8ef2Safresh1     SV *name = attr("Name");
380b46d8ef2Safresh1     SV *sve = attr("endian");
3819f11ffb7Safresh1     U8 endian = *((U8 *)SvPV_nolen(sve));
382b46d8ef2Safresh1     SV *svs = attr("size");
3839f11ffb7Safresh1     const int size = SvIV(svs);
384b39c5158Smillert     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
385b39c5158Smillert     const STRLEN usize = (size > 0 ? size : 1);
386b39c5158Smillert     SV *result = newSVpvn("", 0);
387b39c5158Smillert     STRLEN ulen;
3889f11ffb7Safresh1     U8 *s;
3899f11ffb7Safresh1     U8 *e;
3909f11ffb7Safresh1     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
3919f11ffb7Safresh1     bool temp_result;
3929f11ffb7Safresh1 
3939f11ffb7Safresh1     SvGETMAGIC(utf8);
3949f11ffb7Safresh1     if (!SvOK(utf8))
3959f11ffb7Safresh1         XSRETURN_UNDEF;
3969f11ffb7Safresh1     s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen);
3979f11ffb7Safresh1     if (!SvUTF8(utf8)) {
3989f11ffb7Safresh1         if (!modify) {
3999f11ffb7Safresh1             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
4009f11ffb7Safresh1             if (SvTAINTED(utf8))
4019f11ffb7Safresh1                 SvTAINTED_on(tmp);
4029f11ffb7Safresh1             utf8 = tmp;
4039f11ffb7Safresh1         }
4049f11ffb7Safresh1         sv_utf8_upgrade_nomg(utf8);
4059f11ffb7Safresh1         s = (U8 *)SvPV_nomg(utf8, ulen);
4069f11ffb7Safresh1     }
4079f11ffb7Safresh1     e = s+ulen;
4089f11ffb7Safresh1 
409b39c5158Smillert     /* Optimise for the common case of being called from PerlIOEncode_flush()
410b39c5158Smillert        with a standard length buffer. In this case the result SV's buffer is
411b39c5158Smillert        only used temporarily, so we can afford to allocate the maximum needed
412b39c5158Smillert        and not care about unused space. */
4139f11ffb7Safresh1     temp_result = (ulen == PERLIO_BUFSIZ);
414b39c5158Smillert 
415b39c5158Smillert     ST(0) = sv_2mortal(result);
416b39c5158Smillert 
417b39c5158Smillert     /* Preallocate the result buffer to the maximum possible size.
418b39c5158Smillert        ie. assume each UTF8 byte is 1 character.
419b39c5158Smillert        Then shrink the result's buffer if necesary at the end. */
420b39c5158Smillert     SvGROW(result, ((ulen+1) * usize));
421b39c5158Smillert 
422b39c5158Smillert     if (!endian) {
4239f11ffb7Safresh1 	SV *sv;
424b39c5158Smillert 	endian = (size == 4) ? 'N' : 'n';
425b39c5158Smillert 	enc_pack(aTHX_ result,size,endian,BOM_BE);
426b39c5158Smillert #if 1
427b39c5158Smillert 	/* Update endian for next sequence */
428b46d8ef2Safresh1 	sv = attr("renewed");
4299f11ffb7Safresh1 	if (SvTRUE(sv)) {
4309f11ffb7Safresh1 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
431b39c5158Smillert 	}
432b39c5158Smillert #endif
433b39c5158Smillert     }
434b39c5158Smillert     while (s < e && s+UTF8SKIP(s) <= e) {
435b39c5158Smillert         STRLEN len;
436b46d8ef2Safresh1         AV *msgs = NULL;
437b46d8ef2Safresh1         UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs);
438b46d8ef2Safresh1         if (msgs) {
439b46d8ef2Safresh1             SSize_t i;
440b46d8ef2Safresh1             SSize_t len = av_len(msgs)+1;
441b46d8ef2Safresh1             sv_2mortal((SV *)msgs);
442b46d8ef2Safresh1             for (i = 0; i < len; ++i) {
443b46d8ef2Safresh1                 SV *sv = *av_fetch(msgs, i, 0);
444b46d8ef2Safresh1                 HV *hv = (HV *)SvRV(sv);
445b46d8ef2Safresh1                 SV *message = *hv_fetch(hv, "text", 4, 0);
446b46d8ef2Safresh1                 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
447b46d8ef2Safresh1                 if (check & ENCODE_DIE_ON_ERR)
448b46d8ef2Safresh1                     croak("%" SVf, SVfARG(message));
449b46d8ef2Safresh1                 if (encode_ckWARN_packed(check, categories))
450b46d8ef2Safresh1                     warner(categories, "%" SVf, SVfARG(message));
451b46d8ef2Safresh1             }
452b46d8ef2Safresh1         }
453b46d8ef2Safresh1 	if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) {
454b39c5158Smillert 	    if (!issurrogate(ord)) {
455b39c5158Smillert 		if (ucs2 == -1) {
456b46d8ef2Safresh1 		    SV *sv = attr("ucs2");
4579f11ffb7Safresh1 		    ucs2 = SvTRUE(sv);
458b39c5158Smillert 		}
459e9ce3842Safresh1 		if (ucs2 || ord > 0x10FFFF) {
460b46d8ef2Safresh1 		    if (check & ENCODE_DIE_ON_ERR) {
461b39c5158Smillert 			croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
462b46d8ef2Safresh1 				  SVfARG(name),ord);
463b39c5158Smillert 		    }
464b46d8ef2Safresh1 		    if (encode_ckWARN(check, WARN_NON_UNICODE)) {
465b46d8ef2Safresh1 			warner(packWARN(WARN_NON_UNICODE),
466b46d8ef2Safresh1 				  "%" SVf ":code point \"\\x{%" UVxf "}\" too high",
467b46d8ef2Safresh1 				  SVfARG(name),ord);
468b46d8ef2Safresh1 		    }
469b46d8ef2Safresh1 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
470b46d8ef2Safresh1 		} else if (ord == 0) {
471b39c5158Smillert 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
472b39c5158Smillert 		} else {
473b39c5158Smillert 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
474b39c5158Smillert 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
475b39c5158Smillert 		    enc_pack(aTHX_ result,size,endian,hi);
476b39c5158Smillert 		    enc_pack(aTHX_ result,size,endian,lo);
477b39c5158Smillert 		}
478b39c5158Smillert 	    }
479b39c5158Smillert 	    else {
480b39c5158Smillert 		/* not supposed to happen */
481b39c5158Smillert 		enc_pack(aTHX_ result,size,endian,FBCHAR);
482b39c5158Smillert 	    }
483b39c5158Smillert 	}
484b39c5158Smillert 	else {
485b39c5158Smillert 	    enc_pack(aTHX_ result,size,endian,ord);
486b39c5158Smillert 	}
487b46d8ef2Safresh1 	s += len;
488b39c5158Smillert     }
489b39c5158Smillert     if (s < e) {
490b39c5158Smillert 	/* UTF-8 partial char happens often on PerlIO.
491b39c5158Smillert 	   Since this is okay and normal, we do not warn.
492b39c5158Smillert 	   But this is critical when you choose to LEAVE_SRC
493b39c5158Smillert 	   in which case we die */
494b39c5158Smillert 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
495b39c5158Smillert 	    Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
496b39c5158Smillert 		       "when CHECK = 0x%" UVuf,
497b46d8ef2Safresh1 		       SVfARG(name), check);
498b39c5158Smillert 	}
499b39c5158Smillert     }
500b39c5158Smillert     if (check && !(check & ENCODE_LEAVE_SRC)) {
501b39c5158Smillert 	if (s < e) {
502b39c5158Smillert 	    Move(s,SvPVX(utf8),e-s,U8);
503b39c5158Smillert 	    SvCUR_set(utf8,(e-s));
504b39c5158Smillert 	}
505b39c5158Smillert 	else {
506b39c5158Smillert 	    SvCUR_set(utf8,0);
507b39c5158Smillert 	}
508b39c5158Smillert 	*SvEND(utf8) = '\0';
5099f11ffb7Safresh1 	SvSETMAGIC(utf8);
510b39c5158Smillert     }
511b39c5158Smillert 
512e5157e49Safresh1     if (!temp_result) shrink_buffer(result);
513e5157e49Safresh1     if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
514b39c5158Smillert 
515b39c5158Smillert     XSRETURN(1);
516b39c5158Smillert }
517