1b39c5158Smillert /*
2*eac174f2Safresh1 $Id: Unicode.xs,v 2.20 2021/07/23 02:26:54 dankogai Exp $
3b39c5158Smillert */
4b39c5158Smillert
5b46d8ef2Safresh1 #define IN_UNICODE_XS
6b46d8ef2Safresh1
7b39c5158Smillert #define PERL_NO_GET_CONTEXT
8b39c5158Smillert #include "EXTERN.h"
9b39c5158Smillert #include "perl.h"
10b39c5158Smillert #include "XSUB.h"
11b39c5158Smillert #include "../Encode/encode.h"
12b39c5158Smillert
13b39c5158Smillert #define FBCHAR 0xFFFd
14b39c5158Smillert #define BOM_BE 0xFeFF
15b39c5158Smillert #define BOM16LE 0xFFFe
16b39c5158Smillert #define BOM32LE 0xFFFe0000
17b39c5158Smillert #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
18b39c5158Smillert #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
19b39c5158Smillert #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
20b39c5158Smillert #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
21b39c5158Smillert
22b46d8ef2Safresh1 #ifndef SVfARG
23b46d8ef2Safresh1 #define SVfARG(p) ((void*)(p))
2448950c12Ssthen #endif
2548950c12Ssthen
26b39c5158Smillert #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
27b39c5158Smillert
28b39c5158Smillert /* Avoid wasting too much space in the result buffer */
29b39c5158Smillert /* static void */
30b39c5158Smillert /* shrink_buffer(SV *result) */
31b39c5158Smillert /* { */
32b39c5158Smillert /* if (SvLEN(result) > 42 + SvCUR(result)) { */
33b39c5158Smillert /* char *buf; */
34b39c5158Smillert /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
35b39c5158Smillert /* New(0, buf, len, char); */
36b39c5158Smillert /* Copy(SvPVX(result), buf, len, char); */
37b39c5158Smillert /* Safefree(SvPVX(result)); */
38b39c5158Smillert /* SvPV_set(result, buf); */
39b39c5158Smillert /* SvLEN_set(result, len); */
40b39c5158Smillert /* } */
41b39c5158Smillert /* } */
42b39c5158Smillert
43b39c5158Smillert #define shrink_buffer(result) { \
44b39c5158Smillert if (SvLEN(result) > 42 + SvCUR(result)) { \
45b39c5158Smillert char *newpv; \
46b39c5158Smillert STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
47b39c5158Smillert New(0, newpv, newlen, char); \
48b39c5158Smillert Copy(SvPVX(result), newpv, newlen, char); \
49b39c5158Smillert Safefree(SvPVX(result)); \
50b39c5158Smillert SvPV_set(result, newpv); \
51b39c5158Smillert SvLEN_set(result, newlen); \
52b39c5158Smillert } \
53b39c5158Smillert }
54b39c5158Smillert
55b39c5158Smillert static UV
enc_unpack(pTHX_ U8 ** sp,U8 * e,STRLEN size,U8 endian)56b39c5158Smillert enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
57b39c5158Smillert {
58b39c5158Smillert U8 *s = *sp;
59b39c5158Smillert UV v = 0;
60b39c5158Smillert if (s+size > e) {
61b39c5158Smillert croak("Partial character %c",(char) endian);
62b39c5158Smillert }
63b39c5158Smillert switch(endian) {
64b39c5158Smillert case 'N':
65b39c5158Smillert v = *s++;
66b39c5158Smillert v = (v << 8) | *s++;
67b46d8ef2Safresh1 /* FALLTHROUGH */
68b39c5158Smillert case 'n':
69b39c5158Smillert v = (v << 8) | *s++;
70b39c5158Smillert v = (v << 8) | *s++;
71b39c5158Smillert break;
72b39c5158Smillert case 'V':
73b39c5158Smillert case 'v':
74b39c5158Smillert v |= *s++;
75b39c5158Smillert v |= (*s++ << 8);
76b39c5158Smillert if (endian == 'v')
77b39c5158Smillert break;
78b39c5158Smillert v |= (*s++ << 16);
79e5157e49Safresh1 v |= ((UV)*s++ << 24);
80b39c5158Smillert break;
81b39c5158Smillert default:
82b39c5158Smillert croak("Unknown endian %c",(char) endian);
83b39c5158Smillert break;
84b39c5158Smillert }
85b39c5158Smillert *sp = s;
86b39c5158Smillert return v;
87b39c5158Smillert }
88b39c5158Smillert
89b8851fccSafresh1 static void
enc_pack(pTHX_ SV * result,STRLEN size,U8 endian,UV value)90b39c5158Smillert enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
91b39c5158Smillert {
92b39c5158Smillert U8 *d = (U8 *) SvPV_nolen(result);
93b39c5158Smillert
94b39c5158Smillert switch(endian) {
95b39c5158Smillert case 'v':
96b39c5158Smillert case 'V':
97b39c5158Smillert d += SvCUR(result);
98b39c5158Smillert SvCUR_set(result,SvCUR(result)+size);
99b39c5158Smillert while (size--) {
100b39c5158Smillert *d++ = (U8)(value & 0xFF);
101b39c5158Smillert value >>= 8;
102b39c5158Smillert }
103b39c5158Smillert break;
104b39c5158Smillert case 'n':
105b39c5158Smillert case 'N':
106b39c5158Smillert SvCUR_set(result,SvCUR(result)+size);
107b39c5158Smillert d += SvCUR(result);
108b39c5158Smillert while (size--) {
109b39c5158Smillert *--d = (U8)(value & 0xFF);
110b39c5158Smillert value >>= 8;
111b39c5158Smillert }
112b39c5158Smillert break;
113b39c5158Smillert default:
114b39c5158Smillert croak("Unknown endian %c",(char) endian);
115b39c5158Smillert break;
116b39c5158Smillert }
117b39c5158Smillert }
118b39c5158Smillert
119b39c5158Smillert MODULE = Encode::Unicode PACKAGE = Encode::Unicode
120b39c5158Smillert
121b39c5158Smillert PROTOTYPES: DISABLE
122b39c5158Smillert
123b46d8ef2Safresh1 #define attr(k) (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \
124b46d8ef2Safresh1 *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef)
125b39c5158Smillert
126b39c5158Smillert void
1279f11ffb7Safresh1 decode(obj, str, check = 0)
128b39c5158Smillert SV * obj
129b39c5158Smillert SV * str
130b39c5158Smillert IV check
131b39c5158Smillert CODE:
132b39c5158Smillert {
133b46d8ef2Safresh1 SV *name = attr("Name");
134b46d8ef2Safresh1 SV *sve = attr("endian");
1359f11ffb7Safresh1 U8 endian = *((U8 *)SvPV_nolen(sve));
136b46d8ef2Safresh1 SV *svs = attr("size");
1379f11ffb7Safresh1 int size = SvIV(svs);
138b39c5158Smillert int ucs2 = -1; /* only needed in the event of surrogate pairs */
139b39c5158Smillert SV *result = newSVpvn("",0);
140b39c5158Smillert STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
141b39c5158Smillert STRLEN ulen;
142b39c5158Smillert STRLEN resultbuflen;
143b39c5158Smillert U8 *resultbuf;
1449f11ffb7Safresh1 U8 *s;
1459f11ffb7Safresh1 U8 *e;
1469f11ffb7Safresh1 bool modify = (check && !(check & ENCODE_LEAVE_SRC));
1479f11ffb7Safresh1 bool temp_result;
1489f11ffb7Safresh1
1499f11ffb7Safresh1 SvGETMAGIC(str);
1509f11ffb7Safresh1 if (!SvOK(str))
1519f11ffb7Safresh1 XSRETURN_UNDEF;
1529f11ffb7Safresh1 s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen);
1539f11ffb7Safresh1 if (SvUTF8(str)) {
1549f11ffb7Safresh1 if (!modify) {
1559f11ffb7Safresh1 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
1569f11ffb7Safresh1 SvUTF8_on(tmp);
1579f11ffb7Safresh1 if (SvTAINTED(str))
1589f11ffb7Safresh1 SvTAINTED_on(tmp);
1599f11ffb7Safresh1 str = tmp;
1609f11ffb7Safresh1 s = (U8 *)SvPVX(str);
1619f11ffb7Safresh1 }
1629f11ffb7Safresh1 if (ulen) {
1639f11ffb7Safresh1 if (!utf8_to_bytes(s, &ulen))
1649f11ffb7Safresh1 croak("Wide character");
1659f11ffb7Safresh1 SvCUR_set(str, ulen);
1669f11ffb7Safresh1 }
1679f11ffb7Safresh1 SvUTF8_off(str);
1689f11ffb7Safresh1 }
1699f11ffb7Safresh1 e = s+ulen;
1709f11ffb7Safresh1
171b39c5158Smillert /* Optimise for the common case of being called from PerlIOEncode_fill()
172b39c5158Smillert with a standard length buffer. In this case the result SV's buffer is
173b39c5158Smillert only used temporarily, so we can afford to allocate the maximum needed
174b39c5158Smillert and not care about unused space. */
1759f11ffb7Safresh1 temp_result = (ulen == PERLIO_BUFSIZ);
176b39c5158Smillert
177b39c5158Smillert ST(0) = sv_2mortal(result);
178b39c5158Smillert SvUTF8_on(result);
179b39c5158Smillert
180b39c5158Smillert if (!endian && s+size <= e) {
1819f11ffb7Safresh1 SV *sv;
182b39c5158Smillert UV bom;
183b39c5158Smillert endian = (size == 4) ? 'N' : 'n';
184b39c5158Smillert bom = enc_unpack(aTHX_ &s,e,size,endian);
185b39c5158Smillert if (bom != BOM_BE) {
186b39c5158Smillert if (bom == BOM16LE) {
187b39c5158Smillert endian = 'v';
188b39c5158Smillert }
189b39c5158Smillert else if (bom == BOM32LE) {
190b39c5158Smillert endian = 'V';
191b39c5158Smillert }
192b39c5158Smillert else {
193b8851fccSafresh1 /* No BOM found, use big-endian fallback as specified in
194b8851fccSafresh1 * RFC2781 and the Unicode Standard version 8.0:
195b8851fccSafresh1 *
196b8851fccSafresh1 * The UTF-16 encoding scheme may or may not begin with
197b8851fccSafresh1 * a BOM. However, when there is no BOM, and in the
198b8851fccSafresh1 * absence of a higher-level protocol, the byte order
199b8851fccSafresh1 * of the UTF-16 encoding scheme is big-endian.
200b8851fccSafresh1 *
201b8851fccSafresh1 * If the first two octets of the text is not 0xFE
202b8851fccSafresh1 * followed by 0xFF, and is not 0xFF followed by 0xFE,
203b8851fccSafresh1 * then the text SHOULD be interpreted as big-endian.
204b8851fccSafresh1 */
205b8851fccSafresh1 s -= size;
206b39c5158Smillert }
207b39c5158Smillert }
208b39c5158Smillert #if 1
209b39c5158Smillert /* Update endian for next sequence */
210b46d8ef2Safresh1 sv = attr("renewed");
2119f11ffb7Safresh1 if (SvTRUE(sv)) {
2129f11ffb7Safresh1 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
213b39c5158Smillert }
214b39c5158Smillert #endif
215b39c5158Smillert }
216b39c5158Smillert
217b39c5158Smillert if (temp_result) {
218b39c5158Smillert resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
219b39c5158Smillert } else {
220b39c5158Smillert /* Preallocate the buffer to the minimum possible space required. */
221b39c5158Smillert resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
222b39c5158Smillert }
223b39c5158Smillert resultbuf = (U8 *) SvGROW(result, resultbuflen);
224b39c5158Smillert
225b39c5158Smillert while (s < e && s+size <= e) {
226b39c5158Smillert UV ord = enc_unpack(aTHX_ &s,e,size,endian);
227b39c5158Smillert U8 *d;
228b46d8ef2Safresh1 HV *hv = NULL;
229b39c5158Smillert if (issurrogate(ord)) {
230b39c5158Smillert if (ucs2 == -1) {
231b46d8ef2Safresh1 SV *sv = attr("ucs2");
2329f11ffb7Safresh1 ucs2 = SvTRUE(sv);
233b39c5158Smillert }
234b39c5158Smillert if (ucs2 || size == 4) {
235b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR) {
236b39c5158Smillert croak("%" SVf ":no surrogates allowed %" UVxf,
237b46d8ef2Safresh1 SVfARG(name), ord);
238b46d8ef2Safresh1 }
239b46d8ef2Safresh1 if (encode_ckWARN(check, WARN_SURROGATE)) {
240b46d8ef2Safresh1 warner(packWARN(WARN_SURROGATE),
241b46d8ef2Safresh1 "%" SVf ":no surrogates allowed %" UVxf,
242b46d8ef2Safresh1 SVfARG(name), ord);
243b39c5158Smillert }
244b39c5158Smillert ord = FBCHAR;
245b39c5158Smillert }
246b39c5158Smillert else {
247b39c5158Smillert UV lo;
248b39c5158Smillert if (!isHiSurrogate(ord)) {
249b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR) {
250b39c5158Smillert croak("%" SVf ":Malformed HI surrogate %" UVxf,
251b46d8ef2Safresh1 SVfARG(name), ord);
252b39c5158Smillert }
253b46d8ef2Safresh1 if (encode_ckWARN(check, WARN_SURROGATE)) {
254b46d8ef2Safresh1 warner(packWARN(WARN_SURROGATE),
255b46d8ef2Safresh1 "%" SVf ":Malformed HI surrogate %" UVxf,
256b46d8ef2Safresh1 SVfARG(name), ord);
257b46d8ef2Safresh1 }
258b39c5158Smillert ord = FBCHAR;
259b39c5158Smillert }
260e9ce3842Safresh1 else if (s+size > e) {
261e9ce3842Safresh1 if (check & ENCODE_STOP_AT_PARTIAL) {
262e9ce3842Safresh1 s -= size;
263e9ce3842Safresh1 break;
264b39c5158Smillert }
265b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR) {
266e9ce3842Safresh1 croak("%" SVf ":Malformed HI surrogate %" UVxf,
267b46d8ef2Safresh1 SVfARG(name), ord);
268e9ce3842Safresh1 }
269b46d8ef2Safresh1 if (encode_ckWARN(check, WARN_SURROGATE)) {
270b46d8ef2Safresh1 warner(packWARN(WARN_SURROGATE),
271b46d8ef2Safresh1 "%" SVf ":Malformed HI surrogate %" UVxf,
272b46d8ef2Safresh1 SVfARG(name), ord);
273e9ce3842Safresh1 }
274e9ce3842Safresh1 ord = FBCHAR;
275e9ce3842Safresh1 }
276e9ce3842Safresh1 else {
277b39c5158Smillert lo = enc_unpack(aTHX_ &s,e,size,endian);
278b39c5158Smillert if (!isLoSurrogate(lo)) {
279b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR) {
280b39c5158Smillert croak("%" SVf ":Malformed LO surrogate %" UVxf,
281b46d8ef2Safresh1 SVfARG(name), ord);
282b39c5158Smillert }
283b46d8ef2Safresh1 if (encode_ckWARN(check, WARN_SURROGATE)) {
284b46d8ef2Safresh1 warner(packWARN(WARN_SURROGATE),
285b46d8ef2Safresh1 "%" SVf ":Malformed LO surrogate %" UVxf,
286b46d8ef2Safresh1 SVfARG(name), ord);
287b46d8ef2Safresh1 }
288e9ce3842Safresh1 s -= size;
289b39c5158Smillert ord = FBCHAR;
290b39c5158Smillert }
291b39c5158Smillert else {
292b39c5158Smillert ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
293b39c5158Smillert }
294b39c5158Smillert }
295b39c5158Smillert }
296b39c5158Smillert }
297b39c5158Smillert
298b39c5158Smillert if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
299b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR) {
300b39c5158Smillert croak("%" SVf ":Unicode character %" UVxf " is illegal",
301b46d8ef2Safresh1 SVfARG(name), ord);
302b39c5158Smillert }
303b46d8ef2Safresh1 if (encode_ckWARN(check, WARN_NONCHAR)) {
304b46d8ef2Safresh1 warner(packWARN(WARN_NONCHAR),
305b46d8ef2Safresh1 "%" SVf ":Unicode character %" UVxf " is illegal",
306b46d8ef2Safresh1 SVfARG(name), ord);
307b46d8ef2Safresh1 }
308b46d8ef2Safresh1 ord = FBCHAR;
309b39c5158Smillert }
310b39c5158Smillert
311b39c5158Smillert if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
312b39c5158Smillert /* Do not allocate >8Mb more than the minimum needed.
313b39c5158Smillert This prevents allocating too much in the rogue case of a large
314b39c5158Smillert input consisting initially of long sequence uft8-byte unicode
315b39c5158Smillert chars followed by single utf8-byte chars. */
316945a9a58Sjasper /* +1
317945a9a58Sjasper fixes Unicode.xs!decode_xs n-byte heap-overflow
318945a9a58Sjasper */
319945a9a58Sjasper STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
320b39c5158Smillert STRLEN max_alloc = remaining + (8*1024*1024);
321b39c5158Smillert STRLEN est_alloc = remaining * UTF8_MAXLEN;
322b39c5158Smillert STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
323b39c5158Smillert (est_alloc > max_alloc ? max_alloc : est_alloc);
324b39c5158Smillert resultbuf = (U8 *) SvGROW(result, newlen);
325b39c5158Smillert resultbuflen = SvLEN(result);
326b39c5158Smillert }
327b39c5158Smillert
328b46d8ef2Safresh1 d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv);
329b46d8ef2Safresh1 if (hv) {
330b46d8ef2Safresh1 SV *message = *hv_fetch(hv, "text", 4, 0);
331b46d8ef2Safresh1 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
332b46d8ef2Safresh1 sv_2mortal((SV *)hv);
333b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR)
334b46d8ef2Safresh1 croak("%" SVf, SVfARG(message));
335b46d8ef2Safresh1 if (encode_ckWARN_packed(check, categories))
336b46d8ef2Safresh1 warner(categories, "%" SVf, SVfARG(message));
337b46d8ef2Safresh1 d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0);
338b46d8ef2Safresh1 }
339b46d8ef2Safresh1
340b39c5158Smillert SvCUR_set(result, d - (U8 *)SvPVX(result));
341b39c5158Smillert }
342b39c5158Smillert
343b39c5158Smillert if (s < e) {
344b39c5158Smillert /* unlikely to happen because it's fixed-length -- dankogai */
345b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR)
346b46d8ef2Safresh1 croak("%" SVf ":Partial character", SVfARG(name));
347b46d8ef2Safresh1 if (encode_ckWARN(check, WARN_UTF8)) {
348b46d8ef2Safresh1 warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name));
349b39c5158Smillert }
350b39c5158Smillert }
351b39c5158Smillert if (check && !(check & ENCODE_LEAVE_SRC)) {
352b39c5158Smillert if (s < e) {
353b39c5158Smillert Move(s,SvPVX(str),e-s,U8);
354b39c5158Smillert SvCUR_set(str,(e-s));
355b39c5158Smillert }
356b39c5158Smillert else {
357b39c5158Smillert SvCUR_set(str,0);
358b39c5158Smillert }
359b39c5158Smillert *SvEND(str) = '\0';
3609f11ffb7Safresh1 SvSETMAGIC(str);
361b39c5158Smillert }
362b39c5158Smillert
363e5157e49Safresh1 if (!temp_result) shrink_buffer(result);
364*eac174f2Safresh1
365*eac174f2Safresh1 /* Make sure we have a trailing NUL: */
366*eac174f2Safresh1 *SvEND(result) = '\0';
367*eac174f2Safresh1
368e5157e49Safresh1 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
369b39c5158Smillert XSRETURN(1);
370b39c5158Smillert }
371b39c5158Smillert
372b39c5158Smillert void
3739f11ffb7Safresh1 encode(obj, utf8, check = 0)
374b39c5158Smillert SV * obj
375b39c5158Smillert SV * utf8
376b39c5158Smillert IV check
377b39c5158Smillert CODE:
378b39c5158Smillert {
379b46d8ef2Safresh1 SV *name = attr("Name");
380b46d8ef2Safresh1 SV *sve = attr("endian");
3819f11ffb7Safresh1 U8 endian = *((U8 *)SvPV_nolen(sve));
382b46d8ef2Safresh1 SV *svs = attr("size");
3839f11ffb7Safresh1 const int size = SvIV(svs);
384b39c5158Smillert int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
385b39c5158Smillert const STRLEN usize = (size > 0 ? size : 1);
386b39c5158Smillert SV *result = newSVpvn("", 0);
387b39c5158Smillert STRLEN ulen;
3889f11ffb7Safresh1 U8 *s;
3899f11ffb7Safresh1 U8 *e;
3909f11ffb7Safresh1 bool modify = (check && !(check & ENCODE_LEAVE_SRC));
3919f11ffb7Safresh1 bool temp_result;
3929f11ffb7Safresh1
3939f11ffb7Safresh1 SvGETMAGIC(utf8);
3949f11ffb7Safresh1 if (!SvOK(utf8))
3959f11ffb7Safresh1 XSRETURN_UNDEF;
3969f11ffb7Safresh1 s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen);
3979f11ffb7Safresh1 if (!SvUTF8(utf8)) {
3989f11ffb7Safresh1 if (!modify) {
3999f11ffb7Safresh1 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
4009f11ffb7Safresh1 if (SvTAINTED(utf8))
4019f11ffb7Safresh1 SvTAINTED_on(tmp);
4029f11ffb7Safresh1 utf8 = tmp;
4039f11ffb7Safresh1 }
4049f11ffb7Safresh1 sv_utf8_upgrade_nomg(utf8);
4059f11ffb7Safresh1 s = (U8 *)SvPV_nomg(utf8, ulen);
4069f11ffb7Safresh1 }
4079f11ffb7Safresh1 e = s+ulen;
4089f11ffb7Safresh1
409b39c5158Smillert /* Optimise for the common case of being called from PerlIOEncode_flush()
410b39c5158Smillert with a standard length buffer. In this case the result SV's buffer is
411b39c5158Smillert only used temporarily, so we can afford to allocate the maximum needed
412b39c5158Smillert and not care about unused space. */
4139f11ffb7Safresh1 temp_result = (ulen == PERLIO_BUFSIZ);
414b39c5158Smillert
415b39c5158Smillert ST(0) = sv_2mortal(result);
416b39c5158Smillert
417b39c5158Smillert /* Preallocate the result buffer to the maximum possible size.
418b39c5158Smillert ie. assume each UTF8 byte is 1 character.
419b39c5158Smillert Then shrink the result's buffer if necesary at the end. */
420b39c5158Smillert SvGROW(result, ((ulen+1) * usize));
421b39c5158Smillert
422b39c5158Smillert if (!endian) {
4239f11ffb7Safresh1 SV *sv;
424b39c5158Smillert endian = (size == 4) ? 'N' : 'n';
425b39c5158Smillert enc_pack(aTHX_ result,size,endian,BOM_BE);
426b39c5158Smillert #if 1
427b39c5158Smillert /* Update endian for next sequence */
428b46d8ef2Safresh1 sv = attr("renewed");
4299f11ffb7Safresh1 if (SvTRUE(sv)) {
4309f11ffb7Safresh1 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
431b39c5158Smillert }
432b39c5158Smillert #endif
433b39c5158Smillert }
434b39c5158Smillert while (s < e && s+UTF8SKIP(s) <= e) {
435b39c5158Smillert STRLEN len;
436b46d8ef2Safresh1 AV *msgs = NULL;
437b46d8ef2Safresh1 UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs);
438b46d8ef2Safresh1 if (msgs) {
439b46d8ef2Safresh1 SSize_t i;
440b46d8ef2Safresh1 SSize_t len = av_len(msgs)+1;
441b46d8ef2Safresh1 sv_2mortal((SV *)msgs);
442b46d8ef2Safresh1 for (i = 0; i < len; ++i) {
443b46d8ef2Safresh1 SV *sv = *av_fetch(msgs, i, 0);
444b46d8ef2Safresh1 HV *hv = (HV *)SvRV(sv);
445b46d8ef2Safresh1 SV *message = *hv_fetch(hv, "text", 4, 0);
446b46d8ef2Safresh1 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
447b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR)
448b46d8ef2Safresh1 croak("%" SVf, SVfARG(message));
449b46d8ef2Safresh1 if (encode_ckWARN_packed(check, categories))
450b46d8ef2Safresh1 warner(categories, "%" SVf, SVfARG(message));
451b46d8ef2Safresh1 }
452b46d8ef2Safresh1 }
453b46d8ef2Safresh1 if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) {
454b39c5158Smillert if (!issurrogate(ord)) {
455b39c5158Smillert if (ucs2 == -1) {
456b46d8ef2Safresh1 SV *sv = attr("ucs2");
4579f11ffb7Safresh1 ucs2 = SvTRUE(sv);
458b39c5158Smillert }
459e9ce3842Safresh1 if (ucs2 || ord > 0x10FFFF) {
460b46d8ef2Safresh1 if (check & ENCODE_DIE_ON_ERR) {
461b39c5158Smillert croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
462b46d8ef2Safresh1 SVfARG(name),ord);
463b39c5158Smillert }
464b46d8ef2Safresh1 if (encode_ckWARN(check, WARN_NON_UNICODE)) {
465b46d8ef2Safresh1 warner(packWARN(WARN_NON_UNICODE),
466b46d8ef2Safresh1 "%" SVf ":code point \"\\x{%" UVxf "}\" too high",
467b46d8ef2Safresh1 SVfARG(name),ord);
468b46d8ef2Safresh1 }
469b46d8ef2Safresh1 enc_pack(aTHX_ result,size,endian,FBCHAR);
470b46d8ef2Safresh1 } else if (ord == 0) {
471b39c5158Smillert enc_pack(aTHX_ result,size,endian,FBCHAR);
472b39c5158Smillert } else {
473b39c5158Smillert UV hi = ((ord - 0x10000) >> 10) + 0xD800;
474b39c5158Smillert UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
475b39c5158Smillert enc_pack(aTHX_ result,size,endian,hi);
476b39c5158Smillert enc_pack(aTHX_ result,size,endian,lo);
477b39c5158Smillert }
478b39c5158Smillert }
479b39c5158Smillert else {
480b39c5158Smillert /* not supposed to happen */
481b39c5158Smillert enc_pack(aTHX_ result,size,endian,FBCHAR);
482b39c5158Smillert }
483b39c5158Smillert }
484b39c5158Smillert else {
485b39c5158Smillert enc_pack(aTHX_ result,size,endian,ord);
486b39c5158Smillert }
487b46d8ef2Safresh1 s += len;
488b39c5158Smillert }
489b39c5158Smillert if (s < e) {
490b39c5158Smillert /* UTF-8 partial char happens often on PerlIO.
491b39c5158Smillert Since this is okay and normal, we do not warn.
492b39c5158Smillert But this is critical when you choose to LEAVE_SRC
493b39c5158Smillert in which case we die */
494b39c5158Smillert if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
495b39c5158Smillert Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
496b39c5158Smillert "when CHECK = 0x%" UVuf,
497b46d8ef2Safresh1 SVfARG(name), check);
498b39c5158Smillert }
499b39c5158Smillert }
500b39c5158Smillert if (check && !(check & ENCODE_LEAVE_SRC)) {
501b39c5158Smillert if (s < e) {
502b39c5158Smillert Move(s,SvPVX(utf8),e-s,U8);
503b39c5158Smillert SvCUR_set(utf8,(e-s));
504b39c5158Smillert }
505b39c5158Smillert else {
506b39c5158Smillert SvCUR_set(utf8,0);
507b39c5158Smillert }
508b39c5158Smillert *SvEND(utf8) = '\0';
5099f11ffb7Safresh1 SvSETMAGIC(utf8);
510b39c5158Smillert }
511b39c5158Smillert
512e5157e49Safresh1 if (!temp_result) shrink_buffer(result);
513e5157e49Safresh1 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
514b39c5158Smillert
515b39c5158Smillert XSRETURN(1);
516b39c5158Smillert }
517