1 /* 2 $Id: Unicode.xs,v 2.14 2016/01/22 06:33:07 dankogai Exp $ 3 */ 4 5 #define PERL_NO_GET_CONTEXT 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 #include "../Encode/encode.h" 10 11 #define FBCHAR 0xFFFd 12 #define BOM_BE 0xFeFF 13 #define BOM16LE 0xFFFe 14 #define BOM32LE 0xFFFe0000 15 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 16 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 17 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 18 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 19 20 /* For pre-5.14 source compatibility */ 21 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE 22 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0 23 # define UTF8_DISALLOW_SURROGATE 0 24 # define UTF8_WARN_SURROGATE 0 25 # define UTF8_DISALLOW_FE_FF 0 26 # define UTF8_WARN_FE_FF 0 27 # define UTF8_WARN_NONCHAR 0 28 #endif 29 30 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 31 32 /* Avoid wasting too much space in the result buffer */ 33 /* static void */ 34 /* shrink_buffer(SV *result) */ 35 /* { */ 36 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 37 /* char *buf; */ 38 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 39 /* New(0, buf, len, char); */ 40 /* Copy(SvPVX(result), buf, len, char); */ 41 /* Safefree(SvPVX(result)); */ 42 /* SvPV_set(result, buf); */ 43 /* SvLEN_set(result, len); */ 44 /* } */ 45 /* } */ 46 47 #define shrink_buffer(result) { \ 48 if (SvLEN(result) > 42 + SvCUR(result)) { \ 49 char *newpv; \ 50 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 51 New(0, newpv, newlen, char); \ 52 Copy(SvPVX(result), newpv, newlen, char); \ 53 Safefree(SvPVX(result)); \ 54 SvPV_set(result, newpv); \ 55 SvLEN_set(result, newlen); \ 56 } \ 57 } 58 59 static UV 60 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 61 { 62 U8 *s = *sp; 63 UV v = 0; 64 if (s+size > e) { 65 croak("Partial character %c",(char) endian); 66 } 67 switch(endian) { 68 case 'N': 69 v = *s++; 70 v = (v << 8) | *s++; 71 case 'n': 72 v = (v << 8) | *s++; 73 v = (v << 8) | *s++; 74 break; 75 case 'V': 76 case 'v': 77 v |= *s++; 78 v |= (*s++ << 8); 79 if (endian == 'v') 80 break; 81 v |= (*s++ << 16); 82 v |= ((UV)*s++ << 24); 83 break; 84 default: 85 croak("Unknown endian %c",(char) endian); 86 break; 87 } 88 *sp = s; 89 return v; 90 } 91 92 static void 93 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 94 { 95 U8 *d = (U8 *) SvPV_nolen(result); 96 97 switch(endian) { 98 case 'v': 99 case 'V': 100 d += SvCUR(result); 101 SvCUR_set(result,SvCUR(result)+size); 102 while (size--) { 103 *d++ = (U8)(value & 0xFF); 104 value >>= 8; 105 } 106 break; 107 case 'n': 108 case 'N': 109 SvCUR_set(result,SvCUR(result)+size); 110 d += SvCUR(result); 111 while (size--) { 112 *--d = (U8)(value & 0xFF); 113 value >>= 8; 114 } 115 break; 116 default: 117 croak("Unknown endian %c",(char) endian); 118 break; 119 } 120 } 121 122 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 123 124 PROTOTYPES: DISABLE 125 126 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 127 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) 128 #define attr_true(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 129 SvTRUE(*hv_fetch((HV *)SvRV(obj),k,l,0)) : FALSE) 130 131 void 132 decode_xs(obj, str, check = 0) 133 SV * obj 134 SV * str 135 IV check 136 CODE: 137 { 138 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 139 int size = SvIV(attr("size", 4)); 140 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 141 SV *result = newSVpvn("",0); 142 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 143 STRLEN ulen; 144 STRLEN resultbuflen; 145 U8 *resultbuf; 146 U8 *s = (U8 *)SvPVbyte(str,ulen); 147 U8 *e = (U8 *)SvEND(str); 148 /* Optimise for the common case of being called from PerlIOEncode_fill() 149 with a standard length buffer. In this case the result SV's buffer is 150 only used temporarily, so we can afford to allocate the maximum needed 151 and not care about unused space. */ 152 const bool temp_result = (ulen == PERLIO_BUFSIZ); 153 154 ST(0) = sv_2mortal(result); 155 SvUTF8_on(result); 156 157 if (!endian && s+size <= e) { 158 UV bom; 159 endian = (size == 4) ? 'N' : 'n'; 160 bom = enc_unpack(aTHX_ &s,e,size,endian); 161 if (bom != BOM_BE) { 162 if (bom == BOM16LE) { 163 endian = 'v'; 164 } 165 else if (bom == BOM32LE) { 166 endian = 'V'; 167 } 168 else { 169 /* No BOM found, use big-endian fallback as specified in 170 * RFC2781 and the Unicode Standard version 8.0: 171 * 172 * The UTF-16 encoding scheme may or may not begin with 173 * a BOM. However, when there is no BOM, and in the 174 * absence of a higher-level protocol, the byte order 175 * of the UTF-16 encoding scheme is big-endian. 176 * 177 * If the first two octets of the text is not 0xFE 178 * followed by 0xFF, and is not 0xFF followed by 0xFE, 179 * then the text SHOULD be interpreted as big-endian. 180 */ 181 s -= size; 182 } 183 } 184 #if 1 185 /* Update endian for next sequence */ 186 if (attr_true("renewed", 7)) { 187 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 188 } 189 #endif 190 } 191 192 if (temp_result) { 193 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 194 } else { 195 /* Preallocate the buffer to the minimum possible space required. */ 196 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 197 } 198 resultbuf = (U8 *) SvGROW(result, resultbuflen); 199 200 while (s < e && s+size <= e) { 201 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 202 U8 *d; 203 if (issurrogate(ord)) { 204 if (ucs2 == -1) { 205 ucs2 = attr_true("ucs2", 4); 206 } 207 if (ucs2 || size == 4) { 208 if (check) { 209 croak("%"SVf":no surrogates allowed %"UVxf, 210 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 211 ord); 212 } 213 ord = FBCHAR; 214 } 215 else { 216 UV lo; 217 if (!isHiSurrogate(ord)) { 218 if (check) { 219 croak("%"SVf":Malformed HI surrogate %"UVxf, 220 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 221 ord); 222 } 223 else { 224 ord = FBCHAR; 225 } 226 } 227 else if (s+size > e) { 228 if (check) { 229 if (check & ENCODE_STOP_AT_PARTIAL) { 230 s -= size; 231 break; 232 } 233 else { 234 croak("%"SVf":Malformed HI surrogate %"UVxf, 235 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 236 ord); 237 } 238 } 239 else { 240 ord = FBCHAR; 241 } 242 } 243 else { 244 lo = enc_unpack(aTHX_ &s,e,size,endian); 245 if (!isLoSurrogate(lo)) { 246 if (check) { 247 croak("%"SVf":Malformed LO surrogate %"UVxf, 248 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 249 ord); 250 } 251 else { 252 s -= size; 253 ord = FBCHAR; 254 } 255 } 256 else { 257 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 258 } 259 } 260 } 261 } 262 263 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 264 if (check) { 265 croak("%"SVf":Unicode character %"UVxf" is illegal", 266 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 267 ord); 268 } else { 269 ord = FBCHAR; 270 } 271 } 272 273 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 274 /* Do not allocate >8Mb more than the minimum needed. 275 This prevents allocating too much in the rogue case of a large 276 input consisting initially of long sequence uft8-byte unicode 277 chars followed by single utf8-byte chars. */ 278 /* +1 279 fixes Unicode.xs!decode_xs n-byte heap-overflow 280 */ 281 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 282 STRLEN max_alloc = remaining + (8*1024*1024); 283 STRLEN est_alloc = remaining * UTF8_MAXLEN; 284 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 285 (est_alloc > max_alloc ? max_alloc : est_alloc); 286 resultbuf = (U8 *) SvGROW(result, newlen); 287 resultbuflen = SvLEN(result); 288 } 289 290 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 291 UNICODE_WARN_ILLEGAL_INTERCHANGE); 292 SvCUR_set(result, d - (U8 *)SvPVX(result)); 293 } 294 295 if (s < e) { 296 /* unlikely to happen because it's fixed-length -- dankogai */ 297 if (check & ENCODE_WARN_ON_ERR) { 298 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", 299 *hv_fetch((HV *)SvRV(obj),"Name",4,0)); 300 } 301 } 302 if (check && !(check & ENCODE_LEAVE_SRC)) { 303 if (s < e) { 304 Move(s,SvPVX(str),e-s,U8); 305 SvCUR_set(str,(e-s)); 306 } 307 else { 308 SvCUR_set(str,0); 309 } 310 *SvEND(str) = '\0'; 311 } 312 313 if (!temp_result) shrink_buffer(result); 314 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */ 315 XSRETURN(1); 316 } 317 318 void 319 encode_xs(obj, utf8, check = 0) 320 SV * obj 321 SV * utf8 322 IV check 323 CODE: 324 { 325 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 326 const int size = SvIV(attr("size", 4)); 327 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 328 const STRLEN usize = (size > 0 ? size : 1); 329 SV *result = newSVpvn("", 0); 330 STRLEN ulen; 331 U8 *s = (U8 *) SvPVutf8(utf8, ulen); 332 const U8 *e = (U8 *) SvEND(utf8); 333 /* Optimise for the common case of being called from PerlIOEncode_flush() 334 with a standard length buffer. In this case the result SV's buffer is 335 only used temporarily, so we can afford to allocate the maximum needed 336 and not care about unused space. */ 337 const bool temp_result = (ulen == PERLIO_BUFSIZ); 338 339 ST(0) = sv_2mortal(result); 340 341 /* Preallocate the result buffer to the maximum possible size. 342 ie. assume each UTF8 byte is 1 character. 343 Then shrink the result's buffer if necesary at the end. */ 344 SvGROW(result, ((ulen+1) * usize)); 345 346 if (!endian) { 347 endian = (size == 4) ? 'N' : 'n'; 348 enc_pack(aTHX_ result,size,endian,BOM_BE); 349 #if 1 350 /* Update endian for next sequence */ 351 if (attr_true("renewed", 7)) { 352 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 353 } 354 #endif 355 } 356 while (s < e && s+UTF8SKIP(s) <= e) { 357 STRLEN len; 358 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE 359 |UTF8_WARN_SURROGATE 360 |UTF8_DISALLOW_FE_FF 361 |UTF8_WARN_FE_FF 362 |UTF8_WARN_NONCHAR)); 363 s += len; 364 if (size != 4 && invalid_ucs2(ord)) { 365 if (!issurrogate(ord)) { 366 if (ucs2 == -1) { 367 ucs2 = attr_true("ucs2", 4); 368 } 369 if (ucs2 || ord > 0x10FFFF) { 370 if (check) { 371 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", 372 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); 373 } 374 enc_pack(aTHX_ result,size,endian,FBCHAR); 375 } else { 376 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 377 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 378 enc_pack(aTHX_ result,size,endian,hi); 379 enc_pack(aTHX_ result,size,endian,lo); 380 } 381 } 382 else { 383 /* not supposed to happen */ 384 enc_pack(aTHX_ result,size,endian,FBCHAR); 385 } 386 } 387 else { 388 enc_pack(aTHX_ result,size,endian,ord); 389 } 390 } 391 if (s < e) { 392 /* UTF-8 partial char happens often on PerlIO. 393 Since this is okay and normal, we do not warn. 394 But this is critical when you choose to LEAVE_SRC 395 in which case we die */ 396 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 397 Perl_croak(aTHX_ "%"SVf":partial character is not allowed " 398 "when CHECK = 0x%" UVuf, 399 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); 400 } 401 } 402 if (check && !(check & ENCODE_LEAVE_SRC)) { 403 if (s < e) { 404 Move(s,SvPVX(utf8),e-s,U8); 405 SvCUR_set(utf8,(e-s)); 406 } 407 else { 408 SvCUR_set(utf8,0); 409 } 410 *SvEND(utf8) = '\0'; 411 } 412 413 if (!temp_result) shrink_buffer(result); 414 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */ 415 416 SvSETMAGIC(utf8); 417 418 XSRETURN(1); 419 } 420