1 /* 2 $Id: Unicode.xs,v 2.17 2018/02/08 00:26:15 dankogai Exp $ 3 */ 4 5 #define PERL_NO_GET_CONTEXT 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 #include "../Encode/encode.h" 10 11 #define FBCHAR 0xFFFd 12 #define BOM_BE 0xFeFF 13 #define BOM16LE 0xFFFe 14 #define BOM32LE 0xFFFe0000 15 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 16 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 17 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 18 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 19 20 /* For pre-5.14 source compatibility */ 21 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE 22 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0 23 # define UTF8_DISALLOW_SURROGATE 0 24 # define UTF8_WARN_SURROGATE 0 25 # define UTF8_DISALLOW_FE_FF 0 26 # define UTF8_WARN_FE_FF 0 27 # define UTF8_WARN_NONCHAR 0 28 #endif 29 30 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 31 32 /* Avoid wasting too much space in the result buffer */ 33 /* static void */ 34 /* shrink_buffer(SV *result) */ 35 /* { */ 36 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 37 /* char *buf; */ 38 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 39 /* New(0, buf, len, char); */ 40 /* Copy(SvPVX(result), buf, len, char); */ 41 /* Safefree(SvPVX(result)); */ 42 /* SvPV_set(result, buf); */ 43 /* SvLEN_set(result, len); */ 44 /* } */ 45 /* } */ 46 47 #define shrink_buffer(result) { \ 48 if (SvLEN(result) > 42 + SvCUR(result)) { \ 49 char *newpv; \ 50 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 51 New(0, newpv, newlen, char); \ 52 Copy(SvPVX(result), newpv, newlen, char); \ 53 Safefree(SvPVX(result)); \ 54 SvPV_set(result, newpv); \ 55 SvLEN_set(result, newlen); \ 56 } \ 57 } 58 59 static UV 60 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 61 { 62 U8 *s = *sp; 63 UV v = 0; 64 if (s+size > e) { 65 croak("Partial character %c",(char) endian); 66 } 67 switch(endian) { 68 case 'N': 69 v = *s++; 70 v = (v << 8) | *s++; 71 case 'n': 72 v = (v << 8) | *s++; 73 v = (v << 8) | *s++; 74 break; 75 case 'V': 76 case 'v': 77 v |= *s++; 78 v |= (*s++ << 8); 79 if (endian == 'v') 80 break; 81 v |= (*s++ << 16); 82 v |= ((UV)*s++ << 24); 83 break; 84 default: 85 croak("Unknown endian %c",(char) endian); 86 break; 87 } 88 *sp = s; 89 return v; 90 } 91 92 static void 93 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 94 { 95 U8 *d = (U8 *) SvPV_nolen(result); 96 97 switch(endian) { 98 case 'v': 99 case 'V': 100 d += SvCUR(result); 101 SvCUR_set(result,SvCUR(result)+size); 102 while (size--) { 103 *d++ = (U8)(value & 0xFF); 104 value >>= 8; 105 } 106 break; 107 case 'n': 108 case 'N': 109 SvCUR_set(result,SvCUR(result)+size); 110 d += SvCUR(result); 111 while (size--) { 112 *--d = (U8)(value & 0xFF); 113 value >>= 8; 114 } 115 break; 116 default: 117 croak("Unknown endian %c",(char) endian); 118 break; 119 } 120 } 121 122 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 123 124 PROTOTYPES: DISABLE 125 126 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 127 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) 128 129 void 130 decode(obj, str, check = 0) 131 SV * obj 132 SV * str 133 IV check 134 CODE: 135 { 136 SV *sve = attr("endian", 6); 137 U8 endian = *((U8 *)SvPV_nolen(sve)); 138 SV *svs = attr("size", 4); 139 int size = SvIV(svs); 140 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 141 SV *result = newSVpvn("",0); 142 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 143 STRLEN ulen; 144 STRLEN resultbuflen; 145 U8 *resultbuf; 146 U8 *s; 147 U8 *e; 148 bool modify = (check && !(check & ENCODE_LEAVE_SRC)); 149 bool temp_result; 150 151 SvGETMAGIC(str); 152 if (!SvOK(str)) 153 XSRETURN_UNDEF; 154 s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen); 155 if (SvUTF8(str)) { 156 if (!modify) { 157 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); 158 SvUTF8_on(tmp); 159 if (SvTAINTED(str)) 160 SvTAINTED_on(tmp); 161 str = tmp; 162 s = (U8 *)SvPVX(str); 163 } 164 if (ulen) { 165 if (!utf8_to_bytes(s, &ulen)) 166 croak("Wide character"); 167 SvCUR_set(str, ulen); 168 } 169 SvUTF8_off(str); 170 } 171 e = s+ulen; 172 173 /* Optimise for the common case of being called from PerlIOEncode_fill() 174 with a standard length buffer. In this case the result SV's buffer is 175 only used temporarily, so we can afford to allocate the maximum needed 176 and not care about unused space. */ 177 temp_result = (ulen == PERLIO_BUFSIZ); 178 179 ST(0) = sv_2mortal(result); 180 SvUTF8_on(result); 181 182 if (!endian && s+size <= e) { 183 SV *sv; 184 UV bom; 185 endian = (size == 4) ? 'N' : 'n'; 186 bom = enc_unpack(aTHX_ &s,e,size,endian); 187 if (bom != BOM_BE) { 188 if (bom == BOM16LE) { 189 endian = 'v'; 190 } 191 else if (bom == BOM32LE) { 192 endian = 'V'; 193 } 194 else { 195 /* No BOM found, use big-endian fallback as specified in 196 * RFC2781 and the Unicode Standard version 8.0: 197 * 198 * The UTF-16 encoding scheme may or may not begin with 199 * a BOM. However, when there is no BOM, and in the 200 * absence of a higher-level protocol, the byte order 201 * of the UTF-16 encoding scheme is big-endian. 202 * 203 * If the first two octets of the text is not 0xFE 204 * followed by 0xFF, and is not 0xFF followed by 0xFE, 205 * then the text SHOULD be interpreted as big-endian. 206 */ 207 s -= size; 208 } 209 } 210 #if 1 211 /* Update endian for next sequence */ 212 sv = attr("renewed", 7); 213 if (SvTRUE(sv)) { 214 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 215 } 216 #endif 217 } 218 219 if (temp_result) { 220 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 221 } else { 222 /* Preallocate the buffer to the minimum possible space required. */ 223 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 224 } 225 resultbuf = (U8 *) SvGROW(result, resultbuflen); 226 227 while (s < e && s+size <= e) { 228 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 229 U8 *d; 230 if (issurrogate(ord)) { 231 if (ucs2 == -1) { 232 SV *sv = attr("ucs2", 4); 233 ucs2 = SvTRUE(sv); 234 } 235 if (ucs2 || size == 4) { 236 if (check) { 237 croak("%" SVf ":no surrogates allowed %" UVxf, 238 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 239 ord); 240 } 241 ord = FBCHAR; 242 } 243 else { 244 UV lo; 245 if (!isHiSurrogate(ord)) { 246 if (check) { 247 croak("%" SVf ":Malformed HI surrogate %" UVxf, 248 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 249 ord); 250 } 251 else { 252 ord = FBCHAR; 253 } 254 } 255 else if (s+size > e) { 256 if (check) { 257 if (check & ENCODE_STOP_AT_PARTIAL) { 258 s -= size; 259 break; 260 } 261 else { 262 croak("%" SVf ":Malformed HI surrogate %" UVxf, 263 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 264 ord); 265 } 266 } 267 else { 268 ord = FBCHAR; 269 } 270 } 271 else { 272 lo = enc_unpack(aTHX_ &s,e,size,endian); 273 if (!isLoSurrogate(lo)) { 274 if (check) { 275 croak("%" SVf ":Malformed LO surrogate %" UVxf, 276 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 277 ord); 278 } 279 else { 280 s -= size; 281 ord = FBCHAR; 282 } 283 } 284 else { 285 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 286 } 287 } 288 } 289 } 290 291 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 292 if (check) { 293 croak("%" SVf ":Unicode character %" UVxf " is illegal", 294 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 295 ord); 296 } else { 297 ord = FBCHAR; 298 } 299 } 300 301 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 302 /* Do not allocate >8Mb more than the minimum needed. 303 This prevents allocating too much in the rogue case of a large 304 input consisting initially of long sequence uft8-byte unicode 305 chars followed by single utf8-byte chars. */ 306 /* +1 307 fixes Unicode.xs!decode_xs n-byte heap-overflow 308 */ 309 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 310 STRLEN max_alloc = remaining + (8*1024*1024); 311 STRLEN est_alloc = remaining * UTF8_MAXLEN; 312 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 313 (est_alloc > max_alloc ? max_alloc : est_alloc); 314 resultbuf = (U8 *) SvGROW(result, newlen); 315 resultbuflen = SvLEN(result); 316 } 317 318 d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), ord, 319 UNICODE_WARN_ILLEGAL_INTERCHANGE); 320 SvCUR_set(result, d - (U8 *)SvPVX(result)); 321 } 322 323 if (s < e) { 324 /* unlikely to happen because it's fixed-length -- dankogai */ 325 if (check & ENCODE_WARN_ON_ERR) { 326 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%" SVf ":Partial character", 327 *hv_fetch((HV *)SvRV(obj),"Name",4,0)); 328 } 329 } 330 if (check && !(check & ENCODE_LEAVE_SRC)) { 331 if (s < e) { 332 Move(s,SvPVX(str),e-s,U8); 333 SvCUR_set(str,(e-s)); 334 } 335 else { 336 SvCUR_set(str,0); 337 } 338 *SvEND(str) = '\0'; 339 SvSETMAGIC(str); 340 } 341 342 if (!temp_result) shrink_buffer(result); 343 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */ 344 XSRETURN(1); 345 } 346 347 void 348 encode(obj, utf8, check = 0) 349 SV * obj 350 SV * utf8 351 IV check 352 CODE: 353 { 354 SV *sve = attr("endian", 6); 355 U8 endian = *((U8 *)SvPV_nolen(sve)); 356 SV *svs = attr("size", 4); 357 const int size = SvIV(svs); 358 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 359 const STRLEN usize = (size > 0 ? size : 1); 360 SV *result = newSVpvn("", 0); 361 STRLEN ulen; 362 U8 *s; 363 U8 *e; 364 bool modify = (check && !(check & ENCODE_LEAVE_SRC)); 365 bool temp_result; 366 367 SvGETMAGIC(utf8); 368 if (!SvOK(utf8)) 369 XSRETURN_UNDEF; 370 s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen); 371 if (!SvUTF8(utf8)) { 372 if (!modify) { 373 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); 374 if (SvTAINTED(utf8)) 375 SvTAINTED_on(tmp); 376 utf8 = tmp; 377 } 378 sv_utf8_upgrade_nomg(utf8); 379 s = (U8 *)SvPV_nomg(utf8, ulen); 380 } 381 e = s+ulen; 382 383 /* Optimise for the common case of being called from PerlIOEncode_flush() 384 with a standard length buffer. In this case the result SV's buffer is 385 only used temporarily, so we can afford to allocate the maximum needed 386 and not care about unused space. */ 387 temp_result = (ulen == PERLIO_BUFSIZ); 388 389 ST(0) = sv_2mortal(result); 390 391 /* Preallocate the result buffer to the maximum possible size. 392 ie. assume each UTF8 byte is 1 character. 393 Then shrink the result's buffer if necesary at the end. */ 394 SvGROW(result, ((ulen+1) * usize)); 395 396 if (!endian) { 397 SV *sv; 398 endian = (size == 4) ? 'N' : 'n'; 399 enc_pack(aTHX_ result,size,endian,BOM_BE); 400 #if 1 401 /* Update endian for next sequence */ 402 sv = attr("renewed", 7); 403 if (SvTRUE(sv)) { 404 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 405 } 406 #endif 407 } 408 while (s < e && s+UTF8SKIP(s) <= e) { 409 STRLEN len; 410 UV ord = utf8n_to_uvchr(s, e-s, &len, (UTF8_DISALLOW_SURROGATE 411 |UTF8_WARN_SURROGATE 412 |UTF8_DISALLOW_FE_FF 413 |UTF8_WARN_FE_FF 414 |UTF8_WARN_NONCHAR)); 415 s += len; 416 if (size != 4 && invalid_ucs2(ord)) { 417 if (!issurrogate(ord)) { 418 if (ucs2 == -1) { 419 SV *sv = attr("ucs2", 4); 420 ucs2 = SvTRUE(sv); 421 } 422 if (ucs2 || ord > 0x10FFFF) { 423 if (check) { 424 croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high", 425 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); 426 } 427 enc_pack(aTHX_ result,size,endian,FBCHAR); 428 } else { 429 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 430 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 431 enc_pack(aTHX_ result,size,endian,hi); 432 enc_pack(aTHX_ result,size,endian,lo); 433 } 434 } 435 else { 436 /* not supposed to happen */ 437 enc_pack(aTHX_ result,size,endian,FBCHAR); 438 } 439 } 440 else { 441 enc_pack(aTHX_ result,size,endian,ord); 442 } 443 } 444 if (s < e) { 445 /* UTF-8 partial char happens often on PerlIO. 446 Since this is okay and normal, we do not warn. 447 But this is critical when you choose to LEAVE_SRC 448 in which case we die */ 449 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 450 Perl_croak(aTHX_ "%" SVf ":partial character is not allowed " 451 "when CHECK = 0x%" UVuf, 452 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); 453 } 454 } 455 if (check && !(check & ENCODE_LEAVE_SRC)) { 456 if (s < e) { 457 Move(s,SvPVX(utf8),e-s,U8); 458 SvCUR_set(utf8,(e-s)); 459 } 460 else { 461 SvCUR_set(utf8,0); 462 } 463 *SvEND(utf8) = '\0'; 464 SvSETMAGIC(utf8); 465 } 466 467 if (!temp_result) shrink_buffer(result); 468 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */ 469 470 XSRETURN(1); 471 } 472