1 /* 2 $Id: Unicode.xs,v 2.19 2019/01/21 03:09:59 dankogai Exp $ 3 */ 4 5 #define IN_UNICODE_XS 6 7 #define PERL_NO_GET_CONTEXT 8 #include "EXTERN.h" 9 #include "perl.h" 10 #include "XSUB.h" 11 #include "../Encode/encode.h" 12 13 #define FBCHAR 0xFFFd 14 #define BOM_BE 0xFeFF 15 #define BOM16LE 0xFFFe 16 #define BOM32LE 0xFFFe0000 17 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 18 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 19 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 20 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 21 22 #ifndef SVfARG 23 #define SVfARG(p) ((void*)(p)) 24 #endif 25 26 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 27 28 /* Avoid wasting too much space in the result buffer */ 29 /* static void */ 30 /* shrink_buffer(SV *result) */ 31 /* { */ 32 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 33 /* char *buf; */ 34 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 35 /* New(0, buf, len, char); */ 36 /* Copy(SvPVX(result), buf, len, char); */ 37 /* Safefree(SvPVX(result)); */ 38 /* SvPV_set(result, buf); */ 39 /* SvLEN_set(result, len); */ 40 /* } */ 41 /* } */ 42 43 #define shrink_buffer(result) { \ 44 if (SvLEN(result) > 42 + SvCUR(result)) { \ 45 char *newpv; \ 46 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 47 New(0, newpv, newlen, char); \ 48 Copy(SvPVX(result), newpv, newlen, char); \ 49 Safefree(SvPVX(result)); \ 50 SvPV_set(result, newpv); \ 51 SvLEN_set(result, newlen); \ 52 } \ 53 } 54 55 static UV 56 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 57 { 58 U8 *s = *sp; 59 UV v = 0; 60 if (s+size > e) { 61 croak("Partial character %c",(char) endian); 62 } 63 switch(endian) { 64 case 'N': 65 v = *s++; 66 v = (v << 8) | *s++; 67 /* FALLTHROUGH */ 68 case 'n': 69 v = (v << 8) | *s++; 70 v = (v << 8) | *s++; 71 break; 72 case 'V': 73 case 'v': 74 v |= *s++; 75 v |= (*s++ << 8); 76 if (endian == 'v') 77 break; 78 v |= (*s++ << 16); 79 v |= ((UV)*s++ << 24); 80 break; 81 default: 82 croak("Unknown endian %c",(char) endian); 83 break; 84 } 85 *sp = s; 86 return v; 87 } 88 89 static void 90 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 91 { 92 U8 *d = (U8 *) SvPV_nolen(result); 93 94 switch(endian) { 95 case 'v': 96 case 'V': 97 d += SvCUR(result); 98 SvCUR_set(result,SvCUR(result)+size); 99 while (size--) { 100 *d++ = (U8)(value & 0xFF); 101 value >>= 8; 102 } 103 break; 104 case 'n': 105 case 'N': 106 SvCUR_set(result,SvCUR(result)+size); 107 d += SvCUR(result); 108 while (size--) { 109 *--d = (U8)(value & 0xFF); 110 value >>= 8; 111 } 112 break; 113 default: 114 croak("Unknown endian %c",(char) endian); 115 break; 116 } 117 } 118 119 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 120 121 PROTOTYPES: DISABLE 122 123 #define attr(k) (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \ 124 *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef) 125 126 void 127 decode(obj, str, check = 0) 128 SV * obj 129 SV * str 130 IV check 131 CODE: 132 { 133 SV *name = attr("Name"); 134 SV *sve = attr("endian"); 135 U8 endian = *((U8 *)SvPV_nolen(sve)); 136 SV *svs = attr("size"); 137 int size = SvIV(svs); 138 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 139 SV *result = newSVpvn("",0); 140 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 141 STRLEN ulen; 142 STRLEN resultbuflen; 143 U8 *resultbuf; 144 U8 *s; 145 U8 *e; 146 bool modify = (check && !(check & ENCODE_LEAVE_SRC)); 147 bool temp_result; 148 149 SvGETMAGIC(str); 150 if (!SvOK(str)) 151 XSRETURN_UNDEF; 152 s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen); 153 if (SvUTF8(str)) { 154 if (!modify) { 155 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); 156 SvUTF8_on(tmp); 157 if (SvTAINTED(str)) 158 SvTAINTED_on(tmp); 159 str = tmp; 160 s = (U8 *)SvPVX(str); 161 } 162 if (ulen) { 163 if (!utf8_to_bytes(s, &ulen)) 164 croak("Wide character"); 165 SvCUR_set(str, ulen); 166 } 167 SvUTF8_off(str); 168 } 169 e = s+ulen; 170 171 /* Optimise for the common case of being called from PerlIOEncode_fill() 172 with a standard length buffer. In this case the result SV's buffer is 173 only used temporarily, so we can afford to allocate the maximum needed 174 and not care about unused space. */ 175 temp_result = (ulen == PERLIO_BUFSIZ); 176 177 ST(0) = sv_2mortal(result); 178 SvUTF8_on(result); 179 180 if (!endian && s+size <= e) { 181 SV *sv; 182 UV bom; 183 endian = (size == 4) ? 'N' : 'n'; 184 bom = enc_unpack(aTHX_ &s,e,size,endian); 185 if (bom != BOM_BE) { 186 if (bom == BOM16LE) { 187 endian = 'v'; 188 } 189 else if (bom == BOM32LE) { 190 endian = 'V'; 191 } 192 else { 193 /* No BOM found, use big-endian fallback as specified in 194 * RFC2781 and the Unicode Standard version 8.0: 195 * 196 * The UTF-16 encoding scheme may or may not begin with 197 * a BOM. However, when there is no BOM, and in the 198 * absence of a higher-level protocol, the byte order 199 * of the UTF-16 encoding scheme is big-endian. 200 * 201 * If the first two octets of the text is not 0xFE 202 * followed by 0xFF, and is not 0xFF followed by 0xFE, 203 * then the text SHOULD be interpreted as big-endian. 204 */ 205 s -= size; 206 } 207 } 208 #if 1 209 /* Update endian for next sequence */ 210 sv = attr("renewed"); 211 if (SvTRUE(sv)) { 212 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 213 } 214 #endif 215 } 216 217 if (temp_result) { 218 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 219 } else { 220 /* Preallocate the buffer to the minimum possible space required. */ 221 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 222 } 223 resultbuf = (U8 *) SvGROW(result, resultbuflen); 224 225 while (s < e && s+size <= e) { 226 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 227 U8 *d; 228 HV *hv = NULL; 229 if (issurrogate(ord)) { 230 if (ucs2 == -1) { 231 SV *sv = attr("ucs2"); 232 ucs2 = SvTRUE(sv); 233 } 234 if (ucs2 || size == 4) { 235 if (check & ENCODE_DIE_ON_ERR) { 236 croak("%" SVf ":no surrogates allowed %" UVxf, 237 SVfARG(name), ord); 238 } 239 if (encode_ckWARN(check, WARN_SURROGATE)) { 240 warner(packWARN(WARN_SURROGATE), 241 "%" SVf ":no surrogates allowed %" UVxf, 242 SVfARG(name), ord); 243 } 244 ord = FBCHAR; 245 } 246 else { 247 UV lo; 248 if (!isHiSurrogate(ord)) { 249 if (check & ENCODE_DIE_ON_ERR) { 250 croak("%" SVf ":Malformed HI surrogate %" UVxf, 251 SVfARG(name), ord); 252 } 253 if (encode_ckWARN(check, WARN_SURROGATE)) { 254 warner(packWARN(WARN_SURROGATE), 255 "%" SVf ":Malformed HI surrogate %" UVxf, 256 SVfARG(name), ord); 257 } 258 ord = FBCHAR; 259 } 260 else if (s+size > e) { 261 if (check & ENCODE_STOP_AT_PARTIAL) { 262 s -= size; 263 break; 264 } 265 if (check & ENCODE_DIE_ON_ERR) { 266 croak("%" SVf ":Malformed HI surrogate %" UVxf, 267 SVfARG(name), ord); 268 } 269 if (encode_ckWARN(check, WARN_SURROGATE)) { 270 warner(packWARN(WARN_SURROGATE), 271 "%" SVf ":Malformed HI surrogate %" UVxf, 272 SVfARG(name), ord); 273 } 274 ord = FBCHAR; 275 } 276 else { 277 lo = enc_unpack(aTHX_ &s,e,size,endian); 278 if (!isLoSurrogate(lo)) { 279 if (check & ENCODE_DIE_ON_ERR) { 280 croak("%" SVf ":Malformed LO surrogate %" UVxf, 281 SVfARG(name), ord); 282 } 283 if (encode_ckWARN(check, WARN_SURROGATE)) { 284 warner(packWARN(WARN_SURROGATE), 285 "%" SVf ":Malformed LO surrogate %" UVxf, 286 SVfARG(name), ord); 287 } 288 s -= size; 289 ord = FBCHAR; 290 } 291 else { 292 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 293 } 294 } 295 } 296 } 297 298 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 299 if (check & ENCODE_DIE_ON_ERR) { 300 croak("%" SVf ":Unicode character %" UVxf " is illegal", 301 SVfARG(name), ord); 302 } 303 if (encode_ckWARN(check, WARN_NONCHAR)) { 304 warner(packWARN(WARN_NONCHAR), 305 "%" SVf ":Unicode character %" UVxf " is illegal", 306 SVfARG(name), ord); 307 } 308 ord = FBCHAR; 309 } 310 311 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 312 /* Do not allocate >8Mb more than the minimum needed. 313 This prevents allocating too much in the rogue case of a large 314 input consisting initially of long sequence uft8-byte unicode 315 chars followed by single utf8-byte chars. */ 316 /* +1 317 fixes Unicode.xs!decode_xs n-byte heap-overflow 318 */ 319 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 320 STRLEN max_alloc = remaining + (8*1024*1024); 321 STRLEN est_alloc = remaining * UTF8_MAXLEN; 322 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 323 (est_alloc > max_alloc ? max_alloc : est_alloc); 324 resultbuf = (U8 *) SvGROW(result, newlen); 325 resultbuflen = SvLEN(result); 326 } 327 328 d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv); 329 if (hv) { 330 SV *message = *hv_fetch(hv, "text", 4, 0); 331 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0)); 332 sv_2mortal((SV *)hv); 333 if (check & ENCODE_DIE_ON_ERR) 334 croak("%" SVf, SVfARG(message)); 335 if (encode_ckWARN_packed(check, categories)) 336 warner(categories, "%" SVf, SVfARG(message)); 337 d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0); 338 } 339 340 SvCUR_set(result, d - (U8 *)SvPVX(result)); 341 } 342 343 if (s < e) { 344 /* unlikely to happen because it's fixed-length -- dankogai */ 345 if (check & ENCODE_DIE_ON_ERR) 346 croak("%" SVf ":Partial character", SVfARG(name)); 347 if (encode_ckWARN(check, WARN_UTF8)) { 348 warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name)); 349 } 350 } 351 if (check && !(check & ENCODE_LEAVE_SRC)) { 352 if (s < e) { 353 Move(s,SvPVX(str),e-s,U8); 354 SvCUR_set(str,(e-s)); 355 } 356 else { 357 SvCUR_set(str,0); 358 } 359 *SvEND(str) = '\0'; 360 SvSETMAGIC(str); 361 } 362 363 if (!temp_result) shrink_buffer(result); 364 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */ 365 XSRETURN(1); 366 } 367 368 void 369 encode(obj, utf8, check = 0) 370 SV * obj 371 SV * utf8 372 IV check 373 CODE: 374 { 375 SV *name = attr("Name"); 376 SV *sve = attr("endian"); 377 U8 endian = *((U8 *)SvPV_nolen(sve)); 378 SV *svs = attr("size"); 379 const int size = SvIV(svs); 380 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 381 const STRLEN usize = (size > 0 ? size : 1); 382 SV *result = newSVpvn("", 0); 383 STRLEN ulen; 384 U8 *s; 385 U8 *e; 386 bool modify = (check && !(check & ENCODE_LEAVE_SRC)); 387 bool temp_result; 388 389 SvGETMAGIC(utf8); 390 if (!SvOK(utf8)) 391 XSRETURN_UNDEF; 392 s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen); 393 if (!SvUTF8(utf8)) { 394 if (!modify) { 395 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); 396 if (SvTAINTED(utf8)) 397 SvTAINTED_on(tmp); 398 utf8 = tmp; 399 } 400 sv_utf8_upgrade_nomg(utf8); 401 s = (U8 *)SvPV_nomg(utf8, ulen); 402 } 403 e = s+ulen; 404 405 /* Optimise for the common case of being called from PerlIOEncode_flush() 406 with a standard length buffer. In this case the result SV's buffer is 407 only used temporarily, so we can afford to allocate the maximum needed 408 and not care about unused space. */ 409 temp_result = (ulen == PERLIO_BUFSIZ); 410 411 ST(0) = sv_2mortal(result); 412 413 /* Preallocate the result buffer to the maximum possible size. 414 ie. assume each UTF8 byte is 1 character. 415 Then shrink the result's buffer if necesary at the end. */ 416 SvGROW(result, ((ulen+1) * usize)); 417 418 if (!endian) { 419 SV *sv; 420 endian = (size == 4) ? 'N' : 'n'; 421 enc_pack(aTHX_ result,size,endian,BOM_BE); 422 #if 1 423 /* Update endian for next sequence */ 424 sv = attr("renewed"); 425 if (SvTRUE(sv)) { 426 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 427 } 428 #endif 429 } 430 while (s < e && s+UTF8SKIP(s) <= e) { 431 STRLEN len; 432 AV *msgs = NULL; 433 UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs); 434 if (msgs) { 435 SSize_t i; 436 SSize_t len = av_len(msgs)+1; 437 sv_2mortal((SV *)msgs); 438 for (i = 0; i < len; ++i) { 439 SV *sv = *av_fetch(msgs, i, 0); 440 HV *hv = (HV *)SvRV(sv); 441 SV *message = *hv_fetch(hv, "text", 4, 0); 442 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0)); 443 if (check & ENCODE_DIE_ON_ERR) 444 croak("%" SVf, SVfARG(message)); 445 if (encode_ckWARN_packed(check, categories)) 446 warner(categories, "%" SVf, SVfARG(message)); 447 } 448 } 449 if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) { 450 if (!issurrogate(ord)) { 451 if (ucs2 == -1) { 452 SV *sv = attr("ucs2"); 453 ucs2 = SvTRUE(sv); 454 } 455 if (ucs2 || ord > 0x10FFFF) { 456 if (check & ENCODE_DIE_ON_ERR) { 457 croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high", 458 SVfARG(name),ord); 459 } 460 if (encode_ckWARN(check, WARN_NON_UNICODE)) { 461 warner(packWARN(WARN_NON_UNICODE), 462 "%" SVf ":code point \"\\x{%" UVxf "}\" too high", 463 SVfARG(name),ord); 464 } 465 enc_pack(aTHX_ result,size,endian,FBCHAR); 466 } else if (ord == 0) { 467 enc_pack(aTHX_ result,size,endian,FBCHAR); 468 } else { 469 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 470 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 471 enc_pack(aTHX_ result,size,endian,hi); 472 enc_pack(aTHX_ result,size,endian,lo); 473 } 474 } 475 else { 476 /* not supposed to happen */ 477 enc_pack(aTHX_ result,size,endian,FBCHAR); 478 } 479 } 480 else { 481 enc_pack(aTHX_ result,size,endian,ord); 482 } 483 s += len; 484 } 485 if (s < e) { 486 /* UTF-8 partial char happens often on PerlIO. 487 Since this is okay and normal, we do not warn. 488 But this is critical when you choose to LEAVE_SRC 489 in which case we die */ 490 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 491 Perl_croak(aTHX_ "%" SVf ":partial character is not allowed " 492 "when CHECK = 0x%" UVuf, 493 SVfARG(name), check); 494 } 495 } 496 if (check && !(check & ENCODE_LEAVE_SRC)) { 497 if (s < e) { 498 Move(s,SvPVX(utf8),e-s,U8); 499 SvCUR_set(utf8,(e-s)); 500 } 501 else { 502 SvCUR_set(utf8,0); 503 } 504 *SvEND(utf8) = '\0'; 505 SvSETMAGIC(utf8); 506 } 507 508 if (!temp_result) shrink_buffer(result); 509 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */ 510 511 XSRETURN(1); 512 } 513