1 /* 2 $Id: Unicode.xs,v 2.9 2012/08/05 23:08:49 dankogai Exp $ 3 */ 4 5 #define PERL_NO_GET_CONTEXT 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 #define U8 U8 10 #include "../Encode/encode.h" 11 12 #define FBCHAR 0xFFFd 13 #define BOM_BE 0xFeFF 14 #define BOM16LE 0xFFFe 15 #define BOM32LE 0xFFFe0000 16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 20 21 /* For pre-5.14 source compatibility */ 22 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE 23 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0 24 # define UTF8_DISALLOW_SURROGATE 0 25 # define UTF8_WARN_SURROGATE 0 26 # define UTF8_DISALLOW_FE_FF 0 27 # define UTF8_WARN_FE_FF 0 28 # define UTF8_WARN_NONCHAR 0 29 #endif 30 31 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 32 33 /* Avoid wasting too much space in the result buffer */ 34 /* static void */ 35 /* shrink_buffer(SV *result) */ 36 /* { */ 37 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 38 /* char *buf; */ 39 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 40 /* New(0, buf, len, char); */ 41 /* Copy(SvPVX(result), buf, len, char); */ 42 /* Safefree(SvPVX(result)); */ 43 /* SvPV_set(result, buf); */ 44 /* SvLEN_set(result, len); */ 45 /* } */ 46 /* } */ 47 48 #define shrink_buffer(result) { \ 49 if (SvLEN(result) > 42 + SvCUR(result)) { \ 50 char *newpv; \ 51 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 52 New(0, newpv, newlen, char); \ 53 Copy(SvPVX(result), newpv, newlen, char); \ 54 Safefree(SvPVX(result)); \ 55 SvPV_set(result, newpv); \ 56 SvLEN_set(result, newlen); \ 57 } \ 58 } 59 60 static UV 61 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 62 { 63 U8 *s = *sp; 64 UV v = 0; 65 if (s+size > e) { 66 croak("Partial character %c",(char) endian); 67 } 68 switch(endian) { 69 case 'N': 70 v = *s++; 71 v = (v << 8) | *s++; 72 case 'n': 73 v = (v << 8) | *s++; 74 v = (v << 8) | *s++; 75 break; 76 case 'V': 77 case 'v': 78 v |= *s++; 79 v |= (*s++ << 8); 80 if (endian == 'v') 81 break; 82 v |= (*s++ << 16); 83 v |= (*s++ << 24); 84 break; 85 default: 86 croak("Unknown endian %c",(char) endian); 87 break; 88 } 89 *sp = s; 90 return v; 91 } 92 93 void 94 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 95 { 96 U8 *d = (U8 *) SvPV_nolen(result); 97 98 switch(endian) { 99 case 'v': 100 case 'V': 101 d += SvCUR(result); 102 SvCUR_set(result,SvCUR(result)+size); 103 while (size--) { 104 *d++ = (U8)(value & 0xFF); 105 value >>= 8; 106 } 107 break; 108 case 'n': 109 case 'N': 110 SvCUR_set(result,SvCUR(result)+size); 111 d += SvCUR(result); 112 while (size--) { 113 *--d = (U8)(value & 0xFF); 114 value >>= 8; 115 } 116 break; 117 default: 118 croak("Unknown endian %c",(char) endian); 119 break; 120 } 121 } 122 123 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 124 125 PROTOTYPES: DISABLE 126 127 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 128 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) 129 130 void 131 decode_xs(obj, str, check = 0) 132 SV * obj 133 SV * str 134 IV check 135 CODE: 136 { 137 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 138 int size = SvIV(attr("size", 4)); 139 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 140 SV *result = newSVpvn("",0); 141 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 142 STRLEN ulen; 143 STRLEN resultbuflen; 144 U8 *resultbuf; 145 U8 *s = (U8 *)SvPVbyte(str,ulen); 146 U8 *e = (U8 *)SvEND(str); 147 /* Optimise for the common case of being called from PerlIOEncode_fill() 148 with a standard length buffer. In this case the result SV's buffer is 149 only used temporarily, so we can afford to allocate the maximum needed 150 and not care about unused space. */ 151 const bool temp_result = (ulen == PERLIO_BUFSIZ); 152 153 ST(0) = sv_2mortal(result); 154 SvUTF8_on(result); 155 156 if (!endian && s+size <= e) { 157 UV bom; 158 endian = (size == 4) ? 'N' : 'n'; 159 bom = enc_unpack(aTHX_ &s,e,size,endian); 160 if (bom != BOM_BE) { 161 if (bom == BOM16LE) { 162 endian = 'v'; 163 } 164 else if (bom == BOM32LE) { 165 endian = 'V'; 166 } 167 else { 168 croak("%"SVf":Unrecognised BOM %"UVxf, 169 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 170 bom); 171 } 172 } 173 #if 1 174 /* Update endian for next sequence */ 175 if (SvTRUE(attr("renewed", 7))) { 176 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 177 } 178 #endif 179 } 180 181 if (temp_result) { 182 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 183 } else { 184 /* Preallocate the buffer to the minimum possible space required. */ 185 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 186 } 187 resultbuf = (U8 *) SvGROW(result, resultbuflen); 188 189 while (s < e && s+size <= e) { 190 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 191 U8 *d; 192 if (issurrogate(ord)) { 193 if (ucs2 == -1) { 194 ucs2 = SvTRUE(attr("ucs2", 4)); 195 } 196 if (ucs2 || size == 4) { 197 if (check) { 198 croak("%"SVf":no surrogates allowed %"UVxf, 199 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 200 ord); 201 } 202 ord = FBCHAR; 203 } 204 else { 205 UV lo; 206 if (!isHiSurrogate(ord)) { 207 if (check) { 208 croak("%"SVf":Malformed HI surrogate %"UVxf, 209 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 210 ord); 211 } 212 else { 213 ord = FBCHAR; 214 } 215 } 216 else if (s+size > e) { 217 if (check) { 218 if (check & ENCODE_STOP_AT_PARTIAL) { 219 s -= size; 220 break; 221 } 222 else { 223 croak("%"SVf":Malformed HI surrogate %"UVxf, 224 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 225 ord); 226 } 227 } 228 else { 229 ord = FBCHAR; 230 } 231 } 232 else { 233 lo = enc_unpack(aTHX_ &s,e,size,endian); 234 if (!isLoSurrogate(lo)) { 235 if (check) { 236 croak("%"SVf":Malformed LO surrogate %"UVxf, 237 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 238 ord); 239 } 240 else { 241 s -= size; 242 ord = FBCHAR; 243 } 244 } 245 else { 246 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 247 } 248 } 249 } 250 } 251 252 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 253 if (check) { 254 croak("%"SVf":Unicode character %"UVxf" is illegal", 255 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 256 ord); 257 } else { 258 ord = FBCHAR; 259 } 260 } 261 262 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 263 /* Do not allocate >8Mb more than the minimum needed. 264 This prevents allocating too much in the rogue case of a large 265 input consisting initially of long sequence uft8-byte unicode 266 chars followed by single utf8-byte chars. */ 267 /* +1 268 fixes Unicode.xs!decode_xs n-byte heap-overflow 269 */ 270 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 271 STRLEN max_alloc = remaining + (8*1024*1024); 272 STRLEN est_alloc = remaining * UTF8_MAXLEN; 273 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 274 (est_alloc > max_alloc ? max_alloc : est_alloc); 275 resultbuf = (U8 *) SvGROW(result, newlen); 276 resultbuflen = SvLEN(result); 277 } 278 279 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 280 UNICODE_WARN_ILLEGAL_INTERCHANGE); 281 SvCUR_set(result, d - (U8 *)SvPVX(result)); 282 } 283 284 if (s < e) { 285 /* unlikely to happen because it's fixed-length -- dankogai */ 286 if (check & ENCODE_WARN_ON_ERR) { 287 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", 288 *hv_fetch((HV *)SvRV(obj),"Name",4,0)); 289 } 290 } 291 if (check && !(check & ENCODE_LEAVE_SRC)) { 292 if (s < e) { 293 Move(s,SvPVX(str),e-s,U8); 294 SvCUR_set(str,(e-s)); 295 } 296 else { 297 SvCUR_set(str,0); 298 } 299 *SvEND(str) = '\0'; 300 } 301 302 if (!temp_result) 303 shrink_buffer(result); 304 305 XSRETURN(1); 306 } 307 308 void 309 encode_xs(obj, utf8, check = 0) 310 SV * obj 311 SV * utf8 312 IV check 313 CODE: 314 { 315 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 316 const int size = SvIV(attr("size", 4)); 317 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 318 const STRLEN usize = (size > 0 ? size : 1); 319 SV *result = newSVpvn("", 0); 320 STRLEN ulen; 321 U8 *s = (U8 *) SvPVutf8(utf8, ulen); 322 const U8 *e = (U8 *) SvEND(utf8); 323 /* Optimise for the common case of being called from PerlIOEncode_flush() 324 with a standard length buffer. In this case the result SV's buffer is 325 only used temporarily, so we can afford to allocate the maximum needed 326 and not care about unused space. */ 327 const bool temp_result = (ulen == PERLIO_BUFSIZ); 328 329 ST(0) = sv_2mortal(result); 330 331 /* Preallocate the result buffer to the maximum possible size. 332 ie. assume each UTF8 byte is 1 character. 333 Then shrink the result's buffer if necesary at the end. */ 334 SvGROW(result, ((ulen+1) * usize)); 335 336 if (!endian) { 337 endian = (size == 4) ? 'N' : 'n'; 338 enc_pack(aTHX_ result,size,endian,BOM_BE); 339 #if 1 340 /* Update endian for next sequence */ 341 if (SvTRUE(attr("renewed", 7))) { 342 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 343 } 344 #endif 345 } 346 while (s < e && s+UTF8SKIP(s) <= e) { 347 STRLEN len; 348 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE 349 |UTF8_WARN_SURROGATE 350 |UTF8_DISALLOW_FE_FF 351 |UTF8_WARN_FE_FF 352 |UTF8_WARN_NONCHAR)); 353 s += len; 354 if (size != 4 && invalid_ucs2(ord)) { 355 if (!issurrogate(ord)) { 356 if (ucs2 == -1) { 357 ucs2 = SvTRUE(attr("ucs2", 4)); 358 } 359 if (ucs2 || ord > 0x10FFFF) { 360 if (check) { 361 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", 362 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); 363 } 364 enc_pack(aTHX_ result,size,endian,FBCHAR); 365 } else { 366 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 367 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 368 enc_pack(aTHX_ result,size,endian,hi); 369 enc_pack(aTHX_ result,size,endian,lo); 370 } 371 } 372 else { 373 /* not supposed to happen */ 374 enc_pack(aTHX_ result,size,endian,FBCHAR); 375 } 376 } 377 else { 378 enc_pack(aTHX_ result,size,endian,ord); 379 } 380 } 381 if (s < e) { 382 /* UTF-8 partial char happens often on PerlIO. 383 Since this is okay and normal, we do not warn. 384 But this is critical when you choose to LEAVE_SRC 385 in which case we die */ 386 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 387 Perl_croak(aTHX_ "%"SVf":partial character is not allowed " 388 "when CHECK = 0x%" UVuf, 389 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); 390 } 391 } 392 if (check && !(check & ENCODE_LEAVE_SRC)) { 393 if (s < e) { 394 Move(s,SvPVX(utf8),e-s,U8); 395 SvCUR_set(utf8,(e-s)); 396 } 397 else { 398 SvCUR_set(utf8,0); 399 } 400 *SvEND(utf8) = '\0'; 401 } 402 403 if (!temp_result) 404 shrink_buffer(result); 405 406 SvSETMAGIC(utf8); 407 408 XSRETURN(1); 409 } 410