1 /* 2 $Id: Unicode.xs,v 2.8 2011/08/09 07:49:44 dankogai Exp dankogai $ 3 */ 4 5 #define PERL_NO_GET_CONTEXT 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 #define U8 U8 10 #include "../Encode/encode.h" 11 12 #define FBCHAR 0xFFFd 13 #define BOM_BE 0xFeFF 14 #define BOM16LE 0xFFFe 15 #define BOM32LE 0xFFFe0000 16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 20 21 /* For pre-5.14 source compatibility */ 22 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE 23 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0 24 # define UTF8_DISALLOW_SURROGATE 0 25 # define UTF8_WARN_SURROGATE 0 26 # define UTF8_DISALLOW_FE_FF 0 27 # define UTF8_WARN_FE_FF 0 28 # define UTF8_WARN_NONCHAR 0 29 #endif 30 31 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 32 33 /* Avoid wasting too much space in the result buffer */ 34 /* static void */ 35 /* shrink_buffer(SV *result) */ 36 /* { */ 37 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 38 /* char *buf; */ 39 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 40 /* New(0, buf, len, char); */ 41 /* Copy(SvPVX(result), buf, len, char); */ 42 /* Safefree(SvPVX(result)); */ 43 /* SvPV_set(result, buf); */ 44 /* SvLEN_set(result, len); */ 45 /* } */ 46 /* } */ 47 48 #define shrink_buffer(result) { \ 49 if (SvLEN(result) > 42 + SvCUR(result)) { \ 50 char *newpv; \ 51 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 52 New(0, newpv, newlen, char); \ 53 Copy(SvPVX(result), newpv, newlen, char); \ 54 Safefree(SvPVX(result)); \ 55 SvPV_set(result, newpv); \ 56 SvLEN_set(result, newlen); \ 57 } \ 58 } 59 60 static UV 61 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 62 { 63 U8 *s = *sp; 64 UV v = 0; 65 if (s+size > e) { 66 croak("Partial character %c",(char) endian); 67 } 68 switch(endian) { 69 case 'N': 70 v = *s++; 71 v = (v << 8) | *s++; 72 case 'n': 73 v = (v << 8) | *s++; 74 v = (v << 8) | *s++; 75 break; 76 case 'V': 77 case 'v': 78 v |= *s++; 79 v |= (*s++ << 8); 80 if (endian == 'v') 81 break; 82 v |= (*s++ << 16); 83 v |= (*s++ << 24); 84 break; 85 default: 86 croak("Unknown endian %c",(char) endian); 87 break; 88 } 89 *sp = s; 90 return v; 91 } 92 93 void 94 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 95 { 96 U8 *d = (U8 *) SvPV_nolen(result); 97 98 switch(endian) { 99 case 'v': 100 case 'V': 101 d += SvCUR(result); 102 SvCUR_set(result,SvCUR(result)+size); 103 while (size--) { 104 *d++ = (U8)(value & 0xFF); 105 value >>= 8; 106 } 107 break; 108 case 'n': 109 case 'N': 110 SvCUR_set(result,SvCUR(result)+size); 111 d += SvCUR(result); 112 while (size--) { 113 *--d = (U8)(value & 0xFF); 114 value >>= 8; 115 } 116 break; 117 default: 118 croak("Unknown endian %c",(char) endian); 119 break; 120 } 121 } 122 123 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 124 125 PROTOTYPES: DISABLE 126 127 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 128 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) 129 130 void 131 decode_xs(obj, str, check = 0) 132 SV * obj 133 SV * str 134 IV check 135 CODE: 136 { 137 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 138 int size = SvIV(attr("size", 4)); 139 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 140 SV *result = newSVpvn("",0); 141 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 142 STRLEN ulen; 143 STRLEN resultbuflen; 144 U8 *resultbuf; 145 U8 *s = (U8 *)SvPVbyte(str,ulen); 146 U8 *e = (U8 *)SvEND(str); 147 /* Optimise for the common case of being called from PerlIOEncode_fill() 148 with a standard length buffer. In this case the result SV's buffer is 149 only used temporarily, so we can afford to allocate the maximum needed 150 and not care about unused space. */ 151 const bool temp_result = (ulen == PERLIO_BUFSIZ); 152 153 ST(0) = sv_2mortal(result); 154 SvUTF8_on(result); 155 156 if (!endian && s+size <= e) { 157 UV bom; 158 endian = (size == 4) ? 'N' : 'n'; 159 bom = enc_unpack(aTHX_ &s,e,size,endian); 160 if (bom != BOM_BE) { 161 if (bom == BOM16LE) { 162 endian = 'v'; 163 } 164 else if (bom == BOM32LE) { 165 endian = 'V'; 166 } 167 else { 168 croak("%"SVf":Unrecognised BOM %"UVxf, 169 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 170 bom); 171 } 172 } 173 #if 1 174 /* Update endian for next sequence */ 175 if (SvTRUE(attr("renewed", 7))) { 176 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 177 } 178 #endif 179 } 180 181 if (temp_result) { 182 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 183 } else { 184 /* Preallocate the buffer to the minimum possible space required. */ 185 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 186 } 187 resultbuf = (U8 *) SvGROW(result, resultbuflen); 188 189 while (s < e && s+size <= e) { 190 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 191 U8 *d; 192 if (issurrogate(ord)) { 193 if (ucs2 == -1) { 194 ucs2 = SvTRUE(attr("ucs2", 4)); 195 } 196 if (ucs2 || size == 4) { 197 if (check) { 198 croak("%"SVf":no surrogates allowed %"UVxf, 199 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 200 ord); 201 } 202 if (s+size <= e) { 203 /* skip the next one as well */ 204 enc_unpack(aTHX_ &s,e,size,endian); 205 } 206 ord = FBCHAR; 207 } 208 else { 209 UV lo; 210 if (!isHiSurrogate(ord)) { 211 if (check) { 212 croak("%"SVf":Malformed HI surrogate %"UVxf, 213 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 214 ord); 215 } 216 else { 217 ord = FBCHAR; 218 } 219 } 220 else { 221 if (s+size > e) { 222 /* Partial character */ 223 s -= size; /* back up to 1st half */ 224 break; /* And exit loop */ 225 } 226 lo = enc_unpack(aTHX_ &s,e,size,endian); 227 if (!isLoSurrogate(lo)) { 228 if (check) { 229 croak("%"SVf":Malformed LO surrogate %"UVxf, 230 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 231 ord); 232 } 233 else { 234 ord = FBCHAR; 235 } 236 } 237 else { 238 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 239 } 240 } 241 } 242 } 243 244 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 245 if (check) { 246 croak("%"SVf":Unicode character %"UVxf" is illegal", 247 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 248 ord); 249 } else { 250 ord = FBCHAR; 251 } 252 } 253 254 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 255 /* Do not allocate >8Mb more than the minimum needed. 256 This prevents allocating too much in the rogue case of a large 257 input consisting initially of long sequence uft8-byte unicode 258 chars followed by single utf8-byte chars. */ 259 /* +1 260 fixes Unicode.xs!decode_xs n-byte heap-overflow 261 */ 262 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 263 STRLEN max_alloc = remaining + (8*1024*1024); 264 STRLEN est_alloc = remaining * UTF8_MAXLEN; 265 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 266 (est_alloc > max_alloc ? max_alloc : est_alloc); 267 resultbuf = (U8 *) SvGROW(result, newlen); 268 resultbuflen = SvLEN(result); 269 } 270 271 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 272 UNICODE_WARN_ILLEGAL_INTERCHANGE); 273 SvCUR_set(result, d - (U8 *)SvPVX(result)); 274 } 275 276 if (s < e) { 277 /* unlikely to happen because it's fixed-length -- dankogai */ 278 if (check & ENCODE_WARN_ON_ERR) { 279 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", 280 *hv_fetch((HV *)SvRV(obj),"Name",4,0)); 281 } 282 } 283 if (check && !(check & ENCODE_LEAVE_SRC)) { 284 if (s < e) { 285 Move(s,SvPVX(str),e-s,U8); 286 SvCUR_set(str,(e-s)); 287 } 288 else { 289 SvCUR_set(str,0); 290 } 291 *SvEND(str) = '\0'; 292 } 293 294 if (!temp_result) 295 shrink_buffer(result); 296 297 XSRETURN(1); 298 } 299 300 void 301 encode_xs(obj, utf8, check = 0) 302 SV * obj 303 SV * utf8 304 IV check 305 CODE: 306 { 307 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 308 const int size = SvIV(attr("size", 4)); 309 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 310 const STRLEN usize = (size > 0 ? size : 1); 311 SV *result = newSVpvn("", 0); 312 STRLEN ulen; 313 U8 *s = (U8 *) SvPVutf8(utf8, ulen); 314 const U8 *e = (U8 *) SvEND(utf8); 315 /* Optimise for the common case of being called from PerlIOEncode_flush() 316 with a standard length buffer. In this case the result SV's buffer is 317 only used temporarily, so we can afford to allocate the maximum needed 318 and not care about unused space. */ 319 const bool temp_result = (ulen == PERLIO_BUFSIZ); 320 321 ST(0) = sv_2mortal(result); 322 323 /* Preallocate the result buffer to the maximum possible size. 324 ie. assume each UTF8 byte is 1 character. 325 Then shrink the result's buffer if necesary at the end. */ 326 SvGROW(result, ((ulen+1) * usize)); 327 328 if (!endian) { 329 endian = (size == 4) ? 'N' : 'n'; 330 enc_pack(aTHX_ result,size,endian,BOM_BE); 331 #if 1 332 /* Update endian for next sequence */ 333 if (SvTRUE(attr("renewed", 7))) { 334 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 335 } 336 #endif 337 } 338 while (s < e && s+UTF8SKIP(s) <= e) { 339 STRLEN len; 340 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE 341 |UTF8_WARN_SURROGATE 342 |UTF8_DISALLOW_FE_FF 343 |UTF8_WARN_FE_FF 344 |UTF8_WARN_NONCHAR)); 345 s += len; 346 if (size != 4 && invalid_ucs2(ord)) { 347 if (!issurrogate(ord)) { 348 if (ucs2 == -1) { 349 ucs2 = SvTRUE(attr("ucs2", 4)); 350 } 351 if (ucs2) { 352 if (check) { 353 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", 354 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); 355 } 356 enc_pack(aTHX_ result,size,endian,FBCHAR); 357 } else { 358 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 359 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 360 enc_pack(aTHX_ result,size,endian,hi); 361 enc_pack(aTHX_ result,size,endian,lo); 362 } 363 } 364 else { 365 /* not supposed to happen */ 366 enc_pack(aTHX_ result,size,endian,FBCHAR); 367 } 368 } 369 else { 370 enc_pack(aTHX_ result,size,endian,ord); 371 } 372 } 373 if (s < e) { 374 /* UTF-8 partial char happens often on PerlIO. 375 Since this is okay and normal, we do not warn. 376 But this is critical when you choose to LEAVE_SRC 377 in which case we die */ 378 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 379 Perl_croak(aTHX_ "%"SVf":partial character is not allowed " 380 "when CHECK = 0x%" UVuf, 381 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); 382 } 383 } 384 if (check && !(check & ENCODE_LEAVE_SRC)) { 385 if (s < e) { 386 Move(s,SvPVX(utf8),e-s,U8); 387 SvCUR_set(utf8,(e-s)); 388 } 389 else { 390 SvCUR_set(utf8,0); 391 } 392 *SvEND(utf8) = '\0'; 393 } 394 395 if (!temp_result) 396 shrink_buffer(result); 397 398 SvSETMAGIC(utf8); 399 400 XSRETURN(1); 401 } 402