1 /* utf8.c 2 * 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 4 * by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever 13 * heard of that we don't want to see any closer; and that's the one place 14 * we're trying to get to! And that's just where we can't get, nohow.' 15 * 16 * 'Well do I understand your speech,' he answered in the same language; 17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue, 18 * as is the custom in the West, if you wish to be answered?' 19 * 20 * ...the travellers perceived that the floor was paved with stones of many 21 * hues; branching runes and strange devices intertwined beneath their feet. 22 */ 23 24 #include "EXTERN.h" 25 #define PERL_IN_UTF8_C 26 #include "perl.h" 27 28 #ifndef EBCDIC 29 /* Separate prototypes needed because in ASCII systems these 30 * usually macros but they still are compiled as code, too. */ 31 PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags); 32 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); 33 #endif 34 35 static const char unees[] = 36 "Malformed UTF-8 character (unexpected end of string)"; 37 38 /* 39 =head1 Unicode Support 40 41 This file contains various utility functions for manipulating UTF8-encoded 42 strings. For the uninitiated, this is a method of representing arbitrary 43 Unicode characters as a variable number of bytes, in such a way that 44 characters in the ASCII range are unmodified, and a zero byte never appears 45 within non-zero characters. 46 47 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags 48 49 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end 50 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free 51 bytes available. The return value is the pointer to the byte after the 52 end of the new character. In other words, 53 54 d = uvuni_to_utf8_flags(d, uv, flags); 55 56 or, in most cases, 57 58 d = uvuni_to_utf8(d, uv); 59 60 (which is equivalent to) 61 62 d = uvuni_to_utf8_flags(d, uv, 0); 63 64 is the recommended Unicode-aware way of saying 65 66 *(d++) = uv; 67 68 =cut 69 */ 70 71 U8 * 72 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 73 { 74 if (ckWARN(WARN_UTF8)) { 75 if (UNICODE_IS_SURROGATE(uv) && 76 !(flags & UNICODE_ALLOW_SURROGATE)) 77 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv); 78 else if ( 79 ((uv >= 0xFDD0 && uv <= 0xFDEF && 80 !(flags & UNICODE_ALLOW_FDD0)) 81 || 82 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */ 83 !(flags & UNICODE_ALLOW_FFFF))) && 84 /* UNICODE_ALLOW_SUPER includes 85 * FFFEs and FFFFs beyond 0x10FFFF. */ 86 ((uv <= PERL_UNICODE_MAX) || 87 !(flags & UNICODE_ALLOW_SUPER)) 88 ) 89 Perl_warner(aTHX_ packWARN(WARN_UTF8), 90 "Unicode character 0x%04"UVxf" is illegal", uv); 91 } 92 if (UNI_IS_INVARIANT(uv)) { 93 *d++ = (U8)UTF_TO_NATIVE(uv); 94 return d; 95 } 96 #if defined(EBCDIC) 97 else { 98 STRLEN len = UNISKIP(uv); 99 U8 *p = d+len-1; 100 while (p > d) { 101 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); 102 uv >>= UTF_ACCUMULATION_SHIFT; 103 } 104 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); 105 return d+len; 106 } 107 #else /* Non loop style */ 108 if (uv < 0x800) { 109 *d++ = (U8)(( uv >> 6) | 0xc0); 110 *d++ = (U8)(( uv & 0x3f) | 0x80); 111 return d; 112 } 113 if (uv < 0x10000) { 114 *d++ = (U8)(( uv >> 12) | 0xe0); 115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 116 *d++ = (U8)(( uv & 0x3f) | 0x80); 117 return d; 118 } 119 if (uv < 0x200000) { 120 *d++ = (U8)(( uv >> 18) | 0xf0); 121 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 122 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 123 *d++ = (U8)(( uv & 0x3f) | 0x80); 124 return d; 125 } 126 if (uv < 0x4000000) { 127 *d++ = (U8)(( uv >> 24) | 0xf8); 128 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 129 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 130 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 131 *d++ = (U8)(( uv & 0x3f) | 0x80); 132 return d; 133 } 134 if (uv < 0x80000000) { 135 *d++ = (U8)(( uv >> 30) | 0xfc); 136 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 140 *d++ = (U8)(( uv & 0x3f) | 0x80); 141 return d; 142 } 143 #ifdef HAS_QUAD 144 if (uv < UTF8_QUAD_MAX) 145 #endif 146 { 147 *d++ = 0xfe; /* Can't match U+FEFF! */ 148 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); 149 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 150 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 151 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 152 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 153 *d++ = (U8)(( uv & 0x3f) | 0x80); 154 return d; 155 } 156 #ifdef HAS_QUAD 157 { 158 *d++ = 0xff; /* Can't match U+FFFE! */ 159 *d++ = 0x80; /* 6 Reserved bits */ 160 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */ 161 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80); 162 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80); 163 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80); 164 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80); 165 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); 166 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 167 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 168 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 169 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 170 *d++ = (U8)(( uv & 0x3f) | 0x80); 171 return d; 172 } 173 #endif 174 #endif /* Loop style */ 175 } 176 177 /* 178 179 Tests if some arbitrary number of bytes begins in a valid UTF-8 180 character. Note that an INVARIANT (i.e. ASCII) character is a valid 181 UTF-8 character. The actual number of bytes in the UTF-8 character 182 will be returned if it is valid, otherwise 0. 183 184 This is the "slow" version as opposed to the "fast" version which is 185 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed 186 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four 187 or less you should use the IS_UTF8_CHAR(), for lengths of five or more 188 you should use the _slow(). In practice this means that the _slow() 189 will be used very rarely, since the maximum Unicode code point (as of 190 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only 191 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into 192 five bytes or more. 193 194 =cut */ 195 STATIC STRLEN 196 S_is_utf8_char_slow(const U8 *s, const STRLEN len) 197 { 198 U8 u = *s; 199 STRLEN slen; 200 UV uv, ouv; 201 202 if (UTF8_IS_INVARIANT(u)) 203 return 1; 204 205 if (!UTF8_IS_START(u)) 206 return 0; 207 208 if (len < 2 || !UTF8_IS_CONTINUATION(s[1])) 209 return 0; 210 211 slen = len - 1; 212 s++; 213 #ifdef EBCDIC 214 u = NATIVE_TO_UTF(u); 215 #endif 216 u &= UTF_START_MASK(len); 217 uv = u; 218 ouv = uv; 219 while (slen--) { 220 if (!UTF8_IS_CONTINUATION(*s)) 221 return 0; 222 uv = UTF8_ACCUMULATE(uv, *s); 223 if (uv < ouv) 224 return 0; 225 ouv = uv; 226 s++; 227 } 228 229 if ((STRLEN)UNISKIP(uv) < len) 230 return 0; 231 232 return len; 233 } 234 235 /* 236 =for apidoc A|STRLEN|is_utf8_char|const U8 *s 237 238 Tests if some arbitrary number of bytes begins in a valid UTF-8 239 character. Note that an INVARIANT (i.e. ASCII) character is a valid 240 UTF-8 character. The actual number of bytes in the UTF-8 character 241 will be returned if it is valid, otherwise 0. 242 243 =cut */ 244 STRLEN 245 Perl_is_utf8_char(pTHX_ const U8 *s) 246 { 247 const STRLEN len = UTF8SKIP(s); 248 PERL_UNUSED_CONTEXT; 249 #ifdef IS_UTF8_CHAR 250 if (IS_UTF8_CHAR_FAST(len)) 251 return IS_UTF8_CHAR(s, len) ? len : 0; 252 #endif /* #ifdef IS_UTF8_CHAR */ 253 return is_utf8_char_slow(s, len); 254 } 255 256 /* 257 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len 258 259 Returns true if first C<len> bytes of the given string form a valid 260 UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does 261 not mean 'a string that contains code points above 0x7F encoded in UTF-8' 262 because a valid ASCII string is a valid UTF-8 string. 263 264 See also is_utf8_string_loclen() and is_utf8_string_loc(). 265 266 =cut 267 */ 268 269 bool 270 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) 271 { 272 const U8* const send = s + (len ? len : strlen((const char *)s)); 273 const U8* x = s; 274 275 PERL_UNUSED_CONTEXT; 276 277 while (x < send) { 278 STRLEN c; 279 /* Inline the easy bits of is_utf8_char() here for speed... */ 280 if (UTF8_IS_INVARIANT(*x)) 281 c = 1; 282 else if (!UTF8_IS_START(*x)) 283 goto out; 284 else { 285 /* ... and call is_utf8_char() only if really needed. */ 286 #ifdef IS_UTF8_CHAR 287 c = UTF8SKIP(x); 288 if (IS_UTF8_CHAR_FAST(c)) { 289 if (!IS_UTF8_CHAR(x, c)) 290 c = 0; 291 } 292 else 293 c = is_utf8_char_slow(x, c); 294 #else 295 c = is_utf8_char(x); 296 #endif /* #ifdef IS_UTF8_CHAR */ 297 if (!c) 298 goto out; 299 } 300 x += c; 301 } 302 303 out: 304 if (x != send) 305 return FALSE; 306 307 return TRUE; 308 } 309 310 /* 311 Implemented as a macro in utf8.h 312 313 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep 314 315 Like is_utf8_string() but stores the location of the failure (in the 316 case of "utf8ness failure") or the location s+len (in the case of 317 "utf8ness success") in the C<ep>. 318 319 See also is_utf8_string_loclen() and is_utf8_string(). 320 321 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el 322 323 Like is_utf8_string() but stores the location of the failure (in the 324 case of "utf8ness failure") or the location s+len (in the case of 325 "utf8ness success") in the C<ep>, and the number of UTF-8 326 encoded characters in the C<el>. 327 328 See also is_utf8_string_loc() and is_utf8_string(). 329 330 =cut 331 */ 332 333 bool 334 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 335 { 336 const U8* const send = s + (len ? len : strlen((const char *)s)); 337 const U8* x = s; 338 STRLEN c; 339 STRLEN outlen = 0; 340 PERL_UNUSED_CONTEXT; 341 342 while (x < send) { 343 /* Inline the easy bits of is_utf8_char() here for speed... */ 344 if (UTF8_IS_INVARIANT(*x)) 345 c = 1; 346 else if (!UTF8_IS_START(*x)) 347 goto out; 348 else { 349 /* ... and call is_utf8_char() only if really needed. */ 350 #ifdef IS_UTF8_CHAR 351 c = UTF8SKIP(x); 352 if (IS_UTF8_CHAR_FAST(c)) { 353 if (!IS_UTF8_CHAR(x, c)) 354 c = 0; 355 } else 356 c = is_utf8_char_slow(x, c); 357 #else 358 c = is_utf8_char(x); 359 #endif /* #ifdef IS_UTF8_CHAR */ 360 if (!c) 361 goto out; 362 } 363 x += c; 364 outlen++; 365 } 366 367 out: 368 if (el) 369 *el = outlen; 370 371 if (ep) 372 *ep = x; 373 return (x == send); 374 } 375 376 /* 377 378 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags 379 380 Bottom level UTF-8 decode routine. 381 Returns the Unicode code point value of the first character in the string C<s> 382 which is assumed to be in UTF-8 encoding and no longer than C<curlen>; 383 C<retlen> will be set to the length, in bytes, of that character. 384 385 If C<s> does not point to a well-formed UTF-8 character, the behaviour 386 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, 387 it is assumed that the caller will raise a warning, and this function 388 will silently just set C<retlen> to C<-1> and return zero. If the 389 C<flags> does not contain UTF8_CHECK_ONLY, warnings about 390 malformations will be given, C<retlen> will be set to the expected 391 length of the UTF-8 character in bytes, and zero will be returned. 392 393 The C<flags> can also contain various flags to allow deviations from 394 the strict UTF-8 encoding (see F<utf8.h>). 395 396 Most code should use utf8_to_uvchr() rather than call this directly. 397 398 =cut 399 */ 400 401 UV 402 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) 403 { 404 dVAR; 405 const U8 * const s0 = s; 406 UV uv = *s, ouv = 0; 407 STRLEN len = 1; 408 const bool dowarn = ckWARN_d(WARN_UTF8); 409 const UV startbyte = *s; 410 STRLEN expectlen = 0; 411 U32 warning = 0; 412 413 /* This list is a superset of the UTF8_ALLOW_XXX. */ 414 415 #define UTF8_WARN_EMPTY 1 416 #define UTF8_WARN_CONTINUATION 2 417 #define UTF8_WARN_NON_CONTINUATION 3 418 #define UTF8_WARN_FE_FF 4 419 #define UTF8_WARN_SHORT 5 420 #define UTF8_WARN_OVERFLOW 6 421 #define UTF8_WARN_SURROGATE 7 422 #define UTF8_WARN_LONG 8 423 #define UTF8_WARN_FFFF 9 /* Also FFFE. */ 424 425 if (curlen == 0 && 426 !(flags & UTF8_ALLOW_EMPTY)) { 427 warning = UTF8_WARN_EMPTY; 428 goto malformed; 429 } 430 431 if (UTF8_IS_INVARIANT(uv)) { 432 if (retlen) 433 *retlen = 1; 434 return (UV) (NATIVE_TO_UTF(*s)); 435 } 436 437 if (UTF8_IS_CONTINUATION(uv) && 438 !(flags & UTF8_ALLOW_CONTINUATION)) { 439 warning = UTF8_WARN_CONTINUATION; 440 goto malformed; 441 } 442 443 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) && 444 !(flags & UTF8_ALLOW_NON_CONTINUATION)) { 445 warning = UTF8_WARN_NON_CONTINUATION; 446 goto malformed; 447 } 448 449 #ifdef EBCDIC 450 uv = NATIVE_TO_UTF(uv); 451 #else 452 if ((uv == 0xfe || uv == 0xff) && 453 !(flags & UTF8_ALLOW_FE_FF)) { 454 warning = UTF8_WARN_FE_FF; 455 goto malformed; 456 } 457 #endif 458 459 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; } 460 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; } 461 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; } 462 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; } 463 #ifdef EBCDIC 464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } 465 else { len = 7; uv &= 0x01; } 466 #else 467 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } 468 else if (!(uv & 0x01)) { len = 7; uv = 0; } 469 else { len = 13; uv = 0; } /* whoa! */ 470 #endif 471 472 if (retlen) 473 *retlen = len; 474 475 expectlen = len; 476 477 if ((curlen < expectlen) && 478 !(flags & UTF8_ALLOW_SHORT)) { 479 warning = UTF8_WARN_SHORT; 480 goto malformed; 481 } 482 483 len--; 484 s++; 485 ouv = uv; 486 487 while (len--) { 488 if (!UTF8_IS_CONTINUATION(*s) && 489 !(flags & UTF8_ALLOW_NON_CONTINUATION)) { 490 s--; 491 warning = UTF8_WARN_NON_CONTINUATION; 492 goto malformed; 493 } 494 else 495 uv = UTF8_ACCUMULATE(uv, *s); 496 if (!(uv > ouv)) { 497 /* These cannot be allowed. */ 498 if (uv == ouv) { 499 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) { 500 warning = UTF8_WARN_LONG; 501 goto malformed; 502 } 503 } 504 else { /* uv < ouv */ 505 /* This cannot be allowed. */ 506 warning = UTF8_WARN_OVERFLOW; 507 goto malformed; 508 } 509 } 510 s++; 511 ouv = uv; 512 } 513 514 if (UNICODE_IS_SURROGATE(uv) && 515 !(flags & UTF8_ALLOW_SURROGATE)) { 516 warning = UTF8_WARN_SURROGATE; 517 goto malformed; 518 } else if ((expectlen > (STRLEN)UNISKIP(uv)) && 519 !(flags & UTF8_ALLOW_LONG)) { 520 warning = UTF8_WARN_LONG; 521 goto malformed; 522 } else if (UNICODE_IS_ILLEGAL(uv) && 523 !(flags & UTF8_ALLOW_FFFF)) { 524 warning = UTF8_WARN_FFFF; 525 goto malformed; 526 } 527 528 return uv; 529 530 malformed: 531 532 if (flags & UTF8_CHECK_ONLY) { 533 if (retlen) 534 *retlen = ((STRLEN) -1); 535 return 0; 536 } 537 538 if (dowarn) { 539 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character ")); 540 541 switch (warning) { 542 case 0: /* Intentionally empty. */ break; 543 case UTF8_WARN_EMPTY: 544 sv_catpvs(sv, "(empty string)"); 545 break; 546 case UTF8_WARN_CONTINUATION: 547 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv); 548 break; 549 case UTF8_WARN_NON_CONTINUATION: 550 if (s == s0) 551 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")", 552 (UV)s[1], startbyte); 553 else { 554 const int len = (int)(s-s0); 555 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)", 556 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen); 557 } 558 559 break; 560 case UTF8_WARN_FE_FF: 561 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); 562 break; 563 case UTF8_WARN_SHORT: 564 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", 565 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte); 566 expectlen = curlen; /* distance for caller to skip */ 567 break; 568 case UTF8_WARN_OVERFLOW: 569 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")", 570 ouv, *s, startbyte); 571 break; 572 case UTF8_WARN_SURROGATE: 573 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); 574 break; 575 case UTF8_WARN_LONG: 576 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", 577 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); 578 break; 579 case UTF8_WARN_FFFF: 580 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv); 581 break; 582 default: 583 sv_catpvs(sv, "(unknown reason)"); 584 break; 585 } 586 587 if (warning) { 588 const char * const s = SvPVX_const(sv); 589 590 if (PL_op) 591 Perl_warner(aTHX_ packWARN(WARN_UTF8), 592 "%s in %s", s, OP_DESC(PL_op)); 593 else 594 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s); 595 } 596 } 597 598 if (retlen) 599 *retlen = expectlen ? expectlen : len; 600 601 return 0; 602 } 603 604 /* 605 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen 606 607 Returns the native character value of the first character in the string C<s> 608 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 609 length, in bytes, of that character. 610 611 If C<s> does not point to a well-formed UTF-8 character, zero is 612 returned and retlen is set, if possible, to -1. 613 614 =cut 615 */ 616 617 UV 618 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) 619 { 620 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen, 621 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 622 } 623 624 /* 625 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen 626 627 Returns the Unicode code point of the first character in the string C<s> 628 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 629 length, in bytes, of that character. 630 631 This function should only be used when returned UV is considered 632 an index into the Unicode semantic tables (e.g. swashes). 633 634 If C<s> does not point to a well-formed UTF-8 character, zero is 635 returned and retlen is set, if possible, to -1. 636 637 =cut 638 */ 639 640 UV 641 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) 642 { 643 /* Call the low level routine asking for checks */ 644 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen, 645 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 646 } 647 648 /* 649 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e 650 651 Return the length of the UTF-8 char encoded string C<s> in characters. 652 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end 653 up past C<e>, croaks. 654 655 =cut 656 */ 657 658 STRLEN 659 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) 660 { 661 dVAR; 662 STRLEN len = 0; 663 U8 t = 0; 664 665 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. 666 * the bitops (especially ~) can create illegal UTF-8. 667 * In other words: in Perl UTF-8 is not just for Unicode. */ 668 669 if (e < s) 670 goto warn_and_return; 671 while (s < e) { 672 t = UTF8SKIP(s); 673 if (e - s < t) { 674 warn_and_return: 675 if (ckWARN_d(WARN_UTF8)) { 676 if (PL_op) 677 Perl_warner(aTHX_ packWARN(WARN_UTF8), 678 "%s in %s", unees, OP_DESC(PL_op)); 679 else 680 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); 681 } 682 return len; 683 } 684 s += t; 685 len++; 686 } 687 688 return len; 689 } 690 691 /* 692 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b 693 694 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 695 and C<b>. 696 697 WARNING: use only if you *know* that the pointers point inside the 698 same UTF-8 buffer. 699 700 =cut 701 */ 702 703 IV 704 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 705 { 706 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 707 } 708 709 /* 710 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off 711 712 Return the UTF-8 pointer C<s> displaced by C<off> characters, either 713 forward or backward. 714 715 WARNING: do not use the following unless you *know* C<off> is within 716 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned 717 on the first byte of character or just after the last byte of a character. 718 719 =cut 720 */ 721 722 U8 * 723 Perl_utf8_hop(pTHX_ const U8 *s, I32 off) 724 { 725 PERL_UNUSED_CONTEXT; 726 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 727 * the bitops (especially ~) can create illegal UTF-8. 728 * In other words: in Perl UTF-8 is not just for Unicode. */ 729 730 if (off >= 0) { 731 while (off--) 732 s += UTF8SKIP(s); 733 } 734 else { 735 while (off++) { 736 s--; 737 while (UTF8_IS_CONTINUATION(*s)) 738 s--; 739 } 740 } 741 return (U8 *)s; 742 } 743 744 /* 745 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len 746 747 Converts a string C<s> of length C<len> from UTF-8 into byte encoding. 748 Unlike C<bytes_to_utf8>, this over-writes the original string, and 749 updates len to contain the new length. 750 Returns zero on failure, setting C<len> to -1. 751 752 If you need a copy of the string, see C<bytes_from_utf8>. 753 754 =cut 755 */ 756 757 U8 * 758 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) 759 { 760 U8 * const save = s; 761 U8 * const send = s + *len; 762 U8 *d; 763 764 /* ensure valid UTF-8 and chars < 256 before updating string */ 765 while (s < send) { 766 U8 c = *s++; 767 768 if (!UTF8_IS_INVARIANT(c) && 769 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send) 770 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) { 771 *len = ((STRLEN) -1); 772 return 0; 773 } 774 } 775 776 d = s = save; 777 while (s < send) { 778 STRLEN ulen; 779 *d++ = (U8)utf8_to_uvchr(s, &ulen); 780 s += ulen; 781 } 782 *d = '\0'; 783 *len = d - save; 784 return save; 785 } 786 787 /* 788 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8 789 790 Converts a string C<s> of length C<len> from UTF-8 into byte encoding. 791 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to 792 the newly-created string, and updates C<len> to contain the new 793 length. Returns the original string if no conversion occurs, C<len> 794 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to 795 0 if C<s> is converted or contains all 7bit characters. 796 797 =cut 798 */ 799 800 U8 * 801 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) 802 { 803 U8 *d; 804 const U8 *start = s; 805 const U8 *send; 806 I32 count = 0; 807 808 PERL_UNUSED_CONTEXT; 809 if (!*is_utf8) 810 return (U8 *)start; 811 812 /* ensure valid UTF-8 and chars < 256 before converting string */ 813 for (send = s + *len; s < send;) { 814 U8 c = *s++; 815 if (!UTF8_IS_INVARIANT(c)) { 816 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send && 817 (c = *s++) && UTF8_IS_CONTINUATION(c)) 818 count++; 819 else 820 return (U8 *)start; 821 } 822 } 823 824 *is_utf8 = FALSE; 825 826 Newx(d, (*len) - count + 1, U8); 827 s = start; start = d; 828 while (s < send) { 829 U8 c = *s++; 830 if (!UTF8_IS_INVARIANT(c)) { 831 /* Then it is two-byte encoded */ 832 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++); 833 c = ASCII_TO_NATIVE(c); 834 } 835 *d++ = c; 836 } 837 *d = '\0'; 838 *len = d - start; 839 return (U8 *)start; 840 } 841 842 /* 843 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len 844 845 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding. 846 Returns a pointer to the newly-created string, and sets C<len> to 847 reflect the new length. 848 849 If you want to convert to UTF-8 from other encodings than ASCII, 850 see sv_recode_to_utf8(). 851 852 =cut 853 */ 854 855 U8* 856 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) 857 { 858 const U8 * const send = s + (*len); 859 U8 *d; 860 U8 *dst; 861 PERL_UNUSED_CONTEXT; 862 863 Newx(d, (*len) * 2 + 1, U8); 864 dst = d; 865 866 while (s < send) { 867 const UV uv = NATIVE_TO_ASCII(*s++); 868 if (UNI_IS_INVARIANT(uv)) 869 *d++ = (U8)UTF_TO_NATIVE(uv); 870 else { 871 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); 872 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); 873 } 874 } 875 *d = '\0'; 876 *len = d-dst; 877 return dst; 878 } 879 880 /* 881 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. 882 * 883 * Destination must be pre-extended to 3/2 source. Do not use in-place. 884 * We optimize for native, for obvious reasons. */ 885 886 U8* 887 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) 888 { 889 U8* pend; 890 U8* dstart = d; 891 892 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */ 893 d[0] = 0; 894 *newlen = 1; 895 return d; 896 } 897 898 if (bytelen & 1) 899 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); 900 901 pend = p + bytelen; 902 903 while (p < pend) { 904 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ 905 p += 2; 906 if (uv < 0x80) { 907 #ifdef EBCDIC 908 *d++ = UNI_TO_NATIVE(uv); 909 #else 910 *d++ = (U8)uv; 911 #endif 912 continue; 913 } 914 if (uv < 0x800) { 915 *d++ = (U8)(( uv >> 6) | 0xc0); 916 *d++ = (U8)(( uv & 0x3f) | 0x80); 917 continue; 918 } 919 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ 920 UV low = (p[0] << 8) + p[1]; 921 p += 2; 922 if (low < 0xdc00 || low >= 0xdfff) 923 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 924 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; 925 } 926 if (uv < 0x10000) { 927 *d++ = (U8)(( uv >> 12) | 0xe0); 928 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 929 *d++ = (U8)(( uv & 0x3f) | 0x80); 930 continue; 931 } 932 else { 933 *d++ = (U8)(( uv >> 18) | 0xf0); 934 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 935 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 936 *d++ = (U8)(( uv & 0x3f) | 0x80); 937 continue; 938 } 939 } 940 *newlen = d - dstart; 941 return d; 942 } 943 944 /* Note: this one is slightly destructive of the source. */ 945 946 U8* 947 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) 948 { 949 U8* s = (U8*)p; 950 U8* const send = s + bytelen; 951 while (s < send) { 952 const U8 tmp = s[0]; 953 s[0] = s[1]; 954 s[1] = tmp; 955 s += 2; 956 } 957 return utf16_to_utf8(p, d, bytelen, newlen); 958 } 959 960 /* for now these are all defined (inefficiently) in terms of the utf8 versions */ 961 962 bool 963 Perl_is_uni_alnum(pTHX_ UV c) 964 { 965 U8 tmpbuf[UTF8_MAXBYTES+1]; 966 uvchr_to_utf8(tmpbuf, c); 967 return is_utf8_alnum(tmpbuf); 968 } 969 970 bool 971 Perl_is_uni_alnumc(pTHX_ UV c) 972 { 973 U8 tmpbuf[UTF8_MAXBYTES+1]; 974 uvchr_to_utf8(tmpbuf, c); 975 return is_utf8_alnumc(tmpbuf); 976 } 977 978 bool 979 Perl_is_uni_idfirst(pTHX_ UV c) 980 { 981 U8 tmpbuf[UTF8_MAXBYTES+1]; 982 uvchr_to_utf8(tmpbuf, c); 983 return is_utf8_idfirst(tmpbuf); 984 } 985 986 bool 987 Perl_is_uni_alpha(pTHX_ UV c) 988 { 989 U8 tmpbuf[UTF8_MAXBYTES+1]; 990 uvchr_to_utf8(tmpbuf, c); 991 return is_utf8_alpha(tmpbuf); 992 } 993 994 bool 995 Perl_is_uni_ascii(pTHX_ UV c) 996 { 997 U8 tmpbuf[UTF8_MAXBYTES+1]; 998 uvchr_to_utf8(tmpbuf, c); 999 return is_utf8_ascii(tmpbuf); 1000 } 1001 1002 bool 1003 Perl_is_uni_space(pTHX_ UV c) 1004 { 1005 U8 tmpbuf[UTF8_MAXBYTES+1]; 1006 uvchr_to_utf8(tmpbuf, c); 1007 return is_utf8_space(tmpbuf); 1008 } 1009 1010 bool 1011 Perl_is_uni_digit(pTHX_ UV c) 1012 { 1013 U8 tmpbuf[UTF8_MAXBYTES+1]; 1014 uvchr_to_utf8(tmpbuf, c); 1015 return is_utf8_digit(tmpbuf); 1016 } 1017 1018 bool 1019 Perl_is_uni_upper(pTHX_ UV c) 1020 { 1021 U8 tmpbuf[UTF8_MAXBYTES+1]; 1022 uvchr_to_utf8(tmpbuf, c); 1023 return is_utf8_upper(tmpbuf); 1024 } 1025 1026 bool 1027 Perl_is_uni_lower(pTHX_ UV c) 1028 { 1029 U8 tmpbuf[UTF8_MAXBYTES+1]; 1030 uvchr_to_utf8(tmpbuf, c); 1031 return is_utf8_lower(tmpbuf); 1032 } 1033 1034 bool 1035 Perl_is_uni_cntrl(pTHX_ UV c) 1036 { 1037 U8 tmpbuf[UTF8_MAXBYTES+1]; 1038 uvchr_to_utf8(tmpbuf, c); 1039 return is_utf8_cntrl(tmpbuf); 1040 } 1041 1042 bool 1043 Perl_is_uni_graph(pTHX_ UV c) 1044 { 1045 U8 tmpbuf[UTF8_MAXBYTES+1]; 1046 uvchr_to_utf8(tmpbuf, c); 1047 return is_utf8_graph(tmpbuf); 1048 } 1049 1050 bool 1051 Perl_is_uni_print(pTHX_ UV c) 1052 { 1053 U8 tmpbuf[UTF8_MAXBYTES+1]; 1054 uvchr_to_utf8(tmpbuf, c); 1055 return is_utf8_print(tmpbuf); 1056 } 1057 1058 bool 1059 Perl_is_uni_punct(pTHX_ UV c) 1060 { 1061 U8 tmpbuf[UTF8_MAXBYTES+1]; 1062 uvchr_to_utf8(tmpbuf, c); 1063 return is_utf8_punct(tmpbuf); 1064 } 1065 1066 bool 1067 Perl_is_uni_xdigit(pTHX_ UV c) 1068 { 1069 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1070 uvchr_to_utf8(tmpbuf, c); 1071 return is_utf8_xdigit(tmpbuf); 1072 } 1073 1074 UV 1075 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) 1076 { 1077 uvchr_to_utf8(p, c); 1078 return to_utf8_upper(p, p, lenp); 1079 } 1080 1081 UV 1082 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) 1083 { 1084 uvchr_to_utf8(p, c); 1085 return to_utf8_title(p, p, lenp); 1086 } 1087 1088 UV 1089 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) 1090 { 1091 uvchr_to_utf8(p, c); 1092 return to_utf8_lower(p, p, lenp); 1093 } 1094 1095 UV 1096 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp) 1097 { 1098 uvchr_to_utf8(p, c); 1099 return to_utf8_fold(p, p, lenp); 1100 } 1101 1102 /* for now these all assume no locale info available for Unicode > 255 */ 1103 1104 bool 1105 Perl_is_uni_alnum_lc(pTHX_ UV c) 1106 { 1107 return is_uni_alnum(c); /* XXX no locale support yet */ 1108 } 1109 1110 bool 1111 Perl_is_uni_alnumc_lc(pTHX_ UV c) 1112 { 1113 return is_uni_alnumc(c); /* XXX no locale support yet */ 1114 } 1115 1116 bool 1117 Perl_is_uni_idfirst_lc(pTHX_ UV c) 1118 { 1119 return is_uni_idfirst(c); /* XXX no locale support yet */ 1120 } 1121 1122 bool 1123 Perl_is_uni_alpha_lc(pTHX_ UV c) 1124 { 1125 return is_uni_alpha(c); /* XXX no locale support yet */ 1126 } 1127 1128 bool 1129 Perl_is_uni_ascii_lc(pTHX_ UV c) 1130 { 1131 return is_uni_ascii(c); /* XXX no locale support yet */ 1132 } 1133 1134 bool 1135 Perl_is_uni_space_lc(pTHX_ UV c) 1136 { 1137 return is_uni_space(c); /* XXX no locale support yet */ 1138 } 1139 1140 bool 1141 Perl_is_uni_digit_lc(pTHX_ UV c) 1142 { 1143 return is_uni_digit(c); /* XXX no locale support yet */ 1144 } 1145 1146 bool 1147 Perl_is_uni_upper_lc(pTHX_ UV c) 1148 { 1149 return is_uni_upper(c); /* XXX no locale support yet */ 1150 } 1151 1152 bool 1153 Perl_is_uni_lower_lc(pTHX_ UV c) 1154 { 1155 return is_uni_lower(c); /* XXX no locale support yet */ 1156 } 1157 1158 bool 1159 Perl_is_uni_cntrl_lc(pTHX_ UV c) 1160 { 1161 return is_uni_cntrl(c); /* XXX no locale support yet */ 1162 } 1163 1164 bool 1165 Perl_is_uni_graph_lc(pTHX_ UV c) 1166 { 1167 return is_uni_graph(c); /* XXX no locale support yet */ 1168 } 1169 1170 bool 1171 Perl_is_uni_print_lc(pTHX_ UV c) 1172 { 1173 return is_uni_print(c); /* XXX no locale support yet */ 1174 } 1175 1176 bool 1177 Perl_is_uni_punct_lc(pTHX_ UV c) 1178 { 1179 return is_uni_punct(c); /* XXX no locale support yet */ 1180 } 1181 1182 bool 1183 Perl_is_uni_xdigit_lc(pTHX_ UV c) 1184 { 1185 return is_uni_xdigit(c); /* XXX no locale support yet */ 1186 } 1187 1188 U32 1189 Perl_to_uni_upper_lc(pTHX_ U32 c) 1190 { 1191 /* XXX returns only the first character -- do not use XXX */ 1192 /* XXX no locale support yet */ 1193 STRLEN len; 1194 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1195 return (U32)to_uni_upper(c, tmpbuf, &len); 1196 } 1197 1198 U32 1199 Perl_to_uni_title_lc(pTHX_ U32 c) 1200 { 1201 /* XXX returns only the first character XXX -- do not use XXX */ 1202 /* XXX no locale support yet */ 1203 STRLEN len; 1204 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1205 return (U32)to_uni_title(c, tmpbuf, &len); 1206 } 1207 1208 U32 1209 Perl_to_uni_lower_lc(pTHX_ U32 c) 1210 { 1211 /* XXX returns only the first character -- do not use XXX */ 1212 /* XXX no locale support yet */ 1213 STRLEN len; 1214 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1215 return (U32)to_uni_lower(c, tmpbuf, &len); 1216 } 1217 1218 static bool 1219 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, 1220 const char *const swashname) 1221 { 1222 dVAR; 1223 if (!is_utf8_char(p)) 1224 return FALSE; 1225 if (!*swash) 1226 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0); 1227 return swash_fetch(*swash, p, TRUE) != 0; 1228 } 1229 1230 bool 1231 Perl_is_utf8_alnum(pTHX_ const U8 *p) 1232 { 1233 dVAR; 1234 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true 1235 * descendant of isalnum(3), in other words, it doesn't 1236 * contain the '_'. --jhi */ 1237 return is_utf8_common(p, &PL_utf8_alnum, "IsWord"); 1238 } 1239 1240 bool 1241 Perl_is_utf8_alnumc(pTHX_ const U8 *p) 1242 { 1243 dVAR; 1244 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC"); 1245 } 1246 1247 bool 1248 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ 1249 { 1250 dVAR; 1251 if (*p == '_') 1252 return TRUE; 1253 /* is_utf8_idstart would be more logical. */ 1254 return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); 1255 } 1256 1257 bool 1258 Perl_is_utf8_idcont(pTHX_ const U8 *p) 1259 { 1260 dVAR; 1261 if (*p == '_') 1262 return TRUE; 1263 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue"); 1264 } 1265 1266 bool 1267 Perl_is_utf8_alpha(pTHX_ const U8 *p) 1268 { 1269 dVAR; 1270 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha"); 1271 } 1272 1273 bool 1274 Perl_is_utf8_ascii(pTHX_ const U8 *p) 1275 { 1276 dVAR; 1277 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii"); 1278 } 1279 1280 bool 1281 Perl_is_utf8_space(pTHX_ const U8 *p) 1282 { 1283 dVAR; 1284 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl"); 1285 } 1286 1287 bool 1288 Perl_is_utf8_digit(pTHX_ const U8 *p) 1289 { 1290 dVAR; 1291 return is_utf8_common(p, &PL_utf8_digit, "IsDigit"); 1292 } 1293 1294 bool 1295 Perl_is_utf8_upper(pTHX_ const U8 *p) 1296 { 1297 dVAR; 1298 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase"); 1299 } 1300 1301 bool 1302 Perl_is_utf8_lower(pTHX_ const U8 *p) 1303 { 1304 dVAR; 1305 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase"); 1306 } 1307 1308 bool 1309 Perl_is_utf8_cntrl(pTHX_ const U8 *p) 1310 { 1311 dVAR; 1312 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl"); 1313 } 1314 1315 bool 1316 Perl_is_utf8_graph(pTHX_ const U8 *p) 1317 { 1318 dVAR; 1319 return is_utf8_common(p, &PL_utf8_graph, "IsGraph"); 1320 } 1321 1322 bool 1323 Perl_is_utf8_print(pTHX_ const U8 *p) 1324 { 1325 dVAR; 1326 return is_utf8_common(p, &PL_utf8_print, "IsPrint"); 1327 } 1328 1329 bool 1330 Perl_is_utf8_punct(pTHX_ const U8 *p) 1331 { 1332 dVAR; 1333 return is_utf8_common(p, &PL_utf8_punct, "IsPunct"); 1334 } 1335 1336 bool 1337 Perl_is_utf8_xdigit(pTHX_ const U8 *p) 1338 { 1339 dVAR; 1340 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit"); 1341 } 1342 1343 bool 1344 Perl_is_utf8_mark(pTHX_ const U8 *p) 1345 { 1346 dVAR; 1347 return is_utf8_common(p, &PL_utf8_mark, "IsM"); 1348 } 1349 1350 /* 1351 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special 1352 1353 The "p" contains the pointer to the UTF-8 string encoding 1354 the character that is being converted. 1355 1356 The "ustrp" is a pointer to the character buffer to put the 1357 conversion result to. The "lenp" is a pointer to the length 1358 of the result. 1359 1360 The "swashp" is a pointer to the swash to use. 1361 1362 Both the special and normal mappings are stored lib/unicore/To/Foo.pl, 1363 and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually, 1364 but not always, a multicharacter mapping), is tried first. 1365 1366 The "special" is a string like "utf8::ToSpecLower", which means the 1367 hash %utf8::ToSpecLower. The access to the hash is through 1368 Perl_to_utf8_case(). 1369 1370 The "normal" is a string like "ToLower" which means the swash 1371 %utf8::ToLower. 1372 1373 =cut */ 1374 1375 UV 1376 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, 1377 SV **swashp, const char *normal, const char *special) 1378 { 1379 dVAR; 1380 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1381 STRLEN len = 0; 1382 1383 const UV uv0 = utf8_to_uvchr(p, NULL); 1384 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings 1385 * are necessary in EBCDIC, they are redundant no-ops 1386 * in ASCII-ish platforms, and hopefully optimized away. */ 1387 const UV uv1 = NATIVE_TO_UNI(uv0); 1388 uvuni_to_utf8(tmpbuf, uv1); 1389 1390 if (!*swashp) /* load on-demand */ 1391 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); 1392 1393 /* The 0xDF is the only special casing Unicode code point below 0x100. */ 1394 if (special && (uv1 == 0xDF || uv1 > 0xFF)) { 1395 /* It might be "special" (sometimes, but not always, 1396 * a multicharacter mapping) */ 1397 HV * const hv = get_hv(special, FALSE); 1398 SV **svp; 1399 1400 if (hv && 1401 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && 1402 (*svp)) { 1403 const char *s; 1404 1405 s = SvPV_const(*svp, len); 1406 if (len == 1) 1407 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp; 1408 else { 1409 #ifdef EBCDIC 1410 /* If we have EBCDIC we need to remap the characters 1411 * since any characters in the low 256 are Unicode 1412 * code points, not EBCDIC. */ 1413 U8 *t = (U8*)s, *tend = t + len, *d; 1414 1415 d = tmpbuf; 1416 if (SvUTF8(*svp)) { 1417 STRLEN tlen = 0; 1418 1419 while (t < tend) { 1420 const UV c = utf8_to_uvchr(t, &tlen); 1421 if (tlen > 0) { 1422 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); 1423 t += tlen; 1424 } 1425 else 1426 break; 1427 } 1428 } 1429 else { 1430 while (t < tend) { 1431 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); 1432 t++; 1433 } 1434 } 1435 len = d - tmpbuf; 1436 Copy(tmpbuf, ustrp, len, U8); 1437 #else 1438 Copy(s, ustrp, len, U8); 1439 #endif 1440 } 1441 } 1442 } 1443 1444 if (!len && *swashp) { 1445 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE); 1446 1447 if (uv2) { 1448 /* It was "normal" (a single character mapping). */ 1449 const UV uv3 = UNI_TO_NATIVE(uv2); 1450 len = uvchr_to_utf8(ustrp, uv3) - ustrp; 1451 } 1452 } 1453 1454 if (!len) /* Neither: just copy. */ 1455 len = uvchr_to_utf8(ustrp, uv0) - ustrp; 1456 1457 if (lenp) 1458 *lenp = len; 1459 1460 return len ? utf8_to_uvchr(ustrp, 0) : 0; 1461 } 1462 1463 /* 1464 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp 1465 1466 Convert the UTF-8 encoded character at p to its uppercase version and 1467 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1468 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since 1469 the uppercase version may be longer than the original character. 1470 1471 The first character of the uppercased version is returned 1472 (but note, as explained above, that there may be more.) 1473 1474 =cut */ 1475 1476 UV 1477 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1478 { 1479 dVAR; 1480 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1481 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); 1482 } 1483 1484 /* 1485 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp 1486 1487 Convert the UTF-8 encoded character at p to its titlecase version and 1488 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1489 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the 1490 titlecase version may be longer than the original character. 1491 1492 The first character of the titlecased version is returned 1493 (but note, as explained above, that there may be more.) 1494 1495 =cut */ 1496 1497 UV 1498 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1499 { 1500 dVAR; 1501 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1502 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); 1503 } 1504 1505 /* 1506 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp 1507 1508 Convert the UTF-8 encoded character at p to its lowercase version and 1509 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1510 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the 1511 lowercase version may be longer than the original character. 1512 1513 The first character of the lowercased version is returned 1514 (but note, as explained above, that there may be more.) 1515 1516 =cut */ 1517 1518 UV 1519 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1520 { 1521 dVAR; 1522 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1523 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); 1524 } 1525 1526 /* 1527 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp 1528 1529 Convert the UTF-8 encoded character at p to its foldcase version and 1530 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1531 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the 1532 foldcase version may be longer than the original character (up to 1533 three characters). 1534 1535 The first character of the foldcased version is returned 1536 (but note, as explained above, that there may be more.) 1537 1538 =cut */ 1539 1540 UV 1541 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1542 { 1543 dVAR; 1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold"); 1546 } 1547 1548 /* Note: 1549 * A "swash" is a swatch hash. 1550 * A "swatch" is a bit vector generated by utf8.c:S_swash_get(). 1551 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8". 1552 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl. 1553 */ 1554 SV* 1555 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none) 1556 { 1557 dVAR; 1558 SV* retval; 1559 dSP; 1560 const size_t pkg_len = strlen(pkg); 1561 const size_t name_len = strlen(name); 1562 HV * const stash = gv_stashpvn(pkg, pkg_len, 0); 1563 SV* errsv_save; 1564 1565 PUSHSTACKi(PERLSI_MAGIC); 1566 ENTER; 1567 SAVEI32(PL_hints); 1568 PL_hints = 0; 1569 save_re_context(); 1570 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */ 1571 ENTER; 1572 errsv_save = newSVsv(ERRSV); 1573 /* It is assumed that callers of this routine are not passing in any 1574 user derived data. */ 1575 /* Need to do this after save_re_context() as it will set PL_tainted to 1576 1 while saving $1 etc (see the code after getrx: in Perl_magic_get). 1577 Even line to create errsv_save can turn on PL_tainted. */ 1578 SAVEBOOL(PL_tainted); 1579 PL_tainted = 0; 1580 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), 1581 NULL); 1582 if (!SvTRUE(ERRSV)) 1583 sv_setsv(ERRSV, errsv_save); 1584 SvREFCNT_dec(errsv_save); 1585 LEAVE; 1586 } 1587 SPAGAIN; 1588 PUSHMARK(SP); 1589 EXTEND(SP,5); 1590 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len))); 1591 PUSHs(sv_2mortal(newSVpvn(name, name_len))); 1592 PUSHs(listsv); 1593 PUSHs(sv_2mortal(newSViv(minbits))); 1594 PUSHs(sv_2mortal(newSViv(none))); 1595 PUTBACK; 1596 errsv_save = newSVsv(ERRSV); 1597 if (call_method("SWASHNEW", G_SCALAR)) 1598 retval = newSVsv(*PL_stack_sp--); 1599 else 1600 retval = &PL_sv_undef; 1601 if (!SvTRUE(ERRSV)) 1602 sv_setsv(ERRSV, errsv_save); 1603 SvREFCNT_dec(errsv_save); 1604 LEAVE; 1605 POPSTACK; 1606 if (IN_PERL_COMPILETIME) { 1607 CopHINTS_set(PL_curcop, PL_hints); 1608 } 1609 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { 1610 if (SvPOK(retval)) 1611 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"", 1612 SVfARG(retval)); 1613 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); 1614 } 1615 return retval; 1616 } 1617 1618 1619 /* This API is wrong for special case conversions since we may need to 1620 * return several Unicode characters for a single Unicode character 1621 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is 1622 * the lower-level routine, and it is similarly broken for returning 1623 * multiple values. --jhi */ 1624 /* Now SWASHGET is recasted into S_swash_get in this file. */ 1625 1626 /* Note: 1627 * Returns the value of property/mapping C<swash> for the first character 1628 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is 1629 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is 1630 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>. 1631 */ 1632 UV 1633 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) 1634 { 1635 dVAR; 1636 HV* const hv = (HV*)SvRV(swash); 1637 U32 klen; 1638 U32 off; 1639 STRLEN slen; 1640 STRLEN needents; 1641 const U8 *tmps = NULL; 1642 U32 bit; 1643 SV *swatch; 1644 U8 tmputf8[2]; 1645 const UV c = NATIVE_TO_ASCII(*ptr); 1646 1647 if (!do_utf8 && !UNI_IS_INVARIANT(c)) { 1648 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); 1649 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c); 1650 ptr = tmputf8; 1651 } 1652 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ 1653 * then the "swatch" is a vec() for al the chars which start 1654 * with 0xAA..0xYY 1655 * So the key in the hash (klen) is length of encoded char -1 1656 */ 1657 klen = UTF8SKIP(ptr) - 1; 1658 off = ptr[klen]; 1659 1660 if (klen == 0) { 1661 /* If char in invariant then swatch is for all the invariant chars 1662 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK 1663 */ 1664 needents = UTF_CONTINUATION_MARK; 1665 off = NATIVE_TO_UTF(ptr[klen]); 1666 } 1667 else { 1668 /* If char is encoded then swatch is for the prefix */ 1669 needents = (1 << UTF_ACCUMULATION_SHIFT); 1670 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK; 1671 } 1672 1673 /* 1674 * This single-entry cache saves about 1/3 of the utf8 overhead in test 1675 * suite. (That is, only 7-8% overall over just a hash cache. Still, 1676 * it's nothing to sniff at.) Pity we usually come through at least 1677 * two function calls to get here... 1678 * 1679 * NB: this code assumes that swatches are never modified, once generated! 1680 */ 1681 1682 if (hv == PL_last_swash_hv && 1683 klen == PL_last_swash_klen && 1684 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) ) 1685 { 1686 tmps = PL_last_swash_tmps; 1687 slen = PL_last_swash_slen; 1688 } 1689 else { 1690 /* Try our second-level swatch cache, kept in a hash. */ 1691 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE); 1692 1693 /* If not cached, generate it via swash_get */ 1694 if (!svp || !SvPOK(*svp) 1695 || !(tmps = (const U8*)SvPV_const(*svp, slen))) { 1696 /* We use utf8n_to_uvuni() as we want an index into 1697 Unicode tables, not a native character number. 1698 */ 1699 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 1700 ckWARN(WARN_UTF8) ? 1701 0 : UTF8_ALLOW_ANY); 1702 swatch = swash_get(swash, 1703 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ 1704 (klen) ? (code_point & ~(needents - 1)) : 0, 1705 needents); 1706 1707 if (IN_PERL_COMPILETIME) 1708 CopHINTS_set(PL_curcop, PL_hints); 1709 1710 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0); 1711 1712 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) 1713 || (slen << 3) < needents) 1714 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch"); 1715 } 1716 1717 PL_last_swash_hv = hv; 1718 assert(klen <= sizeof(PL_last_swash_key)); 1719 PL_last_swash_klen = (U8)klen; 1720 /* FIXME change interpvar.h? */ 1721 PL_last_swash_tmps = (U8 *) tmps; 1722 PL_last_swash_slen = slen; 1723 if (klen) 1724 Copy(ptr, PL_last_swash_key, klen, U8); 1725 } 1726 1727 switch ((int)((slen << 3) / needents)) { 1728 case 1: 1729 bit = 1 << (off & 7); 1730 off >>= 3; 1731 return (tmps[off] & bit) != 0; 1732 case 8: 1733 return tmps[off]; 1734 case 16: 1735 off <<= 1; 1736 return (tmps[off] << 8) + tmps[off + 1] ; 1737 case 32: 1738 off <<= 2; 1739 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; 1740 } 1741 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width"); 1742 NORETURN_FUNCTION_END; 1743 } 1744 1745 /* Note: 1746 * Returns a swatch (a bit vector string) for a code point sequence 1747 * that starts from the value C<start> and comprises the number C<span>. 1748 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl). 1749 * Should be used via swash_fetch, which will cache the swatch in C<swash>. 1750 */ 1751 STATIC SV* 1752 S_swash_get(pTHX_ SV* swash, UV start, UV span) 1753 { 1754 SV *swatch; 1755 U8 *l, *lend, *x, *xend, *s; 1756 STRLEN lcur, xcur, scur; 1757 1758 HV* const hv = (HV*)SvRV(swash); 1759 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); 1760 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); 1761 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); 1762 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); 1763 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE); 1764 const U8* const typestr = (U8*)SvPV_nolen(*typesvp); 1765 const int typeto = typestr[0] == 'T' && typestr[1] == 'o'; 1766 const STRLEN bits = SvUV(*bitssvp); 1767 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ 1768 const UV none = SvUV(*nonesvp); 1769 const UV end = start + span; 1770 1771 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { 1772 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf, 1773 (UV)bits); 1774 } 1775 1776 /* create and initialize $swatch */ 1777 scur = octets ? (span * octets) : (span + 7) / 8; 1778 swatch = newSV(scur); 1779 SvPOK_on(swatch); 1780 s = (U8*)SvPVX(swatch); 1781 if (octets && none) { 1782 const U8* const e = s + scur; 1783 while (s < e) { 1784 if (bits == 8) 1785 *s++ = (U8)(none & 0xff); 1786 else if (bits == 16) { 1787 *s++ = (U8)((none >> 8) & 0xff); 1788 *s++ = (U8)( none & 0xff); 1789 } 1790 else if (bits == 32) { 1791 *s++ = (U8)((none >> 24) & 0xff); 1792 *s++ = (U8)((none >> 16) & 0xff); 1793 *s++ = (U8)((none >> 8) & 0xff); 1794 *s++ = (U8)( none & 0xff); 1795 } 1796 } 1797 *s = '\0'; 1798 } 1799 else { 1800 (void)memzero((U8*)s, scur + 1); 1801 } 1802 SvCUR_set(swatch, scur); 1803 s = (U8*)SvPVX(swatch); 1804 1805 /* read $swash->{LIST} */ 1806 l = (U8*)SvPV(*listsvp, lcur); 1807 lend = l + lcur; 1808 while (l < lend) { 1809 UV min, max, val; 1810 STRLEN numlen; 1811 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; 1812 1813 U8* const nl = (U8*)memchr(l, '\n', lend - l); 1814 1815 numlen = lend - l; 1816 min = grok_hex((char *)l, &numlen, &flags, NULL); 1817 if (numlen) 1818 l += numlen; 1819 else if (nl) { 1820 l = nl + 1; /* 1 is length of "\n" */ 1821 continue; 1822 } 1823 else { 1824 l = lend; /* to LIST's end at which \n is not found */ 1825 break; 1826 } 1827 1828 if (isBLANK(*l)) { 1829 ++l; 1830 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; 1831 numlen = lend - l; 1832 max = grok_hex((char *)l, &numlen, &flags, NULL); 1833 if (numlen) 1834 l += numlen; 1835 else 1836 max = min; 1837 1838 if (octets) { 1839 if (isBLANK(*l)) { 1840 ++l; 1841 flags = PERL_SCAN_SILENT_ILLDIGIT | 1842 PERL_SCAN_DISALLOW_PREFIX; 1843 numlen = lend - l; 1844 val = grok_hex((char *)l, &numlen, &flags, NULL); 1845 if (numlen) 1846 l += numlen; 1847 else 1848 val = 0; 1849 } 1850 else { 1851 val = 0; 1852 if (typeto) { 1853 Perl_croak(aTHX_ "%s: illegal mapping '%s'", 1854 typestr, l); 1855 } 1856 } 1857 } 1858 else 1859 val = 0; /* bits == 1, then val should be ignored */ 1860 } 1861 else { 1862 max = min; 1863 if (octets) { 1864 val = 0; 1865 if (typeto) { 1866 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l); 1867 } 1868 } 1869 else 1870 val = 0; /* bits == 1, then val should be ignored */ 1871 } 1872 1873 if (nl) 1874 l = nl + 1; 1875 else 1876 l = lend; 1877 1878 if (max < start) 1879 continue; 1880 1881 if (octets) { 1882 UV key; 1883 if (min < start) { 1884 if (!none || val < none) { 1885 val += start - min; 1886 } 1887 min = start; 1888 } 1889 for (key = min; key <= max; key++) { 1890 STRLEN offset; 1891 if (key >= end) 1892 goto go_out_list; 1893 /* offset must be non-negative (start <= min <= key < end) */ 1894 offset = octets * (key - start); 1895 if (bits == 8) 1896 s[offset] = (U8)(val & 0xff); 1897 else if (bits == 16) { 1898 s[offset ] = (U8)((val >> 8) & 0xff); 1899 s[offset + 1] = (U8)( val & 0xff); 1900 } 1901 else if (bits == 32) { 1902 s[offset ] = (U8)((val >> 24) & 0xff); 1903 s[offset + 1] = (U8)((val >> 16) & 0xff); 1904 s[offset + 2] = (U8)((val >> 8) & 0xff); 1905 s[offset + 3] = (U8)( val & 0xff); 1906 } 1907 1908 if (!none || val < none) 1909 ++val; 1910 } 1911 } 1912 else { /* bits == 1, then val should be ignored */ 1913 UV key; 1914 if (min < start) 1915 min = start; 1916 for (key = min; key <= max; key++) { 1917 const STRLEN offset = (STRLEN)(key - start); 1918 if (key >= end) 1919 goto go_out_list; 1920 s[offset >> 3] |= 1 << (offset & 7); 1921 } 1922 } 1923 } /* while */ 1924 go_out_list: 1925 1926 /* read $swash->{EXTRAS} */ 1927 x = (U8*)SvPV(*extssvp, xcur); 1928 xend = x + xcur; 1929 while (x < xend) { 1930 STRLEN namelen; 1931 U8 *namestr; 1932 SV** othersvp; 1933 HV* otherhv; 1934 STRLEN otherbits; 1935 SV **otherbitssvp, *other; 1936 U8 *s, *o, *nl; 1937 STRLEN slen, olen; 1938 1939 const U8 opc = *x++; 1940 if (opc == '\n') 1941 continue; 1942 1943 nl = (U8*)memchr(x, '\n', xend - x); 1944 1945 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { 1946 if (nl) { 1947 x = nl + 1; /* 1 is length of "\n" */ 1948 continue; 1949 } 1950 else { 1951 x = xend; /* to EXTRAS' end at which \n is not found */ 1952 break; 1953 } 1954 } 1955 1956 namestr = x; 1957 if (nl) { 1958 namelen = nl - namestr; 1959 x = nl + 1; 1960 } 1961 else { 1962 namelen = xend - namestr; 1963 x = xend; 1964 } 1965 1966 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); 1967 otherhv = (HV*)SvRV(*othersvp); 1968 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); 1969 otherbits = (STRLEN)SvUV(*otherbitssvp); 1970 if (bits < otherbits) 1971 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch"); 1972 1973 /* The "other" swatch must be destroyed after. */ 1974 other = swash_get(*othersvp, start, span); 1975 o = (U8*)SvPV(other, olen); 1976 1977 if (!olen) 1978 Perl_croak(aTHX_ "panic: swash_get got improper swatch"); 1979 1980 s = (U8*)SvPV(swatch, slen); 1981 if (bits == 1 && otherbits == 1) { 1982 if (slen != olen) 1983 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch"); 1984 1985 switch (opc) { 1986 case '+': 1987 while (slen--) 1988 *s++ |= *o++; 1989 break; 1990 case '!': 1991 while (slen--) 1992 *s++ |= ~*o++; 1993 break; 1994 case '-': 1995 while (slen--) 1996 *s++ &= ~*o++; 1997 break; 1998 case '&': 1999 while (slen--) 2000 *s++ &= *o++; 2001 break; 2002 default: 2003 break; 2004 } 2005 } 2006 else { 2007 STRLEN otheroctets = otherbits >> 3; 2008 STRLEN offset = 0; 2009 U8* const send = s + slen; 2010 2011 while (s < send) { 2012 UV otherval = 0; 2013 2014 if (otherbits == 1) { 2015 otherval = (o[offset >> 3] >> (offset & 7)) & 1; 2016 ++offset; 2017 } 2018 else { 2019 STRLEN vlen = otheroctets; 2020 otherval = *o++; 2021 while (--vlen) { 2022 otherval <<= 8; 2023 otherval |= *o++; 2024 } 2025 } 2026 2027 if (opc == '+' && otherval) 2028 NOOP; /* replace with otherval */ 2029 else if (opc == '!' && !otherval) 2030 otherval = 1; 2031 else if (opc == '-' && otherval) 2032 otherval = 0; 2033 else if (opc == '&' && !otherval) 2034 otherval = 0; 2035 else { 2036 s += octets; /* no replacement */ 2037 continue; 2038 } 2039 2040 if (bits == 8) 2041 *s++ = (U8)( otherval & 0xff); 2042 else if (bits == 16) { 2043 *s++ = (U8)((otherval >> 8) & 0xff); 2044 *s++ = (U8)( otherval & 0xff); 2045 } 2046 else if (bits == 32) { 2047 *s++ = (U8)((otherval >> 24) & 0xff); 2048 *s++ = (U8)((otherval >> 16) & 0xff); 2049 *s++ = (U8)((otherval >> 8) & 0xff); 2050 *s++ = (U8)( otherval & 0xff); 2051 } 2052 } 2053 } 2054 sv_free(other); /* through with it! */ 2055 } /* while */ 2056 return swatch; 2057 } 2058 2059 /* 2060 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv 2061 2062 Adds the UTF-8 representation of the Native codepoint C<uv> to the end 2063 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free 2064 bytes available. The return value is the pointer to the byte after the 2065 end of the new character. In other words, 2066 2067 d = uvchr_to_utf8(d, uv); 2068 2069 is the recommended wide native character-aware way of saying 2070 2071 *(d++) = uv; 2072 2073 =cut 2074 */ 2075 2076 /* On ASCII machines this is normally a macro but we want a 2077 real function in case XS code wants it 2078 */ 2079 U8 * 2080 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) 2081 { 2082 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0); 2083 } 2084 2085 U8 * 2086 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 2087 { 2088 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags); 2089 } 2090 2091 /* 2092 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 2093 flags 2094 2095 Returns the native character value of the first character in the string 2096 C<s> 2097 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 2098 length, in bytes, of that character. 2099 2100 Allows length and flags to be passed to low level routine. 2101 2102 =cut 2103 */ 2104 /* On ASCII machines this is normally a macro but we want 2105 a real function in case XS code wants it 2106 */ 2107 UV 2108 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, 2109 U32 flags) 2110 { 2111 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); 2112 return UNI_TO_NATIVE(uv); 2113 } 2114 2115 /* 2116 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags 2117 2118 Build to the scalar dsv a displayable version of the string spv, 2119 length len, the displayable version being at most pvlim bytes long 2120 (if longer, the rest is truncated and "..." will be appended). 2121 2122 The flags argument can have UNI_DISPLAY_ISPRINT set to display 2123 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH 2124 to display the \\[nrfta\\] as the backslashed versions (like '\n') 2125 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\). 2126 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both 2127 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on. 2128 2129 The pointer to the PV of the dsv is returned. 2130 2131 =cut */ 2132 char * 2133 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags) 2134 { 2135 int truncated = 0; 2136 const char *s, *e; 2137 2138 sv_setpvn(dsv, "", 0); 2139 SvUTF8_off(dsv); 2140 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { 2141 UV u; 2142 /* This serves double duty as a flag and a character to print after 2143 a \ when flags & UNI_DISPLAY_BACKSLASH is true. 2144 */ 2145 char ok = 0; 2146 2147 if (pvlim && SvCUR(dsv) >= pvlim) { 2148 truncated++; 2149 break; 2150 } 2151 u = utf8_to_uvchr((U8*)s, 0); 2152 if (u < 256) { 2153 const unsigned char c = (unsigned char)u & 0xFF; 2154 if (flags & UNI_DISPLAY_BACKSLASH) { 2155 switch (c) { 2156 case '\n': 2157 ok = 'n'; break; 2158 case '\r': 2159 ok = 'r'; break; 2160 case '\t': 2161 ok = 't'; break; 2162 case '\f': 2163 ok = 'f'; break; 2164 case '\a': 2165 ok = 'a'; break; 2166 case '\\': 2167 ok = '\\'; break; 2168 default: break; 2169 } 2170 if (ok) { 2171 const char string = ok; 2172 sv_catpvn(dsv, &string, 1); 2173 } 2174 } 2175 /* isPRINT() is the locale-blind version. */ 2176 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { 2177 const char string = c; 2178 sv_catpvn(dsv, &string, 1); 2179 ok = 1; 2180 } 2181 } 2182 if (!ok) 2183 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); 2184 } 2185 if (truncated) 2186 sv_catpvs(dsv, "..."); 2187 2188 return SvPVX(dsv); 2189 } 2190 2191 /* 2192 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags 2193 2194 Build to the scalar dsv a displayable version of the scalar sv, 2195 the displayable version being at most pvlim bytes long 2196 (if longer, the rest is truncated and "..." will be appended). 2197 2198 The flags argument is as in pv_uni_display(). 2199 2200 The pointer to the PV of the dsv is returned. 2201 2202 =cut 2203 */ 2204 char * 2205 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) 2206 { 2207 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv), 2208 SvCUR(ssv), pvlim, flags); 2209 } 2210 2211 /* 2212 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2 2213 2214 Return true if the strings s1 and s2 differ case-insensitively, false 2215 if not (if they are equal case-insensitively). If u1 is true, the 2216 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, 2217 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2 2218 are false, the respective string is assumed to be in native 8-bit 2219 encoding. 2220 2221 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied 2222 in there (they will point at the beginning of the I<next> character). 2223 If the pointers behind pe1 or pe2 are non-NULL, they are the end 2224 pointers beyond which scanning will not continue under any 2225 circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and 2226 s2+l2 will be used as goal end pointers that will also stop the scan, 2227 and which qualify towards defining a successful match: all the scans 2228 that define an explicit length must reach their goal pointers for 2229 a match to succeed). 2230 2231 For case-insensitiveness, the "casefolding" of Unicode is used 2232 instead of upper/lowercasing both the characters, see 2233 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings). 2234 2235 =cut */ 2236 I32 2237 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2) 2238 { 2239 dVAR; 2240 register const U8 *p1 = (const U8*)s1; 2241 register const U8 *p2 = (const U8*)s2; 2242 register const U8 *f1 = NULL; 2243 register const U8 *f2 = NULL; 2244 register U8 *e1 = NULL; 2245 register U8 *q1 = NULL; 2246 register U8 *e2 = NULL; 2247 register U8 *q2 = NULL; 2248 STRLEN n1 = 0, n2 = 0; 2249 U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; 2250 U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; 2251 U8 natbuf[1+1]; 2252 STRLEN foldlen1, foldlen2; 2253 bool match; 2254 2255 if (pe1) 2256 e1 = *(U8**)pe1; 2257 /* assert(e1 || l1); */ 2258 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1))) 2259 f1 = (const U8*)s1 + l1; 2260 if (pe2) 2261 e2 = *(U8**)pe2; 2262 /* assert(e2 || l2); */ 2263 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2))) 2264 f2 = (const U8*)s2 + l2; 2265 2266 /* This shouldn't happen. However, putting an assert() there makes some 2267 * tests fail. */ 2268 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */ 2269 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)) 2270 return 1; /* mismatch; possible infinite loop or false positive */ 2271 2272 if (!u1 || !u2) 2273 natbuf[1] = 0; /* Need to terminate the buffer. */ 2274 2275 while ((e1 == 0 || p1 < e1) && 2276 (f1 == 0 || p1 < f1) && 2277 (e2 == 0 || p2 < e2) && 2278 (f2 == 0 || p2 < f2)) { 2279 if (n1 == 0) { 2280 if (u1) 2281 to_utf8_fold(p1, foldbuf1, &foldlen1); 2282 else { 2283 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1))); 2284 to_utf8_fold(natbuf, foldbuf1, &foldlen1); 2285 } 2286 q1 = foldbuf1; 2287 n1 = foldlen1; 2288 } 2289 if (n2 == 0) { 2290 if (u2) 2291 to_utf8_fold(p2, foldbuf2, &foldlen2); 2292 else { 2293 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2))); 2294 to_utf8_fold(natbuf, foldbuf2, &foldlen2); 2295 } 2296 q2 = foldbuf2; 2297 n2 = foldlen2; 2298 } 2299 while (n1 && n2) { 2300 if ( UTF8SKIP(q1) != UTF8SKIP(q2) || 2301 (UTF8SKIP(q1) == 1 && *q1 != *q2) || 2302 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) ) 2303 return 1; /* mismatch */ 2304 n1 -= UTF8SKIP(q1); 2305 q1 += UTF8SKIP(q1); 2306 n2 -= UTF8SKIP(q2); 2307 q2 += UTF8SKIP(q2); 2308 } 2309 if (n1 == 0) 2310 p1 += u1 ? UTF8SKIP(p1) : 1; 2311 if (n2 == 0) 2312 p2 += u2 ? UTF8SKIP(p2) : 1; 2313 2314 } 2315 2316 /* A match is defined by all the scans that specified 2317 * an explicit length reaching their final goals. */ 2318 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2); 2319 2320 if (match) { 2321 if (pe1) 2322 *pe1 = (char*)p1; 2323 if (pe2) 2324 *pe2 = (char*)p2; 2325 } 2326 2327 return match ? 0 : 1; /* 0 match, 1 mismatch */ 2328 } 2329 2330 /* 2331 * Local variables: 2332 * c-indentation-style: bsd 2333 * c-basic-offset: 4 2334 * indent-tabs-mode: t 2335 * End: 2336 * 2337 * ex: set ts=8 sts=4 sw=4 noet: 2338 */ 2339