1 /* utf8.c 2 * 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever 13 * heard of that we don't want to see any closer; and that's the one place 14 * we're trying to get to! And that's just where we can't get, nohow.' 15 * 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sm�agol"] 17 * 18 * 'Well do I understand your speech,' he answered in the same language; 19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue, 20 * as is the custom in the West, if you wish to be answered?' 21 * --Gandalf, addressing Th�oden's door wardens 22 * 23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 24 * 25 * ...the travellers perceived that the floor was paved with stones of many 26 * hues; branching runes and strange devices intertwined beneath their feet. 27 * 28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_UTF8_C 33 #include "perl.h" 34 35 #ifndef EBCDIC 36 /* Separate prototypes needed because in ASCII systems these 37 * usually macros but they still are compiled as code, too. */ 38 PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags); 39 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); 40 #endif 41 42 static const char unees[] = 43 "Malformed UTF-8 character (unexpected end of string)"; 44 45 /* 46 =head1 Unicode Support 47 48 This file contains various utility functions for manipulating UTF8-encoded 49 strings. For the uninitiated, this is a method of representing arbitrary 50 Unicode characters as a variable number of bytes, in such a way that 51 characters in the ASCII range are unmodified, and a zero byte never appears 52 within non-zero characters. 53 54 =for apidoc uvuni_to_utf8_flags 55 56 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end 57 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free 58 bytes available. The return value is the pointer to the byte after the 59 end of the new character. In other words, 60 61 d = uvuni_to_utf8_flags(d, uv, flags); 62 63 or, in most cases, 64 65 d = uvuni_to_utf8(d, uv); 66 67 (which is equivalent to) 68 69 d = uvuni_to_utf8_flags(d, uv, 0); 70 71 is the recommended Unicode-aware way of saying 72 73 *(d++) = uv; 74 75 =cut 76 */ 77 78 U8 * 79 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 80 { 81 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; 82 83 if (ckWARN(WARN_UTF8)) { 84 if (UNICODE_IS_SURROGATE(uv) && 85 !(flags & UNICODE_ALLOW_SURROGATE)) 86 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv); 87 else if ( 88 ((uv >= 0xFDD0 && uv <= 0xFDEF && 89 !(flags & UNICODE_ALLOW_FDD0)) 90 || 91 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */ 92 !(flags & UNICODE_ALLOW_FFFF))) && 93 /* UNICODE_ALLOW_SUPER includes 94 * FFFEs and FFFFs beyond 0x10FFFF. */ 95 ((uv <= PERL_UNICODE_MAX) || 96 !(flags & UNICODE_ALLOW_SUPER)) 97 ) 98 Perl_warner(aTHX_ packWARN(WARN_UTF8), 99 "Unicode character 0x%04"UVxf" is illegal", uv); 100 } 101 if (UNI_IS_INVARIANT(uv)) { 102 *d++ = (U8)UTF_TO_NATIVE(uv); 103 return d; 104 } 105 #if defined(EBCDIC) 106 else { 107 STRLEN len = UNISKIP(uv); 108 U8 *p = d+len-1; 109 while (p > d) { 110 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); 111 uv >>= UTF_ACCUMULATION_SHIFT; 112 } 113 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); 114 return d+len; 115 } 116 #else /* Non loop style */ 117 if (uv < 0x800) { 118 *d++ = (U8)(( uv >> 6) | 0xc0); 119 *d++ = (U8)(( uv & 0x3f) | 0x80); 120 return d; 121 } 122 if (uv < 0x10000) { 123 *d++ = (U8)(( uv >> 12) | 0xe0); 124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 125 *d++ = (U8)(( uv & 0x3f) | 0x80); 126 return d; 127 } 128 if (uv < 0x200000) { 129 *d++ = (U8)(( uv >> 18) | 0xf0); 130 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 131 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 132 *d++ = (U8)(( uv & 0x3f) | 0x80); 133 return d; 134 } 135 if (uv < 0x4000000) { 136 *d++ = (U8)(( uv >> 24) | 0xf8); 137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 140 *d++ = (U8)(( uv & 0x3f) | 0x80); 141 return d; 142 } 143 if (uv < 0x80000000) { 144 *d++ = (U8)(( uv >> 30) | 0xfc); 145 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 146 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 147 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 148 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 149 *d++ = (U8)(( uv & 0x3f) | 0x80); 150 return d; 151 } 152 #ifdef HAS_QUAD 153 if (uv < UTF8_QUAD_MAX) 154 #endif 155 { 156 *d++ = 0xfe; /* Can't match U+FEFF! */ 157 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); 158 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 159 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 160 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 161 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 162 *d++ = (U8)(( uv & 0x3f) | 0x80); 163 return d; 164 } 165 #ifdef HAS_QUAD 166 { 167 *d++ = 0xff; /* Can't match U+FFFE! */ 168 *d++ = 0x80; /* 6 Reserved bits */ 169 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */ 170 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80); 171 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80); 172 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80); 173 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80); 174 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); 175 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 176 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 177 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 178 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 179 *d++ = (U8)(( uv & 0x3f) | 0x80); 180 return d; 181 } 182 #endif 183 #endif /* Loop style */ 184 } 185 186 /* 187 188 Tests if some arbitrary number of bytes begins in a valid UTF-8 189 character. Note that an INVARIANT (i.e. ASCII) character is a valid 190 UTF-8 character. The actual number of bytes in the UTF-8 character 191 will be returned if it is valid, otherwise 0. 192 193 This is the "slow" version as opposed to the "fast" version which is 194 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed 195 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four 196 or less you should use the IS_UTF8_CHAR(), for lengths of five or more 197 you should use the _slow(). In practice this means that the _slow() 198 will be used very rarely, since the maximum Unicode code point (as of 199 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only 200 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into 201 five bytes or more. 202 203 =cut */ 204 STATIC STRLEN 205 S_is_utf8_char_slow(const U8 *s, const STRLEN len) 206 { 207 U8 u = *s; 208 STRLEN slen; 209 UV uv, ouv; 210 211 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW; 212 213 if (UTF8_IS_INVARIANT(u)) 214 return 1; 215 216 if (!UTF8_IS_START(u)) 217 return 0; 218 219 if (len < 2 || !UTF8_IS_CONTINUATION(s[1])) 220 return 0; 221 222 slen = len - 1; 223 s++; 224 #ifdef EBCDIC 225 u = NATIVE_TO_UTF(u); 226 #endif 227 u &= UTF_START_MASK(len); 228 uv = u; 229 ouv = uv; 230 while (slen--) { 231 if (!UTF8_IS_CONTINUATION(*s)) 232 return 0; 233 uv = UTF8_ACCUMULATE(uv, *s); 234 if (uv < ouv) 235 return 0; 236 ouv = uv; 237 s++; 238 } 239 240 if ((STRLEN)UNISKIP(uv) < len) 241 return 0; 242 243 return len; 244 } 245 246 /* 247 =for apidoc is_utf8_char 248 249 Tests if some arbitrary number of bytes begins in a valid UTF-8 250 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) 251 character is a valid UTF-8 character. The actual number of bytes in the UTF-8 252 character will be returned if it is valid, otherwise 0. 253 254 =cut */ 255 STRLEN 256 Perl_is_utf8_char(pTHX_ const U8 *s) 257 { 258 const STRLEN len = UTF8SKIP(s); 259 260 PERL_ARGS_ASSERT_IS_UTF8_CHAR; 261 PERL_UNUSED_CONTEXT; 262 #ifdef IS_UTF8_CHAR 263 if (IS_UTF8_CHAR_FAST(len)) 264 return IS_UTF8_CHAR(s, len) ? len : 0; 265 #endif /* #ifdef IS_UTF8_CHAR */ 266 return is_utf8_char_slow(s, len); 267 } 268 269 /* 270 =for apidoc is_utf8_string 271 272 Returns true if first C<len> bytes of the given string form a valid 273 UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does 274 not mean 'a string that contains code points above 0x7F encoded in UTF-8' 275 because a valid ASCII string is a valid UTF-8 string. 276 277 See also is_utf8_string_loclen() and is_utf8_string_loc(). 278 279 =cut 280 */ 281 282 bool 283 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) 284 { 285 const U8* const send = s + (len ? len : strlen((const char *)s)); 286 const U8* x = s; 287 288 PERL_ARGS_ASSERT_IS_UTF8_STRING; 289 PERL_UNUSED_CONTEXT; 290 291 while (x < send) { 292 STRLEN c; 293 /* Inline the easy bits of is_utf8_char() here for speed... */ 294 if (UTF8_IS_INVARIANT(*x)) 295 c = 1; 296 else if (!UTF8_IS_START(*x)) 297 goto out; 298 else { 299 /* ... and call is_utf8_char() only if really needed. */ 300 #ifdef IS_UTF8_CHAR 301 c = UTF8SKIP(x); 302 if (IS_UTF8_CHAR_FAST(c)) { 303 if (!IS_UTF8_CHAR(x, c)) 304 c = 0; 305 } 306 else 307 c = is_utf8_char_slow(x, c); 308 #else 309 c = is_utf8_char(x); 310 #endif /* #ifdef IS_UTF8_CHAR */ 311 if (!c) 312 goto out; 313 } 314 x += c; 315 } 316 317 out: 318 if (x != send) 319 return FALSE; 320 321 return TRUE; 322 } 323 324 /* 325 Implemented as a macro in utf8.h 326 327 =for apidoc is_utf8_string_loc 328 329 Like is_utf8_string() but stores the location of the failure (in the 330 case of "utf8ness failure") or the location s+len (in the case of 331 "utf8ness success") in the C<ep>. 332 333 See also is_utf8_string_loclen() and is_utf8_string(). 334 335 =for apidoc is_utf8_string_loclen 336 337 Like is_utf8_string() but stores the location of the failure (in the 338 case of "utf8ness failure") or the location s+len (in the case of 339 "utf8ness success") in the C<ep>, and the number of UTF-8 340 encoded characters in the C<el>. 341 342 See also is_utf8_string_loc() and is_utf8_string(). 343 344 =cut 345 */ 346 347 bool 348 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 349 { 350 const U8* const send = s + (len ? len : strlen((const char *)s)); 351 const U8* x = s; 352 STRLEN c; 353 STRLEN outlen = 0; 354 355 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; 356 PERL_UNUSED_CONTEXT; 357 358 while (x < send) { 359 /* Inline the easy bits of is_utf8_char() here for speed... */ 360 if (UTF8_IS_INVARIANT(*x)) 361 c = 1; 362 else if (!UTF8_IS_START(*x)) 363 goto out; 364 else { 365 /* ... and call is_utf8_char() only if really needed. */ 366 #ifdef IS_UTF8_CHAR 367 c = UTF8SKIP(x); 368 if (IS_UTF8_CHAR_FAST(c)) { 369 if (!IS_UTF8_CHAR(x, c)) 370 c = 0; 371 } else 372 c = is_utf8_char_slow(x, c); 373 #else 374 c = is_utf8_char(x); 375 #endif /* #ifdef IS_UTF8_CHAR */ 376 if (!c) 377 goto out; 378 } 379 x += c; 380 outlen++; 381 } 382 383 out: 384 if (el) 385 *el = outlen; 386 387 if (ep) 388 *ep = x; 389 return (x == send); 390 } 391 392 /* 393 394 =for apidoc utf8n_to_uvuni 395 396 Bottom level UTF-8 decode routine. 397 Returns the Unicode code point value of the first character in the string C<s> 398 which is assumed to be in UTF-8 encoding and no longer than C<curlen>; 399 C<retlen> will be set to the length, in bytes, of that character. 400 401 If C<s> does not point to a well-formed UTF-8 character, the behaviour 402 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, 403 it is assumed that the caller will raise a warning, and this function 404 will silently just set C<retlen> to C<-1> and return zero. If the 405 C<flags> does not contain UTF8_CHECK_ONLY, warnings about 406 malformations will be given, C<retlen> will be set to the expected 407 length of the UTF-8 character in bytes, and zero will be returned. 408 409 The C<flags> can also contain various flags to allow deviations from 410 the strict UTF-8 encoding (see F<utf8.h>). 411 412 Most code should use utf8_to_uvchr() rather than call this directly. 413 414 =cut 415 */ 416 417 UV 418 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) 419 { 420 dVAR; 421 const U8 * const s0 = s; 422 UV uv = *s, ouv = 0; 423 STRLEN len = 1; 424 const bool dowarn = ckWARN_d(WARN_UTF8); 425 const UV startbyte = *s; 426 STRLEN expectlen = 0; 427 U32 warning = 0; 428 429 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; 430 431 /* This list is a superset of the UTF8_ALLOW_XXX. */ 432 433 #define UTF8_WARN_EMPTY 1 434 #define UTF8_WARN_CONTINUATION 2 435 #define UTF8_WARN_NON_CONTINUATION 3 436 #define UTF8_WARN_FE_FF 4 437 #define UTF8_WARN_SHORT 5 438 #define UTF8_WARN_OVERFLOW 6 439 #define UTF8_WARN_SURROGATE 7 440 #define UTF8_WARN_LONG 8 441 #define UTF8_WARN_FFFF 9 /* Also FFFE. */ 442 443 if (curlen == 0 && 444 !(flags & UTF8_ALLOW_EMPTY)) { 445 warning = UTF8_WARN_EMPTY; 446 goto malformed; 447 } 448 449 if (UTF8_IS_INVARIANT(uv)) { 450 if (retlen) 451 *retlen = 1; 452 return (UV) (NATIVE_TO_UTF(*s)); 453 } 454 455 if (UTF8_IS_CONTINUATION(uv) && 456 !(flags & UTF8_ALLOW_CONTINUATION)) { 457 warning = UTF8_WARN_CONTINUATION; 458 goto malformed; 459 } 460 461 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) && 462 !(flags & UTF8_ALLOW_NON_CONTINUATION)) { 463 warning = UTF8_WARN_NON_CONTINUATION; 464 goto malformed; 465 } 466 467 #ifdef EBCDIC 468 uv = NATIVE_TO_UTF(uv); 469 #else 470 if ((uv == 0xfe || uv == 0xff) && 471 !(flags & UTF8_ALLOW_FE_FF)) { 472 warning = UTF8_WARN_FE_FF; 473 goto malformed; 474 } 475 #endif 476 477 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; } 478 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; } 479 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; } 480 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; } 481 #ifdef EBCDIC 482 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } 483 else { len = 7; uv &= 0x01; } 484 #else 485 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } 486 else if (!(uv & 0x01)) { len = 7; uv = 0; } 487 else { len = 13; uv = 0; } /* whoa! */ 488 #endif 489 490 if (retlen) 491 *retlen = len; 492 493 expectlen = len; 494 495 if ((curlen < expectlen) && 496 !(flags & UTF8_ALLOW_SHORT)) { 497 warning = UTF8_WARN_SHORT; 498 goto malformed; 499 } 500 501 len--; 502 s++; 503 ouv = uv; 504 505 while (len--) { 506 if (!UTF8_IS_CONTINUATION(*s) && 507 !(flags & UTF8_ALLOW_NON_CONTINUATION)) { 508 s--; 509 warning = UTF8_WARN_NON_CONTINUATION; 510 goto malformed; 511 } 512 else 513 uv = UTF8_ACCUMULATE(uv, *s); 514 if (!(uv > ouv)) { 515 /* These cannot be allowed. */ 516 if (uv == ouv) { 517 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) { 518 warning = UTF8_WARN_LONG; 519 goto malformed; 520 } 521 } 522 else { /* uv < ouv */ 523 /* This cannot be allowed. */ 524 warning = UTF8_WARN_OVERFLOW; 525 goto malformed; 526 } 527 } 528 s++; 529 ouv = uv; 530 } 531 532 if (UNICODE_IS_SURROGATE(uv) && 533 !(flags & UTF8_ALLOW_SURROGATE)) { 534 warning = UTF8_WARN_SURROGATE; 535 goto malformed; 536 } else if ((expectlen > (STRLEN)UNISKIP(uv)) && 537 !(flags & UTF8_ALLOW_LONG)) { 538 warning = UTF8_WARN_LONG; 539 goto malformed; 540 } else if (UNICODE_IS_ILLEGAL(uv) && 541 !(flags & UTF8_ALLOW_FFFF)) { 542 warning = UTF8_WARN_FFFF; 543 goto malformed; 544 } 545 546 return uv; 547 548 malformed: 549 550 if (flags & UTF8_CHECK_ONLY) { 551 if (retlen) 552 *retlen = ((STRLEN) -1); 553 return 0; 554 } 555 556 if (dowarn) { 557 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP); 558 559 switch (warning) { 560 case 0: /* Intentionally empty. */ break; 561 case UTF8_WARN_EMPTY: 562 sv_catpvs(sv, "(empty string)"); 563 break; 564 case UTF8_WARN_CONTINUATION: 565 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv); 566 break; 567 case UTF8_WARN_NON_CONTINUATION: 568 if (s == s0) 569 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")", 570 (UV)s[1], startbyte); 571 else { 572 const int len = (int)(s-s0); 573 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)", 574 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen); 575 } 576 577 break; 578 case UTF8_WARN_FE_FF: 579 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); 580 break; 581 case UTF8_WARN_SHORT: 582 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", 583 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte); 584 expectlen = curlen; /* distance for caller to skip */ 585 break; 586 case UTF8_WARN_OVERFLOW: 587 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")", 588 ouv, *s, startbyte); 589 break; 590 case UTF8_WARN_SURROGATE: 591 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); 592 break; 593 case UTF8_WARN_LONG: 594 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", 595 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); 596 break; 597 case UTF8_WARN_FFFF: 598 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv); 599 break; 600 default: 601 sv_catpvs(sv, "(unknown reason)"); 602 break; 603 } 604 605 if (warning) { 606 const char * const s = SvPVX_const(sv); 607 608 if (PL_op) 609 Perl_warner(aTHX_ packWARN(WARN_UTF8), 610 "%s in %s", s, OP_DESC(PL_op)); 611 else 612 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s); 613 } 614 } 615 616 if (retlen) 617 *retlen = expectlen ? expectlen : len; 618 619 return 0; 620 } 621 622 /* 623 =for apidoc utf8_to_uvchr 624 625 Returns the native character value of the first character in the string C<s> 626 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 627 length, in bytes, of that character. 628 629 If C<s> does not point to a well-formed UTF-8 character, zero is 630 returned and retlen is set, if possible, to -1. 631 632 =cut 633 */ 634 635 UV 636 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) 637 { 638 PERL_ARGS_ASSERT_UTF8_TO_UVCHR; 639 640 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen, 641 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 642 } 643 644 /* 645 =for apidoc utf8_to_uvuni 646 647 Returns the Unicode code point of the first character in the string C<s> 648 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 649 length, in bytes, of that character. 650 651 This function should only be used when the returned UV is considered 652 an index into the Unicode semantic tables (e.g. swashes). 653 654 If C<s> does not point to a well-formed UTF-8 character, zero is 655 returned and retlen is set, if possible, to -1. 656 657 =cut 658 */ 659 660 UV 661 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) 662 { 663 PERL_ARGS_ASSERT_UTF8_TO_UVUNI; 664 665 /* Call the low level routine asking for checks */ 666 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen, 667 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 668 } 669 670 /* 671 =for apidoc utf8_length 672 673 Return the length of the UTF-8 char encoded string C<s> in characters. 674 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end 675 up past C<e>, croaks. 676 677 =cut 678 */ 679 680 STRLEN 681 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) 682 { 683 dVAR; 684 STRLEN len = 0; 685 686 PERL_ARGS_ASSERT_UTF8_LENGTH; 687 688 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. 689 * the bitops (especially ~) can create illegal UTF-8. 690 * In other words: in Perl UTF-8 is not just for Unicode. */ 691 692 if (e < s) 693 goto warn_and_return; 694 while (s < e) { 695 if (!UTF8_IS_INVARIANT(*s)) 696 s += UTF8SKIP(s); 697 else 698 s++; 699 len++; 700 } 701 702 if (e != s) { 703 len--; 704 warn_and_return: 705 if (ckWARN_d(WARN_UTF8)) { 706 if (PL_op) 707 Perl_warner(aTHX_ packWARN(WARN_UTF8), 708 "%s in %s", unees, OP_DESC(PL_op)); 709 else 710 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); 711 } 712 } 713 714 return len; 715 } 716 717 /* 718 =for apidoc utf8_distance 719 720 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 721 and C<b>. 722 723 WARNING: use only if you *know* that the pointers point inside the 724 same UTF-8 buffer. 725 726 =cut 727 */ 728 729 IV 730 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 731 { 732 PERL_ARGS_ASSERT_UTF8_DISTANCE; 733 734 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 735 } 736 737 /* 738 =for apidoc utf8_hop 739 740 Return the UTF-8 pointer C<s> displaced by C<off> characters, either 741 forward or backward. 742 743 WARNING: do not use the following unless you *know* C<off> is within 744 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned 745 on the first byte of character or just after the last byte of a character. 746 747 =cut 748 */ 749 750 U8 * 751 Perl_utf8_hop(pTHX_ const U8 *s, I32 off) 752 { 753 PERL_ARGS_ASSERT_UTF8_HOP; 754 755 PERL_UNUSED_CONTEXT; 756 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 757 * the bitops (especially ~) can create illegal UTF-8. 758 * In other words: in Perl UTF-8 is not just for Unicode. */ 759 760 if (off >= 0) { 761 while (off--) 762 s += UTF8SKIP(s); 763 } 764 else { 765 while (off++) { 766 s--; 767 while (UTF8_IS_CONTINUATION(*s)) 768 s--; 769 } 770 } 771 return (U8 *)s; 772 } 773 774 /* 775 =for apidoc utf8_to_bytes 776 777 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. 778 Unlike C<bytes_to_utf8>, this over-writes the original string, and 779 updates len to contain the new length. 780 Returns zero on failure, setting C<len> to -1. 781 782 If you need a copy of the string, see C<bytes_from_utf8>. 783 784 =cut 785 */ 786 787 U8 * 788 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) 789 { 790 U8 * const save = s; 791 U8 * const send = s + *len; 792 U8 *d; 793 794 PERL_ARGS_ASSERT_UTF8_TO_BYTES; 795 796 /* ensure valid UTF-8 and chars < 256 before updating string */ 797 while (s < send) { 798 U8 c = *s++; 799 800 if (!UTF8_IS_INVARIANT(c) && 801 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send) 802 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) { 803 *len = ((STRLEN) -1); 804 return 0; 805 } 806 } 807 808 d = s = save; 809 while (s < send) { 810 STRLEN ulen; 811 *d++ = (U8)utf8_to_uvchr(s, &ulen); 812 s += ulen; 813 } 814 *d = '\0'; 815 *len = d - save; 816 return save; 817 } 818 819 /* 820 =for apidoc bytes_from_utf8 821 822 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. 823 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to 824 the newly-created string, and updates C<len> to contain the new 825 length. Returns the original string if no conversion occurs, C<len> 826 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to 827 0 if C<s> is converted or consisted entirely of characters that are invariant 828 in utf8 (i.e., US-ASCII on non-EBCDIC machines). 829 830 =cut 831 */ 832 833 U8 * 834 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) 835 { 836 U8 *d; 837 const U8 *start = s; 838 const U8 *send; 839 I32 count = 0; 840 841 PERL_ARGS_ASSERT_BYTES_FROM_UTF8; 842 843 PERL_UNUSED_CONTEXT; 844 if (!*is_utf8) 845 return (U8 *)start; 846 847 /* ensure valid UTF-8 and chars < 256 before converting string */ 848 for (send = s + *len; s < send;) { 849 U8 c = *s++; 850 if (!UTF8_IS_INVARIANT(c)) { 851 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send && 852 (c = *s++) && UTF8_IS_CONTINUATION(c)) 853 count++; 854 else 855 return (U8 *)start; 856 } 857 } 858 859 *is_utf8 = FALSE; 860 861 Newx(d, (*len) - count + 1, U8); 862 s = start; start = d; 863 while (s < send) { 864 U8 c = *s++; 865 if (!UTF8_IS_INVARIANT(c)) { 866 /* Then it is two-byte encoded */ 867 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++); 868 c = ASCII_TO_NATIVE(c); 869 } 870 *d++ = c; 871 } 872 *d = '\0'; 873 *len = d - start; 874 return (U8 *)start; 875 } 876 877 /* 878 =for apidoc bytes_to_utf8 879 880 Converts a string C<s> of length C<len> from the native encoding into UTF-8. 881 Returns a pointer to the newly-created string, and sets C<len> to 882 reflect the new length. 883 884 A NUL character will be written after the end of the string. 885 886 If you want to convert to UTF-8 from encodings other than 887 the native (Latin1 or EBCDIC), 888 see sv_recode_to_utf8(). 889 890 =cut 891 */ 892 893 U8* 894 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) 895 { 896 const U8 * const send = s + (*len); 897 U8 *d; 898 U8 *dst; 899 900 PERL_ARGS_ASSERT_BYTES_TO_UTF8; 901 PERL_UNUSED_CONTEXT; 902 903 Newx(d, (*len) * 2 + 1, U8); 904 dst = d; 905 906 while (s < send) { 907 const UV uv = NATIVE_TO_ASCII(*s++); 908 if (UNI_IS_INVARIANT(uv)) 909 *d++ = (U8)UTF_TO_NATIVE(uv); 910 else { 911 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); 912 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); 913 } 914 } 915 *d = '\0'; 916 *len = d-dst; 917 return dst; 918 } 919 920 /* 921 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. 922 * 923 * Destination must be pre-extended to 3/2 source. Do not use in-place. 924 * We optimize for native, for obvious reasons. */ 925 926 U8* 927 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) 928 { 929 U8* pend; 930 U8* dstart = d; 931 932 PERL_ARGS_ASSERT_UTF16_TO_UTF8; 933 934 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */ 935 d[0] = 0; 936 *newlen = 1; 937 return d; 938 } 939 940 if (bytelen & 1) 941 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); 942 943 pend = p + bytelen; 944 945 while (p < pend) { 946 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ 947 p += 2; 948 if (uv < 0x80) { 949 #ifdef EBCDIC 950 *d++ = UNI_TO_NATIVE(uv); 951 #else 952 *d++ = (U8)uv; 953 #endif 954 continue; 955 } 956 if (uv < 0x800) { 957 *d++ = (U8)(( uv >> 6) | 0xc0); 958 *d++ = (U8)(( uv & 0x3f) | 0x80); 959 continue; 960 } 961 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ 962 UV low = (p[0] << 8) + p[1]; 963 p += 2; 964 if (low < 0xdc00 || low >= 0xdfff) 965 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 966 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; 967 } 968 if (uv < 0x10000) { 969 *d++ = (U8)(( uv >> 12) | 0xe0); 970 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 971 *d++ = (U8)(( uv & 0x3f) | 0x80); 972 continue; 973 } 974 else { 975 *d++ = (U8)(( uv >> 18) | 0xf0); 976 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 977 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 978 *d++ = (U8)(( uv & 0x3f) | 0x80); 979 continue; 980 } 981 } 982 *newlen = d - dstart; 983 return d; 984 } 985 986 /* Note: this one is slightly destructive of the source. */ 987 988 U8* 989 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) 990 { 991 U8* s = (U8*)p; 992 U8* const send = s + bytelen; 993 994 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; 995 996 while (s < send) { 997 const U8 tmp = s[0]; 998 s[0] = s[1]; 999 s[1] = tmp; 1000 s += 2; 1001 } 1002 return utf16_to_utf8(p, d, bytelen, newlen); 1003 } 1004 1005 /* for now these are all defined (inefficiently) in terms of the utf8 versions */ 1006 1007 bool 1008 Perl_is_uni_alnum(pTHX_ UV c) 1009 { 1010 U8 tmpbuf[UTF8_MAXBYTES+1]; 1011 uvchr_to_utf8(tmpbuf, c); 1012 return is_utf8_alnum(tmpbuf); 1013 } 1014 1015 bool 1016 Perl_is_uni_alnumc(pTHX_ UV c) 1017 { 1018 U8 tmpbuf[UTF8_MAXBYTES+1]; 1019 uvchr_to_utf8(tmpbuf, c); 1020 return is_utf8_alnumc(tmpbuf); 1021 } 1022 1023 bool 1024 Perl_is_uni_idfirst(pTHX_ UV c) 1025 { 1026 U8 tmpbuf[UTF8_MAXBYTES+1]; 1027 uvchr_to_utf8(tmpbuf, c); 1028 return is_utf8_idfirst(tmpbuf); 1029 } 1030 1031 bool 1032 Perl_is_uni_alpha(pTHX_ UV c) 1033 { 1034 U8 tmpbuf[UTF8_MAXBYTES+1]; 1035 uvchr_to_utf8(tmpbuf, c); 1036 return is_utf8_alpha(tmpbuf); 1037 } 1038 1039 bool 1040 Perl_is_uni_ascii(pTHX_ UV c) 1041 { 1042 U8 tmpbuf[UTF8_MAXBYTES+1]; 1043 uvchr_to_utf8(tmpbuf, c); 1044 return is_utf8_ascii(tmpbuf); 1045 } 1046 1047 bool 1048 Perl_is_uni_space(pTHX_ UV c) 1049 { 1050 U8 tmpbuf[UTF8_MAXBYTES+1]; 1051 uvchr_to_utf8(tmpbuf, c); 1052 return is_utf8_space(tmpbuf); 1053 } 1054 1055 bool 1056 Perl_is_uni_digit(pTHX_ UV c) 1057 { 1058 U8 tmpbuf[UTF8_MAXBYTES+1]; 1059 uvchr_to_utf8(tmpbuf, c); 1060 return is_utf8_digit(tmpbuf); 1061 } 1062 1063 bool 1064 Perl_is_uni_upper(pTHX_ UV c) 1065 { 1066 U8 tmpbuf[UTF8_MAXBYTES+1]; 1067 uvchr_to_utf8(tmpbuf, c); 1068 return is_utf8_upper(tmpbuf); 1069 } 1070 1071 bool 1072 Perl_is_uni_lower(pTHX_ UV c) 1073 { 1074 U8 tmpbuf[UTF8_MAXBYTES+1]; 1075 uvchr_to_utf8(tmpbuf, c); 1076 return is_utf8_lower(tmpbuf); 1077 } 1078 1079 bool 1080 Perl_is_uni_cntrl(pTHX_ UV c) 1081 { 1082 U8 tmpbuf[UTF8_MAXBYTES+1]; 1083 uvchr_to_utf8(tmpbuf, c); 1084 return is_utf8_cntrl(tmpbuf); 1085 } 1086 1087 bool 1088 Perl_is_uni_graph(pTHX_ UV c) 1089 { 1090 U8 tmpbuf[UTF8_MAXBYTES+1]; 1091 uvchr_to_utf8(tmpbuf, c); 1092 return is_utf8_graph(tmpbuf); 1093 } 1094 1095 bool 1096 Perl_is_uni_print(pTHX_ UV c) 1097 { 1098 U8 tmpbuf[UTF8_MAXBYTES+1]; 1099 uvchr_to_utf8(tmpbuf, c); 1100 return is_utf8_print(tmpbuf); 1101 } 1102 1103 bool 1104 Perl_is_uni_punct(pTHX_ UV c) 1105 { 1106 U8 tmpbuf[UTF8_MAXBYTES+1]; 1107 uvchr_to_utf8(tmpbuf, c); 1108 return is_utf8_punct(tmpbuf); 1109 } 1110 1111 bool 1112 Perl_is_uni_xdigit(pTHX_ UV c) 1113 { 1114 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1115 uvchr_to_utf8(tmpbuf, c); 1116 return is_utf8_xdigit(tmpbuf); 1117 } 1118 1119 UV 1120 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) 1121 { 1122 PERL_ARGS_ASSERT_TO_UNI_UPPER; 1123 1124 uvchr_to_utf8(p, c); 1125 return to_utf8_upper(p, p, lenp); 1126 } 1127 1128 UV 1129 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) 1130 { 1131 PERL_ARGS_ASSERT_TO_UNI_TITLE; 1132 1133 uvchr_to_utf8(p, c); 1134 return to_utf8_title(p, p, lenp); 1135 } 1136 1137 UV 1138 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) 1139 { 1140 PERL_ARGS_ASSERT_TO_UNI_LOWER; 1141 1142 uvchr_to_utf8(p, c); 1143 return to_utf8_lower(p, p, lenp); 1144 } 1145 1146 UV 1147 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp) 1148 { 1149 PERL_ARGS_ASSERT_TO_UNI_FOLD; 1150 1151 uvchr_to_utf8(p, c); 1152 return to_utf8_fold(p, p, lenp); 1153 } 1154 1155 /* for now these all assume no locale info available for Unicode > 255 */ 1156 1157 bool 1158 Perl_is_uni_alnum_lc(pTHX_ UV c) 1159 { 1160 return is_uni_alnum(c); /* XXX no locale support yet */ 1161 } 1162 1163 bool 1164 Perl_is_uni_alnumc_lc(pTHX_ UV c) 1165 { 1166 return is_uni_alnumc(c); /* XXX no locale support yet */ 1167 } 1168 1169 bool 1170 Perl_is_uni_idfirst_lc(pTHX_ UV c) 1171 { 1172 return is_uni_idfirst(c); /* XXX no locale support yet */ 1173 } 1174 1175 bool 1176 Perl_is_uni_alpha_lc(pTHX_ UV c) 1177 { 1178 return is_uni_alpha(c); /* XXX no locale support yet */ 1179 } 1180 1181 bool 1182 Perl_is_uni_ascii_lc(pTHX_ UV c) 1183 { 1184 return is_uni_ascii(c); /* XXX no locale support yet */ 1185 } 1186 1187 bool 1188 Perl_is_uni_space_lc(pTHX_ UV c) 1189 { 1190 return is_uni_space(c); /* XXX no locale support yet */ 1191 } 1192 1193 bool 1194 Perl_is_uni_digit_lc(pTHX_ UV c) 1195 { 1196 return is_uni_digit(c); /* XXX no locale support yet */ 1197 } 1198 1199 bool 1200 Perl_is_uni_upper_lc(pTHX_ UV c) 1201 { 1202 return is_uni_upper(c); /* XXX no locale support yet */ 1203 } 1204 1205 bool 1206 Perl_is_uni_lower_lc(pTHX_ UV c) 1207 { 1208 return is_uni_lower(c); /* XXX no locale support yet */ 1209 } 1210 1211 bool 1212 Perl_is_uni_cntrl_lc(pTHX_ UV c) 1213 { 1214 return is_uni_cntrl(c); /* XXX no locale support yet */ 1215 } 1216 1217 bool 1218 Perl_is_uni_graph_lc(pTHX_ UV c) 1219 { 1220 return is_uni_graph(c); /* XXX no locale support yet */ 1221 } 1222 1223 bool 1224 Perl_is_uni_print_lc(pTHX_ UV c) 1225 { 1226 return is_uni_print(c); /* XXX no locale support yet */ 1227 } 1228 1229 bool 1230 Perl_is_uni_punct_lc(pTHX_ UV c) 1231 { 1232 return is_uni_punct(c); /* XXX no locale support yet */ 1233 } 1234 1235 bool 1236 Perl_is_uni_xdigit_lc(pTHX_ UV c) 1237 { 1238 return is_uni_xdigit(c); /* XXX no locale support yet */ 1239 } 1240 1241 U32 1242 Perl_to_uni_upper_lc(pTHX_ U32 c) 1243 { 1244 /* XXX returns only the first character -- do not use XXX */ 1245 /* XXX no locale support yet */ 1246 STRLEN len; 1247 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1248 return (U32)to_uni_upper(c, tmpbuf, &len); 1249 } 1250 1251 U32 1252 Perl_to_uni_title_lc(pTHX_ U32 c) 1253 { 1254 /* XXX returns only the first character XXX -- do not use XXX */ 1255 /* XXX no locale support yet */ 1256 STRLEN len; 1257 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1258 return (U32)to_uni_title(c, tmpbuf, &len); 1259 } 1260 1261 U32 1262 Perl_to_uni_lower_lc(pTHX_ U32 c) 1263 { 1264 /* XXX returns only the first character -- do not use XXX */ 1265 /* XXX no locale support yet */ 1266 STRLEN len; 1267 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1268 return (U32)to_uni_lower(c, tmpbuf, &len); 1269 } 1270 1271 static bool 1272 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, 1273 const char *const swashname) 1274 { 1275 dVAR; 1276 1277 PERL_ARGS_ASSERT_IS_UTF8_COMMON; 1278 1279 if (!is_utf8_char(p)) 1280 return FALSE; 1281 if (!*swash) 1282 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0); 1283 return swash_fetch(*swash, p, TRUE) != 0; 1284 } 1285 1286 bool 1287 Perl_is_utf8_alnum(pTHX_ const U8 *p) 1288 { 1289 dVAR; 1290 1291 PERL_ARGS_ASSERT_IS_UTF8_ALNUM; 1292 1293 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true 1294 * descendant of isalnum(3), in other words, it doesn't 1295 * contain the '_'. --jhi */ 1296 return is_utf8_common(p, &PL_utf8_alnum, "IsWord"); 1297 } 1298 1299 bool 1300 Perl_is_utf8_alnumc(pTHX_ const U8 *p) 1301 { 1302 dVAR; 1303 1304 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC; 1305 1306 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC"); 1307 } 1308 1309 bool 1310 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ 1311 { 1312 dVAR; 1313 1314 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST; 1315 1316 if (*p == '_') 1317 return TRUE; 1318 /* is_utf8_idstart would be more logical. */ 1319 return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); 1320 } 1321 1322 bool 1323 Perl_is_utf8_idcont(pTHX_ const U8 *p) 1324 { 1325 dVAR; 1326 1327 PERL_ARGS_ASSERT_IS_UTF8_IDCONT; 1328 1329 if (*p == '_') 1330 return TRUE; 1331 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue"); 1332 } 1333 1334 bool 1335 Perl_is_utf8_alpha(pTHX_ const U8 *p) 1336 { 1337 dVAR; 1338 1339 PERL_ARGS_ASSERT_IS_UTF8_ALPHA; 1340 1341 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha"); 1342 } 1343 1344 bool 1345 Perl_is_utf8_ascii(pTHX_ const U8 *p) 1346 { 1347 dVAR; 1348 1349 PERL_ARGS_ASSERT_IS_UTF8_ASCII; 1350 1351 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii"); 1352 } 1353 1354 bool 1355 Perl_is_utf8_space(pTHX_ const U8 *p) 1356 { 1357 dVAR; 1358 1359 PERL_ARGS_ASSERT_IS_UTF8_SPACE; 1360 1361 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl"); 1362 } 1363 1364 bool 1365 Perl_is_utf8_digit(pTHX_ const U8 *p) 1366 { 1367 dVAR; 1368 1369 PERL_ARGS_ASSERT_IS_UTF8_DIGIT; 1370 1371 return is_utf8_common(p, &PL_utf8_digit, "IsDigit"); 1372 } 1373 1374 bool 1375 Perl_is_utf8_upper(pTHX_ const U8 *p) 1376 { 1377 dVAR; 1378 1379 PERL_ARGS_ASSERT_IS_UTF8_UPPER; 1380 1381 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase"); 1382 } 1383 1384 bool 1385 Perl_is_utf8_lower(pTHX_ const U8 *p) 1386 { 1387 dVAR; 1388 1389 PERL_ARGS_ASSERT_IS_UTF8_LOWER; 1390 1391 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase"); 1392 } 1393 1394 bool 1395 Perl_is_utf8_cntrl(pTHX_ const U8 *p) 1396 { 1397 dVAR; 1398 1399 PERL_ARGS_ASSERT_IS_UTF8_CNTRL; 1400 1401 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl"); 1402 } 1403 1404 bool 1405 Perl_is_utf8_graph(pTHX_ const U8 *p) 1406 { 1407 dVAR; 1408 1409 PERL_ARGS_ASSERT_IS_UTF8_GRAPH; 1410 1411 return is_utf8_common(p, &PL_utf8_graph, "IsGraph"); 1412 } 1413 1414 bool 1415 Perl_is_utf8_print(pTHX_ const U8 *p) 1416 { 1417 dVAR; 1418 1419 PERL_ARGS_ASSERT_IS_UTF8_PRINT; 1420 1421 return is_utf8_common(p, &PL_utf8_print, "IsPrint"); 1422 } 1423 1424 bool 1425 Perl_is_utf8_punct(pTHX_ const U8 *p) 1426 { 1427 dVAR; 1428 1429 PERL_ARGS_ASSERT_IS_UTF8_PUNCT; 1430 1431 return is_utf8_common(p, &PL_utf8_punct, "IsPunct"); 1432 } 1433 1434 bool 1435 Perl_is_utf8_xdigit(pTHX_ const U8 *p) 1436 { 1437 dVAR; 1438 1439 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; 1440 1441 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit"); 1442 } 1443 1444 bool 1445 Perl_is_utf8_mark(pTHX_ const U8 *p) 1446 { 1447 dVAR; 1448 1449 PERL_ARGS_ASSERT_IS_UTF8_MARK; 1450 1451 return is_utf8_common(p, &PL_utf8_mark, "IsM"); 1452 } 1453 1454 /* 1455 =for apidoc to_utf8_case 1456 1457 The "p" contains the pointer to the UTF-8 string encoding 1458 the character that is being converted. 1459 1460 The "ustrp" is a pointer to the character buffer to put the 1461 conversion result to. The "lenp" is a pointer to the length 1462 of the result. 1463 1464 The "swashp" is a pointer to the swash to use. 1465 1466 Both the special and normal mappings are stored lib/unicore/To/Foo.pl, 1467 and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually, 1468 but not always, a multicharacter mapping), is tried first. 1469 1470 The "special" is a string like "utf8::ToSpecLower", which means the 1471 hash %utf8::ToSpecLower. The access to the hash is through 1472 Perl_to_utf8_case(). 1473 1474 The "normal" is a string like "ToLower" which means the swash 1475 %utf8::ToLower. 1476 1477 =cut */ 1478 1479 UV 1480 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, 1481 SV **swashp, const char *normal, const char *special) 1482 { 1483 dVAR; 1484 U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; 1485 STRLEN len = 0; 1486 const UV uv0 = utf8_to_uvchr(p, NULL); 1487 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings 1488 * are necessary in EBCDIC, they are redundant no-ops 1489 * in ASCII-ish platforms, and hopefully optimized away. */ 1490 const UV uv1 = NATIVE_TO_UNI(uv0); 1491 1492 PERL_ARGS_ASSERT_TO_UTF8_CASE; 1493 1494 uvuni_to_utf8(tmpbuf, uv1); 1495 1496 if (!*swashp) /* load on-demand */ 1497 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); 1498 1499 /* The 0xDF is the only special casing Unicode code point below 0x100. */ 1500 if (special && (uv1 == 0xDF || uv1 > 0xFF)) { 1501 /* It might be "special" (sometimes, but not always, 1502 * a multicharacter mapping) */ 1503 HV * const hv = get_hv(special, 0); 1504 SV **svp; 1505 1506 if (hv && 1507 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && 1508 (*svp)) { 1509 const char *s; 1510 1511 s = SvPV_const(*svp, len); 1512 if (len == 1) 1513 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp; 1514 else { 1515 #ifdef EBCDIC 1516 /* If we have EBCDIC we need to remap the characters 1517 * since any characters in the low 256 are Unicode 1518 * code points, not EBCDIC. */ 1519 U8 *t = (U8*)s, *tend = t + len, *d; 1520 1521 d = tmpbuf; 1522 if (SvUTF8(*svp)) { 1523 STRLEN tlen = 0; 1524 1525 while (t < tend) { 1526 const UV c = utf8_to_uvchr(t, &tlen); 1527 if (tlen > 0) { 1528 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); 1529 t += tlen; 1530 } 1531 else 1532 break; 1533 } 1534 } 1535 else { 1536 while (t < tend) { 1537 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t)); 1538 t++; 1539 } 1540 } 1541 len = d - tmpbuf; 1542 Copy(tmpbuf, ustrp, len, U8); 1543 #else 1544 Copy(s, ustrp, len, U8); 1545 #endif 1546 } 1547 } 1548 } 1549 1550 if (!len && *swashp) { 1551 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE); 1552 1553 if (uv2) { 1554 /* It was "normal" (a single character mapping). */ 1555 const UV uv3 = UNI_TO_NATIVE(uv2); 1556 len = uvchr_to_utf8(ustrp, uv3) - ustrp; 1557 } 1558 } 1559 1560 if (!len) /* Neither: just copy. */ 1561 len = uvchr_to_utf8(ustrp, uv0) - ustrp; 1562 1563 if (lenp) 1564 *lenp = len; 1565 1566 return len ? utf8_to_uvchr(ustrp, 0) : 0; 1567 } 1568 1569 /* 1570 =for apidoc to_utf8_upper 1571 1572 Convert the UTF-8 encoded character at p to its uppercase version and 1573 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1574 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since 1575 the uppercase version may be longer than the original character. 1576 1577 The first character of the uppercased version is returned 1578 (but note, as explained above, that there may be more.) 1579 1580 =cut */ 1581 1582 UV 1583 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1584 { 1585 dVAR; 1586 1587 PERL_ARGS_ASSERT_TO_UTF8_UPPER; 1588 1589 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1590 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); 1591 } 1592 1593 /* 1594 =for apidoc to_utf8_title 1595 1596 Convert the UTF-8 encoded character at p to its titlecase version and 1597 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1598 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the 1599 titlecase version may be longer than the original character. 1600 1601 The first character of the titlecased version is returned 1602 (but note, as explained above, that there may be more.) 1603 1604 =cut */ 1605 1606 UV 1607 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1608 { 1609 dVAR; 1610 1611 PERL_ARGS_ASSERT_TO_UTF8_TITLE; 1612 1613 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1614 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); 1615 } 1616 1617 /* 1618 =for apidoc to_utf8_lower 1619 1620 Convert the UTF-8 encoded character at p to its lowercase version and 1621 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1622 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the 1623 lowercase version may be longer than the original character. 1624 1625 The first character of the lowercased version is returned 1626 (but note, as explained above, that there may be more.) 1627 1628 =cut */ 1629 1630 UV 1631 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1632 { 1633 dVAR; 1634 1635 PERL_ARGS_ASSERT_TO_UTF8_LOWER; 1636 1637 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1638 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); 1639 } 1640 1641 /* 1642 =for apidoc to_utf8_fold 1643 1644 Convert the UTF-8 encoded character at p to its foldcase version and 1645 store that in UTF-8 in ustrp and its length in bytes in lenp. Note 1646 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the 1647 foldcase version may be longer than the original character (up to 1648 three characters). 1649 1650 The first character of the foldcased version is returned 1651 (but note, as explained above, that there may be more.) 1652 1653 =cut */ 1654 1655 UV 1656 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) 1657 { 1658 dVAR; 1659 1660 PERL_ARGS_ASSERT_TO_UTF8_FOLD; 1661 1662 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, 1663 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold"); 1664 } 1665 1666 /* Note: 1667 * A "swash" is a swatch hash. 1668 * A "swatch" is a bit vector generated by utf8.c:S_swash_get(). 1669 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8". 1670 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl. 1671 */ 1672 SV* 1673 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none) 1674 { 1675 dVAR; 1676 SV* retval; 1677 dSP; 1678 const size_t pkg_len = strlen(pkg); 1679 const size_t name_len = strlen(name); 1680 HV * const stash = gv_stashpvn(pkg, pkg_len, 0); 1681 SV* errsv_save; 1682 1683 PERL_ARGS_ASSERT_SWASH_INIT; 1684 1685 PUSHSTACKi(PERLSI_MAGIC); 1686 ENTER; 1687 SAVEI32(PL_hints); 1688 PL_hints = 0; 1689 save_re_context(); 1690 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */ 1691 ENTER; 1692 errsv_save = newSVsv(ERRSV); 1693 /* It is assumed that callers of this routine are not passing in any 1694 user derived data. */ 1695 /* Need to do this after save_re_context() as it will set PL_tainted to 1696 1 while saving $1 etc (see the code after getrx: in Perl_magic_get). 1697 Even line to create errsv_save can turn on PL_tainted. */ 1698 SAVEBOOL(PL_tainted); 1699 PL_tainted = 0; 1700 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), 1701 NULL); 1702 if (!SvTRUE(ERRSV)) 1703 sv_setsv(ERRSV, errsv_save); 1704 SvREFCNT_dec(errsv_save); 1705 LEAVE; 1706 } 1707 SPAGAIN; 1708 PUSHMARK(SP); 1709 EXTEND(SP,5); 1710 mPUSHp(pkg, pkg_len); 1711 mPUSHp(name, name_len); 1712 PUSHs(listsv); 1713 mPUSHi(minbits); 1714 mPUSHi(none); 1715 PUTBACK; 1716 errsv_save = newSVsv(ERRSV); 1717 if (call_method("SWASHNEW", G_SCALAR)) 1718 retval = newSVsv(*PL_stack_sp--); 1719 else 1720 retval = &PL_sv_undef; 1721 if (!SvTRUE(ERRSV)) 1722 sv_setsv(ERRSV, errsv_save); 1723 SvREFCNT_dec(errsv_save); 1724 LEAVE; 1725 POPSTACK; 1726 if (IN_PERL_COMPILETIME) { 1727 CopHINTS_set(PL_curcop, PL_hints); 1728 } 1729 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { 1730 if (SvPOK(retval)) 1731 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"", 1732 SVfARG(retval)); 1733 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); 1734 } 1735 return retval; 1736 } 1737 1738 1739 /* This API is wrong for special case conversions since we may need to 1740 * return several Unicode characters for a single Unicode character 1741 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is 1742 * the lower-level routine, and it is similarly broken for returning 1743 * multiple values. --jhi */ 1744 /* Now SWASHGET is recasted into S_swash_get in this file. */ 1745 1746 /* Note: 1747 * Returns the value of property/mapping C<swash> for the first character 1748 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is 1749 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is 1750 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>. 1751 */ 1752 UV 1753 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) 1754 { 1755 dVAR; 1756 HV *const hv = MUTABLE_HV(SvRV(swash)); 1757 U32 klen; 1758 U32 off; 1759 STRLEN slen; 1760 STRLEN needents; 1761 const U8 *tmps = NULL; 1762 U32 bit; 1763 SV *swatch; 1764 U8 tmputf8[2]; 1765 const UV c = NATIVE_TO_ASCII(*ptr); 1766 1767 PERL_ARGS_ASSERT_SWASH_FETCH; 1768 1769 if (!do_utf8 && !UNI_IS_INVARIANT(c)) { 1770 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); 1771 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c); 1772 ptr = tmputf8; 1773 } 1774 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ 1775 * then the "swatch" is a vec() for al the chars which start 1776 * with 0xAA..0xYY 1777 * So the key in the hash (klen) is length of encoded char -1 1778 */ 1779 klen = UTF8SKIP(ptr) - 1; 1780 off = ptr[klen]; 1781 1782 if (klen == 0) { 1783 /* If char in invariant then swatch is for all the invariant chars 1784 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK 1785 */ 1786 needents = UTF_CONTINUATION_MARK; 1787 off = NATIVE_TO_UTF(ptr[klen]); 1788 } 1789 else { 1790 /* If char is encoded then swatch is for the prefix */ 1791 needents = (1 << UTF_ACCUMULATION_SHIFT); 1792 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK; 1793 } 1794 1795 /* 1796 * This single-entry cache saves about 1/3 of the utf8 overhead in test 1797 * suite. (That is, only 7-8% overall over just a hash cache. Still, 1798 * it's nothing to sniff at.) Pity we usually come through at least 1799 * two function calls to get here... 1800 * 1801 * NB: this code assumes that swatches are never modified, once generated! 1802 */ 1803 1804 if (hv == PL_last_swash_hv && 1805 klen == PL_last_swash_klen && 1806 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) ) 1807 { 1808 tmps = PL_last_swash_tmps; 1809 slen = PL_last_swash_slen; 1810 } 1811 else { 1812 /* Try our second-level swatch cache, kept in a hash. */ 1813 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE); 1814 1815 /* If not cached, generate it via swash_get */ 1816 if (!svp || !SvPOK(*svp) 1817 || !(tmps = (const U8*)SvPV_const(*svp, slen))) { 1818 /* We use utf8n_to_uvuni() as we want an index into 1819 Unicode tables, not a native character number. 1820 */ 1821 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 1822 ckWARN(WARN_UTF8) ? 1823 0 : UTF8_ALLOW_ANY); 1824 swatch = swash_get(swash, 1825 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ 1826 (klen) ? (code_point & ~(needents - 1)) : 0, 1827 needents); 1828 1829 if (IN_PERL_COMPILETIME) 1830 CopHINTS_set(PL_curcop, PL_hints); 1831 1832 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0); 1833 1834 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) 1835 || (slen << 3) < needents) 1836 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch"); 1837 } 1838 1839 PL_last_swash_hv = hv; 1840 assert(klen <= sizeof(PL_last_swash_key)); 1841 PL_last_swash_klen = (U8)klen; 1842 /* FIXME change interpvar.h? */ 1843 PL_last_swash_tmps = (U8 *) tmps; 1844 PL_last_swash_slen = slen; 1845 if (klen) 1846 Copy(ptr, PL_last_swash_key, klen, U8); 1847 } 1848 1849 switch ((int)((slen << 3) / needents)) { 1850 case 1: 1851 bit = 1 << (off & 7); 1852 off >>= 3; 1853 return (tmps[off] & bit) != 0; 1854 case 8: 1855 return tmps[off]; 1856 case 16: 1857 off <<= 1; 1858 return (tmps[off] << 8) + tmps[off + 1] ; 1859 case 32: 1860 off <<= 2; 1861 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; 1862 } 1863 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width"); 1864 NORETURN_FUNCTION_END; 1865 } 1866 1867 /* Note: 1868 * Returns a swatch (a bit vector string) for a code point sequence 1869 * that starts from the value C<start> and comprises the number C<span>. 1870 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl). 1871 * Should be used via swash_fetch, which will cache the swatch in C<swash>. 1872 */ 1873 STATIC SV* 1874 S_swash_get(pTHX_ SV* swash, UV start, UV span) 1875 { 1876 SV *swatch; 1877 U8 *l, *lend, *x, *xend, *s; 1878 STRLEN lcur, xcur, scur; 1879 HV *const hv = MUTABLE_HV(SvRV(swash)); 1880 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); 1881 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); 1882 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); 1883 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); 1884 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE); 1885 const U8* const typestr = (U8*)SvPV_nolen(*typesvp); 1886 const int typeto = typestr[0] == 'T' && typestr[1] == 'o'; 1887 const STRLEN bits = SvUV(*bitssvp); 1888 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ 1889 const UV none = SvUV(*nonesvp); 1890 const UV end = start + span; 1891 1892 PERL_ARGS_ASSERT_SWASH_GET; 1893 1894 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { 1895 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf, 1896 (UV)bits); 1897 } 1898 1899 /* create and initialize $swatch */ 1900 scur = octets ? (span * octets) : (span + 7) / 8; 1901 swatch = newSV(scur); 1902 SvPOK_on(swatch); 1903 s = (U8*)SvPVX(swatch); 1904 if (octets && none) { 1905 const U8* const e = s + scur; 1906 while (s < e) { 1907 if (bits == 8) 1908 *s++ = (U8)(none & 0xff); 1909 else if (bits == 16) { 1910 *s++ = (U8)((none >> 8) & 0xff); 1911 *s++ = (U8)( none & 0xff); 1912 } 1913 else if (bits == 32) { 1914 *s++ = (U8)((none >> 24) & 0xff); 1915 *s++ = (U8)((none >> 16) & 0xff); 1916 *s++ = (U8)((none >> 8) & 0xff); 1917 *s++ = (U8)( none & 0xff); 1918 } 1919 } 1920 *s = '\0'; 1921 } 1922 else { 1923 (void)memzero((U8*)s, scur + 1); 1924 } 1925 SvCUR_set(swatch, scur); 1926 s = (U8*)SvPVX(swatch); 1927 1928 /* read $swash->{LIST} */ 1929 l = (U8*)SvPV(*listsvp, lcur); 1930 lend = l + lcur; 1931 while (l < lend) { 1932 UV min, max, val; 1933 STRLEN numlen; 1934 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; 1935 1936 U8* const nl = (U8*)memchr(l, '\n', lend - l); 1937 1938 numlen = lend - l; 1939 min = grok_hex((char *)l, &numlen, &flags, NULL); 1940 if (numlen) 1941 l += numlen; 1942 else if (nl) { 1943 l = nl + 1; /* 1 is length of "\n" */ 1944 continue; 1945 } 1946 else { 1947 l = lend; /* to LIST's end at which \n is not found */ 1948 break; 1949 } 1950 1951 if (isBLANK(*l)) { 1952 ++l; 1953 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; 1954 numlen = lend - l; 1955 max = grok_hex((char *)l, &numlen, &flags, NULL); 1956 if (numlen) 1957 l += numlen; 1958 else 1959 max = min; 1960 1961 if (octets) { 1962 if (isBLANK(*l)) { 1963 ++l; 1964 flags = PERL_SCAN_SILENT_ILLDIGIT | 1965 PERL_SCAN_DISALLOW_PREFIX; 1966 numlen = lend - l; 1967 val = grok_hex((char *)l, &numlen, &flags, NULL); 1968 if (numlen) 1969 l += numlen; 1970 else 1971 val = 0; 1972 } 1973 else { 1974 val = 0; 1975 if (typeto) { 1976 Perl_croak(aTHX_ "%s: illegal mapping '%s'", 1977 typestr, l); 1978 } 1979 } 1980 } 1981 else 1982 val = 0; /* bits == 1, then val should be ignored */ 1983 } 1984 else { 1985 max = min; 1986 if (octets) { 1987 val = 0; 1988 if (typeto) { 1989 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l); 1990 } 1991 } 1992 else 1993 val = 0; /* bits == 1, then val should be ignored */ 1994 } 1995 1996 if (nl) 1997 l = nl + 1; 1998 else 1999 l = lend; 2000 2001 if (max < start) 2002 continue; 2003 2004 if (octets) { 2005 UV key; 2006 if (min < start) { 2007 if (!none || val < none) { 2008 val += start - min; 2009 } 2010 min = start; 2011 } 2012 for (key = min; key <= max; key++) { 2013 STRLEN offset; 2014 if (key >= end) 2015 goto go_out_list; 2016 /* offset must be non-negative (start <= min <= key < end) */ 2017 offset = octets * (key - start); 2018 if (bits == 8) 2019 s[offset] = (U8)(val & 0xff); 2020 else if (bits == 16) { 2021 s[offset ] = (U8)((val >> 8) & 0xff); 2022 s[offset + 1] = (U8)( val & 0xff); 2023 } 2024 else if (bits == 32) { 2025 s[offset ] = (U8)((val >> 24) & 0xff); 2026 s[offset + 1] = (U8)((val >> 16) & 0xff); 2027 s[offset + 2] = (U8)((val >> 8) & 0xff); 2028 s[offset + 3] = (U8)( val & 0xff); 2029 } 2030 2031 if (!none || val < none) 2032 ++val; 2033 } 2034 } 2035 else { /* bits == 1, then val should be ignored */ 2036 UV key; 2037 if (min < start) 2038 min = start; 2039 for (key = min; key <= max; key++) { 2040 const STRLEN offset = (STRLEN)(key - start); 2041 if (key >= end) 2042 goto go_out_list; 2043 s[offset >> 3] |= 1 << (offset & 7); 2044 } 2045 } 2046 } /* while */ 2047 go_out_list: 2048 2049 /* read $swash->{EXTRAS} */ 2050 x = (U8*)SvPV(*extssvp, xcur); 2051 xend = x + xcur; 2052 while (x < xend) { 2053 STRLEN namelen; 2054 U8 *namestr; 2055 SV** othersvp; 2056 HV* otherhv; 2057 STRLEN otherbits; 2058 SV **otherbitssvp, *other; 2059 U8 *s, *o, *nl; 2060 STRLEN slen, olen; 2061 2062 const U8 opc = *x++; 2063 if (opc == '\n') 2064 continue; 2065 2066 nl = (U8*)memchr(x, '\n', xend - x); 2067 2068 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { 2069 if (nl) { 2070 x = nl + 1; /* 1 is length of "\n" */ 2071 continue; 2072 } 2073 else { 2074 x = xend; /* to EXTRAS' end at which \n is not found */ 2075 break; 2076 } 2077 } 2078 2079 namestr = x; 2080 if (nl) { 2081 namelen = nl - namestr; 2082 x = nl + 1; 2083 } 2084 else { 2085 namelen = xend - namestr; 2086 x = xend; 2087 } 2088 2089 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); 2090 otherhv = MUTABLE_HV(SvRV(*othersvp)); 2091 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); 2092 otherbits = (STRLEN)SvUV(*otherbitssvp); 2093 if (bits < otherbits) 2094 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch"); 2095 2096 /* The "other" swatch must be destroyed after. */ 2097 other = swash_get(*othersvp, start, span); 2098 o = (U8*)SvPV(other, olen); 2099 2100 if (!olen) 2101 Perl_croak(aTHX_ "panic: swash_get got improper swatch"); 2102 2103 s = (U8*)SvPV(swatch, slen); 2104 if (bits == 1 && otherbits == 1) { 2105 if (slen != olen) 2106 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch"); 2107 2108 switch (opc) { 2109 case '+': 2110 while (slen--) 2111 *s++ |= *o++; 2112 break; 2113 case '!': 2114 while (slen--) 2115 *s++ |= ~*o++; 2116 break; 2117 case '-': 2118 while (slen--) 2119 *s++ &= ~*o++; 2120 break; 2121 case '&': 2122 while (slen--) 2123 *s++ &= *o++; 2124 break; 2125 default: 2126 break; 2127 } 2128 } 2129 else { 2130 STRLEN otheroctets = otherbits >> 3; 2131 STRLEN offset = 0; 2132 U8* const send = s + slen; 2133 2134 while (s < send) { 2135 UV otherval = 0; 2136 2137 if (otherbits == 1) { 2138 otherval = (o[offset >> 3] >> (offset & 7)) & 1; 2139 ++offset; 2140 } 2141 else { 2142 STRLEN vlen = otheroctets; 2143 otherval = *o++; 2144 while (--vlen) { 2145 otherval <<= 8; 2146 otherval |= *o++; 2147 } 2148 } 2149 2150 if (opc == '+' && otherval) 2151 NOOP; /* replace with otherval */ 2152 else if (opc == '!' && !otherval) 2153 otherval = 1; 2154 else if (opc == '-' && otherval) 2155 otherval = 0; 2156 else if (opc == '&' && !otherval) 2157 otherval = 0; 2158 else { 2159 s += octets; /* no replacement */ 2160 continue; 2161 } 2162 2163 if (bits == 8) 2164 *s++ = (U8)( otherval & 0xff); 2165 else if (bits == 16) { 2166 *s++ = (U8)((otherval >> 8) & 0xff); 2167 *s++ = (U8)( otherval & 0xff); 2168 } 2169 else if (bits == 32) { 2170 *s++ = (U8)((otherval >> 24) & 0xff); 2171 *s++ = (U8)((otherval >> 16) & 0xff); 2172 *s++ = (U8)((otherval >> 8) & 0xff); 2173 *s++ = (U8)( otherval & 0xff); 2174 } 2175 } 2176 } 2177 sv_free(other); /* through with it! */ 2178 } /* while */ 2179 return swatch; 2180 } 2181 2182 /* 2183 =for apidoc uvchr_to_utf8 2184 2185 Adds the UTF-8 representation of the Native codepoint C<uv> to the end 2186 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free 2187 bytes available. The return value is the pointer to the byte after the 2188 end of the new character. In other words, 2189 2190 d = uvchr_to_utf8(d, uv); 2191 2192 is the recommended wide native character-aware way of saying 2193 2194 *(d++) = uv; 2195 2196 =cut 2197 */ 2198 2199 /* On ASCII machines this is normally a macro but we want a 2200 real function in case XS code wants it 2201 */ 2202 U8 * 2203 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) 2204 { 2205 PERL_ARGS_ASSERT_UVCHR_TO_UTF8; 2206 2207 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0); 2208 } 2209 2210 U8 * 2211 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 2212 { 2213 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS; 2214 2215 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags); 2216 } 2217 2218 /* 2219 =for apidoc utf8n_to_uvchr 2220 flags 2221 2222 Returns the native character value of the first character in the string 2223 C<s> 2224 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 2225 length, in bytes, of that character. 2226 2227 Allows length and flags to be passed to low level routine. 2228 2229 =cut 2230 */ 2231 /* On ASCII machines this is normally a macro but we want 2232 a real function in case XS code wants it 2233 */ 2234 UV 2235 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, 2236 U32 flags) 2237 { 2238 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); 2239 2240 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; 2241 2242 return UNI_TO_NATIVE(uv); 2243 } 2244 2245 /* 2246 =for apidoc pv_uni_display 2247 2248 Build to the scalar dsv a displayable version of the string spv, 2249 length len, the displayable version being at most pvlim bytes long 2250 (if longer, the rest is truncated and "..." will be appended). 2251 2252 The flags argument can have UNI_DISPLAY_ISPRINT set to display 2253 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH 2254 to display the \\[nrfta\\] as the backslashed versions (like '\n') 2255 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\). 2256 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both 2257 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on. 2258 2259 The pointer to the PV of the dsv is returned. 2260 2261 =cut */ 2262 char * 2263 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags) 2264 { 2265 int truncated = 0; 2266 const char *s, *e; 2267 2268 PERL_ARGS_ASSERT_PV_UNI_DISPLAY; 2269 2270 sv_setpvs(dsv, ""); 2271 SvUTF8_off(dsv); 2272 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { 2273 UV u; 2274 /* This serves double duty as a flag and a character to print after 2275 a \ when flags & UNI_DISPLAY_BACKSLASH is true. 2276 */ 2277 char ok = 0; 2278 2279 if (pvlim && SvCUR(dsv) >= pvlim) { 2280 truncated++; 2281 break; 2282 } 2283 u = utf8_to_uvchr((U8*)s, 0); 2284 if (u < 256) { 2285 const unsigned char c = (unsigned char)u & 0xFF; 2286 if (flags & UNI_DISPLAY_BACKSLASH) { 2287 switch (c) { 2288 case '\n': 2289 ok = 'n'; break; 2290 case '\r': 2291 ok = 'r'; break; 2292 case '\t': 2293 ok = 't'; break; 2294 case '\f': 2295 ok = 'f'; break; 2296 case '\a': 2297 ok = 'a'; break; 2298 case '\\': 2299 ok = '\\'; break; 2300 default: break; 2301 } 2302 if (ok) { 2303 const char string = ok; 2304 sv_catpvs(dsv, "\\"); 2305 sv_catpvn(dsv, &string, 1); 2306 } 2307 } 2308 /* isPRINT() is the locale-blind version. */ 2309 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { 2310 const char string = c; 2311 sv_catpvn(dsv, &string, 1); 2312 ok = 1; 2313 } 2314 } 2315 if (!ok) 2316 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); 2317 } 2318 if (truncated) 2319 sv_catpvs(dsv, "..."); 2320 2321 return SvPVX(dsv); 2322 } 2323 2324 /* 2325 =for apidoc sv_uni_display 2326 2327 Build to the scalar dsv a displayable version of the scalar sv, 2328 the displayable version being at most pvlim bytes long 2329 (if longer, the rest is truncated and "..." will be appended). 2330 2331 The flags argument is as in pv_uni_display(). 2332 2333 The pointer to the PV of the dsv is returned. 2334 2335 =cut 2336 */ 2337 char * 2338 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) 2339 { 2340 PERL_ARGS_ASSERT_SV_UNI_DISPLAY; 2341 2342 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv), 2343 SvCUR(ssv), pvlim, flags); 2344 } 2345 2346 /* 2347 =for apidoc ibcmp_utf8 2348 2349 Return true if the strings s1 and s2 differ case-insensitively, false 2350 if not (if they are equal case-insensitively). If u1 is true, the 2351 string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, 2352 the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2 2353 are false, the respective string is assumed to be in native 8-bit 2354 encoding. 2355 2356 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied 2357 in there (they will point at the beginning of the I<next> character). 2358 If the pointers behind pe1 or pe2 are non-NULL, they are the end 2359 pointers beyond which scanning will not continue under any 2360 circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and 2361 s2+l2 will be used as goal end pointers that will also stop the scan, 2362 and which qualify towards defining a successful match: all the scans 2363 that define an explicit length must reach their goal pointers for 2364 a match to succeed). 2365 2366 For case-insensitiveness, the "casefolding" of Unicode is used 2367 instead of upper/lowercasing both the characters, see 2368 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings). 2369 2370 =cut */ 2371 I32 2372 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2) 2373 { 2374 dVAR; 2375 register const U8 *p1 = (const U8*)s1; 2376 register const U8 *p2 = (const U8*)s2; 2377 register const U8 *f1 = NULL; 2378 register const U8 *f2 = NULL; 2379 register U8 *e1 = NULL; 2380 register U8 *q1 = NULL; 2381 register U8 *e2 = NULL; 2382 register U8 *q2 = NULL; 2383 STRLEN n1 = 0, n2 = 0; 2384 U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; 2385 U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; 2386 U8 natbuf[1+1]; 2387 STRLEN foldlen1, foldlen2; 2388 bool match; 2389 2390 PERL_ARGS_ASSERT_IBCMP_UTF8; 2391 2392 if (pe1) 2393 e1 = *(U8**)pe1; 2394 /* assert(e1 || l1); */ 2395 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1))) 2396 f1 = (const U8*)s1 + l1; 2397 if (pe2) 2398 e2 = *(U8**)pe2; 2399 /* assert(e2 || l2); */ 2400 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2))) 2401 f2 = (const U8*)s2 + l2; 2402 2403 /* This shouldn't happen. However, putting an assert() there makes some 2404 * tests fail. */ 2405 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */ 2406 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)) 2407 return 1; /* mismatch; possible infinite loop or false positive */ 2408 2409 if (!u1 || !u2) 2410 natbuf[1] = 0; /* Need to terminate the buffer. */ 2411 2412 while ((e1 == 0 || p1 < e1) && 2413 (f1 == 0 || p1 < f1) && 2414 (e2 == 0 || p2 < e2) && 2415 (f2 == 0 || p2 < f2)) { 2416 if (n1 == 0) { 2417 if (u1) 2418 to_utf8_fold(p1, foldbuf1, &foldlen1); 2419 else { 2420 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1))); 2421 to_utf8_fold(natbuf, foldbuf1, &foldlen1); 2422 } 2423 q1 = foldbuf1; 2424 n1 = foldlen1; 2425 } 2426 if (n2 == 0) { 2427 if (u2) 2428 to_utf8_fold(p2, foldbuf2, &foldlen2); 2429 else { 2430 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2))); 2431 to_utf8_fold(natbuf, foldbuf2, &foldlen2); 2432 } 2433 q2 = foldbuf2; 2434 n2 = foldlen2; 2435 } 2436 while (n1 && n2) { 2437 if ( UTF8SKIP(q1) != UTF8SKIP(q2) || 2438 (UTF8SKIP(q1) == 1 && *q1 != *q2) || 2439 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) ) 2440 return 1; /* mismatch */ 2441 n1 -= UTF8SKIP(q1); 2442 q1 += UTF8SKIP(q1); 2443 n2 -= UTF8SKIP(q2); 2444 q2 += UTF8SKIP(q2); 2445 } 2446 if (n1 == 0) 2447 p1 += u1 ? UTF8SKIP(p1) : 1; 2448 if (n2 == 0) 2449 p2 += u2 ? UTF8SKIP(p2) : 1; 2450 2451 } 2452 2453 /* A match is defined by all the scans that specified 2454 * an explicit length reaching their final goals. */ 2455 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2); 2456 2457 if (match) { 2458 if (pe1) 2459 *pe1 = (char*)p1; 2460 if (pe2) 2461 *pe2 = (char*)p2; 2462 } 2463 2464 return match ? 0 : 1; /* 0 match, 1 mismatch */ 2465 } 2466 2467 /* 2468 * Local variables: 2469 * c-indentation-style: bsd 2470 * c-basic-offset: 4 2471 * indent-tabs-mode: t 2472 * End: 2473 * 2474 * ex: set ts=8 sts=4 sw=4 noet: 2475 */ 2476