1 /* utf8.c 2 * 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever 13 * heard of that we don't want to see any closer; and that's the one place 14 * we're trying to get to! And that's just where we can't get, nohow.' 15 * 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"] 17 * 18 * 'Well do I understand your speech,' he answered in the same language; 19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue, 20 * as is the custom in the West, if you wish to be answered?' 21 * --Gandalf, addressing Théoden's door wardens 22 * 23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 24 * 25 * ...the travellers perceived that the floor was paved with stones of many 26 * hues; branching runes and strange devices intertwined beneath their feet. 27 * 28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_UTF8_C 33 #include "perl.h" 34 #include "inline_invlist.c" 35 #include "charclass_invlists.h" 36 37 static const char unees[] = 38 "Malformed UTF-8 character (unexpected end of string)"; 39 40 /* 41 =head1 Unicode Support 42 43 This file contains various utility functions for manipulating UTF8-encoded 44 strings. For the uninitiated, this is a method of representing arbitrary 45 Unicode characters as a variable number of bytes, in such a way that 46 characters in the ASCII range are unmodified, and a zero byte never appears 47 within non-zero characters. 48 49 =cut 50 */ 51 52 /* 53 =for apidoc is_ascii_string 54 55 Returns true if the first C<len> bytes of the string C<s> are the same whether 56 or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That 57 is, if they are invariant. On ASCII-ish machines, only ASCII characters 58 fit this definition, hence the function's name. 59 60 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you 61 use this option, that C<s> can't have embedded C<NUL> characters and has to 62 have a terminating C<NUL> byte). 63 64 See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>(). 65 66 =cut 67 */ 68 69 bool 70 Perl_is_ascii_string(const U8 *s, STRLEN len) 71 { 72 const U8* const send = s + (len ? len : strlen((const char *)s)); 73 const U8* x = s; 74 75 PERL_ARGS_ASSERT_IS_ASCII_STRING; 76 77 for (; x < send; ++x) { 78 if (!UTF8_IS_INVARIANT(*x)) 79 break; 80 } 81 82 return x == send; 83 } 84 85 /* 86 =for apidoc uvoffuni_to_utf8_flags 87 88 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 89 Instead, B<Almost all code should use L</uvchr_to_utf8> or 90 L</uvchr_to_utf8_flags>>. 91 92 This function is like them, but the input is a strict Unicode 93 (as opposed to native) code point. Only in very rare circumstances should code 94 not be using the native code point. 95 96 For details, see the description for L</uvchr_to_utf8_flags>>. 97 98 =cut 99 */ 100 101 U8 * 102 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 103 { 104 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; 105 106 if (UNI_IS_INVARIANT(uv)) { 107 *d++ = (U8) LATIN1_TO_NATIVE(uv); 108 return d; 109 } 110 111 /* The first problematic code point is the first surrogate */ 112 if (uv >= UNICODE_SURROGATE_FIRST 113 && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR)) 114 { 115 if (UNICODE_IS_SURROGATE(uv)) { 116 if (flags & UNICODE_WARN_SURROGATE) { 117 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), 118 "UTF-16 surrogate U+%04"UVXf, uv); 119 } 120 if (flags & UNICODE_DISALLOW_SURROGATE) { 121 return NULL; 122 } 123 } 124 else if (UNICODE_IS_SUPER(uv)) { 125 if (flags & UNICODE_WARN_SUPER 126 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF))) 127 { 128 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), 129 "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv); 130 } 131 if (flags & UNICODE_DISALLOW_SUPER 132 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF))) 133 { 134 return NULL; 135 } 136 } 137 else if (UNICODE_IS_NONCHAR(uv)) { 138 if (flags & UNICODE_WARN_NONCHAR) { 139 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), 140 "Unicode non-character U+%04"UVXf" is illegal for open interchange", 141 uv); 142 } 143 if (flags & UNICODE_DISALLOW_NONCHAR) { 144 return NULL; 145 } 146 } 147 } 148 149 #if defined(EBCDIC) 150 { 151 STRLEN len = OFFUNISKIP(uv); 152 U8 *p = d+len-1; 153 while (p > d) { 154 *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); 155 uv >>= UTF_ACCUMULATION_SHIFT; 156 } 157 *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); 158 return d+len; 159 } 160 #else /* Non loop style */ 161 if (uv < 0x800) { 162 *d++ = (U8)(( uv >> 6) | 0xc0); 163 *d++ = (U8)(( uv & 0x3f) | 0x80); 164 return d; 165 } 166 if (uv < 0x10000) { 167 *d++ = (U8)(( uv >> 12) | 0xe0); 168 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 169 *d++ = (U8)(( uv & 0x3f) | 0x80); 170 return d; 171 } 172 if (uv < 0x200000) { 173 *d++ = (U8)(( uv >> 18) | 0xf0); 174 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 175 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 176 *d++ = (U8)(( uv & 0x3f) | 0x80); 177 return d; 178 } 179 if (uv < 0x4000000) { 180 *d++ = (U8)(( uv >> 24) | 0xf8); 181 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 182 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 183 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 184 *d++ = (U8)(( uv & 0x3f) | 0x80); 185 return d; 186 } 187 if (uv < 0x80000000) { 188 *d++ = (U8)(( uv >> 30) | 0xfc); 189 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 190 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 191 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 192 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 193 *d++ = (U8)(( uv & 0x3f) | 0x80); 194 return d; 195 } 196 #ifdef UTF8_QUAD_MAX 197 if (uv < UTF8_QUAD_MAX) 198 #endif 199 { 200 *d++ = 0xfe; /* Can't match U+FEFF! */ 201 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); 202 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 203 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 204 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 205 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 206 *d++ = (U8)(( uv & 0x3f) | 0x80); 207 return d; 208 } 209 #ifdef UTF8_QUAD_MAX 210 { 211 *d++ = 0xff; /* Can't match U+FFFE! */ 212 *d++ = 0x80; /* 6 Reserved bits */ 213 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */ 214 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80); 215 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80); 216 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80); 217 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80); 218 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80); 219 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80); 220 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80); 221 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 222 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 223 *d++ = (U8)(( uv & 0x3f) | 0x80); 224 return d; 225 } 226 #endif 227 #endif /* Non loop style */ 228 } 229 /* 230 =for apidoc uvchr_to_utf8 231 232 Adds the UTF-8 representation of the native code point C<uv> to the end 233 of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to 234 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 235 the byte after the end of the new character. In other words, 236 237 d = uvchr_to_utf8(d, uv); 238 239 is the recommended wide native character-aware way of saying 240 241 *(d++) = uv; 242 243 This function accepts any UV as input. To forbid or warn on non-Unicode code 244 points, or those that may be problematic, see L</uvchr_to_utf8_flags>. 245 246 =cut 247 */ 248 249 /* This is also a macro */ 250 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); 251 252 U8 * 253 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) 254 { 255 return uvchr_to_utf8(d, uv); 256 } 257 258 /* 259 =for apidoc uvchr_to_utf8_flags 260 261 Adds the UTF-8 representation of the native code point C<uv> to the end 262 of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to 263 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 264 the byte after the end of the new character. In other words, 265 266 d = uvchr_to_utf8_flags(d, uv, flags); 267 268 or, in most cases, 269 270 d = uvchr_to_utf8_flags(d, uv, 0); 271 272 This is the Unicode-aware way of saying 273 274 *(d++) = uv; 275 276 This function will convert to UTF-8 (and not warn) even code points that aren't 277 legal Unicode or are problematic, unless C<flags> contains one or more of the 278 following flags: 279 280 If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set, 281 the function will raise a warning, provided UTF8 warnings are enabled. If instead 282 UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL. 283 If both flags are set, the function will both warn and return NULL. 284 285 The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags 286 affect how the function handles a Unicode non-character. And likewise, the 287 UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of 288 code points that are 289 above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are 290 even less portable) can be warned and/or disallowed even if other above-Unicode 291 code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF 292 flags. 293 294 And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the 295 above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four 296 DISALLOW flags. 297 298 =cut 299 */ 300 301 /* This is also a macro */ 302 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); 303 304 U8 * 305 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 306 { 307 return uvchr_to_utf8_flags(d, uv, flags); 308 } 309 310 /* 311 312 Tests if the first C<len> bytes of string C<s> form a valid UTF-8 313 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a 314 valid UTF-8 character. The number of bytes in the UTF-8 character 315 will be returned if it is valid, otherwise 0. 316 317 This is the "slow" version as opposed to the "fast" version which is 318 the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed 319 difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four 320 or less you should use the IS_UTF8_CHAR(), for lengths of five or more 321 you should use the _slow(). In practice this means that the _slow() 322 will be used very rarely, since the maximum Unicode code point (as of 323 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only 324 the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into 325 five bytes or more. 326 327 =cut */ 328 PERL_STATIC_INLINE STRLEN 329 S_is_utf8_char_slow(const U8 *s, const STRLEN len) 330 { 331 dTHX; /* The function called below requires thread context */ 332 333 STRLEN actual_len; 334 335 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW; 336 337 utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY); 338 339 return (actual_len == (STRLEN) -1) ? 0 : actual_len; 340 } 341 342 /* 343 =for apidoc is_utf8_char_buf 344 345 Returns the number of bytes that comprise the first UTF-8 encoded character in 346 buffer C<buf>. C<buf_end> should point to one position beyond the end of the 347 buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8 348 encoded character. 349 350 Note that an INVARIANT character (i.e. ASCII on non-EBCDIC 351 machines) is a valid UTF-8 character. 352 353 =cut */ 354 355 STRLEN 356 Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) 357 { 358 359 STRLEN len; 360 361 PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF; 362 363 if (buf_end <= buf) { 364 return 0; 365 } 366 367 len = buf_end - buf; 368 if (len > UTF8SKIP(buf)) { 369 len = UTF8SKIP(buf); 370 } 371 372 if (IS_UTF8_CHAR_FAST(len)) 373 return IS_UTF8_CHAR(buf, len) ? len : 0; 374 return is_utf8_char_slow(buf, len); 375 } 376 377 /* 378 =for apidoc is_utf8_char 379 380 Tests if some arbitrary number of bytes begins in a valid UTF-8 381 character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) 382 character is a valid UTF-8 character. The actual number of bytes in the UTF-8 383 character will be returned if it is valid, otherwise 0. 384 385 This function is deprecated due to the possibility that malformed input could 386 cause reading beyond the end of the input buffer. Use L</is_utf8_char_buf> 387 instead. 388 389 =cut */ 390 391 STRLEN 392 Perl_is_utf8_char(const U8 *s) 393 { 394 PERL_ARGS_ASSERT_IS_UTF8_CHAR; 395 396 /* Assumes we have enough space, which is why this is deprecated */ 397 return is_utf8_char_buf(s, s + UTF8SKIP(s)); 398 } 399 400 401 /* 402 =for apidoc is_utf8_string 403 404 Returns true if the first C<len> bytes of string C<s> form a valid 405 UTF-8 string, false otherwise. If C<len> is 0, it will be calculated 406 using C<strlen(s)> (which means if you use this option, that C<s> can't have 407 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note 408 that all characters being ASCII constitute 'a valid UTF-8 string'. 409 410 See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>(). 411 412 =cut 413 */ 414 415 bool 416 Perl_is_utf8_string(const U8 *s, STRLEN len) 417 { 418 const U8* const send = s + (len ? len : strlen((const char *)s)); 419 const U8* x = s; 420 421 PERL_ARGS_ASSERT_IS_UTF8_STRING; 422 423 while (x < send) { 424 /* Inline the easy bits of is_utf8_char() here for speed... */ 425 if (UTF8_IS_INVARIANT(*x)) { 426 x++; 427 } 428 else { 429 /* ... and call is_utf8_char() only if really needed. */ 430 const STRLEN c = UTF8SKIP(x); 431 const U8* const next_char_ptr = x + c; 432 433 if (next_char_ptr > send) { 434 return FALSE; 435 } 436 437 if (IS_UTF8_CHAR_FAST(c)) { 438 if (!IS_UTF8_CHAR(x, c)) 439 return FALSE; 440 } 441 else if (! is_utf8_char_slow(x, c)) { 442 return FALSE; 443 } 444 x = next_char_ptr; 445 } 446 } 447 448 return TRUE; 449 } 450 451 /* 452 Implemented as a macro in utf8.h 453 454 =for apidoc is_utf8_string_loc 455 456 Like L</is_utf8_string> but stores the location of the failure (in the 457 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 458 "utf8ness success") in the C<ep>. 459 460 See also L</is_utf8_string_loclen>() and L</is_utf8_string>(). 461 462 =for apidoc is_utf8_string_loclen 463 464 Like L</is_utf8_string>() but stores the location of the failure (in the 465 case of "utf8ness failure") or the location C<s>+C<len> (in the case of 466 "utf8ness success") in the C<ep>, and the number of UTF-8 467 encoded characters in the C<el>. 468 469 See also L</is_utf8_string_loc>() and L</is_utf8_string>(). 470 471 =cut 472 */ 473 474 bool 475 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) 476 { 477 const U8* const send = s + (len ? len : strlen((const char *)s)); 478 const U8* x = s; 479 STRLEN c; 480 STRLEN outlen = 0; 481 482 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; 483 484 while (x < send) { 485 const U8* next_char_ptr; 486 487 /* Inline the easy bits of is_utf8_char() here for speed... */ 488 if (UTF8_IS_INVARIANT(*x)) 489 next_char_ptr = x + 1; 490 else { 491 /* ... and call is_utf8_char() only if really needed. */ 492 c = UTF8SKIP(x); 493 next_char_ptr = c + x; 494 if (next_char_ptr > send) { 495 goto out; 496 } 497 if (IS_UTF8_CHAR_FAST(c)) { 498 if (!IS_UTF8_CHAR(x, c)) 499 c = 0; 500 } else 501 c = is_utf8_char_slow(x, c); 502 if (!c) 503 goto out; 504 } 505 x = next_char_ptr; 506 outlen++; 507 } 508 509 out: 510 if (el) 511 *el = outlen; 512 513 if (ep) 514 *ep = x; 515 return (x == send); 516 } 517 518 /* 519 520 =for apidoc utf8n_to_uvchr 521 522 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 523 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly. 524 525 Bottom level UTF-8 decode routine. 526 Returns the native code point value of the first character in the string C<s>, 527 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than 528 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to 529 the length, in bytes, of that character. 530 531 The value of C<flags> determines the behavior when C<s> does not point to a 532 well-formed UTF-8 character. If C<flags> is 0, when a malformation is found, 533 zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the 534 next possible position in C<s> that could begin a non-malformed character. 535 Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised. 536 537 Various ALLOW flags can be set in C<flags> to allow (and not warn on) 538 individual types of malformations, such as the sequence being overlong (that 539 is, when there is a shorter sequence that can express the same code point; 540 overlong sequences are expressly forbidden in the UTF-8 standard due to 541 potential security issues). Another malformation example is the first byte of 542 a character not being a legal first byte. See F<utf8.h> for the list of such 543 flags. For allowed 0 length strings, this function returns 0; for allowed 544 overlong sequences, the computed code point is returned; for all other allowed 545 malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no 546 determinable reasonable value. 547 548 The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other 549 flags) malformation is found. If this flag is set, the routine assumes that 550 the caller will raise a warning, and this function will silently just set 551 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero. 552 553 Note that this API requires disambiguation between successful decoding a C<NUL> 554 character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as 555 in both cases, 0 is returned. To disambiguate, upon a zero return, see if the 556 first byte of C<s> is 0 as well. If so, the input was a C<NUL>; if not, the 557 input had an error. 558 559 Certain code points are considered problematic. These are Unicode surrogates, 560 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. 561 By default these are considered regular code points, but certain situations 562 warrant special handling for them. If C<flags> contains 563 UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as 564 malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE, 565 UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode 566 maximum) can be set to disallow these categories individually. 567 568 The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE, 569 UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised 570 for their respective categories, but otherwise the code points are considered 571 valid (not malformations). To get a category to both be treated as a 572 malformation and raise a warning, specify both the WARN and DISALLOW flags. 573 (But note that warnings are not raised if lexically disabled nor if 574 UTF8_CHECK_ONLY is also specified.) 575 576 Very large code points (above 0x7FFF_FFFF) are considered more problematic than 577 the others that are above the Unicode legal maximum. There are several 578 reasons: they requre at least 32 bits to represent them on ASCII platforms, are 579 not representable at all on EBCDIC platforms, and the original UTF-8 580 specification never went above this number (the current 0x10FFFF limit was 581 imposed later). (The smaller ones, those that fit into 32 bits, are 582 representable by a UV on ASCII platforms, but not by an IV, which means that 583 the number of operations that can be performed on them is quite restricted.) 584 The UTF-8 encoding on ASCII platforms for these large code points begins with a 585 byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to 586 be treated as malformations, while allowing smaller above-Unicode code points. 587 (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points, 588 including these, as malformations.) 589 Similarly, UTF8_WARN_FE_FF acts just like 590 the other WARN flags, but applies just to these code points. 591 592 All other code points corresponding to Unicode characters, including private 593 use and those yet to be assigned, are never considered malformed and never 594 warn. 595 596 =cut 597 */ 598 599 UV 600 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) 601 { 602 dVAR; 603 const U8 * const s0 = s; 604 U8 overflow_byte = '\0'; /* Save byte in case of overflow */ 605 U8 * send; 606 UV uv = *s; 607 STRLEN expectlen; 608 SV* sv = NULL; 609 UV outlier_ret = 0; /* return value when input is in error or problematic 610 */ 611 UV pack_warn = 0; /* Save result of packWARN() for later */ 612 bool unexpected_non_continuation = FALSE; 613 bool overflowed = FALSE; 614 bool do_overlong_test = TRUE; /* May have to skip this test */ 615 616 const char* const malformed_text = "Malformed UTF-8 character"; 617 618 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; 619 620 /* The order of malformation tests here is important. We should consume as 621 * few bytes as possible in order to not skip any valid character. This is 622 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also 623 * http://unicode.org/reports/tr36 for more discussion as to why. For 624 * example, once we've done a UTF8SKIP, we can tell the expected number of 625 * bytes, and could fail right off the bat if the input parameters indicate 626 * that there are too few available. But it could be that just that first 627 * byte is garbled, and the intended character occupies fewer bytes. If we 628 * blindly assumed that the first byte is correct, and skipped based on 629 * that number, we could skip over a valid input character. So instead, we 630 * always examine the sequence byte-by-byte. 631 * 632 * We also should not consume too few bytes, otherwise someone could inject 633 * things. For example, an input could be deliberately designed to 634 * overflow, and if this code bailed out immediately upon discovering that, 635 * returning to the caller C<*retlen> pointing to the very next byte (one 636 * which is actually part of of the overflowing sequence), that could look 637 * legitimate to the caller, which could discard the initial partial 638 * sequence and process the rest, inappropriately */ 639 640 /* Zero length strings, if allowed, of necessity are zero */ 641 if (UNLIKELY(curlen == 0)) { 642 if (retlen) { 643 *retlen = 0; 644 } 645 646 if (flags & UTF8_ALLOW_EMPTY) { 647 return 0; 648 } 649 if (! (flags & UTF8_CHECK_ONLY)) { 650 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text)); 651 } 652 goto malformed; 653 } 654 655 expectlen = UTF8SKIP(s); 656 657 /* A well-formed UTF-8 character, as the vast majority of calls to this 658 * function will be for, has this expected length. For efficiency, set 659 * things up here to return it. It will be overriden only in those rare 660 * cases where a malformation is found */ 661 if (retlen) { 662 *retlen = expectlen; 663 } 664 665 /* An invariant is trivially well-formed */ 666 if (UTF8_IS_INVARIANT(uv)) { 667 return uv; 668 } 669 670 /* A continuation character can't start a valid sequence */ 671 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { 672 if (flags & UTF8_ALLOW_CONTINUATION) { 673 if (retlen) { 674 *retlen = 1; 675 } 676 return UNICODE_REPLACEMENT; 677 } 678 679 if (! (flags & UTF8_CHECK_ONLY)) { 680 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0)); 681 } 682 curlen = 1; 683 goto malformed; 684 } 685 686 /* Here is not a continuation byte, nor an invariant. The only thing left 687 * is a start byte (possibly for an overlong) */ 688 689 #ifdef EBCDIC 690 uv = NATIVE_UTF8_TO_I8(uv); 691 #endif 692 693 /* Remove the leading bits that indicate the number of bytes in the 694 * character's whole UTF-8 sequence, leaving just the bits that are part of 695 * the value */ 696 uv &= UTF_START_MASK(expectlen); 697 698 /* Now, loop through the remaining bytes in the character's sequence, 699 * accumulating each into the working value as we go. Be sure to not look 700 * past the end of the input string */ 701 send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen); 702 703 for (s = s0 + 1; s < send; s++) { 704 if (LIKELY(UTF8_IS_CONTINUATION(*s))) { 705 #ifndef EBCDIC /* Can't overflow in EBCDIC */ 706 if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) { 707 708 /* The original implementors viewed this malformation as more 709 * serious than the others (though I, khw, don't understand 710 * why, since other malformations also give very very wrong 711 * results), so there is no way to turn off checking for it. 712 * Set a flag, but keep going in the loop, so that we absorb 713 * the rest of the bytes that comprise the character. */ 714 overflowed = TRUE; 715 overflow_byte = *s; /* Save for warning message's use */ 716 } 717 #endif 718 uv = UTF8_ACCUMULATE(uv, *s); 719 } 720 else { 721 /* Here, found a non-continuation before processing all expected 722 * bytes. This byte begins a new character, so quit, even if 723 * allowing this malformation. */ 724 unexpected_non_continuation = TRUE; 725 break; 726 } 727 } /* End of loop through the character's bytes */ 728 729 /* Save how many bytes were actually in the character */ 730 curlen = s - s0; 731 732 /* The loop above finds two types of malformations: non-continuation and/or 733 * overflow. The non-continuation malformation is really a too-short 734 * malformation, as it means that the current character ended before it was 735 * expected to (being terminated prematurely by the beginning of the next 736 * character, whereas in the too-short malformation there just are too few 737 * bytes available to hold the character. In both cases, the check below 738 * that we have found the expected number of bytes would fail if executed.) 739 * Thus the non-continuation malformation is really unnecessary, being a 740 * subset of the too-short malformation. But there may be existing 741 * applications that are expecting the non-continuation type, so we retain 742 * it, and return it in preference to the too-short malformation. (If this 743 * code were being written from scratch, the two types might be collapsed 744 * into one.) I, khw, am also giving priority to returning the 745 * non-continuation and too-short malformations over overflow when multiple 746 * ones are present. I don't know of any real reason to prefer one over 747 * the other, except that it seems to me that multiple-byte errors trumps 748 * errors from a single byte */ 749 if (UNLIKELY(unexpected_non_continuation)) { 750 if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) { 751 if (! (flags & UTF8_CHECK_ONLY)) { 752 if (curlen == 1) { 753 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0)); 754 } 755 else { 756 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen)); 757 } 758 } 759 goto malformed; 760 } 761 uv = UNICODE_REPLACEMENT; 762 763 /* Skip testing for overlongs, as the REPLACEMENT may not be the same 764 * as what the original expectations were. */ 765 do_overlong_test = FALSE; 766 if (retlen) { 767 *retlen = curlen; 768 } 769 } 770 else if (UNLIKELY(curlen < expectlen)) { 771 if (! (flags & UTF8_ALLOW_SHORT)) { 772 if (! (flags & UTF8_CHECK_ONLY)) { 773 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0)); 774 } 775 goto malformed; 776 } 777 uv = UNICODE_REPLACEMENT; 778 do_overlong_test = FALSE; 779 if (retlen) { 780 *retlen = curlen; 781 } 782 } 783 784 #ifndef EBCDIC /* EBCDIC can't overflow */ 785 if (UNLIKELY(overflowed)) { 786 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0)); 787 goto malformed; 788 } 789 #endif 790 791 if (do_overlong_test 792 && expectlen > (STRLEN) OFFUNISKIP(uv) 793 && ! (flags & UTF8_ALLOW_LONG)) 794 { 795 /* The overlong malformation has lower precedence than the others. 796 * Note that if this malformation is allowed, we return the actual 797 * value, instead of the replacement character. This is because this 798 * value is actually well-defined. */ 799 if (! (flags & UTF8_CHECK_ONLY)) { 800 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0)); 801 } 802 goto malformed; 803 } 804 805 /* Here, the input is considered to be well-formed, but it still could be a 806 * problematic code point that is not allowed by the input parameters. */ 807 if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */ 808 && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE 809 |UTF8_WARN_ILLEGAL_INTERCHANGE))) 810 { 811 if (UNICODE_IS_SURROGATE(uv)) { 812 813 /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary 814 * generation of the sv, since no warnings are raised under CHECK */ 815 if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE 816 && ckWARN_d(WARN_SURROGATE)) 817 { 818 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv)); 819 pack_warn = packWARN(WARN_SURROGATE); 820 } 821 if (flags & UTF8_DISALLOW_SURROGATE) { 822 goto disallowed; 823 } 824 } 825 else if ((uv > PERL_UNICODE_MAX)) { 826 if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER 827 && ckWARN_d(WARN_NON_UNICODE)) 828 { 829 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv)); 830 pack_warn = packWARN(WARN_NON_UNICODE); 831 } 832 #ifndef EBCDIC /* EBCDIC always allows FE, FF */ 833 834 /* The first byte being 0xFE or 0xFF is a subset of the SUPER code 835 * points. We test for these after the regular SUPER ones, and 836 * before possibly bailing out, so that the more dire warning 837 * overrides the regular one, if applicable */ 838 if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */ 839 && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF))) 840 { 841 if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) 842 == UTF8_WARN_FE_FF 843 && ckWARN_d(WARN_UTF8)) 844 { 845 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv)); 846 pack_warn = packWARN(WARN_UTF8); 847 } 848 if (flags & UTF8_DISALLOW_FE_FF) { 849 goto disallowed; 850 } 851 } 852 #endif 853 if (flags & UTF8_DISALLOW_SUPER) { 854 goto disallowed; 855 } 856 } 857 else if (UNICODE_IS_NONCHAR(uv)) { 858 if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR 859 && ckWARN_d(WARN_NONCHAR)) 860 { 861 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv)); 862 pack_warn = packWARN(WARN_NONCHAR); 863 } 864 if (flags & UTF8_DISALLOW_NONCHAR) { 865 goto disallowed; 866 } 867 } 868 869 if (sv) { 870 outlier_ret = uv; /* Note we don't bother to convert to native, 871 as all the outlier code points are the same 872 in both ASCII and EBCDIC */ 873 goto do_warn; 874 } 875 876 /* Here, this is not considered a malformed character, so drop through 877 * to return it */ 878 } 879 880 return UNI_TO_NATIVE(uv); 881 882 /* There are three cases which get to beyond this point. In all 3 cases: 883 * <sv> if not null points to a string to print as a warning. 884 * <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't 885 * set. 886 * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set. 887 * This is done by initializing it to 0, and changing it only 888 * for case 1). 889 * The 3 cases are: 890 * 1) The input is valid but problematic, and to be warned about. The 891 * return value is the resultant code point; <*retlen> is set to 892 * <curlen>, the number of bytes that comprise the code point. 893 * <pack_warn> contains the result of packWARN() for the warning 894 * types. The entry point for this case is the label <do_warn>; 895 * 2) The input is a valid code point but disallowed by the parameters to 896 * this function. The return value is 0. If UTF8_CHECK_ONLY is set, 897 * <*relen> is -1; otherwise it is <curlen>, the number of bytes that 898 * comprise the code point. <pack_warn> contains the result of 899 * packWARN() for the warning types. The entry point for this case is 900 * the label <disallowed>. 901 * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY 902 * is set, <*relen> is -1; otherwise it is <curlen>, the number of 903 * bytes that comprise the malformation. All such malformations are 904 * assumed to be warning type <utf8>. The entry point for this case 905 * is the label <malformed>. 906 */ 907 908 malformed: 909 910 if (sv && ckWARN_d(WARN_UTF8)) { 911 pack_warn = packWARN(WARN_UTF8); 912 } 913 914 disallowed: 915 916 if (flags & UTF8_CHECK_ONLY) { 917 if (retlen) 918 *retlen = ((STRLEN) -1); 919 return 0; 920 } 921 922 do_warn: 923 924 if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only 925 if warnings are to be raised. */ 926 const char * const string = SvPVX_const(sv); 927 928 if (PL_op) 929 Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op)); 930 else 931 Perl_warner(aTHX_ pack_warn, "%s", string); 932 } 933 934 if (retlen) { 935 *retlen = curlen; 936 } 937 938 return outlier_ret; 939 } 940 941 /* 942 =for apidoc utf8_to_uvchr_buf 943 944 Returns the native code point of the first character in the string C<s> which 945 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>. 946 C<*retlen> will be set to the length, in bytes, of that character. 947 948 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are 949 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't 950 NULL) to -1. If those warnings are off, the computed value, if well-defined 951 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and 952 C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is 953 the next possible position in C<s> that could begin a non-malformed character. 954 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is 955 returned. 956 957 =cut 958 */ 959 960 961 UV 962 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 963 { 964 assert(s < send); 965 966 return utf8n_to_uvchr(s, send - s, retlen, 967 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 968 } 969 970 /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that 971 * there are no malformations in the input UTF-8 string C<s>. surrogates, 972 * non-character code points, and non-Unicode code points are allowed. */ 973 974 UV 975 Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) 976 { 977 UV expectlen = UTF8SKIP(s); 978 const U8* send = s + expectlen; 979 UV uv = *s; 980 981 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; 982 983 if (retlen) { 984 *retlen = expectlen; 985 } 986 987 /* An invariant is trivially returned */ 988 if (expectlen == 1) { 989 return uv; 990 } 991 992 #ifdef EBCDIC 993 uv = NATIVE_UTF8_TO_I8(uv); 994 #endif 995 996 /* Remove the leading bits that indicate the number of bytes, leaving just 997 * the bits that are part of the value */ 998 uv &= UTF_START_MASK(expectlen); 999 1000 /* Now, loop through the remaining bytes, accumulating each into the 1001 * working total as we go. (I khw tried unrolling the loop for up to 4 1002 * bytes, but there was no performance improvement) */ 1003 for (++s; s < send; s++) { 1004 uv = UTF8_ACCUMULATE(uv, *s); 1005 } 1006 1007 return UNI_TO_NATIVE(uv); 1008 1009 } 1010 1011 /* 1012 =for apidoc utf8_to_uvchr 1013 1014 Returns the native code point of the first character in the string C<s> 1015 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 1016 length, in bytes, of that character. 1017 1018 Some, but not all, UTF-8 malformations are detected, and in fact, some 1019 malformed input could cause reading beyond the end of the input buffer, which 1020 is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead. 1021 1022 If C<s> points to one of the detected malformations, and UTF8 warnings are 1023 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't 1024 NULL) to -1. If those warnings are off, the computed value if well-defined (or 1025 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> 1026 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the 1027 next possible position in C<s> that could begin a non-malformed character. 1028 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned. 1029 1030 =cut 1031 */ 1032 1033 UV 1034 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) 1035 { 1036 PERL_ARGS_ASSERT_UTF8_TO_UVCHR; 1037 1038 return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen); 1039 } 1040 1041 /* 1042 =for apidoc utf8_to_uvuni_buf 1043 1044 Only in very rare circumstances should code need to be dealing in Unicode 1045 (as opposed to native) code points. In those few cases, use 1046 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead. 1047 1048 Returns the Unicode (not-native) code point of the first character in the 1049 string C<s> which 1050 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>. 1051 C<retlen> will be set to the length, in bytes, of that character. 1052 1053 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are 1054 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't 1055 NULL) to -1. If those warnings are off, the computed value if well-defined (or 1056 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> 1057 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the 1058 next possible position in C<s> that could begin a non-malformed character. 1059 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned. 1060 1061 =cut 1062 */ 1063 1064 UV 1065 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 1066 { 1067 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF; 1068 1069 assert(send > s); 1070 1071 /* Call the low level routine asking for checks */ 1072 return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen, 1073 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)); 1074 } 1075 1076 /* DEPRECATED! 1077 * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that 1078 * there are no malformations in the input UTF-8 string C<s>. Surrogates, 1079 * non-character code points, and non-Unicode code points are allowed */ 1080 1081 UV 1082 Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) 1083 { 1084 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI; 1085 1086 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); 1087 } 1088 1089 /* 1090 =for apidoc utf8_to_uvuni 1091 1092 Returns the Unicode code point of the first character in the string C<s> 1093 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the 1094 length, in bytes, of that character. 1095 1096 Some, but not all, UTF-8 malformations are detected, and in fact, some 1097 malformed input could cause reading beyond the end of the input buffer, which 1098 is one reason why this function is deprecated. The other is that only in 1099 extremely limited circumstances should the Unicode versus native code point be 1100 of any interest to you. See L</utf8_to_uvuni_buf> for alternatives. 1101 1102 If C<s> points to one of the detected malformations, and UTF8 warnings are 1103 enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to 1104 NULL) to -1. If those warnings are off, the computed value if well-defined (or 1105 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> 1106 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the 1107 next possible position in C<s> that could begin a non-malformed character. 1108 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned. 1109 1110 =cut 1111 */ 1112 1113 UV 1114 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) 1115 { 1116 PERL_ARGS_ASSERT_UTF8_TO_UVUNI; 1117 1118 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); 1119 } 1120 1121 /* 1122 =for apidoc utf8_length 1123 1124 Return the length of the UTF-8 char encoded string C<s> in characters. 1125 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end 1126 up past C<e>, croaks. 1127 1128 =cut 1129 */ 1130 1131 STRLEN 1132 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) 1133 { 1134 dVAR; 1135 STRLEN len = 0; 1136 1137 PERL_ARGS_ASSERT_UTF8_LENGTH; 1138 1139 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. 1140 * the bitops (especially ~) can create illegal UTF-8. 1141 * In other words: in Perl UTF-8 is not just for Unicode. */ 1142 1143 if (e < s) 1144 goto warn_and_return; 1145 while (s < e) { 1146 s += UTF8SKIP(s); 1147 len++; 1148 } 1149 1150 if (e != s) { 1151 len--; 1152 warn_and_return: 1153 if (PL_op) 1154 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 1155 "%s in %s", unees, OP_DESC(PL_op)); 1156 else 1157 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); 1158 } 1159 1160 return len; 1161 } 1162 1163 /* 1164 =for apidoc utf8_distance 1165 1166 Returns the number of UTF-8 characters between the UTF-8 pointers C<a> 1167 and C<b>. 1168 1169 WARNING: use only if you *know* that the pointers point inside the 1170 same UTF-8 buffer. 1171 1172 =cut 1173 */ 1174 1175 IV 1176 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) 1177 { 1178 PERL_ARGS_ASSERT_UTF8_DISTANCE; 1179 1180 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); 1181 } 1182 1183 /* 1184 =for apidoc utf8_hop 1185 1186 Return the UTF-8 pointer C<s> displaced by C<off> characters, either 1187 forward or backward. 1188 1189 WARNING: do not use the following unless you *know* C<off> is within 1190 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned 1191 on the first byte of character or just after the last byte of a character. 1192 1193 =cut 1194 */ 1195 1196 U8 * 1197 Perl_utf8_hop(pTHX_ const U8 *s, I32 off) 1198 { 1199 PERL_ARGS_ASSERT_UTF8_HOP; 1200 1201 PERL_UNUSED_CONTEXT; 1202 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g 1203 * the bitops (especially ~) can create illegal UTF-8. 1204 * In other words: in Perl UTF-8 is not just for Unicode. */ 1205 1206 if (off >= 0) { 1207 while (off--) 1208 s += UTF8SKIP(s); 1209 } 1210 else { 1211 while (off++) { 1212 s--; 1213 while (UTF8_IS_CONTINUATION(*s)) 1214 s--; 1215 } 1216 } 1217 return (U8 *)s; 1218 } 1219 1220 /* 1221 =for apidoc bytes_cmp_utf8 1222 1223 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the 1224 sequence of characters (stored as UTF-8) 1225 in C<u>, C<ulen>. Returns 0 if they are 1226 equal, -1 or -2 if the first string is less than the second string, +1 or +2 1227 if the first string is greater than the second string. 1228 1229 -1 or +1 is returned if the shorter string was identical to the start of the 1230 longer string. -2 or +2 is returned if 1231 there was a difference between characters 1232 within the strings. 1233 1234 =cut 1235 */ 1236 1237 int 1238 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) 1239 { 1240 const U8 *const bend = b + blen; 1241 const U8 *const uend = u + ulen; 1242 1243 PERL_ARGS_ASSERT_BYTES_CMP_UTF8; 1244 1245 PERL_UNUSED_CONTEXT; 1246 1247 while (b < bend && u < uend) { 1248 U8 c = *u++; 1249 if (!UTF8_IS_INVARIANT(c)) { 1250 if (UTF8_IS_DOWNGRADEABLE_START(c)) { 1251 if (u < uend) { 1252 U8 c1 = *u++; 1253 if (UTF8_IS_CONTINUATION(c1)) { 1254 c = TWO_BYTE_UTF8_TO_NATIVE(c, c1); 1255 } else { 1256 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 1257 "Malformed UTF-8 character " 1258 "(unexpected non-continuation byte 0x%02x" 1259 ", immediately after start byte 0x%02x)" 1260 /* Dear diag.t, it's in the pod. */ 1261 "%s%s", c1, c, 1262 PL_op ? " in " : "", 1263 PL_op ? OP_DESC(PL_op) : ""); 1264 return -2; 1265 } 1266 } else { 1267 if (PL_op) 1268 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 1269 "%s in %s", unees, OP_DESC(PL_op)); 1270 else 1271 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); 1272 return -2; /* Really want to return undef :-) */ 1273 } 1274 } else { 1275 return -2; 1276 } 1277 } 1278 if (*b != c) { 1279 return *b < c ? -2 : +2; 1280 } 1281 ++b; 1282 } 1283 1284 if (b == bend && u == uend) 1285 return 0; 1286 1287 return b < bend ? +1 : -1; 1288 } 1289 1290 /* 1291 =for apidoc utf8_to_bytes 1292 1293 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. 1294 Unlike L</bytes_to_utf8>, this over-writes the original string, and 1295 updates C<len> to contain the new length. 1296 Returns zero on failure, setting C<len> to -1. 1297 1298 If you need a copy of the string, see L</bytes_from_utf8>. 1299 1300 =cut 1301 */ 1302 1303 U8 * 1304 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) 1305 { 1306 U8 * const save = s; 1307 U8 * const send = s + *len; 1308 U8 *d; 1309 1310 PERL_ARGS_ASSERT_UTF8_TO_BYTES; 1311 1312 /* ensure valid UTF-8 and chars < 256 before updating string */ 1313 while (s < send) { 1314 if (! UTF8_IS_INVARIANT(*s)) { 1315 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { 1316 *len = ((STRLEN) -1); 1317 return 0; 1318 } 1319 s++; 1320 } 1321 s++; 1322 } 1323 1324 d = s = save; 1325 while (s < send) { 1326 U8 c = *s++; 1327 if (! UTF8_IS_INVARIANT(c)) { 1328 /* Then it is two-byte encoded */ 1329 c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); 1330 s++; 1331 } 1332 *d++ = c; 1333 } 1334 *d = '\0'; 1335 *len = d - save; 1336 return save; 1337 } 1338 1339 /* 1340 =for apidoc bytes_from_utf8 1341 1342 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. 1343 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to 1344 the newly-created string, and updates C<len> to contain the new 1345 length. Returns the original string if no conversion occurs, C<len> 1346 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to 1347 0 if C<s> is converted or consisted entirely of characters that are invariant 1348 in utf8 (i.e., US-ASCII on non-EBCDIC machines). 1349 1350 =cut 1351 */ 1352 1353 U8 * 1354 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) 1355 { 1356 U8 *d; 1357 const U8 *start = s; 1358 const U8 *send; 1359 I32 count = 0; 1360 1361 PERL_ARGS_ASSERT_BYTES_FROM_UTF8; 1362 1363 PERL_UNUSED_CONTEXT; 1364 if (!*is_utf8) 1365 return (U8 *)start; 1366 1367 /* ensure valid UTF-8 and chars < 256 before converting string */ 1368 for (send = s + *len; s < send;) { 1369 if (! UTF8_IS_INVARIANT(*s)) { 1370 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { 1371 return (U8 *)start; 1372 } 1373 count++; 1374 s++; 1375 } 1376 s++; 1377 } 1378 1379 *is_utf8 = FALSE; 1380 1381 Newx(d, (*len) - count + 1, U8); 1382 s = start; start = d; 1383 while (s < send) { 1384 U8 c = *s++; 1385 if (! UTF8_IS_INVARIANT(c)) { 1386 /* Then it is two-byte encoded */ 1387 c = TWO_BYTE_UTF8_TO_NATIVE(c, *s); 1388 s++; 1389 } 1390 *d++ = c; 1391 } 1392 *d = '\0'; 1393 *len = d - start; 1394 return (U8 *)start; 1395 } 1396 1397 /* 1398 =for apidoc bytes_to_utf8 1399 1400 Converts a string C<s> of length C<len> bytes from the native encoding into 1401 UTF-8. 1402 Returns a pointer to the newly-created string, and sets C<len> to 1403 reflect the new length in bytes. 1404 1405 A C<NUL> character will be written after the end of the string. 1406 1407 If you want to convert to UTF-8 from encodings other than 1408 the native (Latin1 or EBCDIC), 1409 see L</sv_recode_to_utf8>(). 1410 1411 =cut 1412 */ 1413 1414 /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will 1415 likewise need duplication. */ 1416 1417 U8* 1418 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) 1419 { 1420 const U8 * const send = s + (*len); 1421 U8 *d; 1422 U8 *dst; 1423 1424 PERL_ARGS_ASSERT_BYTES_TO_UTF8; 1425 PERL_UNUSED_CONTEXT; 1426 1427 Newx(d, (*len) * 2 + 1, U8); 1428 dst = d; 1429 1430 while (s < send) { 1431 append_utf8_from_native_byte(*s, &d); 1432 s++; 1433 } 1434 *d = '\0'; 1435 *len = d-dst; 1436 return dst; 1437 } 1438 1439 /* 1440 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. 1441 * 1442 * Destination must be pre-extended to 3/2 source. Do not use in-place. 1443 * We optimize for native, for obvious reasons. */ 1444 1445 U8* 1446 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) 1447 { 1448 U8* pend; 1449 U8* dstart = d; 1450 1451 PERL_ARGS_ASSERT_UTF16_TO_UTF8; 1452 1453 if (bytelen & 1) 1454 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); 1455 1456 pend = p + bytelen; 1457 1458 while (p < pend) { 1459 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ 1460 p += 2; 1461 if (UNI_IS_INVARIANT(uv)) { 1462 *d++ = LATIN1_TO_NATIVE((U8) uv); 1463 continue; 1464 } 1465 if (uv <= MAX_UTF8_TWO_BYTE) { 1466 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv)); 1467 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv)); 1468 continue; 1469 } 1470 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST 1471 #define LAST_HIGH_SURROGATE 0xDBFF 1472 #define FIRST_LOW_SURROGATE 0xDC00 1473 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST 1474 if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) { 1475 if (p >= pend) { 1476 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 1477 } else { 1478 UV low = (p[0] << 8) + p[1]; 1479 p += 2; 1480 if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE) 1481 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 1482 uv = ((uv - FIRST_HIGH_SURROGATE) << 10) 1483 + (low - FIRST_LOW_SURROGATE) + 0x10000; 1484 } 1485 } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) { 1486 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 1487 } 1488 #ifdef EBCDIC 1489 d = uvoffuni_to_utf8_flags(d, uv, 0); 1490 #else 1491 if (uv < 0x10000) { 1492 *d++ = (U8)(( uv >> 12) | 0xe0); 1493 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 1494 *d++ = (U8)(( uv & 0x3f) | 0x80); 1495 continue; 1496 } 1497 else { 1498 *d++ = (U8)(( uv >> 18) | 0xf0); 1499 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80); 1500 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); 1501 *d++ = (U8)(( uv & 0x3f) | 0x80); 1502 continue; 1503 } 1504 #endif 1505 } 1506 *newlen = d - dstart; 1507 return d; 1508 } 1509 1510 /* Note: this one is slightly destructive of the source. */ 1511 1512 U8* 1513 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) 1514 { 1515 U8* s = (U8*)p; 1516 U8* const send = s + bytelen; 1517 1518 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; 1519 1520 if (bytelen & 1) 1521 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf, 1522 (UV)bytelen); 1523 1524 while (s < send) { 1525 const U8 tmp = s[0]; 1526 s[0] = s[1]; 1527 s[1] = tmp; 1528 s += 2; 1529 } 1530 return utf16_to_utf8(p, d, bytelen, newlen); 1531 } 1532 1533 bool 1534 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) 1535 { 1536 U8 tmpbuf[UTF8_MAXBYTES+1]; 1537 uvchr_to_utf8(tmpbuf, c); 1538 return _is_utf8_FOO(classnum, tmpbuf); 1539 } 1540 1541 /* Internal function so we can deprecate the external one, and call 1542 this one from other deprecated functions in this file */ 1543 1544 PERL_STATIC_INLINE bool 1545 S_is_utf8_idfirst(pTHX_ const U8 *p) 1546 { 1547 dVAR; 1548 1549 if (*p == '_') 1550 return TRUE; 1551 /* is_utf8_idstart would be more logical. */ 1552 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL); 1553 } 1554 1555 bool 1556 Perl_is_uni_idfirst(pTHX_ UV c) 1557 { 1558 U8 tmpbuf[UTF8_MAXBYTES+1]; 1559 uvchr_to_utf8(tmpbuf, c); 1560 return S_is_utf8_idfirst(aTHX_ tmpbuf); 1561 } 1562 1563 bool 1564 Perl__is_uni_perl_idcont(pTHX_ UV c) 1565 { 1566 U8 tmpbuf[UTF8_MAXBYTES+1]; 1567 uvchr_to_utf8(tmpbuf, c); 1568 return _is_utf8_perl_idcont(tmpbuf); 1569 } 1570 1571 bool 1572 Perl__is_uni_perl_idstart(pTHX_ UV c) 1573 { 1574 U8 tmpbuf[UTF8_MAXBYTES+1]; 1575 uvchr_to_utf8(tmpbuf, c); 1576 return _is_utf8_perl_idstart(tmpbuf); 1577 } 1578 1579 UV 1580 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s) 1581 { 1582 /* We have the latin1-range values compiled into the core, so just use 1583 * those, converting the result to utf8. The only difference between upper 1584 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is 1585 * either "SS" or "Ss". Which one to use is passed into the routine in 1586 * 'S_or_s' to avoid a test */ 1587 1588 UV converted = toUPPER_LATIN1_MOD(c); 1589 1590 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1; 1591 1592 assert(S_or_s == 'S' || S_or_s == 's'); 1593 1594 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for 1595 characters in this range */ 1596 *p = (U8) converted; 1597 *lenp = 1; 1598 return converted; 1599 } 1600 1601 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers, 1602 * which it maps to one of them, so as to only have to have one check for 1603 * it in the main case */ 1604 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { 1605 switch (c) { 1606 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 1607 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; 1608 break; 1609 case MICRO_SIGN: 1610 converted = GREEK_CAPITAL_LETTER_MU; 1611 break; 1612 case LATIN_SMALL_LETTER_SHARP_S: 1613 *(p)++ = 'S'; 1614 *p = S_or_s; 1615 *lenp = 2; 1616 return 'S'; 1617 default: 1618 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); 1619 assert(0); /* NOTREACHED */ 1620 } 1621 } 1622 1623 *(p)++ = UTF8_TWO_BYTE_HI(converted); 1624 *p = UTF8_TWO_BYTE_LO(converted); 1625 *lenp = 2; 1626 1627 return converted; 1628 } 1629 1630 /* Call the function to convert a UTF-8 encoded character to the specified case. 1631 * Note that there may be more than one character in the result. 1632 * INP is a pointer to the first byte of the input character 1633 * OUTP will be set to the first byte of the string of changed characters. It 1634 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes 1635 * LENP will be set to the length in bytes of the string of changed characters 1636 * 1637 * The functions return the ordinal of the first character in the string of OUTP */ 1638 #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "") 1639 #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "") 1640 #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "") 1641 1642 /* This additionally has the input parameter SPECIALS, which if non-zero will 1643 * cause this to use the SPECIALS hash for folding (meaning get full case 1644 * folding); otherwise, when zero, this implies a simple case fold */ 1645 #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL) 1646 1647 UV 1648 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) 1649 { 1650 dVAR; 1651 1652 /* Convert the Unicode character whose ordinal is <c> to its uppercase 1653 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>. 1654 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since 1655 * the changed version may be longer than the original character. 1656 * 1657 * The ordinal of the first character of the changed version is returned 1658 * (but note, as explained above, that there may be more.) */ 1659 1660 PERL_ARGS_ASSERT_TO_UNI_UPPER; 1661 1662 if (c < 256) { 1663 return _to_upper_title_latin1((U8) c, p, lenp, 'S'); 1664 } 1665 1666 uvchr_to_utf8(p, c); 1667 return CALL_UPPER_CASE(p, p, lenp); 1668 } 1669 1670 UV 1671 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) 1672 { 1673 dVAR; 1674 1675 PERL_ARGS_ASSERT_TO_UNI_TITLE; 1676 1677 if (c < 256) { 1678 return _to_upper_title_latin1((U8) c, p, lenp, 's'); 1679 } 1680 1681 uvchr_to_utf8(p, c); 1682 return CALL_TITLE_CASE(p, p, lenp); 1683 } 1684 1685 STATIC U8 1686 S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp) 1687 { 1688 /* We have the latin1-range values compiled into the core, so just use 1689 * those, converting the result to utf8. Since the result is always just 1690 * one character, we allow <p> to be NULL */ 1691 1692 U8 converted = toLOWER_LATIN1(c); 1693 1694 if (p != NULL) { 1695 if (NATIVE_BYTE_IS_INVARIANT(converted)) { 1696 *p = converted; 1697 *lenp = 1; 1698 } 1699 else { 1700 *p = UTF8_TWO_BYTE_HI(converted); 1701 *(p+1) = UTF8_TWO_BYTE_LO(converted); 1702 *lenp = 2; 1703 } 1704 } 1705 return converted; 1706 } 1707 1708 UV 1709 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) 1710 { 1711 dVAR; 1712 1713 PERL_ARGS_ASSERT_TO_UNI_LOWER; 1714 1715 if (c < 256) { 1716 return to_lower_latin1((U8) c, p, lenp); 1717 } 1718 1719 uvchr_to_utf8(p, c); 1720 return CALL_LOWER_CASE(p, p, lenp); 1721 } 1722 1723 UV 1724 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) 1725 { 1726 /* Corresponds to to_lower_latin1(); <flags> bits meanings: 1727 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 1728 * FOLD_FLAGS_FULL iff full folding is to be used; 1729 * 1730 * Not to be used for locale folds 1731 */ 1732 1733 UV converted; 1734 1735 PERL_ARGS_ASSERT__TO_FOLD_LATIN1; 1736 1737 assert (! (flags & FOLD_FLAGS_LOCALE)); 1738 1739 if (c == MICRO_SIGN) { 1740 converted = GREEK_SMALL_LETTER_MU; 1741 } 1742 else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) { 1743 1744 /* If can't cross 127/128 boundary, can't return "ss"; instead return 1745 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") 1746 * under those circumstances. */ 1747 if (flags & FOLD_FLAGS_NOMIX_ASCII) { 1748 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; 1749 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 1750 p, *lenp, U8); 1751 return LATIN_SMALL_LETTER_LONG_S; 1752 } 1753 else { 1754 *(p)++ = 's'; 1755 *p = 's'; 1756 *lenp = 2; 1757 return 's'; 1758 } 1759 } 1760 else { /* In this range the fold of all other characters is their lower 1761 case */ 1762 converted = toLOWER_LATIN1(c); 1763 } 1764 1765 if (UVCHR_IS_INVARIANT(converted)) { 1766 *p = (U8) converted; 1767 *lenp = 1; 1768 } 1769 else { 1770 *(p)++ = UTF8_TWO_BYTE_HI(converted); 1771 *p = UTF8_TWO_BYTE_LO(converted); 1772 *lenp = 2; 1773 } 1774 1775 return converted; 1776 } 1777 1778 UV 1779 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) 1780 { 1781 1782 /* Not currently externally documented, and subject to change 1783 * <flags> bits meanings: 1784 * FOLD_FLAGS_FULL iff full folding is to be used; 1785 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 1786 * locale are to be used. 1787 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 1788 */ 1789 1790 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; 1791 1792 /* Tread a UTF-8 locale as not being in locale at all */ 1793 if (IN_UTF8_CTYPE_LOCALE) { 1794 flags &= ~FOLD_FLAGS_LOCALE; 1795 } 1796 1797 if (c < 256) { 1798 UV result = _to_fold_latin1((U8) c, p, lenp, 1799 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); 1800 /* It is illegal for the fold to cross the 255/256 boundary under 1801 * locale; in this case return the original */ 1802 return (result > 256 && flags & FOLD_FLAGS_LOCALE) 1803 ? c 1804 : result; 1805 } 1806 1807 /* If no special needs, just use the macro */ 1808 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { 1809 uvchr_to_utf8(p, c); 1810 return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL); 1811 } 1812 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with 1813 the special flags. */ 1814 U8 utf8_c[UTF8_MAXBYTES + 1]; 1815 uvchr_to_utf8(utf8_c, c); 1816 return _to_utf8_fold_flags(utf8_c, p, lenp, flags); 1817 } 1818 } 1819 1820 PERL_STATIC_INLINE bool 1821 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, 1822 const char *const swashname, SV* const invlist) 1823 { 1824 /* returns a boolean giving whether or not the UTF8-encoded character that 1825 * starts at <p> is in the swash indicated by <swashname>. <swash> 1826 * contains a pointer to where the swash indicated by <swashname> 1827 * is to be stored; which this routine will do, so that future calls will 1828 * look at <*swash> and only generate a swash if it is not null. <invlist> 1829 * is NULL or an inversion list that defines the swash. If not null, it 1830 * saves time during initialization of the swash. 1831 * 1832 * Note that it is assumed that the buffer length of <p> is enough to 1833 * contain all the bytes that comprise the character. Thus, <*p> should 1834 * have been checked before this call for mal-formedness enough to assure 1835 * that. */ 1836 1837 dVAR; 1838 1839 PERL_ARGS_ASSERT_IS_UTF8_COMMON; 1840 1841 /* The API should have included a length for the UTF-8 character in <p>, 1842 * but it doesn't. We therefore assume that p has been validated at least 1843 * as far as there being enough bytes available in it to accommodate the 1844 * character without reading beyond the end, and pass that number on to the 1845 * validating routine */ 1846 if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) { 1847 if (ckWARN_d(WARN_UTF8)) { 1848 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8), 1849 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname); 1850 if (ckWARN(WARN_UTF8)) { /* This will output details as to the 1851 what the malformation is */ 1852 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL); 1853 } 1854 } 1855 return FALSE; 1856 } 1857 if (!*swash) { 1858 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; 1859 *swash = _core_swash_init("utf8", 1860 1861 /* Only use the name if there is no inversion 1862 * list; otherwise will go out to disk */ 1863 (invlist) ? "" : swashname, 1864 1865 &PL_sv_undef, 1, 0, invlist, &flags); 1866 } 1867 1868 return swash_fetch(*swash, p, TRUE) != 0; 1869 } 1870 1871 bool 1872 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) 1873 { 1874 dVAR; 1875 1876 PERL_ARGS_ASSERT__IS_UTF8_FOO; 1877 1878 assert(classnum < _FIRST_NON_SWASH_CC); 1879 1880 return is_utf8_common(p, 1881 &PL_utf8_swash_ptrs[classnum], 1882 swash_property_names[classnum], 1883 PL_XPosix_ptrs[classnum]); 1884 } 1885 1886 bool 1887 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ 1888 { 1889 dVAR; 1890 1891 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST; 1892 1893 return S_is_utf8_idfirst(aTHX_ p); 1894 } 1895 1896 bool 1897 Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */ 1898 { 1899 dVAR; 1900 1901 PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST; 1902 1903 if (*p == '_') 1904 return TRUE; 1905 /* is_utf8_idstart would be more logical. */ 1906 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL); 1907 } 1908 1909 bool 1910 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) 1911 { 1912 dVAR; 1913 SV* invlist = NULL; 1914 1915 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; 1916 1917 if (! PL_utf8_perl_idstart) { 1918 invlist = _new_invlist_C_array(_Perl_IDStart_invlist); 1919 } 1920 return is_utf8_common(p, &PL_utf8_perl_idstart, "", invlist); 1921 } 1922 1923 bool 1924 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) 1925 { 1926 dVAR; 1927 SV* invlist = NULL; 1928 1929 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; 1930 1931 if (! PL_utf8_perl_idcont) { 1932 invlist = _new_invlist_C_array(_Perl_IDCont_invlist); 1933 } 1934 return is_utf8_common(p, &PL_utf8_perl_idcont, "", invlist); 1935 } 1936 1937 1938 bool 1939 Perl_is_utf8_idcont(pTHX_ const U8 *p) 1940 { 1941 dVAR; 1942 1943 PERL_ARGS_ASSERT_IS_UTF8_IDCONT; 1944 1945 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL); 1946 } 1947 1948 bool 1949 Perl_is_utf8_xidcont(pTHX_ const U8 *p) 1950 { 1951 dVAR; 1952 1953 PERL_ARGS_ASSERT_IS_UTF8_XIDCONT; 1954 1955 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL); 1956 } 1957 1958 bool 1959 Perl__is_utf8_mark(pTHX_ const U8 *p) 1960 { 1961 dVAR; 1962 1963 PERL_ARGS_ASSERT__IS_UTF8_MARK; 1964 1965 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL); 1966 } 1967 1968 /* 1969 =for apidoc to_utf8_case 1970 1971 C<p> contains the pointer to the UTF-8 string encoding 1972 the character that is being converted. This routine assumes that the character 1973 at C<p> is well-formed. 1974 1975 C<ustrp> is a pointer to the character buffer to put the 1976 conversion result to. C<lenp> is a pointer to the length 1977 of the result. 1978 1979 C<swashp> is a pointer to the swash to use. 1980 1981 Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>, 1982 and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>. C<special> (usually, 1983 but not always, a multicharacter mapping), is tried first. 1984 1985 C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use 1986 any special mappings; C<""> means to use the special mappings. Values other 1987 than these two are treated as the name of the hash containing the special 1988 mappings, like C<"utf8::ToSpecLower">. 1989 1990 C<normal> is a string like "ToLower" which means the swash 1991 %utf8::ToLower. 1992 1993 =cut */ 1994 1995 UV 1996 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, 1997 SV **swashp, const char *normal, const char *special) 1998 { 1999 dVAR; 2000 STRLEN len = 0; 2001 const UV uv1 = valid_utf8_to_uvchr(p, NULL); 2002 2003 PERL_ARGS_ASSERT_TO_UTF8_CASE; 2004 2005 /* Note that swash_fetch() doesn't output warnings for these because it 2006 * assumes we will */ 2007 if (uv1 >= UNICODE_SURROGATE_FIRST) { 2008 if (uv1 <= UNICODE_SURROGATE_LAST) { 2009 if (ckWARN_d(WARN_SURROGATE)) { 2010 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 2011 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 2012 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1); 2013 } 2014 } 2015 else if (UNICODE_IS_SUPER(uv1)) { 2016 if (ckWARN_d(WARN_NON_UNICODE)) { 2017 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 2018 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 2019 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1); 2020 } 2021 } 2022 2023 /* Note that non-characters are perfectly legal, so no warning should 2024 * be given */ 2025 } 2026 2027 if (!*swashp) /* load on-demand */ 2028 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); 2029 2030 if (special) { 2031 /* It might be "special" (sometimes, but not always, 2032 * a multicharacter mapping) */ 2033 HV *hv = NULL; 2034 SV **svp; 2035 2036 /* If passed in the specials name, use that; otherwise use any 2037 * given in the swash */ 2038 if (*special != '\0') { 2039 hv = get_hv(special, 0); 2040 } 2041 else { 2042 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0); 2043 if (svp) { 2044 hv = MUTABLE_HV(SvRV(*svp)); 2045 } 2046 } 2047 2048 if (hv 2049 && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) 2050 && (*svp)) 2051 { 2052 const char *s; 2053 2054 s = SvPV_const(*svp, len); 2055 if (len == 1) 2056 /* EIGHTBIT */ 2057 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp; 2058 else { 2059 Copy(s, ustrp, len, U8); 2060 } 2061 } 2062 } 2063 2064 if (!len && *swashp) { 2065 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */); 2066 2067 if (uv2) { 2068 /* It was "normal" (a single character mapping). */ 2069 len = uvchr_to_utf8(ustrp, uv2) - ustrp; 2070 } 2071 } 2072 2073 if (len) { 2074 if (lenp) { 2075 *lenp = len; 2076 } 2077 return valid_utf8_to_uvchr(ustrp, 0); 2078 } 2079 2080 /* Here, there was no mapping defined, which means that the code point maps 2081 * to itself. Return the inputs */ 2082 len = UTF8SKIP(p); 2083 if (p != ustrp) { /* Don't copy onto itself */ 2084 Copy(p, ustrp, len, U8); 2085 } 2086 2087 if (lenp) 2088 *lenp = len; 2089 2090 return uv1; 2091 2092 } 2093 2094 STATIC UV 2095 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) 2096 { 2097 /* This is called when changing the case of a utf8-encoded character above 2098 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the 2099 * result contains a character that crosses the 255/256 boundary, disallow 2100 * the change, and return the original code point. See L<perlfunc/lc> for 2101 * why; 2102 * 2103 * p points to the original string whose case was changed; assumed 2104 * by this routine to be well-formed 2105 * result the code point of the first character in the changed-case string 2106 * ustrp points to the changed-case string (<result> represents its first char) 2107 * lenp points to the length of <ustrp> */ 2108 2109 UV original; /* To store the first code point of <p> */ 2110 2111 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING; 2112 2113 assert(UTF8_IS_ABOVE_LATIN1(*p)); 2114 2115 /* We know immediately if the first character in the string crosses the 2116 * boundary, so can skip */ 2117 if (result > 255) { 2118 2119 /* Look at every character in the result; if any cross the 2120 * boundary, the whole thing is disallowed */ 2121 U8* s = ustrp + UTF8SKIP(ustrp); 2122 U8* e = ustrp + *lenp; 2123 while (s < e) { 2124 if (! UTF8_IS_ABOVE_LATIN1(*s)) { 2125 goto bad_crossing; 2126 } 2127 s += UTF8SKIP(s); 2128 } 2129 2130 /* Here, no characters crossed, result is ok as-is */ 2131 return result; 2132 } 2133 2134 bad_crossing: 2135 2136 /* Failed, have to return the original */ 2137 original = valid_utf8_to_uvchr(p, lenp); 2138 Copy(p, ustrp, *lenp, char); 2139 return original; 2140 } 2141 2142 /* 2143 =for apidoc to_utf8_upper 2144 2145 Instead use L</toUPPER_utf8>. 2146 2147 =cut */ 2148 2149 /* Not currently externally documented, and subject to change: 2150 * <flags> is set iff iff the rules from the current underlying locale are to 2151 * be used. */ 2152 2153 UV 2154 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) 2155 { 2156 dVAR; 2157 2158 UV result; 2159 2160 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; 2161 2162 if (flags && IN_UTF8_CTYPE_LOCALE) { 2163 flags = FALSE; 2164 } 2165 2166 if (UTF8_IS_INVARIANT(*p)) { 2167 if (flags) { 2168 result = toUPPER_LC(*p); 2169 } 2170 else { 2171 return _to_upper_title_latin1(*p, ustrp, lenp, 'S'); 2172 } 2173 } 2174 else if UTF8_IS_DOWNGRADEABLE_START(*p) { 2175 if (flags) { 2176 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); 2177 result = toUPPER_LC(c); 2178 } 2179 else { 2180 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), 2181 ustrp, lenp, 'S'); 2182 } 2183 } 2184 else { /* utf8, ord above 255 */ 2185 result = CALL_UPPER_CASE(p, ustrp, lenp); 2186 2187 if (flags) { 2188 result = check_locale_boundary_crossing(p, result, ustrp, lenp); 2189 } 2190 return result; 2191 } 2192 2193 /* Here, used locale rules. Convert back to utf8 */ 2194 if (UTF8_IS_INVARIANT(result)) { 2195 *ustrp = (U8) result; 2196 *lenp = 1; 2197 } 2198 else { 2199 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); 2200 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); 2201 *lenp = 2; 2202 } 2203 2204 return result; 2205 } 2206 2207 /* 2208 =for apidoc to_utf8_title 2209 2210 Instead use L</toTITLE_utf8>. 2211 2212 =cut */ 2213 2214 /* Not currently externally documented, and subject to change: 2215 * <flags> is set iff the rules from the current underlying locale are to be 2216 * used. Since titlecase is not defined in POSIX, for other than a 2217 * UTF-8 locale, uppercase is used instead for code points < 256. 2218 */ 2219 2220 UV 2221 Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) 2222 { 2223 dVAR; 2224 2225 UV result; 2226 2227 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; 2228 2229 if (flags && IN_UTF8_CTYPE_LOCALE) { 2230 flags = FALSE; 2231 } 2232 2233 if (UTF8_IS_INVARIANT(*p)) { 2234 if (flags) { 2235 result = toUPPER_LC(*p); 2236 } 2237 else { 2238 return _to_upper_title_latin1(*p, ustrp, lenp, 's'); 2239 } 2240 } 2241 else if UTF8_IS_DOWNGRADEABLE_START(*p) { 2242 if (flags) { 2243 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); 2244 result = toUPPER_LC(c); 2245 } 2246 else { 2247 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), 2248 ustrp, lenp, 's'); 2249 } 2250 } 2251 else { /* utf8, ord above 255 */ 2252 result = CALL_TITLE_CASE(p, ustrp, lenp); 2253 2254 if (flags) { 2255 result = check_locale_boundary_crossing(p, result, ustrp, lenp); 2256 } 2257 return result; 2258 } 2259 2260 /* Here, used locale rules. Convert back to utf8 */ 2261 if (UTF8_IS_INVARIANT(result)) { 2262 *ustrp = (U8) result; 2263 *lenp = 1; 2264 } 2265 else { 2266 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); 2267 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); 2268 *lenp = 2; 2269 } 2270 2271 return result; 2272 } 2273 2274 /* 2275 =for apidoc to_utf8_lower 2276 2277 Instead use L</toLOWER_utf8>. 2278 2279 =cut */ 2280 2281 /* Not currently externally documented, and subject to change: 2282 * <flags> is set iff iff the rules from the current underlying locale are to 2283 * be used. 2284 */ 2285 2286 UV 2287 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) 2288 { 2289 UV result; 2290 2291 dVAR; 2292 2293 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; 2294 2295 if (flags && IN_UTF8_CTYPE_LOCALE) { 2296 flags = FALSE; 2297 } 2298 2299 if (UTF8_IS_INVARIANT(*p)) { 2300 if (flags) { 2301 result = toLOWER_LC(*p); 2302 } 2303 else { 2304 return to_lower_latin1(*p, ustrp, lenp); 2305 } 2306 } 2307 else if UTF8_IS_DOWNGRADEABLE_START(*p) { 2308 if (flags) { 2309 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); 2310 result = toLOWER_LC(c); 2311 } 2312 else { 2313 return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), 2314 ustrp, lenp); 2315 } 2316 } 2317 else { /* utf8, ord above 255 */ 2318 result = CALL_LOWER_CASE(p, ustrp, lenp); 2319 2320 if (flags) { 2321 result = check_locale_boundary_crossing(p, result, ustrp, lenp); 2322 } 2323 2324 return result; 2325 } 2326 2327 /* Here, used locale rules. Convert back to utf8 */ 2328 if (UTF8_IS_INVARIANT(result)) { 2329 *ustrp = (U8) result; 2330 *lenp = 1; 2331 } 2332 else { 2333 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); 2334 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); 2335 *lenp = 2; 2336 } 2337 2338 return result; 2339 } 2340 2341 /* 2342 =for apidoc to_utf8_fold 2343 2344 Instead use L</toFOLD_utf8>. 2345 2346 =cut */ 2347 2348 /* Not currently externally documented, and subject to change, 2349 * in <flags> 2350 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 2351 * locale are to be used. 2352 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used; 2353 * otherwise simple folds 2354 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are 2355 * prohibited 2356 */ 2357 2358 UV 2359 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) 2360 { 2361 dVAR; 2362 2363 UV result; 2364 2365 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; 2366 2367 /* These are mutually exclusive */ 2368 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII))); 2369 2370 assert(p != ustrp); /* Otherwise overwrites */ 2371 2372 if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) { 2373 flags &= ~FOLD_FLAGS_LOCALE; 2374 } 2375 2376 if (UTF8_IS_INVARIANT(*p)) { 2377 if (flags & FOLD_FLAGS_LOCALE) { 2378 result = toFOLD_LC(*p); 2379 } 2380 else { 2381 return _to_fold_latin1(*p, ustrp, lenp, 2382 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); 2383 } 2384 } 2385 else if UTF8_IS_DOWNGRADEABLE_START(*p) { 2386 if (flags & FOLD_FLAGS_LOCALE) { 2387 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)); 2388 result = toFOLD_LC(c); 2389 } 2390 else { 2391 return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)), 2392 ustrp, lenp, 2393 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); 2394 } 2395 } 2396 else { /* utf8, ord above 255 */ 2397 result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL); 2398 2399 if (flags & FOLD_FLAGS_LOCALE) { 2400 2401 /* Special case these two characters, as what normally gets 2402 * returned under locale doesn't work */ 2403 if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1 2404 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8, 2405 sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1)) 2406 { 2407 goto return_long_s; 2408 } 2409 else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1 2410 && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8, 2411 sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1)) 2412 { 2413 goto return_ligature_st; 2414 } 2415 return check_locale_boundary_crossing(p, result, ustrp, lenp); 2416 } 2417 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { 2418 return result; 2419 } 2420 else { 2421 /* This is called when changing the case of a utf8-encoded 2422 * character above the ASCII range, and the result should not 2423 * contain an ASCII character. */ 2424 2425 UV original; /* To store the first code point of <p> */ 2426 2427 /* Look at every character in the result; if any cross the 2428 * boundary, the whole thing is disallowed */ 2429 U8* s = ustrp; 2430 U8* e = ustrp + *lenp; 2431 while (s < e) { 2432 if (isASCII(*s)) { 2433 /* Crossed, have to return the original */ 2434 original = valid_utf8_to_uvchr(p, lenp); 2435 2436 /* But in these instances, there is an alternative we can 2437 * return that is valid */ 2438 if (original == LATIN_CAPITAL_LETTER_SHARP_S 2439 || original == LATIN_SMALL_LETTER_SHARP_S) 2440 { 2441 goto return_long_s; 2442 } 2443 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { 2444 goto return_ligature_st; 2445 } 2446 Copy(p, ustrp, *lenp, char); 2447 return original; 2448 } 2449 s += UTF8SKIP(s); 2450 } 2451 2452 /* Here, no characters crossed, result is ok as-is */ 2453 return result; 2454 } 2455 } 2456 2457 /* Here, used locale rules. Convert back to utf8 */ 2458 if (UTF8_IS_INVARIANT(result)) { 2459 *ustrp = (U8) result; 2460 *lenp = 1; 2461 } 2462 else { 2463 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); 2464 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); 2465 *lenp = 2; 2466 } 2467 2468 return result; 2469 2470 return_long_s: 2471 /* Certain folds to 'ss' are prohibited by the options, but they do allow 2472 * folds to a string of two of these characters. By returning this 2473 * instead, then, e.g., 2474 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") 2475 * works. */ 2476 2477 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; 2478 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 2479 ustrp, *lenp, U8); 2480 return LATIN_SMALL_LETTER_LONG_S; 2481 2482 return_ligature_st: 2483 /* Two folds to 'st' are prohibited by the options; instead we pick one and 2484 * have the other one fold to it */ 2485 2486 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1; 2487 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); 2488 return LATIN_SMALL_LIGATURE_ST; 2489 } 2490 2491 /* Note: 2492 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch(). 2493 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8". 2494 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl. 2495 */ 2496 2497 SV* 2498 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none) 2499 { 2500 PERL_ARGS_ASSERT_SWASH_INIT; 2501 2502 /* Returns a copy of a swash initiated by the called function. This is the 2503 * public interface, and returning a copy prevents others from doing 2504 * mischief on the original */ 2505 2506 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL)); 2507 } 2508 2509 SV* 2510 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p) 2511 { 2512 2513 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST 2514 * use the following define */ 2515 2516 #define CORE_SWASH_INIT_RETURN(x) \ 2517 PL_curpm= old_PL_curpm; \ 2518 return x 2519 2520 /* Initialize and return a swash, creating it if necessary. It does this 2521 * by calling utf8_heavy.pl in the general case. The returned value may be 2522 * the swash's inversion list instead if the input parameters allow it. 2523 * Which is returned should be immaterial to callers, as the only 2524 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(), 2525 * and swash_to_invlist() handle both these transparently. 2526 * 2527 * This interface should only be used by functions that won't destroy or 2528 * adversely change the swash, as doing so affects all other uses of the 2529 * swash in the program; the general public should use 'Perl_swash_init' 2530 * instead. 2531 * 2532 * pkg is the name of the package that <name> should be in. 2533 * name is the name of the swash to find. Typically it is a Unicode 2534 * property name, including user-defined ones 2535 * listsv is a string to initialize the swash with. It must be of the form 2536 * documented as the subroutine return value in 2537 * L<perlunicode/User-Defined Character Properties> 2538 * minbits is the number of bits required to represent each data element. 2539 * It is '1' for binary properties. 2540 * none I (khw) do not understand this one, but it is used only in tr///. 2541 * invlist is an inversion list to initialize the swash with (or NULL) 2542 * flags_p if non-NULL is the address of various input and output flag bits 2543 * to the routine, as follows: ('I' means is input to the routine; 2544 * 'O' means output from the routine. Only flags marked O are 2545 * meaningful on return.) 2546 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash 2547 * came from a user-defined property. (I O) 2548 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking 2549 * when the swash cannot be located, to simply return NULL. (I) 2550 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a 2551 * return of an inversion list instead of a swash hash if this routine 2552 * thinks that would result in faster execution of swash_fetch() later 2553 * on. (I) 2554 * 2555 * Thus there are three possible inputs to find the swash: <name>, 2556 * <listsv>, and <invlist>. At least one must be specified. The result 2557 * will be the union of the specified ones, although <listsv>'s various 2558 * actions can intersect, etc. what <name> gives. To avoid going out to 2559 * disk at all, <invlist> should specify completely what the swash should 2560 * have, and <listsv> should be &PL_sv_undef and <name> should be "". 2561 * 2562 * <invlist> is only valid for binary properties */ 2563 2564 dVAR; 2565 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */ 2566 2567 SV* retval = &PL_sv_undef; 2568 HV* swash_hv = NULL; 2569 const int invlist_swash_boundary = 2570 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST) 2571 ? 512 /* Based on some benchmarking, but not extensive, see commit 2572 message */ 2573 : -1; /* Never return just an inversion list */ 2574 2575 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist); 2576 assert(! invlist || minbits == 1); 2577 2578 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex 2579 that triggered the swash init and the swash init perl logic itself. 2580 See perl #122747 */ 2581 2582 /* If data was passed in to go out to utf8_heavy to find the swash of, do 2583 * so */ 2584 if (listsv != &PL_sv_undef || strNE(name, "")) { 2585 dSP; 2586 const size_t pkg_len = strlen(pkg); 2587 const size_t name_len = strlen(name); 2588 HV * const stash = gv_stashpvn(pkg, pkg_len, 0); 2589 SV* errsv_save; 2590 GV *method; 2591 2592 PERL_ARGS_ASSERT__CORE_SWASH_INIT; 2593 2594 PUSHSTACKi(PERLSI_MAGIC); 2595 ENTER; 2596 SAVEHINTS(); 2597 save_re_context(); 2598 /* We might get here via a subroutine signature which uses a utf8 2599 * parameter name, at which point PL_subname will have been set 2600 * but not yet used. */ 2601 save_item(PL_subname); 2602 if (PL_parser && PL_parser->error_count) 2603 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0; 2604 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1); 2605 if (!method) { /* demand load utf8 */ 2606 ENTER; 2607 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); 2608 GvSV(PL_errgv) = NULL; 2609 /* It is assumed that callers of this routine are not passing in 2610 * any user derived data. */ 2611 /* Need to do this after save_re_context() as it will set 2612 * PL_tainted to 1 while saving $1 etc (see the code after getrx: 2613 * in Perl_magic_get). Even line to create errsv_save can turn on 2614 * PL_tainted. */ 2615 #ifndef NO_TAINT_SUPPORT 2616 SAVEBOOL(TAINT_get); 2617 TAINT_NOT; 2618 #endif 2619 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), 2620 NULL); 2621 { 2622 /* Not ERRSV, as there is no need to vivify a scalar we are 2623 about to discard. */ 2624 SV * const errsv = GvSV(PL_errgv); 2625 if (!SvTRUE(errsv)) { 2626 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); 2627 SvREFCNT_dec(errsv); 2628 } 2629 } 2630 LEAVE; 2631 } 2632 SPAGAIN; 2633 PUSHMARK(SP); 2634 EXTEND(SP,5); 2635 mPUSHp(pkg, pkg_len); 2636 mPUSHp(name, name_len); 2637 PUSHs(listsv); 2638 mPUSHi(minbits); 2639 mPUSHi(none); 2640 PUTBACK; 2641 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); 2642 GvSV(PL_errgv) = NULL; 2643 /* If we already have a pointer to the method, no need to use 2644 * call_method() to repeat the lookup. */ 2645 if (method 2646 ? call_sv(MUTABLE_SV(method), G_SCALAR) 2647 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD)) 2648 { 2649 retval = *PL_stack_sp--; 2650 SvREFCNT_inc(retval); 2651 } 2652 { 2653 /* Not ERRSV. See above. */ 2654 SV * const errsv = GvSV(PL_errgv); 2655 if (!SvTRUE(errsv)) { 2656 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); 2657 SvREFCNT_dec(errsv); 2658 } 2659 } 2660 LEAVE; 2661 POPSTACK; 2662 if (IN_PERL_COMPILETIME) { 2663 CopHINTS_set(PL_curcop, PL_hints); 2664 } 2665 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { 2666 if (SvPOK(retval)) 2667 2668 /* If caller wants to handle missing properties, let them */ 2669 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) { 2670 CORE_SWASH_INIT_RETURN(NULL); 2671 } 2672 Perl_croak(aTHX_ 2673 "Can't find Unicode property definition \"%"SVf"\"", 2674 SVfARG(retval)); 2675 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); 2676 } 2677 } /* End of calling the module to find the swash */ 2678 2679 /* If this operation fetched a swash, and we will need it later, get it */ 2680 if (retval != &PL_sv_undef 2681 && (minbits == 1 || (flags_p 2682 && ! (*flags_p 2683 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)))) 2684 { 2685 swash_hv = MUTABLE_HV(SvRV(retval)); 2686 2687 /* If we don't already know that there is a user-defined component to 2688 * this swash, and the user has indicated they wish to know if there is 2689 * one (by passing <flags_p>), find out */ 2690 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) { 2691 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE); 2692 if (user_defined && SvUV(*user_defined)) { 2693 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; 2694 } 2695 } 2696 } 2697 2698 /* Make sure there is an inversion list for binary properties */ 2699 if (minbits == 1) { 2700 SV** swash_invlistsvp = NULL; 2701 SV* swash_invlist = NULL; 2702 bool invlist_in_swash_is_valid = FALSE; 2703 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has 2704 an unclaimed reference count */ 2705 2706 /* If this operation fetched a swash, get its already existing 2707 * inversion list, or create one for it */ 2708 2709 if (swash_hv) { 2710 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE); 2711 if (swash_invlistsvp) { 2712 swash_invlist = *swash_invlistsvp; 2713 invlist_in_swash_is_valid = TRUE; 2714 } 2715 else { 2716 swash_invlist = _swash_to_invlist(retval); 2717 swash_invlist_unclaimed = TRUE; 2718 } 2719 } 2720 2721 /* If an inversion list was passed in, have to include it */ 2722 if (invlist) { 2723 2724 /* Any fetched swash will by now have an inversion list in it; 2725 * otherwise <swash_invlist> will be NULL, indicating that we 2726 * didn't fetch a swash */ 2727 if (swash_invlist) { 2728 2729 /* Add the passed-in inversion list, which invalidates the one 2730 * already stored in the swash */ 2731 invlist_in_swash_is_valid = FALSE; 2732 _invlist_union(invlist, swash_invlist, &swash_invlist); 2733 } 2734 else { 2735 2736 /* Here, there is no swash already. Set up a minimal one, if 2737 * we are going to return a swash */ 2738 if ((int) _invlist_len(invlist) > invlist_swash_boundary) { 2739 swash_hv = newHV(); 2740 retval = newRV_noinc(MUTABLE_SV(swash_hv)); 2741 } 2742 swash_invlist = invlist; 2743 } 2744 } 2745 2746 /* Here, we have computed the union of all the passed-in data. It may 2747 * be that there was an inversion list in the swash which didn't get 2748 * touched; otherwise save the computed one */ 2749 if (! invlist_in_swash_is_valid 2750 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary) 2751 { 2752 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist)) 2753 { 2754 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 2755 } 2756 /* We just stole a reference count. */ 2757 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE; 2758 else SvREFCNT_inc_simple_void_NN(swash_invlist); 2759 } 2760 2761 SvREADONLY_on(swash_invlist); 2762 2763 /* Use the inversion list stand-alone if small enough */ 2764 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { 2765 SvREFCNT_dec(retval); 2766 if (!swash_invlist_unclaimed) 2767 SvREFCNT_inc_simple_void_NN(swash_invlist); 2768 retval = newRV_noinc(swash_invlist); 2769 } 2770 } 2771 2772 CORE_SWASH_INIT_RETURN(retval); 2773 #undef CORE_SWASH_INIT_RETURN 2774 } 2775 2776 2777 /* This API is wrong for special case conversions since we may need to 2778 * return several Unicode characters for a single Unicode character 2779 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is 2780 * the lower-level routine, and it is similarly broken for returning 2781 * multiple values. --jhi 2782 * For those, you should use to_utf8_case() instead */ 2783 /* Now SWASHGET is recasted into S_swatch_get in this file. */ 2784 2785 /* Note: 2786 * Returns the value of property/mapping C<swash> for the first character 2787 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is 2788 * assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr> 2789 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>. 2790 * 2791 * A "swash" is a hash which contains initially the keys/values set up by 2792 * SWASHNEW. The purpose is to be able to completely represent a Unicode 2793 * property for all possible code points. Things are stored in a compact form 2794 * (see utf8_heavy.pl) so that calculation is required to find the actual 2795 * property value for a given code point. As code points are looked up, new 2796 * key/value pairs are added to the hash, so that the calculation doesn't have 2797 * to ever be re-done. Further, each calculation is done, not just for the 2798 * desired one, but for a whole block of code points adjacent to that one. 2799 * For binary properties on ASCII machines, the block is usually for 64 code 2800 * points, starting with a code point evenly divisible by 64. Thus if the 2801 * property value for code point 257 is requested, the code goes out and 2802 * calculates the property values for all 64 code points between 256 and 319, 2803 * and stores these as a single 64-bit long bit vector, called a "swatch", 2804 * under the key for code point 256. The key is the UTF-8 encoding for code 2805 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding 2806 * for a code point is 13 bytes, the key will be 12 bytes long. If the value 2807 * for code point 258 is then requested, this code realizes that it would be 2808 * stored under the key for 256, and would find that value and extract the 2809 * relevant bit, offset from 256. 2810 * 2811 * Non-binary properties are stored in as many bits as necessary to represent 2812 * their values (32 currently, though the code is more general than that), not 2813 * as single bits, but the principal is the same: the value for each key is a 2814 * vector that encompasses the property values for all code points whose UTF-8 2815 * representations are represented by the key. That is, for all code points 2816 * whose UTF-8 representations are length N bytes, and the key is the first N-1 2817 * bytes of that. 2818 */ 2819 UV 2820 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) 2821 { 2822 dVAR; 2823 HV *const hv = MUTABLE_HV(SvRV(swash)); 2824 U32 klen; 2825 U32 off; 2826 STRLEN slen = 0; 2827 STRLEN needents; 2828 const U8 *tmps = NULL; 2829 U32 bit; 2830 SV *swatch; 2831 const U8 c = *ptr; 2832 2833 PERL_ARGS_ASSERT_SWASH_FETCH; 2834 2835 /* If it really isn't a hash, it isn't really swash; must be an inversion 2836 * list */ 2837 if (SvTYPE(hv) != SVt_PVHV) { 2838 return _invlist_contains_cp((SV*)hv, 2839 (do_utf8) 2840 ? valid_utf8_to_uvchr(ptr, NULL) 2841 : c); 2842 } 2843 2844 /* We store the values in a "swatch" which is a vec() value in a swash 2845 * hash. Code points 0-255 are a single vec() stored with key length 2846 * (klen) 0. All other code points have a UTF-8 representation 2847 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which 2848 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key 2849 * length for them is the length of the encoded char - 1. ptr[klen] is the 2850 * final byte in the sequence representing the character */ 2851 if (!do_utf8 || UTF8_IS_INVARIANT(c)) { 2852 klen = 0; 2853 needents = 256; 2854 off = c; 2855 } 2856 else if (UTF8_IS_DOWNGRADEABLE_START(c)) { 2857 klen = 0; 2858 needents = 256; 2859 off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1)); 2860 } 2861 else { 2862 klen = UTF8SKIP(ptr) - 1; 2863 2864 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into 2865 * the vec is the final byte in the sequence. (In EBCDIC this is 2866 * converted to I8 to get consecutive values.) To help you visualize 2867 * all this: 2868 * Straight 1047 After final byte 2869 * UTF-8 UTF-EBCDIC I8 transform 2870 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0 2871 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1 2872 * ... 2873 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9 2874 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA 2875 * ... 2876 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2 2877 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3 2878 * ... 2879 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB 2880 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC 2881 * ... 2882 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF 2883 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41 2884 * 2885 * (There are no discontinuities in the elided (...) entries.) 2886 * The UTF-8 key for these 33 code points is '\xD0' (which also is the 2887 * key for the next 31, up through U+043F, whose UTF-8 final byte is 2888 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points. 2889 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an 2890 * index into the vec() swatch (after subtracting 0x80, which we 2891 * actually do with an '&'). 2892 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32 2893 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has 2894 * dicontinuities which go away by transforming it into I8, and we 2895 * effectively subtract 0xA0 to get the index. */ 2896 needents = (1 << UTF_ACCUMULATION_SHIFT); 2897 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK; 2898 } 2899 2900 /* 2901 * This single-entry cache saves about 1/3 of the utf8 overhead in test 2902 * suite. (That is, only 7-8% overall over just a hash cache. Still, 2903 * it's nothing to sniff at.) Pity we usually come through at least 2904 * two function calls to get here... 2905 * 2906 * NB: this code assumes that swatches are never modified, once generated! 2907 */ 2908 2909 if (hv == PL_last_swash_hv && 2910 klen == PL_last_swash_klen && 2911 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) ) 2912 { 2913 tmps = PL_last_swash_tmps; 2914 slen = PL_last_swash_slen; 2915 } 2916 else { 2917 /* Try our second-level swatch cache, kept in a hash. */ 2918 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE); 2919 2920 /* If not cached, generate it via swatch_get */ 2921 if (!svp || !SvPOK(*svp) 2922 || !(tmps = (const U8*)SvPV_const(*svp, slen))) 2923 { 2924 if (klen) { 2925 const UV code_point = valid_utf8_to_uvchr(ptr, NULL); 2926 swatch = swatch_get(swash, 2927 code_point & ~((UV)needents - 1), 2928 needents); 2929 } 2930 else { /* For the first 256 code points, the swatch has a key of 2931 length 0 */ 2932 swatch = swatch_get(swash, 0, needents); 2933 } 2934 2935 if (IN_PERL_COMPILETIME) 2936 CopHINTS_set(PL_curcop, PL_hints); 2937 2938 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0); 2939 2940 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) 2941 || (slen << 3) < needents) 2942 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, " 2943 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf, 2944 svp, tmps, (UV)slen, (UV)needents); 2945 } 2946 2947 PL_last_swash_hv = hv; 2948 assert(klen <= sizeof(PL_last_swash_key)); 2949 PL_last_swash_klen = (U8)klen; 2950 /* FIXME change interpvar.h? */ 2951 PL_last_swash_tmps = (U8 *) tmps; 2952 PL_last_swash_slen = slen; 2953 if (klen) 2954 Copy(ptr, PL_last_swash_key, klen, U8); 2955 } 2956 2957 switch ((int)((slen << 3) / needents)) { 2958 case 1: 2959 bit = 1 << (off & 7); 2960 off >>= 3; 2961 return (tmps[off] & bit) != 0; 2962 case 8: 2963 return tmps[off]; 2964 case 16: 2965 off <<= 1; 2966 return (tmps[off] << 8) + tmps[off + 1] ; 2967 case 32: 2968 off <<= 2; 2969 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; 2970 } 2971 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, " 2972 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents); 2973 NORETURN_FUNCTION_END; 2974 } 2975 2976 /* Read a single line of the main body of the swash input text. These are of 2977 * the form: 2978 * 0053 0056 0073 2979 * where each number is hex. The first two numbers form the minimum and 2980 * maximum of a range, and the third is the value associated with the range. 2981 * Not all swashes should have a third number 2982 * 2983 * On input: l points to the beginning of the line to be examined; it points 2984 * to somewhere in the string of the whole input text, and is 2985 * terminated by a \n or the null string terminator. 2986 * lend points to the null terminator of that string 2987 * wants_value is non-zero if the swash expects a third number 2988 * typestr is the name of the swash's mapping, like 'ToLower' 2989 * On output: *min, *max, and *val are set to the values read from the line. 2990 * returns a pointer just beyond the line examined. If there was no 2991 * valid min number on the line, returns lend+1 2992 */ 2993 2994 STATIC U8* 2995 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, 2996 const bool wants_value, const U8* const typestr) 2997 { 2998 const int typeto = typestr[0] == 'T' && typestr[1] == 'o'; 2999 STRLEN numlen; /* Length of the number */ 3000 I32 flags = PERL_SCAN_SILENT_ILLDIGIT 3001 | PERL_SCAN_DISALLOW_PREFIX 3002 | PERL_SCAN_SILENT_NON_PORTABLE; 3003 3004 /* nl points to the next \n in the scan */ 3005 U8* const nl = (U8*)memchr(l, '\n', lend - l); 3006 3007 /* Get the first number on the line: the range minimum */ 3008 numlen = lend - l; 3009 *min = grok_hex((char *)l, &numlen, &flags, NULL); 3010 if (numlen) /* If found a hex number, position past it */ 3011 l += numlen; 3012 else if (nl) { /* Else, go handle next line, if any */ 3013 return nl + 1; /* 1 is length of "\n" */ 3014 } 3015 else { /* Else, no next line */ 3016 return lend + 1; /* to LIST's end at which \n is not found */ 3017 } 3018 3019 /* The max range value follows, separated by a BLANK */ 3020 if (isBLANK(*l)) { 3021 ++l; 3022 flags = PERL_SCAN_SILENT_ILLDIGIT 3023 | PERL_SCAN_DISALLOW_PREFIX 3024 | PERL_SCAN_SILENT_NON_PORTABLE; 3025 numlen = lend - l; 3026 *max = grok_hex((char *)l, &numlen, &flags, NULL); 3027 if (numlen) 3028 l += numlen; 3029 else /* If no value here, it is a single element range */ 3030 *max = *min; 3031 3032 /* Non-binary tables have a third entry: what the first element of the 3033 * range maps to. The map for those currently read here is in hex */ 3034 if (wants_value) { 3035 if (isBLANK(*l)) { 3036 ++l; 3037 flags = PERL_SCAN_SILENT_ILLDIGIT 3038 | PERL_SCAN_DISALLOW_PREFIX 3039 | PERL_SCAN_SILENT_NON_PORTABLE; 3040 numlen = lend - l; 3041 *val = grok_hex((char *)l, &numlen, &flags, NULL); 3042 if (numlen) 3043 l += numlen; 3044 else 3045 *val = 0; 3046 } 3047 else { 3048 *val = 0; 3049 if (typeto) { 3050 /* diag_listed_as: To%s: illegal mapping '%s' */ 3051 Perl_croak(aTHX_ "%s: illegal mapping '%s'", 3052 typestr, l); 3053 } 3054 } 3055 } 3056 else 3057 *val = 0; /* bits == 1, then any val should be ignored */ 3058 } 3059 else { /* Nothing following range min, should be single element with no 3060 mapping expected */ 3061 *max = *min; 3062 if (wants_value) { 3063 *val = 0; 3064 if (typeto) { 3065 /* diag_listed_as: To%s: illegal mapping '%s' */ 3066 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l); 3067 } 3068 } 3069 else 3070 *val = 0; /* bits == 1, then val should be ignored */ 3071 } 3072 3073 /* Position to next line if any, or EOF */ 3074 if (nl) 3075 l = nl + 1; 3076 else 3077 l = lend; 3078 3079 return l; 3080 } 3081 3082 /* Note: 3083 * Returns a swatch (a bit vector string) for a code point sequence 3084 * that starts from the value C<start> and comprises the number C<span>. 3085 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl). 3086 * Should be used via swash_fetch, which will cache the swatch in C<swash>. 3087 */ 3088 STATIC SV* 3089 S_swatch_get(pTHX_ SV* swash, UV start, UV span) 3090 { 3091 SV *swatch; 3092 U8 *l, *lend, *x, *xend, *s, *send; 3093 STRLEN lcur, xcur, scur; 3094 HV *const hv = MUTABLE_HV(SvRV(swash)); 3095 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE); 3096 3097 SV** listsvp = NULL; /* The string containing the main body of the table */ 3098 SV** extssvp = NULL; 3099 SV** invert_it_svp = NULL; 3100 U8* typestr = NULL; 3101 STRLEN bits; 3102 STRLEN octets; /* if bits == 1, then octets == 0 */ 3103 UV none; 3104 UV end = start + span; 3105 3106 if (invlistsvp == NULL) { 3107 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); 3108 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); 3109 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); 3110 extssvp = hv_fetchs(hv, "EXTRAS", FALSE); 3111 listsvp = hv_fetchs(hv, "LIST", FALSE); 3112 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); 3113 3114 bits = SvUV(*bitssvp); 3115 none = SvUV(*nonesvp); 3116 typestr = (U8*)SvPV_nolen(*typesvp); 3117 } 3118 else { 3119 bits = 1; 3120 none = 0; 3121 } 3122 octets = bits >> 3; /* if bits == 1, then octets == 0 */ 3123 3124 PERL_ARGS_ASSERT_SWATCH_GET; 3125 3126 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { 3127 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf, 3128 (UV)bits); 3129 } 3130 3131 /* If overflowed, use the max possible */ 3132 if (end < start) { 3133 end = UV_MAX; 3134 span = end - start; 3135 } 3136 3137 /* create and initialize $swatch */ 3138 scur = octets ? (span * octets) : (span + 7) / 8; 3139 swatch = newSV(scur); 3140 SvPOK_on(swatch); 3141 s = (U8*)SvPVX(swatch); 3142 if (octets && none) { 3143 const U8* const e = s + scur; 3144 while (s < e) { 3145 if (bits == 8) 3146 *s++ = (U8)(none & 0xff); 3147 else if (bits == 16) { 3148 *s++ = (U8)((none >> 8) & 0xff); 3149 *s++ = (U8)( none & 0xff); 3150 } 3151 else if (bits == 32) { 3152 *s++ = (U8)((none >> 24) & 0xff); 3153 *s++ = (U8)((none >> 16) & 0xff); 3154 *s++ = (U8)((none >> 8) & 0xff); 3155 *s++ = (U8)( none & 0xff); 3156 } 3157 } 3158 *s = '\0'; 3159 } 3160 else { 3161 (void)memzero((U8*)s, scur + 1); 3162 } 3163 SvCUR_set(swatch, scur); 3164 s = (U8*)SvPVX(swatch); 3165 3166 if (invlistsvp) { /* If has an inversion list set up use that */ 3167 _invlist_populate_swatch(*invlistsvp, start, end, s); 3168 return swatch; 3169 } 3170 3171 /* read $swash->{LIST} */ 3172 l = (U8*)SvPV(*listsvp, lcur); 3173 lend = l + lcur; 3174 while (l < lend) { 3175 UV min, max, val, upper; 3176 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val, 3177 cBOOL(octets), typestr); 3178 if (l > lend) { 3179 break; 3180 } 3181 3182 /* If looking for something beyond this range, go try the next one */ 3183 if (max < start) 3184 continue; 3185 3186 /* <end> is generally 1 beyond where we want to set things, but at the 3187 * platform's infinity, where we can't go any higher, we want to 3188 * include the code point at <end> */ 3189 upper = (max < end) 3190 ? max 3191 : (max != UV_MAX || end != UV_MAX) 3192 ? end - 1 3193 : end; 3194 3195 if (octets) { 3196 UV key; 3197 if (min < start) { 3198 if (!none || val < none) { 3199 val += start - min; 3200 } 3201 min = start; 3202 } 3203 for (key = min; key <= upper; key++) { 3204 STRLEN offset; 3205 /* offset must be non-negative (start <= min <= key < end) */ 3206 offset = octets * (key - start); 3207 if (bits == 8) 3208 s[offset] = (U8)(val & 0xff); 3209 else if (bits == 16) { 3210 s[offset ] = (U8)((val >> 8) & 0xff); 3211 s[offset + 1] = (U8)( val & 0xff); 3212 } 3213 else if (bits == 32) { 3214 s[offset ] = (U8)((val >> 24) & 0xff); 3215 s[offset + 1] = (U8)((val >> 16) & 0xff); 3216 s[offset + 2] = (U8)((val >> 8) & 0xff); 3217 s[offset + 3] = (U8)( val & 0xff); 3218 } 3219 3220 if (!none || val < none) 3221 ++val; 3222 } 3223 } 3224 else { /* bits == 1, then val should be ignored */ 3225 UV key; 3226 if (min < start) 3227 min = start; 3228 3229 for (key = min; key <= upper; key++) { 3230 const STRLEN offset = (STRLEN)(key - start); 3231 s[offset >> 3] |= 1 << (offset & 7); 3232 } 3233 } 3234 } /* while */ 3235 3236 /* Invert if the data says it should be. Assumes that bits == 1 */ 3237 if (invert_it_svp && SvUV(*invert_it_svp)) { 3238 3239 /* Unicode properties should come with all bits above PERL_UNICODE_MAX 3240 * be 0, and their inversion should also be 0, as we don't succeed any 3241 * Unicode property matches for non-Unicode code points */ 3242 if (start <= PERL_UNICODE_MAX) { 3243 3244 /* The code below assumes that we never cross the 3245 * Unicode/above-Unicode boundary in a range, as otherwise we would 3246 * have to figure out where to stop flipping the bits. Since this 3247 * boundary is divisible by a large power of 2, and swatches comes 3248 * in small powers of 2, this should be a valid assumption */ 3249 assert(start + span - 1 <= PERL_UNICODE_MAX); 3250 3251 send = s + scur; 3252 while (s < send) { 3253 *s = ~(*s); 3254 s++; 3255 } 3256 } 3257 } 3258 3259 /* read $swash->{EXTRAS} 3260 * This code also copied to swash_to_invlist() below */ 3261 x = (U8*)SvPV(*extssvp, xcur); 3262 xend = x + xcur; 3263 while (x < xend) { 3264 STRLEN namelen; 3265 U8 *namestr; 3266 SV** othersvp; 3267 HV* otherhv; 3268 STRLEN otherbits; 3269 SV **otherbitssvp, *other; 3270 U8 *s, *o, *nl; 3271 STRLEN slen, olen; 3272 3273 const U8 opc = *x++; 3274 if (opc == '\n') 3275 continue; 3276 3277 nl = (U8*)memchr(x, '\n', xend - x); 3278 3279 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { 3280 if (nl) { 3281 x = nl + 1; /* 1 is length of "\n" */ 3282 continue; 3283 } 3284 else { 3285 x = xend; /* to EXTRAS' end at which \n is not found */ 3286 break; 3287 } 3288 } 3289 3290 namestr = x; 3291 if (nl) { 3292 namelen = nl - namestr; 3293 x = nl + 1; 3294 } 3295 else { 3296 namelen = xend - namestr; 3297 x = xend; 3298 } 3299 3300 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); 3301 otherhv = MUTABLE_HV(SvRV(*othersvp)); 3302 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); 3303 otherbits = (STRLEN)SvUV(*otherbitssvp); 3304 if (bits < otherbits) 3305 Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, " 3306 "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits); 3307 3308 /* The "other" swatch must be destroyed after. */ 3309 other = swatch_get(*othersvp, start, span); 3310 o = (U8*)SvPV(other, olen); 3311 3312 if (!olen) 3313 Perl_croak(aTHX_ "panic: swatch_get got improper swatch"); 3314 3315 s = (U8*)SvPV(swatch, slen); 3316 if (bits == 1 && otherbits == 1) { 3317 if (slen != olen) 3318 Perl_croak(aTHX_ "panic: swatch_get found swatch length " 3319 "mismatch, slen=%"UVuf", olen=%"UVuf, 3320 (UV)slen, (UV)olen); 3321 3322 switch (opc) { 3323 case '+': 3324 while (slen--) 3325 *s++ |= *o++; 3326 break; 3327 case '!': 3328 while (slen--) 3329 *s++ |= ~*o++; 3330 break; 3331 case '-': 3332 while (slen--) 3333 *s++ &= ~*o++; 3334 break; 3335 case '&': 3336 while (slen--) 3337 *s++ &= *o++; 3338 break; 3339 default: 3340 break; 3341 } 3342 } 3343 else { 3344 STRLEN otheroctets = otherbits >> 3; 3345 STRLEN offset = 0; 3346 U8* const send = s + slen; 3347 3348 while (s < send) { 3349 UV otherval = 0; 3350 3351 if (otherbits == 1) { 3352 otherval = (o[offset >> 3] >> (offset & 7)) & 1; 3353 ++offset; 3354 } 3355 else { 3356 STRLEN vlen = otheroctets; 3357 otherval = *o++; 3358 while (--vlen) { 3359 otherval <<= 8; 3360 otherval |= *o++; 3361 } 3362 } 3363 3364 if (opc == '+' && otherval) 3365 NOOP; /* replace with otherval */ 3366 else if (opc == '!' && !otherval) 3367 otherval = 1; 3368 else if (opc == '-' && otherval) 3369 otherval = 0; 3370 else if (opc == '&' && !otherval) 3371 otherval = 0; 3372 else { 3373 s += octets; /* no replacement */ 3374 continue; 3375 } 3376 3377 if (bits == 8) 3378 *s++ = (U8)( otherval & 0xff); 3379 else if (bits == 16) { 3380 *s++ = (U8)((otherval >> 8) & 0xff); 3381 *s++ = (U8)( otherval & 0xff); 3382 } 3383 else if (bits == 32) { 3384 *s++ = (U8)((otherval >> 24) & 0xff); 3385 *s++ = (U8)((otherval >> 16) & 0xff); 3386 *s++ = (U8)((otherval >> 8) & 0xff); 3387 *s++ = (U8)( otherval & 0xff); 3388 } 3389 } 3390 } 3391 sv_free(other); /* through with it! */ 3392 } /* while */ 3393 return swatch; 3394 } 3395 3396 HV* 3397 Perl__swash_inversion_hash(pTHX_ SV* const swash) 3398 { 3399 3400 /* Subject to change or removal. For use only in regcomp.c and regexec.c 3401 * Can't be used on a property that is subject to user override, as it 3402 * relies on the value of SPECIALS in the swash which would be set by 3403 * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set 3404 * for overridden properties 3405 * 3406 * Returns a hash which is the inversion and closure of a swash mapping. 3407 * For example, consider the input lines: 3408 * 004B 006B 3409 * 004C 006C 3410 * 212A 006B 3411 * 3412 * The returned hash would have two keys, the utf8 for 006B and the utf8 for 3413 * 006C. The value for each key is an array. For 006C, the array would 3414 * have two elements, the utf8 for itself, and for 004C. For 006B, there 3415 * would be three elements in its array, the utf8 for 006B, 004B and 212A. 3416 * 3417 * Note that there are no elements in the hash for 004B, 004C, 212A. The 3418 * keys are only code points that are folded-to, so it isn't a full closure. 3419 * 3420 * Essentially, for any code point, it gives all the code points that map to 3421 * it, or the list of 'froms' for that point. 3422 * 3423 * Currently it ignores any additions or deletions from other swashes, 3424 * looking at just the main body of the swash, and if there are SPECIALS 3425 * in the swash, at that hash 3426 * 3427 * The specials hash can be extra code points, and most likely consists of 3428 * maps from single code points to multiple ones (each expressed as a string 3429 * of utf8 characters). This function currently returns only 1-1 mappings. 3430 * However consider this possible input in the specials hash: 3431 * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074 3432 * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074 3433 * 3434 * Both FB05 and FB06 map to the same multi-char sequence, which we don't 3435 * currently handle. But it also means that FB05 and FB06 are equivalent in 3436 * a 1-1 mapping which we should handle, and this relationship may not be in 3437 * the main table. Therefore this function examines all the multi-char 3438 * sequences and adds the 1-1 mappings that come out of that. */ 3439 3440 U8 *l, *lend; 3441 STRLEN lcur; 3442 HV *const hv = MUTABLE_HV(SvRV(swash)); 3443 3444 /* The string containing the main body of the table. This will have its 3445 * assertion fail if the swash has been converted to its inversion list */ 3446 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); 3447 3448 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); 3449 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); 3450 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); 3451 /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/ 3452 const U8* const typestr = (U8*)SvPV_nolen(*typesvp); 3453 const STRLEN bits = SvUV(*bitssvp); 3454 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ 3455 const UV none = SvUV(*nonesvp); 3456 SV **specials_p = hv_fetchs(hv, "SPECIALS", 0); 3457 3458 HV* ret = newHV(); 3459 3460 PERL_ARGS_ASSERT__SWASH_INVERSION_HASH; 3461 3462 /* Must have at least 8 bits to get the mappings */ 3463 if (bits != 8 && bits != 16 && bits != 32) { 3464 Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf, 3465 (UV)bits); 3466 } 3467 3468 if (specials_p) { /* It might be "special" (sometimes, but not always, a 3469 mapping to more than one character */ 3470 3471 /* Construct an inverse mapping hash for the specials */ 3472 HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p)); 3473 HV * specials_inverse = newHV(); 3474 char *char_from; /* the lhs of the map */ 3475 I32 from_len; /* its byte length */ 3476 char *char_to; /* the rhs of the map */ 3477 I32 to_len; /* its byte length */ 3478 SV *sv_to; /* and in a sv */ 3479 AV* from_list; /* list of things that map to each 'to' */ 3480 3481 hv_iterinit(specials_hv); 3482 3483 /* The keys are the characters (in utf8) that map to the corresponding 3484 * utf8 string value. Iterate through the list creating the inverse 3485 * list. */ 3486 while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) { 3487 SV** listp; 3488 if (! SvPOK(sv_to)) { 3489 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() " 3490 "unexpectedly is not a string, flags=%lu", 3491 (unsigned long)SvFLAGS(sv_to)); 3492 } 3493 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ 3494 3495 /* Each key in the inverse list is a mapped-to value, and the key's 3496 * hash value is a list of the strings (each in utf8) that map to 3497 * it. Those strings are all one character long */ 3498 if ((listp = hv_fetch(specials_inverse, 3499 SvPVX(sv_to), 3500 SvCUR(sv_to), 0))) 3501 { 3502 from_list = (AV*) *listp; 3503 } 3504 else { /* No entry yet for it: create one */ 3505 from_list = newAV(); 3506 if (! hv_store(specials_inverse, 3507 SvPVX(sv_to), 3508 SvCUR(sv_to), 3509 (SV*) from_list, 0)) 3510 { 3511 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 3512 } 3513 } 3514 3515 /* Here have the list associated with this 'to' (perhaps newly 3516 * created and empty). Just add to it. Note that we ASSUME that 3517 * the input is guaranteed to not have duplications, so we don't 3518 * check for that. Duplications just slow down execution time. */ 3519 av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE)); 3520 } 3521 3522 /* Here, 'specials_inverse' contains the inverse mapping. Go through 3523 * it looking for cases like the FB05/FB06 examples above. There would 3524 * be an entry in the hash like 3525 * 'st' => [ FB05, FB06 ] 3526 * In this example we will create two lists that get stored in the 3527 * returned hash, 'ret': 3528 * FB05 => [ FB05, FB06 ] 3529 * FB06 => [ FB05, FB06 ] 3530 * 3531 * Note that there is nothing to do if the array only has one element. 3532 * (In the normal 1-1 case handled below, we don't have to worry about 3533 * two lists, as everything gets tied to the single list that is 3534 * generated for the single character 'to'. But here, we are omitting 3535 * that list, ('st' in the example), so must have multiple lists.) */ 3536 while ((from_list = (AV *) hv_iternextsv(specials_inverse, 3537 &char_to, &to_len))) 3538 { 3539 if (av_tindex(from_list) > 0) { 3540 SSize_t i; 3541 3542 /* We iterate over all combinations of i,j to place each code 3543 * point on each list */ 3544 for (i = 0; i <= av_tindex(from_list); i++) { 3545 SSize_t j; 3546 AV* i_list = newAV(); 3547 SV** entryp = av_fetch(from_list, i, FALSE); 3548 if (entryp == NULL) { 3549 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); 3550 } 3551 if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) { 3552 Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp)); 3553 } 3554 if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp), 3555 (SV*) i_list, FALSE)) 3556 { 3557 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 3558 } 3559 3560 /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ 3561 for (j = 0; j <= av_tindex(from_list); j++) { 3562 entryp = av_fetch(from_list, j, FALSE); 3563 if (entryp == NULL) { 3564 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); 3565 } 3566 3567 /* When i==j this adds itself to the list */ 3568 av_push(i_list, newSVuv(utf8_to_uvchr_buf( 3569 (U8*) SvPVX(*entryp), 3570 (U8*) SvPVX(*entryp) + SvCUR(*entryp), 3571 0))); 3572 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ 3573 } 3574 } 3575 } 3576 } 3577 SvREFCNT_dec(specials_inverse); /* done with it */ 3578 } /* End of specials */ 3579 3580 /* read $swash->{LIST} */ 3581 l = (U8*)SvPV(*listsvp, lcur); 3582 lend = l + lcur; 3583 3584 /* Go through each input line */ 3585 while (l < lend) { 3586 UV min, max, val; 3587 UV inverse; 3588 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val, 3589 cBOOL(octets), typestr); 3590 if (l > lend) { 3591 break; 3592 } 3593 3594 /* Each element in the range is to be inverted */ 3595 for (inverse = min; inverse <= max; inverse++) { 3596 AV* list; 3597 SV** listp; 3598 IV i; 3599 bool found_key = FALSE; 3600 bool found_inverse = FALSE; 3601 3602 /* The key is the inverse mapping */ 3603 char key[UTF8_MAXBYTES+1]; 3604 char* key_end = (char *) uvchr_to_utf8((U8*) key, val); 3605 STRLEN key_len = key_end - key; 3606 3607 /* Get the list for the map */ 3608 if ((listp = hv_fetch(ret, key, key_len, FALSE))) { 3609 list = (AV*) *listp; 3610 } 3611 else { /* No entry yet for it: create one */ 3612 list = newAV(); 3613 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) { 3614 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 3615 } 3616 } 3617 3618 /* Look through list to see if this inverse mapping already is 3619 * listed, or if there is a mapping to itself already */ 3620 for (i = 0; i <= av_tindex(list); i++) { 3621 SV** entryp = av_fetch(list, i, FALSE); 3622 SV* entry; 3623 if (entryp == NULL) { 3624 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); 3625 } 3626 entry = *entryp; 3627 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/ 3628 if (SvUV(entry) == val) { 3629 found_key = TRUE; 3630 } 3631 if (SvUV(entry) == inverse) { 3632 found_inverse = TRUE; 3633 } 3634 3635 /* No need to continue searching if found everything we are 3636 * looking for */ 3637 if (found_key && found_inverse) { 3638 break; 3639 } 3640 } 3641 3642 /* Make sure there is a mapping to itself on the list */ 3643 if (! found_key) { 3644 av_push(list, newSVuv(val)); 3645 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/ 3646 } 3647 3648 3649 /* Simply add the value to the list */ 3650 if (! found_inverse) { 3651 av_push(list, newSVuv(inverse)); 3652 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/ 3653 } 3654 3655 /* swatch_get() increments the value of val for each element in the 3656 * range. That makes more compact tables possible. You can 3657 * express the capitalization, for example, of all consecutive 3658 * letters with a single line: 0061\t007A\t0041 This maps 0061 to 3659 * 0041, 0062 to 0042, etc. I (khw) have never understood 'none', 3660 * and it's not documented; it appears to be used only in 3661 * implementing tr//; I copied the semantics from swatch_get(), just 3662 * in case */ 3663 if (!none || val < none) { 3664 ++val; 3665 } 3666 } 3667 } 3668 3669 return ret; 3670 } 3671 3672 SV* 3673 Perl__swash_to_invlist(pTHX_ SV* const swash) 3674 { 3675 3676 /* Subject to change or removal. For use only in one place in regcomp.c. 3677 * Ownership is given to one reference count in the returned SV* */ 3678 3679 U8 *l, *lend; 3680 char *loc; 3681 STRLEN lcur; 3682 HV *const hv = MUTABLE_HV(SvRV(swash)); 3683 UV elements = 0; /* Number of elements in the inversion list */ 3684 U8 empty[] = ""; 3685 SV** listsvp; 3686 SV** typesvp; 3687 SV** bitssvp; 3688 SV** extssvp; 3689 SV** invert_it_svp; 3690 3691 U8* typestr; 3692 STRLEN bits; 3693 STRLEN octets; /* if bits == 1, then octets == 0 */ 3694 U8 *x, *xend; 3695 STRLEN xcur; 3696 3697 SV* invlist; 3698 3699 PERL_ARGS_ASSERT__SWASH_TO_INVLIST; 3700 3701 /* If not a hash, it must be the swash's inversion list instead */ 3702 if (SvTYPE(hv) != SVt_PVHV) { 3703 return SvREFCNT_inc_simple_NN((SV*) hv); 3704 } 3705 3706 /* The string containing the main body of the table */ 3707 listsvp = hv_fetchs(hv, "LIST", FALSE); 3708 typesvp = hv_fetchs(hv, "TYPE", FALSE); 3709 bitssvp = hv_fetchs(hv, "BITS", FALSE); 3710 extssvp = hv_fetchs(hv, "EXTRAS", FALSE); 3711 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); 3712 3713 typestr = (U8*)SvPV_nolen(*typesvp); 3714 bits = SvUV(*bitssvp); 3715 octets = bits >> 3; /* if bits == 1, then octets == 0 */ 3716 3717 /* read $swash->{LIST} */ 3718 if (SvPOK(*listsvp)) { 3719 l = (U8*)SvPV(*listsvp, lcur); 3720 } 3721 else { 3722 /* LIST legitimately doesn't contain a string during compilation phases 3723 * of Perl itself, before the Unicode tables are generated. In this 3724 * case, just fake things up by creating an empty list */ 3725 l = empty; 3726 lcur = 0; 3727 } 3728 loc = (char *) l; 3729 lend = l + lcur; 3730 3731 if (*l == 'V') { /* Inversion list format */ 3732 char *after_strtol = (char *) lend; 3733 UV element0; 3734 UV* other_elements_ptr; 3735 3736 /* The first number is a count of the rest */ 3737 l++; 3738 elements = Strtoul((char *)l, &after_strtol, 10); 3739 if (elements == 0) { 3740 invlist = _new_invlist(0); 3741 } 3742 else { 3743 l = (U8 *) after_strtol; 3744 3745 /* Get the 0th element, which is needed to setup the inversion list */ 3746 element0 = (UV) Strtoul((char *)l, &after_strtol, 10); 3747 l = (U8 *) after_strtol; 3748 invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr); 3749 elements--; 3750 3751 /* Then just populate the rest of the input */ 3752 while (elements-- > 0) { 3753 if (l > lend) { 3754 Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements); 3755 } 3756 *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10); 3757 l = (U8 *) after_strtol; 3758 } 3759 } 3760 } 3761 else { 3762 3763 /* Scan the input to count the number of lines to preallocate array 3764 * size based on worst possible case, which is each line in the input 3765 * creates 2 elements in the inversion list: 1) the beginning of a 3766 * range in the list; 2) the beginning of a range not in the list. */ 3767 while ((loc = (strchr(loc, '\n'))) != NULL) { 3768 elements += 2; 3769 loc++; 3770 } 3771 3772 /* If the ending is somehow corrupt and isn't a new line, add another 3773 * element for the final range that isn't in the inversion list */ 3774 if (! (*lend == '\n' 3775 || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n')))) 3776 { 3777 elements++; 3778 } 3779 3780 invlist = _new_invlist(elements); 3781 3782 /* Now go through the input again, adding each range to the list */ 3783 while (l < lend) { 3784 UV start, end; 3785 UV val; /* Not used by this function */ 3786 3787 l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val, 3788 cBOOL(octets), typestr); 3789 3790 if (l > lend) { 3791 break; 3792 } 3793 3794 invlist = _add_range_to_invlist(invlist, start, end); 3795 } 3796 } 3797 3798 /* Invert if the data says it should be */ 3799 if (invert_it_svp && SvUV(*invert_it_svp)) { 3800 _invlist_invert(invlist); 3801 } 3802 3803 /* This code is copied from swatch_get() 3804 * read $swash->{EXTRAS} */ 3805 x = (U8*)SvPV(*extssvp, xcur); 3806 xend = x + xcur; 3807 while (x < xend) { 3808 STRLEN namelen; 3809 U8 *namestr; 3810 SV** othersvp; 3811 HV* otherhv; 3812 STRLEN otherbits; 3813 SV **otherbitssvp, *other; 3814 U8 *nl; 3815 3816 const U8 opc = *x++; 3817 if (opc == '\n') 3818 continue; 3819 3820 nl = (U8*)memchr(x, '\n', xend - x); 3821 3822 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { 3823 if (nl) { 3824 x = nl + 1; /* 1 is length of "\n" */ 3825 continue; 3826 } 3827 else { 3828 x = xend; /* to EXTRAS' end at which \n is not found */ 3829 break; 3830 } 3831 } 3832 3833 namestr = x; 3834 if (nl) { 3835 namelen = nl - namestr; 3836 x = nl + 1; 3837 } 3838 else { 3839 namelen = xend - namestr; 3840 x = xend; 3841 } 3842 3843 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); 3844 otherhv = MUTABLE_HV(SvRV(*othersvp)); 3845 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); 3846 otherbits = (STRLEN)SvUV(*otherbitssvp); 3847 3848 if (bits != otherbits || bits != 1) { 3849 Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean " 3850 "properties, bits=%"UVuf", otherbits=%"UVuf, 3851 (UV)bits, (UV)otherbits); 3852 } 3853 3854 /* The "other" swatch must be destroyed after. */ 3855 other = _swash_to_invlist((SV *)*othersvp); 3856 3857 /* End of code copied from swatch_get() */ 3858 switch (opc) { 3859 case '+': 3860 _invlist_union(invlist, other, &invlist); 3861 break; 3862 case '!': 3863 _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist); 3864 break; 3865 case '-': 3866 _invlist_subtract(invlist, other, &invlist); 3867 break; 3868 case '&': 3869 _invlist_intersection(invlist, other, &invlist); 3870 break; 3871 default: 3872 break; 3873 } 3874 sv_free(other); /* through with it! */ 3875 } 3876 3877 SvREADONLY_on(invlist); 3878 return invlist; 3879 } 3880 3881 SV* 3882 Perl__get_swash_invlist(pTHX_ SV* const swash) 3883 { 3884 SV** ptr; 3885 3886 PERL_ARGS_ASSERT__GET_SWASH_INVLIST; 3887 3888 if (! SvROK(swash)) { 3889 return NULL; 3890 } 3891 3892 /* If it really isn't a hash, it isn't really swash; must be an inversion 3893 * list */ 3894 if (SvTYPE(SvRV(swash)) != SVt_PVHV) { 3895 return SvRV(swash); 3896 } 3897 3898 ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE); 3899 if (! ptr) { 3900 return NULL; 3901 } 3902 3903 return *ptr; 3904 } 3905 3906 bool 3907 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) 3908 { 3909 /* May change: warns if surrogates, non-character code points, or 3910 * non-Unicode code points are in s which has length len bytes. Returns 3911 * TRUE if none found; FALSE otherwise. The only other validity check is 3912 * to make sure that this won't exceed the string's length */ 3913 3914 const U8* const e = s + len; 3915 bool ok = TRUE; 3916 3917 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT; 3918 3919 while (s < e) { 3920 if (UTF8SKIP(s) > len) { 3921 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 3922 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print"); 3923 return FALSE; 3924 } 3925 if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) { 3926 STRLEN char_len; 3927 if (UTF8_IS_SUPER(s)) { 3928 if (ckWARN_d(WARN_NON_UNICODE)) { 3929 UV uv = utf8_to_uvchr_buf(s, e, &char_len); 3930 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 3931 "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv); 3932 ok = FALSE; 3933 } 3934 } 3935 else if (UTF8_IS_SURROGATE(s)) { 3936 if (ckWARN_d(WARN_SURROGATE)) { 3937 UV uv = utf8_to_uvchr_buf(s, e, &char_len); 3938 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 3939 "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv); 3940 ok = FALSE; 3941 } 3942 } 3943 else if 3944 ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)) 3945 && (ckWARN_d(WARN_NONCHAR))) 3946 { 3947 UV uv = utf8_to_uvchr_buf(s, e, &char_len); 3948 Perl_warner(aTHX_ packWARN(WARN_NONCHAR), 3949 "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv); 3950 ok = FALSE; 3951 } 3952 } 3953 s += UTF8SKIP(s); 3954 } 3955 3956 return ok; 3957 } 3958 3959 /* 3960 =for apidoc pv_uni_display 3961 3962 Build to the scalar C<dsv> a displayable version of the string C<spv>, 3963 length C<len>, the displayable version being at most C<pvlim> bytes long 3964 (if longer, the rest is truncated and "..." will be appended). 3965 3966 The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display 3967 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH 3968 to display the \\[nrfta\\] as the backslashed versions (like '\n') 3969 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\). 3970 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both 3971 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on. 3972 3973 The pointer to the PV of the C<dsv> is returned. 3974 3975 =cut */ 3976 char * 3977 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags) 3978 { 3979 int truncated = 0; 3980 const char *s, *e; 3981 3982 PERL_ARGS_ASSERT_PV_UNI_DISPLAY; 3983 3984 sv_setpvs(dsv, ""); 3985 SvUTF8_off(dsv); 3986 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { 3987 UV u; 3988 /* This serves double duty as a flag and a character to print after 3989 a \ when flags & UNI_DISPLAY_BACKSLASH is true. 3990 */ 3991 char ok = 0; 3992 3993 if (pvlim && SvCUR(dsv) >= pvlim) { 3994 truncated++; 3995 break; 3996 } 3997 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0); 3998 if (u < 256) { 3999 const unsigned char c = (unsigned char)u & 0xFF; 4000 if (flags & UNI_DISPLAY_BACKSLASH) { 4001 switch (c) { 4002 case '\n': 4003 ok = 'n'; break; 4004 case '\r': 4005 ok = 'r'; break; 4006 case '\t': 4007 ok = 't'; break; 4008 case '\f': 4009 ok = 'f'; break; 4010 case '\a': 4011 ok = 'a'; break; 4012 case '\\': 4013 ok = '\\'; break; 4014 default: break; 4015 } 4016 if (ok) { 4017 const char string = ok; 4018 sv_catpvs(dsv, "\\"); 4019 sv_catpvn(dsv, &string, 1); 4020 } 4021 } 4022 /* isPRINT() is the locale-blind version. */ 4023 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { 4024 const char string = c; 4025 sv_catpvn(dsv, &string, 1); 4026 ok = 1; 4027 } 4028 } 4029 if (!ok) 4030 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); 4031 } 4032 if (truncated) 4033 sv_catpvs(dsv, "..."); 4034 4035 return SvPVX(dsv); 4036 } 4037 4038 /* 4039 =for apidoc sv_uni_display 4040 4041 Build to the scalar C<dsv> a displayable version of the scalar C<sv>, 4042 the displayable version being at most C<pvlim> bytes long 4043 (if longer, the rest is truncated and "..." will be appended). 4044 4045 The C<flags> argument is as in L</pv_uni_display>(). 4046 4047 The pointer to the PV of the C<dsv> is returned. 4048 4049 =cut 4050 */ 4051 char * 4052 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) 4053 { 4054 const char * const ptr = 4055 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv); 4056 4057 PERL_ARGS_ASSERT_SV_UNI_DISPLAY; 4058 4059 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr, 4060 SvCUR(ssv), pvlim, flags); 4061 } 4062 4063 /* 4064 =for apidoc foldEQ_utf8 4065 4066 Returns true if the leading portions of the strings C<s1> and C<s2> (either or both 4067 of which may be in UTF-8) are the same case-insensitively; false otherwise. 4068 How far into the strings to compare is determined by other input parameters. 4069 4070 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode; 4071 otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C<u2> 4072 with respect to C<s2>. 4073 4074 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold 4075 equality. In other words, C<s1>+C<l1> will be used as a goal to reach. The 4076 scan will not be considered to be a match unless the goal is reached, and 4077 scanning won't continue past that goal. Correspondingly for C<l2> with respect to 4078 C<s2>. 4079 4080 If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is 4081 considered an end pointer to the position 1 byte past the maximum point 4082 in C<s1> beyond which scanning will not continue under any circumstances. 4083 (This routine assumes that UTF-8 encoded input strings are not malformed; 4084 malformed input can cause it to read past C<pe1>). 4085 This means that if both C<l1> and C<pe1> are specified, and C<pe1> 4086 is less than C<s1>+C<l1>, the match will never be successful because it can 4087 never 4088 get as far as its goal (and in fact is asserted against). Correspondingly for 4089 C<pe2> with respect to C<s2>. 4090 4091 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and 4092 C<l2> must be non-zero), and if both do, both have to be 4093 reached for a successful match. Also, if the fold of a character is multiple 4094 characters, all of them must be matched (see tr21 reference below for 4095 'folding'). 4096 4097 Upon a successful match, if C<pe1> is non-NULL, 4098 it will be set to point to the beginning of the I<next> character of C<s1> 4099 beyond what was matched. Correspondingly for C<pe2> and C<s2>. 4100 4101 For case-insensitiveness, the "casefolding" of Unicode is used 4102 instead of upper/lowercasing both the characters, see 4103 L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings). 4104 4105 =cut */ 4106 4107 /* A flags parameter has been added which may change, and hence isn't 4108 * externally documented. Currently it is: 4109 * 0 for as-documented above 4110 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an 4111 ASCII one, to not match 4112 * FOLDEQ_LOCALE is set iff the rules from the current underlying 4113 * locale are to be used. 4114 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this 4115 * routine. This allows that step to be skipped. 4116 * FOLDEQ_S2_ALREADY_FOLDED Similarly. 4117 */ 4118 I32 4119 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags) 4120 { 4121 dVAR; 4122 const U8 *p1 = (const U8*)s1; /* Point to current char */ 4123 const U8 *p2 = (const U8*)s2; 4124 const U8 *g1 = NULL; /* goal for s1 */ 4125 const U8 *g2 = NULL; 4126 const U8 *e1 = NULL; /* Don't scan s1 past this */ 4127 U8 *f1 = NULL; /* Point to current folded */ 4128 const U8 *e2 = NULL; 4129 U8 *f2 = NULL; 4130 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */ 4131 U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; 4132 U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; 4133 4134 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; 4135 4136 assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) 4137 && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED)))); 4138 /* The algorithm is to trial the folds without regard to the flags on 4139 * the first line of the above assert(), and then see if the result 4140 * violates them. This means that the inputs can't be pre-folded to a 4141 * violating result, hence the assert. This could be changed, with the 4142 * addition of extra tests here for the already-folded case, which would 4143 * slow it down. That cost is more than any possible gain for when these 4144 * flags are specified, as the flags indicate /il or /iaa matching which 4145 * is less common than /iu, and I (khw) also believe that real-world /il 4146 * and /iaa matches are most likely to involve code points 0-255, and this 4147 * function only under rare conditions gets called for 0-255. */ 4148 4149 if (IN_UTF8_CTYPE_LOCALE) { 4150 flags &= ~FOLDEQ_LOCALE; 4151 } 4152 4153 if (pe1) { 4154 e1 = *(U8**)pe1; 4155 } 4156 4157 if (l1) { 4158 g1 = (const U8*)s1 + l1; 4159 } 4160 4161 if (pe2) { 4162 e2 = *(U8**)pe2; 4163 } 4164 4165 if (l2) { 4166 g2 = (const U8*)s2 + l2; 4167 } 4168 4169 /* Must have at least one goal */ 4170 assert(g1 || g2); 4171 4172 if (g1) { 4173 4174 /* Will never match if goal is out-of-bounds */ 4175 assert(! e1 || e1 >= g1); 4176 4177 /* Here, there isn't an end pointer, or it is beyond the goal. We 4178 * only go as far as the goal */ 4179 e1 = g1; 4180 } 4181 else { 4182 assert(e1); /* Must have an end for looking at s1 */ 4183 } 4184 4185 /* Same for goal for s2 */ 4186 if (g2) { 4187 assert(! e2 || e2 >= g2); 4188 e2 = g2; 4189 } 4190 else { 4191 assert(e2); 4192 } 4193 4194 /* If both operands are already folded, we could just do a memEQ on the 4195 * whole strings at once, but it would be better if the caller realized 4196 * this and didn't even call us */ 4197 4198 /* Look through both strings, a character at a time */ 4199 while (p1 < e1 && p2 < e2) { 4200 4201 /* If at the beginning of a new character in s1, get its fold to use 4202 * and the length of the fold. (exception: locale rules just get the 4203 * character to a single byte) */ 4204 if (n1 == 0) { 4205 if (flags & FOLDEQ_S1_ALREADY_FOLDED) { 4206 f1 = (U8 *) p1; 4207 n1 = UTF8SKIP(f1); 4208 } 4209 else { 4210 /* If in locale matching, we use two sets of rules, depending 4211 * on if the code point is above or below 255. Here, we test 4212 * for and handle locale rules */ 4213 if ((flags & FOLDEQ_LOCALE) 4214 && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1))) 4215 { 4216 /* There is no mixing of code points above and below 255. */ 4217 if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) { 4218 return 0; 4219 } 4220 4221 /* We handle locale rules by converting, if necessary, the 4222 * code point to a single byte. */ 4223 if (! u1 || UTF8_IS_INVARIANT(*p1)) { 4224 *foldbuf1 = *p1; 4225 } 4226 else { 4227 *foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1)); 4228 } 4229 n1 = 1; 4230 } 4231 else if (isASCII(*p1)) { /* Note, that here won't be both 4232 ASCII and using locale rules */ 4233 4234 /* If trying to mix non- with ASCII, and not supposed to, 4235 * fail */ 4236 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) { 4237 return 0; 4238 } 4239 n1 = 1; 4240 *foldbuf1 = toFOLD(*p1); 4241 } 4242 else if (u1) { 4243 to_utf8_fold(p1, foldbuf1, &n1); 4244 } 4245 else { /* Not utf8, get utf8 fold */ 4246 to_uni_fold(*p1, foldbuf1, &n1); 4247 } 4248 f1 = foldbuf1; 4249 } 4250 } 4251 4252 if (n2 == 0) { /* Same for s2 */ 4253 if (flags & FOLDEQ_S2_ALREADY_FOLDED) { 4254 f2 = (U8 *) p2; 4255 n2 = UTF8SKIP(f2); 4256 } 4257 else { 4258 if ((flags & FOLDEQ_LOCALE) 4259 && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2))) 4260 { 4261 /* Here, the next char in s2 is < 256. We've already 4262 * worked on s1, and if it isn't also < 256, can't match */ 4263 if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) { 4264 return 0; 4265 } 4266 if (! u2 || UTF8_IS_INVARIANT(*p2)) { 4267 *foldbuf2 = *p2; 4268 } 4269 else { 4270 *foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1)); 4271 } 4272 4273 /* Use another function to handle locale rules. We've made 4274 * sure that both characters to compare are single bytes */ 4275 if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) { 4276 return 0; 4277 } 4278 n1 = n2 = 0; 4279 } 4280 else if (isASCII(*p2)) { 4281 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) { 4282 return 0; 4283 } 4284 n2 = 1; 4285 *foldbuf2 = toFOLD(*p2); 4286 } 4287 else if (u2) { 4288 to_utf8_fold(p2, foldbuf2, &n2); 4289 } 4290 else { 4291 to_uni_fold(*p2, foldbuf2, &n2); 4292 } 4293 f2 = foldbuf2; 4294 } 4295 } 4296 4297 /* Here f1 and f2 point to the beginning of the strings to compare. 4298 * These strings are the folds of the next character from each input 4299 * string, stored in utf8. */ 4300 4301 /* While there is more to look for in both folds, see if they 4302 * continue to match */ 4303 while (n1 && n2) { 4304 U8 fold_length = UTF8SKIP(f1); 4305 if (fold_length != UTF8SKIP(f2) 4306 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE 4307 function call for single 4308 byte */ 4309 || memNE((char*)f1, (char*)f2, fold_length)) 4310 { 4311 return 0; /* mismatch */ 4312 } 4313 4314 /* Here, they matched, advance past them */ 4315 n1 -= fold_length; 4316 f1 += fold_length; 4317 n2 -= fold_length; 4318 f2 += fold_length; 4319 } 4320 4321 /* When reach the end of any fold, advance the input past it */ 4322 if (n1 == 0) { 4323 p1 += u1 ? UTF8SKIP(p1) : 1; 4324 } 4325 if (n2 == 0) { 4326 p2 += u2 ? UTF8SKIP(p2) : 1; 4327 } 4328 } /* End of loop through both strings */ 4329 4330 /* A match is defined by each scan that specified an explicit length 4331 * reaching its final goal, and the other not having matched a partial 4332 * character (which can happen when the fold of a character is more than one 4333 * character). */ 4334 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) { 4335 return 0; 4336 } 4337 4338 /* Successful match. Set output pointers */ 4339 if (pe1) { 4340 *pe1 = (char*)p1; 4341 } 4342 if (pe2) { 4343 *pe2 = (char*)p2; 4344 } 4345 return 1; 4346 } 4347 4348 /* XXX The next four functions should likely be moved to mathoms.c once all 4349 * occurrences of them are removed from the core; some cpan-upstream modules 4350 * still use them */ 4351 4352 U8 * 4353 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) 4354 { 4355 PERL_ARGS_ASSERT_UVUNI_TO_UTF8; 4356 4357 return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0); 4358 } 4359 4360 UV 4361 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) 4362 { 4363 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; 4364 4365 return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags)); 4366 } 4367 4368 /* 4369 =for apidoc uvuni_to_utf8_flags 4370 4371 Instead you almost certainly want to use L</uvchr_to_utf8> or 4372 L</uvchr_to_utf8_flags>>. 4373 4374 This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>, 4375 which itself, while not deprecated, should be used only in isolated 4376 circumstances. These functions were useful for code that wanted to handle 4377 both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl 4378 v5.20, the distinctions between the platforms have mostly been made invisible 4379 to most code, so this function is quite unlikely to be what you want. 4380 4381 =cut 4382 */ 4383 4384 U8 * 4385 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 4386 { 4387 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; 4388 4389 return uvoffuni_to_utf8_flags(d, uv, flags); 4390 } 4391 4392 /* 4393 =for apidoc utf8n_to_uvuni 4394 4395 Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>. 4396 4397 This function was useful for code that wanted to handle both EBCDIC and 4398 ASCII platforms with Unicode properties, but starting in Perl v5.20, the 4399 distinctions between the platforms have mostly been made invisible to most 4400 code, so this function is quite unlikely to be what you want. If you do need 4401 this precise functionality, use instead 4402 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> 4403 or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>. 4404 4405 =cut 4406 */ 4407 4408 /* 4409 * Local variables: 4410 * c-indentation-style: bsd 4411 * c-basic-offset: 4 4412 * indent-tabs-mode: nil 4413 * End: 4414 * 4415 * ex: set ts=8 sts=4 sw=4 et: 4416 */ 4417