1 /* utf8.c 2 * 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever 13 * heard of that we don't want to see any closer; and that's the one place 14 * we're trying to get to! And that's just where we can't get, nohow.' 15 * 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"] 17 * 18 * 'Well do I understand your speech,' he answered in the same language; 19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue, 20 * as is the custom in the West, if you wish to be answered?' 21 * --Gandalf, addressing Théoden's door wardens 22 * 23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 24 * 25 * ...the travellers perceived that the floor was paved with stones of many 26 * hues; branching runes and strange devices intertwined beneath their feet. 27 * 28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_UTF8_C 33 #include "perl.h" 34 #include "invlist_inline.h" 35 36 static const char malformed_text[] = "Malformed UTF-8 character"; 37 static const char unees[] = 38 "Malformed UTF-8 character (unexpected end of string)"; 39 40 /* 41 These are various utility functions for manipulating UTF8-encoded 42 strings. For the uninitiated, this is a method of representing arbitrary 43 Unicode characters as a variable number of bytes, in such a way that 44 characters in the ASCII range are unmodified, and a zero byte never appears 45 within non-zero characters. 46 */ 47 48 void 49 Perl__force_out_malformed_utf8_message(pTHX_ 50 const U8 *const p, /* First byte in UTF-8 sequence */ 51 const U8 * const e, /* Final byte in sequence (may include 52 multiple chars */ 53 const U32 flags, /* Flags to pass to utf8n_to_uvchr(), 54 usually 0, or some DISALLOW flags */ 55 const bool die_here) /* If TRUE, this function does not return */ 56 { 57 /* This core-only function is to be called when a malformed UTF-8 character 58 * is found, in order to output the detailed information about the 59 * malformation before dieing. The reason it exists is for the occasions 60 * when such a malformation is fatal, but warnings might be turned off, so 61 * that normally they would not be actually output. This ensures that they 62 * do get output. Because a sequence may be malformed in more than one 63 * way, multiple messages may be generated, so we can't make them fatal, as 64 * that would cause the first one to die. 65 * 66 * Instead we pretend -W was passed to perl, then die afterwards. The 67 * flexibility is here to return to the caller so they can finish up and 68 * die themselves */ 69 U32 errors; 70 71 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE; 72 73 ENTER; 74 SAVEI8(PL_dowarn); 75 SAVESPTR(PL_curcop); 76 77 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; 78 if (PL_curcop) { 79 SAVECURCOPWARNINGS(); 80 PL_curcop->cop_warnings = pWARN_ALL; 81 } 82 83 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); 84 85 LEAVE; 86 87 if (! errors) { 88 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should" 89 " be called only when there are errors found"); 90 } 91 92 if (die_here) { 93 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 94 } 95 } 96 97 STATIC HV * 98 S_new_msg_hv(pTHX_ const char * const message, /* The message text */ 99 U32 categories, /* Packed warning categories */ 100 U32 flag) /* Flag associated with this message */ 101 { 102 /* Creates, populates, and returns an HV* that describes an error message 103 * for the translators between UTF8 and code point */ 104 105 SV* msg_sv = newSVpv(message, 0); 106 SV* category_sv = newSVuv(categories); 107 SV* flag_bit_sv = newSVuv(flag); 108 109 HV* msg_hv = newHV(); 110 111 PERL_ARGS_ASSERT_NEW_MSG_HV; 112 113 (void) hv_stores(msg_hv, "text", msg_sv); 114 (void) hv_stores(msg_hv, "warn_categories", category_sv); 115 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv); 116 117 return msg_hv; 118 } 119 120 /* 121 =for apidoc uvoffuni_to_utf8_flags 122 123 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 124 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or 125 L<perlapi/uvchr_to_utf8_flags>>. 126 127 This function is like them, but the input is a strict Unicode 128 (as opposed to native) code point. Only in very rare circumstances should code 129 not be using the native code point. 130 131 For details, see the description for L<perlapi/uvchr_to_utf8_flags>. 132 133 =cut 134 */ 135 136 U8 * 137 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) 138 { 139 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; 140 141 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL); 142 } 143 144 /* All these formats take a single UV code point argument */ 145 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf; 146 const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf 147 " is not recommended for open interchange"; 148 const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," 149 " may not be portable"; 150 151 /* Use shorter names internally in this file */ 152 #define SHIFT UTF_ACCUMULATION_SHIFT 153 #undef MARK 154 #define MARK UTF_CONTINUATION_MARK 155 #define MASK UTF_CONTINUATION_MASK 156 157 /* 158 =for apidoc uvchr_to_utf8_flags_msgs 159 160 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 161 162 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly. 163 164 This function is for code that wants any warning and/or error messages to be 165 returned to the caller rather than be displayed. All messages that would have 166 been displayed if all lexical warnings are enabled will be returned. 167 168 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter 169 placed after all the others, C<msgs>. If this parameter is 0, this function 170 behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should 171 be a pointer to an C<HV *> variable, in which this function creates a new HV to 172 contain any appropriate messages. The hash has three key-value pairs, as 173 follows: 174 175 =over 4 176 177 =item C<text> 178 179 The text of the message as a C<SVpv>. 180 181 =item C<warn_categories> 182 183 The warning category (or categories) packed into a C<SVuv>. 184 185 =item C<flag> 186 187 A single flag bit associated with this message, in a C<SVuv>. 188 The bit corresponds to some bit in the C<*errors> return value, 189 such as C<UNICODE_GOT_SURROGATE>. 190 191 =back 192 193 It's important to note that specifying this parameter as non-null will cause 194 any warnings this function would otherwise generate to be suppressed, and 195 instead be placed in C<*msgs>. The caller can check the lexical warnings state 196 (or not) when choosing what to do with the returned messages. 197 198 The caller, of course, is responsible for freeing any returned HV. 199 200 =cut 201 */ 202 203 /* Undocumented; we don't want people using this. Instead they should use 204 * uvchr_to_utf8_flags_msgs() */ 205 U8 * 206 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs) 207 { 208 U8 *p; 209 UV shifted_uv = input_uv; 210 STRLEN utf8_skip = OFFUNISKIP(input_uv); 211 212 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS; 213 214 if (msgs) { 215 *msgs = NULL; 216 } 217 218 switch (utf8_skip) { 219 case 1: 220 *d++ = LATIN1_TO_NATIVE(input_uv); 221 return d; 222 223 default: 224 if ( UNLIKELY(input_uv > MAX_LEGAL_CP 225 && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))) 226 { 227 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */ 228 NULL, 0, input_uv)); 229 } 230 231 if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) { 232 U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); 233 const char * format = PL_extended_cp_format; 234 if (msgs) { 235 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 236 category, 237 UNICODE_GOT_PERL_EXTENDED); 238 } 239 else { 240 Perl_ck_warner_d(aTHX_ category, format, input_uv); 241 } 242 243 /* Don't output a 2nd msg */ 244 flags &= ~UNICODE_WARN_SUPER; 245 } 246 247 if (flags & UNICODE_DISALLOW_PERL_EXTENDED) { 248 return NULL; 249 } 250 251 p = d + utf8_skip - 1; 252 while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) { 253 *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 254 shifted_uv >>= SHIFT; 255 } 256 257 /* FALLTHROUGH */ 258 259 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: 260 d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT] 261 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 262 shifted_uv >>= SHIFT; 263 /* FALLTHROUGH */ 264 265 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: 266 d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT] 267 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 268 shifted_uv >>= SHIFT; 269 /* FALLTHROUGH */ 270 271 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT: 272 if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) { 273 if (flags & UNICODE_WARN_SUPER) { 274 U32 category = packWARN(WARN_NON_UNICODE); 275 const char * format = super_cp_format; 276 277 if (msgs) { 278 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 279 category, 280 UNICODE_GOT_SUPER); 281 } 282 else { 283 Perl_ck_warner_d(aTHX_ category, format, input_uv); 284 } 285 286 if (flags & UNICODE_DISALLOW_SUPER) { 287 return NULL; 288 } 289 } 290 if ( (flags & UNICODE_DISALLOW_SUPER) 291 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED) 292 && UNICODE_IS_PERL_EXTENDED(input_uv))) 293 { 294 return NULL; 295 } 296 } 297 298 d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT] 299 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 300 shifted_uv >>= SHIFT; 301 /* FALLTHROUGH */ 302 303 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT: 304 if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) { 305 if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) { 306 if (flags & UNICODE_WARN_NONCHAR) { 307 U32 category = packWARN(WARN_NONCHAR); 308 const char * format = nonchar_cp_format; 309 if (msgs) { 310 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 311 category, 312 UNICODE_GOT_NONCHAR); 313 } 314 else { 315 Perl_ck_warner_d(aTHX_ category, format, input_uv); 316 } 317 } 318 if (flags & UNICODE_DISALLOW_NONCHAR) { 319 return NULL; 320 } 321 } 322 else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) { 323 if (flags & UNICODE_WARN_SURROGATE) { 324 U32 category = packWARN(WARN_SURROGATE); 325 const char * format = surrogate_cp_format; 326 if (msgs) { 327 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 328 category, 329 UNICODE_GOT_SURROGATE); 330 } 331 else { 332 Perl_ck_warner_d(aTHX_ category, format, input_uv); 333 } 334 } 335 if (flags & UNICODE_DISALLOW_SURROGATE) { 336 return NULL; 337 } 338 } 339 } 340 341 d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT] 342 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 343 shifted_uv >>= SHIFT; 344 /* FALLTHROUGH */ 345 346 #ifdef EBCDIC 347 348 case 3: 349 d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 350 shifted_uv >>= SHIFT; 351 /* FALLTHROUGH */ 352 353 #endif 354 355 /* FALLTHROUGH */ 356 case 2: 357 d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 358 shifted_uv >>= SHIFT; 359 d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip)) 360 | UTF_START_MARK(utf8_skip)); 361 break; 362 } 363 364 return d + utf8_skip; 365 } 366 367 /* 368 =for apidoc uvchr_to_utf8 369 370 Adds the UTF-8 representation of the native code point C<uv> to the end 371 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to 372 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 373 the byte after the end of the new character. In other words, 374 375 d = uvchr_to_utf8(d, uv); 376 377 is the recommended wide native character-aware way of saying 378 379 *(d++) = uv; 380 381 This function accepts any code point from 0..C<IV_MAX> as input. 382 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. 383 384 It is possible to forbid or warn on non-Unicode code points, or those that may 385 be problematic by using L</uvchr_to_utf8_flags>. 386 387 =cut 388 */ 389 390 /* This is also a macro */ 391 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); 392 393 U8 * 394 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) 395 { 396 return uvchr_to_utf8(d, uv); 397 } 398 399 /* 400 =for apidoc uvchr_to_utf8_flags 401 402 Adds the UTF-8 representation of the native code point C<uv> to the end 403 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to 404 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 405 the byte after the end of the new character. In other words, 406 407 d = uvchr_to_utf8_flags(d, uv, flags); 408 409 or, in most cases, 410 411 d = uvchr_to_utf8_flags(d, uv, 0); 412 413 This is the Unicode-aware way of saying 414 415 *(d++) = uv; 416 417 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as 418 input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. 419 420 Specifying C<flags> can further restrict what is allowed and not warned on, as 421 follows: 422 423 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set, 424 the function will raise a warning, provided UTF8 warnings are enabled. If 425 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return 426 NULL. If both flags are set, the function will both warn and return NULL. 427 428 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags 429 affect how the function handles a Unicode non-character. 430 431 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags 432 affect the handling of code points that are above the Unicode maximum of 433 0x10FFFF. Languages other than Perl may not be able to accept files that 434 contain these. 435 436 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of 437 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all 438 three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the 439 allowed inputs to the strict UTF-8 traditionally defined by Unicode. 440 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and 441 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the 442 above-Unicode and surrogate flags, but not the non-character ones, as 443 defined in 444 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>. 445 See L<perlunicode/Noncharacter code points>. 446 447 Extremely high code points were never specified in any standard, and require an 448 extension to UTF-8 to express, which Perl does. It is likely that programs 449 written in something other than Perl would not be able to read files that 450 contain these; nor would Perl understand files written by something that uses a 451 different extension. For these reasons, there is a separate set of flags that 452 can warn and/or disallow these extremely high code points, even if other 453 above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED> 454 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see 455 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will 456 treat all above-Unicode code points, including these, as malformations. (Note 457 that the Unicode standard considers anything above 0x10FFFF to be illegal, but 458 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1)) 459 460 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is 461 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly, 462 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named 463 C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC 464 platforms,these flags can apply to code points that actually do fit in 31 bits. 465 The new names accurately describe the situation in all cases. 466 467 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT 468 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE 469 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 470 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR 471 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED 472 =for apidoc Amnh||UNICODE_DISALLOW_SUPER 473 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE 474 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT 475 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE 476 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE 477 =for apidoc Amnh||UNICODE_WARN_NONCHAR 478 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED 479 =for apidoc Amnh||UNICODE_WARN_SUPER 480 =for apidoc Amnh||UNICODE_WARN_SURROGATE 481 482 =cut 483 */ 484 485 /* This is also a macro */ 486 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); 487 488 U8 * 489 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 490 { 491 return uvchr_to_utf8_flags(d, uv, flags); 492 } 493 494 PERL_STATIC_INLINE int 495 S_is_utf8_overlong(const U8 * const s, const STRLEN len) 496 { 497 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 498 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if 499 * it isn't, and -1 if there isn't enough information to tell. This last 500 * return value can happen if the sequence is incomplete, missing some 501 * trailing bytes that would form a complete character. If there are 502 * enough bytes to make a definitive decision, this function does so. 503 * Usually 2 bytes are sufficient. 504 * 505 * Overlongs can occur whenever the number of continuation bytes changes. 506 * That means whenever the number of leading 1 bits in a start byte 507 * increases from the next lower start byte. That happens for start bytes 508 * C0, E0, F0, F8, FC, FE, and FF. 509 */ 510 511 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG; 512 513 /* Each platform has overlongs after the start bytes given above (expressed 514 * in I8 for EBCDIC). The values below were found by manually inspecting 515 * the UTF-8 patterns. See the tables in utf8.h and utfebcdic.h. */ 516 517 switch (NATIVE_UTF8_TO_I8(s[0])) { 518 default: 519 assert(UTF8_IS_START(s[0])); 520 return 0; 521 522 case 0xC0: 523 case 0xC1: 524 return 1; 525 526 #ifdef EBCDIC 527 528 case 0xC2: 529 case 0xC3: 530 case 0xC4: 531 case 0xE0: 532 return 1; 533 #else 534 case 0xE0: 535 return (len < 2) ? -1 : s[1] < 0xA0; 536 #endif 537 538 case 0xF0: 539 return (len < 2) 540 ? -1 541 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10; 542 case 0xF8: 543 return (len < 2) 544 ? -1 545 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08; 546 case 0xFC: 547 return (len < 2) 548 ? -1 549 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04; 550 case 0xFE: 551 return (len < 2) 552 ? -1 553 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02; 554 case 0xFF: 555 return isFF_overlong(s, len); 556 } 557 } 558 559 PERL_STATIC_INLINE int 560 S_isFF_overlong(const U8 * const s, const STRLEN len) 561 { 562 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 563 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if 564 * it isn't, and -1 if there isn't enough information to tell. This last 565 * return value can happen if the sequence is incomplete, missing some 566 * trailing bytes that would form a complete character. If there are 567 * enough bytes to make a definitive decision, this function does so. */ 568 569 PERL_ARGS_ASSERT_ISFF_OVERLONG; 570 571 #ifdef EBCDIC 572 /* This works on all three EBCDIC code pages traditionally supported by 573 * perl */ 574 # define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41" 575 #else 576 # define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80" 577 #endif 578 579 /* To be an FF overlong, all the available bytes must match */ 580 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX, 581 MIN(len, STRLENs(FF_OVERLONG_PREFIX))))) 582 { 583 return 0; 584 } 585 586 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must 587 * be there; what comes after them doesn't matter. See tables in utf8.h, 588 * utfebcdic.h. */ 589 if (len >= STRLENs(FF_OVERLONG_PREFIX)) { 590 return 1; 591 } 592 593 /* The missing bytes could cause the result to go one way or the other, so 594 * the result is indeterminate */ 595 return -1; 596 } 597 598 /* At some point we may want to allow core to use up to UV_MAX */ 599 600 #ifdef EBCDIC /* Actually is I8 */ 601 # if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */ 602 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA7" 603 /* UV_MAX "\xFF\xAF" */ 604 # else /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */ 605 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1" 606 /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */ 607 # endif 608 #else 609 # if defined(UV_IS_QUAD) 610 # define HIGHEST_REPRESENTABLE_UTF "\xFF\x80\x87" 611 /* UV_MAX "\xFF\x80" */ 612 # else 613 # define HIGHEST_REPRESENTABLE_UTF "\xFD" 614 /* UV_MAX "\xFE\x83" */ 615 # endif 616 #endif 617 618 PERL_STATIC_INLINE int 619 S_does_utf8_overflow(const U8 * const s, 620 const U8 * e, 621 const bool consider_overlongs) 622 { 623 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 624 * 'e' - 1 would overflow an IV on this platform; that is if it represents 625 * a code point larger than the highest representable code point. It 626 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't 627 * enough information to tell. This last return value can happen if the 628 * sequence is incomplete, missing some trailing bytes that would form a 629 * complete character. If there are enough bytes to make a definitive 630 * decision, this function does so. 631 * 632 * If 'consider_overlongs' is TRUE, the function checks for the possibility 633 * that the sequence is an overlong that doesn't overflow. Otherwise, it 634 * assumes the sequence is not an overlong. This can give different 635 * results only on ASCII 32-bit platforms. 636 * 637 * (For ASCII platforms, we could use memcmp() because we don't have to 638 * convert each byte to I8, but it's very rare input indeed that would 639 * approach overflow, so the loop below will likely only get executed once.) 640 * 641 */ 642 const STRLEN len = e - s; 643 const U8 *x; 644 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF; 645 int is_overlong = 0; 646 647 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; 648 649 for (x = s; x < e; x++, y++) { 650 651 /* 'y' is set up to not include the trailing bytes that are all the 652 * maximum possible continuation byte. So when we reach the end of 'y' 653 * (known to be NUL terminated), it is impossible for 'x' to contain 654 * bytes larger than those omitted bytes, and therefore 'x' can't 655 * overflow */ 656 if (*y == '\0') { 657 return 0; 658 } 659 660 /* If this byte is less than the corresponding highest non-overflowing 661 * UTF-8, the sequence doesn't overflow */ 662 if (NATIVE_UTF8_TO_I8(*x) < *y) { 663 return 0; 664 } 665 666 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) { 667 goto overflows_if_not_overlong; 668 } 669 } 670 671 /* Got to the end, and all bytes are the same. If the input is a whole 672 * character, it doesn't overflow. And if it is a partial character, 673 * there's not enough information to tell */ 674 return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1; 675 676 overflows_if_not_overlong: 677 678 /* Here, a well-formed sequence overflows. If we are assuming 679 * well-formedness, return that it overflows. */ 680 if (! consider_overlongs) { 681 return 1; 682 } 683 684 /* Here, it could be the overlong malformation, and might not actually 685 * overflow if you were to calculate it out. 686 * 687 * See if it actually is overlong */ 688 is_overlong = is_utf8_overlong(s, len); 689 690 /* If it isn't overlong, is well-formed, so overflows */ 691 if (is_overlong == 0) { 692 return 1; 693 } 694 695 /* Not long enough to determine */ 696 if (is_overlong < 0) { 697 return -1; 698 } 699 700 /* Here, it appears to overflow, but it is also overlong */ 701 702 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS 703 704 /* On many platforms, it is impossible for an overlong to overflow. For 705 * these, no further work is necessary: we can return immediately that this 706 * overlong that is an apparent overflow actually isn't 707 * 708 * To see why, note that a length_N sequence can represent as overlongs all 709 * the code points representable by shorter length sequences, but no 710 * higher. If it could represent a higher code point without being an 711 * overlong, we wouldn't have had to increase the sequence length! 712 * 713 * The highest possible start byte is FF; the next highest is FE. The 714 * highest code point representable as an overlong on the platform is thus 715 * the highest code point representable by a non-overlong sequence whose 716 * start byte is FE. If that value doesn't overflow the platform's word 717 * size, overlongs can't overflow. 718 * 719 * FE consists of 7 bytes total; the FE start byte contributes 0 bits of 720 * information (the high 7 bits, all ones, say that the sequence is 7 bytes 721 * long, and the bottom, zero, bit is s placeholder. That leaves the 6 722 * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each. 723 If that number of bits doesn't exceed the word size, it can't overflow. */ 724 725 return 0; 726 727 #else 728 729 /* In practice, only a 32-bit ASCII box gets here. The FE start byte can 730 * represent, as an overlong, the highest code point representable by an FD 731 * start byte, which is 5*6 continuation bytes of info plus one bit from 732 * the start byte, or 31 bits. That doesn't overflow. More explicitly: 733 * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1. 734 * 735 * That means only the FF start byte can have an overflowing overlong. */ 736 if (*s < 0xFF) { 737 return 0; 738 } 739 740 /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that 741 * evaluates to 2**31, so overflows an IV. For a UV it's 742 * \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */ 743 # define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82" 744 745 if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */ 746 return -1; 747 } 748 749 # define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0) 750 751 return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS)); 752 753 #endif 754 755 } 756 757 STRLEN 758 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags) 759 { 760 SSize_t len, full_len; 761 762 /* An internal helper function. 763 * 764 * On input: 765 * 's' is a string, which is known to be syntactically valid UTF-8 as far 766 * as (e - 1); e > s must hold. 767 * 'e' This function is allowed to look at any byte from 's'...'e-1', but 768 * nowhere else. The function has to cope as best it can if that 769 * sequence does not form a full character. 770 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags 771 * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns 772 * 0 if it determines the input will match something disallowed. 773 * On output: 774 * The return is the number of bytes required to represent the code point 775 * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the 776 * input is for a partial character, a successful return will be larger 777 * than 'e - s'. 778 * 779 * If *s..*(e-1) is only for a partial character, the function will return 780 * non-zero if there is any sequence of well-formed UTF-8 that, when 781 * appended to the input sequence, could result in an allowed code point; 782 * otherwise it returns 0. Non characters cannot be determined based on 783 * partial character input. But many of the other excluded types can be 784 * determined with just the first one or two bytes. 785 * 786 */ 787 788 PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_; 789 790 assert(e > s); 791 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 792 |UTF8_DISALLOW_PERL_EXTENDED))); 793 794 full_len = UTF8SKIP(s); 795 796 len = e - s; 797 if (len > full_len) { 798 e = s + full_len; 799 len = full_len; 800 } 801 802 switch (full_len) { 803 bool is_super; 804 805 default: /* Extended */ 806 if (flags & UTF8_DISALLOW_PERL_EXTENDED) { 807 return 0; 808 } 809 810 /* FALLTHROUGH */ 811 812 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */ 813 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */ 814 815 if (flags & UTF8_DISALLOW_SUPER) { 816 return 0; /* Above Unicode */ 817 } 818 819 return full_len; 820 821 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT: 822 is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_) 823 || ( len > 1 824 && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_ 825 && NATIVE_UTF8_TO_I8(s[1]) 826 >= UTF_FIRST_CONT_BYTE_110000_)); 827 if (is_super) { 828 if (flags & UTF8_DISALLOW_SUPER) { 829 return 0; 830 } 831 } 832 else if ( (flags & UTF8_DISALLOW_NONCHAR) 833 && len == full_len 834 && UNLIKELY(is_LARGER_NON_CHARS_utf8(s))) 835 { 836 return 0; 837 } 838 839 return full_len; 840 841 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT: 842 843 if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) { 844 return full_len; 845 } 846 847 if ( (flags & UTF8_DISALLOW_SURROGATE) 848 && UNLIKELY(is_SURROGATE_utf8(s))) 849 { 850 return 0; /* Surrogate */ 851 } 852 853 if ( (flags & UTF8_DISALLOW_NONCHAR) 854 && len == full_len 855 && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s))) 856 { 857 return 0; 858 } 859 860 return full_len; 861 862 /* The lower code points don't have any disallowable characters */ 863 #ifdef EBCDIC 864 case 3: 865 return full_len; 866 #endif 867 868 case 2: 869 case 1: 870 return full_len; 871 } 872 } 873 874 Size_t 875 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, 876 const bool require_partial) 877 { 878 /* This is called to determine if the UTF-8 sequence starting at s0 and 879 * continuing for up to one full character of bytes, but looking no further 880 * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native 881 * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being 882 * for the largest code points recognized by Perl, the ones that require 883 * the most UTF-8 bytes per character to represent (somewhat less than 884 * twice the size of the next longest kind). This sequence will only ever 885 * be Perl extended UTF-8. 886 * 887 * The routine returns 0 if the sequence is not fully valid, syntactically 888 * or semantically. That means it checks that everything following the 889 * start byte is a continuation byte, and that it doesn't overflow, nor is 890 * an overlong representation. 891 * 892 * If 'require_partial' is FALSE, the routine returns non-zero only if the 893 * input (as far as 'e-1') is a full character. The return is the count of 894 * the bytes in the character. 895 * 896 * If 'require_partial' is TRUE, the routine returns non-zero only if the 897 * input as far as 'e-1' is a partial, not full character, with no 898 * malformations found before position 'e'. The return is either just 899 * FALSE, or TRUE. */ 900 901 const U8 *s = s0 + 1; 902 const U8 *send = e; 903 904 PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_; 905 906 assert(s0 < e); 907 assert(*s0 == I8_TO_NATIVE_UTF8(0xFF)); 908 909 send = s + MIN(UTF8_MAXBYTES - 1, e - s); 910 while (s < send) { 911 if (! UTF8_IS_CONTINUATION(*s)) { 912 return 0; 913 } 914 915 s++; 916 } 917 918 if (0 < does_utf8_overflow(s0, e, 919 FALSE /* Don't consider_overlongs */ 920 )) { 921 return 0; 922 } 923 924 if (0 < isFF_overlong(s0, e - s0)) { 925 return 0; 926 } 927 928 /* Here, the character is valid as far as it got. Check if got a partial 929 * character */ 930 if (s - s0 < UTF8_MAXBYTES) { 931 return (require_partial) ? 1 : 0; 932 } 933 934 /* Here, got a full character */ 935 return (require_partial) ? 0 : UTF8_MAXBYTES; 936 } 937 938 char * 939 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format) 940 { 941 /* Returns a mortalized C string that is a displayable copy of the 'len' 942 * bytes starting at 'start'. 'format' gives how to display each byte. 943 * Currently, there are only two formats, so it is currently a bool: 944 * 0 \xab 945 * 1 ab (that is a space between two hex digit bytes) 946 */ 947 948 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a 949 trailing NUL */ 950 const U8 * s = start; 951 const U8 * const e = start + len; 952 char * output; 953 char * d; 954 955 PERL_ARGS_ASSERT__BYTE_DUMP_STRING; 956 957 Newx(output, output_len, char); 958 SAVEFREEPV(output); 959 960 d = output; 961 for (s = start; s < e; s++) { 962 const unsigned high_nibble = (*s & 0xF0) >> 4; 963 const unsigned low_nibble = (*s & 0x0F); 964 965 if (format) { 966 if (s > start) { 967 *d++ = ' '; 968 } 969 } 970 else { 971 *d++ = '\\'; 972 *d++ = 'x'; 973 } 974 975 if (high_nibble < 10) { 976 *d++ = high_nibble + '0'; 977 } 978 else { 979 *d++ = high_nibble - 10 + 'a'; 980 } 981 982 if (low_nibble < 10) { 983 *d++ = low_nibble + '0'; 984 } 985 else { 986 *d++ = low_nibble - 10 + 'a'; 987 } 988 } 989 990 *d = '\0'; 991 return output; 992 } 993 994 PERL_STATIC_INLINE char * 995 S_unexpected_non_continuation_text(pTHX_ const U8 * const s, 996 997 /* Max number of bytes to print */ 998 STRLEN print_len, 999 1000 /* Which one is the non-continuation */ 1001 const STRLEN non_cont_byte_pos, 1002 1003 /* How many bytes should there be? */ 1004 const STRLEN expect_len) 1005 { 1006 /* Return the malformation warning text for an unexpected continuation 1007 * byte. */ 1008 1009 const char * const where = (non_cont_byte_pos == 1) 1010 ? "immediately" 1011 : Perl_form(aTHX_ "%d bytes", 1012 (int) non_cont_byte_pos); 1013 const U8 * x = s + non_cont_byte_pos; 1014 const U8 * e = s + print_len; 1015 1016 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; 1017 1018 /* We don't need to pass this parameter, but since it has already been 1019 * calculated, it's likely faster to pass it; verify under DEBUGGING */ 1020 assert(expect_len == UTF8SKIP(s)); 1021 1022 /* As a defensive coding measure, don't output anything past a NUL. Such 1023 * bytes shouldn't be in the middle of a malformation, and could mark the 1024 * end of the allocated string, and what comes after is undefined */ 1025 for (; x < e; x++) { 1026 if (*x == '\0') { 1027 x++; /* Output this particular NUL */ 1028 break; 1029 } 1030 } 1031 1032 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," 1033 " %s after start byte 0x%02x; need %d bytes, got %d)", 1034 malformed_text, 1035 _byte_dump_string(s, x - s, 0), 1036 *(s + non_cont_byte_pos), 1037 where, 1038 *s, 1039 (int) expect_len, 1040 (int) non_cont_byte_pos); 1041 } 1042 1043 /* 1044 1045 =for apidoc utf8n_to_uvchr 1046 1047 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1048 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1049 directly. 1050 1051 Bottom level UTF-8 decode routine. 1052 Returns the native code point value of the first character in the string C<s>, 1053 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than 1054 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to 1055 the length, in bytes, of that character. 1056 1057 The value of C<flags> determines the behavior when C<s> does not point to a 1058 well-formed UTF-8 character. If C<flags> is 0, encountering a malformation 1059 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) 1060 is the next possible position in C<s> that could begin a non-malformed 1061 character. Also, if UTF-8 warnings haven't been lexically disabled, a warning 1062 is raised. Some UTF-8 input sequences may contain multiple malformations. 1063 This function tries to find every possible one in each call, so multiple 1064 warnings can be raised for the same sequence. 1065 1066 Various ALLOW flags can be set in C<flags> to allow (and not warn on) 1067 individual types of malformations, such as the sequence being overlong (that 1068 is, when there is a shorter sequence that can express the same code point; 1069 overlong sequences are expressly forbidden in the UTF-8 standard due to 1070 potential security issues). Another malformation example is the first byte of 1071 a character not being a legal first byte. See F<utf8.h> for the list of such 1072 flags. Even if allowed, this function generally returns the Unicode 1073 REPLACEMENT CHARACTER when it encounters a malformation. There are flags in 1074 F<utf8.h> to override this behavior for the overlong malformations, but don't 1075 do that except for very specialized purposes. 1076 1077 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other 1078 flags) malformation is found. If this flag is set, the routine assumes that 1079 the caller will raise a warning, and this function will silently just set 1080 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero. 1081 1082 Note that this API requires disambiguation between successful decoding a C<NUL> 1083 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as 1084 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may 1085 be set to 1. To disambiguate, upon a zero return, see if the first byte of 1086 C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an 1087 error. Or you can use C<L</utf8n_to_uvchr_error>>. 1088 1089 Certain code points are considered problematic. These are Unicode surrogates, 1090 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. 1091 By default these are considered regular code points, but certain situations 1092 warrant special handling for them, which can be specified using the C<flags> 1093 parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all 1094 three classes are treated as malformations and handled as such. The flags 1095 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and 1096 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to 1097 disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE> 1098 restricts the allowed inputs to the strict UTF-8 traditionally defined by 1099 Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness 1100 definition given by 1101 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>. 1102 The difference between traditional strictness and C9 strictness is that the 1103 latter does not forbid non-character code points. (They are still discouraged, 1104 however.) For more discussion see L<perlunicode/Noncharacter code points>. 1105 1106 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>, 1107 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>, 1108 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be 1109 raised for their respective categories, but otherwise the code points are 1110 considered valid (not malformations). To get a category to both be treated as 1111 a malformation and raise a warning, specify both the WARN and DISALLOW flags. 1112 (But note that warnings are not raised if lexically disabled nor if 1113 C<UTF8_CHECK_ONLY> is also specified.) 1114 1115 Extremely high code points were never specified in any standard, and require an 1116 extension to UTF-8 to express, which Perl does. It is likely that programs 1117 written in something other than Perl would not be able to read files that 1118 contain these; nor would Perl understand files written by something that uses a 1119 different extension. For these reasons, there is a separate set of flags that 1120 can warn and/or disallow these extremely high code points, even if other 1121 above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and 1122 C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see 1123 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all 1124 above-Unicode code points, including these, as malformations. 1125 (Note that the Unicode standard considers anything above 0x10FFFF to be 1126 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF 1127 (2**31 -1)) 1128 1129 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is 1130 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly, 1131 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named 1132 C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags 1133 can apply to code points that actually do fit in 31 bits. This happens on 1134 EBCDIC platforms, and sometimes when the L<overlong 1135 malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately 1136 describe the situation in all cases. 1137 1138 1139 All other code points corresponding to Unicode characters, including private 1140 use and those yet to be assigned, are never considered malformed and never 1141 warn. 1142 1143 =for apidoc Amnh||UTF8_CHECK_ONLY 1144 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1145 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE 1146 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE 1147 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR 1148 =for apidoc Amnh||UTF8_DISALLOW_SUPER 1149 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE 1150 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE 1151 =for apidoc Amnh||UTF8_WARN_SURROGATE 1152 =for apidoc Amnh||UTF8_WARN_NONCHAR 1153 =for apidoc Amnh||UTF8_WARN_SUPER 1154 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED 1155 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED 1156 1157 =cut 1158 1159 Also implemented as a macro in utf8.h 1160 */ 1161 1162 UV 1163 Perl_utf8n_to_uvchr(const U8 *s, 1164 STRLEN curlen, 1165 STRLEN *retlen, 1166 const U32 flags) 1167 { 1168 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; 1169 1170 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); 1171 } 1172 1173 /* 1174 1175 =for apidoc utf8n_to_uvchr_error 1176 1177 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1178 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1179 directly. 1180 1181 This function is for code that needs to know what the precise malformation(s) 1182 are when an error is found. If you also need to know the generated warning 1183 messages, use L</utf8n_to_uvchr_msgs>() instead. 1184 1185 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after 1186 all the others, C<errors>. If this parameter is 0, this function behaves 1187 identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer 1188 to a C<U32> variable, which this function sets to indicate any errors found. 1189 Upon return, if C<*errors> is 0, there were no errors found. Otherwise, 1190 C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some 1191 of these bits will be set if a malformation is found, even if the input 1192 C<flags> parameter indicates that the given malformation is allowed; those 1193 exceptions are noted: 1194 1195 =over 4 1196 1197 =item C<UTF8_GOT_PERL_EXTENDED> 1198 1199 The input sequence is not standard UTF-8, but a Perl extension. This bit is 1200 set only if the input C<flags> parameter contains either the 1201 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags. 1202 1203 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, 1204 and so some extension must be used to express them. Perl uses a natural 1205 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further 1206 extension to represent even higher ones, so that any code point that fits in a 1207 64-bit word can be represented. Text using these extensions is not likely to 1208 be portable to non-Perl code. We lump both of these extensions together and 1209 refer to them as Perl extended UTF-8. There exist other extensions that people 1210 have invented, incompatible with Perl's. 1211 1212 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing 1213 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower 1214 than on ASCII. Prior to that, code points 2**31 and higher were simply 1215 unrepresentable, and a different, incompatible method was used to represent 1216 code points between 2**30 and 2**31 - 1. 1217 1218 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if 1219 Perl extended UTF-8 is used. 1220 1221 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still 1222 may use for backward compatibility. That name is misleading, as this flag may 1223 be set when the code point actually does fit in 31 bits. This happens on 1224 EBCDIC platforms, and sometimes when the L<overlong 1225 malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately 1226 describes the situation in all cases. 1227 1228 =item C<UTF8_GOT_CONTINUATION> 1229 1230 The input sequence was malformed in that the first byte was a UTF-8 1231 continuation byte. 1232 1233 =item C<UTF8_GOT_EMPTY> 1234 1235 The input C<curlen> parameter was 0. 1236 1237 =item C<UTF8_GOT_LONG> 1238 1239 The input sequence was malformed in that there is some other sequence that 1240 evaluates to the same code point, but that sequence is shorter than this one. 1241 1242 Until Unicode 3.1, it was legal for programs to accept this malformation, but 1243 it was discovered that this created security issues. 1244 1245 =item C<UTF8_GOT_NONCHAR> 1246 1247 The code point represented by the input UTF-8 sequence is for a Unicode 1248 non-character code point. 1249 This bit is set only if the input C<flags> parameter contains either the 1250 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags. 1251 1252 =item C<UTF8_GOT_NON_CONTINUATION> 1253 1254 The input sequence was malformed in that a non-continuation type byte was found 1255 in a position where only a continuation type one should be. See also 1256 C<L</UTF8_GOT_SHORT>>. 1257 1258 =item C<UTF8_GOT_OVERFLOW> 1259 1260 The input sequence was malformed in that it is for a code point that is not 1261 representable in the number of bits available in an IV on the current platform. 1262 1263 =item C<UTF8_GOT_SHORT> 1264 1265 The input sequence was malformed in that C<curlen> is smaller than required for 1266 a complete sequence. In other words, the input is for a partial character 1267 sequence. 1268 1269 1270 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short 1271 sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always 1272 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete 1273 sequence was looked at. If no other flags are present, it means that the 1274 sequence was valid as far as it went. Depending on the application, this could 1275 mean one of three things: 1276 1277 =over 1278 1279 =item * 1280 1281 The C<curlen> length parameter passed in was too small, and the function was 1282 prevented from examining all the necessary bytes. 1283 1284 =item * 1285 1286 The buffer being looked at is based on reading data, and the data received so 1287 far stopped in the middle of a character, so that the next read will 1288 read the remainder of this character. (It is up to the caller to deal with the 1289 split bytes somehow.) 1290 1291 =item * 1292 1293 This is a real error, and the partial sequence is all we're going to get. 1294 1295 =back 1296 1297 =item C<UTF8_GOT_SUPER> 1298 1299 The input sequence was malformed in that it is for a non-Unicode code point; 1300 that is, one above the legal Unicode maximum. 1301 This bit is set only if the input C<flags> parameter contains either the 1302 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags. 1303 1304 =item C<UTF8_GOT_SURROGATE> 1305 1306 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate 1307 code point. 1308 This bit is set only if the input C<flags> parameter contains either the 1309 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags. 1310 1311 =back 1312 1313 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY> 1314 flag to suppress any warnings, and then examine the C<*errors> return. 1315 1316 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED 1317 =for apidoc Amnh||UTF8_GOT_CONTINUATION 1318 =for apidoc Amnh||UTF8_GOT_EMPTY 1319 =for apidoc Amnh||UTF8_GOT_LONG 1320 =for apidoc Amnh||UTF8_GOT_NONCHAR 1321 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION 1322 =for apidoc Amnh||UTF8_GOT_OVERFLOW 1323 =for apidoc Amnh||UTF8_GOT_SHORT 1324 =for apidoc Amnh||UTF8_GOT_SUPER 1325 =for apidoc Amnh||UTF8_GOT_SURROGATE 1326 1327 =cut 1328 1329 Also implemented as a macro in utf8.h 1330 */ 1331 1332 UV 1333 Perl_utf8n_to_uvchr_error(const U8 *s, 1334 STRLEN curlen, 1335 STRLEN *retlen, 1336 const U32 flags, 1337 U32 * errors) 1338 { 1339 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; 1340 1341 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL); 1342 } 1343 1344 /* 1345 1346 =for apidoc utf8n_to_uvchr_msgs 1347 1348 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1349 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1350 directly. 1351 1352 This function is for code that needs to know what the precise malformation(s) 1353 are when an error is found, and wants the corresponding warning and/or error 1354 messages to be returned to the caller rather than be displayed. All messages 1355 that would have been displayed if all lexical warnings are enabled will be 1356 returned. 1357 1358 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter 1359 placed after all the others, C<msgs>. If this parameter is 0, this function 1360 behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should 1361 be a pointer to an C<AV *> variable, in which this function creates a new AV to 1362 contain any appropriate messages. The elements of the array are ordered so 1363 that the first message that would have been displayed is in the 0th element, 1364 and so on. Each element is a hash with three key-value pairs, as follows: 1365 1366 =over 4 1367 1368 =item C<text> 1369 1370 The text of the message as a C<SVpv>. 1371 1372 =item C<warn_categories> 1373 1374 The warning category (or categories) packed into a C<SVuv>. 1375 1376 =item C<flag> 1377 1378 A single flag bit associated with this message, in a C<SVuv>. 1379 The bit corresponds to some bit in the C<*errors> return value, 1380 such as C<UTF8_GOT_LONG>. 1381 1382 =back 1383 1384 It's important to note that specifying this parameter as non-null will cause 1385 any warnings this function would otherwise generate to be suppressed, and 1386 instead be placed in C<*msgs>. The caller can check the lexical warnings state 1387 (or not) when choosing what to do with the returned messages. 1388 1389 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence 1390 no AV is created. 1391 1392 The caller, of course, is responsible for freeing any returned AV. 1393 1394 =cut 1395 */ 1396 1397 UV 1398 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, 1399 STRLEN curlen, 1400 STRLEN *retlen, 1401 const U32 flags, 1402 U32 * errors, 1403 AV ** msgs) 1404 { 1405 const U8 * const s0 = s; 1406 const U8 * send = s0 + curlen; 1407 U32 possible_problems; /* A bit is set here for each potential problem 1408 found as we go along */ 1409 UV uv; 1410 STRLEN expectlen; /* How long should this sequence be? */ 1411 STRLEN avail_len; /* When input is too short, gives what that is */ 1412 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this 1413 gets set and discarded */ 1414 1415 /* The below are used only if there is both an overlong malformation and a 1416 * too short one. Otherwise the first two are set to 's0' and 'send', and 1417 * the third not used at all */ 1418 U8 * adjusted_s0; 1419 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this 1420 routine; see [perl #130921] */ 1421 UV uv_so_far; 1422 dTHX; 1423 1424 PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; 1425 1426 /* Here, is one of: a) malformed; b) a problematic code point (surrogate, 1427 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul 1428 * syllables that the dfa doesn't properly handle. Quickly dispose of the 1429 * final case. */ 1430 1431 /* Each of the affected Hanguls starts with \xED */ 1432 1433 if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */ 1434 if (retlen) { 1435 *retlen = 3; 1436 } 1437 if (errors) { 1438 *errors = 0; 1439 } 1440 if (msgs) { 1441 *msgs = NULL; 1442 } 1443 1444 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) 1445 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) 1446 | (s0[2] & UTF_CONTINUATION_MASK); 1447 } 1448 1449 /* In conjunction with the exhaustive tests that can be enabled in 1450 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely 1451 * what it is intended to do, and that no flaws in it are masked by 1452 * dropping down and executing the code below 1453 assert(! isUTF8_CHAR(s0, send) 1454 || UTF8_IS_SURROGATE(s0, send) 1455 || UTF8_IS_SUPER(s0, send) 1456 || UTF8_IS_NONCHAR(s0,send)); 1457 */ 1458 1459 s = s0; 1460 possible_problems = 0; 1461 expectlen = 0; 1462 avail_len = 0; 1463 discard_errors = 0; 1464 adjusted_s0 = (U8 *) s0; 1465 uv_so_far = 0; 1466 1467 if (errors) { 1468 *errors = 0; 1469 } 1470 else { 1471 errors = &discard_errors; 1472 } 1473 1474 /* The order of malformation tests here is important. We should consume as 1475 * few bytes as possible in order to not skip any valid character. This is 1476 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also 1477 * https://unicode.org/reports/tr36 for more discussion as to why. For 1478 * example, once we've done a UTF8SKIP, we can tell the expected number of 1479 * bytes, and could fail right off the bat if the input parameters indicate 1480 * that there are too few available. But it could be that just that first 1481 * byte is garbled, and the intended character occupies fewer bytes. If we 1482 * blindly assumed that the first byte is correct, and skipped based on 1483 * that number, we could skip over a valid input character. So instead, we 1484 * always examine the sequence byte-by-byte. 1485 * 1486 * We also should not consume too few bytes, otherwise someone could inject 1487 * things. For example, an input could be deliberately designed to 1488 * overflow, and if this code bailed out immediately upon discovering that, 1489 * returning to the caller C<*retlen> pointing to the very next byte (one 1490 * which is actually part of the overflowing sequence), that could look 1491 * legitimate to the caller, which could discard the initial partial 1492 * sequence and process the rest, inappropriately. 1493 * 1494 * Some possible input sequences are malformed in more than one way. This 1495 * function goes to lengths to try to find all of them. This is necessary 1496 * for correctness, as the inputs may allow one malformation but not 1497 * another, and if we abandon searching for others after finding the 1498 * allowed one, we could allow in something that shouldn't have been. 1499 */ 1500 1501 if (UNLIKELY(curlen == 0)) { 1502 possible_problems |= UTF8_GOT_EMPTY; 1503 curlen = 0; 1504 uv = UNICODE_REPLACEMENT; 1505 goto ready_to_handle_errors; 1506 } 1507 1508 /* We now know we can examine the first byte of the input */ 1509 expectlen = UTF8SKIP(s); 1510 uv = *s; 1511 1512 /* A well-formed UTF-8 character, as the vast majority of calls to this 1513 * function will be for, has this expected length. For efficiency, set 1514 * things up here to return it. It will be overridden only in those rare 1515 * cases where a malformation is found */ 1516 if (retlen) { 1517 *retlen = expectlen; 1518 } 1519 1520 /* A continuation character can't start a valid sequence */ 1521 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { 1522 possible_problems |= UTF8_GOT_CONTINUATION; 1523 curlen = 1; 1524 uv = UNICODE_REPLACEMENT; 1525 goto ready_to_handle_errors; 1526 } 1527 1528 /* Here is not a continuation byte, nor an invariant. The only thing left 1529 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START 1530 * because it excludes start bytes like \xC0 that always lead to 1531 * overlongs.) */ 1532 1533 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits 1534 * that indicate the number of bytes in the character's whole UTF-8 1535 * sequence, leaving just the bits that are part of the value. */ 1536 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 1537 1538 /* Setup the loop end point, making sure to not look past the end of the 1539 * input string, and flag it as too short if the size isn't big enough. */ 1540 if (UNLIKELY(curlen < expectlen)) { 1541 possible_problems |= UTF8_GOT_SHORT; 1542 avail_len = curlen; 1543 } 1544 else { 1545 send = (U8*) s0 + expectlen; 1546 } 1547 1548 /* Now, loop through the remaining bytes in the character's sequence, 1549 * accumulating each into the working value as we go. */ 1550 for (s = s0 + 1; s < send; s++) { 1551 if (LIKELY(UTF8_IS_CONTINUATION(*s))) { 1552 uv = UTF8_ACCUMULATE(uv, *s); 1553 continue; 1554 } 1555 1556 /* Here, found a non-continuation before processing all expected bytes. 1557 * This byte indicates the beginning of a new character, so quit, even 1558 * if allowing this malformation. */ 1559 possible_problems |= UTF8_GOT_NON_CONTINUATION; 1560 break; 1561 } /* End of loop through the character's bytes */ 1562 1563 /* Save how many bytes were actually in the character */ 1564 curlen = s - s0; 1565 1566 /* Note that there are two types of too-short malformation. One is when 1567 * there is actual wrong data before the normal termination of the 1568 * sequence. The other is that the sequence wasn't complete before the end 1569 * of the data we are allowed to look at, based on the input 'curlen'. 1570 * This means that we were passed data for a partial character, but it is 1571 * valid as far as we saw. The other is definitely invalid. This 1572 * distinction could be important to a caller, so the two types are kept 1573 * separate. 1574 * 1575 * A convenience macro that matches either of the too-short conditions. */ 1576 # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) 1577 1578 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { 1579 uv_so_far = uv; 1580 uv = UNICODE_REPLACEMENT; 1581 } 1582 1583 /* Check for overflow. The algorithm requires us to not look past the end 1584 * of the current character, even if partial, so the upper limit is 's' */ 1585 if (UNLIKELY(0 < does_utf8_overflow(s0, s, 1586 1 /* Do consider overlongs */ 1587 ))) 1588 { 1589 possible_problems |= UTF8_GOT_OVERFLOW; 1590 uv = UNICODE_REPLACEMENT; 1591 } 1592 1593 /* Check for overlong. If no problems so far, 'uv' is the correct code 1594 * point value. Simply see if it is expressible in fewer bytes. Otherwise 1595 * we must look at the UTF-8 byte sequence itself to see if it is for an 1596 * overlong */ 1597 if ( ( LIKELY(! possible_problems) 1598 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) 1599 || ( UNLIKELY(possible_problems) 1600 && ( UNLIKELY(! UTF8_IS_START(*s0)) 1601 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) 1602 { 1603 possible_problems |= UTF8_GOT_LONG; 1604 1605 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT) 1606 1607 /* The calculation in the 'true' branch of this 'if' 1608 * below won't work if overflows, and isn't needed 1609 * anyway. Further below we handle all overflow 1610 * cases */ 1611 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) 1612 { 1613 UV min_uv = uv_so_far; 1614 STRLEN i; 1615 1616 /* Here, the input is both overlong and is missing some trailing 1617 * bytes. There is no single code point it could be for, but there 1618 * may be enough information present to determine if what we have 1619 * so far is for an unallowed code point, such as for a surrogate. 1620 * The code further below has the intelligence to determine this, 1621 * but just for non-overlong UTF-8 sequences. What we do here is 1622 * calculate the smallest code point the input could represent if 1623 * there were no too short malformation. Then we compute and save 1624 * the UTF-8 for that, which is what the code below looks at 1625 * instead of the raw input. It turns out that the smallest such 1626 * code point is all we need. */ 1627 for (i = curlen; i < expectlen; i++) { 1628 min_uv = UTF8_ACCUMULATE(min_uv, 1629 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE)); 1630 } 1631 1632 adjusted_s0 = temp_char_buf; 1633 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); 1634 } 1635 } 1636 1637 /* Here, we have found all the possible problems, except for when the input 1638 * is for a problematic code point not allowed by the input parameters. */ 1639 1640 /* uv is valid for overlongs */ 1641 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) 1642 && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) 1643 || ( UNLIKELY(possible_problems) 1644 1645 /* if overflow, we know without looking further 1646 * precisely which of the problematic types it is, 1647 * and we deal with those in the overflow handling 1648 * code */ 1649 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) 1650 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) 1651 || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) 1652 && ((flags & ( UTF8_DISALLOW_NONCHAR 1653 |UTF8_DISALLOW_SURROGATE 1654 |UTF8_DISALLOW_SUPER 1655 |UTF8_DISALLOW_PERL_EXTENDED 1656 |UTF8_WARN_NONCHAR 1657 |UTF8_WARN_SURROGATE 1658 |UTF8_WARN_SUPER 1659 |UTF8_WARN_PERL_EXTENDED)))) 1660 { 1661 /* If there were no malformations, or the only malformation is an 1662 * overlong, 'uv' is valid */ 1663 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) { 1664 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { 1665 possible_problems |= UTF8_GOT_SURROGATE; 1666 } 1667 else if (UNLIKELY(UNICODE_IS_SUPER(uv))) { 1668 possible_problems |= UTF8_GOT_SUPER; 1669 } 1670 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) { 1671 possible_problems |= UTF8_GOT_NONCHAR; 1672 } 1673 } 1674 else { /* Otherwise, need to look at the source UTF-8, possibly 1675 adjusted to be non-overlong */ 1676 1677 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0) 1678 > UTF_START_BYTE_110000_)) 1679 { 1680 possible_problems |= UTF8_GOT_SUPER; 1681 } 1682 else if (curlen > 1) { 1683 if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0) 1684 == UTF_START_BYTE_110000_ 1685 && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)) 1686 >= UTF_FIRST_CONT_BYTE_110000_)) 1687 { 1688 possible_problems |= UTF8_GOT_SUPER; 1689 } 1690 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) { 1691 possible_problems |= UTF8_GOT_SURROGATE; 1692 } 1693 } 1694 1695 /* We need a complete well-formed UTF-8 character to discern 1696 * non-characters, so can't look for them here */ 1697 } 1698 } 1699 1700 ready_to_handle_errors: 1701 1702 /* At this point: 1703 * curlen contains the number of bytes in the sequence that 1704 * this call should advance the input by. 1705 * avail_len gives the available number of bytes passed in, but 1706 * only if this is less than the expected number of 1707 * bytes, based on the code point's start byte. 1708 * possible_problems is 0 if there weren't any problems; otherwise a bit 1709 * is set in it for each potential problem found. 1710 * uv contains the code point the input sequence 1711 * represents; or if there is a problem that prevents 1712 * a well-defined value from being computed, it is 1713 * some substitute value, typically the REPLACEMENT 1714 * CHARACTER. 1715 * s0 points to the first byte of the character 1716 * s points to just after where we left off processing 1717 * the character 1718 * send points to just after where that character should 1719 * end, based on how many bytes the start byte tells 1720 * us should be in it, but no further than s0 + 1721 * avail_len 1722 */ 1723 1724 if (UNLIKELY(possible_problems)) { 1725 bool disallowed = FALSE; 1726 const U32 orig_problems = possible_problems; 1727 1728 if (msgs) { 1729 *msgs = NULL; 1730 } 1731 1732 while (possible_problems) { /* Handle each possible problem */ 1733 U32 pack_warn = 0; 1734 char * message = NULL; 1735 U32 this_flag_bit = 0; 1736 1737 /* Each 'if' clause handles one problem. They are ordered so that 1738 * the first ones' messages will be displayed before the later 1739 * ones; this is kinda in decreasing severity order. But the 1740 * overlong must come last, as it changes 'uv' looked at by the 1741 * others */ 1742 if (possible_problems & UTF8_GOT_OVERFLOW) { 1743 1744 /* Overflow means also got a super and are using Perl's 1745 * extended UTF-8, but we handle all three cases here */ 1746 possible_problems 1747 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED); 1748 *errors |= UTF8_GOT_OVERFLOW; 1749 1750 /* But the API says we flag all errors found */ 1751 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) { 1752 *errors |= UTF8_GOT_SUPER; 1753 } 1754 if (flags 1755 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED)) 1756 { 1757 *errors |= UTF8_GOT_PERL_EXTENDED; 1758 } 1759 1760 /* Disallow if any of the three categories say to */ 1761 if ( ! (flags & UTF8_ALLOW_OVERFLOW) 1762 || (flags & ( UTF8_DISALLOW_SUPER 1763 |UTF8_DISALLOW_PERL_EXTENDED))) 1764 { 1765 disallowed = TRUE; 1766 } 1767 1768 /* Likewise, warn if any say to */ 1769 if ( ! (flags & UTF8_ALLOW_OVERFLOW) 1770 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) 1771 { 1772 1773 /* The warnings code explicitly says it doesn't handle the 1774 * case of packWARN2 and two categories which have 1775 * parent-child relationship. Even if it works now to 1776 * raise the warning if either is enabled, it wouldn't 1777 * necessarily do so in the future. We output (only) the 1778 * most dire warning */ 1779 if (! (flags & UTF8_CHECK_ONLY)) { 1780 if (msgs || ckWARN_d(WARN_UTF8)) { 1781 pack_warn = packWARN(WARN_UTF8); 1782 } 1783 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) { 1784 pack_warn = packWARN(WARN_NON_UNICODE); 1785 } 1786 if (pack_warn) { 1787 message = Perl_form(aTHX_ "%s: %s (overflows)", 1788 malformed_text, 1789 _byte_dump_string(s0, curlen, 0)); 1790 this_flag_bit = UTF8_GOT_OVERFLOW; 1791 } 1792 } 1793 } 1794 } 1795 else if (possible_problems & UTF8_GOT_EMPTY) { 1796 possible_problems &= ~UTF8_GOT_EMPTY; 1797 *errors |= UTF8_GOT_EMPTY; 1798 1799 if (! (flags & UTF8_ALLOW_EMPTY)) { 1800 1801 /* This so-called malformation is now treated as a bug in 1802 * the caller. If you have nothing to decode, skip calling 1803 * this function */ 1804 assert(0); 1805 1806 disallowed = TRUE; 1807 if ( (msgs 1808 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1809 { 1810 pack_warn = packWARN(WARN_UTF8); 1811 message = Perl_form(aTHX_ "%s (empty string)", 1812 malformed_text); 1813 this_flag_bit = UTF8_GOT_EMPTY; 1814 } 1815 } 1816 } 1817 else if (possible_problems & UTF8_GOT_CONTINUATION) { 1818 possible_problems &= ~UTF8_GOT_CONTINUATION; 1819 *errors |= UTF8_GOT_CONTINUATION; 1820 1821 if (! (flags & UTF8_ALLOW_CONTINUATION)) { 1822 disallowed = TRUE; 1823 if (( msgs 1824 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1825 { 1826 pack_warn = packWARN(WARN_UTF8); 1827 message = Perl_form(aTHX_ 1828 "%s: %s (unexpected continuation byte 0x%02x," 1829 " with no preceding start byte)", 1830 malformed_text, 1831 _byte_dump_string(s0, 1, 0), *s0); 1832 this_flag_bit = UTF8_GOT_CONTINUATION; 1833 } 1834 } 1835 } 1836 else if (possible_problems & UTF8_GOT_SHORT) { 1837 possible_problems &= ~UTF8_GOT_SHORT; 1838 *errors |= UTF8_GOT_SHORT; 1839 1840 if (! (flags & UTF8_ALLOW_SHORT)) { 1841 disallowed = TRUE; 1842 if (( msgs 1843 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1844 { 1845 pack_warn = packWARN(WARN_UTF8); 1846 message = Perl_form(aTHX_ 1847 "%s: %s (too short; %d byte%s available, need %d)", 1848 malformed_text, 1849 _byte_dump_string(s0, send - s0, 0), 1850 (int)avail_len, 1851 avail_len == 1 ? "" : "s", 1852 (int)expectlen); 1853 this_flag_bit = UTF8_GOT_SHORT; 1854 } 1855 } 1856 1857 } 1858 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) { 1859 possible_problems &= ~UTF8_GOT_NON_CONTINUATION; 1860 *errors |= UTF8_GOT_NON_CONTINUATION; 1861 1862 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { 1863 disallowed = TRUE; 1864 if (( msgs 1865 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1866 { 1867 1868 /* If we don't know for sure that the input length is 1869 * valid, avoid as much as possible reading past the 1870 * end of the buffer */ 1871 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN) 1872 ? (int) (s - s0) 1873 : (int) (send - s0); 1874 pack_warn = packWARN(WARN_UTF8); 1875 message = Perl_form(aTHX_ "%s", 1876 unexpected_non_continuation_text(s0, 1877 printlen, 1878 s - s0, 1879 (int) expectlen)); 1880 this_flag_bit = UTF8_GOT_NON_CONTINUATION; 1881 } 1882 } 1883 } 1884 else if (possible_problems & UTF8_GOT_SURROGATE) { 1885 possible_problems &= ~UTF8_GOT_SURROGATE; 1886 1887 if (flags & UTF8_WARN_SURROGATE) { 1888 *errors |= UTF8_GOT_SURROGATE; 1889 1890 if ( ! (flags & UTF8_CHECK_ONLY) 1891 && (msgs || ckWARN_d(WARN_SURROGATE))) 1892 { 1893 pack_warn = packWARN(WARN_SURROGATE); 1894 1895 /* These are the only errors that can occur with a 1896 * surrogate when the 'uv' isn't valid */ 1897 if (orig_problems & UTF8_GOT_TOO_SHORT) { 1898 message = Perl_form(aTHX_ 1899 "UTF-16 surrogate (any UTF-8 sequence that" 1900 " starts with \"%s\" is for a surrogate)", 1901 _byte_dump_string(s0, curlen, 0)); 1902 } 1903 else { 1904 message = Perl_form(aTHX_ surrogate_cp_format, uv); 1905 } 1906 this_flag_bit = UTF8_GOT_SURROGATE; 1907 } 1908 } 1909 1910 if (flags & UTF8_DISALLOW_SURROGATE) { 1911 disallowed = TRUE; 1912 *errors |= UTF8_GOT_SURROGATE; 1913 } 1914 } 1915 else if (possible_problems & UTF8_GOT_SUPER) { 1916 possible_problems &= ~UTF8_GOT_SUPER; 1917 1918 if (flags & UTF8_WARN_SUPER) { 1919 *errors |= UTF8_GOT_SUPER; 1920 1921 if ( ! (flags & UTF8_CHECK_ONLY) 1922 && (msgs || ckWARN_d(WARN_NON_UNICODE))) 1923 { 1924 pack_warn = packWARN(WARN_NON_UNICODE); 1925 1926 if (orig_problems & UTF8_GOT_TOO_SHORT) { 1927 message = Perl_form(aTHX_ 1928 "Any UTF-8 sequence that starts with" 1929 " \"%s\" is for a non-Unicode code point," 1930 " may not be portable", 1931 _byte_dump_string(s0, curlen, 0)); 1932 } 1933 else { 1934 message = Perl_form(aTHX_ super_cp_format, uv); 1935 } 1936 this_flag_bit = UTF8_GOT_SUPER; 1937 } 1938 } 1939 1940 /* Test for Perl's extended UTF-8 after the regular SUPER ones, 1941 * and before possibly bailing out, so that the more dire 1942 * warning will override the regular one. */ 1943 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) { 1944 if ( ! (flags & UTF8_CHECK_ONLY) 1945 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) 1946 && (msgs || ( ckWARN_d(WARN_NON_UNICODE) 1947 || ckWARN(WARN_PORTABLE)))) 1948 { 1949 pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); 1950 1951 /* If it is an overlong that evaluates to a code point 1952 * that doesn't have to use the Perl extended UTF-8, it 1953 * still used it, and so we output a message that 1954 * doesn't refer to the code point. The same is true 1955 * if there was a SHORT malformation where the code 1956 * point is not valid. In that case, 'uv' will have 1957 * been set to the REPLACEMENT CHAR, and the message 1958 * below without the code point in it will be selected 1959 * */ 1960 if (UNICODE_IS_PERL_EXTENDED(uv)) { 1961 message = Perl_form(aTHX_ 1962 PL_extended_cp_format, uv); 1963 } 1964 else { 1965 message = Perl_form(aTHX_ 1966 "Any UTF-8 sequence that starts with" 1967 " \"%s\" is a Perl extension, and" 1968 " so is not portable", 1969 _byte_dump_string(s0, curlen, 0)); 1970 } 1971 this_flag_bit = UTF8_GOT_PERL_EXTENDED; 1972 } 1973 1974 if (flags & ( UTF8_WARN_PERL_EXTENDED 1975 |UTF8_DISALLOW_PERL_EXTENDED)) 1976 { 1977 *errors |= UTF8_GOT_PERL_EXTENDED; 1978 1979 if (flags & UTF8_DISALLOW_PERL_EXTENDED) { 1980 disallowed = TRUE; 1981 } 1982 } 1983 } 1984 1985 if (flags & UTF8_DISALLOW_SUPER) { 1986 *errors |= UTF8_GOT_SUPER; 1987 disallowed = TRUE; 1988 } 1989 } 1990 else if (possible_problems & UTF8_GOT_NONCHAR) { 1991 possible_problems &= ~UTF8_GOT_NONCHAR; 1992 1993 if (flags & UTF8_WARN_NONCHAR) { 1994 *errors |= UTF8_GOT_NONCHAR; 1995 1996 if ( ! (flags & UTF8_CHECK_ONLY) 1997 && (msgs || ckWARN_d(WARN_NONCHAR))) 1998 { 1999 /* The code above should have guaranteed that we don't 2000 * get here with errors other than overlong */ 2001 assert (! (orig_problems 2002 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR))); 2003 2004 pack_warn = packWARN(WARN_NONCHAR); 2005 message = Perl_form(aTHX_ nonchar_cp_format, uv); 2006 this_flag_bit = UTF8_GOT_NONCHAR; 2007 } 2008 } 2009 2010 if (flags & UTF8_DISALLOW_NONCHAR) { 2011 disallowed = TRUE; 2012 *errors |= UTF8_GOT_NONCHAR; 2013 } 2014 } 2015 else if (possible_problems & UTF8_GOT_LONG) { 2016 possible_problems &= ~UTF8_GOT_LONG; 2017 *errors |= UTF8_GOT_LONG; 2018 2019 if (flags & UTF8_ALLOW_LONG) { 2020 2021 /* We don't allow the actual overlong value, unless the 2022 * special extra bit is also set */ 2023 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE 2024 & ~UTF8_ALLOW_LONG))) 2025 { 2026 uv = UNICODE_REPLACEMENT; 2027 } 2028 } 2029 else { 2030 disallowed = TRUE; 2031 2032 if (( msgs 2033 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 2034 { 2035 pack_warn = packWARN(WARN_UTF8); 2036 2037 /* These error types cause 'uv' to be something that 2038 * isn't what was intended, so can't use it in the 2039 * message. The other error types either can't 2040 * generate an overlong, or else the 'uv' is valid */ 2041 if (orig_problems & 2042 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) 2043 { 2044 message = Perl_form(aTHX_ 2045 "%s: %s (any UTF-8 sequence that starts" 2046 " with \"%s\" is overlong which can and" 2047 " should be represented with a" 2048 " different, shorter sequence)", 2049 malformed_text, 2050 _byte_dump_string(s0, send - s0, 0), 2051 _byte_dump_string(s0, curlen, 0)); 2052 } 2053 else { 2054 U8 tmpbuf[UTF8_MAXBYTES+1]; 2055 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, 2056 uv, 0); 2057 /* Don't use U+ for non-Unicode code points, which 2058 * includes those in the Latin1 range */ 2059 const char * preface = ( UNICODE_IS_SUPER(uv) 2060 #ifdef EBCDIC 2061 || uv <= 0xFF 2062 #endif 2063 ) 2064 ? "0x" 2065 : "U+"; 2066 message = Perl_form(aTHX_ 2067 "%s: %s (overlong; instead use %s to represent" 2068 " %s%0*" UVXf ")", 2069 malformed_text, 2070 _byte_dump_string(s0, send - s0, 0), 2071 _byte_dump_string(tmpbuf, e - tmpbuf, 0), 2072 preface, 2073 ((uv < 256) ? 2 : 4), /* Field width of 2 for 2074 small code points */ 2075 UNI_TO_NATIVE(uv)); 2076 } 2077 this_flag_bit = UTF8_GOT_LONG; 2078 } 2079 } 2080 } /* End of looking through the possible flags */ 2081 2082 /* Display the message (if any) for the problem being handled in 2083 * this iteration of the loop */ 2084 if (message) { 2085 if (msgs) { 2086 assert(this_flag_bit); 2087 2088 if (*msgs == NULL) { 2089 *msgs = newAV(); 2090 } 2091 2092 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message, 2093 pack_warn, 2094 this_flag_bit))); 2095 } 2096 else if (PL_op) 2097 Perl_warner(aTHX_ pack_warn, "%s in %s", message, 2098 OP_DESC(PL_op)); 2099 else 2100 Perl_warner(aTHX_ pack_warn, "%s", message); 2101 } 2102 } /* End of 'while (possible_problems)' */ 2103 2104 /* Since there was a possible problem, the returned length may need to 2105 * be changed from the one stored at the beginning of this function. 2106 * Instead of trying to figure out if it has changed, just do it. */ 2107 if (retlen) { 2108 *retlen = curlen; 2109 } 2110 2111 if (disallowed) { 2112 if (flags & UTF8_CHECK_ONLY && retlen) { 2113 *retlen = ((STRLEN) -1); 2114 } 2115 return 0; 2116 } 2117 } 2118 2119 return UNI_TO_NATIVE(uv); 2120 } 2121 2122 /* 2123 =for apidoc utf8_to_uvchr_buf 2124 2125 Returns the native code point of the first character in the string C<s> which 2126 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>. 2127 C<*retlen> will be set to the length, in bytes, of that character. 2128 2129 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are 2130 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't 2131 C<NULL>) to -1. If those warnings are off, the computed value, if well-defined 2132 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and 2133 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is 2134 the next possible position in C<s> that could begin a non-malformed character. 2135 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is 2136 returned. 2137 2138 =cut 2139 2140 Also implemented as a macro in utf8.h 2141 2142 */ 2143 2144 2145 UV 2146 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 2147 { 2148 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; 2149 2150 return utf8_to_uvchr_buf_helper(s, send, retlen); 2151 } 2152 2153 /* 2154 =for apidoc utf8_length 2155 2156 Returns the number of characters in the sequence of UTF-8-encoded bytes starting 2157 at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the 2158 same place, it returns 0 with no warning raised. 2159 2160 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning 2161 and returns the number of valid characters. 2162 2163 =cut 2164 2165 For long strings we process the input word-at-a-time, and count 2166 continuations, instead of otherwise counting characters and using UTF8SKIP 2167 to find the next one. If our input were 13-byte characters, the per-word 2168 would be a loser, as we would be doing things in 8 byte chunks (or 4 on a 2169 32-bit platform). But the maximum legal Unicode code point is 4 bytes, and 2170 most text will have a significant number of 1 and 2 byte characters, so the 2171 per-word is generally a winner. 2172 2173 There are start-up and finish costs with the per-word method, so we use the 2174 standard method unless the input has a relatively large length. 2175 */ 2176 2177 STRLEN 2178 Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e) 2179 { 2180 STRLEN continuations = 0; 2181 STRLEN len = 0; 2182 const U8 * s = s0; 2183 2184 PERL_ARGS_ASSERT_UTF8_LENGTH; 2185 2186 /* For EBCDCIC and short strings, we count the characters. The boundary 2187 * was determined by eyeballing the output of Porting/bench.pl and 2188 * choosing a number where the continuations method gave better results (on 2189 * a 64 bit system, khw not having access to a 32 bit system with 2190 * cachegrind). The number isn't critical, as at these sizes, the total 2191 * time spent isn't large either way */ 2192 2193 #ifndef EBCDIC 2194 2195 if (e - s0 < 96) 2196 2197 #endif 2198 2199 { 2200 while (s < e) { /* Count characters directly */ 2201 2202 /* Take extra care to not exceed 'e' (which would be undefined 2203 * behavior) should the input be malformed, with a partial 2204 * character at the end */ 2205 Ptrdiff_t expected_byte_count = UTF8SKIP(s); 2206 if (UNLIKELY(e - s < expected_byte_count)) { 2207 goto warn_and_return; 2208 } 2209 2210 len++; 2211 s += expected_byte_count; 2212 } 2213 2214 if (LIKELY(e == s)) { 2215 return len; 2216 } 2217 2218 warn_and_return: 2219 if (ckWARN_d(WARN_UTF8)) { 2220 if (PL_op) 2221 Perl_warner(aTHX_ packWARN(WARN_UTF8), 2222 "%s in %s", unees, OP_DESC(PL_op)); 2223 else 2224 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", unees); 2225 } 2226 2227 return s - s0; 2228 } 2229 2230 #ifndef EBCDIC 2231 2232 /* Count continuations, word-at-a-time. 2233 * 2234 * We need to stop before the final start character in order to 2235 * preserve the limited error checking that's always been done */ 2236 const U8 * e_limit = e - UTF8_MAXBYTES; 2237 2238 /* Points to the first byte >=s which is positioned at a word boundary. If 2239 * s is on a word boundary, it is s, otherwise it is to the next word. */ 2240 const U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 2241 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK); 2242 2243 /* Process up to a full word boundary. */ 2244 while (s < partial_word_end) { 2245 const Size_t skip = UTF8SKIP(s); 2246 2247 continuations += skip - 1; 2248 s += skip; 2249 } 2250 2251 /* Adjust back down any overshoot */ 2252 continuations -= s - partial_word_end; 2253 s = partial_word_end; 2254 2255 do { /* Process per-word */ 2256 2257 /* The idea for counting continuation bytes came from 2258 * http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html 2259 * One thing it does that this doesn't is to prefetch the buffer 2260 * __builtin_prefetch(&s[256], 0, 0); 2261 * 2262 * A continuation byte has the upper 2 bits be '10', and the rest 2263 * dont-cares. The VARIANTS mask zeroes out all but the upper bit of 2264 * each byte in the word. That gets shifted to the byte's lowest bit, 2265 * and 'anded' with the complement of the 2nd highest bit of the byte, 2266 * which has also been shifted to that position. Hence the bit in that 2267 * position will be 1 iff the upper bit is 1 and the next one is 0. We 2268 * then use the same integer multiplcation and shifting that are used 2269 * in variant_under_utf8_count() to count how many of those are set in 2270 * the word. */ 2271 2272 continuations += (((((* (const PERL_UINTMAX_T *) s) 2273 & PERL_VARIANTS_WORD_MASK) >> 7) 2274 & (((~ (* (const PERL_UINTMAX_T *) s))) >> 6)) 2275 * PERL_COUNT_MULTIPLIER) 2276 >> ((PERL_WORDSIZE - 1) * CHARBITS); 2277 s += PERL_WORDSIZE; 2278 } while (s + PERL_WORDSIZE <= e_limit); 2279 2280 /* Process remainder per-byte */ 2281 while (s < e) { 2282 if (UTF8_IS_CONTINUATION(*s)) { 2283 continuations++; 2284 s++; 2285 continue; 2286 } 2287 2288 /* Here is a starter byte. Use UTF8SKIP from now on */ 2289 do { 2290 Ptrdiff_t expected_byte_count = UTF8SKIP(s); 2291 if (UNLIKELY(e - s < expected_byte_count)) { 2292 break; 2293 } 2294 2295 continuations += expected_byte_count- 1; 2296 s += expected_byte_count; 2297 } while (s < e); 2298 2299 break; 2300 } 2301 2302 # endif 2303 2304 if (LIKELY(e == s)) { 2305 return s - s0 - continuations; 2306 } 2307 2308 /* Convert to characters */ 2309 s -= continuations; 2310 2311 goto warn_and_return; 2312 } 2313 2314 /* 2315 =for apidoc bytes_cmp_utf8 2316 2317 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the 2318 sequence of characters (stored as UTF-8) 2319 in C<u>, C<ulen>. Returns 0 if they are 2320 equal, -1 or -2 if the first string is less than the second string, +1 or +2 2321 if the first string is greater than the second string. 2322 2323 -1 or +1 is returned if the shorter string was identical to the start of the 2324 longer string. -2 or +2 is returned if 2325 there was a difference between characters 2326 within the strings. 2327 2328 =cut 2329 */ 2330 2331 int 2332 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) 2333 { 2334 const U8 *const bend = b + blen; 2335 const U8 *const uend = u + ulen; 2336 2337 PERL_ARGS_ASSERT_BYTES_CMP_UTF8; 2338 2339 while (b < bend && u < uend) { 2340 U8 c = *u++; 2341 if (!UTF8_IS_INVARIANT(c)) { 2342 if (UTF8_IS_DOWNGRADEABLE_START(c)) { 2343 if (u < uend) { 2344 U8 c1 = *u++; 2345 if (UTF8_IS_CONTINUATION(c1)) { 2346 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1); 2347 } else { 2348 /* diag_listed_as: Malformed UTF-8 character%s */ 2349 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 2350 "%s %s%s", 2351 unexpected_non_continuation_text(u - 2, 2, 1, 2), 2352 PL_op ? " in " : "", 2353 PL_op ? OP_DESC(PL_op) : ""); 2354 return -2; 2355 } 2356 } else { 2357 if (PL_op) 2358 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 2359 "%s in %s", unees, OP_DESC(PL_op)); 2360 else 2361 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); 2362 return -2; /* Really want to return undef :-) */ 2363 } 2364 } else { 2365 return -2; 2366 } 2367 } 2368 if (*b != c) { 2369 return *b < c ? -2 : +2; 2370 } 2371 ++b; 2372 } 2373 2374 if (b == bend && u == uend) 2375 return 0; 2376 2377 return b < bend ? +1 : -1; 2378 } 2379 2380 /* 2381 =for apidoc utf8_to_bytes 2382 2383 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding. 2384 Unlike L</bytes_to_utf8>, this over-writes the original string, and 2385 updates C<*lenp> to contain the new length. 2386 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1. 2387 2388 Upon successful return, the number of variants in the string can be computed by 2389 having saved the value of C<*lenp> before the call, and subtracting the 2390 after-call value of C<*lenp> from it. 2391 2392 If you need a copy of the string, see L</bytes_from_utf8>. 2393 2394 =cut 2395 */ 2396 2397 U8 * 2398 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) 2399 { 2400 U8 * first_variant; 2401 2402 PERL_ARGS_ASSERT_UTF8_TO_BYTES; 2403 PERL_UNUSED_CONTEXT; 2404 2405 /* This is a no-op if no variants at all in the input */ 2406 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) { 2407 return s; 2408 } 2409 2410 /* Nothing before 'first_variant' needs to be changed, so start the real 2411 * work there */ 2412 2413 U8 * const save = s; 2414 U8 * const send = s + *lenp; 2415 U8 * d; 2416 2417 #ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */ 2418 2419 /* There is some start-up/tear-down overhead with this, so no real gain 2420 * unless the string is long enough. The current value is just a 2421 * guess. */ 2422 if (*lenp > 5 * PERL_WORDSIZE) { 2423 2424 /* First, go through the string a word at-a-time to verify that it is 2425 * downgradable. If it contains any start byte besides C2 and C3, then 2426 * it isn't. */ 2427 2428 const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0; 2429 const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2; 2430 const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE; 2431 2432 /* Points to the first byte >=s which is positioned at a word boundary. 2433 * If s is on a word boundary, it is s, otherwise it is the first byte 2434 * of the next word. */ 2435 U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 2436 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK); 2437 2438 /* Here there is at least a full word beyond the first word boundary. 2439 * Process up to that boundary. */ 2440 while (s < partial_word_end) { 2441 if (! UTF8_IS_INVARIANT(*s)) { 2442 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { 2443 *lenp = ((STRLEN) -1); 2444 return NULL; 2445 } 2446 s++; 2447 } 2448 s++; 2449 } 2450 2451 /* Adjust back down any overshoot */ 2452 s = partial_word_end; 2453 2454 /* Process per-word */ 2455 do { 2456 2457 PERL_UINTMAX_T C2_C3_start_bytes; 2458 2459 /* First find the bytes that are start bytes. ANDing with 2460 * C0C0...C0 causes any start byte to become C0; any other byte 2461 * becomes something else. Then XORing with C0 causes any start 2462 * byte to become 0; all other bytes non-zero. */ 2463 PERL_UINTMAX_T start_bytes 2464 = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask; 2465 2466 /* These shifts causes the most significant bit to be set to 1 for 2467 * any bytes in the word that aren't completely 0. Hence after 2468 * these, only the start bytes have 0 in their msb */ 2469 start_bytes |= start_bytes << 1; 2470 start_bytes |= start_bytes << 2; 2471 start_bytes |= start_bytes << 4; 2472 2473 /* When we complement, then AND with 8080...80, the start bytes 2474 * will have 1 in their msb, and all other bits are 0 */ 2475 start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK; 2476 2477 /* Now repeat the procedure, but look for bytes that match only 2478 * C2-C3. */ 2479 C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask) 2480 ^ C2_mask; 2481 C2_C3_start_bytes |= C2_C3_start_bytes << 1; 2482 C2_C3_start_bytes |= C2_C3_start_bytes << 2; 2483 C2_C3_start_bytes |= C2_C3_start_bytes << 4; 2484 C2_C3_start_bytes = ~ C2_C3_start_bytes 2485 & PERL_VARIANTS_WORD_MASK; 2486 2487 /* Here, start_bytes has a 1 in the msb of each byte that has a 2488 * start_byte; And 2489 * C2_C3_start_bytes has a 1 in the msb of each byte that has a 2490 * start_byte of C2 or C3 2491 * If they're not equal, there are start bytes that aren't C2 2492 * nor C3, hence this is not downgradable */ 2493 if (start_bytes != C2_C3_start_bytes) { 2494 *lenp = ((STRLEN) -1); 2495 return NULL; 2496 } 2497 2498 s += PERL_WORDSIZE; 2499 } while (s + PERL_WORDSIZE <= send); 2500 2501 /* If the final byte was a start byte, it means that the character 2502 * straddles two words, so back off one to start looking below at the 2503 * first byte of the character */ 2504 if (s > first_variant && UTF8_IS_START(*(s-1))) { 2505 s--; 2506 } 2507 } 2508 2509 #endif 2510 2511 /* Do the straggler bytes beyond the final word boundary (or all bytes 2512 * in the case of EBCDIC) */ 2513 while (s < send) { 2514 if (! UTF8_IS_INVARIANT(*s)) { 2515 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { 2516 *lenp = ((STRLEN) -1); 2517 return NULL; 2518 } 2519 s++; 2520 } 2521 s++; 2522 } 2523 2524 /* Here, we passed the tests above. For the EBCDIC case, everything 2525 * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC, 2526 * it means only that all start bytes were C2 or C3, hence any 2527 * well-formed sequences are downgradable. But we didn't test, for 2528 * example, that there weren't two C2's in a row. That means that in 2529 * the loop below, we have to be sure things are well-formed. Because 2530 * this is very very likely, and we don't care about having speedy 2531 * handling of malformed input, the loop proceeds as if well formed, 2532 * and should a malformed one come along, it undoes what it already has 2533 * done */ 2534 2535 d = s = first_variant; 2536 2537 while (s < send) { 2538 U8 * s1; 2539 2540 if (UVCHR_IS_INVARIANT(*s)) { 2541 *d++ = *s++; 2542 continue; 2543 } 2544 2545 /* Here it is two-byte encoded. */ 2546 if ( LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s)) 2547 && LIKELY(UTF8_IS_CONTINUATION((s[1])))) 2548 { 2549 U8 first_byte = *s++; 2550 *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s); 2551 s++; 2552 continue; 2553 } 2554 2555 /* Here, it is malformed. This shouldn't happen on EBCDIC, and on 2556 * ASCII platforms, we know that the only start bytes in the text 2557 * are C2 and C3, and the code above has made sure that it doesn't 2558 * end with a start byte. That means the only malformations that 2559 * are possible are a start byte without a continuation (either 2560 * followed by another start byte or an invariant) or an unexpected 2561 * continuation. 2562 * 2563 * We have to undo all we've done before, back down to the first 2564 * UTF-8 variant. Note that each 2-byte variant we've done so far 2565 * (converted to single byte) slides things to the left one byte, 2566 * and so we have bytes that haven't been written over. 2567 * 2568 * Here, 'd' points to the next position to overwrite, and 's' 2569 * points to the first invalid byte. That means 'd's contents 2570 * haven't been changed yet, nor has anything else beyond it in the 2571 * string. In restoring to the original contents, we don't need to 2572 * do anything past (d-1). 2573 * 2574 * In particular, the bytes from 'd' to 's' have not been changed. 2575 * This loop uses a new variable 's1' (to avoid confusing 'source' 2576 * and 'destination') set to 'd', and moves 's' and 's1' in lock 2577 * step back so that afterwards, 's1' points to the first changed 2578 * byte that will be the source for the first byte (or bytes) at 2579 * 's' that need to be changed back. Note that s1 can expand to 2580 * two bytes */ 2581 s1 = d; 2582 while (s >= d) { 2583 s--; 2584 if (! UVCHR_IS_INVARIANT(*s1)) { 2585 s--; 2586 } 2587 s1--; 2588 } 2589 2590 /* Do the changing back */ 2591 while (s1 >= first_variant) { 2592 if (UVCHR_IS_INVARIANT(*s1)) { 2593 *s-- = *s1--; 2594 } 2595 else { 2596 *s-- = UTF8_EIGHT_BIT_LO(*s1); 2597 *s-- = UTF8_EIGHT_BIT_HI(*s1); 2598 s1--; 2599 } 2600 } 2601 2602 *lenp = ((STRLEN) -1); 2603 return NULL; 2604 } 2605 2606 /* Success! */ 2607 *d = '\0'; 2608 *lenp = d - save; 2609 2610 return save; 2611 } 2612 2613 /* 2614 =for apidoc bytes_from_utf8 2615 2616 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native 2617 byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is 2618 actually encoded in UTF-8. 2619 2620 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of 2621 the input string. 2622 2623 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string 2624 not expressible in native byte encoding. In these cases, C<*is_utf8p> and 2625 C<*lenp> are unchanged, and the return value is the original C<s>. 2626 2627 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a 2628 newly created string containing a downgraded copy of C<s>, and whose length is 2629 returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The 2630 caller is responsible for arranging for the memory used by this string to get 2631 freed. 2632 2633 Upon successful return, the number of variants in the string can be computed by 2634 having saved the value of C<*lenp> before the call, and subtracting the 2635 after-call value of C<*lenp> from it. 2636 2637 =cut 2638 2639 There is a macro that avoids this function call, but this is retained for 2640 anyone who calls it with the Perl_ prefix */ 2641 2642 U8 * 2643 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) 2644 { 2645 PERL_ARGS_ASSERT_BYTES_FROM_UTF8; 2646 PERL_UNUSED_CONTEXT; 2647 2648 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL); 2649 } 2650 2651 /* 2652 =for apidoc bytes_from_utf8_loc 2653 2654 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer 2655 to where to store the location of the first character in C<"s"> that cannot be 2656 converted to non-UTF8. 2657 2658 If that parameter is C<NULL>, this function behaves identically to 2659 C<bytes_from_utf8>. 2660 2661 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to 2662 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>. 2663 2664 Otherwise, the function returns a newly created C<NUL>-terminated string 2665 containing the non-UTF8 equivalent of the convertible first portion of 2666 C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>. 2667 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value, 2668 and C<*first_non_downgradable> is set to C<NULL>. 2669 2670 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the 2671 first character in the original string that wasn't converted. C<*is_utf8p> is 2672 unchanged. Note that the new string may have length 0. 2673 2674 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and 2675 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and 2676 converts as many characters in it as possible stopping at the first one it 2677 finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is 2678 set to point to that. The function returns the portion that could be converted 2679 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length, 2680 not including the terminating C<NUL>. If the very first character in the 2681 original could not be converted, C<*lenp> will be 0, and the new string will 2682 contain just a single C<NUL>. If the entire input string was converted, 2683 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>. 2684 2685 Upon successful return, the number of variants in the converted portion of the 2686 string can be computed by having saved the value of C<*lenp> before the call, 2687 and subtracting the after-call value of C<*lenp> from it. 2688 2689 =cut 2690 2691 2692 */ 2693 2694 U8 * 2695 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted) 2696 { 2697 U8 *d; 2698 const U8 *original = s; 2699 U8 *converted_start; 2700 const U8 *send = s + *lenp; 2701 2702 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC; 2703 2704 if (! *is_utf8p) { 2705 if (first_unconverted) { 2706 *first_unconverted = NULL; 2707 } 2708 2709 return (U8 *) original; 2710 } 2711 2712 Newx(d, (*lenp) + 1, U8); 2713 2714 converted_start = d; 2715 while (s < send) { 2716 U8 c = *s++; 2717 if (! UTF8_IS_INVARIANT(c)) { 2718 2719 /* Then it is multi-byte encoded. If the code point is above 0xFF, 2720 * have to stop now */ 2721 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) { 2722 if (first_unconverted) { 2723 *first_unconverted = s - 1; 2724 goto finish_and_return; 2725 } 2726 else { 2727 Safefree(converted_start); 2728 return (U8 *) original; 2729 } 2730 } 2731 2732 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); 2733 s++; 2734 } 2735 *d++ = c; 2736 } 2737 2738 /* Here, converted the whole of the input */ 2739 *is_utf8p = FALSE; 2740 if (first_unconverted) { 2741 *first_unconverted = NULL; 2742 } 2743 2744 finish_and_return: 2745 *d = '\0'; 2746 *lenp = d - converted_start; 2747 2748 /* Trim unused space */ 2749 Renew(converted_start, *lenp + 1, U8); 2750 2751 return converted_start; 2752 } 2753 2754 /* 2755 =for apidoc bytes_to_utf8 2756 2757 Converts a string C<s> of length C<*lenp> bytes from the native encoding into 2758 UTF-8. 2759 Returns a pointer to the newly-created string, and sets C<*lenp> to 2760 reflect the new length in bytes. The caller is responsible for arranging for 2761 the memory used by this string to get freed. 2762 2763 Upon successful return, the number of variants in the string can be computed by 2764 having saved the value of C<*lenp> before the call, and subtracting it from the 2765 after-call value of C<*lenp>. 2766 2767 A C<NUL> character will be written after the end of the string. 2768 2769 If you want to convert to UTF-8 from encodings other than 2770 the native (Latin1 or EBCDIC), 2771 see L</sv_recode_to_utf8>(). 2772 2773 =cut 2774 */ 2775 2776 U8* 2777 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) 2778 { 2779 const U8 * const send = s + (*lenp); 2780 U8 *d; 2781 U8 *dst; 2782 2783 PERL_ARGS_ASSERT_BYTES_TO_UTF8; 2784 PERL_UNUSED_CONTEXT; 2785 2786 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ 2787 Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8); 2788 dst = d; 2789 2790 while (s < send) { 2791 append_utf8_from_native_byte(*s, &d); 2792 s++; 2793 } 2794 2795 *d = '\0'; 2796 *lenp = d-dst; 2797 2798 return dst; 2799 } 2800 2801 /* 2802 * Convert native UTF-16 to UTF-8. Called via the more public functions 2803 * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for 2804 * little-endian, 2805 * 2806 * 'p' is the UTF-16 input string, passed as a pointer to U8. 2807 * 'bytelen' is its length (must be even) 2808 * 'd' is the pointer to the destination buffer. The caller must ensure that 2809 * the space is large enough. The maximum expansion factor is 2 times 2810 * 'bytelen'. 1.5 if never going to run on an EBCDIC box. 2811 * '*newlen' will contain the number of bytes this function filled of 'd'. 2812 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE 2813 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE 2814 * 2815 * The expansion factor is because UTF-16 requires 2 bytes for every code point 2816 * below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code 2817 * point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for 2818 * every code point below 0x1000; otherwise 4-5 bytes. 2819 * 2820 * The worst case is where every code point is below U+10000, hence requiring 2 2821 * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8 2822 * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes. 2823 * 2824 * Do not use in-place. */ 2825 2826 U8* 2827 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen, 2828 const bool high_byte, /* Which of next two bytes is 2829 high order */ 2830 const bool low_byte) 2831 { 2832 U8* pend; 2833 U8* dstart = d; 2834 2835 PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE; 2836 2837 if (bytelen & 1) 2838 Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf, 2839 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen); 2840 pend = p + bytelen; 2841 2842 while (p < pend) { 2843 2844 /* Next 16 bits is what we want. (The bool is cast to U8 because on 2845 * platforms where a bool is implemented as a signed char, a compiler 2846 * warning may be generated) */ 2847 U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte]; 2848 p += 2; 2849 2850 /* If it's a surrogate, we find the uv that the surrogate pair encodes. 2851 * */ 2852 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { 2853 2854 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST 2855 #define LAST_HIGH_SURROGATE 0xDBFF 2856 #define FIRST_LOW_SURROGATE 0xDC00 2857 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST 2858 #define FIRST_IN_PLANE1 0x10000 2859 2860 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) { 2861 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 2862 } 2863 else { 2864 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte]; 2865 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE, 2866 LAST_LOW_SURROGATE))) 2867 { 2868 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 2869 } 2870 2871 p += 2; 2872 2873 /* Here uv is the high surrogate. Combine with low surrogate 2874 * just computed to form the actual U32 code point. 2875 * 2876 * From https://unicode.org/faq/utf_bom.html#utf16-4 */ 2877 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10) 2878 + low_surrogate - FIRST_LOW_SURROGATE; 2879 } 2880 } 2881 2882 /* Here, 'uv' is the real U32 we want to find the UTF-8 of */ 2883 d = uvchr_to_utf8(d, uv); 2884 } 2885 2886 *newlen = d - dstart; 2887 return d; 2888 } 2889 2890 U8* 2891 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) 2892 { 2893 PERL_ARGS_ASSERT_UTF16_TO_UTF8; 2894 2895 return utf16_to_utf8(p, d, bytelen, newlen); 2896 } 2897 2898 U8* 2899 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) 2900 { 2901 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; 2902 2903 return utf16_to_utf8_reversed(p, d, bytelen, newlen); 2904 } 2905 2906 /* 2907 * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for 2908 * big-endian and utf8_to_utf16_reversed() for little-endian, 2909 * 2910 * 's' is the UTF-8 input string, passed as a pointer to U8. 2911 * 'bytelen' is its length 2912 * 'd' is the pointer to the destination buffer, currently passed as U8 *. The 2913 * caller must ensure that the space is large enough. The maximum 2914 * expansion factor is 2 times 'bytelen'. This happens when the input is 2915 * entirely single-byte ASCII, expanding to two-byte UTF-16. 2916 * '*newlen' will contain the number of bytes this function filled of 'd'. 2917 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE 2918 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE 2919 * 2920 * Do not use in-place. */ 2921 U8* 2922 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen, 2923 const bool high_byte, /* Which of next two bytes 2924 is high order */ 2925 const bool low_byte) 2926 { 2927 U8* send; 2928 U8* dstart = d; 2929 2930 PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE; 2931 2932 send = s + bytelen; 2933 2934 while (s < send) { 2935 STRLEN retlen; 2936 UV uv = utf8n_to_uvchr(s, send - s, &retlen, 2937 /* No surrogates nor above-Unicode */ 2938 UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); 2939 2940 /* The modern method is to keep going with malformed input, 2941 * substituting the REPLACEMENT CHARACTER */ 2942 if (UNLIKELY(uv == 0 && *s != '\0')) { 2943 uv = UNICODE_REPLACEMENT; 2944 } 2945 2946 if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */ 2947 2948 /* From https://unicode.org/faq/utf_bom.html#utf16-4 */ 2949 U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10) 2950 + FIRST_HIGH_SURROGATE; 2951 2952 /* (The bool is cast to U8 because on platforms where a bool is 2953 * implemented as a signed char, a compiler warning may be 2954 * generated) */ 2955 d[(U8) high_byte] = high_surrogate >> 8; 2956 d[(U8) low_byte] = high_surrogate & nBIT_MASK(8); 2957 d += 2; 2958 2959 /* The low surrogate is the lower 10 bits plus the offset */ 2960 uv &= nBIT_MASK(10); 2961 uv += FIRST_LOW_SURROGATE; 2962 2963 /* Drop down to output the low surrogate like it were a 2964 * non-surrogate */ 2965 } 2966 2967 d[(U8) high_byte] = uv >> 8; 2968 d[(U8) low_byte] = uv & nBIT_MASK(8); 2969 d += 2; 2970 2971 s += retlen; 2972 } 2973 2974 *newlen = d - dstart; 2975 return d; 2976 } 2977 2978 bool 2979 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) 2980 { 2981 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c); 2982 } 2983 2984 bool 2985 Perl__is_uni_perl_idcont(pTHX_ UV c) 2986 { 2987 return _invlist_contains_cp(PL_utf8_perl_idcont, c); 2988 } 2989 2990 bool 2991 Perl__is_uni_perl_idstart(pTHX_ UV c) 2992 { 2993 return _invlist_contains_cp(PL_utf8_perl_idstart, c); 2994 } 2995 2996 UV 2997 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, 2998 const char S_or_s) 2999 { 3000 /* We have the latin1-range values compiled into the core, so just use 3001 * those, converting the result to UTF-8. The only difference between upper 3002 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is 3003 * either "SS" or "Ss". Which one to use is passed into the routine in 3004 * 'S_or_s' to avoid a test */ 3005 3006 UV converted = toUPPER_LATIN1_MOD(c); 3007 3008 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1; 3009 3010 assert(S_or_s == 'S' || S_or_s == 's'); 3011 3012 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for 3013 characters in this range */ 3014 *p = (U8) converted; 3015 *lenp = 1; 3016 return converted; 3017 } 3018 3019 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers, 3020 * which it maps to one of them, so as to only have to have one check for 3021 * it in the main case */ 3022 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { 3023 switch (c) { 3024 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 3025 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; 3026 break; 3027 case MICRO_SIGN: 3028 converted = GREEK_CAPITAL_LETTER_MU; 3029 break; 3030 #if UNICODE_MAJOR_VERSION > 2 \ 3031 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \ 3032 && UNICODE_DOT_DOT_VERSION >= 8) 3033 case LATIN_SMALL_LETTER_SHARP_S: 3034 *(p)++ = 'S'; 3035 *p = S_or_s; 3036 *lenp = 2; 3037 return 'S'; 3038 #endif 3039 default: 3040 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect" 3041 " '%c' to map to '%c'", 3042 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); 3043 NOT_REACHED; /* NOTREACHED */ 3044 } 3045 } 3046 3047 *(p)++ = UTF8_TWO_BYTE_HI(converted); 3048 *p = UTF8_TWO_BYTE_LO(converted); 3049 *lenp = 2; 3050 3051 return converted; 3052 } 3053 3054 /* If compiled on an early Unicode version, there may not be auxiliary tables 3055 * */ 3056 #ifndef HAS_UC_AUX_TABLES 3057 # define UC_AUX_TABLE_ptrs NULL 3058 # define UC_AUX_TABLE_lengths NULL 3059 #endif 3060 #ifndef HAS_TC_AUX_TABLES 3061 # define TC_AUX_TABLE_ptrs NULL 3062 # define TC_AUX_TABLE_lengths NULL 3063 #endif 3064 #ifndef HAS_LC_AUX_TABLES 3065 # define LC_AUX_TABLE_ptrs NULL 3066 # define LC_AUX_TABLE_lengths NULL 3067 #endif 3068 #ifndef HAS_CF_AUX_TABLES 3069 # define CF_AUX_TABLE_ptrs NULL 3070 # define CF_AUX_TABLE_lengths NULL 3071 #endif 3072 3073 /* Call the function to convert a UTF-8 encoded character to the specified case. 3074 * Note that there may be more than one character in the result. 3075 * 's' is a pointer to the first byte of the input character 3076 * 'd' will be set to the first byte of the string of changed characters. It 3077 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes 3078 * 'lenp' will be set to the length in bytes of the string of changed characters 3079 * 3080 * The functions return the ordinal of the first character in the string of 3081 * 'd' */ 3082 #define CALL_UPPER_CASE(uv, s, d, lenp) \ 3083 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \ 3084 Uppercase_Mapping_invmap, \ 3085 UC_AUX_TABLE_ptrs, \ 3086 UC_AUX_TABLE_lengths, \ 3087 "uppercase") 3088 #define CALL_TITLE_CASE(uv, s, d, lenp) \ 3089 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \ 3090 Titlecase_Mapping_invmap, \ 3091 TC_AUX_TABLE_ptrs, \ 3092 TC_AUX_TABLE_lengths, \ 3093 "titlecase") 3094 #define CALL_LOWER_CASE(uv, s, d, lenp) \ 3095 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \ 3096 Lowercase_Mapping_invmap, \ 3097 LC_AUX_TABLE_ptrs, \ 3098 LC_AUX_TABLE_lengths, \ 3099 "lowercase") 3100 3101 3102 /* This additionally has the input parameter 'specials', which if non-zero will 3103 * cause this to use the specials hash for folding (meaning get full case 3104 * folding); otherwise, when zero, this implies a simple case fold */ 3105 #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \ 3106 (specials) \ 3107 ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \ 3108 Case_Folding_invmap, \ 3109 CF_AUX_TABLE_ptrs, \ 3110 CF_AUX_TABLE_lengths, \ 3111 "foldcase") \ 3112 : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \ 3113 Simple_Case_Folding_invmap, \ 3114 NULL, NULL, \ 3115 "foldcase") 3116 3117 UV 3118 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) 3119 { 3120 /* Convert the Unicode character whose ordinal is <c> to its uppercase 3121 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>. 3122 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since 3123 * the changed version may be longer than the original character. 3124 * 3125 * The ordinal of the first character of the changed version is returned 3126 * (but note, as explained above, that there may be more.) */ 3127 3128 PERL_ARGS_ASSERT_TO_UNI_UPPER; 3129 3130 if (c < 256) { 3131 return _to_upper_title_latin1((U8) c, p, lenp, 'S'); 3132 } 3133 3134 return CALL_UPPER_CASE(c, NULL, p, lenp); 3135 } 3136 3137 UV 3138 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) 3139 { 3140 PERL_ARGS_ASSERT_TO_UNI_TITLE; 3141 3142 if (c < 256) { 3143 return _to_upper_title_latin1((U8) c, p, lenp, 's'); 3144 } 3145 3146 return CALL_TITLE_CASE(c, NULL, p, lenp); 3147 } 3148 3149 STATIC U8 3150 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy) 3151 { 3152 /* We have the latin1-range values compiled into the core, so just use 3153 * those, converting the result to UTF-8. Since the result is always just 3154 * one character, we allow <p> to be NULL */ 3155 3156 U8 converted = toLOWER_LATIN1(c); 3157 3158 PERL_UNUSED_ARG(dummy); 3159 3160 if (p != NULL) { 3161 if (NATIVE_BYTE_IS_INVARIANT(converted)) { 3162 *p = converted; 3163 *lenp = 1; 3164 } 3165 else { 3166 /* Result is known to always be < 256, so can use the EIGHT_BIT 3167 * macros */ 3168 *p = UTF8_EIGHT_BIT_HI(converted); 3169 *(p+1) = UTF8_EIGHT_BIT_LO(converted); 3170 *lenp = 2; 3171 } 3172 } 3173 return converted; 3174 } 3175 3176 UV 3177 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) 3178 { 3179 PERL_ARGS_ASSERT_TO_UNI_LOWER; 3180 3181 if (c < 256) { 3182 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ ); 3183 } 3184 3185 return CALL_LOWER_CASE(c, NULL, p, lenp); 3186 } 3187 3188 UV 3189 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) 3190 { 3191 /* Corresponds to to_lower_latin1(); <flags> bits meanings: 3192 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 3193 * FOLD_FLAGS_FULL iff full folding is to be used; 3194 * 3195 * Not to be used for locale folds 3196 */ 3197 3198 UV converted; 3199 3200 PERL_ARGS_ASSERT__TO_FOLD_LATIN1; 3201 3202 assert (! (flags & FOLD_FLAGS_LOCALE)); 3203 3204 if (UNLIKELY(c == MICRO_SIGN)) { 3205 converted = GREEK_SMALL_LETTER_MU; 3206 } 3207 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 3208 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 3209 || UNICODE_DOT_DOT_VERSION > 0) 3210 else if ( (flags & FOLD_FLAGS_FULL) 3211 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S)) 3212 { 3213 /* If can't cross 127/128 boundary, can't return "ss"; instead return 3214 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") 3215 * under those circumstances. */ 3216 if (flags & FOLD_FLAGS_NOMIX_ASCII) { 3217 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8); 3218 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 3219 p, *lenp, U8); 3220 return LATIN_SMALL_LETTER_LONG_S; 3221 } 3222 else { 3223 *(p)++ = 's'; 3224 *p = 's'; 3225 *lenp = 2; 3226 return 's'; 3227 } 3228 } 3229 #endif 3230 else { /* In this range the fold of all other characters is their lower 3231 case */ 3232 converted = toLOWER_LATIN1(c); 3233 } 3234 3235 if (UVCHR_IS_INVARIANT(converted)) { 3236 *p = (U8) converted; 3237 *lenp = 1; 3238 } 3239 else { 3240 *(p)++ = UTF8_TWO_BYTE_HI(converted); 3241 *p = UTF8_TWO_BYTE_LO(converted); 3242 *lenp = 2; 3243 } 3244 3245 return converted; 3246 } 3247 3248 UV 3249 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) 3250 { 3251 3252 /* Not currently externally documented, and subject to change 3253 * <flags> bits meanings: 3254 * FOLD_FLAGS_FULL iff full folding is to be used; 3255 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 3256 * locale are to be used. 3257 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 3258 */ 3259 3260 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; 3261 3262 if (flags & FOLD_FLAGS_LOCALE) { 3263 /* Treat a non-Turkic UTF-8 locale as not being in locale at all, 3264 * except for potentially warning */ 3265 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 3266 if (IN_UTF8_CTYPE_LOCALE && ! IN_UTF8_TURKIC_LOCALE) { 3267 flags &= ~FOLD_FLAGS_LOCALE; 3268 } 3269 else { 3270 goto needs_full_generality; 3271 } 3272 } 3273 3274 if (c < 256) { 3275 return _to_fold_latin1((U8) c, p, lenp, 3276 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); 3277 } 3278 3279 /* Here, above 255. If no special needs, just use the macro */ 3280 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { 3281 return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL); 3282 } 3283 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with 3284 the special flags. */ 3285 U8 utf8_c[UTF8_MAXBYTES + 1]; 3286 3287 needs_full_generality: 3288 uvchr_to_utf8(utf8_c, c); 3289 return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c), 3290 p, lenp, flags); 3291 } 3292 } 3293 3294 PERL_STATIC_INLINE bool 3295 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e, 3296 SV* const invlist) 3297 { 3298 /* returns a boolean giving whether or not the UTF8-encoded character that 3299 * starts at <p>, and extending no further than <e - 1> is in the inversion 3300 * list <invlist>. */ 3301 3302 UV cp = utf8n_to_uvchr(p, e - p, NULL, 0); 3303 3304 PERL_ARGS_ASSERT_IS_UTF8_COMMON; 3305 3306 if (cp == 0 && (p >= e || *p != '\0')) { 3307 _force_out_malformed_utf8_message(p, e, 0, 1); 3308 NOT_REACHED; /* NOTREACHED */ 3309 } 3310 3311 assert(invlist); 3312 return _invlist_contains_cp(invlist, cp); 3313 } 3314 3315 #if 0 /* Not currently used, but may be needed in the future */ 3316 PERLVAR(I, seen_deprecated_macro, HV *) 3317 3318 STATIC void 3319 S_warn_on_first_deprecated_use(pTHX_ U32 category, 3320 const char * const name, 3321 const char * const alternative, 3322 const bool use_locale, 3323 const char * const file, 3324 const unsigned line) 3325 { 3326 const char * key; 3327 3328 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE; 3329 3330 if (ckWARN_d(category)) { 3331 3332 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line); 3333 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) { 3334 if (! PL_seen_deprecated_macro) { 3335 PL_seen_deprecated_macro = newHV(); 3336 } 3337 if (! hv_store(PL_seen_deprecated_macro, key, 3338 strlen(key), &PL_sv_undef, 0)) 3339 { 3340 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 3341 } 3342 3343 if (instr(file, "mathoms.c")) { 3344 Perl_warner(aTHX_ category, 3345 "In %s, line %d, starting in Perl v5.32, %s()" 3346 " will be removed. Avoid this message by" 3347 " converting to use %s().\n", 3348 file, line, name, alternative); 3349 } 3350 else { 3351 Perl_warner(aTHX_ category, 3352 "In %s, line %d, starting in Perl v5.32, %s() will" 3353 " require an additional parameter. Avoid this" 3354 " message by converting to use %s().\n", 3355 file, line, name, alternative); 3356 } 3357 } 3358 } 3359 } 3360 #endif 3361 3362 bool 3363 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) 3364 { 3365 PERL_ARGS_ASSERT__IS_UTF8_FOO; 3366 3367 return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]); 3368 } 3369 3370 bool 3371 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e) 3372 { 3373 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; 3374 3375 return is_utf8_common(p, e, PL_utf8_perl_idstart); 3376 } 3377 3378 bool 3379 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e) 3380 { 3381 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; 3382 3383 return is_utf8_common(p, e, PL_utf8_perl_idcont); 3384 } 3385 3386 STATIC UV 3387 S_to_case_cp_list(pTHX_ 3388 const UV original, 3389 const U32 ** const remaining_list, 3390 Size_t * remaining_count, 3391 SV *invlist, const I32 * const invmap, 3392 const U32 * const * const aux_tables, 3393 const U8 * const aux_table_lengths, 3394 const char * const normal) 3395 { 3396 SSize_t index; 3397 I32 base; 3398 3399 /* Calculate the changed case of code point 'original'. The first code 3400 * point of the changed case is returned. 3401 * 3402 * If 'remaining_count' is not NULL, *remaining_count will be set to how 3403 * many *other* code points are in the changed case. If non-zero and 3404 * 'remaining_list' is also not NULL, *remaining_list will be set to point 3405 * to a non-modifiable array containing the second and potentially third 3406 * code points in the changed case. (Unicode guarantees a maximum of 3.) 3407 * Note that this means that *remaining_list is undefined unless there are 3408 * multiple code points, and the caller has chosen to find out how many by 3409 * making 'remaining_count' not NULL. 3410 * 3411 * 'normal' is a string to use to name the new case in any generated 3412 * messages, as a fallback if the operation being used is not available. 3413 * 3414 * The casing to use is given by the data structures in the remaining 3415 * arguments. 3416 */ 3417 3418 PERL_ARGS_ASSERT_TO_CASE_CP_LIST; 3419 3420 /* 'index' is guaranteed to be non-negative, as this is an inversion map 3421 * that covers all possible inputs. See [perl #133365] */ 3422 index = _invlist_search(invlist, original); 3423 base = invmap[index]; 3424 3425 /* Most likely, the case change will contain just a single code point */ 3426 if (remaining_count) { 3427 *remaining_count = 0; 3428 } 3429 3430 if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */ 3431 3432 /* At this bottom level routine is where we warn about illegal code 3433 * points */ 3434 if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) { 3435 if (UNLIKELY(UNICODE_IS_SURROGATE(original))) { 3436 if (ckWARN_d(WARN_SURROGATE)) { 3437 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 3438 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 3439 "Operation \"%s\" returns its argument for" 3440 " UTF-16 surrogate U+%04" UVXf, desc, original); 3441 } 3442 } 3443 else if (UNLIKELY(UNICODE_IS_SUPER(original))) { 3444 if (UNLIKELY(original > MAX_LEGAL_CP)) { 3445 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original)); 3446 } 3447 if (ckWARN_d(WARN_NON_UNICODE)) { 3448 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 3449 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 3450 "Operation \"%s\" returns its argument for" 3451 " non-Unicode code point 0x%04" UVXf, desc, original); 3452 } 3453 } 3454 3455 /* Note that non-characters are perfectly legal, so no warning 3456 * should be given. */ 3457 } 3458 3459 return original; 3460 } 3461 3462 if (LIKELY(base > 0)) { /* means original mapped to a single code point, 3463 different from itself */ 3464 return base + original - invlist_array(invlist)[index]; 3465 } 3466 3467 /* Here 'base' is negative. That means the mapping is 1-to-many, and 3468 * requires an auxiliary table look up. abs(base) gives the index into a 3469 * list of such tables which points to the proper aux table. And a 3470 * parallel list gives the length of each corresponding aux table. Skip 3471 * the first entry in the *remaining returns, as it is returned by the 3472 * function. */ 3473 base = -base; 3474 if (remaining_count) { 3475 *remaining_count = (Size_t) (aux_table_lengths[base] - 1); 3476 3477 if (remaining_list) { 3478 *remaining_list = aux_tables[base] + 1; 3479 } 3480 } 3481 3482 return (UV) aux_tables[base][0]; 3483 } 3484 3485 STATIC UV 3486 S__to_utf8_case(pTHX_ const UV original, const U8 *p, 3487 U8* ustrp, STRLEN *lenp, 3488 SV *invlist, const I32 * const invmap, 3489 const U32 * const * const aux_tables, 3490 const U8 * const aux_table_lengths, 3491 const char * const normal) 3492 { 3493 /* Change the case of code point 'original'. If 'p' is non-NULL, it points to 3494 * the beginning of the (assumed to be valid) UTF-8 representation of 3495 * 'original'. 'normal' is a string to use to name the new case in any 3496 * generated messages, as a fallback if the operation being used is not 3497 * available. The new case is given by the data structures in the 3498 * remaining arguments. 3499 * 3500 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the 3501 * entire changed case string, and the return value is the first code point 3502 * in that string 3503 * 3504 * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes 3505 * since the changed version may be longer than the original character. */ 3506 3507 const U32 * remaining_list; 3508 Size_t remaining_count; 3509 UV first = to_case_cp_list(original, 3510 &remaining_list, &remaining_count, 3511 invlist, invmap, 3512 aux_tables, aux_table_lengths, 3513 normal); 3514 3515 PERL_ARGS_ASSERT__TO_UTF8_CASE; 3516 3517 /* If the code point maps to itself and we already have its representation, 3518 * copy it instead of recalculating */ 3519 if (original == first && p) { 3520 *lenp = UTF8SKIP(p); 3521 3522 if (p != ustrp) { /* Don't copy onto itself */ 3523 Copy(p, ustrp, *lenp, U8); 3524 } 3525 } 3526 else { 3527 U8 * d = ustrp; 3528 Size_t i; 3529 3530 d = uvchr_to_utf8(d, first); 3531 3532 for (i = 0; i < remaining_count; i++) { 3533 d = uvchr_to_utf8(d, remaining_list[i]); 3534 } 3535 3536 *d = '\0'; 3537 *lenp = d - ustrp; 3538 } 3539 3540 return first; 3541 } 3542 3543 Size_t 3544 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to, 3545 const U32 ** remaining_folds_to) 3546 { 3547 /* Returns the count of the number of code points that fold to the input 3548 * 'cp' (besides itself). 3549 * 3550 * If the return is 0, there is nothing else that folds to it, and 3551 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL. 3552 * 3553 * If the return is 1, '*first_folds_to' is set to the single code point, 3554 * and '*remaining_folds_to' is set to NULL. 3555 * 3556 * Otherwise, '*first_folds_to' is set to a code point, and 3557 * '*remaining_fold_to' is set to an array that contains the others. The 3558 * length of this array is the returned count minus 1. 3559 * 3560 * The reason for this convolution is to avoid having to deal with 3561 * allocating and freeing memory. The lists are already constructed, so 3562 * the return can point to them, but single code points aren't, so would 3563 * need to be constructed if we didn't employ something like this API 3564 * 3565 * The code points returned by this function are all legal Unicode, which 3566 * occupy at most 21 bits, and so a U32 is sufficient, and the lists are 3567 * constructed with this size (to save space and memory), and we return 3568 * pointers, so they must be this size */ 3569 3570 /* 'index' is guaranteed to be non-negative, as this is an inversion map 3571 * that covers all possible inputs. See [perl #133365] */ 3572 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); 3573 I32 base = _Perl_IVCF_invmap[index]; 3574 3575 PERL_ARGS_ASSERT__INVERSE_FOLDS; 3576 3577 if (base == 0) { /* No fold */ 3578 *first_folds_to = 0; 3579 *remaining_folds_to = NULL; 3580 return 0; 3581 } 3582 3583 #ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */ 3584 3585 assert(base > 0); 3586 3587 #else 3588 3589 if (UNLIKELY(base < 0)) { /* Folds to more than one character */ 3590 3591 /* The data structure is set up so that the absolute value of 'base' is 3592 * an index into a table of pointers to arrays, with the array 3593 * corresponding to the index being the list of code points that fold 3594 * to 'cp', and the parallel array containing the length of the list 3595 * array */ 3596 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; 3597 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; 3598 /* +1 excludes first_folds_to */ 3599 return IVCF_AUX_TABLE_lengths[-base]; 3600 } 3601 3602 #endif 3603 3604 /* Only the single code point. This works like 'fc(G) = G - A + a' */ 3605 *first_folds_to = (U32) (base + cp 3606 - invlist_array(PL_utf8_foldclosures)[index]); 3607 *remaining_folds_to = NULL; 3608 return 1; 3609 } 3610 3611 STATIC UV 3612 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, 3613 U8* const ustrp, STRLEN *lenp) 3614 { 3615 /* This is called when changing the case of a UTF-8-encoded character above 3616 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the 3617 * result contains a character that crosses the 255/256 boundary, disallow 3618 * the change, and return the original code point. See L<perlfunc/lc> for 3619 * why; 3620 * 3621 * p points to the original string whose case was changed; assumed 3622 * by this routine to be well-formed 3623 * result the code point of the first character in the changed-case string 3624 * ustrp points to the changed-case string (<result> represents its 3625 * first char) 3626 * lenp points to the length of <ustrp> */ 3627 3628 UV original; /* To store the first code point of <p> */ 3629 3630 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING; 3631 3632 assert(UTF8_IS_ABOVE_LATIN1(*p)); 3633 3634 /* We know immediately if the first character in the string crosses the 3635 * boundary, so can skip testing */ 3636 if (result > 255) { 3637 3638 /* Look at every character in the result; if any cross the 3639 * boundary, the whole thing is disallowed */ 3640 U8* s = ustrp + UTF8SKIP(ustrp); 3641 U8* e = ustrp + *lenp; 3642 while (s < e) { 3643 if (! UTF8_IS_ABOVE_LATIN1(*s)) { 3644 goto bad_crossing; 3645 } 3646 s += UTF8SKIP(s); 3647 } 3648 3649 /* Here, no characters crossed, result is ok as-is, but we warn. */ 3650 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p)); 3651 return result; 3652 } 3653 3654 bad_crossing: 3655 3656 /* Failed, have to return the original */ 3657 original = valid_utf8_to_uvchr(p, lenp); 3658 3659 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3660 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3661 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8" 3662 " locale; resolved to \"\\x{%" UVXf "}\".", 3663 OP_DESC(PL_op), 3664 original, 3665 original); 3666 Copy(p, ustrp, *lenp, char); 3667 return original; 3668 } 3669 3670 STATIC UV 3671 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e, 3672 U8 * ustrp, STRLEN *lenp) 3673 { 3674 /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from 3675 * p0..e-1 according to Turkic rules is the same as for non-Turkic. 3676 * Otherwise, it returns the first code point of the Turkic foldcased 3677 * sequence, and the entire sequence will be stored in *ustrp. ustrp will 3678 * contain *lenp bytes 3679 * 3680 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER 3681 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER 3682 * DOTLESS I */ 3683 3684 PERL_ARGS_ASSERT_TURKIC_FC; 3685 assert(e > p); 3686 3687 if (UNLIKELY(*p == 'I')) { 3688 *lenp = 2; 3689 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I); 3690 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I); 3691 return LATIN_SMALL_LETTER_DOTLESS_I; 3692 } 3693 3694 if (UNLIKELY(memBEGINs(p, e - p, 3695 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8))) 3696 { 3697 *lenp = 1; 3698 *ustrp = 'i'; 3699 return 'i'; 3700 } 3701 3702 return 0; 3703 } 3704 3705 STATIC UV 3706 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e, 3707 U8 * ustrp, STRLEN *lenp) 3708 { 3709 /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from 3710 * p0..e-1 according to Turkic rules is the same as for non-Turkic. 3711 * Otherwise, it returns the first code point of the Turkic lowercased 3712 * sequence, and the entire sequence will be stored in *ustrp. ustrp will 3713 * contain *lenp bytes */ 3714 3715 PERL_ARGS_ASSERT_TURKIC_LC; 3716 assert(e > p0); 3717 3718 /* A 'I' requires context as to what to do */ 3719 if (UNLIKELY(*p0 == 'I')) { 3720 const U8 * p = p0 + 1; 3721 3722 /* According to the Unicode SpecialCasing.txt file, a capital 'I' 3723 * modified by a dot above lowercases to 'i' even in turkic locales. */ 3724 while (p < e) { 3725 UV cp; 3726 3727 if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) { 3728 ustrp[0] = 'i'; 3729 *lenp = 1; 3730 return 'i'; 3731 } 3732 3733 /* For the dot above to modify the 'I', it must be part of a 3734 * combining sequence immediately following the 'I', and no other 3735 * modifier with a ccc of 230 may intervene */ 3736 cp = utf8_to_uvchr_buf(p, e, NULL); 3737 if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) { 3738 break; 3739 } 3740 3741 /* Here the combining sequence continues */ 3742 p += UTF8SKIP(p); 3743 } 3744 } 3745 3746 /* In all other cases the lc is the same as the fold */ 3747 return turkic_fc(p0, e, ustrp, lenp); 3748 } 3749 3750 STATIC UV 3751 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, 3752 U8 * ustrp, STRLEN *lenp) 3753 { 3754 /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence 3755 * from p0..e-1 according to Turkic rules is the same as for non-Turkic. 3756 * Otherwise, it returns the first code point of the Turkic upper or 3757 * title-cased sequence, and the entire sequence will be stored in *ustrp. 3758 * ustrp will contain *lenp bytes 3759 * 3760 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER 3761 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER 3762 * DOTLESS I */ 3763 3764 PERL_ARGS_ASSERT_TURKIC_UC; 3765 assert(e > p); 3766 3767 if (*p == 'i') { 3768 *lenp = 2; 3769 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 3770 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 3771 return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; 3772 } 3773 3774 if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) { 3775 *lenp = 1; 3776 *ustrp = 'I'; 3777 return 'I'; 3778 } 3779 3780 return 0; 3781 } 3782 3783 /* The process for changing the case is essentially the same for the four case 3784 * change types, except there are complications for folding. Otherwise the 3785 * difference is only which case to change to. To make sure that they all do 3786 * the same thing, the bodies of the functions are extracted out into the 3787 * following two macros. The functions are written with the same variable 3788 * names, and these are known and used inside these macros. It would be 3789 * better, of course, to have inline functions to do it, but since different 3790 * macros are called, depending on which case is being changed to, this is not 3791 * feasible in C (to khw's knowledge). Two macros are created so that the fold 3792 * function can start with the common start macro, then finish with its special 3793 * handling; while the other three cases can just use the common end macro. 3794 * 3795 * The algorithm is to use the proper (passed in) macro or function to change 3796 * the case for code points that are below 256. The macro is used if using 3797 * locale rules for the case change; the function if not. If the code point is 3798 * above 255, it is computed from the input UTF-8, and another macro is called 3799 * to do the conversion. If necessary, the output is converted to UTF-8. If 3800 * using a locale, we have to check that the change did not cross the 255/256 3801 * boundary, see check_locale_boundary_crossing() for further details. 3802 * 3803 * The macros are split with the correct case change for the below-256 case 3804 * stored into 'result', and in the middle of an else clause for the above-255 3805 * case. At that point in the 'else', 'result' is not the final result, but is 3806 * the input code point calculated from the UTF-8. The fold code needs to 3807 * realize all this and take it from there. 3808 * 3809 * To deal with Turkic locales, the function specified by the parameter 3810 * 'turkic' is called when appropriate. 3811 * 3812 * If you read the two macros as sequential, it's easier to understand what's 3813 * going on. */ 3814 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func, \ 3815 L1_func_extra_param, turkic) \ 3816 \ 3817 if (flags & (locale_flags)) { \ 3818 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 3819 if (IN_UTF8_CTYPE_LOCALE) { \ 3820 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) { \ 3821 UV ret = turkic(p, e, ustrp, lenp); \ 3822 if (ret) return ret; \ 3823 } \ 3824 \ 3825 /* Otherwise, treat a UTF-8 locale as not being in locale at \ 3826 * all */ \ 3827 flags &= ~(locale_flags); \ 3828 } \ 3829 } \ 3830 \ 3831 if (UTF8_IS_INVARIANT(*p)) { \ 3832 if (flags & (locale_flags)) { \ 3833 result = libc_change_function(*p); \ 3834 } \ 3835 else { \ 3836 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \ 3837 } \ 3838 } \ 3839 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \ 3840 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \ 3841 if (flags & (locale_flags)) { \ 3842 result = libc_change_function(c); \ 3843 } \ 3844 else { \ 3845 return L1_func(c, ustrp, lenp, L1_func_extra_param); \ 3846 } \ 3847 } \ 3848 else { /* malformed UTF-8 or ord above 255 */ \ 3849 STRLEN len_result; \ 3850 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \ 3851 if (len_result == (STRLEN) -1) { \ 3852 _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ ); \ 3853 } 3854 3855 #define CASE_CHANGE_BODY_END(locale_flags, change_macro) \ 3856 result = change_macro(result, p, ustrp, lenp); \ 3857 \ 3858 if (flags & (locale_flags)) { \ 3859 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \ 3860 } \ 3861 return result; \ 3862 } \ 3863 \ 3864 /* Here, used locale rules. Convert back to UTF-8 */ \ 3865 if (UTF8_IS_INVARIANT(result)) { \ 3866 *ustrp = (U8) result; \ 3867 *lenp = 1; \ 3868 } \ 3869 else { \ 3870 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \ 3871 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \ 3872 *lenp = 2; \ 3873 } \ 3874 \ 3875 return result; 3876 3877 /* Not currently externally documented, and subject to change: 3878 * <flags> is set iff the rules from the current underlying locale are to 3879 * be used. */ 3880 3881 UV 3882 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, 3883 const U8 *e, 3884 U8* ustrp, 3885 STRLEN *lenp, 3886 bool flags) 3887 { 3888 UV result; 3889 3890 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; 3891 3892 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */ 3893 /* 2nd char of uc(U+DF) is 'S' */ 3894 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S', 3895 turkic_uc); 3896 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE); 3897 } 3898 3899 /* Not currently externally documented, and subject to change: 3900 * <flags> is set iff the rules from the current underlying locale are to be 3901 * used. Since titlecase is not defined in POSIX, for other than a 3902 * UTF-8 locale, uppercase is used instead for code points < 256. 3903 */ 3904 3905 UV 3906 Perl__to_utf8_title_flags(pTHX_ const U8 *p, 3907 const U8 *e, 3908 U8* ustrp, 3909 STRLEN *lenp, 3910 bool flags) 3911 { 3912 UV result; 3913 3914 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; 3915 3916 /* 2nd char of ucfirst(U+DF) is 's' */ 3917 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's', 3918 turkic_uc); 3919 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE); 3920 } 3921 3922 /* Not currently externally documented, and subject to change: 3923 * <flags> is set iff the rules from the current underlying locale are to 3924 * be used. 3925 */ 3926 3927 UV 3928 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, 3929 const U8 *e, 3930 U8* ustrp, 3931 STRLEN *lenp, 3932 bool flags) 3933 { 3934 UV result; 3935 3936 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; 3937 3938 CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */, 3939 turkic_lc); 3940 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE) 3941 } 3942 3943 /* Not currently externally documented, and subject to change, 3944 * in <flags> 3945 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 3946 * locale are to be used. 3947 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used; 3948 * otherwise simple folds 3949 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are 3950 * prohibited 3951 */ 3952 3953 UV 3954 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, 3955 const U8 *e, 3956 U8* ustrp, 3957 STRLEN *lenp, 3958 U8 flags) 3959 { 3960 UV result; 3961 3962 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; 3963 3964 /* These are mutually exclusive */ 3965 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII))); 3966 3967 assert(p != ustrp); /* Otherwise overwrites */ 3968 3969 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1, 3970 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)), 3971 turkic_fc); 3972 3973 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL); 3974 3975 if (flags & FOLD_FLAGS_LOCALE) { 3976 3977 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 3978 # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 3979 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 3980 3981 /* Special case these two characters, as what normally gets 3982 * returned under locale doesn't work */ 3983 if (memBEGINs((char *) p, e - p, CAP_SHARP_S)) 3984 { 3985 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3986 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3987 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; " 3988 "resolved to \"\\x{17F}\\x{17F}\"."); 3989 goto return_long_s; 3990 } 3991 else 3992 #endif 3993 if (memBEGINs((char *) p, e - p, LONG_S_T)) 3994 { 3995 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3996 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3997 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; " 3998 "resolved to \"\\x{FB06}\"."); 3999 goto return_ligature_st; 4000 } 4001 4002 #if UNICODE_MAJOR_VERSION == 3 \ 4003 && UNICODE_DOT_VERSION == 0 \ 4004 && UNICODE_DOT_DOT_VERSION == 1 4005 # define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 4006 4007 /* And special case this on this Unicode version only, for the same 4008 * reaons the other two are special cased. They would cross the 4009 * 255/256 boundary which is forbidden under /l, and so the code 4010 * wouldn't catch that they are equivalent (which they are only in 4011 * this release) */ 4012 else if (memBEGINs((char *) p, e - p, DOTTED_I)) { 4013 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 4014 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 4015 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " 4016 "resolved to \"\\x{0131}\"."); 4017 goto return_dotless_i; 4018 } 4019 #endif 4020 4021 return check_locale_boundary_crossing(p, result, ustrp, lenp); 4022 } 4023 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { 4024 return result; 4025 } 4026 else { 4027 /* This is called when changing the case of a UTF-8-encoded 4028 * character above the ASCII range, and the result should not 4029 * contain an ASCII character. */ 4030 4031 UV original; /* To store the first code point of <p> */ 4032 4033 /* Look at every character in the result; if any cross the 4034 * boundary, the whole thing is disallowed */ 4035 U8* s = ustrp; 4036 U8* send = ustrp + *lenp; 4037 while (s < send) { 4038 if (isASCII(*s)) { 4039 /* Crossed, have to return the original */ 4040 original = valid_utf8_to_uvchr(p, lenp); 4041 4042 /* But in these instances, there is an alternative we can 4043 * return that is valid */ 4044 if (original == LATIN_SMALL_LETTER_SHARP_S 4045 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ 4046 || original == LATIN_CAPITAL_LETTER_SHARP_S 4047 #endif 4048 ) { 4049 goto return_long_s; 4050 } 4051 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { 4052 goto return_ligature_st; 4053 } 4054 #if UNICODE_MAJOR_VERSION == 3 \ 4055 && UNICODE_DOT_VERSION == 0 \ 4056 && UNICODE_DOT_DOT_VERSION == 1 4057 4058 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 4059 goto return_dotless_i; 4060 } 4061 #endif 4062 Copy(p, ustrp, *lenp, char); 4063 return original; 4064 } 4065 s += UTF8SKIP(s); 4066 } 4067 4068 /* Here, no characters crossed, result is ok as-is */ 4069 return result; 4070 } 4071 } 4072 4073 /* Here, used locale rules. Convert back to UTF-8 */ 4074 if (UTF8_IS_INVARIANT(result)) { 4075 *ustrp = (U8) result; 4076 *lenp = 1; 4077 } 4078 else { 4079 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); 4080 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); 4081 *lenp = 2; 4082 } 4083 4084 return result; 4085 4086 return_long_s: 4087 /* Certain folds to 'ss' are prohibited by the options, but they do allow 4088 * folds to a string of two of these characters. By returning this 4089 * instead, then, e.g., 4090 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") 4091 * works. */ 4092 4093 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8); 4094 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 4095 ustrp, *lenp, U8); 4096 return LATIN_SMALL_LETTER_LONG_S; 4097 4098 return_ligature_st: 4099 /* Two folds to 'st' are prohibited by the options; instead we pick one and 4100 * have the other one fold to it */ 4101 4102 *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8); 4103 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); 4104 return LATIN_SMALL_LIGATURE_ST; 4105 4106 #if UNICODE_MAJOR_VERSION == 3 \ 4107 && UNICODE_DOT_VERSION == 0 \ 4108 && UNICODE_DOT_DOT_VERSION == 1 4109 4110 return_dotless_i: 4111 *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8); 4112 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8); 4113 return LATIN_SMALL_LETTER_DOTLESS_I; 4114 4115 #endif 4116 4117 } 4118 4119 bool 4120 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) 4121 { 4122 /* May change: warns if surrogates, non-character code points, or 4123 * non-Unicode code points are in 's' which has length 'len' bytes. 4124 * Returns TRUE if none found; FALSE otherwise. The only other validity 4125 * check is to make sure that this won't exceed the string's length nor 4126 * overflow */ 4127 4128 const U8* const e = s + len; 4129 bool ok = TRUE; 4130 4131 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT; 4132 4133 while (s < e) { 4134 if (UTF8SKIP(s) > len) { 4135 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 4136 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print"); 4137 return FALSE; 4138 } 4139 if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) { 4140 if (UNLIKELY(UTF8_IS_SUPER(s, e))) { 4141 if ( ckWARN_d(WARN_NON_UNICODE) 4142 || UNLIKELY(0 < does_utf8_overflow(s, s + len, 4143 0 /* Don't consider overlongs */ 4144 ))) 4145 { 4146 /* A side effect of this function will be to warn */ 4147 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER); 4148 ok = FALSE; 4149 } 4150 } 4151 else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) { 4152 if (ckWARN_d(WARN_SURROGATE)) { 4153 /* This has a different warning than the one the called 4154 * function would output, so can't just call it, unlike we 4155 * do for the non-chars and above-unicodes */ 4156 UV uv = utf8_to_uvchr_buf(s, e, NULL); 4157 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 4158 "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", 4159 uv); 4160 ok = FALSE; 4161 } 4162 } 4163 else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e)) 4164 && (ckWARN_d(WARN_NONCHAR))) 4165 { 4166 /* A side effect of this function will be to warn */ 4167 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR); 4168 ok = FALSE; 4169 } 4170 } 4171 s += UTF8SKIP(s); 4172 } 4173 4174 return ok; 4175 } 4176 4177 /* 4178 =for apidoc pv_uni_display 4179 4180 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string 4181 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes 4182 long (if longer, the rest is truncated and C<"..."> will be appended). 4183 4184 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display 4185 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH> 4186 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">) 4187 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">). 4188 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both 4189 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on. 4190 4191 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a 4192 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set. 4193 4194 The pointer to the PV of the C<dsv> is returned. 4195 4196 See also L</sv_uni_display>. 4197 4198 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH 4199 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE 4200 =for apidoc Amnh||UNI_DISPLAY_ISPRINT 4201 =for apidoc Amnh||UNI_DISPLAY_QQ 4202 =for apidoc Amnh||UNI_DISPLAY_REGEX 4203 =cut 4204 */ 4205 char * 4206 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, 4207 UV flags) 4208 { 4209 int truncated = 0; 4210 const char *s, *e; 4211 4212 PERL_ARGS_ASSERT_PV_UNI_DISPLAY; 4213 4214 SvPVCLEAR(dsv); 4215 SvUTF8_off(dsv); 4216 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { 4217 UV u; 4218 bool ok = 0; 4219 4220 if (pvlim && SvCUR(dsv) >= pvlim) { 4221 truncated++; 4222 break; 4223 } 4224 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0); 4225 if (u < 256) { 4226 const U8 c = (U8) u; 4227 if (flags & UNI_DISPLAY_BACKSLASH) { 4228 if ( isMNEMONIC_CNTRL(c) 4229 && ( c != '\b' 4230 || (flags & UNI_DISPLAY_BACKSPACE))) 4231 { 4232 const char * mnemonic = cntrl_to_mnemonic(c); 4233 sv_catpvn(dsv, mnemonic, strlen(mnemonic)); 4234 ok = 1; 4235 } 4236 else if (c == '\\') { 4237 sv_catpvs(dsv, "\\\\"); 4238 ok = 1; 4239 } 4240 } 4241 /* isPRINT() is the locale-blind version. */ 4242 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { 4243 const char string = c; 4244 sv_catpvn(dsv, &string, 1); 4245 ok = 1; 4246 } 4247 } 4248 if (!ok) 4249 Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u); 4250 } 4251 if (truncated) 4252 sv_catpvs(dsv, "..."); 4253 4254 return SvPVX(dsv); 4255 } 4256 4257 /* 4258 =for apidoc sv_uni_display 4259 4260 Build to the scalar C<dsv> a displayable version of the scalar C<sv>, 4261 the displayable version being at most C<pvlim> bytes long 4262 (if longer, the rest is truncated and "..." will be appended). 4263 4264 The C<flags> argument is as in L</pv_uni_display>(). 4265 4266 The pointer to the PV of the C<dsv> is returned. 4267 4268 =cut 4269 */ 4270 char * 4271 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) 4272 { 4273 const char * const ptr = 4274 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv); 4275 4276 PERL_ARGS_ASSERT_SV_UNI_DISPLAY; 4277 4278 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr, 4279 SvCUR(ssv), pvlim, flags); 4280 } 4281 4282 /* 4283 =for apidoc foldEQ_utf8 4284 4285 Returns true if the leading portions of the strings C<s1> and C<s2> (either or 4286 both of which may be in UTF-8) are the same case-insensitively; false 4287 otherwise. How far into the strings to compare is determined by other input 4288 parameters. 4289 4290 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode; 4291 otherwise it is assumed to be in native 8-bit encoding. Correspondingly for 4292 C<u2> with respect to C<s2>. 4293 4294 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for 4295 fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach. 4296 The scan will not be considered to be a match unless the goal is reached, and 4297 scanning won't continue past that goal. Correspondingly for C<l2> with respect 4298 to C<s2>. 4299 4300 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that 4301 pointer is considered an end pointer to the position 1 byte past the maximum 4302 point in C<s1> beyond which scanning will not continue under any circumstances. 4303 (This routine assumes that UTF-8 encoded input strings are not malformed; 4304 malformed input can cause it to read past C<pe1>). This means that if both 4305 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match 4306 will never be successful because it can never 4307 get as far as its goal (and in fact is asserted against). Correspondingly for 4308 C<pe2> with respect to C<s2>. 4309 4310 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and 4311 C<l2> must be non-zero), and if both do, both have to be 4312 reached for a successful match. Also, if the fold of a character is multiple 4313 characters, all of them must be matched (see tr21 reference below for 4314 'folding'). 4315 4316 Upon a successful match, if C<pe1> is non-C<NULL>, 4317 it will be set to point to the beginning of the I<next> character of C<s1> 4318 beyond what was matched. Correspondingly for C<pe2> and C<s2>. 4319 4320 For case-insensitiveness, the "casefolding" of Unicode is used 4321 instead of upper/lowercasing both the characters, see 4322 L<https://www.unicode.org/reports/tr21/> (Case Mappings). 4323 4324 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII 4325 =for apidoc Cmnh||FOLDEQ_LOCALE 4326 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED 4327 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE 4328 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED 4329 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE 4330 4331 =cut */ 4332 4333 /* A flags parameter has been added which may change, and hence isn't 4334 * externally documented. Currently it is: 4335 * 0 for as-documented above 4336 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an 4337 ASCII one, to not match 4338 * FOLDEQ_LOCALE is set iff the rules from the current underlying 4339 * locale are to be used. 4340 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this 4341 * routine. This allows that step to be skipped. 4342 * Currently, this requires s1 to be encoded as UTF-8 4343 * (u1 must be true), which is asserted for. 4344 * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may 4345 * cross certain boundaries. Hence, the caller should 4346 * let this function do the folding instead of 4347 * pre-folding. This code contains an assertion to 4348 * that effect. However, if the caller knows what 4349 * it's doing, it can pass this flag to indicate that, 4350 * and the assertion is skipped. 4351 * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies 4352 * to s2, and s2 doesn't have to be UTF-8 encoded. 4353 * This introduces an asymmetry to save a few branches 4354 * in a loop. Currently, this is not a problem, as 4355 * never are both inputs pre-folded. Simply call this 4356 * function with the pre-folded one as the second 4357 * string. 4358 * FOLDEQ_S2_FOLDS_SANE 4359 */ 4360 4361 I32 4362 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, 4363 const char *s2, char **pe2, UV l2, bool u2, 4364 U32 flags) 4365 { 4366 const U8 *p1 = (const U8*)s1; /* Point to current char */ 4367 const U8 *p2 = (const U8*)s2; 4368 const U8 *g1 = NULL; /* goal for s1 */ 4369 const U8 *g2 = NULL; 4370 const U8 *e1 = NULL; /* Don't scan s1 past this */ 4371 U8 *f1 = NULL; /* Point to current folded */ 4372 const U8 *e2 = NULL; 4373 U8 *f2 = NULL; 4374 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */ 4375 U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; 4376 U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; 4377 U8 flags_for_folder = FOLD_FLAGS_FULL; 4378 4379 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; 4380 4381 assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) 4382 && (( (flags & FOLDEQ_S1_ALREADY_FOLDED) 4383 && !(flags & FOLDEQ_S1_FOLDS_SANE)) 4384 || ( (flags & FOLDEQ_S2_ALREADY_FOLDED) 4385 && !(flags & FOLDEQ_S2_FOLDS_SANE))))); 4386 /* The algorithm is to trial the folds without regard to the flags on 4387 * the first line of the above assert(), and then see if the result 4388 * violates them. This means that the inputs can't be pre-folded to a 4389 * violating result, hence the assert. This could be changed, with the 4390 * addition of extra tests here for the already-folded case, which would 4391 * slow it down. That cost is more than any possible gain for when these 4392 * flags are specified, as the flags indicate /il or /iaa matching which 4393 * is less common than /iu, and I (khw) also believe that real-world /il 4394 * and /iaa matches are most likely to involve code points 0-255, and this 4395 * function only under rare conditions gets called for 0-255. */ 4396 4397 if (flags & FOLDEQ_LOCALE) { 4398 if (IN_UTF8_CTYPE_LOCALE) { 4399 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) { 4400 flags_for_folder |= FOLD_FLAGS_LOCALE; 4401 } 4402 else { 4403 flags &= ~FOLDEQ_LOCALE; 4404 } 4405 } 4406 else { 4407 flags_for_folder |= FOLD_FLAGS_LOCALE; 4408 } 4409 } 4410 if (flags & FOLDEQ_UTF8_NOMIX_ASCII) { 4411 flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII; 4412 } 4413 4414 if (pe1) { 4415 e1 = *(U8**)pe1; 4416 } 4417 4418 if (l1) { 4419 g1 = (const U8*)s1 + l1; 4420 } 4421 4422 if (pe2) { 4423 e2 = *(U8**)pe2; 4424 } 4425 4426 if (l2) { 4427 g2 = (const U8*)s2 + l2; 4428 } 4429 4430 /* Must have at least one goal */ 4431 assert(g1 || g2); 4432 4433 if (g1) { 4434 4435 /* Will never match if goal is out-of-bounds */ 4436 assert(! e1 || e1 >= g1); 4437 4438 /* Here, there isn't an end pointer, or it is beyond the goal. We 4439 * only go as far as the goal */ 4440 e1 = g1; 4441 } 4442 else { 4443 assert(e1); /* Must have an end for looking at s1 */ 4444 } 4445 4446 /* Same for goal for s2 */ 4447 if (g2) { 4448 assert(! e2 || e2 >= g2); 4449 e2 = g2; 4450 } 4451 else { 4452 assert(e2); 4453 } 4454 4455 /* If both operands are already folded, we could just do a memEQ on the 4456 * whole strings at once, but it would be better if the caller realized 4457 * this and didn't even call us */ 4458 4459 /* Look through both strings, a character at a time */ 4460 while (p1 < e1 && p2 < e2) { 4461 4462 /* If at the beginning of a new character in s1, get its fold to use 4463 * and the length of the fold. */ 4464 if (n1 == 0) { 4465 if (flags & FOLDEQ_S1_ALREADY_FOLDED) { 4466 f1 = (U8 *) p1; 4467 assert(u1); 4468 n1 = UTF8SKIP(f1); 4469 } 4470 else { 4471 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) { 4472 4473 /* We have to forbid mixing ASCII with non-ASCII if the 4474 * flags so indicate. And, we can short circuit having to 4475 * call the general functions for this common ASCII case, 4476 * all of whose non-locale folds are also ASCII, and hence 4477 * UTF-8 invariants, so the UTF8ness of the strings is not 4478 * relevant. */ 4479 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) { 4480 return 0; 4481 } 4482 n1 = 1; 4483 *foldbuf1 = toFOLD(*p1); 4484 } 4485 else if (u1) { 4486 _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder); 4487 } 4488 else { /* Not UTF-8, get UTF-8 fold */ 4489 _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder); 4490 } 4491 f1 = foldbuf1; 4492 } 4493 } 4494 4495 if (n2 == 0) { /* Same for s2 */ 4496 if (flags & FOLDEQ_S2_ALREADY_FOLDED) { 4497 4498 /* Point to the already-folded character. But for non-UTF-8 4499 * variants, convert to UTF-8 for the algorithm below */ 4500 if (UTF8_IS_INVARIANT(*p2)) { 4501 f2 = (U8 *) p2; 4502 n2 = 1; 4503 } 4504 else if (u2) { 4505 f2 = (U8 *) p2; 4506 n2 = UTF8SKIP(f2); 4507 } 4508 else { 4509 foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2); 4510 foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2); 4511 f2 = foldbuf2; 4512 n2 = 2; 4513 } 4514 } 4515 else { 4516 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) { 4517 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) { 4518 return 0; 4519 } 4520 n2 = 1; 4521 *foldbuf2 = toFOLD(*p2); 4522 } 4523 else if (u2) { 4524 _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder); 4525 } 4526 else { 4527 _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder); 4528 } 4529 f2 = foldbuf2; 4530 } 4531 } 4532 4533 /* Here f1 and f2 point to the beginning of the strings to compare. 4534 * These strings are the folds of the next character from each input 4535 * string, stored in UTF-8. */ 4536 4537 /* While there is more to look for in both folds, see if they 4538 * continue to match */ 4539 while (n1 && n2) { 4540 U8 fold_length = UTF8SKIP(f1); 4541 if (fold_length != UTF8SKIP(f2) 4542 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE 4543 function call for single 4544 byte */ 4545 || memNE((char*)f1, (char*)f2, fold_length)) 4546 { 4547 return 0; /* mismatch */ 4548 } 4549 4550 /* Here, they matched, advance past them */ 4551 n1 -= fold_length; 4552 f1 += fold_length; 4553 n2 -= fold_length; 4554 f2 += fold_length; 4555 } 4556 4557 /* When reach the end of any fold, advance the input past it */ 4558 if (n1 == 0) { 4559 p1 += u1 ? UTF8SKIP(p1) : 1; 4560 } 4561 if (n2 == 0) { 4562 p2 += u2 ? UTF8SKIP(p2) : 1; 4563 } 4564 } /* End of loop through both strings */ 4565 4566 /* A match is defined by each scan that specified an explicit length 4567 * reaching its final goal, and the other not having matched a partial 4568 * character (which can happen when the fold of a character is more than one 4569 * character). */ 4570 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) { 4571 return 0; 4572 } 4573 4574 /* Successful match. Set output pointers */ 4575 if (pe1) { 4576 *pe1 = (char*)p1; 4577 } 4578 if (pe2) { 4579 *pe2 = (char*)p2; 4580 } 4581 return 1; 4582 } 4583 4584 /* 4585 * ex: set ts=8 sts=4 sw=4 et: 4586 */ 4587