1 /* utf8.c 2 * 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever 13 * heard of that we don't want to see any closer; and that's the one place 14 * we're trying to get to! And that's just where we can't get, nohow.' 15 * 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"] 17 * 18 * 'Well do I understand your speech,' he answered in the same language; 19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue, 20 * as is the custom in the West, if you wish to be answered?' 21 * --Gandalf, addressing Théoden's door wardens 22 * 23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 24 * 25 * ...the travellers perceived that the floor was paved with stones of many 26 * hues; branching runes and strange devices intertwined beneath their feet. 27 * 28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_UTF8_C 33 #include "perl.h" 34 #include "invlist_inline.h" 35 36 static const char malformed_text[] = "Malformed UTF-8 character"; 37 static const char unees[] = 38 "Malformed UTF-8 character (unexpected end of string)"; 39 40 /* 41 These are various utility functions for manipulating UTF8-encoded 42 strings. For the uninitiated, this is a method of representing arbitrary 43 Unicode characters as a variable number of bytes, in such a way that 44 characters in the ASCII range are unmodified, and a zero byte never appears 45 within non-zero characters. 46 */ 47 48 void 49 Perl__force_out_malformed_utf8_message(pTHX_ 50 const U8 *const p, /* First byte in UTF-8 sequence */ 51 const U8 * const e, /* Final byte in sequence (may include 52 multiple chars */ 53 const U32 flags, /* Flags to pass to utf8n_to_uvchr(), 54 usually 0, or some DISALLOW flags */ 55 const bool die_here) /* If TRUE, this function does not return */ 56 { 57 /* This core-only function is to be called when a malformed UTF-8 character 58 * is found, in order to output the detailed information about the 59 * malformation before dieing. The reason it exists is for the occasions 60 * when such a malformation is fatal, but warnings might be turned off, so 61 * that normally they would not be actually output. This ensures that they 62 * do get output. Because a sequence may be malformed in more than one 63 * way, multiple messages may be generated, so we can't make them fatal, as 64 * that would cause the first one to die. 65 * 66 * Instead we pretend -W was passed to perl, then die afterwards. The 67 * flexibility is here to return to the caller so they can finish up and 68 * die themselves */ 69 U32 errors; 70 71 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE; 72 73 ENTER; 74 SAVEI8(PL_dowarn); 75 SAVESPTR(PL_curcop); 76 77 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; 78 if (PL_curcop) { 79 SAVECURCOPWARNINGS(); 80 PL_curcop->cop_warnings = pWARN_ALL; 81 } 82 83 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); 84 85 LEAVE; 86 87 if (! errors) { 88 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should" 89 " be called only when there are errors found"); 90 } 91 92 if (die_here) { 93 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 94 } 95 } 96 97 STATIC HV * 98 S_new_msg_hv(pTHX_ const char * const message, /* The message text */ 99 U32 categories, /* Packed warning categories */ 100 U32 flag) /* Flag associated with this message */ 101 { 102 /* Creates, populates, and returns an HV* that describes an error message 103 * for the translators between UTF8 and code point */ 104 105 SV* msg_sv = newSVpv(message, 0); 106 SV* category_sv = newSVuv(categories); 107 SV* flag_bit_sv = newSVuv(flag); 108 109 HV* msg_hv = newHV(); 110 111 PERL_ARGS_ASSERT_NEW_MSG_HV; 112 113 (void) hv_stores(msg_hv, "text", msg_sv); 114 (void) hv_stores(msg_hv, "warn_categories", category_sv); 115 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv); 116 117 return msg_hv; 118 } 119 120 /* 121 =for apidoc uvoffuni_to_utf8_flags 122 123 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 124 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or 125 L<perlapi/uvchr_to_utf8_flags>>. 126 127 This function is like them, but the input is a strict Unicode 128 (as opposed to native) code point. Only in very rare circumstances should code 129 not be using the native code point. 130 131 For details, see the description for L<perlapi/uvchr_to_utf8_flags>. 132 133 =cut 134 */ 135 136 U8 * 137 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) 138 { 139 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; 140 141 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL); 142 } 143 144 /* All these formats take a single UV code point argument */ 145 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf; 146 const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf 147 " is not recommended for open interchange"; 148 const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," 149 " may not be portable"; 150 151 /* Use shorter names internally in this file */ 152 #define SHIFT UTF_ACCUMULATION_SHIFT 153 #undef MARK 154 #define MARK UTF_CONTINUATION_MARK 155 #define MASK UTF_CONTINUATION_MASK 156 157 /* 158 =for apidoc uvchr_to_utf8_flags_msgs 159 160 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 161 162 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly. 163 164 This function is for code that wants any warning and/or error messages to be 165 returned to the caller rather than be displayed. All messages that would have 166 been displayed if all lexical warnings are enabled will be returned. 167 168 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter 169 placed after all the others, C<msgs>. If this parameter is 0, this function 170 behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should 171 be a pointer to an C<HV *> variable, in which this function creates a new HV to 172 contain any appropriate messages. The hash has three key-value pairs, as 173 follows: 174 175 =over 4 176 177 =item C<text> 178 179 The text of the message as a C<SVpv>. 180 181 =item C<warn_categories> 182 183 The warning category (or categories) packed into a C<SVuv>. 184 185 =item C<flag> 186 187 A single flag bit associated with this message, in a C<SVuv>. 188 The bit corresponds to some bit in the C<*errors> return value, 189 such as C<UNICODE_GOT_SURROGATE>. 190 191 =back 192 193 It's important to note that specifying this parameter as non-null will cause 194 any warnings this function would otherwise generate to be suppressed, and 195 instead be placed in C<*msgs>. The caller can check the lexical warnings state 196 (or not) when choosing what to do with the returned messages. 197 198 The caller, of course, is responsible for freeing any returned HV. 199 200 =cut 201 */ 202 203 /* Undocumented; we don't want people using this. Instead they should use 204 * uvchr_to_utf8_flags_msgs() */ 205 U8 * 206 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs) 207 { 208 U8 *p; 209 UV shifted_uv = input_uv; 210 STRLEN utf8_skip = OFFUNISKIP(input_uv); 211 212 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS; 213 214 if (msgs) { 215 *msgs = NULL; 216 } 217 218 switch (utf8_skip) { 219 case 1: 220 *d++ = LATIN1_TO_NATIVE(input_uv); 221 return d; 222 223 default: 224 if ( UNLIKELY(input_uv > MAX_LEGAL_CP 225 && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))) 226 { 227 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */ 228 NULL, 0, input_uv)); 229 } 230 231 if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) { 232 U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); 233 const char * format = PL_extended_cp_format; 234 if (msgs) { 235 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 236 category, 237 UNICODE_GOT_PERL_EXTENDED); 238 } 239 else { 240 Perl_ck_warner_d(aTHX_ category, format, input_uv); 241 } 242 243 /* Don't output a 2nd msg */ 244 flags &= ~UNICODE_WARN_SUPER; 245 } 246 247 if (flags & UNICODE_DISALLOW_PERL_EXTENDED) { 248 return NULL; 249 } 250 251 p = d + utf8_skip - 1; 252 while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) { 253 *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 254 shifted_uv >>= SHIFT; 255 } 256 257 /* FALLTHROUGH */ 258 259 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: 260 d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT] 261 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 262 shifted_uv >>= SHIFT; 263 /* FALLTHROUGH */ 264 265 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: 266 d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT] 267 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 268 shifted_uv >>= SHIFT; 269 /* FALLTHROUGH */ 270 271 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT: 272 if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) { 273 if (flags & UNICODE_WARN_SUPER) { 274 U32 category = packWARN(WARN_NON_UNICODE); 275 const char * format = super_cp_format; 276 277 if (msgs) { 278 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 279 category, 280 UNICODE_GOT_SUPER); 281 } 282 else { 283 Perl_ck_warner_d(aTHX_ category, format, input_uv); 284 } 285 286 if (flags & UNICODE_DISALLOW_SUPER) { 287 return NULL; 288 } 289 } 290 if ( (flags & UNICODE_DISALLOW_SUPER) 291 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED) 292 && UNICODE_IS_PERL_EXTENDED(input_uv))) 293 { 294 return NULL; 295 } 296 } 297 298 d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT] 299 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 300 shifted_uv >>= SHIFT; 301 /* FALLTHROUGH */ 302 303 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT: 304 if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) { 305 if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) { 306 if (flags & UNICODE_WARN_NONCHAR) { 307 U32 category = packWARN(WARN_NONCHAR); 308 const char * format = nonchar_cp_format; 309 if (msgs) { 310 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 311 category, 312 UNICODE_GOT_NONCHAR); 313 } 314 else { 315 Perl_ck_warner_d(aTHX_ category, format, input_uv); 316 } 317 } 318 if (flags & UNICODE_DISALLOW_NONCHAR) { 319 return NULL; 320 } 321 } 322 else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) { 323 if (flags & UNICODE_WARN_SURROGATE) { 324 U32 category = packWARN(WARN_SURROGATE); 325 const char * format = surrogate_cp_format; 326 if (msgs) { 327 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 328 category, 329 UNICODE_GOT_SURROGATE); 330 } 331 else { 332 Perl_ck_warner_d(aTHX_ category, format, input_uv); 333 } 334 } 335 if (flags & UNICODE_DISALLOW_SURROGATE) { 336 return NULL; 337 } 338 } 339 } 340 341 d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT] 342 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 343 shifted_uv >>= SHIFT; 344 /* FALLTHROUGH */ 345 346 #ifdef EBCDIC 347 348 case 3: 349 d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 350 shifted_uv >>= SHIFT; 351 /* FALLTHROUGH */ 352 353 #endif 354 355 /* FALLTHROUGH */ 356 case 2: 357 d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 358 shifted_uv >>= SHIFT; 359 d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip)) 360 | UTF_START_MARK(utf8_skip)); 361 break; 362 } 363 364 return d + utf8_skip; 365 } 366 367 /* 368 =for apidoc uvchr_to_utf8 369 370 Adds the UTF-8 representation of the native code point C<uv> to the end 371 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to 372 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 373 the byte after the end of the new character. In other words, 374 375 d = uvchr_to_utf8(d, uv); 376 377 is the recommended wide native character-aware way of saying 378 379 *(d++) = uv; 380 381 This function accepts any code point from 0..C<IV_MAX> as input. 382 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. 383 384 It is possible to forbid or warn on non-Unicode code points, or those that may 385 be problematic by using L</uvchr_to_utf8_flags>. 386 387 =cut 388 */ 389 390 /* This is also a macro */ 391 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); 392 393 U8 * 394 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) 395 { 396 return uvchr_to_utf8(d, uv); 397 } 398 399 /* 400 =for apidoc uvchr_to_utf8_flags 401 402 Adds the UTF-8 representation of the native code point C<uv> to the end 403 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to 404 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 405 the byte after the end of the new character. In other words, 406 407 d = uvchr_to_utf8_flags(d, uv, flags); 408 409 or, in most cases, 410 411 d = uvchr_to_utf8_flags(d, uv, 0); 412 413 This is the Unicode-aware way of saying 414 415 *(d++) = uv; 416 417 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as 418 input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. 419 420 Specifying C<flags> can further restrict what is allowed and not warned on, as 421 follows: 422 423 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set, 424 the function will raise a warning, provided UTF8 warnings are enabled. If 425 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return 426 NULL. If both flags are set, the function will both warn and return NULL. 427 428 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags 429 affect how the function handles a Unicode non-character. 430 431 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags 432 affect the handling of code points that are above the Unicode maximum of 433 0x10FFFF. Languages other than Perl may not be able to accept files that 434 contain these. 435 436 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of 437 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all 438 three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the 439 allowed inputs to the strict UTF-8 traditionally defined by Unicode. 440 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and 441 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the 442 above-Unicode and surrogate flags, but not the non-character ones, as 443 defined in 444 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>. 445 See L<perlunicode/Noncharacter code points>. 446 447 Extremely high code points were never specified in any standard, and require an 448 extension to UTF-8 to express, which Perl does. It is likely that programs 449 written in something other than Perl would not be able to read files that 450 contain these; nor would Perl understand files written by something that uses a 451 different extension. For these reasons, there is a separate set of flags that 452 can warn and/or disallow these extremely high code points, even if other 453 above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED> 454 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see 455 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will 456 treat all above-Unicode code points, including these, as malformations. (Note 457 that the Unicode standard considers anything above 0x10FFFF to be illegal, but 458 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1)) 459 460 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is 461 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly, 462 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named 463 C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC 464 platforms,these flags can apply to code points that actually do fit in 31 bits. 465 The new names accurately describe the situation in all cases. 466 467 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT 468 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE 469 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 470 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR 471 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED 472 =for apidoc Amnh||UNICODE_DISALLOW_SUPER 473 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE 474 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT 475 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE 476 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE 477 =for apidoc Amnh||UNICODE_WARN_NONCHAR 478 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED 479 =for apidoc Amnh||UNICODE_WARN_SUPER 480 =for apidoc Amnh||UNICODE_WARN_SURROGATE 481 482 =cut 483 */ 484 485 /* This is also a macro */ 486 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); 487 488 U8 * 489 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 490 { 491 return uvchr_to_utf8_flags(d, uv, flags); 492 } 493 494 PERL_STATIC_INLINE int 495 S_is_utf8_overlong(const U8 * const s, const STRLEN len) 496 { 497 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 498 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if 499 * it isn't, and -1 if there isn't enough information to tell. This last 500 * return value can happen if the sequence is incomplete, missing some 501 * trailing bytes that would form a complete character. If there are 502 * enough bytes to make a definitive decision, this function does so. 503 * Usually 2 bytes are sufficient. 504 * 505 * Overlongs can occur whenever the number of continuation bytes changes. 506 * That means whenever the number of leading 1 bits in a start byte 507 * increases from the next lower start byte. That happens for start bytes 508 * C0, E0, F0, F8, FC, FE, and FF. 509 */ 510 511 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG; 512 513 /* Each platform has overlongs after the start bytes given above (expressed 514 * in I8 for EBCDIC). The values below were found by manually inspecting 515 * the UTF-8 patterns. See the tables in utf8.h and utfebcdic.h. */ 516 517 switch (NATIVE_UTF8_TO_I8(s[0])) { 518 default: 519 assert(UTF8_IS_START(s[0])); 520 return 0; 521 522 case 0xC0: 523 case 0xC1: 524 return 1; 525 526 #ifdef EBCDIC 527 528 case 0xC2: 529 case 0xC3: 530 case 0xC4: 531 case 0xE0: 532 return 1; 533 #else 534 case 0xE0: 535 return (len < 2) ? -1 : s[1] < 0xA0; 536 #endif 537 538 case 0xF0: 539 return (len < 2) 540 ? -1 541 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10; 542 case 0xF8: 543 return (len < 2) 544 ? -1 545 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08; 546 case 0xFC: 547 return (len < 2) 548 ? -1 549 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04; 550 case 0xFE: 551 return (len < 2) 552 ? -1 553 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02; 554 case 0xFF: 555 return isFF_overlong(s, len); 556 } 557 } 558 559 PERL_STATIC_INLINE int 560 S_isFF_overlong(const U8 * const s, const STRLEN len) 561 { 562 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 563 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if 564 * it isn't, and -1 if there isn't enough information to tell. This last 565 * return value can happen if the sequence is incomplete, missing some 566 * trailing bytes that would form a complete character. If there are 567 * enough bytes to make a definitive decision, this function does so. */ 568 569 PERL_ARGS_ASSERT_ISFF_OVERLONG; 570 571 #ifdef EBCDIC 572 /* This works on all three EBCDIC code pages traditionally supported by 573 * perl */ 574 # define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41" 575 #else 576 # define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80" 577 #endif 578 579 /* To be an FF overlong, all the available bytes must match */ 580 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX, 581 MIN(len, STRLENs(FF_OVERLONG_PREFIX))))) 582 { 583 return 0; 584 } 585 586 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must 587 * be there; what comes after them doesn't matter. See tables in utf8.h, 588 * utfebcdic.h. */ 589 if (len >= STRLENs(FF_OVERLONG_PREFIX)) { 590 return 1; 591 } 592 593 /* The missing bytes could cause the result to go one way or the other, so 594 * the result is indeterminate */ 595 return -1; 596 } 597 598 /* At some point we may want to allow core to use up to UV_MAX */ 599 600 #ifdef EBCDIC /* Actually is I8 */ 601 # if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */ 602 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA7" 603 /* UV_MAX "\xFF\xAF" */ 604 # else /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */ 605 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1" 606 /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */ 607 # endif 608 #else 609 # if defined(UV_IS_QUAD) 610 # define HIGHEST_REPRESENTABLE_UTF "\xFF\x80\x87" 611 /* UV_MAX "\xFF\x80" */ 612 # else 613 # define HIGHEST_REPRESENTABLE_UTF "\xFD" 614 /* UV_MAX "\xFE\x83" */ 615 # endif 616 #endif 617 618 PERL_STATIC_INLINE int 619 S_does_utf8_overflow(const U8 * const s, 620 const U8 * e, 621 const bool consider_overlongs) 622 { 623 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 624 * 'e' - 1 would overflow an IV on this platform; that is if it represents 625 * a code point larger than the highest representable code point. It 626 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't 627 * enough information to tell. This last return value can happen if the 628 * sequence is incomplete, missing some trailing bytes that would form a 629 * complete character. If there are enough bytes to make a definitive 630 * decision, this function does so. 631 * 632 * If 'consider_overlongs' is TRUE, the function checks for the possibility 633 * that the sequence is an overlong that doesn't overflow. Otherwise, it 634 * assumes the sequence is not an overlong. This can give different 635 * results only on ASCII 32-bit platforms. 636 * 637 * (For ASCII platforms, we could use memcmp() because we don't have to 638 * convert each byte to I8, but it's very rare input indeed that would 639 * approach overflow, so the loop below will likely only get executed once.) 640 * 641 */ 642 const STRLEN len = e - s; 643 const U8 *x; 644 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF; 645 int is_overlong = 0; 646 647 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; 648 649 for (x = s; x < e; x++, y++) { 650 651 /* 'y' is set up to not include the trailing bytes that are all the 652 * maximum possible continuation byte. So when we reach the end of 'y' 653 * (known to be NUL terminated), it is impossible for 'x' to contain 654 * bytes larger than those omitted bytes, and therefore 'x' can't 655 * overflow */ 656 if (*y == '\0') { 657 return 0; 658 } 659 660 /* If this byte is less than the corresponding highest non-overflowing 661 * UTF-8, the sequence doesn't overflow */ 662 if (NATIVE_UTF8_TO_I8(*x) < *y) { 663 return 0; 664 } 665 666 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) { 667 goto overflows_if_not_overlong; 668 } 669 } 670 671 /* Got to the end, and all bytes are the same. If the input is a whole 672 * character, it doesn't overflow. And if it is a partial character, 673 * there's not enough information to tell */ 674 return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1; 675 676 overflows_if_not_overlong: 677 678 /* Here, a well-formed sequence overflows. If we are assuming 679 * well-formedness, return that it overflows. */ 680 if (! consider_overlongs) { 681 return 1; 682 } 683 684 /* Here, it could be the overlong malformation, and might not actually 685 * overflow if you were to calculate it out. 686 * 687 * See if it actually is overlong */ 688 is_overlong = is_utf8_overlong(s, len); 689 690 /* If it isn't overlong, is well-formed, so overflows */ 691 if (is_overlong == 0) { 692 return 1; 693 } 694 695 /* Not long enough to determine */ 696 if (is_overlong < 0) { 697 return -1; 698 } 699 700 /* Here, it appears to overflow, but it is also overlong */ 701 702 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS 703 704 /* On many platforms, it is impossible for an overlong to overflow. For 705 * these, no further work is necessary: we can return immediately that this 706 * overlong that is an apparent overflow actually isn't 707 * 708 * To see why, note that a length_N sequence can represent as overlongs all 709 * the code points representable by shorter length sequences, but no 710 * higher. If it could represent a higher code point without being an 711 * overlong, we wouldn't have had to increase the sequence length! 712 * 713 * The highest possible start byte is FF; the next highest is FE. The 714 * highest code point representable as an overlong on the platform is thus 715 * the highest code point representable by a non-overlong sequence whose 716 * start byte is FE. If that value doesn't overflow the platform's word 717 * size, overlongs can't overflow. 718 * 719 * FE consists of 7 bytes total; the FE start byte contributes 0 bits of 720 * information (the high 7 bits, all ones, say that the sequence is 7 bytes 721 * long, and the bottom, zero, bit is s placeholder. That leaves the 6 722 * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each. 723 If that number of bits doesn't exceed the word size, it can't overflow. */ 724 725 return 0; 726 727 #else 728 729 /* In practice, only a 32-bit ASCII box gets here. The FE start byte can 730 * represent, as an overlong, the highest code point representable by an FD 731 * start byte, which is 5*6 continuation bytes of info plus one bit from 732 * the start byte, or 31 bits. That doesn't overflow. More explicitly: 733 * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1. 734 * 735 * That means only the FF start byte can have an overflowing overlong. */ 736 if (*s < 0xFF) { 737 return 0; 738 } 739 740 /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that 741 * evaluates to 2**31, so overflows an IV. For a UV it's 742 * \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */ 743 # define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82" 744 745 if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */ 746 return -1; 747 } 748 749 # define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0) 750 751 return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS)); 752 753 #endif 754 755 } 756 757 STRLEN 758 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags) 759 { 760 SSize_t len, full_len; 761 762 /* An internal helper function. 763 * 764 * On input: 765 * 's' is a string, which is known to be syntactically valid UTF-8 as far 766 * as (e - 1); e > s must hold. 767 * 'e' This function is allowed to look at any byte from 's'...'e-1', but 768 * nowhere else. The function has to cope as best it can if that 769 * sequence does not form a full character. 770 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags 771 * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns 772 * 0 if it determines the input will match something disallowed. 773 * On output: 774 * The return is the number of bytes required to represent the code point 775 * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the 776 * input is for a partial character, a successful return will be larger 777 * than 'e - s'. 778 * 779 * If *s..*(e-1) is only for a partial character, the function will return 780 * non-zero if there is any sequence of well-formed UTF-8 that, when 781 * appended to the input sequence, could result in an allowed code point; 782 * otherwise it returns 0. Non characters cannot be determined based on 783 * partial character input. But many of the other excluded types can be 784 * determined with just the first one or two bytes. 785 * 786 */ 787 788 PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_; 789 790 assert(e > s); 791 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 792 |UTF8_DISALLOW_PERL_EXTENDED))); 793 794 full_len = UTF8SKIP(s); 795 796 len = e - s; 797 if (len > full_len) { 798 e = s + full_len; 799 len = full_len; 800 } 801 802 switch (full_len) { 803 bool is_super; 804 805 default: /* Extended */ 806 if (flags & UTF8_DISALLOW_PERL_EXTENDED) { 807 return 0; 808 } 809 810 /* FALLTHROUGH */ 811 812 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */ 813 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */ 814 815 if (flags & UTF8_DISALLOW_SUPER) { 816 return 0; /* Above Unicode */ 817 } 818 819 return full_len; 820 821 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT: 822 is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_) 823 || ( len > 1 824 && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_ 825 && NATIVE_UTF8_TO_I8(s[1]) 826 >= UTF_FIRST_CONT_BYTE_110000_)); 827 if (is_super) { 828 if (flags & UTF8_DISALLOW_SUPER) { 829 return 0; 830 } 831 } 832 else if ( (flags & UTF8_DISALLOW_NONCHAR) 833 && len == full_len 834 && UNLIKELY(is_LARGER_NON_CHARS_utf8(s))) 835 { 836 return 0; 837 } 838 839 return full_len; 840 841 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT: 842 843 if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) { 844 return full_len; 845 } 846 847 if ( (flags & UTF8_DISALLOW_SURROGATE) 848 && UNLIKELY(is_SURROGATE_utf8(s))) 849 { 850 return 0; /* Surrogate */ 851 } 852 853 if ( (flags & UTF8_DISALLOW_NONCHAR) 854 && len == full_len 855 && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s))) 856 { 857 return 0; 858 } 859 860 return full_len; 861 862 /* The lower code points don't have any disallowable characters */ 863 #ifdef EBCDIC 864 case 3: 865 return full_len; 866 #endif 867 868 case 2: 869 case 1: 870 return full_len; 871 } 872 } 873 874 Size_t 875 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, 876 const bool require_partial) 877 { 878 /* This is called to determine if the UTF-8 sequence starting at s0 and 879 * continuing for up to one full character of bytes, but looking no further 880 * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native 881 * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being 882 * for the largest code points recognized by Perl, the ones that require 883 * the most UTF-8 bytes per character to represent (somewhat less than 884 * twice the size of the next longest kind). This sequence will only ever 885 * be Perl extended UTF-8. 886 * 887 * The routine returns 0 if the sequence is not fully valid, syntactically 888 * or semantically. That means it checks that everything following the 889 * start byte is a continuation byte, and that it doesn't overflow, nor is 890 * an overlong representation. 891 * 892 * If 'require_partial' is FALSE, the routine returns non-zero only if the 893 * input (as far as 'e-1') is a full character. The return is the count of 894 * the bytes in the character. 895 * 896 * If 'require_partial' is TRUE, the routine returns non-zero only if the 897 * input as far as 'e-1' is a partial, not full character, with no 898 * malformations found before position 'e'. The return is either just 899 * FALSE, or TRUE. */ 900 901 const U8 *s = s0 + 1; 902 const U8 *send = e; 903 904 PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_; 905 906 assert(s0 < e); 907 assert(*s0 == I8_TO_NATIVE_UTF8(0xFF)); 908 909 send = s + MIN(UTF8_MAXBYTES - 1, e - s); 910 while (s < send) { 911 if (! UTF8_IS_CONTINUATION(*s)) { 912 return 0; 913 } 914 915 s++; 916 } 917 918 if (0 < does_utf8_overflow(s0, e, 919 FALSE /* Don't consider_overlongs */ 920 )) { 921 return 0; 922 } 923 924 if (0 < isFF_overlong(s0, e - s0)) { 925 return 0; 926 } 927 928 /* Here, the character is valid as far as it got. Check if got a partial 929 * character */ 930 if (s - s0 < UTF8_MAXBYTES) { 931 return (require_partial) ? 1 : 0; 932 } 933 934 /* Here, got a full character */ 935 return (require_partial) ? 0 : UTF8_MAXBYTES; 936 } 937 938 const char * 939 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format) 940 { 941 /* Returns a mortalized C string that is a displayable copy of the 'len' 942 * bytes starting at 'start'. 'format' gives how to display each byte. 943 * Currently, there are only two formats, so it is currently a bool: 944 * 0 \xab 945 * 1 ab (that is a space between two hex digit bytes) 946 */ 947 948 if (start == NULL) { 949 return "(nil)"; 950 } 951 952 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a 953 trailing NUL */ 954 const U8 * s = start; 955 const U8 * const e = start + len; 956 char * output; 957 char * d; 958 959 PERL_ARGS_ASSERT__BYTE_DUMP_STRING; 960 961 Newx(output, output_len, char); 962 SAVEFREEPV(output); 963 964 d = output; 965 for (s = start; s < e; s++) { 966 const unsigned high_nibble = (*s & 0xF0) >> 4; 967 const unsigned low_nibble = (*s & 0x0F); 968 969 if (format) { 970 if (s > start) { 971 *d++ = ' '; 972 } 973 } 974 else { 975 *d++ = '\\'; 976 *d++ = 'x'; 977 } 978 979 if (high_nibble < 10) { 980 *d++ = high_nibble + '0'; 981 } 982 else { 983 *d++ = high_nibble - 10 + 'a'; 984 } 985 986 if (low_nibble < 10) { 987 *d++ = low_nibble + '0'; 988 } 989 else { 990 *d++ = low_nibble - 10 + 'a'; 991 } 992 } 993 994 *d = '\0'; 995 return output; 996 } 997 998 PERL_STATIC_INLINE char * 999 S_unexpected_non_continuation_text(pTHX_ const U8 * const s, 1000 1001 /* Max number of bytes to print */ 1002 STRLEN print_len, 1003 1004 /* Which one is the non-continuation */ 1005 const STRLEN non_cont_byte_pos, 1006 1007 /* How many bytes should there be? */ 1008 const STRLEN expect_len) 1009 { 1010 /* Return the malformation warning text for an unexpected continuation 1011 * byte. */ 1012 1013 const char * const where = (non_cont_byte_pos == 1) 1014 ? "immediately" 1015 : Perl_form(aTHX_ "%d bytes", 1016 (int) non_cont_byte_pos); 1017 const U8 * x = s + non_cont_byte_pos; 1018 const U8 * e = s + print_len; 1019 1020 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; 1021 1022 /* We don't need to pass this parameter, but since it has already been 1023 * calculated, it's likely faster to pass it; verify under DEBUGGING */ 1024 assert(expect_len == UTF8SKIP(s)); 1025 1026 /* As a defensive coding measure, don't output anything past a NUL. Such 1027 * bytes shouldn't be in the middle of a malformation, and could mark the 1028 * end of the allocated string, and what comes after is undefined */ 1029 for (; x < e; x++) { 1030 if (*x == '\0') { 1031 x++; /* Output this particular NUL */ 1032 break; 1033 } 1034 } 1035 1036 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," 1037 " %s after start byte 0x%02x; need %d bytes, got %d)", 1038 malformed_text, 1039 _byte_dump_string(s, x - s, 0), 1040 *(s + non_cont_byte_pos), 1041 where, 1042 *s, 1043 (int) expect_len, 1044 (int) non_cont_byte_pos); 1045 } 1046 1047 /* 1048 1049 =for apidoc utf8n_to_uvchr 1050 1051 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1052 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1053 directly. 1054 1055 Bottom level UTF-8 decode routine. 1056 Returns the native code point value of the first character in the string C<s>, 1057 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than 1058 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to 1059 the length, in bytes, of that character. 1060 1061 The value of C<flags> determines the behavior when C<s> does not point to a 1062 well-formed UTF-8 character. If C<flags> is 0, encountering a malformation 1063 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) 1064 is the next possible position in C<s> that could begin a non-malformed 1065 character. Also, if UTF-8 warnings haven't been lexically disabled, a warning 1066 is raised. Some UTF-8 input sequences may contain multiple malformations. 1067 This function tries to find every possible one in each call, so multiple 1068 warnings can be raised for the same sequence. 1069 1070 Various ALLOW flags can be set in C<flags> to allow (and not warn on) 1071 individual types of malformations, such as the sequence being overlong (that 1072 is, when there is a shorter sequence that can express the same code point; 1073 overlong sequences are expressly forbidden in the UTF-8 standard due to 1074 potential security issues). Another malformation example is the first byte of 1075 a character not being a legal first byte. See F<utf8.h> for the list of such 1076 flags. Even if allowed, this function generally returns the Unicode 1077 REPLACEMENT CHARACTER when it encounters a malformation. There are flags in 1078 F<utf8.h> to override this behavior for the overlong malformations, but don't 1079 do that except for very specialized purposes. 1080 1081 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other 1082 flags) malformation is found. If this flag is set, the routine assumes that 1083 the caller will raise a warning, and this function will silently just set 1084 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero. 1085 1086 Note that this API requires disambiguation between successful decoding a C<NUL> 1087 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as 1088 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may 1089 be set to 1. To disambiguate, upon a zero return, see if the first byte of 1090 C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an 1091 error. Or you can use C<L</utf8n_to_uvchr_error>>. 1092 1093 Certain code points are considered problematic. These are Unicode surrogates, 1094 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. 1095 By default these are considered regular code points, but certain situations 1096 warrant special handling for them, which can be specified using the C<flags> 1097 parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all 1098 three classes are treated as malformations and handled as such. The flags 1099 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and 1100 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to 1101 disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE> 1102 restricts the allowed inputs to the strict UTF-8 traditionally defined by 1103 Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness 1104 definition given by 1105 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>. 1106 The difference between traditional strictness and C9 strictness is that the 1107 latter does not forbid non-character code points. (They are still discouraged, 1108 however.) For more discussion see L<perlunicode/Noncharacter code points>. 1109 1110 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>, 1111 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>, 1112 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be 1113 raised for their respective categories, but otherwise the code points are 1114 considered valid (not malformations). To get a category to both be treated as 1115 a malformation and raise a warning, specify both the WARN and DISALLOW flags. 1116 (But note that warnings are not raised if lexically disabled nor if 1117 C<UTF8_CHECK_ONLY> is also specified.) 1118 1119 Extremely high code points were never specified in any standard, and require an 1120 extension to UTF-8 to express, which Perl does. It is likely that programs 1121 written in something other than Perl would not be able to read files that 1122 contain these; nor would Perl understand files written by something that uses a 1123 different extension. For these reasons, there is a separate set of flags that 1124 can warn and/or disallow these extremely high code points, even if other 1125 above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and 1126 C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see 1127 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all 1128 above-Unicode code points, including these, as malformations. 1129 (Note that the Unicode standard considers anything above 0x10FFFF to be 1130 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF 1131 (2**31 -1)) 1132 1133 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is 1134 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly, 1135 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named 1136 C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags 1137 can apply to code points that actually do fit in 31 bits. This happens on 1138 EBCDIC platforms, and sometimes when the L<overlong 1139 malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately 1140 describe the situation in all cases. 1141 1142 1143 All other code points corresponding to Unicode characters, including private 1144 use and those yet to be assigned, are never considered malformed and never 1145 warn. 1146 1147 =for apidoc Amnh||UTF8_CHECK_ONLY 1148 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1149 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE 1150 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE 1151 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR 1152 =for apidoc Amnh||UTF8_DISALLOW_SUPER 1153 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE 1154 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE 1155 =for apidoc Amnh||UTF8_WARN_SURROGATE 1156 =for apidoc Amnh||UTF8_WARN_NONCHAR 1157 =for apidoc Amnh||UTF8_WARN_SUPER 1158 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED 1159 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED 1160 1161 =cut 1162 1163 Also implemented as a macro in utf8.h 1164 */ 1165 1166 UV 1167 Perl_utf8n_to_uvchr(const U8 *s, 1168 STRLEN curlen, 1169 STRLEN *retlen, 1170 const U32 flags) 1171 { 1172 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; 1173 1174 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); 1175 } 1176 1177 /* 1178 1179 =for apidoc utf8n_to_uvchr_error 1180 1181 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1182 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1183 directly. 1184 1185 This function is for code that needs to know what the precise malformation(s) 1186 are when an error is found. If you also need to know the generated warning 1187 messages, use L</utf8n_to_uvchr_msgs>() instead. 1188 1189 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after 1190 all the others, C<errors>. If this parameter is 0, this function behaves 1191 identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer 1192 to a C<U32> variable, which this function sets to indicate any errors found. 1193 Upon return, if C<*errors> is 0, there were no errors found. Otherwise, 1194 C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some 1195 of these bits will be set if a malformation is found, even if the input 1196 C<flags> parameter indicates that the given malformation is allowed; those 1197 exceptions are noted: 1198 1199 =over 4 1200 1201 =item C<UTF8_GOT_PERL_EXTENDED> 1202 1203 The input sequence is not standard UTF-8, but a Perl extension. This bit is 1204 set only if the input C<flags> parameter contains either the 1205 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags. 1206 1207 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, 1208 and so some extension must be used to express them. Perl uses a natural 1209 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further 1210 extension to represent even higher ones, so that any code point that fits in a 1211 64-bit word can be represented. Text using these extensions is not likely to 1212 be portable to non-Perl code. We lump both of these extensions together and 1213 refer to them as Perl extended UTF-8. There exist other extensions that people 1214 have invented, incompatible with Perl's. 1215 1216 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing 1217 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower 1218 than on ASCII. Prior to that, code points 2**31 and higher were simply 1219 unrepresentable, and a different, incompatible method was used to represent 1220 code points between 2**30 and 2**31 - 1. 1221 1222 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if 1223 Perl extended UTF-8 is used. 1224 1225 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still 1226 may use for backward compatibility. That name is misleading, as this flag may 1227 be set when the code point actually does fit in 31 bits. This happens on 1228 EBCDIC platforms, and sometimes when the L<overlong 1229 malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately 1230 describes the situation in all cases. 1231 1232 =item C<UTF8_GOT_CONTINUATION> 1233 1234 The input sequence was malformed in that the first byte was a UTF-8 1235 continuation byte. 1236 1237 =item C<UTF8_GOT_EMPTY> 1238 1239 The input C<curlen> parameter was 0. 1240 1241 =item C<UTF8_GOT_LONG> 1242 1243 The input sequence was malformed in that there is some other sequence that 1244 evaluates to the same code point, but that sequence is shorter than this one. 1245 1246 Until Unicode 3.1, it was legal for programs to accept this malformation, but 1247 it was discovered that this created security issues. 1248 1249 =item C<UTF8_GOT_NONCHAR> 1250 1251 The code point represented by the input UTF-8 sequence is for a Unicode 1252 non-character code point. 1253 This bit is set only if the input C<flags> parameter contains either the 1254 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags. 1255 1256 =item C<UTF8_GOT_NON_CONTINUATION> 1257 1258 The input sequence was malformed in that a non-continuation type byte was found 1259 in a position where only a continuation type one should be. See also 1260 C<L</UTF8_GOT_SHORT>>. 1261 1262 =item C<UTF8_GOT_OVERFLOW> 1263 1264 The input sequence was malformed in that it is for a code point that is not 1265 representable in the number of bits available in an IV on the current platform. 1266 1267 =item C<UTF8_GOT_SHORT> 1268 1269 The input sequence was malformed in that C<curlen> is smaller than required for 1270 a complete sequence. In other words, the input is for a partial character 1271 sequence. 1272 1273 1274 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short 1275 sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always 1276 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete 1277 sequence was looked at. If no other flags are present, it means that the 1278 sequence was valid as far as it went. Depending on the application, this could 1279 mean one of three things: 1280 1281 =over 1282 1283 =item * 1284 1285 The C<curlen> length parameter passed in was too small, and the function was 1286 prevented from examining all the necessary bytes. 1287 1288 =item * 1289 1290 The buffer being looked at is based on reading data, and the data received so 1291 far stopped in the middle of a character, so that the next read will 1292 read the remainder of this character. (It is up to the caller to deal with the 1293 split bytes somehow.) 1294 1295 =item * 1296 1297 This is a real error, and the partial sequence is all we're going to get. 1298 1299 =back 1300 1301 =item C<UTF8_GOT_SUPER> 1302 1303 The input sequence was malformed in that it is for a non-Unicode code point; 1304 that is, one above the legal Unicode maximum. 1305 This bit is set only if the input C<flags> parameter contains either the 1306 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags. 1307 1308 =item C<UTF8_GOT_SURROGATE> 1309 1310 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate 1311 code point. 1312 This bit is set only if the input C<flags> parameter contains either the 1313 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags. 1314 1315 =back 1316 1317 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY> 1318 flag to suppress any warnings, and then examine the C<*errors> return. 1319 1320 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED 1321 =for apidoc Amnh||UTF8_GOT_CONTINUATION 1322 =for apidoc Amnh||UTF8_GOT_EMPTY 1323 =for apidoc Amnh||UTF8_GOT_LONG 1324 =for apidoc Amnh||UTF8_GOT_NONCHAR 1325 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION 1326 =for apidoc Amnh||UTF8_GOT_OVERFLOW 1327 =for apidoc Amnh||UTF8_GOT_SHORT 1328 =for apidoc Amnh||UTF8_GOT_SUPER 1329 =for apidoc Amnh||UTF8_GOT_SURROGATE 1330 1331 =cut 1332 1333 Also implemented as a macro in utf8.h 1334 */ 1335 1336 UV 1337 Perl_utf8n_to_uvchr_error(const U8 *s, 1338 STRLEN curlen, 1339 STRLEN *retlen, 1340 const U32 flags, 1341 U32 * errors) 1342 { 1343 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; 1344 1345 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL); 1346 } 1347 1348 /* 1349 1350 =for apidoc utf8n_to_uvchr_msgs 1351 1352 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1353 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1354 directly. 1355 1356 This function is for code that needs to know what the precise malformation(s) 1357 are when an error is found, and wants the corresponding warning and/or error 1358 messages to be returned to the caller rather than be displayed. All messages 1359 that would have been displayed if all lexical warnings are enabled will be 1360 returned. 1361 1362 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter 1363 placed after all the others, C<msgs>. If this parameter is 0, this function 1364 behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should 1365 be a pointer to an C<AV *> variable, in which this function creates a new AV to 1366 contain any appropriate messages. The elements of the array are ordered so 1367 that the first message that would have been displayed is in the 0th element, 1368 and so on. Each element is a hash with three key-value pairs, as follows: 1369 1370 =over 4 1371 1372 =item C<text> 1373 1374 The text of the message as a C<SVpv>. 1375 1376 =item C<warn_categories> 1377 1378 The warning category (or categories) packed into a C<SVuv>. 1379 1380 =item C<flag> 1381 1382 A single flag bit associated with this message, in a C<SVuv>. 1383 The bit corresponds to some bit in the C<*errors> return value, 1384 such as C<UTF8_GOT_LONG>. 1385 1386 =back 1387 1388 It's important to note that specifying this parameter as non-null will cause 1389 any warnings this function would otherwise generate to be suppressed, and 1390 instead be placed in C<*msgs>. The caller can check the lexical warnings state 1391 (or not) when choosing what to do with the returned messages. 1392 1393 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence 1394 no AV is created. 1395 1396 The caller, of course, is responsible for freeing any returned AV. 1397 1398 =cut 1399 */ 1400 1401 UV 1402 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, 1403 STRLEN curlen, 1404 STRLEN *retlen, 1405 const U32 flags, 1406 U32 * errors, 1407 AV ** msgs) 1408 { 1409 const U8 * const s0 = s; 1410 const U8 * send = s0 + curlen; 1411 U32 possible_problems; /* A bit is set here for each potential problem 1412 found as we go along */ 1413 UV uv; 1414 STRLEN expectlen; /* How long should this sequence be? */ 1415 STRLEN avail_len; /* When input is too short, gives what that is */ 1416 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this 1417 gets set and discarded */ 1418 1419 /* The below are used only if there is both an overlong malformation and a 1420 * too short one. Otherwise the first two are set to 's0' and 'send', and 1421 * the third not used at all */ 1422 U8 * adjusted_s0; 1423 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this 1424 routine; see [perl #130921] */ 1425 UV uv_so_far; 1426 dTHX; 1427 1428 PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; 1429 1430 /* Here, is one of: a) malformed; b) a problematic code point (surrogate, 1431 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul 1432 * syllables that the dfa doesn't properly handle. Quickly dispose of the 1433 * final case. */ 1434 1435 /* Each of the affected Hanguls starts with \xED */ 1436 1437 if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */ 1438 if (retlen) { 1439 *retlen = 3; 1440 } 1441 if (errors) { 1442 *errors = 0; 1443 } 1444 if (msgs) { 1445 *msgs = NULL; 1446 } 1447 1448 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) 1449 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) 1450 | (s0[2] & UTF_CONTINUATION_MASK); 1451 } 1452 1453 /* In conjunction with the exhaustive tests that can be enabled in 1454 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely 1455 * what it is intended to do, and that no flaws in it are masked by 1456 * dropping down and executing the code below 1457 assert(! isUTF8_CHAR(s0, send) 1458 || UTF8_IS_SURROGATE(s0, send) 1459 || UTF8_IS_SUPER(s0, send) 1460 || UTF8_IS_NONCHAR(s0,send)); 1461 */ 1462 1463 s = s0; 1464 possible_problems = 0; 1465 expectlen = 0; 1466 avail_len = 0; 1467 discard_errors = 0; 1468 adjusted_s0 = (U8 *) s0; 1469 uv_so_far = 0; 1470 1471 if (errors) { 1472 *errors = 0; 1473 } 1474 else { 1475 errors = &discard_errors; 1476 } 1477 1478 /* The order of malformation tests here is important. We should consume as 1479 * few bytes as possible in order to not skip any valid character. This is 1480 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also 1481 * https://unicode.org/reports/tr36 for more discussion as to why. For 1482 * example, once we've done a UTF8SKIP, we can tell the expected number of 1483 * bytes, and could fail right off the bat if the input parameters indicate 1484 * that there are too few available. But it could be that just that first 1485 * byte is garbled, and the intended character occupies fewer bytes. If we 1486 * blindly assumed that the first byte is correct, and skipped based on 1487 * that number, we could skip over a valid input character. So instead, we 1488 * always examine the sequence byte-by-byte. 1489 * 1490 * We also should not consume too few bytes, otherwise someone could inject 1491 * things. For example, an input could be deliberately designed to 1492 * overflow, and if this code bailed out immediately upon discovering that, 1493 * returning to the caller C<*retlen> pointing to the very next byte (one 1494 * which is actually part of the overflowing sequence), that could look 1495 * legitimate to the caller, which could discard the initial partial 1496 * sequence and process the rest, inappropriately. 1497 * 1498 * Some possible input sequences are malformed in more than one way. This 1499 * function goes to lengths to try to find all of them. This is necessary 1500 * for correctness, as the inputs may allow one malformation but not 1501 * another, and if we abandon searching for others after finding the 1502 * allowed one, we could allow in something that shouldn't have been. 1503 */ 1504 1505 if (UNLIKELY(curlen == 0)) { 1506 possible_problems |= UTF8_GOT_EMPTY; 1507 curlen = 0; 1508 uv = UNICODE_REPLACEMENT; 1509 goto ready_to_handle_errors; 1510 } 1511 1512 /* We now know we can examine the first byte of the input */ 1513 expectlen = UTF8SKIP(s); 1514 uv = *s; 1515 1516 /* A well-formed UTF-8 character, as the vast majority of calls to this 1517 * function will be for, has this expected length. For efficiency, set 1518 * things up here to return it. It will be overridden only in those rare 1519 * cases where a malformation is found */ 1520 if (retlen) { 1521 *retlen = expectlen; 1522 } 1523 1524 /* A continuation character can't start a valid sequence */ 1525 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { 1526 possible_problems |= UTF8_GOT_CONTINUATION; 1527 curlen = 1; 1528 uv = UNICODE_REPLACEMENT; 1529 goto ready_to_handle_errors; 1530 } 1531 1532 /* Here is not a continuation byte, nor an invariant. The only thing left 1533 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START 1534 * because it excludes start bytes like \xC0 that always lead to 1535 * overlongs.) */ 1536 1537 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits 1538 * that indicate the number of bytes in the character's whole UTF-8 1539 * sequence, leaving just the bits that are part of the value. */ 1540 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 1541 1542 /* Setup the loop end point, making sure to not look past the end of the 1543 * input string, and flag it as too short if the size isn't big enough. */ 1544 if (UNLIKELY(curlen < expectlen)) { 1545 possible_problems |= UTF8_GOT_SHORT; 1546 avail_len = curlen; 1547 } 1548 else { 1549 send = (U8*) s0 + expectlen; 1550 } 1551 1552 /* Now, loop through the remaining bytes in the character's sequence, 1553 * accumulating each into the working value as we go. */ 1554 for (s = s0 + 1; s < send; s++) { 1555 if (LIKELY(UTF8_IS_CONTINUATION(*s))) { 1556 uv = UTF8_ACCUMULATE(uv, *s); 1557 continue; 1558 } 1559 1560 /* Here, found a non-continuation before processing all expected bytes. 1561 * This byte indicates the beginning of a new character, so quit, even 1562 * if allowing this malformation. */ 1563 possible_problems |= UTF8_GOT_NON_CONTINUATION; 1564 break; 1565 } /* End of loop through the character's bytes */ 1566 1567 /* Save how many bytes were actually in the character */ 1568 curlen = s - s0; 1569 1570 /* Note that there are two types of too-short malformation. One is when 1571 * there is actual wrong data before the normal termination of the 1572 * sequence. The other is that the sequence wasn't complete before the end 1573 * of the data we are allowed to look at, based on the input 'curlen'. 1574 * This means that we were passed data for a partial character, but it is 1575 * valid as far as we saw. The other is definitely invalid. This 1576 * distinction could be important to a caller, so the two types are kept 1577 * separate. 1578 * 1579 * A convenience macro that matches either of the too-short conditions. */ 1580 # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) 1581 1582 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { 1583 uv_so_far = uv; 1584 uv = UNICODE_REPLACEMENT; 1585 } 1586 1587 /* Check for overflow. The algorithm requires us to not look past the end 1588 * of the current character, even if partial, so the upper limit is 's' */ 1589 if (UNLIKELY(0 < does_utf8_overflow(s0, s, 1590 1 /* Do consider overlongs */ 1591 ))) 1592 { 1593 possible_problems |= UTF8_GOT_OVERFLOW; 1594 uv = UNICODE_REPLACEMENT; 1595 } 1596 1597 /* Check for overlong. If no problems so far, 'uv' is the correct code 1598 * point value. Simply see if it is expressible in fewer bytes. Otherwise 1599 * we must look at the UTF-8 byte sequence itself to see if it is for an 1600 * overlong */ 1601 if ( ( LIKELY(! possible_problems) 1602 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) 1603 || ( UNLIKELY(possible_problems) 1604 && ( UNLIKELY(! UTF8_IS_START(*s0)) 1605 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) 1606 { 1607 possible_problems |= UTF8_GOT_LONG; 1608 1609 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT) 1610 1611 /* The calculation in the 'true' branch of this 'if' 1612 * below won't work if overflows, and isn't needed 1613 * anyway. Further below we handle all overflow 1614 * cases */ 1615 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) 1616 { 1617 UV min_uv = uv_so_far; 1618 STRLEN i; 1619 1620 /* Here, the input is both overlong and is missing some trailing 1621 * bytes. There is no single code point it could be for, but there 1622 * may be enough information present to determine if what we have 1623 * so far is for an unallowed code point, such as for a surrogate. 1624 * The code further below has the intelligence to determine this, 1625 * but just for non-overlong UTF-8 sequences. What we do here is 1626 * calculate the smallest code point the input could represent if 1627 * there were no too short malformation. Then we compute and save 1628 * the UTF-8 for that, which is what the code below looks at 1629 * instead of the raw input. It turns out that the smallest such 1630 * code point is all we need. */ 1631 for (i = curlen; i < expectlen; i++) { 1632 min_uv = UTF8_ACCUMULATE(min_uv, 1633 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE)); 1634 } 1635 1636 adjusted_s0 = temp_char_buf; 1637 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); 1638 } 1639 } 1640 1641 /* Here, we have found all the possible problems, except for when the input 1642 * is for a problematic code point not allowed by the input parameters. */ 1643 1644 /* uv is valid for overlongs */ 1645 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) 1646 && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) 1647 || ( UNLIKELY(possible_problems) 1648 1649 /* if overflow, we know without looking further 1650 * precisely which of the problematic types it is, 1651 * and we deal with those in the overflow handling 1652 * code */ 1653 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) 1654 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) 1655 || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) 1656 && ((flags & ( UTF8_DISALLOW_NONCHAR 1657 |UTF8_DISALLOW_SURROGATE 1658 |UTF8_DISALLOW_SUPER 1659 |UTF8_DISALLOW_PERL_EXTENDED 1660 |UTF8_WARN_NONCHAR 1661 |UTF8_WARN_SURROGATE 1662 |UTF8_WARN_SUPER 1663 |UTF8_WARN_PERL_EXTENDED)))) 1664 { 1665 /* If there were no malformations, or the only malformation is an 1666 * overlong, 'uv' is valid */ 1667 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) { 1668 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { 1669 possible_problems |= UTF8_GOT_SURROGATE; 1670 } 1671 else if (UNLIKELY(UNICODE_IS_SUPER(uv))) { 1672 possible_problems |= UTF8_GOT_SUPER; 1673 } 1674 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) { 1675 possible_problems |= UTF8_GOT_NONCHAR; 1676 } 1677 } 1678 else { /* Otherwise, need to look at the source UTF-8, possibly 1679 adjusted to be non-overlong */ 1680 1681 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0) 1682 > UTF_START_BYTE_110000_)) 1683 { 1684 possible_problems |= UTF8_GOT_SUPER; 1685 } 1686 else if (curlen > 1) { 1687 if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0) 1688 == UTF_START_BYTE_110000_ 1689 && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)) 1690 >= UTF_FIRST_CONT_BYTE_110000_)) 1691 { 1692 possible_problems |= UTF8_GOT_SUPER; 1693 } 1694 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) { 1695 possible_problems |= UTF8_GOT_SURROGATE; 1696 } 1697 } 1698 1699 /* We need a complete well-formed UTF-8 character to discern 1700 * non-characters, so can't look for them here */ 1701 } 1702 } 1703 1704 ready_to_handle_errors: 1705 1706 /* At this point: 1707 * curlen contains the number of bytes in the sequence that 1708 * this call should advance the input by. 1709 * avail_len gives the available number of bytes passed in, but 1710 * only if this is less than the expected number of 1711 * bytes, based on the code point's start byte. 1712 * possible_problems is 0 if there weren't any problems; otherwise a bit 1713 * is set in it for each potential problem found. 1714 * uv contains the code point the input sequence 1715 * represents; or if there is a problem that prevents 1716 * a well-defined value from being computed, it is 1717 * some substitute value, typically the REPLACEMENT 1718 * CHARACTER. 1719 * s0 points to the first byte of the character 1720 * s points to just after where we left off processing 1721 * the character 1722 * send points to just after where that character should 1723 * end, based on how many bytes the start byte tells 1724 * us should be in it, but no further than s0 + 1725 * avail_len 1726 */ 1727 1728 if (UNLIKELY(possible_problems)) { 1729 bool disallowed = FALSE; 1730 const U32 orig_problems = possible_problems; 1731 1732 if (msgs) { 1733 *msgs = NULL; 1734 } 1735 1736 while (possible_problems) { /* Handle each possible problem */ 1737 U32 pack_warn = 0; 1738 char * message = NULL; 1739 U32 this_flag_bit = 0; 1740 1741 /* Each 'if' clause handles one problem. They are ordered so that 1742 * the first ones' messages will be displayed before the later 1743 * ones; this is kinda in decreasing severity order. But the 1744 * overlong must come last, as it changes 'uv' looked at by the 1745 * others */ 1746 if (possible_problems & UTF8_GOT_OVERFLOW) { 1747 1748 /* Overflow means also got a super and are using Perl's 1749 * extended UTF-8, but we handle all three cases here */ 1750 possible_problems 1751 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED); 1752 *errors |= UTF8_GOT_OVERFLOW; 1753 1754 /* But the API says we flag all errors found */ 1755 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) { 1756 *errors |= UTF8_GOT_SUPER; 1757 } 1758 if (flags 1759 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED)) 1760 { 1761 *errors |= UTF8_GOT_PERL_EXTENDED; 1762 } 1763 1764 /* Disallow if any of the three categories say to */ 1765 if ( ! (flags & UTF8_ALLOW_OVERFLOW) 1766 || (flags & ( UTF8_DISALLOW_SUPER 1767 |UTF8_DISALLOW_PERL_EXTENDED))) 1768 { 1769 disallowed = TRUE; 1770 } 1771 1772 /* Likewise, warn if any say to */ 1773 if ( ! (flags & UTF8_ALLOW_OVERFLOW) 1774 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) 1775 { 1776 1777 /* The warnings code explicitly says it doesn't handle the 1778 * case of packWARN2 and two categories which have 1779 * parent-child relationship. Even if it works now to 1780 * raise the warning if either is enabled, it wouldn't 1781 * necessarily do so in the future. We output (only) the 1782 * most dire warning */ 1783 if (! (flags & UTF8_CHECK_ONLY)) { 1784 if (msgs || ckWARN_d(WARN_UTF8)) { 1785 pack_warn = packWARN(WARN_UTF8); 1786 } 1787 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) { 1788 pack_warn = packWARN(WARN_NON_UNICODE); 1789 } 1790 if (pack_warn) { 1791 message = Perl_form(aTHX_ "%s: %s (overflows)", 1792 malformed_text, 1793 _byte_dump_string(s0, curlen, 0)); 1794 this_flag_bit = UTF8_GOT_OVERFLOW; 1795 } 1796 } 1797 } 1798 } 1799 else if (possible_problems & UTF8_GOT_EMPTY) { 1800 possible_problems &= ~UTF8_GOT_EMPTY; 1801 *errors |= UTF8_GOT_EMPTY; 1802 1803 if (! (flags & UTF8_ALLOW_EMPTY)) { 1804 1805 /* This so-called malformation is now treated as a bug in 1806 * the caller. If you have nothing to decode, skip calling 1807 * this function */ 1808 assert(0); 1809 1810 disallowed = TRUE; 1811 if ( (msgs 1812 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1813 { 1814 pack_warn = packWARN(WARN_UTF8); 1815 message = Perl_form(aTHX_ "%s (empty string)", 1816 malformed_text); 1817 this_flag_bit = UTF8_GOT_EMPTY; 1818 } 1819 } 1820 } 1821 else if (possible_problems & UTF8_GOT_CONTINUATION) { 1822 possible_problems &= ~UTF8_GOT_CONTINUATION; 1823 *errors |= UTF8_GOT_CONTINUATION; 1824 1825 if (! (flags & UTF8_ALLOW_CONTINUATION)) { 1826 disallowed = TRUE; 1827 if (( msgs 1828 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1829 { 1830 pack_warn = packWARN(WARN_UTF8); 1831 message = Perl_form(aTHX_ 1832 "%s: %s (unexpected continuation byte 0x%02x," 1833 " with no preceding start byte)", 1834 malformed_text, 1835 _byte_dump_string(s0, 1, 0), *s0); 1836 this_flag_bit = UTF8_GOT_CONTINUATION; 1837 } 1838 } 1839 } 1840 else if (possible_problems & UTF8_GOT_SHORT) { 1841 possible_problems &= ~UTF8_GOT_SHORT; 1842 *errors |= UTF8_GOT_SHORT; 1843 1844 if (! (flags & UTF8_ALLOW_SHORT)) { 1845 disallowed = TRUE; 1846 if (( msgs 1847 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1848 { 1849 pack_warn = packWARN(WARN_UTF8); 1850 message = Perl_form(aTHX_ 1851 "%s: %s (too short; %d byte%s available, need %d)", 1852 malformed_text, 1853 _byte_dump_string(s0, send - s0, 0), 1854 (int)avail_len, 1855 avail_len == 1 ? "" : "s", 1856 (int)expectlen); 1857 this_flag_bit = UTF8_GOT_SHORT; 1858 } 1859 } 1860 1861 } 1862 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) { 1863 possible_problems &= ~UTF8_GOT_NON_CONTINUATION; 1864 *errors |= UTF8_GOT_NON_CONTINUATION; 1865 1866 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { 1867 disallowed = TRUE; 1868 if (( msgs 1869 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1870 { 1871 1872 /* If we don't know for sure that the input length is 1873 * valid, avoid as much as possible reading past the 1874 * end of the buffer */ 1875 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN) 1876 ? (int) (s - s0) 1877 : (int) (send - s0); 1878 pack_warn = packWARN(WARN_UTF8); 1879 message = Perl_form(aTHX_ "%s", 1880 unexpected_non_continuation_text(s0, 1881 printlen, 1882 s - s0, 1883 (int) expectlen)); 1884 this_flag_bit = UTF8_GOT_NON_CONTINUATION; 1885 } 1886 } 1887 } 1888 else if (possible_problems & UTF8_GOT_SURROGATE) { 1889 possible_problems &= ~UTF8_GOT_SURROGATE; 1890 1891 if (flags & UTF8_WARN_SURROGATE) { 1892 *errors |= UTF8_GOT_SURROGATE; 1893 1894 if ( ! (flags & UTF8_CHECK_ONLY) 1895 && (msgs || ckWARN_d(WARN_SURROGATE))) 1896 { 1897 pack_warn = packWARN(WARN_SURROGATE); 1898 1899 /* These are the only errors that can occur with a 1900 * surrogate when the 'uv' isn't valid */ 1901 if (orig_problems & UTF8_GOT_TOO_SHORT) { 1902 message = Perl_form(aTHX_ 1903 "UTF-16 surrogate (any UTF-8 sequence that" 1904 " starts with \"%s\" is for a surrogate)", 1905 _byte_dump_string(s0, curlen, 0)); 1906 } 1907 else { 1908 message = Perl_form(aTHX_ surrogate_cp_format, uv); 1909 } 1910 this_flag_bit = UTF8_GOT_SURROGATE; 1911 } 1912 } 1913 1914 if (flags & UTF8_DISALLOW_SURROGATE) { 1915 disallowed = TRUE; 1916 *errors |= UTF8_GOT_SURROGATE; 1917 } 1918 } 1919 else if (possible_problems & UTF8_GOT_SUPER) { 1920 possible_problems &= ~UTF8_GOT_SUPER; 1921 1922 if (flags & UTF8_WARN_SUPER) { 1923 *errors |= UTF8_GOT_SUPER; 1924 1925 if ( ! (flags & UTF8_CHECK_ONLY) 1926 && (msgs || ckWARN_d(WARN_NON_UNICODE))) 1927 { 1928 pack_warn = packWARN(WARN_NON_UNICODE); 1929 1930 if (orig_problems & UTF8_GOT_TOO_SHORT) { 1931 message = Perl_form(aTHX_ 1932 "Any UTF-8 sequence that starts with" 1933 " \"%s\" is for a non-Unicode code point," 1934 " may not be portable", 1935 _byte_dump_string(s0, curlen, 0)); 1936 } 1937 else { 1938 message = Perl_form(aTHX_ super_cp_format, uv); 1939 } 1940 this_flag_bit = UTF8_GOT_SUPER; 1941 } 1942 } 1943 1944 /* Test for Perl's extended UTF-8 after the regular SUPER ones, 1945 * and before possibly bailing out, so that the more dire 1946 * warning will override the regular one. */ 1947 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) { 1948 if ( ! (flags & UTF8_CHECK_ONLY) 1949 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) 1950 && (msgs || ( ckWARN_d(WARN_NON_UNICODE) 1951 || ckWARN(WARN_PORTABLE)))) 1952 { 1953 pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); 1954 1955 /* If it is an overlong that evaluates to a code point 1956 * that doesn't have to use the Perl extended UTF-8, it 1957 * still used it, and so we output a message that 1958 * doesn't refer to the code point. The same is true 1959 * if there was a SHORT malformation where the code 1960 * point is not valid. In that case, 'uv' will have 1961 * been set to the REPLACEMENT CHAR, and the message 1962 * below without the code point in it will be selected 1963 * */ 1964 if (UNICODE_IS_PERL_EXTENDED(uv)) { 1965 message = Perl_form(aTHX_ 1966 PL_extended_cp_format, uv); 1967 } 1968 else { 1969 message = Perl_form(aTHX_ 1970 "Any UTF-8 sequence that starts with" 1971 " \"%s\" is a Perl extension, and" 1972 " so is not portable", 1973 _byte_dump_string(s0, curlen, 0)); 1974 } 1975 this_flag_bit = UTF8_GOT_PERL_EXTENDED; 1976 } 1977 1978 if (flags & ( UTF8_WARN_PERL_EXTENDED 1979 |UTF8_DISALLOW_PERL_EXTENDED)) 1980 { 1981 *errors |= UTF8_GOT_PERL_EXTENDED; 1982 1983 if (flags & UTF8_DISALLOW_PERL_EXTENDED) { 1984 disallowed = TRUE; 1985 } 1986 } 1987 } 1988 1989 if (flags & UTF8_DISALLOW_SUPER) { 1990 *errors |= UTF8_GOT_SUPER; 1991 disallowed = TRUE; 1992 } 1993 } 1994 else if (possible_problems & UTF8_GOT_NONCHAR) { 1995 possible_problems &= ~UTF8_GOT_NONCHAR; 1996 1997 if (flags & UTF8_WARN_NONCHAR) { 1998 *errors |= UTF8_GOT_NONCHAR; 1999 2000 if ( ! (flags & UTF8_CHECK_ONLY) 2001 && (msgs || ckWARN_d(WARN_NONCHAR))) 2002 { 2003 /* The code above should have guaranteed that we don't 2004 * get here with errors other than overlong */ 2005 assert (! (orig_problems 2006 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR))); 2007 2008 pack_warn = packWARN(WARN_NONCHAR); 2009 message = Perl_form(aTHX_ nonchar_cp_format, uv); 2010 this_flag_bit = UTF8_GOT_NONCHAR; 2011 } 2012 } 2013 2014 if (flags & UTF8_DISALLOW_NONCHAR) { 2015 disallowed = TRUE; 2016 *errors |= UTF8_GOT_NONCHAR; 2017 } 2018 } 2019 else if (possible_problems & UTF8_GOT_LONG) { 2020 possible_problems &= ~UTF8_GOT_LONG; 2021 *errors |= UTF8_GOT_LONG; 2022 2023 if (flags & UTF8_ALLOW_LONG) { 2024 2025 /* We don't allow the actual overlong value, unless the 2026 * special extra bit is also set */ 2027 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE 2028 & ~UTF8_ALLOW_LONG))) 2029 { 2030 uv = UNICODE_REPLACEMENT; 2031 } 2032 } 2033 else { 2034 disallowed = TRUE; 2035 2036 if (( msgs 2037 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 2038 { 2039 pack_warn = packWARN(WARN_UTF8); 2040 2041 /* These error types cause 'uv' to be something that 2042 * isn't what was intended, so can't use it in the 2043 * message. The other error types either can't 2044 * generate an overlong, or else the 'uv' is valid */ 2045 if (orig_problems & 2046 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) 2047 { 2048 message = Perl_form(aTHX_ 2049 "%s: %s (any UTF-8 sequence that starts" 2050 " with \"%s\" is overlong which can and" 2051 " should be represented with a" 2052 " different, shorter sequence)", 2053 malformed_text, 2054 _byte_dump_string(s0, send - s0, 0), 2055 _byte_dump_string(s0, curlen, 0)); 2056 } 2057 else { 2058 U8 tmpbuf[UTF8_MAXBYTES+1]; 2059 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, 2060 uv, 0); 2061 /* Don't use U+ for non-Unicode code points, which 2062 * includes those in the Latin1 range */ 2063 const char * preface = ( UNICODE_IS_SUPER(uv) 2064 #ifdef EBCDIC 2065 || uv <= 0xFF 2066 #endif 2067 ) 2068 ? "0x" 2069 : "U+"; 2070 message = Perl_form(aTHX_ 2071 "%s: %s (overlong; instead use %s to represent" 2072 " %s%0*" UVXf ")", 2073 malformed_text, 2074 _byte_dump_string(s0, send - s0, 0), 2075 _byte_dump_string(tmpbuf, e - tmpbuf, 0), 2076 preface, 2077 ((uv < 256) ? 2 : 4), /* Field width of 2 for 2078 small code points */ 2079 UNI_TO_NATIVE(uv)); 2080 } 2081 this_flag_bit = UTF8_GOT_LONG; 2082 } 2083 } 2084 } /* End of looking through the possible flags */ 2085 2086 /* Display the message (if any) for the problem being handled in 2087 * this iteration of the loop */ 2088 if (message) { 2089 if (msgs) { 2090 assert(this_flag_bit); 2091 2092 if (*msgs == NULL) { 2093 *msgs = newAV(); 2094 } 2095 2096 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message, 2097 pack_warn, 2098 this_flag_bit))); 2099 } 2100 else if (PL_op) 2101 Perl_warner(aTHX_ pack_warn, "%s in %s", message, 2102 OP_DESC(PL_op)); 2103 else 2104 Perl_warner(aTHX_ pack_warn, "%s", message); 2105 } 2106 } /* End of 'while (possible_problems)' */ 2107 2108 /* Since there was a possible problem, the returned length may need to 2109 * be changed from the one stored at the beginning of this function. 2110 * Instead of trying to figure out if it has changed, just do it. */ 2111 if (retlen) { 2112 *retlen = curlen; 2113 } 2114 2115 if (disallowed) { 2116 if (flags & UTF8_CHECK_ONLY && retlen) { 2117 *retlen = ((STRLEN) -1); 2118 } 2119 return 0; 2120 } 2121 } 2122 2123 return UNI_TO_NATIVE(uv); 2124 } 2125 2126 /* 2127 =for apidoc utf8_to_uvchr_buf 2128 2129 Returns the native code point of the first character in the string C<s> which 2130 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>. 2131 C<*retlen> will be set to the length, in bytes, of that character. 2132 2133 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are 2134 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't 2135 C<NULL>) to -1. If those warnings are off, the computed value, if well-defined 2136 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and 2137 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is 2138 the next possible position in C<s> that could begin a non-malformed character. 2139 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is 2140 returned. 2141 2142 =cut 2143 2144 Also implemented as a macro in utf8.h 2145 2146 */ 2147 2148 2149 UV 2150 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 2151 { 2152 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; 2153 2154 return utf8_to_uvchr_buf_helper(s, send, retlen); 2155 } 2156 2157 /* 2158 =for apidoc utf8_length 2159 2160 Returns the number of characters in the sequence of UTF-8-encoded bytes starting 2161 at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the 2162 same place, it returns 0 with no warning raised. 2163 2164 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning 2165 and returns the number of valid characters. 2166 2167 =cut 2168 2169 For long strings we process the input word-at-a-time, and count 2170 continuations, instead of otherwise counting characters and using UTF8SKIP 2171 to find the next one. If our input were 13-byte characters, the per-word 2172 would be a loser, as we would be doing things in 8 byte chunks (or 4 on a 2173 32-bit platform). But the maximum legal Unicode code point is 4 bytes, and 2174 most text will have a significant number of 1 and 2 byte characters, so the 2175 per-word is generally a winner. 2176 2177 There are start-up and finish costs with the per-word method, so we use the 2178 standard method unless the input has a relatively large length. 2179 */ 2180 2181 STRLEN 2182 Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e) 2183 { 2184 STRLEN continuations = 0; 2185 STRLEN len = 0; 2186 const U8 * s = s0; 2187 2188 PERL_ARGS_ASSERT_UTF8_LENGTH; 2189 2190 /* For EBCDCIC and short strings, we count the characters. The boundary 2191 * was determined by eyeballing the output of Porting/bench.pl and 2192 * choosing a number where the continuations method gave better results (on 2193 * a 64 bit system, khw not having access to a 32 bit system with 2194 * cachegrind). The number isn't critical, as at these sizes, the total 2195 * time spent isn't large either way */ 2196 2197 #ifndef EBCDIC 2198 2199 if (e - s0 < 96) 2200 2201 #endif 2202 2203 { 2204 while (s < e) { /* Count characters directly */ 2205 2206 /* Take extra care to not exceed 'e' (which would be undefined 2207 * behavior) should the input be malformed, with a partial 2208 * character at the end */ 2209 Ptrdiff_t expected_byte_count = UTF8SKIP(s); 2210 if (UNLIKELY(e - s < expected_byte_count)) { 2211 goto warn_and_return; 2212 } 2213 2214 len++; 2215 s += expected_byte_count; 2216 } 2217 2218 if (LIKELY(e == s)) { 2219 return len; 2220 } 2221 2222 warn_and_return: 2223 if (ckWARN_d(WARN_UTF8)) { 2224 if (PL_op) 2225 Perl_warner(aTHX_ packWARN(WARN_UTF8), 2226 "%s in %s", unees, OP_DESC(PL_op)); 2227 else 2228 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", unees); 2229 } 2230 2231 return s - s0; 2232 } 2233 2234 #ifndef EBCDIC 2235 2236 /* Count continuations, word-at-a-time. 2237 * 2238 * We need to stop before the final start character in order to 2239 * preserve the limited error checking that's always been done */ 2240 const U8 * e_limit = e - UTF8_MAXBYTES; 2241 2242 /* Points to the first byte >=s which is positioned at a word boundary. If 2243 * s is on a word boundary, it is s, otherwise it is to the next word. */ 2244 const U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 2245 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK); 2246 2247 /* Process up to a full word boundary. */ 2248 while (s < partial_word_end) { 2249 const Size_t skip = UTF8SKIP(s); 2250 2251 continuations += skip - 1; 2252 s += skip; 2253 } 2254 2255 /* Adjust back down any overshoot */ 2256 continuations -= s - partial_word_end; 2257 s = partial_word_end; 2258 2259 do { /* Process per-word */ 2260 2261 /* The idea for counting continuation bytes came from 2262 * https://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html 2263 * One thing it does that this doesn't is to prefetch the buffer 2264 * __builtin_prefetch(&s[256], 0, 0); 2265 * 2266 * A continuation byte has the upper 2 bits be '10', and the rest 2267 * dont-cares. The VARIANTS mask zeroes out all but the upper bit of 2268 * each byte in the word. That gets shifted to the byte's lowest bit, 2269 * and 'anded' with the complement of the 2nd highest bit of the byte, 2270 * which has also been shifted to that position. Hence the bit in that 2271 * position will be 1 iff the upper bit is 1 and the next one is 0. We 2272 * then use the same integer multiplcation and shifting that are used 2273 * in variant_under_utf8_count() to count how many of those are set in 2274 * the word. */ 2275 2276 continuations += (((((* (const PERL_UINTMAX_T *) s) 2277 & PERL_VARIANTS_WORD_MASK) >> 7) 2278 & (((~ (* (const PERL_UINTMAX_T *) s))) >> 6)) 2279 * PERL_COUNT_MULTIPLIER) 2280 >> ((PERL_WORDSIZE - 1) * CHARBITS); 2281 s += PERL_WORDSIZE; 2282 } while (s + PERL_WORDSIZE <= e_limit); 2283 2284 /* Process remainder per-byte */ 2285 while (s < e) { 2286 if (UTF8_IS_CONTINUATION(*s)) { 2287 continuations++; 2288 s++; 2289 continue; 2290 } 2291 2292 /* Here is a starter byte. Use UTF8SKIP from now on */ 2293 do { 2294 Ptrdiff_t expected_byte_count = UTF8SKIP(s); 2295 if (UNLIKELY(e - s < expected_byte_count)) { 2296 break; 2297 } 2298 2299 continuations += expected_byte_count- 1; 2300 s += expected_byte_count; 2301 } while (s < e); 2302 2303 break; 2304 } 2305 2306 # endif 2307 2308 if (LIKELY(e == s)) { 2309 return s - s0 - continuations; 2310 } 2311 2312 /* Convert to characters */ 2313 s -= continuations; 2314 2315 goto warn_and_return; 2316 } 2317 2318 /* 2319 =for apidoc bytes_cmp_utf8 2320 2321 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the 2322 sequence of characters (stored as UTF-8) 2323 in C<u>, C<ulen>. Returns 0 if they are 2324 equal, -1 or -2 if the first string is less than the second string, +1 or +2 2325 if the first string is greater than the second string. 2326 2327 -1 or +1 is returned if the shorter string was identical to the start of the 2328 longer string. -2 or +2 is returned if 2329 there was a difference between characters 2330 within the strings. 2331 2332 =cut 2333 */ 2334 2335 int 2336 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) 2337 { 2338 const U8 *const bend = b + blen; 2339 const U8 *const uend = u + ulen; 2340 2341 PERL_ARGS_ASSERT_BYTES_CMP_UTF8; 2342 2343 while (b < bend && u < uend) { 2344 U8 c = *u++; 2345 if (!UTF8_IS_INVARIANT(c)) { 2346 if (UTF8_IS_DOWNGRADEABLE_START(c)) { 2347 if (u < uend) { 2348 U8 c1 = *u++; 2349 if (UTF8_IS_CONTINUATION(c1)) { 2350 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1); 2351 } else { 2352 /* diag_listed_as: Malformed UTF-8 character%s */ 2353 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 2354 "%s %s%s", 2355 unexpected_non_continuation_text(u - 2, 2, 1, 2), 2356 PL_op ? " in " : "", 2357 PL_op ? OP_DESC(PL_op) : ""); 2358 return -2; 2359 } 2360 } else { 2361 if (PL_op) 2362 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 2363 "%s in %s", unees, OP_DESC(PL_op)); 2364 else 2365 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); 2366 return -2; /* Really want to return undef :-) */ 2367 } 2368 } else { 2369 return -2; 2370 } 2371 } 2372 if (*b != c) { 2373 return *b < c ? -2 : +2; 2374 } 2375 ++b; 2376 } 2377 2378 if (b == bend && u == uend) 2379 return 0; 2380 2381 return b < bend ? +1 : -1; 2382 } 2383 2384 /* 2385 =for apidoc utf8_to_bytes 2386 2387 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding. 2388 Unlike L</bytes_to_utf8>, this over-writes the original string, and 2389 updates C<*lenp> to contain the new length. 2390 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1. 2391 2392 Upon successful return, the number of variants in the string can be computed by 2393 having saved the value of C<*lenp> before the call, and subtracting the 2394 after-call value of C<*lenp> from it. 2395 2396 If you need a copy of the string, see L</bytes_from_utf8>. 2397 2398 =cut 2399 */ 2400 2401 U8 * 2402 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) 2403 { 2404 U8 * first_variant; 2405 2406 PERL_ARGS_ASSERT_UTF8_TO_BYTES; 2407 PERL_UNUSED_CONTEXT; 2408 2409 /* This is a no-op if no variants at all in the input */ 2410 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) { 2411 return s; 2412 } 2413 2414 /* Nothing before 'first_variant' needs to be changed, so start the real 2415 * work there */ 2416 2417 U8 * const save = s; 2418 U8 * const send = s + *lenp; 2419 U8 * d; 2420 2421 #ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */ 2422 2423 /* There is some start-up/tear-down overhead with this, so no real gain 2424 * unless the string is long enough. The current value is just a 2425 * guess. */ 2426 if (*lenp > 5 * PERL_WORDSIZE) { 2427 2428 /* First, go through the string a word at-a-time to verify that it is 2429 * downgradable. If it contains any start byte besides C2 and C3, then 2430 * it isn't. */ 2431 2432 const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0; 2433 const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2; 2434 const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE; 2435 2436 /* Points to the first byte >=s which is positioned at a word boundary. 2437 * If s is on a word boundary, it is s, otherwise it is the first byte 2438 * of the next word. */ 2439 U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) 2440 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK); 2441 2442 /* Here there is at least a full word beyond the first word boundary. 2443 * Process up to that boundary. */ 2444 while (s < partial_word_end) { 2445 if (! UTF8_IS_INVARIANT(*s)) { 2446 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { 2447 *lenp = ((STRLEN) -1); 2448 return NULL; 2449 } 2450 s++; 2451 } 2452 s++; 2453 } 2454 2455 /* Adjust back down any overshoot */ 2456 s = partial_word_end; 2457 2458 /* Process per-word */ 2459 do { 2460 2461 PERL_UINTMAX_T C2_C3_start_bytes; 2462 2463 /* First find the bytes that are start bytes. ANDing with 2464 * C0C0...C0 causes any start byte to become C0; any other byte 2465 * becomes something else. Then XORing with C0 causes any start 2466 * byte to become 0; all other bytes non-zero. */ 2467 PERL_UINTMAX_T start_bytes 2468 = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask; 2469 2470 /* These shifts causes the most significant bit to be set to 1 for 2471 * any bytes in the word that aren't completely 0. Hence after 2472 * these, only the start bytes have 0 in their msb */ 2473 start_bytes |= start_bytes << 1; 2474 start_bytes |= start_bytes << 2; 2475 start_bytes |= start_bytes << 4; 2476 2477 /* When we complement, then AND with 8080...80, the start bytes 2478 * will have 1 in their msb, and all other bits are 0 */ 2479 start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK; 2480 2481 /* Now repeat the procedure, but look for bytes that match only 2482 * C2-C3. */ 2483 C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask) 2484 ^ C2_mask; 2485 C2_C3_start_bytes |= C2_C3_start_bytes << 1; 2486 C2_C3_start_bytes |= C2_C3_start_bytes << 2; 2487 C2_C3_start_bytes |= C2_C3_start_bytes << 4; 2488 C2_C3_start_bytes = ~ C2_C3_start_bytes 2489 & PERL_VARIANTS_WORD_MASK; 2490 2491 /* Here, start_bytes has a 1 in the msb of each byte that has a 2492 * start_byte; And 2493 * C2_C3_start_bytes has a 1 in the msb of each byte that has a 2494 * start_byte of C2 or C3 2495 * If they're not equal, there are start bytes that aren't C2 2496 * nor C3, hence this is not downgradable */ 2497 if (start_bytes != C2_C3_start_bytes) { 2498 *lenp = ((STRLEN) -1); 2499 return NULL; 2500 } 2501 2502 s += PERL_WORDSIZE; 2503 } while (s + PERL_WORDSIZE <= send); 2504 2505 /* If the final byte was a start byte, it means that the character 2506 * straddles two words, so back off one to start looking below at the 2507 * first byte of the character */ 2508 if (s > first_variant && UTF8_IS_START(*(s-1))) { 2509 s--; 2510 } 2511 } 2512 2513 #endif 2514 2515 /* Do the straggler bytes beyond the final word boundary (or all bytes 2516 * in the case of EBCDIC) */ 2517 while (s < send) { 2518 if (! UTF8_IS_INVARIANT(*s)) { 2519 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { 2520 *lenp = ((STRLEN) -1); 2521 return NULL; 2522 } 2523 s++; 2524 } 2525 s++; 2526 } 2527 2528 /* Here, we passed the tests above. For the EBCDIC case, everything 2529 * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC, 2530 * it means only that all start bytes were C2 or C3, hence any 2531 * well-formed sequences are downgradable. But we didn't test, for 2532 * example, that there weren't two C2's in a row. That means that in 2533 * the loop below, we have to be sure things are well-formed. Because 2534 * this is very very likely, and we don't care about having speedy 2535 * handling of malformed input, the loop proceeds as if well formed, 2536 * and should a malformed one come along, it undoes what it already has 2537 * done */ 2538 2539 d = s = first_variant; 2540 2541 while (s < send) { 2542 U8 * s1; 2543 2544 if (UVCHR_IS_INVARIANT(*s)) { 2545 *d++ = *s++; 2546 continue; 2547 } 2548 2549 /* Here it is two-byte encoded. */ 2550 if ( LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s)) 2551 && LIKELY(UTF8_IS_CONTINUATION((s[1])))) 2552 { 2553 U8 first_byte = *s++; 2554 *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s); 2555 s++; 2556 continue; 2557 } 2558 2559 /* Here, it is malformed. This shouldn't happen on EBCDIC, and on 2560 * ASCII platforms, we know that the only start bytes in the text 2561 * are C2 and C3, and the code above has made sure that it doesn't 2562 * end with a start byte. That means the only malformations that 2563 * are possible are a start byte without a continuation (either 2564 * followed by another start byte or an invariant) or an unexpected 2565 * continuation. 2566 * 2567 * We have to undo all we've done before, back down to the first 2568 * UTF-8 variant. Note that each 2-byte variant we've done so far 2569 * (converted to single byte) slides things to the left one byte, 2570 * and so we have bytes that haven't been written over. 2571 * 2572 * Here, 'd' points to the next position to overwrite, and 's' 2573 * points to the first invalid byte. That means 'd's contents 2574 * haven't been changed yet, nor has anything else beyond it in the 2575 * string. In restoring to the original contents, we don't need to 2576 * do anything past (d-1). 2577 * 2578 * In particular, the bytes from 'd' to 's' have not been changed. 2579 * This loop uses a new variable 's1' (to avoid confusing 'source' 2580 * and 'destination') set to 'd', and moves 's' and 's1' in lock 2581 * step back so that afterwards, 's1' points to the first changed 2582 * byte that will be the source for the first byte (or bytes) at 2583 * 's' that need to be changed back. Note that s1 can expand to 2584 * two bytes */ 2585 s1 = d; 2586 while (s >= d) { 2587 s--; 2588 if (! UVCHR_IS_INVARIANT(*s1)) { 2589 s--; 2590 } 2591 s1--; 2592 } 2593 2594 /* Do the changing back */ 2595 while (s1 >= first_variant) { 2596 if (UVCHR_IS_INVARIANT(*s1)) { 2597 *s-- = *s1--; 2598 } 2599 else { 2600 *s-- = UTF8_EIGHT_BIT_LO(*s1); 2601 *s-- = UTF8_EIGHT_BIT_HI(*s1); 2602 s1--; 2603 } 2604 } 2605 2606 *lenp = ((STRLEN) -1); 2607 return NULL; 2608 } 2609 2610 /* Success! */ 2611 *d = '\0'; 2612 *lenp = d - save; 2613 2614 return save; 2615 } 2616 2617 /* 2618 =for apidoc bytes_from_utf8 2619 2620 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native 2621 byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is 2622 actually encoded in UTF-8. 2623 2624 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of 2625 the input string. 2626 2627 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string 2628 not expressible in native byte encoding. In these cases, C<*is_utf8p> and 2629 C<*lenp> are unchanged, and the return value is the original C<s>. 2630 2631 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a 2632 newly created string containing a downgraded copy of C<s>, and whose length is 2633 returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The 2634 caller is responsible for arranging for the memory used by this string to get 2635 freed. 2636 2637 Upon successful return, the number of variants in the string can be computed by 2638 having saved the value of C<*lenp> before the call, and subtracting the 2639 after-call value of C<*lenp> from it. 2640 2641 =cut 2642 2643 There is a macro that avoids this function call, but this is retained for 2644 anyone who calls it with the Perl_ prefix */ 2645 2646 U8 * 2647 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) 2648 { 2649 PERL_ARGS_ASSERT_BYTES_FROM_UTF8; 2650 PERL_UNUSED_CONTEXT; 2651 2652 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL); 2653 } 2654 2655 /* 2656 =for apidoc bytes_from_utf8_loc 2657 2658 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer 2659 to where to store the location of the first character in C<"s"> that cannot be 2660 converted to non-UTF8. 2661 2662 If that parameter is C<NULL>, this function behaves identically to 2663 C<bytes_from_utf8>. 2664 2665 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to 2666 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>. 2667 2668 Otherwise, the function returns a newly created C<NUL>-terminated string 2669 containing the non-UTF8 equivalent of the convertible first portion of 2670 C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>. 2671 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value, 2672 and C<*first_non_downgradable> is set to C<NULL>. 2673 2674 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the 2675 first character in the original string that wasn't converted. C<*is_utf8p> is 2676 unchanged. Note that the new string may have length 0. 2677 2678 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and 2679 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and 2680 converts as many characters in it as possible stopping at the first one it 2681 finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is 2682 set to point to that. The function returns the portion that could be converted 2683 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length, 2684 not including the terminating C<NUL>. If the very first character in the 2685 original could not be converted, C<*lenp> will be 0, and the new string will 2686 contain just a single C<NUL>. If the entire input string was converted, 2687 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>. 2688 2689 Upon successful return, the number of variants in the converted portion of the 2690 string can be computed by having saved the value of C<*lenp> before the call, 2691 and subtracting the after-call value of C<*lenp> from it. 2692 2693 =cut 2694 2695 2696 */ 2697 2698 U8 * 2699 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted) 2700 { 2701 U8 *d; 2702 const U8 *original = s; 2703 U8 *converted_start; 2704 const U8 *send = s + *lenp; 2705 2706 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC; 2707 2708 if (! *is_utf8p) { 2709 if (first_unconverted) { 2710 *first_unconverted = NULL; 2711 } 2712 2713 return (U8 *) original; 2714 } 2715 2716 Newx(d, (*lenp) + 1, U8); 2717 2718 converted_start = d; 2719 while (s < send) { 2720 U8 c = *s++; 2721 if (! UTF8_IS_INVARIANT(c)) { 2722 2723 /* Then it is multi-byte encoded. If the code point is above 0xFF, 2724 * have to stop now */ 2725 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) { 2726 if (first_unconverted) { 2727 *first_unconverted = s - 1; 2728 goto finish_and_return; 2729 } 2730 else { 2731 Safefree(converted_start); 2732 return (U8 *) original; 2733 } 2734 } 2735 2736 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); 2737 s++; 2738 } 2739 *d++ = c; 2740 } 2741 2742 /* Here, converted the whole of the input */ 2743 *is_utf8p = FALSE; 2744 if (first_unconverted) { 2745 *first_unconverted = NULL; 2746 } 2747 2748 finish_and_return: 2749 *d = '\0'; 2750 *lenp = d - converted_start; 2751 2752 /* Trim unused space */ 2753 Renew(converted_start, *lenp + 1, U8); 2754 2755 return converted_start; 2756 } 2757 2758 /* 2759 =for apidoc bytes_to_utf8 2760 2761 Converts a string C<s> of length C<*lenp> bytes from the native encoding into 2762 UTF-8. 2763 Returns a pointer to the newly-created string, and sets C<*lenp> to 2764 reflect the new length in bytes. The caller is responsible for arranging for 2765 the memory used by this string to get freed. 2766 2767 Upon successful return, the number of variants in the string can be computed by 2768 having saved the value of C<*lenp> before the call, and subtracting it from the 2769 after-call value of C<*lenp>. 2770 2771 A C<NUL> character will be written after the end of the string. 2772 2773 If you want to convert to UTF-8 from encodings other than 2774 the native (Latin1 or EBCDIC), 2775 see L</sv_recode_to_utf8>(). 2776 2777 =cut 2778 */ 2779 2780 U8* 2781 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) 2782 { 2783 const U8 * const send = s + (*lenp); 2784 U8 *d; 2785 U8 *dst; 2786 2787 PERL_ARGS_ASSERT_BYTES_TO_UTF8; 2788 PERL_UNUSED_CONTEXT; 2789 2790 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ 2791 Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8); 2792 dst = d; 2793 2794 while (s < send) { 2795 append_utf8_from_native_byte(*s, &d); 2796 s++; 2797 } 2798 2799 *d = '\0'; 2800 *lenp = d-dst; 2801 2802 return dst; 2803 } 2804 2805 /* 2806 * Convert native UTF-16 to UTF-8. Called via the more public functions 2807 * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for 2808 * little-endian, 2809 * 2810 * 'p' is the UTF-16 input string, passed as a pointer to U8. 2811 * 'bytelen' is its length (must be even) 2812 * 'd' is the pointer to the destination buffer. The caller must ensure that 2813 * the space is large enough. The maximum expansion factor is 2 times 2814 * 'bytelen'. 1.5 if never going to run on an EBCDIC box. 2815 * '*newlen' will contain the number of bytes this function filled of 'd'. 2816 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE 2817 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE 2818 * 2819 * The expansion factor is because UTF-16 requires 2 bytes for every code point 2820 * below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code 2821 * point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for 2822 * every code point below 0x1000; otherwise 4-5 bytes. 2823 * 2824 * The worst case is where every code point is below U+10000, hence requiring 2 2825 * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8 2826 * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes. 2827 * 2828 * Do not use in-place. */ 2829 2830 U8* 2831 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen, 2832 const bool high_byte, /* Which of next two bytes is 2833 high order */ 2834 const bool low_byte) 2835 { 2836 U8* pend; 2837 U8* dstart = d; 2838 2839 PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE; 2840 2841 if (bytelen & 1) 2842 Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf, 2843 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen); 2844 pend = p + bytelen; 2845 2846 while (p < pend) { 2847 2848 /* Next 16 bits is what we want. (The bool is cast to U8 because on 2849 * platforms where a bool is implemented as a signed char, a compiler 2850 * warning may be generated) */ 2851 U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte]; 2852 p += 2; 2853 2854 /* If it's a surrogate, we find the uv that the surrogate pair encodes. 2855 * */ 2856 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { 2857 2858 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST 2859 #define LAST_HIGH_SURROGATE 0xDBFF 2860 #define FIRST_LOW_SURROGATE 0xDC00 2861 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST 2862 #define FIRST_IN_PLANE1 0x10000 2863 2864 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) { 2865 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 2866 } 2867 else { 2868 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte]; 2869 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE, 2870 LAST_LOW_SURROGATE))) 2871 { 2872 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 2873 } 2874 2875 p += 2; 2876 2877 /* Here uv is the high surrogate. Combine with low surrogate 2878 * just computed to form the actual U32 code point. 2879 * 2880 * From https://unicode.org/faq/utf_bom.html#utf16-4 */ 2881 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10) 2882 + low_surrogate - FIRST_LOW_SURROGATE; 2883 } 2884 } 2885 2886 /* Here, 'uv' is the real U32 we want to find the UTF-8 of */ 2887 d = uvchr_to_utf8(d, uv); 2888 } 2889 2890 *newlen = d - dstart; 2891 return d; 2892 } 2893 2894 U8* 2895 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) 2896 { 2897 PERL_ARGS_ASSERT_UTF16_TO_UTF8; 2898 2899 return utf16_to_utf8(p, d, bytelen, newlen); 2900 } 2901 2902 U8* 2903 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) 2904 { 2905 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; 2906 2907 return utf16_to_utf8_reversed(p, d, bytelen, newlen); 2908 } 2909 2910 /* 2911 * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for 2912 * big-endian and utf8_to_utf16_reversed() for little-endian, 2913 * 2914 * 's' is the UTF-8 input string, passed as a pointer to U8. 2915 * 'bytelen' is its length 2916 * 'd' is the pointer to the destination buffer, currently passed as U8 *. The 2917 * caller must ensure that the space is large enough. The maximum 2918 * expansion factor is 2 times 'bytelen'. This happens when the input is 2919 * entirely single-byte ASCII, expanding to two-byte UTF-16. 2920 * '*newlen' will contain the number of bytes this function filled of 'd'. 2921 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE 2922 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE 2923 * 2924 * Do not use in-place. */ 2925 U8* 2926 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen, 2927 const bool high_byte, /* Which of next two bytes 2928 is high order */ 2929 const bool low_byte) 2930 { 2931 U8* send; 2932 U8* dstart = d; 2933 2934 PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE; 2935 2936 send = s + bytelen; 2937 2938 while (s < send) { 2939 STRLEN retlen; 2940 UV uv = utf8n_to_uvchr(s, send - s, &retlen, 2941 /* No surrogates nor above-Unicode */ 2942 UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); 2943 2944 /* The modern method is to keep going with malformed input, 2945 * substituting the REPLACEMENT CHARACTER */ 2946 if (UNLIKELY(uv == 0 && *s != '\0')) { 2947 uv = UNICODE_REPLACEMENT; 2948 } 2949 2950 if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */ 2951 2952 /* From https://unicode.org/faq/utf_bom.html#utf16-4 */ 2953 U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10) 2954 + FIRST_HIGH_SURROGATE; 2955 2956 /* (The bool is cast to U8 because on platforms where a bool is 2957 * implemented as a signed char, a compiler warning may be 2958 * generated) */ 2959 d[(U8) high_byte] = high_surrogate >> 8; 2960 d[(U8) low_byte] = high_surrogate & nBIT_MASK(8); 2961 d += 2; 2962 2963 /* The low surrogate is the lower 10 bits plus the offset */ 2964 uv &= nBIT_MASK(10); 2965 uv += FIRST_LOW_SURROGATE; 2966 2967 /* Drop down to output the low surrogate like it were a 2968 * non-surrogate */ 2969 } 2970 2971 d[(U8) high_byte] = uv >> 8; 2972 d[(U8) low_byte] = uv & nBIT_MASK(8); 2973 d += 2; 2974 2975 s += retlen; 2976 } 2977 2978 *newlen = d - dstart; 2979 return d; 2980 } 2981 2982 bool 2983 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) 2984 { 2985 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c); 2986 } 2987 2988 bool 2989 Perl__is_uni_perl_idcont(pTHX_ UV c) 2990 { 2991 return _invlist_contains_cp(PL_utf8_perl_idcont, c); 2992 } 2993 2994 bool 2995 Perl__is_uni_perl_idstart(pTHX_ UV c) 2996 { 2997 return _invlist_contains_cp(PL_utf8_perl_idstart, c); 2998 } 2999 3000 UV 3001 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, 3002 const char S_or_s) 3003 { 3004 /* We have the latin1-range values compiled into the core, so just use 3005 * those, converting the result to UTF-8. The only difference between upper 3006 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is 3007 * either "SS" or "Ss". Which one to use is passed into the routine in 3008 * 'S_or_s' to avoid a test */ 3009 3010 UV converted = toUPPER_LATIN1_MOD(c); 3011 3012 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1; 3013 3014 assert(S_or_s == 'S' || S_or_s == 's'); 3015 3016 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for 3017 characters in this range */ 3018 *p = (U8) converted; 3019 *lenp = 1; 3020 return converted; 3021 } 3022 3023 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers, 3024 * which it maps to one of them, so as to only have to have one check for 3025 * it in the main case */ 3026 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { 3027 switch (c) { 3028 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 3029 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; 3030 break; 3031 case MICRO_SIGN: 3032 converted = GREEK_CAPITAL_LETTER_MU; 3033 break; 3034 #if UNICODE_MAJOR_VERSION > 2 \ 3035 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \ 3036 && UNICODE_DOT_DOT_VERSION >= 8) 3037 case LATIN_SMALL_LETTER_SHARP_S: 3038 *(p)++ = 'S'; 3039 *p = S_or_s; 3040 *lenp = 2; 3041 return 'S'; 3042 #endif 3043 default: 3044 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect" 3045 " '%c' to map to '%c'", 3046 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); 3047 NOT_REACHED; /* NOTREACHED */ 3048 } 3049 } 3050 3051 *(p)++ = UTF8_TWO_BYTE_HI(converted); 3052 *p = UTF8_TWO_BYTE_LO(converted); 3053 *lenp = 2; 3054 3055 return converted; 3056 } 3057 3058 /* If compiled on an early Unicode version, there may not be auxiliary tables 3059 * */ 3060 #ifndef HAS_UC_AUX_TABLES 3061 # define UC_AUX_TABLE_ptrs NULL 3062 # define UC_AUX_TABLE_lengths NULL 3063 #endif 3064 #ifndef HAS_TC_AUX_TABLES 3065 # define TC_AUX_TABLE_ptrs NULL 3066 # define TC_AUX_TABLE_lengths NULL 3067 #endif 3068 #ifndef HAS_LC_AUX_TABLES 3069 # define LC_AUX_TABLE_ptrs NULL 3070 # define LC_AUX_TABLE_lengths NULL 3071 #endif 3072 #ifndef HAS_CF_AUX_TABLES 3073 # define CF_AUX_TABLE_ptrs NULL 3074 # define CF_AUX_TABLE_lengths NULL 3075 #endif 3076 3077 /* Call the function to convert a UTF-8 encoded character to the specified case. 3078 * Note that there may be more than one character in the result. 3079 * 's' is a pointer to the first byte of the input character 3080 * 'd' will be set to the first byte of the string of changed characters. It 3081 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes 3082 * 'lenp' will be set to the length in bytes of the string of changed characters 3083 * 3084 * The functions return the ordinal of the first character in the string of 3085 * 'd' */ 3086 #define CALL_UPPER_CASE(uv, s, d, lenp) \ 3087 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \ 3088 Uppercase_Mapping_invmap, \ 3089 UC_AUX_TABLE_ptrs, \ 3090 UC_AUX_TABLE_lengths, \ 3091 "uppercase") 3092 #define CALL_TITLE_CASE(uv, s, d, lenp) \ 3093 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \ 3094 Titlecase_Mapping_invmap, \ 3095 TC_AUX_TABLE_ptrs, \ 3096 TC_AUX_TABLE_lengths, \ 3097 "titlecase") 3098 #define CALL_LOWER_CASE(uv, s, d, lenp) \ 3099 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \ 3100 Lowercase_Mapping_invmap, \ 3101 LC_AUX_TABLE_ptrs, \ 3102 LC_AUX_TABLE_lengths, \ 3103 "lowercase") 3104 3105 3106 /* This additionally has the input parameter 'specials', which if non-zero will 3107 * cause this to use the specials hash for folding (meaning get full case 3108 * folding); otherwise, when zero, this implies a simple case fold */ 3109 #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \ 3110 (specials) \ 3111 ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \ 3112 Case_Folding_invmap, \ 3113 CF_AUX_TABLE_ptrs, \ 3114 CF_AUX_TABLE_lengths, \ 3115 "foldcase") \ 3116 : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \ 3117 Simple_Case_Folding_invmap, \ 3118 NULL, NULL, \ 3119 "foldcase") 3120 3121 UV 3122 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) 3123 { 3124 /* Convert the Unicode character whose ordinal is <c> to its uppercase 3125 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>. 3126 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since 3127 * the changed version may be longer than the original character. 3128 * 3129 * The ordinal of the first character of the changed version is returned 3130 * (but note, as explained above, that there may be more.) */ 3131 3132 PERL_ARGS_ASSERT_TO_UNI_UPPER; 3133 3134 if (c < 256) { 3135 return _to_upper_title_latin1((U8) c, p, lenp, 'S'); 3136 } 3137 3138 return CALL_UPPER_CASE(c, NULL, p, lenp); 3139 } 3140 3141 UV 3142 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) 3143 { 3144 PERL_ARGS_ASSERT_TO_UNI_TITLE; 3145 3146 if (c < 256) { 3147 return _to_upper_title_latin1((U8) c, p, lenp, 's'); 3148 } 3149 3150 return CALL_TITLE_CASE(c, NULL, p, lenp); 3151 } 3152 3153 STATIC U8 3154 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy) 3155 { 3156 /* We have the latin1-range values compiled into the core, so just use 3157 * those, converting the result to UTF-8. Since the result is always just 3158 * one character, we allow <p> to be NULL */ 3159 3160 U8 converted = toLOWER_LATIN1(c); 3161 3162 PERL_UNUSED_ARG(dummy); 3163 3164 if (p != NULL) { 3165 if (NATIVE_BYTE_IS_INVARIANT(converted)) { 3166 *p = converted; 3167 *lenp = 1; 3168 } 3169 else { 3170 /* Result is known to always be < 256, so can use the EIGHT_BIT 3171 * macros */ 3172 *p = UTF8_EIGHT_BIT_HI(converted); 3173 *(p+1) = UTF8_EIGHT_BIT_LO(converted); 3174 *lenp = 2; 3175 } 3176 } 3177 return converted; 3178 } 3179 3180 UV 3181 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) 3182 { 3183 PERL_ARGS_ASSERT_TO_UNI_LOWER; 3184 3185 if (c < 256) { 3186 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ ); 3187 } 3188 3189 return CALL_LOWER_CASE(c, NULL, p, lenp); 3190 } 3191 3192 UV 3193 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) 3194 { 3195 /* Corresponds to to_lower_latin1(); <flags> bits meanings: 3196 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 3197 * FOLD_FLAGS_FULL iff full folding is to be used; 3198 * 3199 * Not to be used for locale folds 3200 */ 3201 3202 UV converted; 3203 3204 PERL_ARGS_ASSERT__TO_FOLD_LATIN1; 3205 3206 assert (! (flags & FOLD_FLAGS_LOCALE)); 3207 3208 if (UNLIKELY(c == MICRO_SIGN)) { 3209 converted = GREEK_SMALL_LETTER_MU; 3210 } 3211 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 3212 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 3213 || UNICODE_DOT_DOT_VERSION > 0) 3214 else if ( (flags & FOLD_FLAGS_FULL) 3215 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S)) 3216 { 3217 /* If can't cross 127/128 boundary, can't return "ss"; instead return 3218 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") 3219 * under those circumstances. */ 3220 if (flags & FOLD_FLAGS_NOMIX_ASCII) { 3221 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8); 3222 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 3223 p, *lenp, U8); 3224 return LATIN_SMALL_LETTER_LONG_S; 3225 } 3226 else { 3227 *(p)++ = 's'; 3228 *p = 's'; 3229 *lenp = 2; 3230 return 's'; 3231 } 3232 } 3233 #endif 3234 else { /* In this range the fold of all other characters is their lower 3235 case */ 3236 converted = toLOWER_LATIN1(c); 3237 } 3238 3239 if (UVCHR_IS_INVARIANT(converted)) { 3240 *p = (U8) converted; 3241 *lenp = 1; 3242 } 3243 else { 3244 *(p)++ = UTF8_TWO_BYTE_HI(converted); 3245 *p = UTF8_TWO_BYTE_LO(converted); 3246 *lenp = 2; 3247 } 3248 3249 return converted; 3250 } 3251 3252 UV 3253 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) 3254 { 3255 3256 /* Not currently externally documented, and subject to change 3257 * <flags> bits meanings: 3258 * FOLD_FLAGS_FULL iff full folding is to be used; 3259 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 3260 * locale are to be used. 3261 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 3262 */ 3263 3264 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; 3265 3266 if (flags & FOLD_FLAGS_LOCALE) { 3267 /* Treat a non-Turkic UTF-8 locale as not being in locale at all, 3268 * except for potentially warning */ 3269 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; 3270 if (IN_UTF8_CTYPE_LOCALE && ! IN_UTF8_TURKIC_LOCALE) { 3271 flags &= ~FOLD_FLAGS_LOCALE; 3272 } 3273 else { 3274 goto needs_full_generality; 3275 } 3276 } 3277 3278 if (c < 256) { 3279 return _to_fold_latin1((U8) c, p, lenp, 3280 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); 3281 } 3282 3283 /* Here, above 255. If no special needs, just use the macro */ 3284 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { 3285 return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL); 3286 } 3287 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with 3288 the special flags. */ 3289 U8 utf8_c[UTF8_MAXBYTES + 1]; 3290 3291 needs_full_generality: 3292 uvchr_to_utf8(utf8_c, c); 3293 return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c), 3294 p, lenp, flags); 3295 } 3296 } 3297 3298 PERL_STATIC_INLINE bool 3299 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e, 3300 SV* const invlist) 3301 { 3302 /* returns a boolean giving whether or not the UTF8-encoded character that 3303 * starts at <p>, and extending no further than <e - 1> is in the inversion 3304 * list <invlist>. */ 3305 3306 UV cp = utf8n_to_uvchr(p, e - p, NULL, 0); 3307 3308 PERL_ARGS_ASSERT_IS_UTF8_COMMON; 3309 3310 if (cp == 0 && (p >= e || *p != '\0')) { 3311 _force_out_malformed_utf8_message(p, e, 0, 1); 3312 NOT_REACHED; /* NOTREACHED */ 3313 } 3314 3315 assert(invlist); 3316 return _invlist_contains_cp(invlist, cp); 3317 } 3318 3319 #if 0 /* Not currently used, but may be needed in the future */ 3320 PERLVAR(I, seen_deprecated_macro, HV *) 3321 3322 STATIC void 3323 S_warn_on_first_deprecated_use(pTHX_ U32 category, 3324 const char * const name, 3325 const char * const alternative, 3326 const bool use_locale, 3327 const char * const file, 3328 const unsigned line) 3329 { 3330 const char * key; 3331 3332 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE; 3333 3334 if (ckWARN_d(category)) { 3335 3336 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line); 3337 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) { 3338 if (! PL_seen_deprecated_macro) { 3339 PL_seen_deprecated_macro = newHV(); 3340 } 3341 if (! hv_store(PL_seen_deprecated_macro, key, 3342 strlen(key), &PL_sv_undef, 0)) 3343 { 3344 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 3345 } 3346 3347 if (instr(file, "mathoms.c")) { 3348 Perl_warner(aTHX_ category, 3349 "In %s, line %d, starting in Perl v5.32, %s()" 3350 " will be removed. Avoid this message by" 3351 " converting to use %s().\n", 3352 file, line, name, alternative); 3353 } 3354 else { 3355 Perl_warner(aTHX_ category, 3356 "In %s, line %d, starting in Perl v5.32, %s() will" 3357 " require an additional parameter. Avoid this" 3358 " message by converting to use %s().\n", 3359 file, line, name, alternative); 3360 } 3361 } 3362 } 3363 } 3364 #endif 3365 3366 bool 3367 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) 3368 { 3369 PERL_ARGS_ASSERT__IS_UTF8_FOO; 3370 3371 return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]); 3372 } 3373 3374 bool 3375 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e) 3376 { 3377 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; 3378 3379 return is_utf8_common(p, e, PL_utf8_perl_idstart); 3380 } 3381 3382 bool 3383 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e) 3384 { 3385 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; 3386 3387 return is_utf8_common(p, e, PL_utf8_perl_idcont); 3388 } 3389 3390 STATIC UV 3391 S_to_case_cp_list(pTHX_ 3392 const UV original, 3393 const U32 ** const remaining_list, 3394 Size_t * remaining_count, 3395 SV *invlist, const I32 * const invmap, 3396 const U32 * const * const aux_tables, 3397 const U8 * const aux_table_lengths, 3398 const char * const normal) 3399 { 3400 SSize_t index; 3401 I32 base; 3402 3403 /* Calculate the changed case of code point 'original'. The first code 3404 * point of the changed case is returned. 3405 * 3406 * If 'remaining_count' is not NULL, *remaining_count will be set to how 3407 * many *other* code points are in the changed case. If non-zero and 3408 * 'remaining_list' is also not NULL, *remaining_list will be set to point 3409 * to a non-modifiable array containing the second and potentially third 3410 * code points in the changed case. (Unicode guarantees a maximum of 3.) 3411 * Note that this means that *remaining_list is undefined unless there are 3412 * multiple code points, and the caller has chosen to find out how many by 3413 * making 'remaining_count' not NULL. 3414 * 3415 * 'normal' is a string to use to name the new case in any generated 3416 * messages, as a fallback if the operation being used is not available. 3417 * 3418 * The casing to use is given by the data structures in the remaining 3419 * arguments. 3420 */ 3421 3422 PERL_ARGS_ASSERT_TO_CASE_CP_LIST; 3423 3424 /* 'index' is guaranteed to be non-negative, as this is an inversion map 3425 * that covers all possible inputs. See [perl #133365] */ 3426 index = _invlist_search(invlist, original); 3427 base = invmap[index]; 3428 3429 /* Most likely, the case change will contain just a single code point */ 3430 if (remaining_count) { 3431 *remaining_count = 0; 3432 } 3433 3434 if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */ 3435 3436 /* At this bottom level routine is where we warn about illegal code 3437 * points */ 3438 if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) { 3439 if (UNLIKELY(UNICODE_IS_SURROGATE(original))) { 3440 if (ckWARN_d(WARN_SURROGATE)) { 3441 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 3442 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 3443 "Operation \"%s\" returns its argument for" 3444 " UTF-16 surrogate U+%04" UVXf, desc, original); 3445 } 3446 } 3447 else if (UNLIKELY(UNICODE_IS_SUPER(original))) { 3448 if (UNLIKELY(original > MAX_LEGAL_CP)) { 3449 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original)); 3450 } 3451 if (ckWARN_d(WARN_NON_UNICODE)) { 3452 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 3453 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 3454 "Operation \"%s\" returns its argument for" 3455 " non-Unicode code point 0x%04" UVXf, desc, original); 3456 } 3457 } 3458 3459 /* Note that non-characters are perfectly legal, so no warning 3460 * should be given. */ 3461 } 3462 3463 return original; 3464 } 3465 3466 if (LIKELY(base > 0)) { /* means original mapped to a single code point, 3467 different from itself */ 3468 return base + original - invlist_array(invlist)[index]; 3469 } 3470 3471 /* Here 'base' is negative. That means the mapping is 1-to-many, and 3472 * requires an auxiliary table look up. abs(base) gives the index into a 3473 * list of such tables which points to the proper aux table. And a 3474 * parallel list gives the length of each corresponding aux table. Skip 3475 * the first entry in the *remaining returns, as it is returned by the 3476 * function. */ 3477 base = -base; 3478 if (remaining_count) { 3479 *remaining_count = (Size_t) (aux_table_lengths[base] - 1); 3480 3481 if (remaining_list) { 3482 *remaining_list = aux_tables[base] + 1; 3483 } 3484 } 3485 3486 return (UV) aux_tables[base][0]; 3487 } 3488 3489 STATIC UV 3490 S__to_utf8_case(pTHX_ const UV original, const U8 *p, 3491 U8* ustrp, STRLEN *lenp, 3492 SV *invlist, const I32 * const invmap, 3493 const U32 * const * const aux_tables, 3494 const U8 * const aux_table_lengths, 3495 const char * const normal) 3496 { 3497 /* Change the case of code point 'original'. If 'p' is non-NULL, it points to 3498 * the beginning of the (assumed to be valid) UTF-8 representation of 3499 * 'original'. 'normal' is a string to use to name the new case in any 3500 * generated messages, as a fallback if the operation being used is not 3501 * available. The new case is given by the data structures in the 3502 * remaining arguments. 3503 * 3504 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the 3505 * entire changed case string, and the return value is the first code point 3506 * in that string 3507 * 3508 * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes 3509 * since the changed version may be longer than the original character. */ 3510 3511 const U32 * remaining_list; 3512 Size_t remaining_count; 3513 UV first = to_case_cp_list(original, 3514 &remaining_list, &remaining_count, 3515 invlist, invmap, 3516 aux_tables, aux_table_lengths, 3517 normal); 3518 3519 PERL_ARGS_ASSERT__TO_UTF8_CASE; 3520 3521 /* If the code point maps to itself and we already have its representation, 3522 * copy it instead of recalculating */ 3523 if (original == first && p) { 3524 *lenp = UTF8SKIP(p); 3525 3526 if (p != ustrp) { /* Don't copy onto itself */ 3527 Copy(p, ustrp, *lenp, U8); 3528 } 3529 } 3530 else { 3531 U8 * d = ustrp; 3532 Size_t i; 3533 3534 d = uvchr_to_utf8(d, first); 3535 3536 for (i = 0; i < remaining_count; i++) { 3537 d = uvchr_to_utf8(d, remaining_list[i]); 3538 } 3539 3540 *d = '\0'; 3541 *lenp = d - ustrp; 3542 } 3543 3544 return first; 3545 } 3546 3547 Size_t 3548 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to, 3549 const U32 ** remaining_folds_to) 3550 { 3551 /* Returns the count of the number of code points that fold to the input 3552 * 'cp' (besides itself). 3553 * 3554 * If the return is 0, there is nothing else that folds to it, and 3555 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL. 3556 * 3557 * If the return is 1, '*first_folds_to' is set to the single code point, 3558 * and '*remaining_folds_to' is set to NULL. 3559 * 3560 * Otherwise, '*first_folds_to' is set to a code point, and 3561 * '*remaining_fold_to' is set to an array that contains the others. The 3562 * length of this array is the returned count minus 1. 3563 * 3564 * The reason for this convolution is to avoid having to deal with 3565 * allocating and freeing memory. The lists are already constructed, so 3566 * the return can point to them, but single code points aren't, so would 3567 * need to be constructed if we didn't employ something like this API 3568 * 3569 * The code points returned by this function are all legal Unicode, which 3570 * occupy at most 21 bits, and so a U32 is sufficient, and the lists are 3571 * constructed with this size (to save space and memory), and we return 3572 * pointers, so they must be this size */ 3573 3574 /* 'index' is guaranteed to be non-negative, as this is an inversion map 3575 * that covers all possible inputs. See [perl #133365] */ 3576 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); 3577 I32 base = _Perl_IVCF_invmap[index]; 3578 3579 PERL_ARGS_ASSERT__INVERSE_FOLDS; 3580 3581 if (base == 0) { /* No fold */ 3582 *first_folds_to = 0; 3583 *remaining_folds_to = NULL; 3584 return 0; 3585 } 3586 3587 #ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */ 3588 3589 assert(base > 0); 3590 3591 #else 3592 3593 if (UNLIKELY(base < 0)) { /* Folds to more than one character */ 3594 3595 /* The data structure is set up so that the absolute value of 'base' is 3596 * an index into a table of pointers to arrays, with the array 3597 * corresponding to the index being the list of code points that fold 3598 * to 'cp', and the parallel array containing the length of the list 3599 * array */ 3600 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; 3601 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; 3602 /* +1 excludes first_folds_to */ 3603 return IVCF_AUX_TABLE_lengths[-base]; 3604 } 3605 3606 #endif 3607 3608 /* Only the single code point. This works like 'fc(G) = G - A + a' */ 3609 *first_folds_to = (U32) (base + cp 3610 - invlist_array(PL_utf8_foldclosures)[index]); 3611 *remaining_folds_to = NULL; 3612 return 1; 3613 } 3614 3615 STATIC UV 3616 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, 3617 U8* const ustrp, STRLEN *lenp) 3618 { 3619 /* This is called when changing the case of a UTF-8-encoded character above 3620 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the 3621 * result contains a character that crosses the 255/256 boundary, disallow 3622 * the change, and return the original code point. See L<perlfunc/lc> for 3623 * why; 3624 * 3625 * p points to the original string whose case was changed; assumed 3626 * by this routine to be well-formed 3627 * result the code point of the first character in the changed-case string 3628 * ustrp points to the changed-case string (<result> represents its 3629 * first char) 3630 * lenp points to the length of <ustrp> */ 3631 3632 UV original; /* To store the first code point of <p> */ 3633 3634 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING; 3635 3636 assert(UTF8_IS_ABOVE_LATIN1(*p)); 3637 3638 /* We know immediately if the first character in the string crosses the 3639 * boundary, so can skip testing */ 3640 if (result > 255) { 3641 3642 /* Look at every character in the result; if any cross the 3643 * boundary, the whole thing is disallowed */ 3644 U8* s = ustrp + UTF8SKIP(ustrp); 3645 U8* e = ustrp + *lenp; 3646 while (s < e) { 3647 if (! UTF8_IS_ABOVE_LATIN1(*s)) { 3648 goto bad_crossing; 3649 } 3650 s += UTF8SKIP(s); 3651 } 3652 3653 /* Here, no characters crossed, result is ok as-is, but we warn. */ 3654 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p)); 3655 return result; 3656 } 3657 3658 bad_crossing: 3659 3660 /* Failed, have to return the original */ 3661 original = valid_utf8_to_uvchr(p, lenp); 3662 3663 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3664 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3665 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8" 3666 " locale; resolved to \"\\x{%" UVXf "}\".", 3667 OP_DESC(PL_op), 3668 original, 3669 original); 3670 Copy(p, ustrp, *lenp, char); 3671 return original; 3672 } 3673 3674 STATIC UV 3675 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e, 3676 U8 * ustrp, STRLEN *lenp) 3677 { 3678 /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from 3679 * p0..e-1 according to Turkic rules is the same as for non-Turkic. 3680 * Otherwise, it returns the first code point of the Turkic foldcased 3681 * sequence, and the entire sequence will be stored in *ustrp. ustrp will 3682 * contain *lenp bytes 3683 * 3684 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER 3685 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER 3686 * DOTLESS I */ 3687 3688 PERL_ARGS_ASSERT_TURKIC_FC; 3689 assert(e > p); 3690 3691 if (UNLIKELY(*p == 'I')) { 3692 *lenp = 2; 3693 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I); 3694 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I); 3695 return LATIN_SMALL_LETTER_DOTLESS_I; 3696 } 3697 3698 if (UNLIKELY(memBEGINs(p, e - p, 3699 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8))) 3700 { 3701 *lenp = 1; 3702 *ustrp = 'i'; 3703 return 'i'; 3704 } 3705 3706 return 0; 3707 } 3708 3709 STATIC UV 3710 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e, 3711 U8 * ustrp, STRLEN *lenp) 3712 { 3713 /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from 3714 * p0..e-1 according to Turkic rules is the same as for non-Turkic. 3715 * Otherwise, it returns the first code point of the Turkic lowercased 3716 * sequence, and the entire sequence will be stored in *ustrp. ustrp will 3717 * contain *lenp bytes */ 3718 3719 PERL_ARGS_ASSERT_TURKIC_LC; 3720 assert(e > p0); 3721 3722 /* A 'I' requires context as to what to do */ 3723 if (UNLIKELY(*p0 == 'I')) { 3724 const U8 * p = p0 + 1; 3725 3726 /* According to the Unicode SpecialCasing.txt file, a capital 'I' 3727 * modified by a dot above lowercases to 'i' even in turkic locales. */ 3728 while (p < e) { 3729 UV cp; 3730 3731 if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) { 3732 ustrp[0] = 'i'; 3733 *lenp = 1; 3734 return 'i'; 3735 } 3736 3737 /* For the dot above to modify the 'I', it must be part of a 3738 * combining sequence immediately following the 'I', and no other 3739 * modifier with a ccc of 230 may intervene */ 3740 cp = utf8_to_uvchr_buf(p, e, NULL); 3741 if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) { 3742 break; 3743 } 3744 3745 /* Here the combining sequence continues */ 3746 p += UTF8SKIP(p); 3747 } 3748 } 3749 3750 /* In all other cases the lc is the same as the fold */ 3751 return turkic_fc(p0, e, ustrp, lenp); 3752 } 3753 3754 STATIC UV 3755 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, 3756 U8 * ustrp, STRLEN *lenp) 3757 { 3758 /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence 3759 * from p0..e-1 according to Turkic rules is the same as for non-Turkic. 3760 * Otherwise, it returns the first code point of the Turkic upper or 3761 * title-cased sequence, and the entire sequence will be stored in *ustrp. 3762 * ustrp will contain *lenp bytes 3763 * 3764 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER 3765 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER 3766 * DOTLESS I */ 3767 3768 PERL_ARGS_ASSERT_TURKIC_UC; 3769 assert(e > p); 3770 3771 if (*p == 'i') { 3772 *lenp = 2; 3773 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 3774 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 3775 return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; 3776 } 3777 3778 if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) { 3779 *lenp = 1; 3780 *ustrp = 'I'; 3781 return 'I'; 3782 } 3783 3784 return 0; 3785 } 3786 3787 /* The process for changing the case is essentially the same for the four case 3788 * change types, except there are complications for folding. Otherwise the 3789 * difference is only which case to change to. To make sure that they all do 3790 * the same thing, the bodies of the functions are extracted out into the 3791 * following two macros. The functions are written with the same variable 3792 * names, and these are known and used inside these macros. It would be 3793 * better, of course, to have inline functions to do it, but since different 3794 * macros are called, depending on which case is being changed to, this is not 3795 * feasible in C (to khw's knowledge). Two macros are created so that the fold 3796 * function can start with the common start macro, then finish with its special 3797 * handling; while the other three cases can just use the common end macro. 3798 * 3799 * The algorithm is to use the proper (passed in) macro or function to change 3800 * the case for code points that are below 256. The macro is used if using 3801 * locale rules for the case change; the function if not. If the code point is 3802 * above 255, it is computed from the input UTF-8, and another macro is called 3803 * to do the conversion. If necessary, the output is converted to UTF-8. If 3804 * using a locale, we have to check that the change did not cross the 255/256 3805 * boundary, see check_locale_boundary_crossing() for further details. 3806 * 3807 * The macros are split with the correct case change for the below-256 case 3808 * stored into 'result', and in the middle of an else clause for the above-255 3809 * case. At that point in the 'else', 'result' is not the final result, but is 3810 * the input code point calculated from the UTF-8. The fold code needs to 3811 * realize all this and take it from there. 3812 * 3813 * To deal with Turkic locales, the function specified by the parameter 3814 * 'turkic' is called when appropriate. 3815 * 3816 * If you read the two macros as sequential, it's easier to understand what's 3817 * going on. */ 3818 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func, \ 3819 L1_func_extra_param, turkic) \ 3820 \ 3821 if (flags & (locale_flags)) { \ 3822 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \ 3823 if (IN_UTF8_CTYPE_LOCALE) { \ 3824 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) { \ 3825 UV ret = turkic(p, e, ustrp, lenp); \ 3826 if (ret) return ret; \ 3827 } \ 3828 \ 3829 /* Otherwise, treat a UTF-8 locale as not being in locale at \ 3830 * all */ \ 3831 flags &= ~(locale_flags); \ 3832 } \ 3833 } \ 3834 \ 3835 if (UTF8_IS_INVARIANT(*p)) { \ 3836 if (flags & (locale_flags)) { \ 3837 result = libc_change_function(*p); \ 3838 } \ 3839 else { \ 3840 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \ 3841 } \ 3842 } \ 3843 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \ 3844 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \ 3845 if (flags & (locale_flags)) { \ 3846 result = libc_change_function(c); \ 3847 } \ 3848 else { \ 3849 return L1_func(c, ustrp, lenp, L1_func_extra_param); \ 3850 } \ 3851 } \ 3852 else { /* malformed UTF-8 or ord above 255 */ \ 3853 STRLEN len_result; \ 3854 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \ 3855 if (len_result == (STRLEN) -1) { \ 3856 _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ ); \ 3857 } 3858 3859 #define CASE_CHANGE_BODY_END(locale_flags, change_macro) \ 3860 result = change_macro(result, p, ustrp, lenp); \ 3861 \ 3862 if (flags & (locale_flags)) { \ 3863 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \ 3864 } \ 3865 return result; \ 3866 } \ 3867 \ 3868 /* Here, used locale rules. Convert back to UTF-8 */ \ 3869 if (UTF8_IS_INVARIANT(result)) { \ 3870 *ustrp = (U8) result; \ 3871 *lenp = 1; \ 3872 } \ 3873 else { \ 3874 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \ 3875 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \ 3876 *lenp = 2; \ 3877 } \ 3878 \ 3879 return result; 3880 3881 /* Not currently externally documented, and subject to change: 3882 * <flags> is set iff the rules from the current underlying locale are to 3883 * be used. */ 3884 3885 UV 3886 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, 3887 const U8 *e, 3888 U8* ustrp, 3889 STRLEN *lenp, 3890 bool flags) 3891 { 3892 UV result; 3893 3894 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; 3895 3896 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */ 3897 /* 2nd char of uc(U+DF) is 'S' */ 3898 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S', 3899 turkic_uc); 3900 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE); 3901 } 3902 3903 /* Not currently externally documented, and subject to change: 3904 * <flags> is set iff the rules from the current underlying locale are to be 3905 * used. Since titlecase is not defined in POSIX, for other than a 3906 * UTF-8 locale, uppercase is used instead for code points < 256. 3907 */ 3908 3909 UV 3910 Perl__to_utf8_title_flags(pTHX_ const U8 *p, 3911 const U8 *e, 3912 U8* ustrp, 3913 STRLEN *lenp, 3914 bool flags) 3915 { 3916 UV result; 3917 3918 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; 3919 3920 /* 2nd char of ucfirst(U+DF) is 's' */ 3921 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's', 3922 turkic_uc); 3923 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE); 3924 } 3925 3926 /* Not currently externally documented, and subject to change: 3927 * <flags> is set iff the rules from the current underlying locale are to 3928 * be used. 3929 */ 3930 3931 UV 3932 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, 3933 const U8 *e, 3934 U8* ustrp, 3935 STRLEN *lenp, 3936 bool flags) 3937 { 3938 UV result; 3939 3940 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; 3941 3942 CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */, 3943 turkic_lc); 3944 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE) 3945 } 3946 3947 /* Not currently externally documented, and subject to change, 3948 * in <flags> 3949 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 3950 * locale are to be used. 3951 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used; 3952 * otherwise simple folds 3953 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are 3954 * prohibited 3955 */ 3956 3957 UV 3958 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, 3959 const U8 *e, 3960 U8* ustrp, 3961 STRLEN *lenp, 3962 U8 flags) 3963 { 3964 UV result; 3965 3966 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; 3967 3968 /* These are mutually exclusive */ 3969 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII))); 3970 3971 assert(p != ustrp); /* Otherwise overwrites */ 3972 3973 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1, 3974 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)), 3975 turkic_fc); 3976 3977 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL); 3978 3979 if (flags & FOLD_FLAGS_LOCALE) { 3980 3981 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 3982 # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 3983 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 3984 3985 /* Special case these two characters, as what normally gets 3986 * returned under locale doesn't work */ 3987 if (memBEGINs((char *) p, e - p, CAP_SHARP_S)) 3988 { 3989 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3990 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3991 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; " 3992 "resolved to \"\\x{17F}\\x{17F}\"."); 3993 goto return_long_s; 3994 } 3995 else 3996 #endif 3997 if (memBEGINs((char *) p, e - p, LONG_S_T)) 3998 { 3999 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 4000 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 4001 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; " 4002 "resolved to \"\\x{FB06}\"."); 4003 goto return_ligature_st; 4004 } 4005 4006 #if UNICODE_MAJOR_VERSION == 3 \ 4007 && UNICODE_DOT_VERSION == 0 \ 4008 && UNICODE_DOT_DOT_VERSION == 1 4009 # define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 4010 4011 /* And special case this on this Unicode version only, for the same 4012 * reaons the other two are special cased. They would cross the 4013 * 255/256 boundary which is forbidden under /l, and so the code 4014 * wouldn't catch that they are equivalent (which they are only in 4015 * this release) */ 4016 else if (memBEGINs((char *) p, e - p, DOTTED_I)) { 4017 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 4018 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 4019 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " 4020 "resolved to \"\\x{0131}\"."); 4021 goto return_dotless_i; 4022 } 4023 #endif 4024 4025 return check_locale_boundary_crossing(p, result, ustrp, lenp); 4026 } 4027 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { 4028 return result; 4029 } 4030 else { 4031 /* This is called when changing the case of a UTF-8-encoded 4032 * character above the ASCII range, and the result should not 4033 * contain an ASCII character. */ 4034 4035 UV original; /* To store the first code point of <p> */ 4036 4037 /* Look at every character in the result; if any cross the 4038 * boundary, the whole thing is disallowed */ 4039 U8* s = ustrp; 4040 U8* send = ustrp + *lenp; 4041 while (s < send) { 4042 if (isASCII(*s)) { 4043 /* Crossed, have to return the original */ 4044 original = valid_utf8_to_uvchr(p, lenp); 4045 4046 /* But in these instances, there is an alternative we can 4047 * return that is valid */ 4048 if (original == LATIN_SMALL_LETTER_SHARP_S 4049 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ 4050 || original == LATIN_CAPITAL_LETTER_SHARP_S 4051 #endif 4052 ) { 4053 goto return_long_s; 4054 } 4055 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { 4056 goto return_ligature_st; 4057 } 4058 #if UNICODE_MAJOR_VERSION == 3 \ 4059 && UNICODE_DOT_VERSION == 0 \ 4060 && UNICODE_DOT_DOT_VERSION == 1 4061 4062 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 4063 goto return_dotless_i; 4064 } 4065 #endif 4066 Copy(p, ustrp, *lenp, char); 4067 return original; 4068 } 4069 s += UTF8SKIP(s); 4070 } 4071 4072 /* Here, no characters crossed, result is ok as-is */ 4073 return result; 4074 } 4075 } 4076 4077 /* Here, used locale rules. Convert back to UTF-8 */ 4078 if (UTF8_IS_INVARIANT(result)) { 4079 *ustrp = (U8) result; 4080 *lenp = 1; 4081 } 4082 else { 4083 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); 4084 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); 4085 *lenp = 2; 4086 } 4087 4088 return result; 4089 4090 return_long_s: 4091 /* Certain folds to 'ss' are prohibited by the options, but they do allow 4092 * folds to a string of two of these characters. By returning this 4093 * instead, then, e.g., 4094 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") 4095 * works. */ 4096 4097 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8); 4098 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 4099 ustrp, *lenp, U8); 4100 return LATIN_SMALL_LETTER_LONG_S; 4101 4102 return_ligature_st: 4103 /* Two folds to 'st' are prohibited by the options; instead we pick one and 4104 * have the other one fold to it */ 4105 4106 *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8); 4107 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); 4108 return LATIN_SMALL_LIGATURE_ST; 4109 4110 #if UNICODE_MAJOR_VERSION == 3 \ 4111 && UNICODE_DOT_VERSION == 0 \ 4112 && UNICODE_DOT_DOT_VERSION == 1 4113 4114 return_dotless_i: 4115 *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8); 4116 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8); 4117 return LATIN_SMALL_LETTER_DOTLESS_I; 4118 4119 #endif 4120 4121 } 4122 4123 bool 4124 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) 4125 { 4126 /* May change: warns if surrogates, non-character code points, or 4127 * non-Unicode code points are in 's' which has length 'len' bytes. 4128 * Returns TRUE if none found; FALSE otherwise. The only other validity 4129 * check is to make sure that this won't exceed the string's length nor 4130 * overflow */ 4131 4132 const U8* const e = s + len; 4133 bool ok = TRUE; 4134 4135 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT; 4136 4137 while (s < e) { 4138 if (UTF8SKIP(s) > len) { 4139 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 4140 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print"); 4141 return FALSE; 4142 } 4143 if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) { 4144 if (UNLIKELY(UTF8_IS_SUPER(s, e))) { 4145 if ( ckWARN_d(WARN_NON_UNICODE) 4146 || UNLIKELY(0 < does_utf8_overflow(s, s + len, 4147 0 /* Don't consider overlongs */ 4148 ))) 4149 { 4150 /* A side effect of this function will be to warn */ 4151 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER); 4152 ok = FALSE; 4153 } 4154 } 4155 else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) { 4156 if (ckWARN_d(WARN_SURROGATE)) { 4157 /* This has a different warning than the one the called 4158 * function would output, so can't just call it, unlike we 4159 * do for the non-chars and above-unicodes */ 4160 UV uv = utf8_to_uvchr_buf(s, e, NULL); 4161 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 4162 "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", 4163 uv); 4164 ok = FALSE; 4165 } 4166 } 4167 else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e)) 4168 && (ckWARN_d(WARN_NONCHAR))) 4169 { 4170 /* A side effect of this function will be to warn */ 4171 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR); 4172 ok = FALSE; 4173 } 4174 } 4175 s += UTF8SKIP(s); 4176 } 4177 4178 return ok; 4179 } 4180 4181 /* 4182 =for apidoc pv_uni_display 4183 4184 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string 4185 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes 4186 long (if longer, the rest is truncated and C<"..."> will be appended). 4187 4188 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display 4189 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH> 4190 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">) 4191 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">). 4192 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both 4193 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on. 4194 4195 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a 4196 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set. 4197 4198 The pointer to the PV of the C<dsv> is returned. 4199 4200 See also L</sv_uni_display>. 4201 4202 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH 4203 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE 4204 =for apidoc Amnh||UNI_DISPLAY_ISPRINT 4205 =for apidoc Amnh||UNI_DISPLAY_QQ 4206 =for apidoc Amnh||UNI_DISPLAY_REGEX 4207 =cut 4208 */ 4209 char * 4210 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, 4211 UV flags) 4212 { 4213 int truncated = 0; 4214 const char *s, *e; 4215 4216 PERL_ARGS_ASSERT_PV_UNI_DISPLAY; 4217 4218 SvPVCLEAR(dsv); 4219 SvUTF8_off(dsv); 4220 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { 4221 UV u; 4222 bool ok = 0; 4223 4224 if (pvlim && SvCUR(dsv) >= pvlim) { 4225 truncated++; 4226 break; 4227 } 4228 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0); 4229 if (u < 256) { 4230 const U8 c = (U8) u; 4231 if (flags & UNI_DISPLAY_BACKSLASH) { 4232 if ( isMNEMONIC_CNTRL(c) 4233 && ( c != '\b' 4234 || (flags & UNI_DISPLAY_BACKSPACE))) 4235 { 4236 const char * mnemonic = cntrl_to_mnemonic(c); 4237 sv_catpvn(dsv, mnemonic, strlen(mnemonic)); 4238 ok = 1; 4239 } 4240 else if (c == '\\') { 4241 sv_catpvs(dsv, "\\\\"); 4242 ok = 1; 4243 } 4244 } 4245 /* isPRINT() is the locale-blind version. */ 4246 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { 4247 const char string = c; 4248 sv_catpvn(dsv, &string, 1); 4249 ok = 1; 4250 } 4251 } 4252 if (!ok) 4253 Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u); 4254 } 4255 if (truncated) 4256 sv_catpvs(dsv, "..."); 4257 4258 return SvPVX(dsv); 4259 } 4260 4261 /* 4262 =for apidoc sv_uni_display 4263 4264 Build to the scalar C<dsv> a displayable version of the scalar C<sv>, 4265 the displayable version being at most C<pvlim> bytes long 4266 (if longer, the rest is truncated and "..." will be appended). 4267 4268 The C<flags> argument is as in L</pv_uni_display>(). 4269 4270 The pointer to the PV of the C<dsv> is returned. 4271 4272 =cut 4273 */ 4274 char * 4275 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) 4276 { 4277 const char * const ptr = 4278 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv); 4279 4280 PERL_ARGS_ASSERT_SV_UNI_DISPLAY; 4281 4282 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr, 4283 SvCUR(ssv), pvlim, flags); 4284 } 4285 4286 /* 4287 =for apidoc foldEQ_utf8 4288 4289 Returns true if the leading portions of the strings C<s1> and C<s2> (either or 4290 both of which may be in UTF-8) are the same case-insensitively; false 4291 otherwise. How far into the strings to compare is determined by other input 4292 parameters. 4293 4294 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode; 4295 otherwise it is assumed to be in native 8-bit encoding. Correspondingly for 4296 C<u2> with respect to C<s2>. 4297 4298 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for 4299 fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach. 4300 The scan will not be considered to be a match unless the goal is reached, and 4301 scanning won't continue past that goal. Correspondingly for C<l2> with respect 4302 to C<s2>. 4303 4304 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that 4305 pointer is considered an end pointer to the position 1 byte past the maximum 4306 point in C<s1> beyond which scanning will not continue under any circumstances. 4307 (This routine assumes that UTF-8 encoded input strings are not malformed; 4308 malformed input can cause it to read past C<pe1>). This means that if both 4309 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match 4310 will never be successful because it can never 4311 get as far as its goal (and in fact is asserted against). Correspondingly for 4312 C<pe2> with respect to C<s2>. 4313 4314 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and 4315 C<l2> must be non-zero), and if both do, both have to be 4316 reached for a successful match. Also, if the fold of a character is multiple 4317 characters, all of them must be matched (see tr21 reference below for 4318 'folding'). 4319 4320 Upon a successful match, if C<pe1> is non-C<NULL>, 4321 it will be set to point to the beginning of the I<next> character of C<s1> 4322 beyond what was matched. Correspondingly for C<pe2> and C<s2>. 4323 4324 For case-insensitiveness, the "casefolding" of Unicode is used 4325 instead of upper/lowercasing both the characters, see 4326 L<https://www.unicode.org/reports/tr21/> (Case Mappings). 4327 4328 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII 4329 =for apidoc Cmnh||FOLDEQ_LOCALE 4330 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED 4331 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE 4332 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED 4333 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE 4334 4335 =cut */ 4336 4337 /* A flags parameter has been added which may change, and hence isn't 4338 * externally documented. Currently it is: 4339 * 0 for as-documented above 4340 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an 4341 ASCII one, to not match 4342 * FOLDEQ_LOCALE is set iff the rules from the current underlying 4343 * locale are to be used. 4344 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this 4345 * routine. This allows that step to be skipped. 4346 * Currently, this requires s1 to be encoded as UTF-8 4347 * (u1 must be true), which is asserted for. 4348 * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may 4349 * cross certain boundaries. Hence, the caller should 4350 * let this function do the folding instead of 4351 * pre-folding. This code contains an assertion to 4352 * that effect. However, if the caller knows what 4353 * it's doing, it can pass this flag to indicate that, 4354 * and the assertion is skipped. 4355 * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies 4356 * to s2, and s2 doesn't have to be UTF-8 encoded. 4357 * This introduces an asymmetry to save a few branches 4358 * in a loop. Currently, this is not a problem, as 4359 * never are both inputs pre-folded. Simply call this 4360 * function with the pre-folded one as the second 4361 * string. 4362 * FOLDEQ_S2_FOLDS_SANE 4363 */ 4364 4365 I32 4366 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, 4367 const char *s2, char **pe2, UV l2, bool u2, 4368 U32 flags) 4369 { 4370 const U8 *p1 = (const U8*)s1; /* Point to current char */ 4371 const U8 *p2 = (const U8*)s2; 4372 const U8 *g1 = NULL; /* goal for s1 */ 4373 const U8 *g2 = NULL; 4374 const U8 *e1 = NULL; /* Don't scan s1 past this */ 4375 U8 *f1 = NULL; /* Point to current folded */ 4376 const U8 *e2 = NULL; 4377 U8 *f2 = NULL; 4378 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */ 4379 U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; 4380 U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; 4381 U8 flags_for_folder = FOLD_FLAGS_FULL; 4382 4383 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; 4384 4385 assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) 4386 && (( (flags & FOLDEQ_S1_ALREADY_FOLDED) 4387 && !(flags & FOLDEQ_S1_FOLDS_SANE)) 4388 || ( (flags & FOLDEQ_S2_ALREADY_FOLDED) 4389 && !(flags & FOLDEQ_S2_FOLDS_SANE))))); 4390 /* The algorithm is to trial the folds without regard to the flags on 4391 * the first line of the above assert(), and then see if the result 4392 * violates them. This means that the inputs can't be pre-folded to a 4393 * violating result, hence the assert. This could be changed, with the 4394 * addition of extra tests here for the already-folded case, which would 4395 * slow it down. That cost is more than any possible gain for when these 4396 * flags are specified, as the flags indicate /il or /iaa matching which 4397 * is less common than /iu, and I (khw) also believe that real-world /il 4398 * and /iaa matches are most likely to involve code points 0-255, and this 4399 * function only under rare conditions gets called for 0-255. */ 4400 4401 if (flags & FOLDEQ_LOCALE) { 4402 if (IN_UTF8_CTYPE_LOCALE) { 4403 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) { 4404 flags_for_folder |= FOLD_FLAGS_LOCALE; 4405 } 4406 else { 4407 flags &= ~FOLDEQ_LOCALE; 4408 } 4409 } 4410 else { 4411 flags_for_folder |= FOLD_FLAGS_LOCALE; 4412 } 4413 } 4414 if (flags & FOLDEQ_UTF8_NOMIX_ASCII) { 4415 flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII; 4416 } 4417 4418 if (pe1) { 4419 e1 = *(U8**)pe1; 4420 } 4421 4422 if (l1) { 4423 g1 = (const U8*)s1 + l1; 4424 } 4425 4426 if (pe2) { 4427 e2 = *(U8**)pe2; 4428 } 4429 4430 if (l2) { 4431 g2 = (const U8*)s2 + l2; 4432 } 4433 4434 /* Must have at least one goal */ 4435 assert(g1 || g2); 4436 4437 if (g1) { 4438 4439 /* Will never match if goal is out-of-bounds */ 4440 assert(! e1 || e1 >= g1); 4441 4442 /* Here, there isn't an end pointer, or it is beyond the goal. We 4443 * only go as far as the goal */ 4444 e1 = g1; 4445 } 4446 else { 4447 assert(e1); /* Must have an end for looking at s1 */ 4448 } 4449 4450 /* Same for goal for s2 */ 4451 if (g2) { 4452 assert(! e2 || e2 >= g2); 4453 e2 = g2; 4454 } 4455 else { 4456 assert(e2); 4457 } 4458 4459 /* If both operands are already folded, we could just do a memEQ on the 4460 * whole strings at once, but it would be better if the caller realized 4461 * this and didn't even call us */ 4462 4463 /* Look through both strings, a character at a time */ 4464 while (p1 < e1 && p2 < e2) { 4465 4466 /* If at the beginning of a new character in s1, get its fold to use 4467 * and the length of the fold. */ 4468 if (n1 == 0) { 4469 if (flags & FOLDEQ_S1_ALREADY_FOLDED) { 4470 f1 = (U8 *) p1; 4471 assert(u1); 4472 n1 = UTF8SKIP(f1); 4473 } 4474 else { 4475 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) { 4476 4477 /* We have to forbid mixing ASCII with non-ASCII if the 4478 * flags so indicate. And, we can short circuit having to 4479 * call the general functions for this common ASCII case, 4480 * all of whose non-locale folds are also ASCII, and hence 4481 * UTF-8 invariants, so the UTF8ness of the strings is not 4482 * relevant. */ 4483 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) { 4484 return 0; 4485 } 4486 n1 = 1; 4487 *foldbuf1 = toFOLD(*p1); 4488 } 4489 else if (u1) { 4490 _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder); 4491 } 4492 else { /* Not UTF-8, get UTF-8 fold */ 4493 _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder); 4494 } 4495 f1 = foldbuf1; 4496 } 4497 } 4498 4499 if (n2 == 0) { /* Same for s2 */ 4500 if (flags & FOLDEQ_S2_ALREADY_FOLDED) { 4501 4502 /* Point to the already-folded character. But for non-UTF-8 4503 * variants, convert to UTF-8 for the algorithm below */ 4504 if (UTF8_IS_INVARIANT(*p2)) { 4505 f2 = (U8 *) p2; 4506 n2 = 1; 4507 } 4508 else if (u2) { 4509 f2 = (U8 *) p2; 4510 n2 = UTF8SKIP(f2); 4511 } 4512 else { 4513 foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2); 4514 foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2); 4515 f2 = foldbuf2; 4516 n2 = 2; 4517 } 4518 } 4519 else { 4520 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) { 4521 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) { 4522 return 0; 4523 } 4524 n2 = 1; 4525 *foldbuf2 = toFOLD(*p2); 4526 } 4527 else if (u2) { 4528 _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder); 4529 } 4530 else { 4531 _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder); 4532 } 4533 f2 = foldbuf2; 4534 } 4535 } 4536 4537 /* Here f1 and f2 point to the beginning of the strings to compare. 4538 * These strings are the folds of the next character from each input 4539 * string, stored in UTF-8. */ 4540 4541 /* While there is more to look for in both folds, see if they 4542 * continue to match */ 4543 while (n1 && n2) { 4544 U8 fold_length = UTF8SKIP(f1); 4545 if (fold_length != UTF8SKIP(f2) 4546 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE 4547 function call for single 4548 byte */ 4549 || memNE((char*)f1, (char*)f2, fold_length)) 4550 { 4551 return 0; /* mismatch */ 4552 } 4553 4554 /* Here, they matched, advance past them */ 4555 n1 -= fold_length; 4556 f1 += fold_length; 4557 n2 -= fold_length; 4558 f2 += fold_length; 4559 } 4560 4561 /* When reach the end of any fold, advance the input past it */ 4562 if (n1 == 0) { 4563 p1 += u1 ? UTF8SKIP(p1) : 1; 4564 } 4565 if (n2 == 0) { 4566 p2 += u2 ? UTF8SKIP(p2) : 1; 4567 } 4568 } /* End of loop through both strings */ 4569 4570 /* A match is defined by each scan that specified an explicit length 4571 * reaching its final goal, and the other not having matched a partial 4572 * character (which can happen when the fold of a character is more than one 4573 * character). */ 4574 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) { 4575 return 0; 4576 } 4577 4578 /* Successful match. Set output pointers */ 4579 if (pe1) { 4580 *pe1 = (char*)p1; 4581 } 4582 if (pe2) { 4583 *pe2 = (char*)p2; 4584 } 4585 return 1; 4586 } 4587 4588 /* 4589 * ex: set ts=8 sts=4 sw=4 et: 4590 */ 4591