1 /* utf8.c 2 * 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 4 * by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever 13 * heard of that we don't want to see any closer; and that's the one place 14 * we're trying to get to! And that's just where we can't get, nohow.' 15 * 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"] 17 * 18 * 'Well do I understand your speech,' he answered in the same language; 19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue, 20 * as is the custom in the West, if you wish to be answered?' 21 * --Gandalf, addressing Théoden's door wardens 22 * 23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 24 * 25 * ...the travellers perceived that the floor was paved with stones of many 26 * hues; branching runes and strange devices intertwined beneath their feet. 27 * 28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_UTF8_C 33 #include "perl.h" 34 #include "invlist_inline.h" 35 36 static const char malformed_text[] = "Malformed UTF-8 character"; 37 static const char unees[] = 38 "Malformed UTF-8 character (unexpected end of string)"; 39 40 /* strlen() of a literal string constant. We might want this more general, 41 * but using it in just this file for now. A problem with more generality is 42 * the compiler warnings about comparing unlike signs */ 43 #define STRLENs(s) (sizeof("" s "") - 1) 44 45 /* 46 These are various utility functions for manipulating UTF8-encoded 47 strings. For the uninitiated, this is a method of representing arbitrary 48 Unicode characters as a variable number of bytes, in such a way that 49 characters in the ASCII range are unmodified, and a zero byte never appears 50 within non-zero characters. 51 */ 52 53 /* helper for Perl__force_out_malformed_utf8_message(). Like 54 * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than 55 * PL_compiling */ 56 57 static void 58 S_restore_cop_warnings(pTHX_ void *p) 59 { 60 free_and_set_cop_warnings(PL_curcop, (STRLEN*) p); 61 } 62 63 64 void 65 Perl__force_out_malformed_utf8_message(pTHX_ 66 const U8 *const p, /* First byte in UTF-8 sequence */ 67 const U8 * const e, /* Final byte in sequence (may include 68 multiple chars */ 69 const U32 flags, /* Flags to pass to utf8n_to_uvchr(), 70 usually 0, or some DISALLOW flags */ 71 const bool die_here) /* If TRUE, this function does not return */ 72 { 73 /* This core-only function is to be called when a malformed UTF-8 character 74 * is found, in order to output the detailed information about the 75 * malformation before dieing. The reason it exists is for the occasions 76 * when such a malformation is fatal, but warnings might be turned off, so 77 * that normally they would not be actually output. This ensures that they 78 * do get output. Because a sequence may be malformed in more than one 79 * way, multiple messages may be generated, so we can't make them fatal, as 80 * that would cause the first one to die. 81 * 82 * Instead we pretend -W was passed to perl, then die afterwards. The 83 * flexibility is here to return to the caller so they can finish up and 84 * die themselves */ 85 U32 errors; 86 87 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE; 88 89 ENTER; 90 SAVEI8(PL_dowarn); 91 SAVESPTR(PL_curcop); 92 93 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; 94 if (PL_curcop) { 95 /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather 96 * than PL_compiling */ 97 SAVEDESTRUCTOR_X(S_restore_cop_warnings, 98 (void*)PL_curcop->cop_warnings); 99 PL_curcop->cop_warnings = pWARN_ALL; 100 } 101 102 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); 103 104 LEAVE; 105 106 if (! errors) { 107 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should" 108 " be called only when there are errors found"); 109 } 110 111 if (die_here) { 112 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); 113 } 114 } 115 116 STATIC HV * 117 S_new_msg_hv(pTHX_ const char * const message, /* The message text */ 118 U32 categories, /* Packed warning categories */ 119 U32 flag) /* Flag associated with this message */ 120 { 121 /* Creates, populates, and returns an HV* that describes an error message 122 * for the translators between UTF8 and code point */ 123 124 SV* msg_sv = newSVpv(message, 0); 125 SV* category_sv = newSVuv(categories); 126 SV* flag_bit_sv = newSVuv(flag); 127 128 HV* msg_hv = newHV(); 129 130 PERL_ARGS_ASSERT_NEW_MSG_HV; 131 132 (void) hv_stores(msg_hv, "text", msg_sv); 133 (void) hv_stores(msg_hv, "warn_categories", category_sv); 134 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv); 135 136 return msg_hv; 137 } 138 139 /* 140 =for apidoc uvoffuni_to_utf8_flags 141 142 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 143 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or 144 L<perlapi/uvchr_to_utf8_flags>>. 145 146 This function is like them, but the input is a strict Unicode 147 (as opposed to native) code point. Only in very rare circumstances should code 148 not be using the native code point. 149 150 For details, see the description for L<perlapi/uvchr_to_utf8_flags>. 151 152 =cut 153 */ 154 155 U8 * 156 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) 157 { 158 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; 159 160 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL); 161 } 162 163 /* All these formats take a single UV code point argument */ 164 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf; 165 const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf 166 " is not recommended for open interchange"; 167 const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," 168 " may not be portable"; 169 170 /* Use shorter names internally in this file */ 171 #define SHIFT UTF_ACCUMULATION_SHIFT 172 #undef MARK 173 #define MARK UTF_CONTINUATION_MARK 174 #define MASK UTF_CONTINUATION_MASK 175 176 /* 177 =for apidoc uvchr_to_utf8_flags_msgs 178 179 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 180 181 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly. 182 183 This function is for code that wants any warning and/or error messages to be 184 returned to the caller rather than be displayed. All messages that would have 185 been displayed if all lexical warnings are enabled will be returned. 186 187 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter 188 placed after all the others, C<msgs>. If this parameter is 0, this function 189 behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should 190 be a pointer to an C<HV *> variable, in which this function creates a new HV to 191 contain any appropriate messages. The hash has three key-value pairs, as 192 follows: 193 194 =over 4 195 196 =item C<text> 197 198 The text of the message as a C<SVpv>. 199 200 =item C<warn_categories> 201 202 The warning category (or categories) packed into a C<SVuv>. 203 204 =item C<flag> 205 206 A single flag bit associated with this message, in a C<SVuv>. 207 The bit corresponds to some bit in the C<*errors> return value, 208 such as C<UNICODE_GOT_SURROGATE>. 209 210 =back 211 212 It's important to note that specifying this parameter as non-null will cause 213 any warnings this function would otherwise generate to be suppressed, and 214 instead be placed in C<*msgs>. The caller can check the lexical warnings state 215 (or not) when choosing what to do with the returned messages. 216 217 The caller, of course, is responsible for freeing any returned HV. 218 219 =cut 220 */ 221 222 /* Undocumented; we don't want people using this. Instead they should use 223 * uvchr_to_utf8_flags_msgs() */ 224 U8 * 225 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs) 226 { 227 U8 *p; 228 UV shifted_uv = input_uv; 229 STRLEN utf8_skip = OFFUNISKIP(input_uv); 230 231 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS; 232 233 if (msgs) { 234 *msgs = NULL; 235 } 236 237 switch (utf8_skip) { 238 case 1: 239 *d++ = LATIN1_TO_NATIVE(input_uv); 240 return d; 241 242 default: 243 if ( UNLIKELY(input_uv > MAX_LEGAL_CP 244 && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))) 245 { 246 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */ 247 NULL, 0, input_uv)); 248 } 249 250 if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) { 251 U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); 252 const char * format = PL_extended_cp_format; 253 if (msgs) { 254 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 255 category, 256 UNICODE_GOT_PERL_EXTENDED); 257 } 258 else { 259 Perl_ck_warner_d(aTHX_ category, format, input_uv); 260 } 261 262 /* Don't output a 2nd msg */ 263 flags &= ~UNICODE_WARN_SUPER; 264 } 265 266 if (flags & UNICODE_DISALLOW_PERL_EXTENDED) { 267 return NULL; 268 } 269 270 p = d + utf8_skip - 1; 271 while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) { 272 *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 273 shifted_uv >>= SHIFT; 274 } 275 276 /* FALLTHROUGH */ 277 278 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: 279 d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT] 280 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 281 shifted_uv >>= SHIFT; 282 /* FALLTHROUGH */ 283 284 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: 285 d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT] 286 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 287 shifted_uv >>= SHIFT; 288 /* FALLTHROUGH */ 289 290 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT: 291 if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) { 292 if (flags & UNICODE_WARN_SUPER) { 293 U32 category = packWARN(WARN_NON_UNICODE); 294 const char * format = super_cp_format; 295 296 if (msgs) { 297 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 298 category, 299 UNICODE_GOT_SUPER); 300 } 301 else { 302 Perl_ck_warner_d(aTHX_ category, format, input_uv); 303 } 304 305 if (flags & UNICODE_DISALLOW_SUPER) { 306 return NULL; 307 } 308 } 309 if ( (flags & UNICODE_DISALLOW_SUPER) 310 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED) 311 && UNICODE_IS_PERL_EXTENDED(input_uv))) 312 { 313 return NULL; 314 } 315 } 316 317 d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT] 318 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 319 shifted_uv >>= SHIFT; 320 /* FALLTHROUGH */ 321 322 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT: 323 if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) { 324 if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) { 325 if (flags & UNICODE_WARN_NONCHAR) { 326 U32 category = packWARN(WARN_NONCHAR); 327 const char * format = nonchar_cp_format; 328 if (msgs) { 329 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 330 category, 331 UNICODE_GOT_NONCHAR); 332 } 333 else { 334 Perl_ck_warner_d(aTHX_ category, format, input_uv); 335 } 336 } 337 if (flags & UNICODE_DISALLOW_NONCHAR) { 338 return NULL; 339 } 340 } 341 else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) { 342 if (flags & UNICODE_WARN_SURROGATE) { 343 U32 category = packWARN(WARN_SURROGATE); 344 const char * format = surrogate_cp_format; 345 if (msgs) { 346 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv), 347 category, 348 UNICODE_GOT_SURROGATE); 349 } 350 else { 351 Perl_ck_warner_d(aTHX_ category, format, input_uv); 352 } 353 } 354 if (flags & UNICODE_DISALLOW_SURROGATE) { 355 return NULL; 356 } 357 } 358 } 359 360 d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT] 361 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 362 shifted_uv >>= SHIFT; 363 /* FALLTHROUGH */ 364 365 #ifdef EBCDIC 366 367 case 3: 368 d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 369 shifted_uv >>= SHIFT; 370 /* FALLTHROUGH */ 371 372 #endif 373 374 /* FALLTHROUGH */ 375 case 2: 376 d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK); 377 shifted_uv >>= SHIFT; 378 d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip)) 379 | UTF_START_MARK(utf8_skip)); 380 break; 381 } 382 383 return d + utf8_skip; 384 } 385 386 /* 387 =for apidoc uvchr_to_utf8 388 389 Adds the UTF-8 representation of the native code point C<uv> to the end 390 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to 391 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 392 the byte after the end of the new character. In other words, 393 394 d = uvchr_to_utf8(d, uv); 395 396 is the recommended wide native character-aware way of saying 397 398 *(d++) = uv; 399 400 This function accepts any code point from 0..C<IV_MAX> as input. 401 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. 402 403 It is possible to forbid or warn on non-Unicode code points, or those that may 404 be problematic by using L</uvchr_to_utf8_flags>. 405 406 =cut 407 */ 408 409 /* This is also a macro */ 410 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); 411 412 U8 * 413 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) 414 { 415 return uvchr_to_utf8(d, uv); 416 } 417 418 /* 419 =for apidoc uvchr_to_utf8_flags 420 421 Adds the UTF-8 representation of the native code point C<uv> to the end 422 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to 423 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to 424 the byte after the end of the new character. In other words, 425 426 d = uvchr_to_utf8_flags(d, uv, flags); 427 428 or, in most cases, 429 430 d = uvchr_to_utf8_flags(d, uv, 0); 431 432 This is the Unicode-aware way of saying 433 434 *(d++) = uv; 435 436 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as 437 input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word. 438 439 Specifying C<flags> can further restrict what is allowed and not warned on, as 440 follows: 441 442 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set, 443 the function will raise a warning, provided UTF8 warnings are enabled. If 444 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return 445 NULL. If both flags are set, the function will both warn and return NULL. 446 447 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags 448 affect how the function handles a Unicode non-character. 449 450 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags 451 affect the handling of code points that are above the Unicode maximum of 452 0x10FFFF. Languages other than Perl may not be able to accept files that 453 contain these. 454 455 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of 456 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all 457 three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the 458 allowed inputs to the strict UTF-8 traditionally defined by Unicode. 459 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and 460 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the 461 above-Unicode and surrogate flags, but not the non-character ones, as 462 defined in 463 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>. 464 See L<perlunicode/Noncharacter code points>. 465 466 Extremely high code points were never specified in any standard, and require an 467 extension to UTF-8 to express, which Perl does. It is likely that programs 468 written in something other than Perl would not be able to read files that 469 contain these; nor would Perl understand files written by something that uses a 470 different extension. For these reasons, there is a separate set of flags that 471 can warn and/or disallow these extremely high code points, even if other 472 above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED> 473 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see 474 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will 475 treat all above-Unicode code points, including these, as malformations. (Note 476 that the Unicode standard considers anything above 0x10FFFF to be illegal, but 477 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1)) 478 479 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is 480 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly, 481 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named 482 C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC 483 platforms,these flags can apply to code points that actually do fit in 31 bits. 484 The new names accurately describe the situation in all cases. 485 486 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT 487 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE 488 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 489 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR 490 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED 491 =for apidoc Amnh||UNICODE_DISALLOW_SUPER 492 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE 493 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT 494 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE 495 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE 496 =for apidoc Amnh||UNICODE_WARN_NONCHAR 497 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED 498 =for apidoc Amnh||UNICODE_WARN_SUPER 499 =for apidoc Amnh||UNICODE_WARN_SURROGATE 500 501 =cut 502 */ 503 504 /* This is also a macro */ 505 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags); 506 507 U8 * 508 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) 509 { 510 return uvchr_to_utf8_flags(d, uv, flags); 511 } 512 513 PERL_STATIC_INLINE int 514 S_is_utf8_overlong(const U8 * const s, const STRLEN len) 515 { 516 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 517 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if 518 * it isn't, and -1 if there isn't enough information to tell. This last 519 * return value can happen if the sequence is incomplete, missing some 520 * trailing bytes that would form a complete character. If there are 521 * enough bytes to make a definitive decision, this function does so. 522 * Usually 2 bytes are sufficient. 523 * 524 * Overlongs can occur whenever the number of continuation bytes changes. 525 * That means whenever the number of leading 1 bits in a start byte 526 * increases from the next lower start byte. That happens for start bytes 527 * C0, E0, F0, F8, FC, FE, and FF. 528 */ 529 530 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG; 531 532 /* Each platform has overlongs after the start bytes given above (expressed 533 * in I8 for EBCDIC). The values below were found by manually inspecting 534 * the UTF-8 patterns. See the tables in utf8.h and utfebcdic.h. */ 535 536 switch (NATIVE_UTF8_TO_I8(s[0])) { 537 default: 538 assert(UTF8_IS_START(s[0])); 539 return 0; 540 541 case 0xC0: 542 case 0xC1: 543 return 1; 544 545 #ifdef EBCDIC 546 547 case 0xC2: 548 case 0xC3: 549 case 0xC4: 550 case 0xE0: 551 return 1; 552 #else 553 case 0xE0: 554 return (len < 2) ? -1 : s[1] < 0xA0; 555 #endif 556 557 case 0xF0: 558 return (len < 2) 559 ? -1 560 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10; 561 case 0xF8: 562 return (len < 2) 563 ? -1 564 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08; 565 case 0xFC: 566 return (len < 2) 567 ? -1 568 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04; 569 case 0xFE: 570 return (len < 2) 571 ? -1 572 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02; 573 case 0xFF: 574 return isFF_overlong(s, len); 575 } 576 } 577 578 PERL_STATIC_INLINE int 579 S_isFF_overlong(const U8 * const s, const STRLEN len) 580 { 581 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 582 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if 583 * it isn't, and -1 if there isn't enough information to tell. This last 584 * return value can happen if the sequence is incomplete, missing some 585 * trailing bytes that would form a complete character. If there are 586 * enough bytes to make a definitive decision, this function does so. */ 587 588 PERL_ARGS_ASSERT_ISFF_OVERLONG; 589 590 #ifdef EBCDIC 591 /* This works on all three EBCDIC code pages traditionally supported by 592 * perl */ 593 # define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41" 594 #else 595 # define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80" 596 #endif 597 598 /* To be an FF overlong, all the available bytes must match */ 599 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX, 600 MIN(len, STRLENs(FF_OVERLONG_PREFIX))))) 601 { 602 return 0; 603 } 604 605 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must 606 * be there; what comes after them doesn't matter. See tables in utf8.h, 607 * utfebcdic.h. */ 608 if (len >= STRLENs(FF_OVERLONG_PREFIX)) { 609 return 1; 610 } 611 612 /* The missing bytes could cause the result to go one way or the other, so 613 * the result is indeterminate */ 614 return -1; 615 } 616 617 /* At some point we may want to allow core to use up to UV_MAX */ 618 619 #ifdef EBCDIC /* Actually is I8 */ 620 # if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */ 621 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA7" 622 /* UV_MAX "\xFF\xAF" */ 623 # else /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */ 624 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1" 625 /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */ 626 # endif 627 #else 628 # if defined(UV_IS_QUAD) 629 # define HIGHEST_REPRESENTABLE_UTF "\xFF\x80\x87" 630 /* UV_MAX "\xFF\x80" */ 631 # else 632 # define HIGHEST_REPRESENTABLE_UTF "\xFD" 633 /* UV_MAX "\xFE\x83" */ 634 # endif 635 #endif 636 637 PERL_STATIC_INLINE int 638 S_does_utf8_overflow(const U8 * const s, 639 const U8 * e, 640 const bool consider_overlongs) 641 { 642 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to 643 * 'e' - 1 would overflow an IV on this platform; that is if it represents 644 * a code point larger than the highest representable code point. It 645 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't 646 * enough information to tell. This last return value can happen if the 647 * sequence is incomplete, missing some trailing bytes that would form a 648 * complete character. If there are enough bytes to make a definitive 649 * decision, this function does so. 650 * 651 * If 'consider_overlongs' is TRUE, the function checks for the possibility 652 * that the sequence is an overlong that doesn't overflow. Otherwise, it 653 * assumes the sequence is not an overlong. This can give different 654 * results only on ASCII 32-bit platforms. 655 * 656 * (For ASCII platforms, we could use memcmp() because we don't have to 657 * convert each byte to I8, but it's very rare input indeed that would 658 * approach overflow, so the loop below will likely only get executed once.) 659 * 660 */ 661 const STRLEN len = e - s; 662 const U8 *x; 663 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF; 664 int is_overlong = 0; 665 666 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; 667 668 for (x = s; x < e; x++, y++) { 669 670 /* 'y' is set up to not include the trailing bytes that are all the 671 * maximum possible continuation byte. So when we reach the end of 'y' 672 * (known to be NUL terminated), it is impossible for 'x' to contain 673 * bytes larger than those omitted bytes, and therefore 'x' can't 674 * overflow */ 675 if (*y == '\0') { 676 return 0; 677 } 678 679 /* If this byte is less than the corresponding highest non-overflowing 680 * UTF-8, the sequence doesn't overflow */ 681 if (NATIVE_UTF8_TO_I8(*x) < *y) { 682 return 0; 683 } 684 685 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) { 686 goto overflows_if_not_overlong; 687 } 688 } 689 690 /* Got to the end, and all bytes are the same. If the input is a whole 691 * character, it doesn't overflow. And if it is a partial character, 692 * there's not enough information to tell */ 693 return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1; 694 695 overflows_if_not_overlong: 696 697 /* Here, a well-formed sequence overflows. If we are assuming 698 * well-formedness, return that it overflows. */ 699 if (! consider_overlongs) { 700 return 1; 701 } 702 703 /* Here, it could be the overlong malformation, and might not actuallly 704 * overflow if you were to calculate it out. 705 * 706 * See if it actually is overlong */ 707 is_overlong = is_utf8_overlong(s, len); 708 709 /* If it isn't overlong, is well-formed, so overflows */ 710 if (is_overlong == 0) { 711 return 1; 712 } 713 714 /* Not long enough to determine */ 715 if (is_overlong < 0) { 716 return -1; 717 } 718 719 /* Here, it appears to overflow, but it is also overlong */ 720 721 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS 722 723 /* On many platforms, it is impossible for an overlong to overflow. For 724 * these, no further work is necessary: we can return immediately that this 725 * overlong that is an apparent overflow actually isn't 726 * 727 * To see why, note that a length_N sequence can represent as overlongs all 728 * the code points representable by shorter length sequences, but no 729 * higher. If it could represent a higher code point without being an 730 * overlong, we wouldn't have had to increase the sequence length! 731 * 732 * The highest possible start byte is FF; the next highest is FE. The 733 * highest code point representable as an overlong on the platform is thus 734 * the highest code point representable by a non-overlong sequence whose 735 * start byte is FE. If that value doesn't overflow the platform's word 736 * size, overlongs can't overflow. 737 * 738 * FE consists of 7 bytes total; the FE start byte contributes 0 bits of 739 * information (the high 7 bits, all ones, say that the sequence is 7 bytes 740 * long, and the bottom, zero, bit is s placeholder. That leaves the 6 741 * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each. 742 If that number of bits doesn't exceed the word size, it can't overflow. */ 743 744 return 0; 745 746 #else 747 748 /* In practice, only a 32-bit ASCII box gets here. The FE start byte can 749 * represent, as an overlong, the highest code point representable by an FD 750 * start byte, which is 5*6 continuation bytes of info plus one bit from 751 * the start byte, or 31 bits. That doesn't overflow. More explicitly: 752 * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1. 753 * 754 * That means only the FF start byte can have an overflowing overlong. */ 755 if (*s < 0xFF) { 756 return 0; 757 } 758 759 /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that 760 * evaluates to 2**31, so overflows an IV. For a UV it's 761 * \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */ 762 # define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82" 763 764 if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */ 765 return -1; 766 } 767 768 # define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0) 769 770 return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS)); 771 772 #endif 773 774 } 775 776 STRLEN 777 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags) 778 { 779 SSize_t len, full_len; 780 781 /* An internal helper function. 782 * 783 * On input: 784 * 's' is a string, which is known to be syntactically valid UTF-8 as far 785 * as (e - 1); e > s must hold. 786 * 'e' This function is allowed to look at any byte from 's'...'e-1', but 787 * nowhere else. The function has to cope as best it can if that 788 * sequence does not form a full character. 789 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags 790 * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns 791 * 0 if it determines the input will match something disallowed. 792 * On output: 793 * The return is the number of bytes required to represent the code point 794 * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the 795 * input is for a partial character, a successful return will be larger 796 * than 'e - s'. 797 * 798 * If *s..*(e-1) is only for a partial character, the function will return 799 * non-zero if there is any sequence of well-formed UTF-8 that, when 800 * appended to the input sequence, could result in an allowed code point; 801 * otherwise it returns 0. Non characters cannot be determined based on 802 * partial character input. But many of the other excluded types can be 803 * determined with just the first one or two bytes. 804 * 805 */ 806 807 PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_; 808 809 assert(e > s); 810 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE 811 |UTF8_DISALLOW_PERL_EXTENDED))); 812 813 full_len = UTF8SKIP(s); 814 815 len = e - s; 816 if (len > full_len) { 817 e = s + full_len; 818 len = full_len; 819 } 820 821 switch (full_len) { 822 bool is_super; 823 824 default: /* Extended */ 825 if (flags & UTF8_DISALLOW_PERL_EXTENDED) { 826 return 0; 827 } 828 829 /* FALLTHROUGH */ 830 831 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */ 832 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */ 833 834 if (flags & UTF8_DISALLOW_SUPER) { 835 return 0; /* Above Unicode */ 836 } 837 838 return full_len; 839 840 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT: 841 is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_) 842 || ( len > 1 843 && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_ 844 && NATIVE_UTF8_TO_I8(s[1]) 845 >= UTF_FIRST_CONT_BYTE_110000_)); 846 if (is_super) { 847 if (flags & UTF8_DISALLOW_SUPER) { 848 return 0; 849 } 850 } 851 else if ( (flags & UTF8_DISALLOW_NONCHAR) 852 && len == full_len 853 && UNLIKELY(is_LARGER_NON_CHARS_utf8(s))) 854 { 855 return 0; 856 } 857 858 return full_len; 859 860 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT: 861 862 if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) { 863 return full_len; 864 } 865 866 if ( (flags & UTF8_DISALLOW_SURROGATE) 867 && UNLIKELY(is_SURROGATE_utf8(s))) 868 { 869 return 0; /* Surrogate */ 870 } 871 872 if ( (flags & UTF8_DISALLOW_NONCHAR) 873 && len == full_len 874 && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s))) 875 { 876 return 0; 877 } 878 879 return full_len; 880 881 /* The lower code points don't have any disallowable characters */ 882 #ifdef EBCDIC 883 case 3: 884 return full_len; 885 #endif 886 887 case 2: 888 case 1: 889 return full_len; 890 } 891 } 892 893 Size_t 894 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e, 895 const bool require_partial) 896 { 897 /* This is called to determine if the UTF-8 sequence starting at s0 and 898 * continuing for up to one full character of bytes, but looking no further 899 * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native 900 * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being 901 * for the largest code points recognized by Perl, the ones that require 902 * the most UTF-8 bytes per character to represent (somewhat less than 903 * twice the size of the next longest kind). This sequence will only ever 904 * be Perl extended UTF-8. 905 * 906 * The routine returns 0 if the sequence is not fully valid, syntactically 907 * or semantically. That means it checks that everything following the 908 * start byte is a continuation byte, and that it doesn't overflow, nor is 909 * an overlong representation. 910 * 911 * If 'require_partial' is FALSE, the routine returns non-zero only if the 912 * input (as far as 'e-1') is a full character. The return is the count of 913 * the bytes in the character. 914 * 915 * If 'require_partial' is TRUE, the routine returns non-zero only if the 916 * input as far as 'e-1' is a partial, not full character, with no 917 * malformations found before position 'e'. The return is either just 918 * FALSE, or TRUE. */ 919 920 const U8 *s = s0 + 1; 921 const U8 *send = e; 922 923 PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_; 924 925 assert(s0 < e); 926 assert(*s0 == I8_TO_NATIVE_UTF8(0xFF)); 927 928 send = s + MIN(UTF8_MAXBYTES - 1, e - s); 929 while (s < send) { 930 if (! UTF8_IS_CONTINUATION(*s)) { 931 return 0; 932 } 933 934 s++; 935 } 936 937 if (0 < does_utf8_overflow(s0, e, 938 FALSE /* Don't consider_overlongs */ 939 )) { 940 return 0; 941 } 942 943 if (0 < isFF_overlong(s0, e - s0)) { 944 return 0; 945 } 946 947 /* Here, the character is valid as far as it got. Check if got a partial 948 * character */ 949 if (s - s0 < UTF8_MAXBYTES) { 950 return (require_partial) ? 1 : 0; 951 } 952 953 /* Here, got a full character */ 954 return (require_partial) ? 0 : UTF8_MAXBYTES; 955 } 956 957 char * 958 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format) 959 { 960 /* Returns a mortalized C string that is a displayable copy of the 'len' 961 * bytes starting at 'start'. 'format' gives how to display each byte. 962 * Currently, there are only two formats, so it is currently a bool: 963 * 0 \xab 964 * 1 ab (that is a space between two hex digit bytes) 965 */ 966 967 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a 968 trailing NUL */ 969 const U8 * s = start; 970 const U8 * const e = start + len; 971 char * output; 972 char * d; 973 974 PERL_ARGS_ASSERT__BYTE_DUMP_STRING; 975 976 Newx(output, output_len, char); 977 SAVEFREEPV(output); 978 979 d = output; 980 for (s = start; s < e; s++) { 981 const unsigned high_nibble = (*s & 0xF0) >> 4; 982 const unsigned low_nibble = (*s & 0x0F); 983 984 if (format) { 985 if (s > start) { 986 *d++ = ' '; 987 } 988 } 989 else { 990 *d++ = '\\'; 991 *d++ = 'x'; 992 } 993 994 if (high_nibble < 10) { 995 *d++ = high_nibble + '0'; 996 } 997 else { 998 *d++ = high_nibble - 10 + 'a'; 999 } 1000 1001 if (low_nibble < 10) { 1002 *d++ = low_nibble + '0'; 1003 } 1004 else { 1005 *d++ = low_nibble - 10 + 'a'; 1006 } 1007 } 1008 1009 *d = '\0'; 1010 return output; 1011 } 1012 1013 PERL_STATIC_INLINE char * 1014 S_unexpected_non_continuation_text(pTHX_ const U8 * const s, 1015 1016 /* Max number of bytes to print */ 1017 STRLEN print_len, 1018 1019 /* Which one is the non-continuation */ 1020 const STRLEN non_cont_byte_pos, 1021 1022 /* How many bytes should there be? */ 1023 const STRLEN expect_len) 1024 { 1025 /* Return the malformation warning text for an unexpected continuation 1026 * byte. */ 1027 1028 const char * const where = (non_cont_byte_pos == 1) 1029 ? "immediately" 1030 : Perl_form(aTHX_ "%d bytes", 1031 (int) non_cont_byte_pos); 1032 const U8 * x = s + non_cont_byte_pos; 1033 const U8 * e = s + print_len; 1034 1035 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; 1036 1037 /* We don't need to pass this parameter, but since it has already been 1038 * calculated, it's likely faster to pass it; verify under DEBUGGING */ 1039 assert(expect_len == UTF8SKIP(s)); 1040 1041 /* As a defensive coding measure, don't output anything past a NUL. Such 1042 * bytes shouldn't be in the middle of a malformation, and could mark the 1043 * end of the allocated string, and what comes after is undefined */ 1044 for (; x < e; x++) { 1045 if (*x == '\0') { 1046 x++; /* Output this particular NUL */ 1047 break; 1048 } 1049 } 1050 1051 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," 1052 " %s after start byte 0x%02x; need %d bytes, got %d)", 1053 malformed_text, 1054 _byte_dump_string(s, x - s, 0), 1055 *(s + non_cont_byte_pos), 1056 where, 1057 *s, 1058 (int) expect_len, 1059 (int) non_cont_byte_pos); 1060 } 1061 1062 /* 1063 1064 =for apidoc utf8n_to_uvchr 1065 1066 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1067 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1068 directly. 1069 1070 Bottom level UTF-8 decode routine. 1071 Returns the native code point value of the first character in the string C<s>, 1072 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than 1073 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to 1074 the length, in bytes, of that character. 1075 1076 The value of C<flags> determines the behavior when C<s> does not point to a 1077 well-formed UTF-8 character. If C<flags> is 0, encountering a malformation 1078 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) 1079 is the next possible position in C<s> that could begin a non-malformed 1080 character. Also, if UTF-8 warnings haven't been lexically disabled, a warning 1081 is raised. Some UTF-8 input sequences may contain multiple malformations. 1082 This function tries to find every possible one in each call, so multiple 1083 warnings can be raised for the same sequence. 1084 1085 Various ALLOW flags can be set in C<flags> to allow (and not warn on) 1086 individual types of malformations, such as the sequence being overlong (that 1087 is, when there is a shorter sequence that can express the same code point; 1088 overlong sequences are expressly forbidden in the UTF-8 standard due to 1089 potential security issues). Another malformation example is the first byte of 1090 a character not being a legal first byte. See F<utf8.h> for the list of such 1091 flags. Even if allowed, this function generally returns the Unicode 1092 REPLACEMENT CHARACTER when it encounters a malformation. There are flags in 1093 F<utf8.h> to override this behavior for the overlong malformations, but don't 1094 do that except for very specialized purposes. 1095 1096 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other 1097 flags) malformation is found. If this flag is set, the routine assumes that 1098 the caller will raise a warning, and this function will silently just set 1099 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero. 1100 1101 Note that this API requires disambiguation between successful decoding a C<NUL> 1102 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as 1103 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may 1104 be set to 1. To disambiguate, upon a zero return, see if the first byte of 1105 C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an 1106 error. Or you can use C<L</utf8n_to_uvchr_error>>. 1107 1108 Certain code points are considered problematic. These are Unicode surrogates, 1109 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. 1110 By default these are considered regular code points, but certain situations 1111 warrant special handling for them, which can be specified using the C<flags> 1112 parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all 1113 three classes are treated as malformations and handled as such. The flags 1114 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and 1115 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to 1116 disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE> 1117 restricts the allowed inputs to the strict UTF-8 traditionally defined by 1118 Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness 1119 definition given by 1120 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>. 1121 The difference between traditional strictness and C9 strictness is that the 1122 latter does not forbid non-character code points. (They are still discouraged, 1123 however.) For more discussion see L<perlunicode/Noncharacter code points>. 1124 1125 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>, 1126 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>, 1127 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be 1128 raised for their respective categories, but otherwise the code points are 1129 considered valid (not malformations). To get a category to both be treated as 1130 a malformation and raise a warning, specify both the WARN and DISALLOW flags. 1131 (But note that warnings are not raised if lexically disabled nor if 1132 C<UTF8_CHECK_ONLY> is also specified.) 1133 1134 Extremely high code points were never specified in any standard, and require an 1135 extension to UTF-8 to express, which Perl does. It is likely that programs 1136 written in something other than Perl would not be able to read files that 1137 contain these; nor would Perl understand files written by something that uses a 1138 different extension. For these reasons, there is a separate set of flags that 1139 can warn and/or disallow these extremely high code points, even if other 1140 above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and 1141 C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see 1142 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all 1143 above-Unicode code points, including these, as malformations. 1144 (Note that the Unicode standard considers anything above 0x10FFFF to be 1145 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF 1146 (2**31 -1)) 1147 1148 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is 1149 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly, 1150 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named 1151 C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags 1152 can apply to code points that actually do fit in 31 bits. This happens on 1153 EBCDIC platforms, and sometimes when the L<overlong 1154 malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately 1155 describe the situation in all cases. 1156 1157 1158 All other code points corresponding to Unicode characters, including private 1159 use and those yet to be assigned, are never considered malformed and never 1160 warn. 1161 1162 =for apidoc Amnh||UTF8_CHECK_ONLY 1163 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE 1164 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE 1165 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE 1166 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR 1167 =for apidoc Amnh||UTF8_DISALLOW_SUPER 1168 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE 1169 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE 1170 =for apidoc Amnh||UTF8_WARN_SURROGATE 1171 =for apidoc Amnh||UTF8_WARN_NONCHAR 1172 =for apidoc Amnh||UTF8_WARN_SUPER 1173 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED 1174 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED 1175 1176 =cut 1177 1178 Also implemented as a macro in utf8.h 1179 */ 1180 1181 UV 1182 Perl_utf8n_to_uvchr(const U8 *s, 1183 STRLEN curlen, 1184 STRLEN *retlen, 1185 const U32 flags) 1186 { 1187 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; 1188 1189 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); 1190 } 1191 1192 /* 1193 1194 =for apidoc utf8n_to_uvchr_error 1195 1196 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1197 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1198 directly. 1199 1200 This function is for code that needs to know what the precise malformation(s) 1201 are when an error is found. If you also need to know the generated warning 1202 messages, use L</utf8n_to_uvchr_msgs>() instead. 1203 1204 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after 1205 all the others, C<errors>. If this parameter is 0, this function behaves 1206 identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer 1207 to a C<U32> variable, which this function sets to indicate any errors found. 1208 Upon return, if C<*errors> is 0, there were no errors found. Otherwise, 1209 C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some 1210 of these bits will be set if a malformation is found, even if the input 1211 C<flags> parameter indicates that the given malformation is allowed; those 1212 exceptions are noted: 1213 1214 =over 4 1215 1216 =item C<UTF8_GOT_PERL_EXTENDED> 1217 1218 The input sequence is not standard UTF-8, but a Perl extension. This bit is 1219 set only if the input C<flags> parameter contains either the 1220 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags. 1221 1222 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, 1223 and so some extension must be used to express them. Perl uses a natural 1224 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further 1225 extension to represent even higher ones, so that any code point that fits in a 1226 64-bit word can be represented. Text using these extensions is not likely to 1227 be portable to non-Perl code. We lump both of these extensions together and 1228 refer to them as Perl extended UTF-8. There exist other extensions that people 1229 have invented, incompatible with Perl's. 1230 1231 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing 1232 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower 1233 than on ASCII. Prior to that, code points 2**31 and higher were simply 1234 unrepresentable, and a different, incompatible method was used to represent 1235 code points between 2**30 and 2**31 - 1. 1236 1237 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if 1238 Perl extended UTF-8 is used. 1239 1240 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still 1241 may use for backward compatibility. That name is misleading, as this flag may 1242 be set when the code point actually does fit in 31 bits. This happens on 1243 EBCDIC platforms, and sometimes when the L<overlong 1244 malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately 1245 describes the situation in all cases. 1246 1247 =item C<UTF8_GOT_CONTINUATION> 1248 1249 The input sequence was malformed in that the first byte was a UTF-8 1250 continuation byte. 1251 1252 =item C<UTF8_GOT_EMPTY> 1253 1254 The input C<curlen> parameter was 0. 1255 1256 =item C<UTF8_GOT_LONG> 1257 1258 The input sequence was malformed in that there is some other sequence that 1259 evaluates to the same code point, but that sequence is shorter than this one. 1260 1261 Until Unicode 3.1, it was legal for programs to accept this malformation, but 1262 it was discovered that this created security issues. 1263 1264 =item C<UTF8_GOT_NONCHAR> 1265 1266 The code point represented by the input UTF-8 sequence is for a Unicode 1267 non-character code point. 1268 This bit is set only if the input C<flags> parameter contains either the 1269 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags. 1270 1271 =item C<UTF8_GOT_NON_CONTINUATION> 1272 1273 The input sequence was malformed in that a non-continuation type byte was found 1274 in a position where only a continuation type one should be. See also 1275 C<L</UTF8_GOT_SHORT>>. 1276 1277 =item C<UTF8_GOT_OVERFLOW> 1278 1279 The input sequence was malformed in that it is for a code point that is not 1280 representable in the number of bits available in an IV on the current platform. 1281 1282 =item C<UTF8_GOT_SHORT> 1283 1284 The input sequence was malformed in that C<curlen> is smaller than required for 1285 a complete sequence. In other words, the input is for a partial character 1286 sequence. 1287 1288 1289 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short 1290 sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always 1291 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete 1292 sequence was looked at. If no other flags are present, it means that the 1293 sequence was valid as far as it went. Depending on the application, this could 1294 mean one of three things: 1295 1296 =over 1297 1298 =item * 1299 1300 The C<curlen> length parameter passed in was too small, and the function was 1301 prevented from examining all the necessary bytes. 1302 1303 =item * 1304 1305 The buffer being looked at is based on reading data, and the data received so 1306 far stopped in the middle of a character, so that the next read will 1307 read the remainder of this character. (It is up to the caller to deal with the 1308 split bytes somehow.) 1309 1310 =item * 1311 1312 This is a real error, and the partial sequence is all we're going to get. 1313 1314 =back 1315 1316 =item C<UTF8_GOT_SUPER> 1317 1318 The input sequence was malformed in that it is for a non-Unicode code point; 1319 that is, one above the legal Unicode maximum. 1320 This bit is set only if the input C<flags> parameter contains either the 1321 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags. 1322 1323 =item C<UTF8_GOT_SURROGATE> 1324 1325 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate 1326 code point. 1327 This bit is set only if the input C<flags> parameter contains either the 1328 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags. 1329 1330 =back 1331 1332 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY> 1333 flag to suppress any warnings, and then examine the C<*errors> return. 1334 1335 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED 1336 =for apidoc Amnh||UTF8_GOT_CONTINUATION 1337 =for apidoc Amnh||UTF8_GOT_EMPTY 1338 =for apidoc Amnh||UTF8_GOT_LONG 1339 =for apidoc Amnh||UTF8_GOT_NONCHAR 1340 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION 1341 =for apidoc Amnh||UTF8_GOT_OVERFLOW 1342 =for apidoc Amnh||UTF8_GOT_SHORT 1343 =for apidoc Amnh||UTF8_GOT_SUPER 1344 =for apidoc Amnh||UTF8_GOT_SURROGATE 1345 1346 =cut 1347 1348 Also implemented as a macro in utf8.h 1349 */ 1350 1351 UV 1352 Perl_utf8n_to_uvchr_error(const U8 *s, 1353 STRLEN curlen, 1354 STRLEN *retlen, 1355 const U32 flags, 1356 U32 * errors) 1357 { 1358 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; 1359 1360 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL); 1361 } 1362 1363 /* 1364 1365 =for apidoc utf8n_to_uvchr_msgs 1366 1367 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. 1368 Most code should use L</utf8_to_uvchr_buf>() rather than call this 1369 directly. 1370 1371 This function is for code that needs to know what the precise malformation(s) 1372 are when an error is found, and wants the corresponding warning and/or error 1373 messages to be returned to the caller rather than be displayed. All messages 1374 that would have been displayed if all lexical warnings are enabled will be 1375 returned. 1376 1377 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter 1378 placed after all the others, C<msgs>. If this parameter is 0, this function 1379 behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should 1380 be a pointer to an C<AV *> variable, in which this function creates a new AV to 1381 contain any appropriate messages. The elements of the array are ordered so 1382 that the first message that would have been displayed is in the 0th element, 1383 and so on. Each element is a hash with three key-value pairs, as follows: 1384 1385 =over 4 1386 1387 =item C<text> 1388 1389 The text of the message as a C<SVpv>. 1390 1391 =item C<warn_categories> 1392 1393 The warning category (or categories) packed into a C<SVuv>. 1394 1395 =item C<flag> 1396 1397 A single flag bit associated with this message, in a C<SVuv>. 1398 The bit corresponds to some bit in the C<*errors> return value, 1399 such as C<UTF8_GOT_LONG>. 1400 1401 =back 1402 1403 It's important to note that specifying this parameter as non-null will cause 1404 any warnings this function would otherwise generate to be suppressed, and 1405 instead be placed in C<*msgs>. The caller can check the lexical warnings state 1406 (or not) when choosing what to do with the returned messages. 1407 1408 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence 1409 no AV is created. 1410 1411 The caller, of course, is responsible for freeing any returned AV. 1412 1413 =cut 1414 */ 1415 1416 UV 1417 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, 1418 STRLEN curlen, 1419 STRLEN *retlen, 1420 const U32 flags, 1421 U32 * errors, 1422 AV ** msgs) 1423 { 1424 const U8 * const s0 = s; 1425 const U8 * send = s0 + curlen; 1426 U32 possible_problems; /* A bit is set here for each potential problem 1427 found as we go along */ 1428 UV uv; 1429 STRLEN expectlen; /* How long should this sequence be? */ 1430 STRLEN avail_len; /* When input is too short, gives what that is */ 1431 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this 1432 gets set and discarded */ 1433 1434 /* The below are used only if there is both an overlong malformation and a 1435 * too short one. Otherwise the first two are set to 's0' and 'send', and 1436 * the third not used at all */ 1437 U8 * adjusted_s0; 1438 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this 1439 routine; see [perl #130921] */ 1440 UV uv_so_far; 1441 dTHX; 1442 1443 PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; 1444 1445 /* Here, is one of: a) malformed; b) a problematic code point (surrogate, 1446 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul 1447 * syllables that the dfa doesn't properly handle. Quickly dispose of the 1448 * final case. */ 1449 1450 /* Each of the affected Hanguls starts with \xED */ 1451 1452 if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */ 1453 if (retlen) { 1454 *retlen = 3; 1455 } 1456 if (errors) { 1457 *errors = 0; 1458 } 1459 if (msgs) { 1460 *msgs = NULL; 1461 } 1462 1463 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) 1464 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) 1465 | (s0[2] & UTF_CONTINUATION_MASK); 1466 } 1467 1468 /* In conjunction with the exhaustive tests that can be enabled in 1469 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely 1470 * what it is intended to do, and that no flaws in it are masked by 1471 * dropping down and executing the code below 1472 assert(! isUTF8_CHAR(s0, send) 1473 || UTF8_IS_SURROGATE(s0, send) 1474 || UTF8_IS_SUPER(s0, send) 1475 || UTF8_IS_NONCHAR(s0,send)); 1476 */ 1477 1478 s = s0; 1479 possible_problems = 0; 1480 expectlen = 0; 1481 avail_len = 0; 1482 discard_errors = 0; 1483 adjusted_s0 = (U8 *) s0; 1484 uv_so_far = 0; 1485 1486 if (errors) { 1487 *errors = 0; 1488 } 1489 else { 1490 errors = &discard_errors; 1491 } 1492 1493 /* The order of malformation tests here is important. We should consume as 1494 * few bytes as possible in order to not skip any valid character. This is 1495 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also 1496 * https://unicode.org/reports/tr36 for more discussion as to why. For 1497 * example, once we've done a UTF8SKIP, we can tell the expected number of 1498 * bytes, and could fail right off the bat if the input parameters indicate 1499 * that there are too few available. But it could be that just that first 1500 * byte is garbled, and the intended character occupies fewer bytes. If we 1501 * blindly assumed that the first byte is correct, and skipped based on 1502 * that number, we could skip over a valid input character. So instead, we 1503 * always examine the sequence byte-by-byte. 1504 * 1505 * We also should not consume too few bytes, otherwise someone could inject 1506 * things. For example, an input could be deliberately designed to 1507 * overflow, and if this code bailed out immediately upon discovering that, 1508 * returning to the caller C<*retlen> pointing to the very next byte (one 1509 * which is actually part of the overflowing sequence), that could look 1510 * legitimate to the caller, which could discard the initial partial 1511 * sequence and process the rest, inappropriately. 1512 * 1513 * Some possible input sequences are malformed in more than one way. This 1514 * function goes to lengths to try to find all of them. This is necessary 1515 * for correctness, as the inputs may allow one malformation but not 1516 * another, and if we abandon searching for others after finding the 1517 * allowed one, we could allow in something that shouldn't have been. 1518 */ 1519 1520 if (UNLIKELY(curlen == 0)) { 1521 possible_problems |= UTF8_GOT_EMPTY; 1522 curlen = 0; 1523 uv = UNICODE_REPLACEMENT; 1524 goto ready_to_handle_errors; 1525 } 1526 1527 /* We now know we can examine the first byte of the input */ 1528 expectlen = UTF8SKIP(s); 1529 uv = *s; 1530 1531 /* A well-formed UTF-8 character, as the vast majority of calls to this 1532 * function will be for, has this expected length. For efficiency, set 1533 * things up here to return it. It will be overriden only in those rare 1534 * cases where a malformation is found */ 1535 if (retlen) { 1536 *retlen = expectlen; 1537 } 1538 1539 /* A continuation character can't start a valid sequence */ 1540 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { 1541 possible_problems |= UTF8_GOT_CONTINUATION; 1542 curlen = 1; 1543 uv = UNICODE_REPLACEMENT; 1544 goto ready_to_handle_errors; 1545 } 1546 1547 /* Here is not a continuation byte, nor an invariant. The only thing left 1548 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START 1549 * because it excludes start bytes like \xC0 that always lead to 1550 * overlongs.) */ 1551 1552 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits 1553 * that indicate the number of bytes in the character's whole UTF-8 1554 * sequence, leaving just the bits that are part of the value. */ 1555 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); 1556 1557 /* Setup the loop end point, making sure to not look past the end of the 1558 * input string, and flag it as too short if the size isn't big enough. */ 1559 if (UNLIKELY(curlen < expectlen)) { 1560 possible_problems |= UTF8_GOT_SHORT; 1561 avail_len = curlen; 1562 } 1563 else { 1564 send = (U8*) s0 + expectlen; 1565 } 1566 1567 /* Now, loop through the remaining bytes in the character's sequence, 1568 * accumulating each into the working value as we go. */ 1569 for (s = s0 + 1; s < send; s++) { 1570 if (LIKELY(UTF8_IS_CONTINUATION(*s))) { 1571 uv = UTF8_ACCUMULATE(uv, *s); 1572 continue; 1573 } 1574 1575 /* Here, found a non-continuation before processing all expected bytes. 1576 * This byte indicates the beginning of a new character, so quit, even 1577 * if allowing this malformation. */ 1578 possible_problems |= UTF8_GOT_NON_CONTINUATION; 1579 break; 1580 } /* End of loop through the character's bytes */ 1581 1582 /* Save how many bytes were actually in the character */ 1583 curlen = s - s0; 1584 1585 /* Note that there are two types of too-short malformation. One is when 1586 * there is actual wrong data before the normal termination of the 1587 * sequence. The other is that the sequence wasn't complete before the end 1588 * of the data we are allowed to look at, based on the input 'curlen'. 1589 * This means that we were passed data for a partial character, but it is 1590 * valid as far as we saw. The other is definitely invalid. This 1591 * distinction could be important to a caller, so the two types are kept 1592 * separate. 1593 * 1594 * A convenience macro that matches either of the too-short conditions. */ 1595 # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) 1596 1597 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { 1598 uv_so_far = uv; 1599 uv = UNICODE_REPLACEMENT; 1600 } 1601 1602 /* Check for overflow. The algorithm requires us to not look past the end 1603 * of the current character, even if partial, so the upper limit is 's' */ 1604 if (UNLIKELY(0 < does_utf8_overflow(s0, s, 1605 1 /* Do consider overlongs */ 1606 ))) 1607 { 1608 possible_problems |= UTF8_GOT_OVERFLOW; 1609 uv = UNICODE_REPLACEMENT; 1610 } 1611 1612 /* Check for overlong. If no problems so far, 'uv' is the correct code 1613 * point value. Simply see if it is expressible in fewer bytes. Otherwise 1614 * we must look at the UTF-8 byte sequence itself to see if it is for an 1615 * overlong */ 1616 if ( ( LIKELY(! possible_problems) 1617 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) 1618 || ( UNLIKELY(possible_problems) 1619 && ( UNLIKELY(! UTF8_IS_START(*s0)) 1620 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) 1621 { 1622 possible_problems |= UTF8_GOT_LONG; 1623 1624 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT) 1625 1626 /* The calculation in the 'true' branch of this 'if' 1627 * below won't work if overflows, and isn't needed 1628 * anyway. Further below we handle all overflow 1629 * cases */ 1630 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) 1631 { 1632 UV min_uv = uv_so_far; 1633 STRLEN i; 1634 1635 /* Here, the input is both overlong and is missing some trailing 1636 * bytes. There is no single code point it could be for, but there 1637 * may be enough information present to determine if what we have 1638 * so far is for an unallowed code point, such as for a surrogate. 1639 * The code further below has the intelligence to determine this, 1640 * but just for non-overlong UTF-8 sequences. What we do here is 1641 * calculate the smallest code point the input could represent if 1642 * there were no too short malformation. Then we compute and save 1643 * the UTF-8 for that, which is what the code below looks at 1644 * instead of the raw input. It turns out that the smallest such 1645 * code point is all we need. */ 1646 for (i = curlen; i < expectlen; i++) { 1647 min_uv = UTF8_ACCUMULATE(min_uv, 1648 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE)); 1649 } 1650 1651 adjusted_s0 = temp_char_buf; 1652 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); 1653 } 1654 } 1655 1656 /* Here, we have found all the possible problems, except for when the input 1657 * is for a problematic code point not allowed by the input parameters. */ 1658 1659 /* uv is valid for overlongs */ 1660 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) 1661 && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) 1662 || ( UNLIKELY(possible_problems) 1663 1664 /* if overflow, we know without looking further 1665 * precisely which of the problematic types it is, 1666 * and we deal with those in the overflow handling 1667 * code */ 1668 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) 1669 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) 1670 || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) 1671 && ((flags & ( UTF8_DISALLOW_NONCHAR 1672 |UTF8_DISALLOW_SURROGATE 1673 |UTF8_DISALLOW_SUPER 1674 |UTF8_DISALLOW_PERL_EXTENDED 1675 |UTF8_WARN_NONCHAR 1676 |UTF8_WARN_SURROGATE 1677 |UTF8_WARN_SUPER 1678 |UTF8_WARN_PERL_EXTENDED)))) 1679 { 1680 /* If there were no malformations, or the only malformation is an 1681 * overlong, 'uv' is valid */ 1682 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) { 1683 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { 1684 possible_problems |= UTF8_GOT_SURROGATE; 1685 } 1686 else if (UNLIKELY(UNICODE_IS_SUPER(uv))) { 1687 possible_problems |= UTF8_GOT_SUPER; 1688 } 1689 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) { 1690 possible_problems |= UTF8_GOT_NONCHAR; 1691 } 1692 } 1693 else { /* Otherwise, need to look at the source UTF-8, possibly 1694 adjusted to be non-overlong */ 1695 1696 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0) 1697 > UTF_START_BYTE_110000_)) 1698 { 1699 possible_problems |= UTF8_GOT_SUPER; 1700 } 1701 else if (curlen > 1) { 1702 if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0) 1703 == UTF_START_BYTE_110000_ 1704 && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)) 1705 >= UTF_FIRST_CONT_BYTE_110000_)) 1706 { 1707 possible_problems |= UTF8_GOT_SUPER; 1708 } 1709 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) { 1710 possible_problems |= UTF8_GOT_SURROGATE; 1711 } 1712 } 1713 1714 /* We need a complete well-formed UTF-8 character to discern 1715 * non-characters, so can't look for them here */ 1716 } 1717 } 1718 1719 ready_to_handle_errors: 1720 1721 /* At this point: 1722 * curlen contains the number of bytes in the sequence that 1723 * this call should advance the input by. 1724 * avail_len gives the available number of bytes passed in, but 1725 * only if this is less than the expected number of 1726 * bytes, based on the code point's start byte. 1727 * possible_problems is 0 if there weren't any problems; otherwise a bit 1728 * is set in it for each potential problem found. 1729 * uv contains the code point the input sequence 1730 * represents; or if there is a problem that prevents 1731 * a well-defined value from being computed, it is 1732 * some subsitute value, typically the REPLACEMENT 1733 * CHARACTER. 1734 * s0 points to the first byte of the character 1735 * s points to just after were we left off processing 1736 * the character 1737 * send points to just after where that character should 1738 * end, based on how many bytes the start byte tells 1739 * us should be in it, but no further than s0 + 1740 * avail_len 1741 */ 1742 1743 if (UNLIKELY(possible_problems)) { 1744 bool disallowed = FALSE; 1745 const U32 orig_problems = possible_problems; 1746 1747 if (msgs) { 1748 *msgs = NULL; 1749 } 1750 1751 while (possible_problems) { /* Handle each possible problem */ 1752 U32 pack_warn = 0; 1753 char * message = NULL; 1754 U32 this_flag_bit = 0; 1755 1756 /* Each 'if' clause handles one problem. They are ordered so that 1757 * the first ones' messages will be displayed before the later 1758 * ones; this is kinda in decreasing severity order. But the 1759 * overlong must come last, as it changes 'uv' looked at by the 1760 * others */ 1761 if (possible_problems & UTF8_GOT_OVERFLOW) { 1762 1763 /* Overflow means also got a super and are using Perl's 1764 * extended UTF-8, but we handle all three cases here */ 1765 possible_problems 1766 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED); 1767 *errors |= UTF8_GOT_OVERFLOW; 1768 1769 /* But the API says we flag all errors found */ 1770 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) { 1771 *errors |= UTF8_GOT_SUPER; 1772 } 1773 if (flags 1774 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED)) 1775 { 1776 *errors |= UTF8_GOT_PERL_EXTENDED; 1777 } 1778 1779 /* Disallow if any of the three categories say to */ 1780 if ( ! (flags & UTF8_ALLOW_OVERFLOW) 1781 || (flags & ( UTF8_DISALLOW_SUPER 1782 |UTF8_DISALLOW_PERL_EXTENDED))) 1783 { 1784 disallowed = TRUE; 1785 } 1786 1787 /* Likewise, warn if any say to */ 1788 if ( ! (flags & UTF8_ALLOW_OVERFLOW) 1789 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) 1790 { 1791 1792 /* The warnings code explicitly says it doesn't handle the 1793 * case of packWARN2 and two categories which have 1794 * parent-child relationship. Even if it works now to 1795 * raise the warning if either is enabled, it wouldn't 1796 * necessarily do so in the future. We output (only) the 1797 * most dire warning */ 1798 if (! (flags & UTF8_CHECK_ONLY)) { 1799 if (msgs || ckWARN_d(WARN_UTF8)) { 1800 pack_warn = packWARN(WARN_UTF8); 1801 } 1802 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) { 1803 pack_warn = packWARN(WARN_NON_UNICODE); 1804 } 1805 if (pack_warn) { 1806 message = Perl_form(aTHX_ "%s: %s (overflows)", 1807 malformed_text, 1808 _byte_dump_string(s0, curlen, 0)); 1809 this_flag_bit = UTF8_GOT_OVERFLOW; 1810 } 1811 } 1812 } 1813 } 1814 else if (possible_problems & UTF8_GOT_EMPTY) { 1815 possible_problems &= ~UTF8_GOT_EMPTY; 1816 *errors |= UTF8_GOT_EMPTY; 1817 1818 if (! (flags & UTF8_ALLOW_EMPTY)) { 1819 1820 /* This so-called malformation is now treated as a bug in 1821 * the caller. If you have nothing to decode, skip calling 1822 * this function */ 1823 assert(0); 1824 1825 disallowed = TRUE; 1826 if ( (msgs 1827 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1828 { 1829 pack_warn = packWARN(WARN_UTF8); 1830 message = Perl_form(aTHX_ "%s (empty string)", 1831 malformed_text); 1832 this_flag_bit = UTF8_GOT_EMPTY; 1833 } 1834 } 1835 } 1836 else if (possible_problems & UTF8_GOT_CONTINUATION) { 1837 possible_problems &= ~UTF8_GOT_CONTINUATION; 1838 *errors |= UTF8_GOT_CONTINUATION; 1839 1840 if (! (flags & UTF8_ALLOW_CONTINUATION)) { 1841 disallowed = TRUE; 1842 if (( msgs 1843 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1844 { 1845 pack_warn = packWARN(WARN_UTF8); 1846 message = Perl_form(aTHX_ 1847 "%s: %s (unexpected continuation byte 0x%02x," 1848 " with no preceding start byte)", 1849 malformed_text, 1850 _byte_dump_string(s0, 1, 0), *s0); 1851 this_flag_bit = UTF8_GOT_CONTINUATION; 1852 } 1853 } 1854 } 1855 else if (possible_problems & UTF8_GOT_SHORT) { 1856 possible_problems &= ~UTF8_GOT_SHORT; 1857 *errors |= UTF8_GOT_SHORT; 1858 1859 if (! (flags & UTF8_ALLOW_SHORT)) { 1860 disallowed = TRUE; 1861 if (( msgs 1862 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1863 { 1864 pack_warn = packWARN(WARN_UTF8); 1865 message = Perl_form(aTHX_ 1866 "%s: %s (too short; %d byte%s available, need %d)", 1867 malformed_text, 1868 _byte_dump_string(s0, send - s0, 0), 1869 (int)avail_len, 1870 avail_len == 1 ? "" : "s", 1871 (int)expectlen); 1872 this_flag_bit = UTF8_GOT_SHORT; 1873 } 1874 } 1875 1876 } 1877 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) { 1878 possible_problems &= ~UTF8_GOT_NON_CONTINUATION; 1879 *errors |= UTF8_GOT_NON_CONTINUATION; 1880 1881 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { 1882 disallowed = TRUE; 1883 if (( msgs 1884 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 1885 { 1886 1887 /* If we don't know for sure that the input length is 1888 * valid, avoid as much as possible reading past the 1889 * end of the buffer */ 1890 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN) 1891 ? (int) (s - s0) 1892 : (int) (send - s0); 1893 pack_warn = packWARN(WARN_UTF8); 1894 message = Perl_form(aTHX_ "%s", 1895 unexpected_non_continuation_text(s0, 1896 printlen, 1897 s - s0, 1898 (int) expectlen)); 1899 this_flag_bit = UTF8_GOT_NON_CONTINUATION; 1900 } 1901 } 1902 } 1903 else if (possible_problems & UTF8_GOT_SURROGATE) { 1904 possible_problems &= ~UTF8_GOT_SURROGATE; 1905 1906 if (flags & UTF8_WARN_SURROGATE) { 1907 *errors |= UTF8_GOT_SURROGATE; 1908 1909 if ( ! (flags & UTF8_CHECK_ONLY) 1910 && (msgs || ckWARN_d(WARN_SURROGATE))) 1911 { 1912 pack_warn = packWARN(WARN_SURROGATE); 1913 1914 /* These are the only errors that can occur with a 1915 * surrogate when the 'uv' isn't valid */ 1916 if (orig_problems & UTF8_GOT_TOO_SHORT) { 1917 message = Perl_form(aTHX_ 1918 "UTF-16 surrogate (any UTF-8 sequence that" 1919 " starts with \"%s\" is for a surrogate)", 1920 _byte_dump_string(s0, curlen, 0)); 1921 } 1922 else { 1923 message = Perl_form(aTHX_ surrogate_cp_format, uv); 1924 } 1925 this_flag_bit = UTF8_GOT_SURROGATE; 1926 } 1927 } 1928 1929 if (flags & UTF8_DISALLOW_SURROGATE) { 1930 disallowed = TRUE; 1931 *errors |= UTF8_GOT_SURROGATE; 1932 } 1933 } 1934 else if (possible_problems & UTF8_GOT_SUPER) { 1935 possible_problems &= ~UTF8_GOT_SUPER; 1936 1937 if (flags & UTF8_WARN_SUPER) { 1938 *errors |= UTF8_GOT_SUPER; 1939 1940 if ( ! (flags & UTF8_CHECK_ONLY) 1941 && (msgs || ckWARN_d(WARN_NON_UNICODE))) 1942 { 1943 pack_warn = packWARN(WARN_NON_UNICODE); 1944 1945 if (orig_problems & UTF8_GOT_TOO_SHORT) { 1946 message = Perl_form(aTHX_ 1947 "Any UTF-8 sequence that starts with" 1948 " \"%s\" is for a non-Unicode code point," 1949 " may not be portable", 1950 _byte_dump_string(s0, curlen, 0)); 1951 } 1952 else { 1953 message = Perl_form(aTHX_ super_cp_format, uv); 1954 } 1955 this_flag_bit = UTF8_GOT_SUPER; 1956 } 1957 } 1958 1959 /* Test for Perl's extended UTF-8 after the regular SUPER ones, 1960 * and before possibly bailing out, so that the more dire 1961 * warning will override the regular one. */ 1962 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) { 1963 if ( ! (flags & UTF8_CHECK_ONLY) 1964 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) 1965 && (msgs || ( ckWARN_d(WARN_NON_UNICODE) 1966 || ckWARN(WARN_PORTABLE)))) 1967 { 1968 pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); 1969 1970 /* If it is an overlong that evaluates to a code point 1971 * that doesn't have to use the Perl extended UTF-8, it 1972 * still used it, and so we output a message that 1973 * doesn't refer to the code point. The same is true 1974 * if there was a SHORT malformation where the code 1975 * point is not valid. In that case, 'uv' will have 1976 * been set to the REPLACEMENT CHAR, and the message 1977 * below without the code point in it will be selected 1978 * */ 1979 if (UNICODE_IS_PERL_EXTENDED(uv)) { 1980 message = Perl_form(aTHX_ 1981 PL_extended_cp_format, uv); 1982 } 1983 else { 1984 message = Perl_form(aTHX_ 1985 "Any UTF-8 sequence that starts with" 1986 " \"%s\" is a Perl extension, and" 1987 " so is not portable", 1988 _byte_dump_string(s0, curlen, 0)); 1989 } 1990 this_flag_bit = UTF8_GOT_PERL_EXTENDED; 1991 } 1992 1993 if (flags & ( UTF8_WARN_PERL_EXTENDED 1994 |UTF8_DISALLOW_PERL_EXTENDED)) 1995 { 1996 *errors |= UTF8_GOT_PERL_EXTENDED; 1997 1998 if (flags & UTF8_DISALLOW_PERL_EXTENDED) { 1999 disallowed = TRUE; 2000 } 2001 } 2002 } 2003 2004 if (flags & UTF8_DISALLOW_SUPER) { 2005 *errors |= UTF8_GOT_SUPER; 2006 disallowed = TRUE; 2007 } 2008 } 2009 else if (possible_problems & UTF8_GOT_NONCHAR) { 2010 possible_problems &= ~UTF8_GOT_NONCHAR; 2011 2012 if (flags & UTF8_WARN_NONCHAR) { 2013 *errors |= UTF8_GOT_NONCHAR; 2014 2015 if ( ! (flags & UTF8_CHECK_ONLY) 2016 && (msgs || ckWARN_d(WARN_NONCHAR))) 2017 { 2018 /* The code above should have guaranteed that we don't 2019 * get here with errors other than overlong */ 2020 assert (! (orig_problems 2021 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR))); 2022 2023 pack_warn = packWARN(WARN_NONCHAR); 2024 message = Perl_form(aTHX_ nonchar_cp_format, uv); 2025 this_flag_bit = UTF8_GOT_NONCHAR; 2026 } 2027 } 2028 2029 if (flags & UTF8_DISALLOW_NONCHAR) { 2030 disallowed = TRUE; 2031 *errors |= UTF8_GOT_NONCHAR; 2032 } 2033 } 2034 else if (possible_problems & UTF8_GOT_LONG) { 2035 possible_problems &= ~UTF8_GOT_LONG; 2036 *errors |= UTF8_GOT_LONG; 2037 2038 if (flags & UTF8_ALLOW_LONG) { 2039 2040 /* We don't allow the actual overlong value, unless the 2041 * special extra bit is also set */ 2042 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE 2043 & ~UTF8_ALLOW_LONG))) 2044 { 2045 uv = UNICODE_REPLACEMENT; 2046 } 2047 } 2048 else { 2049 disallowed = TRUE; 2050 2051 if (( msgs 2052 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) 2053 { 2054 pack_warn = packWARN(WARN_UTF8); 2055 2056 /* These error types cause 'uv' to be something that 2057 * isn't what was intended, so can't use it in the 2058 * message. The other error types either can't 2059 * generate an overlong, or else the 'uv' is valid */ 2060 if (orig_problems & 2061 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) 2062 { 2063 message = Perl_form(aTHX_ 2064 "%s: %s (any UTF-8 sequence that starts" 2065 " with \"%s\" is overlong which can and" 2066 " should be represented with a" 2067 " different, shorter sequence)", 2068 malformed_text, 2069 _byte_dump_string(s0, send - s0, 0), 2070 _byte_dump_string(s0, curlen, 0)); 2071 } 2072 else { 2073 U8 tmpbuf[UTF8_MAXBYTES+1]; 2074 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, 2075 uv, 0); 2076 /* Don't use U+ for non-Unicode code points, which 2077 * includes those in the Latin1 range */ 2078 const char * preface = ( UNICODE_IS_SUPER(uv) 2079 #ifdef EBCDIC 2080 || uv <= 0xFF 2081 #endif 2082 ) 2083 ? "0x" 2084 : "U+"; 2085 message = Perl_form(aTHX_ 2086 "%s: %s (overlong; instead use %s to represent" 2087 " %s%0*" UVXf ")", 2088 malformed_text, 2089 _byte_dump_string(s0, send - s0, 0), 2090 _byte_dump_string(tmpbuf, e - tmpbuf, 0), 2091 preface, 2092 ((uv < 256) ? 2 : 4), /* Field width of 2 for 2093 small code points */ 2094 UNI_TO_NATIVE(uv)); 2095 } 2096 this_flag_bit = UTF8_GOT_LONG; 2097 } 2098 } 2099 } /* End of looking through the possible flags */ 2100 2101 /* Display the message (if any) for the problem being handled in 2102 * this iteration of the loop */ 2103 if (message) { 2104 if (msgs) { 2105 assert(this_flag_bit); 2106 2107 if (*msgs == NULL) { 2108 *msgs = newAV(); 2109 } 2110 2111 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message, 2112 pack_warn, 2113 this_flag_bit))); 2114 } 2115 else if (PL_op) 2116 Perl_warner(aTHX_ pack_warn, "%s in %s", message, 2117 OP_DESC(PL_op)); 2118 else 2119 Perl_warner(aTHX_ pack_warn, "%s", message); 2120 } 2121 } /* End of 'while (possible_problems)' */ 2122 2123 /* Since there was a possible problem, the returned length may need to 2124 * be changed from the one stored at the beginning of this function. 2125 * Instead of trying to figure out if that's needed, just do it. */ 2126 if (retlen) { 2127 *retlen = curlen; 2128 } 2129 2130 if (disallowed) { 2131 if (flags & UTF8_CHECK_ONLY && retlen) { 2132 *retlen = ((STRLEN) -1); 2133 } 2134 return 0; 2135 } 2136 } 2137 2138 return UNI_TO_NATIVE(uv); 2139 } 2140 2141 /* 2142 =for apidoc utf8_to_uvchr_buf 2143 2144 Returns the native code point of the first character in the string C<s> which 2145 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>. 2146 C<*retlen> will be set to the length, in bytes, of that character. 2147 2148 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are 2149 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't 2150 C<NULL>) to -1. If those warnings are off, the computed value, if well-defined 2151 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and 2152 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is 2153 the next possible position in C<s> that could begin a non-malformed character. 2154 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is 2155 returned. 2156 2157 =cut 2158 2159 Also implemented as a macro in utf8.h 2160 2161 */ 2162 2163 2164 UV 2165 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 2166 { 2167 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; 2168 2169 return utf8_to_uvchr_buf_helper(s, send, retlen); 2170 } 2171 2172 /* This is marked as deprecated 2173 * 2174 =for apidoc utf8_to_uvuni_buf 2175 2176 Only in very rare circumstances should code need to be dealing in Unicode 2177 (as opposed to native) code points. In those few cases, use 2178 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead. 2179 If you are not absolutely sure this is one of those cases, then assume it isn't 2180 and use plain C<utf8_to_uvchr_buf> instead. 2181 2182 Returns the Unicode (not-native) code point of the first character in the 2183 string C<s> which 2184 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>. 2185 C<retlen> will be set to the length, in bytes, of that character. 2186 2187 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are 2188 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't 2189 NULL) to -1. If those warnings are off, the computed value if well-defined (or 2190 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> 2191 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the 2192 next possible position in C<s> that could begin a non-malformed character. 2193 See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is 2194 returned. 2195 2196 =cut 2197 */ 2198 2199 UV 2200 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 2201 { 2202 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF; 2203 2204 assert(send > s); 2205 2206 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen)); 2207 } 2208 2209 /* 2210 =for apidoc utf8_length 2211 2212 Returns the number of characters in the sequence of UTF-8-encoded bytes starting 2213 at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the 2214 same place, it returns 0 with no warning raised. 2215 2216 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning 2217 and returns the number of valid characters. 2218 2219 =cut 2220 */ 2221 2222 STRLEN 2223 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) 2224 { 2225 STRLEN len = 0; 2226 2227 PERL_ARGS_ASSERT_UTF8_LENGTH; 2228 2229 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. 2230 * the bitops (especially ~) can create illegal UTF-8. 2231 * In other words: in Perl UTF-8 is not just for Unicode. */ 2232 2233 while (s < e) { 2234 Ptrdiff_t expected_byte_count = UTF8SKIP(s); 2235 2236 if (UNLIKELY(e - s < expected_byte_count)) { 2237 goto warn_and_return; 2238 } 2239 2240 len++; 2241 s += expected_byte_count; 2242 } 2243 2244 if (LIKELY(e == s)) { 2245 return len; 2246 } 2247 2248 /* Here, s > e on entry */ 2249 2250 warn_and_return: 2251 if (PL_op) 2252 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 2253 "%s in %s", unees, OP_DESC(PL_op)); 2254 else 2255 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); 2256 2257 return len; 2258 } 2259 2260 /* 2261 =for apidoc bytes_cmp_utf8 2262 2263 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the 2264 sequence of characters (stored as UTF-8) 2265 in C<u>, C<ulen>. Returns 0 if they are 2266 equal, -1 or -2 if the first string is less than the second string, +1 or +2 2267 if the first string is greater than the second string. 2268 2269 -1 or +1 is returned if the shorter string was identical to the start of the 2270 longer string. -2 or +2 is returned if 2271 there was a difference between characters 2272 within the strings. 2273 2274 =cut 2275 */ 2276 2277 int 2278 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) 2279 { 2280 const U8 *const bend = b + blen; 2281 const U8 *const uend = u + ulen; 2282 2283 PERL_ARGS_ASSERT_BYTES_CMP_UTF8; 2284 2285 while (b < bend && u < uend) { 2286 U8 c = *u++; 2287 if (!UTF8_IS_INVARIANT(c)) { 2288 if (UTF8_IS_DOWNGRADEABLE_START(c)) { 2289 if (u < uend) { 2290 U8 c1 = *u++; 2291 if (UTF8_IS_CONTINUATION(c1)) { 2292 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1); 2293 } else { 2294 /* diag_listed_as: Malformed UTF-8 character%s */ 2295 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 2296 "%s %s%s", 2297 unexpected_non_continuation_text(u - 2, 2, 1, 2), 2298 PL_op ? " in " : "", 2299 PL_op ? OP_DESC(PL_op) : ""); 2300 return -2; 2301 } 2302 } else { 2303 if (PL_op) 2304 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 2305 "%s in %s", unees, OP_DESC(PL_op)); 2306 else 2307 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees); 2308 return -2; /* Really want to return undef :-) */ 2309 } 2310 } else { 2311 return -2; 2312 } 2313 } 2314 if (*b != c) { 2315 return *b < c ? -2 : +2; 2316 } 2317 ++b; 2318 } 2319 2320 if (b == bend && u == uend) 2321 return 0; 2322 2323 return b < bend ? +1 : -1; 2324 } 2325 2326 /* 2327 =for apidoc utf8_to_bytes 2328 2329 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding. 2330 Unlike L</bytes_to_utf8>, this over-writes the original string, and 2331 updates C<*lenp> to contain the new length. 2332 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1. 2333 2334 Upon successful return, the number of variants in the string can be computed by 2335 having saved the value of C<*lenp> before the call, and subtracting the 2336 after-call value of C<*lenp> from it. 2337 2338 If you need a copy of the string, see L</bytes_from_utf8>. 2339 2340 =cut 2341 */ 2342 2343 U8 * 2344 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) 2345 { 2346 U8 * first_variant; 2347 2348 PERL_ARGS_ASSERT_UTF8_TO_BYTES; 2349 PERL_UNUSED_CONTEXT; 2350 2351 /* This is a no-op if no variants at all in the input */ 2352 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) { 2353 return s; 2354 } 2355 2356 { 2357 U8 * const save = s; 2358 U8 * const send = s + *lenp; 2359 U8 * d; 2360 2361 /* Nothing before the first variant needs to be changed, so start the real 2362 * work there */ 2363 s = first_variant; 2364 while (s < send) { 2365 if (! UTF8_IS_INVARIANT(*s)) { 2366 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { 2367 *lenp = ((STRLEN) -1); 2368 return 0; 2369 } 2370 s++; 2371 } 2372 s++; 2373 } 2374 2375 /* Is downgradable, so do it */ 2376 d = s = first_variant; 2377 while (s < send) { 2378 U8 c = *s++; 2379 if (! UVCHR_IS_INVARIANT(c)) { 2380 /* Then it is two-byte encoded */ 2381 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); 2382 s++; 2383 } 2384 *d++ = c; 2385 } 2386 *d = '\0'; 2387 *lenp = d - save; 2388 2389 return save; 2390 } 2391 } 2392 2393 /* 2394 =for apidoc bytes_from_utf8 2395 2396 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native 2397 byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is 2398 actually encoded in UTF-8. 2399 2400 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of 2401 the input string. 2402 2403 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string 2404 not expressible in native byte encoding. In these cases, C<*is_utf8p> and 2405 C<*lenp> are unchanged, and the return value is the original C<s>. 2406 2407 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a 2408 newly created string containing a downgraded copy of C<s>, and whose length is 2409 returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The 2410 caller is responsible for arranging for the memory used by this string to get 2411 freed. 2412 2413 Upon successful return, the number of variants in the string can be computed by 2414 having saved the value of C<*lenp> before the call, and subtracting the 2415 after-call value of C<*lenp> from it. 2416 2417 =cut 2418 2419 There is a macro that avoids this function call, but this is retained for 2420 anyone who calls it with the Perl_ prefix */ 2421 2422 U8 * 2423 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) 2424 { 2425 PERL_ARGS_ASSERT_BYTES_FROM_UTF8; 2426 PERL_UNUSED_CONTEXT; 2427 2428 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL); 2429 } 2430 2431 /* 2432 =for apidoc bytes_from_utf8_loc 2433 2434 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer 2435 to where to store the location of the first character in C<"s"> that cannot be 2436 converted to non-UTF8. 2437 2438 If that parameter is C<NULL>, this function behaves identically to 2439 C<bytes_from_utf8>. 2440 2441 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to 2442 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>. 2443 2444 Otherwise, the function returns a newly created C<NUL>-terminated string 2445 containing the non-UTF8 equivalent of the convertible first portion of 2446 C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>. 2447 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value, 2448 and C<*first_non_downgradable> is set to C<NULL>. 2449 2450 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the 2451 first character in the original string that wasn't converted. C<*is_utf8p> is 2452 unchanged. Note that the new string may have length 0. 2453 2454 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and 2455 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and 2456 converts as many characters in it as possible stopping at the first one it 2457 finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is 2458 set to point to that. The function returns the portion that could be converted 2459 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length, 2460 not including the terminating C<NUL>. If the very first character in the 2461 original could not be converted, C<*lenp> will be 0, and the new string will 2462 contain just a single C<NUL>. If the entire input string was converted, 2463 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>. 2464 2465 Upon successful return, the number of variants in the converted portion of the 2466 string can be computed by having saved the value of C<*lenp> before the call, 2467 and subtracting the after-call value of C<*lenp> from it. 2468 2469 =cut 2470 2471 2472 */ 2473 2474 U8 * 2475 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted) 2476 { 2477 U8 *d; 2478 const U8 *original = s; 2479 U8 *converted_start; 2480 const U8 *send = s + *lenp; 2481 2482 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC; 2483 2484 if (! *is_utf8p) { 2485 if (first_unconverted) { 2486 *first_unconverted = NULL; 2487 } 2488 2489 return (U8 *) original; 2490 } 2491 2492 Newx(d, (*lenp) + 1, U8); 2493 2494 converted_start = d; 2495 while (s < send) { 2496 U8 c = *s++; 2497 if (! UTF8_IS_INVARIANT(c)) { 2498 2499 /* Then it is multi-byte encoded. If the code point is above 0xFF, 2500 * have to stop now */ 2501 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) { 2502 if (first_unconverted) { 2503 *first_unconverted = s - 1; 2504 goto finish_and_return; 2505 } 2506 else { 2507 Safefree(converted_start); 2508 return (U8 *) original; 2509 } 2510 } 2511 2512 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); 2513 s++; 2514 } 2515 *d++ = c; 2516 } 2517 2518 /* Here, converted the whole of the input */ 2519 *is_utf8p = FALSE; 2520 if (first_unconverted) { 2521 *first_unconverted = NULL; 2522 } 2523 2524 finish_and_return: 2525 *d = '\0'; 2526 *lenp = d - converted_start; 2527 2528 /* Trim unused space */ 2529 Renew(converted_start, *lenp + 1, U8); 2530 2531 return converted_start; 2532 } 2533 2534 /* 2535 =for apidoc bytes_to_utf8 2536 2537 Converts a string C<s> of length C<*lenp> bytes from the native encoding into 2538 UTF-8. 2539 Returns a pointer to the newly-created string, and sets C<*lenp> to 2540 reflect the new length in bytes. The caller is responsible for arranging for 2541 the memory used by this string to get freed. 2542 2543 Upon successful return, the number of variants in the string can be computed by 2544 having saved the value of C<*lenp> before the call, and subtracting it from the 2545 after-call value of C<*lenp>. 2546 2547 A C<NUL> character will be written after the end of the string. 2548 2549 If you want to convert to UTF-8 from encodings other than 2550 the native (Latin1 or EBCDIC), 2551 see L</sv_recode_to_utf8>(). 2552 2553 =cut 2554 */ 2555 2556 U8* 2557 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) 2558 { 2559 const U8 * const send = s + (*lenp); 2560 U8 *d; 2561 U8 *dst; 2562 2563 PERL_ARGS_ASSERT_BYTES_TO_UTF8; 2564 PERL_UNUSED_CONTEXT; 2565 2566 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ 2567 Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8); 2568 dst = d; 2569 2570 while (s < send) { 2571 append_utf8_from_native_byte(*s, &d); 2572 s++; 2573 } 2574 2575 *d = '\0'; 2576 *lenp = d-dst; 2577 2578 return dst; 2579 } 2580 2581 /* 2582 * Convert native UTF-16 to UTF-8. Called via the more public functions 2583 * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for 2584 * little-endian, 2585 * 2586 * 'p' is the UTF-16 input string, passed as a pointer to U8. 2587 * 'bytelen' is its length (must be even) 2588 * 'd' is the pointer to the destination buffer. The caller must ensure that 2589 * the space is large enough. The maximum expansion factor is 2 times 2590 * 'bytelen'. 1.5 if never going to run on an EBCDIC box. 2591 * '*newlen' will contain the number of bytes this function filled of 'd'. 2592 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE 2593 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE 2594 * 2595 * The expansion factor is because UTF-16 requires 2 bytes for every code point 2596 * below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code 2597 * point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for 2598 * every code point below 0x1000; otherwise 4-5 bytes. 2599 * 2600 * The worst case is where every code point is below U+10000, hence requiring 2 2601 * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8 2602 * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes. 2603 * 2604 * Do not use in-place. */ 2605 2606 U8* 2607 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen, 2608 const bool high_byte, /* Which of next two bytes is 2609 high order */ 2610 const bool low_byte) 2611 { 2612 U8* pend; 2613 U8* dstart = d; 2614 2615 PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE; 2616 2617 if (bytelen & 1) 2618 Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf, 2619 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen); 2620 pend = p + bytelen; 2621 2622 while (p < pend) { 2623 2624 /* Next 16 bits is what we want. (The bool is cast to U8 because on 2625 * platforms where a bool is implemented as a signed char, a compiler 2626 * warning may be generated) */ 2627 U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte]; 2628 p += 2; 2629 2630 /* If it's a surrogate, we find the uv that the surrogate pair encodes. 2631 * */ 2632 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { 2633 2634 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST 2635 #define LAST_HIGH_SURROGATE 0xDBFF 2636 #define FIRST_LOW_SURROGATE 0xDC00 2637 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST 2638 #define FIRST_IN_PLANE1 0x10000 2639 2640 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) { 2641 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 2642 } 2643 else { 2644 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte]; 2645 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE, 2646 LAST_LOW_SURROGATE))) 2647 { 2648 Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); 2649 } 2650 2651 p += 2; 2652 2653 /* Here uv is the high surrogate. Combine with low surrogate 2654 * just computed to form the actual U32 code point. 2655 * 2656 * From https://unicode.org/faq/utf_bom.html#utf16-4 */ 2657 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10) 2658 + low_surrogate - FIRST_LOW_SURROGATE; 2659 } 2660 } 2661 2662 /* Here, 'uv' is the real U32 we want to find the UTF-8 of */ 2663 d = uvchr_to_utf8(d, uv); 2664 } 2665 2666 *newlen = d - dstart; 2667 return d; 2668 } 2669 2670 U8* 2671 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) 2672 { 2673 PERL_ARGS_ASSERT_UTF16_TO_UTF8; 2674 2675 return utf16_to_utf8(p, d, bytelen, newlen); 2676 } 2677 2678 U8* 2679 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) 2680 { 2681 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; 2682 2683 return utf16_to_utf8_reversed(p, d, bytelen, newlen); 2684 } 2685 2686 /* 2687 * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for 2688 * big-endian and utf8_to_utf16_reversed() for little-endian, 2689 * 2690 * 's' is the UTF-8 input string, passed as a pointer to U8. 2691 * 'bytelen' is its length 2692 * 'd' is the pointer to the destination buffer, currently passed as U8 *. The 2693 * caller must ensure that the space is large enough. The maximum 2694 * expansion factor is 2 times 'bytelen'. This happens when the input is 2695 * entirely single-byte ASCII, expanding to two-byte UTF-16. 2696 * '*newlen' will contain the number of bytes this function filled of 'd'. 2697 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE 2698 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE 2699 * 2700 * Do not use in-place. */ 2701 U8* 2702 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen, 2703 const bool high_byte, /* Which of next two bytes 2704 is high order */ 2705 const bool low_byte) 2706 { 2707 U8* send; 2708 U8* dstart = d; 2709 2710 PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE; 2711 2712 send = s + bytelen; 2713 2714 while (s < send) { 2715 STRLEN retlen; 2716 UV uv = utf8n_to_uvchr(s, send - s, &retlen, 2717 /* No surrogates nor above-Unicode */ 2718 UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); 2719 2720 /* The modern method is to keep going with malformed input, 2721 * substituting the REPLACEMENT CHARACTER */ 2722 if (UNLIKELY(uv == 0 && *s != '\0')) { 2723 uv = UNICODE_REPLACEMENT; 2724 } 2725 2726 if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */ 2727 2728 /* From https://unicode.org/faq/utf_bom.html#utf16-4 */ 2729 U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10) 2730 + FIRST_HIGH_SURROGATE; 2731 2732 /* (The bool is cast to U8 because on platforms where a bool is 2733 * implemented as a signed char, a compiler warning may be 2734 * generated) */ 2735 d[(U8) high_byte] = high_surrogate >> 8; 2736 d[(U8) low_byte] = high_surrogate & nBIT_MASK(8); 2737 d += 2; 2738 2739 /* The low surrogate is the lower 10 bits plus the offset */ 2740 uv &= nBIT_MASK(10); 2741 uv += FIRST_LOW_SURROGATE; 2742 2743 /* Drop down to output the low surrogate like it were a 2744 * non-surrogate */ 2745 } 2746 2747 d[(U8) high_byte] = uv >> 8; 2748 d[(U8) low_byte] = uv & nBIT_MASK(8); 2749 d += 2; 2750 2751 s += retlen; 2752 } 2753 2754 *newlen = d - dstart; 2755 return d; 2756 } 2757 2758 bool 2759 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) 2760 { 2761 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c); 2762 } 2763 2764 bool 2765 Perl__is_uni_perl_idcont(pTHX_ UV c) 2766 { 2767 return _invlist_contains_cp(PL_utf8_perl_idcont, c); 2768 } 2769 2770 bool 2771 Perl__is_uni_perl_idstart(pTHX_ UV c) 2772 { 2773 return _invlist_contains_cp(PL_utf8_perl_idstart, c); 2774 } 2775 2776 UV 2777 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, 2778 const char S_or_s) 2779 { 2780 /* We have the latin1-range values compiled into the core, so just use 2781 * those, converting the result to UTF-8. The only difference between upper 2782 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is 2783 * either "SS" or "Ss". Which one to use is passed into the routine in 2784 * 'S_or_s' to avoid a test */ 2785 2786 UV converted = toUPPER_LATIN1_MOD(c); 2787 2788 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1; 2789 2790 assert(S_or_s == 'S' || S_or_s == 's'); 2791 2792 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for 2793 characters in this range */ 2794 *p = (U8) converted; 2795 *lenp = 1; 2796 return converted; 2797 } 2798 2799 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers, 2800 * which it maps to one of them, so as to only have to have one check for 2801 * it in the main case */ 2802 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { 2803 switch (c) { 2804 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 2805 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; 2806 break; 2807 case MICRO_SIGN: 2808 converted = GREEK_CAPITAL_LETTER_MU; 2809 break; 2810 #if UNICODE_MAJOR_VERSION > 2 \ 2811 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \ 2812 && UNICODE_DOT_DOT_VERSION >= 8) 2813 case LATIN_SMALL_LETTER_SHARP_S: 2814 *(p)++ = 'S'; 2815 *p = S_or_s; 2816 *lenp = 2; 2817 return 'S'; 2818 #endif 2819 default: 2820 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect" 2821 " '%c' to map to '%c'", 2822 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); 2823 NOT_REACHED; /* NOTREACHED */ 2824 } 2825 } 2826 2827 *(p)++ = UTF8_TWO_BYTE_HI(converted); 2828 *p = UTF8_TWO_BYTE_LO(converted); 2829 *lenp = 2; 2830 2831 return converted; 2832 } 2833 2834 /* If compiled on an early Unicode version, there may not be auxiliary tables 2835 * */ 2836 #ifndef HAS_UC_AUX_TABLES 2837 # define UC_AUX_TABLE_ptrs NULL 2838 # define UC_AUX_TABLE_lengths NULL 2839 #endif 2840 #ifndef HAS_TC_AUX_TABLES 2841 # define TC_AUX_TABLE_ptrs NULL 2842 # define TC_AUX_TABLE_lengths NULL 2843 #endif 2844 #ifndef HAS_LC_AUX_TABLES 2845 # define LC_AUX_TABLE_ptrs NULL 2846 # define LC_AUX_TABLE_lengths NULL 2847 #endif 2848 #ifndef HAS_CF_AUX_TABLES 2849 # define CF_AUX_TABLE_ptrs NULL 2850 # define CF_AUX_TABLE_lengths NULL 2851 #endif 2852 2853 /* Call the function to convert a UTF-8 encoded character to the specified case. 2854 * Note that there may be more than one character in the result. 2855 * 's' is a pointer to the first byte of the input character 2856 * 'd' will be set to the first byte of the string of changed characters. It 2857 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes 2858 * 'lenp' will be set to the length in bytes of the string of changed characters 2859 * 2860 * The functions return the ordinal of the first character in the string of 2861 * 'd' */ 2862 #define CALL_UPPER_CASE(uv, s, d, lenp) \ 2863 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \ 2864 Uppercase_Mapping_invmap, \ 2865 UC_AUX_TABLE_ptrs, \ 2866 UC_AUX_TABLE_lengths, \ 2867 "uppercase") 2868 #define CALL_TITLE_CASE(uv, s, d, lenp) \ 2869 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \ 2870 Titlecase_Mapping_invmap, \ 2871 TC_AUX_TABLE_ptrs, \ 2872 TC_AUX_TABLE_lengths, \ 2873 "titlecase") 2874 #define CALL_LOWER_CASE(uv, s, d, lenp) \ 2875 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \ 2876 Lowercase_Mapping_invmap, \ 2877 LC_AUX_TABLE_ptrs, \ 2878 LC_AUX_TABLE_lengths, \ 2879 "lowercase") 2880 2881 2882 /* This additionally has the input parameter 'specials', which if non-zero will 2883 * cause this to use the specials hash for folding (meaning get full case 2884 * folding); otherwise, when zero, this implies a simple case fold */ 2885 #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \ 2886 (specials) \ 2887 ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \ 2888 Case_Folding_invmap, \ 2889 CF_AUX_TABLE_ptrs, \ 2890 CF_AUX_TABLE_lengths, \ 2891 "foldcase") \ 2892 : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \ 2893 Simple_Case_Folding_invmap, \ 2894 NULL, NULL, \ 2895 "foldcase") 2896 2897 UV 2898 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) 2899 { 2900 /* Convert the Unicode character whose ordinal is <c> to its uppercase 2901 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>. 2902 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since 2903 * the changed version may be longer than the original character. 2904 * 2905 * The ordinal of the first character of the changed version is returned 2906 * (but note, as explained above, that there may be more.) */ 2907 2908 PERL_ARGS_ASSERT_TO_UNI_UPPER; 2909 2910 if (c < 256) { 2911 return _to_upper_title_latin1((U8) c, p, lenp, 'S'); 2912 } 2913 2914 return CALL_UPPER_CASE(c, NULL, p, lenp); 2915 } 2916 2917 UV 2918 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) 2919 { 2920 PERL_ARGS_ASSERT_TO_UNI_TITLE; 2921 2922 if (c < 256) { 2923 return _to_upper_title_latin1((U8) c, p, lenp, 's'); 2924 } 2925 2926 return CALL_TITLE_CASE(c, NULL, p, lenp); 2927 } 2928 2929 STATIC U8 2930 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy) 2931 { 2932 /* We have the latin1-range values compiled into the core, so just use 2933 * those, converting the result to UTF-8. Since the result is always just 2934 * one character, we allow <p> to be NULL */ 2935 2936 U8 converted = toLOWER_LATIN1(c); 2937 2938 PERL_UNUSED_ARG(dummy); 2939 2940 if (p != NULL) { 2941 if (NATIVE_BYTE_IS_INVARIANT(converted)) { 2942 *p = converted; 2943 *lenp = 1; 2944 } 2945 else { 2946 /* Result is known to always be < 256, so can use the EIGHT_BIT 2947 * macros */ 2948 *p = UTF8_EIGHT_BIT_HI(converted); 2949 *(p+1) = UTF8_EIGHT_BIT_LO(converted); 2950 *lenp = 2; 2951 } 2952 } 2953 return converted; 2954 } 2955 2956 UV 2957 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) 2958 { 2959 PERL_ARGS_ASSERT_TO_UNI_LOWER; 2960 2961 if (c < 256) { 2962 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ ); 2963 } 2964 2965 return CALL_LOWER_CASE(c, NULL, p, lenp); 2966 } 2967 2968 UV 2969 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) 2970 { 2971 /* Corresponds to to_lower_latin1(); <flags> bits meanings: 2972 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 2973 * FOLD_FLAGS_FULL iff full folding is to be used; 2974 * 2975 * Not to be used for locale folds 2976 */ 2977 2978 UV converted; 2979 2980 PERL_ARGS_ASSERT__TO_FOLD_LATIN1; 2981 2982 assert (! (flags & FOLD_FLAGS_LOCALE)); 2983 2984 if (UNLIKELY(c == MICRO_SIGN)) { 2985 converted = GREEK_SMALL_LETTER_MU; 2986 } 2987 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 2988 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 2989 || UNICODE_DOT_DOT_VERSION > 0) 2990 else if ( (flags & FOLD_FLAGS_FULL) 2991 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S)) 2992 { 2993 /* If can't cross 127/128 boundary, can't return "ss"; instead return 2994 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}") 2995 * under those circumstances. */ 2996 if (flags & FOLD_FLAGS_NOMIX_ASCII) { 2997 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8); 2998 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 2999 p, *lenp, U8); 3000 return LATIN_SMALL_LETTER_LONG_S; 3001 } 3002 else { 3003 *(p)++ = 's'; 3004 *p = 's'; 3005 *lenp = 2; 3006 return 's'; 3007 } 3008 } 3009 #endif 3010 else { /* In this range the fold of all other characters is their lower 3011 case */ 3012 converted = toLOWER_LATIN1(c); 3013 } 3014 3015 if (UVCHR_IS_INVARIANT(converted)) { 3016 *p = (U8) converted; 3017 *lenp = 1; 3018 } 3019 else { 3020 *(p)++ = UTF8_TWO_BYTE_HI(converted); 3021 *p = UTF8_TWO_BYTE_LO(converted); 3022 *lenp = 2; 3023 } 3024 3025 return converted; 3026 } 3027 3028 UV 3029 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) 3030 { 3031 3032 /* Not currently externally documented, and subject to change 3033 * <flags> bits meanings: 3034 * FOLD_FLAGS_FULL iff full folding is to be used; 3035 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 3036 * locale are to be used. 3037 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited 3038 */ 3039 3040 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; 3041 3042 if (flags & FOLD_FLAGS_LOCALE) { 3043 /* Treat a non-Turkic UTF-8 locale as not being in locale at all, 3044 * except for potentially warning */ 3045 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; 3046 if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) { 3047 flags &= ~FOLD_FLAGS_LOCALE; 3048 } 3049 else { 3050 goto needs_full_generality; 3051 } 3052 } 3053 3054 if (c < 256) { 3055 return _to_fold_latin1((U8) c, p, lenp, 3056 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); 3057 } 3058 3059 /* Here, above 255. If no special needs, just use the macro */ 3060 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { 3061 return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL); 3062 } 3063 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with 3064 the special flags. */ 3065 U8 utf8_c[UTF8_MAXBYTES + 1]; 3066 3067 needs_full_generality: 3068 uvchr_to_utf8(utf8_c, c); 3069 return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c), 3070 p, lenp, flags); 3071 } 3072 } 3073 3074 PERL_STATIC_INLINE bool 3075 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e, 3076 SV* const invlist) 3077 { 3078 /* returns a boolean giving whether or not the UTF8-encoded character that 3079 * starts at <p>, and extending no further than <e - 1> is in the inversion 3080 * list <invlist>. */ 3081 3082 UV cp = utf8n_to_uvchr(p, e - p, NULL, 0); 3083 3084 PERL_ARGS_ASSERT_IS_UTF8_COMMON; 3085 3086 if (cp == 0 && (p >= e || *p != '\0')) { 3087 _force_out_malformed_utf8_message(p, e, 0, 1); 3088 NOT_REACHED; /* NOTREACHED */ 3089 } 3090 3091 assert(invlist); 3092 return _invlist_contains_cp(invlist, cp); 3093 } 3094 3095 #if 0 /* Not currently used, but may be needed in the future */ 3096 PERLVAR(I, seen_deprecated_macro, HV *) 3097 3098 STATIC void 3099 S_warn_on_first_deprecated_use(pTHX_ const char * const name, 3100 const char * const alternative, 3101 const bool use_locale, 3102 const char * const file, 3103 const unsigned line) 3104 { 3105 const char * key; 3106 3107 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE; 3108 3109 if (ckWARN_d(WARN_DEPRECATED)) { 3110 3111 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line); 3112 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) { 3113 if (! PL_seen_deprecated_macro) { 3114 PL_seen_deprecated_macro = newHV(); 3115 } 3116 if (! hv_store(PL_seen_deprecated_macro, key, 3117 strlen(key), &PL_sv_undef, 0)) 3118 { 3119 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 3120 } 3121 3122 if (instr(file, "mathoms.c")) { 3123 Perl_warner(aTHX_ WARN_DEPRECATED, 3124 "In %s, line %d, starting in Perl v5.32, %s()" 3125 " will be removed. Avoid this message by" 3126 " converting to use %s().\n", 3127 file, line, name, alternative); 3128 } 3129 else { 3130 Perl_warner(aTHX_ WARN_DEPRECATED, 3131 "In %s, line %d, starting in Perl v5.32, %s() will" 3132 " require an additional parameter. Avoid this" 3133 " message by converting to use %s().\n", 3134 file, line, name, alternative); 3135 } 3136 } 3137 } 3138 } 3139 #endif 3140 3141 bool 3142 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) 3143 { 3144 PERL_ARGS_ASSERT__IS_UTF8_FOO; 3145 3146 return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]); 3147 } 3148 3149 bool 3150 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e) 3151 { 3152 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; 3153 3154 return is_utf8_common(p, e, PL_utf8_perl_idstart); 3155 } 3156 3157 bool 3158 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e) 3159 { 3160 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; 3161 3162 return is_utf8_common(p, e, PL_utf8_perl_idcont); 3163 } 3164 3165 STATIC UV 3166 S_to_case_cp_list(pTHX_ 3167 const UV original, 3168 const U32 ** const remaining_list, 3169 Size_t * remaining_count, 3170 SV *invlist, const I32 * const invmap, 3171 const U32 * const * const aux_tables, 3172 const U8 * const aux_table_lengths, 3173 const char * const normal) 3174 { 3175 SSize_t index; 3176 I32 base; 3177 3178 /* Calculate the changed case of code point 'original'. The first code 3179 * point of the changed case is returned. 3180 * 3181 * If 'remaining_count' is not NULL, *remaining_count will be set to how 3182 * many *other* code points are in the changed case. If non-zero and 3183 * 'remaining_list' is also not NULL, *remaining_list will be set to point 3184 * to a non-modifiable array containing the second and potentially third 3185 * code points in the changed case. (Unicode guarantees a maximum of 3.) 3186 * Note that this means that *remaining_list is undefined unless there are 3187 * multiple code points, and the caller has chosen to find out how many by 3188 * making 'remaining_count' not NULL. 3189 * 3190 * 'normal' is a string to use to name the new case in any generated 3191 * messages, as a fallback if the operation being used is not available. 3192 * 3193 * The casing to use is given by the data structures in the remaining 3194 * arguments. 3195 */ 3196 3197 PERL_ARGS_ASSERT_TO_CASE_CP_LIST; 3198 3199 /* 'index' is guaranteed to be non-negative, as this is an inversion map 3200 * that covers all possible inputs. See [perl #133365] */ 3201 index = _invlist_search(invlist, original); 3202 base = invmap[index]; 3203 3204 /* Most likely, the case change will contain just a single code point */ 3205 if (remaining_count) { 3206 *remaining_count = 0; 3207 } 3208 3209 if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */ 3210 3211 /* At this bottom level routine is where we warn about illegal code 3212 * points */ 3213 if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) { 3214 if (UNLIKELY(UNICODE_IS_SURROGATE(original))) { 3215 if (ckWARN_d(WARN_SURROGATE)) { 3216 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 3217 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 3218 "Operation \"%s\" returns its argument for" 3219 " UTF-16 surrogate U+%04" UVXf, desc, original); 3220 } 3221 } 3222 else if (UNLIKELY(UNICODE_IS_SUPER(original))) { 3223 if (UNLIKELY(original > MAX_LEGAL_CP)) { 3224 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original)); 3225 } 3226 if (ckWARN_d(WARN_NON_UNICODE)) { 3227 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; 3228 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), 3229 "Operation \"%s\" returns its argument for" 3230 " non-Unicode code point 0x%04" UVXf, desc, original); 3231 } 3232 } 3233 3234 /* Note that non-characters are perfectly legal, so no warning 3235 * should be given. */ 3236 } 3237 3238 return original; 3239 } 3240 3241 if (LIKELY(base > 0)) { /* means original mapped to a single code point, 3242 different from itself */ 3243 return base + original - invlist_array(invlist)[index]; 3244 } 3245 3246 /* Here 'base' is negative. That means the mapping is 1-to-many, and 3247 * requires an auxiliary table look up. abs(base) gives the index into a 3248 * list of such tables which points to the proper aux table. And a 3249 * parallel list gives the length of each corresponding aux table. Skip 3250 * the first entry in the *remaining returns, as it is returned by the 3251 * function. */ 3252 base = -base; 3253 if (remaining_count) { 3254 *remaining_count = (Size_t) (aux_table_lengths[base] - 1); 3255 3256 if (remaining_list) { 3257 *remaining_list = aux_tables[base] + 1; 3258 } 3259 } 3260 3261 return (UV) aux_tables[base][0]; 3262 } 3263 3264 STATIC UV 3265 S__to_utf8_case(pTHX_ const UV original, const U8 *p, 3266 U8* ustrp, STRLEN *lenp, 3267 SV *invlist, const I32 * const invmap, 3268 const U32 * const * const aux_tables, 3269 const U8 * const aux_table_lengths, 3270 const char * const normal) 3271 { 3272 /* Change the case of code point 'original'. If 'p' is non-NULL, it points to 3273 * the beginning of the (assumed to be valid) UTF-8 representation of 3274 * 'original'. 'normal' is a string to use to name the new case in any 3275 * generated messages, as a fallback if the operation being used is not 3276 * available. The new case is given by the data structures in the 3277 * remaining arguments. 3278 * 3279 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the 3280 * entire changed case string, and the return value is the first code point 3281 * in that string 3282 * 3283 * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes 3284 * since the changed version may be longer than the original character. */ 3285 3286 const U32 * remaining_list; 3287 Size_t remaining_count; 3288 UV first = to_case_cp_list(original, 3289 &remaining_list, &remaining_count, 3290 invlist, invmap, 3291 aux_tables, aux_table_lengths, 3292 normal); 3293 3294 PERL_ARGS_ASSERT__TO_UTF8_CASE; 3295 3296 /* If the code point maps to itself and we already have its representation, 3297 * copy it instead of recalculating */ 3298 if (original == first && p) { 3299 *lenp = UTF8SKIP(p); 3300 3301 if (p != ustrp) { /* Don't copy onto itself */ 3302 Copy(p, ustrp, *lenp, U8); 3303 } 3304 } 3305 else { 3306 U8 * d = ustrp; 3307 Size_t i; 3308 3309 d = uvchr_to_utf8(d, first); 3310 3311 for (i = 0; i < remaining_count; i++) { 3312 d = uvchr_to_utf8(d, remaining_list[i]); 3313 } 3314 3315 *d = '\0'; 3316 *lenp = d - ustrp; 3317 } 3318 3319 return first; 3320 } 3321 3322 Size_t 3323 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to, 3324 const U32 ** remaining_folds_to) 3325 { 3326 /* Returns the count of the number of code points that fold to the input 3327 * 'cp' (besides itself). 3328 * 3329 * If the return is 0, there is nothing else that folds to it, and 3330 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL. 3331 * 3332 * If the return is 1, '*first_folds_to' is set to the single code point, 3333 * and '*remaining_folds_to' is set to NULL. 3334 * 3335 * Otherwise, '*first_folds_to' is set to a code point, and 3336 * '*remaining_fold_to' is set to an array that contains the others. The 3337 * length of this array is the returned count minus 1. 3338 * 3339 * The reason for this convolution is to avoid having to deal with 3340 * allocating and freeing memory. The lists are already constructed, so 3341 * the return can point to them, but single code points aren't, so would 3342 * need to be constructed if we didn't employ something like this API 3343 * 3344 * The code points returned by this function are all legal Unicode, which 3345 * occupy at most 21 bits, and so a U32 is sufficient, and the lists are 3346 * constructed with this size (to save space and memory), and we return 3347 * pointers, so they must be this size */ 3348 3349 /* 'index' is guaranteed to be non-negative, as this is an inversion map 3350 * that covers all possible inputs. See [perl #133365] */ 3351 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); 3352 I32 base = _Perl_IVCF_invmap[index]; 3353 3354 PERL_ARGS_ASSERT__INVERSE_FOLDS; 3355 3356 if (base == 0) { /* No fold */ 3357 *first_folds_to = 0; 3358 *remaining_folds_to = NULL; 3359 return 0; 3360 } 3361 3362 #ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */ 3363 3364 assert(base > 0); 3365 3366 #else 3367 3368 if (UNLIKELY(base < 0)) { /* Folds to more than one character */ 3369 3370 /* The data structure is set up so that the absolute value of 'base' is 3371 * an index into a table of pointers to arrays, with the array 3372 * corresponding to the index being the list of code points that fold 3373 * to 'cp', and the parallel array containing the length of the list 3374 * array */ 3375 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; 3376 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; 3377 /* +1 excludes first_folds_to */ 3378 return IVCF_AUX_TABLE_lengths[-base]; 3379 } 3380 3381 #endif 3382 3383 /* Only the single code point. This works like 'fc(G) = G - A + a' */ 3384 *first_folds_to = (U32) (base + cp 3385 - invlist_array(PL_utf8_foldclosures)[index]); 3386 *remaining_folds_to = NULL; 3387 return 1; 3388 } 3389 3390 STATIC UV 3391 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, 3392 U8* const ustrp, STRLEN *lenp) 3393 { 3394 /* This is called when changing the case of a UTF-8-encoded character above 3395 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the 3396 * result contains a character that crosses the 255/256 boundary, disallow 3397 * the change, and return the original code point. See L<perlfunc/lc> for 3398 * why; 3399 * 3400 * p points to the original string whose case was changed; assumed 3401 * by this routine to be well-formed 3402 * result the code point of the first character in the changed-case string 3403 * ustrp points to the changed-case string (<result> represents its 3404 * first char) 3405 * lenp points to the length of <ustrp> */ 3406 3407 UV original; /* To store the first code point of <p> */ 3408 3409 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING; 3410 3411 assert(UTF8_IS_ABOVE_LATIN1(*p)); 3412 3413 /* We know immediately if the first character in the string crosses the 3414 * boundary, so can skip testing */ 3415 if (result > 255) { 3416 3417 /* Look at every character in the result; if any cross the 3418 * boundary, the whole thing is disallowed */ 3419 U8* s = ustrp + UTF8SKIP(ustrp); 3420 U8* e = ustrp + *lenp; 3421 while (s < e) { 3422 if (! UTF8_IS_ABOVE_LATIN1(*s)) { 3423 goto bad_crossing; 3424 } 3425 s += UTF8SKIP(s); 3426 } 3427 3428 /* Here, no characters crossed, result is ok as-is, but we warn. */ 3429 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p)); 3430 return result; 3431 } 3432 3433 bad_crossing: 3434 3435 /* Failed, have to return the original */ 3436 original = valid_utf8_to_uvchr(p, lenp); 3437 3438 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3439 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3440 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8" 3441 " locale; resolved to \"\\x{%" UVXf "}\".", 3442 OP_DESC(PL_op), 3443 original, 3444 original); 3445 Copy(p, ustrp, *lenp, char); 3446 return original; 3447 } 3448 3449 STATIC UV 3450 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e, 3451 U8 * ustrp, STRLEN *lenp) 3452 { 3453 /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from 3454 * p0..e-1 according to Turkic rules is the same as for non-Turkic. 3455 * Otherwise, it returns the first code point of the Turkic foldcased 3456 * sequence, and the entire sequence will be stored in *ustrp. ustrp will 3457 * contain *lenp bytes 3458 * 3459 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER 3460 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER 3461 * DOTLESS I */ 3462 3463 PERL_ARGS_ASSERT_TURKIC_FC; 3464 assert(e > p); 3465 3466 if (UNLIKELY(*p == 'I')) { 3467 *lenp = 2; 3468 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I); 3469 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I); 3470 return LATIN_SMALL_LETTER_DOTLESS_I; 3471 } 3472 3473 if (UNLIKELY(memBEGINs(p, e - p, 3474 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8))) 3475 { 3476 *lenp = 1; 3477 *ustrp = 'i'; 3478 return 'i'; 3479 } 3480 3481 return 0; 3482 } 3483 3484 STATIC UV 3485 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e, 3486 U8 * ustrp, STRLEN *lenp) 3487 { 3488 /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from 3489 * p0..e-1 according to Turkic rules is the same as for non-Turkic. 3490 * Otherwise, it returns the first code point of the Turkic lowercased 3491 * sequence, and the entire sequence will be stored in *ustrp. ustrp will 3492 * contain *lenp bytes */ 3493 3494 PERL_ARGS_ASSERT_TURKIC_LC; 3495 assert(e > p0); 3496 3497 /* A 'I' requires context as to what to do */ 3498 if (UNLIKELY(*p0 == 'I')) { 3499 const U8 * p = p0 + 1; 3500 3501 /* According to the Unicode SpecialCasing.txt file, a capital 'I' 3502 * modified by a dot above lowercases to 'i' even in turkic locales. */ 3503 while (p < e) { 3504 UV cp; 3505 3506 if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) { 3507 ustrp[0] = 'i'; 3508 *lenp = 1; 3509 return 'i'; 3510 } 3511 3512 /* For the dot above to modify the 'I', it must be part of a 3513 * combining sequence immediately following the 'I', and no other 3514 * modifier with a ccc of 230 may intervene */ 3515 cp = utf8_to_uvchr_buf(p, e, NULL); 3516 if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) { 3517 break; 3518 } 3519 3520 /* Here the combining sequence continues */ 3521 p += UTF8SKIP(p); 3522 } 3523 } 3524 3525 /* In all other cases the lc is the same as the fold */ 3526 return turkic_fc(p0, e, ustrp, lenp); 3527 } 3528 3529 STATIC UV 3530 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, 3531 U8 * ustrp, STRLEN *lenp) 3532 { 3533 /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence 3534 * from p0..e-1 according to Turkic rules is the same as for non-Turkic. 3535 * Otherwise, it returns the first code point of the Turkic upper or 3536 * title-cased sequence, and the entire sequence will be stored in *ustrp. 3537 * ustrp will contain *lenp bytes 3538 * 3539 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER 3540 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER 3541 * DOTLESS I */ 3542 3543 PERL_ARGS_ASSERT_TURKIC_UC; 3544 assert(e > p); 3545 3546 if (*p == 'i') { 3547 *lenp = 2; 3548 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 3549 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); 3550 return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; 3551 } 3552 3553 if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) { 3554 *lenp = 1; 3555 *ustrp = 'I'; 3556 return 'I'; 3557 } 3558 3559 return 0; 3560 } 3561 3562 /* The process for changing the case is essentially the same for the four case 3563 * change types, except there are complications for folding. Otherwise the 3564 * difference is only which case to change to. To make sure that they all do 3565 * the same thing, the bodies of the functions are extracted out into the 3566 * following two macros. The functions are written with the same variable 3567 * names, and these are known and used inside these macros. It would be 3568 * better, of course, to have inline functions to do it, but since different 3569 * macros are called, depending on which case is being changed to, this is not 3570 * feasible in C (to khw's knowledge). Two macros are created so that the fold 3571 * function can start with the common start macro, then finish with its special 3572 * handling; while the other three cases can just use the common end macro. 3573 * 3574 * The algorithm is to use the proper (passed in) macro or function to change 3575 * the case for code points that are below 256. The macro is used if using 3576 * locale rules for the case change; the function if not. If the code point is 3577 * above 255, it is computed from the input UTF-8, and another macro is called 3578 * to do the conversion. If necessary, the output is converted to UTF-8. If 3579 * using a locale, we have to check that the change did not cross the 255/256 3580 * boundary, see check_locale_boundary_crossing() for further details. 3581 * 3582 * The macros are split with the correct case change for the below-256 case 3583 * stored into 'result', and in the middle of an else clause for the above-255 3584 * case. At that point in the 'else', 'result' is not the final result, but is 3585 * the input code point calculated from the UTF-8. The fold code needs to 3586 * realize all this and take it from there. 3587 * 3588 * To deal with Turkic locales, the function specified by the parameter 3589 * 'turkic' is called when appropriate. 3590 * 3591 * If you read the two macros as sequential, it's easier to understand what's 3592 * going on. */ 3593 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func, \ 3594 L1_func_extra_param, turkic) \ 3595 \ 3596 if (flags & (locale_flags)) { \ 3597 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ 3598 if (IN_UTF8_CTYPE_LOCALE) { \ 3599 if (UNLIKELY(PL_in_utf8_turkic_locale)) { \ 3600 UV ret = turkic(p, e, ustrp, lenp); \ 3601 if (ret) return ret; \ 3602 } \ 3603 \ 3604 /* Otherwise, treat a UTF-8 locale as not being in locale at \ 3605 * all */ \ 3606 flags &= ~(locale_flags); \ 3607 } \ 3608 } \ 3609 \ 3610 if (UTF8_IS_INVARIANT(*p)) { \ 3611 if (flags & (locale_flags)) { \ 3612 result = libc_change_function(*p); \ 3613 } \ 3614 else { \ 3615 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \ 3616 } \ 3617 } \ 3618 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \ 3619 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \ 3620 if (flags & (locale_flags)) { \ 3621 result = libc_change_function(c); \ 3622 } \ 3623 else { \ 3624 return L1_func(c, ustrp, lenp, L1_func_extra_param); \ 3625 } \ 3626 } \ 3627 else { /* malformed UTF-8 or ord above 255 */ \ 3628 STRLEN len_result; \ 3629 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \ 3630 if (len_result == (STRLEN) -1) { \ 3631 _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ ); \ 3632 } 3633 3634 #define CASE_CHANGE_BODY_END(locale_flags, change_macro) \ 3635 result = change_macro(result, p, ustrp, lenp); \ 3636 \ 3637 if (flags & (locale_flags)) { \ 3638 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \ 3639 } \ 3640 return result; \ 3641 } \ 3642 \ 3643 /* Here, used locale rules. Convert back to UTF-8 */ \ 3644 if (UTF8_IS_INVARIANT(result)) { \ 3645 *ustrp = (U8) result; \ 3646 *lenp = 1; \ 3647 } \ 3648 else { \ 3649 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \ 3650 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \ 3651 *lenp = 2; \ 3652 } \ 3653 \ 3654 return result; 3655 3656 /* Not currently externally documented, and subject to change: 3657 * <flags> is set iff the rules from the current underlying locale are to 3658 * be used. */ 3659 3660 UV 3661 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, 3662 const U8 *e, 3663 U8* ustrp, 3664 STRLEN *lenp, 3665 bool flags) 3666 { 3667 UV result; 3668 3669 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; 3670 3671 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */ 3672 /* 2nd char of uc(U+DF) is 'S' */ 3673 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S', 3674 turkic_uc); 3675 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE); 3676 } 3677 3678 /* Not currently externally documented, and subject to change: 3679 * <flags> is set iff the rules from the current underlying locale are to be 3680 * used. Since titlecase is not defined in POSIX, for other than a 3681 * UTF-8 locale, uppercase is used instead for code points < 256. 3682 */ 3683 3684 UV 3685 Perl__to_utf8_title_flags(pTHX_ const U8 *p, 3686 const U8 *e, 3687 U8* ustrp, 3688 STRLEN *lenp, 3689 bool flags) 3690 { 3691 UV result; 3692 3693 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; 3694 3695 /* 2nd char of ucfirst(U+DF) is 's' */ 3696 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's', 3697 turkic_uc); 3698 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE); 3699 } 3700 3701 /* Not currently externally documented, and subject to change: 3702 * <flags> is set iff the rules from the current underlying locale are to 3703 * be used. 3704 */ 3705 3706 UV 3707 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, 3708 const U8 *e, 3709 U8* ustrp, 3710 STRLEN *lenp, 3711 bool flags) 3712 { 3713 UV result; 3714 3715 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; 3716 3717 CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */, 3718 turkic_lc); 3719 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE) 3720 } 3721 3722 /* Not currently externally documented, and subject to change, 3723 * in <flags> 3724 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying 3725 * locale are to be used. 3726 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used; 3727 * otherwise simple folds 3728 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are 3729 * prohibited 3730 */ 3731 3732 UV 3733 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, 3734 const U8 *e, 3735 U8* ustrp, 3736 STRLEN *lenp, 3737 U8 flags) 3738 { 3739 UV result; 3740 3741 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; 3742 3743 /* These are mutually exclusive */ 3744 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII))); 3745 3746 assert(p != ustrp); /* Otherwise overwrites */ 3747 3748 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1, 3749 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)), 3750 turkic_fc); 3751 3752 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL); 3753 3754 if (flags & FOLD_FLAGS_LOCALE) { 3755 3756 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 3757 # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 3758 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 3759 3760 /* Special case these two characters, as what normally gets 3761 * returned under locale doesn't work */ 3762 if (memBEGINs((char *) p, e - p, CAP_SHARP_S)) 3763 { 3764 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3765 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3766 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; " 3767 "resolved to \"\\x{17F}\\x{17F}\"."); 3768 goto return_long_s; 3769 } 3770 else 3771 #endif 3772 if (memBEGINs((char *) p, e - p, LONG_S_T)) 3773 { 3774 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3775 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3776 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; " 3777 "resolved to \"\\x{FB06}\"."); 3778 goto return_ligature_st; 3779 } 3780 3781 #if UNICODE_MAJOR_VERSION == 3 \ 3782 && UNICODE_DOT_VERSION == 0 \ 3783 && UNICODE_DOT_DOT_VERSION == 1 3784 # define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8 3785 3786 /* And special case this on this Unicode version only, for the same 3787 * reaons the other two are special cased. They would cross the 3788 * 255/256 boundary which is forbidden under /l, and so the code 3789 * wouldn't catch that they are equivalent (which they are only in 3790 * this release) */ 3791 else if (memBEGINs((char *) p, e - p, DOTTED_I)) { 3792 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ 3793 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 3794 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " 3795 "resolved to \"\\x{0131}\"."); 3796 goto return_dotless_i; 3797 } 3798 #endif 3799 3800 return check_locale_boundary_crossing(p, result, ustrp, lenp); 3801 } 3802 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) { 3803 return result; 3804 } 3805 else { 3806 /* This is called when changing the case of a UTF-8-encoded 3807 * character above the ASCII range, and the result should not 3808 * contain an ASCII character. */ 3809 3810 UV original; /* To store the first code point of <p> */ 3811 3812 /* Look at every character in the result; if any cross the 3813 * boundary, the whole thing is disallowed */ 3814 U8* s = ustrp; 3815 U8* send = ustrp + *lenp; 3816 while (s < send) { 3817 if (isASCII(*s)) { 3818 /* Crossed, have to return the original */ 3819 original = valid_utf8_to_uvchr(p, lenp); 3820 3821 /* But in these instances, there is an alternative we can 3822 * return that is valid */ 3823 if (original == LATIN_SMALL_LETTER_SHARP_S 3824 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ 3825 || original == LATIN_CAPITAL_LETTER_SHARP_S 3826 #endif 3827 ) { 3828 goto return_long_s; 3829 } 3830 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) { 3831 goto return_ligature_st; 3832 } 3833 #if UNICODE_MAJOR_VERSION == 3 \ 3834 && UNICODE_DOT_VERSION == 0 \ 3835 && UNICODE_DOT_DOT_VERSION == 1 3836 3837 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { 3838 goto return_dotless_i; 3839 } 3840 #endif 3841 Copy(p, ustrp, *lenp, char); 3842 return original; 3843 } 3844 s += UTF8SKIP(s); 3845 } 3846 3847 /* Here, no characters crossed, result is ok as-is */ 3848 return result; 3849 } 3850 } 3851 3852 /* Here, used locale rules. Convert back to UTF-8 */ 3853 if (UTF8_IS_INVARIANT(result)) { 3854 *ustrp = (U8) result; 3855 *lenp = 1; 3856 } 3857 else { 3858 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); 3859 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); 3860 *lenp = 2; 3861 } 3862 3863 return result; 3864 3865 return_long_s: 3866 /* Certain folds to 'ss' are prohibited by the options, but they do allow 3867 * folds to a string of two of these characters. By returning this 3868 * instead, then, e.g., 3869 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}") 3870 * works. */ 3871 3872 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8); 3873 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, 3874 ustrp, *lenp, U8); 3875 return LATIN_SMALL_LETTER_LONG_S; 3876 3877 return_ligature_st: 3878 /* Two folds to 'st' are prohibited by the options; instead we pick one and 3879 * have the other one fold to it */ 3880 3881 *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8); 3882 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8); 3883 return LATIN_SMALL_LIGATURE_ST; 3884 3885 #if UNICODE_MAJOR_VERSION == 3 \ 3886 && UNICODE_DOT_VERSION == 0 \ 3887 && UNICODE_DOT_DOT_VERSION == 1 3888 3889 return_dotless_i: 3890 *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8); 3891 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8); 3892 return LATIN_SMALL_LETTER_DOTLESS_I; 3893 3894 #endif 3895 3896 } 3897 3898 bool 3899 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) 3900 { 3901 /* May change: warns if surrogates, non-character code points, or 3902 * non-Unicode code points are in 's' which has length 'len' bytes. 3903 * Returns TRUE if none found; FALSE otherwise. The only other validity 3904 * check is to make sure that this won't exceed the string's length nor 3905 * overflow */ 3906 3907 const U8* const e = s + len; 3908 bool ok = TRUE; 3909 3910 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT; 3911 3912 while (s < e) { 3913 if (UTF8SKIP(s) > len) { 3914 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), 3915 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print"); 3916 return FALSE; 3917 } 3918 if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) { 3919 if (UNLIKELY(UTF8_IS_SUPER(s, e))) { 3920 if ( ckWARN_d(WARN_NON_UNICODE) 3921 || UNLIKELY(0 < does_utf8_overflow(s, s + len, 3922 0 /* Don't consider overlongs */ 3923 ))) 3924 { 3925 /* A side effect of this function will be to warn */ 3926 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER); 3927 ok = FALSE; 3928 } 3929 } 3930 else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) { 3931 if (ckWARN_d(WARN_SURROGATE)) { 3932 /* This has a different warning than the one the called 3933 * function would output, so can't just call it, unlike we 3934 * do for the non-chars and above-unicodes */ 3935 UV uv = utf8_to_uvchr_buf(s, e, NULL); 3936 Perl_warner(aTHX_ packWARN(WARN_SURROGATE), 3937 "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", 3938 uv); 3939 ok = FALSE; 3940 } 3941 } 3942 else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e)) 3943 && (ckWARN_d(WARN_NONCHAR))) 3944 { 3945 /* A side effect of this function will be to warn */ 3946 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR); 3947 ok = FALSE; 3948 } 3949 } 3950 s += UTF8SKIP(s); 3951 } 3952 3953 return ok; 3954 } 3955 3956 /* 3957 =for apidoc pv_uni_display 3958 3959 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string 3960 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes 3961 long (if longer, the rest is truncated and C<"..."> will be appended). 3962 3963 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display 3964 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH> 3965 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">) 3966 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">). 3967 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both 3968 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on. 3969 3970 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a 3971 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set. 3972 3973 The pointer to the PV of the C<dsv> is returned. 3974 3975 See also L</sv_uni_display>. 3976 3977 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH 3978 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE 3979 =for apidoc Amnh||UNI_DISPLAY_ISPRINT 3980 =for apidoc Amnh||UNI_DISPLAY_QQ 3981 =for apidoc Amnh||UNI_DISPLAY_REGEX 3982 =cut 3983 */ 3984 char * 3985 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, 3986 UV flags) 3987 { 3988 int truncated = 0; 3989 const char *s, *e; 3990 3991 PERL_ARGS_ASSERT_PV_UNI_DISPLAY; 3992 3993 SvPVCLEAR(dsv); 3994 SvUTF8_off(dsv); 3995 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { 3996 UV u; 3997 bool ok = 0; 3998 3999 if (pvlim && SvCUR(dsv) >= pvlim) { 4000 truncated++; 4001 break; 4002 } 4003 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0); 4004 if (u < 256) { 4005 const U8 c = (U8) u; 4006 if (flags & UNI_DISPLAY_BACKSLASH) { 4007 if ( isMNEMONIC_CNTRL(c) 4008 && ( c != '\b' 4009 || (flags & UNI_DISPLAY_BACKSPACE))) 4010 { 4011 const char * mnemonic = cntrl_to_mnemonic(c); 4012 sv_catpvn(dsv, mnemonic, strlen(mnemonic)); 4013 ok = 1; 4014 } 4015 else if (c == '\\') { 4016 sv_catpvs(dsv, "\\\\"); 4017 ok = 1; 4018 } 4019 } 4020 /* isPRINT() is the locale-blind version. */ 4021 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { 4022 const char string = c; 4023 sv_catpvn(dsv, &string, 1); 4024 ok = 1; 4025 } 4026 } 4027 if (!ok) 4028 Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u); 4029 } 4030 if (truncated) 4031 sv_catpvs(dsv, "..."); 4032 4033 return SvPVX(dsv); 4034 } 4035 4036 /* 4037 =for apidoc sv_uni_display 4038 4039 Build to the scalar C<dsv> a displayable version of the scalar C<sv>, 4040 the displayable version being at most C<pvlim> bytes long 4041 (if longer, the rest is truncated and "..." will be appended). 4042 4043 The C<flags> argument is as in L</pv_uni_display>(). 4044 4045 The pointer to the PV of the C<dsv> is returned. 4046 4047 =cut 4048 */ 4049 char * 4050 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) 4051 { 4052 const char * const ptr = 4053 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv); 4054 4055 PERL_ARGS_ASSERT_SV_UNI_DISPLAY; 4056 4057 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr, 4058 SvCUR(ssv), pvlim, flags); 4059 } 4060 4061 /* 4062 =for apidoc foldEQ_utf8 4063 4064 Returns true if the leading portions of the strings C<s1> and C<s2> (either or 4065 both of which may be in UTF-8) are the same case-insensitively; false 4066 otherwise. How far into the strings to compare is determined by other input 4067 parameters. 4068 4069 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode; 4070 otherwise it is assumed to be in native 8-bit encoding. Correspondingly for 4071 C<u2> with respect to C<s2>. 4072 4073 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for 4074 fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach. 4075 The scan will not be considered to be a match unless the goal is reached, and 4076 scanning won't continue past that goal. Correspondingly for C<l2> with respect 4077 to C<s2>. 4078 4079 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that 4080 pointer is considered an end pointer to the position 1 byte past the maximum 4081 point in C<s1> beyond which scanning will not continue under any circumstances. 4082 (This routine assumes that UTF-8 encoded input strings are not malformed; 4083 malformed input can cause it to read past C<pe1>). This means that if both 4084 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match 4085 will never be successful because it can never 4086 get as far as its goal (and in fact is asserted against). Correspondingly for 4087 C<pe2> with respect to C<s2>. 4088 4089 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and 4090 C<l2> must be non-zero), and if both do, both have to be 4091 reached for a successful match. Also, if the fold of a character is multiple 4092 characters, all of them must be matched (see tr21 reference below for 4093 'folding'). 4094 4095 Upon a successful match, if C<pe1> is non-C<NULL>, 4096 it will be set to point to the beginning of the I<next> character of C<s1> 4097 beyond what was matched. Correspondingly for C<pe2> and C<s2>. 4098 4099 For case-insensitiveness, the "casefolding" of Unicode is used 4100 instead of upper/lowercasing both the characters, see 4101 L<https://www.unicode.org/reports/tr21/> (Case Mappings). 4102 4103 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII 4104 =for apidoc Cmnh||FOLDEQ_LOCALE 4105 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED 4106 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE 4107 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED 4108 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE 4109 4110 =cut */ 4111 4112 /* A flags parameter has been added which may change, and hence isn't 4113 * externally documented. Currently it is: 4114 * 0 for as-documented above 4115 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an 4116 ASCII one, to not match 4117 * FOLDEQ_LOCALE is set iff the rules from the current underlying 4118 * locale are to be used. 4119 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this 4120 * routine. This allows that step to be skipped. 4121 * Currently, this requires s1 to be encoded as UTF-8 4122 * (u1 must be true), which is asserted for. 4123 * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may 4124 * cross certain boundaries. Hence, the caller should 4125 * let this function do the folding instead of 4126 * pre-folding. This code contains an assertion to 4127 * that effect. However, if the caller knows what 4128 * it's doing, it can pass this flag to indicate that, 4129 * and the assertion is skipped. 4130 * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies 4131 * to s2, and s2 doesn't have to be UTF-8 encoded. 4132 * This introduces an asymmetry to save a few branches 4133 * in a loop. Currently, this is not a problem, as 4134 * never are both inputs pre-folded. Simply call this 4135 * function with the pre-folded one as the second 4136 * string. 4137 * FOLDEQ_S2_FOLDS_SANE 4138 */ 4139 4140 I32 4141 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, 4142 const char *s2, char **pe2, UV l2, bool u2, 4143 U32 flags) 4144 { 4145 const U8 *p1 = (const U8*)s1; /* Point to current char */ 4146 const U8 *p2 = (const U8*)s2; 4147 const U8 *g1 = NULL; /* goal for s1 */ 4148 const U8 *g2 = NULL; 4149 const U8 *e1 = NULL; /* Don't scan s1 past this */ 4150 U8 *f1 = NULL; /* Point to current folded */ 4151 const U8 *e2 = NULL; 4152 U8 *f2 = NULL; 4153 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */ 4154 U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; 4155 U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; 4156 U8 flags_for_folder = FOLD_FLAGS_FULL; 4157 4158 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; 4159 4160 assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) 4161 && (( (flags & FOLDEQ_S1_ALREADY_FOLDED) 4162 && !(flags & FOLDEQ_S1_FOLDS_SANE)) 4163 || ( (flags & FOLDEQ_S2_ALREADY_FOLDED) 4164 && !(flags & FOLDEQ_S2_FOLDS_SANE))))); 4165 /* The algorithm is to trial the folds without regard to the flags on 4166 * the first line of the above assert(), and then see if the result 4167 * violates them. This means that the inputs can't be pre-folded to a 4168 * violating result, hence the assert. This could be changed, with the 4169 * addition of extra tests here for the already-folded case, which would 4170 * slow it down. That cost is more than any possible gain for when these 4171 * flags are specified, as the flags indicate /il or /iaa matching which 4172 * is less common than /iu, and I (khw) also believe that real-world /il 4173 * and /iaa matches are most likely to involve code points 0-255, and this 4174 * function only under rare conditions gets called for 0-255. */ 4175 4176 if (flags & FOLDEQ_LOCALE) { 4177 if (IN_UTF8_CTYPE_LOCALE) { 4178 if (UNLIKELY(PL_in_utf8_turkic_locale)) { 4179 flags_for_folder |= FOLD_FLAGS_LOCALE; 4180 } 4181 else { 4182 flags &= ~FOLDEQ_LOCALE; 4183 } 4184 } 4185 else { 4186 flags_for_folder |= FOLD_FLAGS_LOCALE; 4187 } 4188 } 4189 if (flags & FOLDEQ_UTF8_NOMIX_ASCII) { 4190 flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII; 4191 } 4192 4193 if (pe1) { 4194 e1 = *(U8**)pe1; 4195 } 4196 4197 if (l1) { 4198 g1 = (const U8*)s1 + l1; 4199 } 4200 4201 if (pe2) { 4202 e2 = *(U8**)pe2; 4203 } 4204 4205 if (l2) { 4206 g2 = (const U8*)s2 + l2; 4207 } 4208 4209 /* Must have at least one goal */ 4210 assert(g1 || g2); 4211 4212 if (g1) { 4213 4214 /* Will never match if goal is out-of-bounds */ 4215 assert(! e1 || e1 >= g1); 4216 4217 /* Here, there isn't an end pointer, or it is beyond the goal. We 4218 * only go as far as the goal */ 4219 e1 = g1; 4220 } 4221 else { 4222 assert(e1); /* Must have an end for looking at s1 */ 4223 } 4224 4225 /* Same for goal for s2 */ 4226 if (g2) { 4227 assert(! e2 || e2 >= g2); 4228 e2 = g2; 4229 } 4230 else { 4231 assert(e2); 4232 } 4233 4234 /* If both operands are already folded, we could just do a memEQ on the 4235 * whole strings at once, but it would be better if the caller realized 4236 * this and didn't even call us */ 4237 4238 /* Look through both strings, a character at a time */ 4239 while (p1 < e1 && p2 < e2) { 4240 4241 /* If at the beginning of a new character in s1, get its fold to use 4242 * and the length of the fold. */ 4243 if (n1 == 0) { 4244 if (flags & FOLDEQ_S1_ALREADY_FOLDED) { 4245 f1 = (U8 *) p1; 4246 assert(u1); 4247 n1 = UTF8SKIP(f1); 4248 } 4249 else { 4250 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) { 4251 4252 /* We have to forbid mixing ASCII with non-ASCII if the 4253 * flags so indicate. And, we can short circuit having to 4254 * call the general functions for this common ASCII case, 4255 * all of whose non-locale folds are also ASCII, and hence 4256 * UTF-8 invariants, so the UTF8ness of the strings is not 4257 * relevant. */ 4258 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) { 4259 return 0; 4260 } 4261 n1 = 1; 4262 *foldbuf1 = toFOLD(*p1); 4263 } 4264 else if (u1) { 4265 _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder); 4266 } 4267 else { /* Not UTF-8, get UTF-8 fold */ 4268 _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder); 4269 } 4270 f1 = foldbuf1; 4271 } 4272 } 4273 4274 if (n2 == 0) { /* Same for s2 */ 4275 if (flags & FOLDEQ_S2_ALREADY_FOLDED) { 4276 4277 /* Point to the already-folded character. But for non-UTF-8 4278 * variants, convert to UTF-8 for the algorithm below */ 4279 if (UTF8_IS_INVARIANT(*p2)) { 4280 f2 = (U8 *) p2; 4281 n2 = 1; 4282 } 4283 else if (u2) { 4284 f2 = (U8 *) p2; 4285 n2 = UTF8SKIP(f2); 4286 } 4287 else { 4288 foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2); 4289 foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2); 4290 f2 = foldbuf2; 4291 n2 = 2; 4292 } 4293 } 4294 else { 4295 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) { 4296 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) { 4297 return 0; 4298 } 4299 n2 = 1; 4300 *foldbuf2 = toFOLD(*p2); 4301 } 4302 else if (u2) { 4303 _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder); 4304 } 4305 else { 4306 _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder); 4307 } 4308 f2 = foldbuf2; 4309 } 4310 } 4311 4312 /* Here f1 and f2 point to the beginning of the strings to compare. 4313 * These strings are the folds of the next character from each input 4314 * string, stored in UTF-8. */ 4315 4316 /* While there is more to look for in both folds, see if they 4317 * continue to match */ 4318 while (n1 && n2) { 4319 U8 fold_length = UTF8SKIP(f1); 4320 if (fold_length != UTF8SKIP(f2) 4321 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE 4322 function call for single 4323 byte */ 4324 || memNE((char*)f1, (char*)f2, fold_length)) 4325 { 4326 return 0; /* mismatch */ 4327 } 4328 4329 /* Here, they matched, advance past them */ 4330 n1 -= fold_length; 4331 f1 += fold_length; 4332 n2 -= fold_length; 4333 f2 += fold_length; 4334 } 4335 4336 /* When reach the end of any fold, advance the input past it */ 4337 if (n1 == 0) { 4338 p1 += u1 ? UTF8SKIP(p1) : 1; 4339 } 4340 if (n2 == 0) { 4341 p2 += u2 ? UTF8SKIP(p2) : 1; 4342 } 4343 } /* End of loop through both strings */ 4344 4345 /* A match is defined by each scan that specified an explicit length 4346 * reaching its final goal, and the other not having matched a partial 4347 * character (which can happen when the fold of a character is more than one 4348 * character). */ 4349 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) { 4350 return 0; 4351 } 4352 4353 /* Successful match. Set output pointers */ 4354 if (pe1) { 4355 *pe1 = (char*)p1; 4356 } 4357 if (pe2) { 4358 *pe2 = (char*)p2; 4359 } 4360 return 1; 4361 } 4362 4363 /* 4364 * ex: set ts=8 sts=4 sw=4 et: 4365 */ 4366