1#!perl -w 2 3# This is a base file to be used by various .t's in its directory 4# It tests various malformed UTF-8 sequences and some code points that are 5# "problematic", and verifies that the correct warnings/flags etc are 6# generated when using them. For the code points, it also takes the UTF-8 and 7# perturbs it to be malformed in various ways, and tests that this gets 8# appropriately detected. 9 10use strict; 11use Test::More; 12 13BEGIN { 14 use_ok('XS::APItest'); 15 require 'charset_tools.pl'; 16 require './t/utf8_setup.pl'; 17}; 18 19$|=1; 20 21use XS::APItest; 22 23my @warnings_gotten; 24 25use warnings 'utf8'; 26local $SIG{__WARN__} = sub { my @copy = @_; 27 push @warnings_gotten, map { chomp; $_ } @copy; 28 }; 29 30my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF; 31my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation); 32 33# C5 is chosen as it is valid for both ASCII and EBCDIC platforms 34my $known_start_byte = I8_to_native("\xC5"); 35 36sub requires_extended_utf8($) { 37 38 # Returns a boolean as to whether or not the code point parameter fits 39 # into 31 bits (30 on EBCDIC), subject to the convention that a negative 40 # code point stands for one that overflows the word size, so won't fit in 41 # 31 bits. 42 43 return shift > $highest_non_extended_utf8_cp; 44} 45 46sub is_extended_utf8($) { 47 48 # Returns a boolean as to whether or not the input UTF-8 sequence uses 49 # Perl extended UTF-8. 50 51 my $byte = substr(shift, 0, 1); 52 return ord $byte >= 0xFE if isASCII; 53 return $byte == I8_to_native("\xFF"); 54} 55 56sub overflow_discern_len($) { 57 58 # Returns how many bytes are needed to tell if a non-overlong UTF-8 59 # sequence is for a code point that won't fit in the platform's word size. 60 # Only the length of the sequence representing a single code point is 61 # needed. 62 63 if (isASCII) { 64 return ($::is64bit) ? 3 : 1; 65 66 # Below is needed for code points above IV_MAX 67 #return ($::is64bit) ? 3 : ((shift == $::max_bytes) 68 # ? 1 69 # : 2); 70 } 71 72 return ($::is64bit) ? 2 : 8; 73} 74 75sub overlong_discern_len($) { 76 77 # Returns how many bytes are needed to tell if the input UTF-8 sequence 78 # for a code point is overlong 79 80 my $string = shift; 81 my $length = length $string; 82 my $byte = ord native_to_I8(substr($string, 0, 1)); 83 if (isASCII) { 84 return ($byte >= 0xFE) 85 ? ((! $::is64bit) 86 ? 1 87 : ($byte == 0xFF) ? 7 : 2) 88 : (($length == 2) ? 1 : 2); 89 # Below is needed for code points above IV_MAX 90 #return ($length == $::max_bytes) 91 # # This is constrained to 1 on 32-bit machines, as it 92 # # overflows there 93 # ? (($::is64bit) ? 7 : 1) 94 # : (($length == 2) ? 1 : 2); 95 } 96 97 return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2); 98} 99 100my @tests; 101{ 102 no warnings qw(portable overflow); 103 @tests = ( 104 # $testname, 105 # $bytes, UTF-8 string 106 # $allowed_uv, code point $bytes evaluates to; -1 if 107 # overflows 108 # $needed_to_discern_len optional, how long an initial substring do 109 # we need to tell that the string must be for 110 # a code point in the category it falls in, 111 # like being a surrogate; 0 indicates we need 112 # the whole string. Some categories have a 113 # default that is used if this is omitted. 114 [ "orphan continuation byte malformation", 115 I8_to_native("$::I8c"), 116 0xFFFD, 117 1, 118 ], 119 [ "overlong malformation, lowest 2-byte", 120 (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"), 121 0, # NUL 122 ], 123 [ "overlong malformation, highest 2-byte", 124 (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"), 125 (isASCII) ? 0x7F : 0xFF, 126 ], 127 [ "overlong malformation, lowest 3-byte", 128 (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"), 129 0, # NUL 130 ], 131 [ "overlong malformation, highest 3-byte", 132 (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"), 133 (isASCII) ? 0x7FF : 0x3FF, 134 ], 135 [ "lowest surrogate", 136 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), 137 0xD800, 138 ], 139 [ "a middle surrogate", 140 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), 141 0xD90D, 142 ], 143 [ "highest surrogate", 144 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), 145 0xDFFF, 146 ], 147 [ "first of 32 consecutive non-character code points", 148 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), 149 0xFDD0, 150 ], 151 [ "a mid non-character code point of the 32 consecutive ones", 152 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), 153 0xFDE0, 154 ], 155 [ "final of 32 consecutive non-character code points", 156 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), 157 0xFDEF, 158 ], 159 [ "non-character code point U+FFFE", 160 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), 161 0xFFFE, 162 ], 163 [ "non-character code point U+FFFF", 164 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), 165 0xFFFF, 166 ], 167 [ "overlong malformation, lowest 4-byte", 168 (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"), 169 0, # NUL 170 ], 171 [ "overlong malformation, highest 4-byte", 172 (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"), 173 (isASCII) ? 0xFFFF : 0x3FFF, 174 ], 175 [ "non-character code point U+1FFFE", 176 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), 177 0x1FFFE, 178 ], 179 [ "non-character code point U+1FFFF", 180 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), 181 0x1FFFF, 182 ], 183 [ "non-character code point U+2FFFE", 184 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), 185 0x2FFFE, 186 ], 187 [ "non-character code point U+2FFFF", 188 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), 189 0x2FFFF, 190 ], 191 [ "non-character code point U+3FFFE", 192 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), 193 0x3FFFE, 194 ], 195 [ "non-character code point U+3FFFF", 196 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), 197 0x3FFFF, 198 ], 199 [ "non-character code point U+4FFFE", 200 (isASCII) 201 ? "\xf1\x8f\xbf\xbe" 202 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), 203 0x4FFFE, 204 ], 205 [ "non-character code point U+4FFFF", 206 (isASCII) 207 ? "\xf1\x8f\xbf\xbf" 208 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), 209 0x4FFFF, 210 ], 211 [ "non-character code point U+5FFFE", 212 (isASCII) 213 ? "\xf1\x9f\xbf\xbe" 214 : I8_to_native("\xf8\xab\xbf\xbf\xbe"), 215 0x5FFFE, 216 ], 217 [ "non-character code point U+5FFFF", 218 (isASCII) 219 ? "\xf1\x9f\xbf\xbf" 220 : I8_to_native("\xf8\xab\xbf\xbf\xbf"), 221 0x5FFFF, 222 ], 223 [ "non-character code point U+6FFFE", 224 (isASCII) 225 ? "\xf1\xaf\xbf\xbe" 226 : I8_to_native("\xf8\xad\xbf\xbf\xbe"), 227 0x6FFFE, 228 ], 229 [ "non-character code point U+6FFFF", 230 (isASCII) 231 ? "\xf1\xaf\xbf\xbf" 232 : I8_to_native("\xf8\xad\xbf\xbf\xbf"), 233 0x6FFFF, 234 ], 235 [ "non-character code point U+7FFFE", 236 (isASCII) 237 ? "\xf1\xbf\xbf\xbe" 238 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), 239 0x7FFFE, 240 ], 241 [ "non-character code point U+7FFFF", 242 (isASCII) 243 ? "\xf1\xbf\xbf\xbf" 244 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), 245 0x7FFFF, 246 ], 247 [ "non-character code point U+8FFFE", 248 (isASCII) 249 ? "\xf2\x8f\xbf\xbe" 250 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), 251 0x8FFFE, 252 ], 253 [ "non-character code point U+8FFFF", 254 (isASCII) 255 ? "\xf2\x8f\xbf\xbf" 256 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), 257 0x8FFFF, 258 ], 259 [ "non-character code point U+9FFFE", 260 (isASCII) 261 ? "\xf2\x9f\xbf\xbe" 262 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), 263 0x9FFFE, 264 ], 265 [ "non-character code point U+9FFFF", 266 (isASCII) 267 ? "\xf2\x9f\xbf\xbf" 268 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), 269 0x9FFFF, 270 ], 271 [ "non-character code point U+AFFFE", 272 (isASCII) 273 ? "\xf2\xaf\xbf\xbe" 274 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), 275 0xAFFFE, 276 ], 277 [ "non-character code point U+AFFFF", 278 (isASCII) 279 ? "\xf2\xaf\xbf\xbf" 280 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), 281 0xAFFFF, 282 ], 283 [ "non-character code point U+BFFFE", 284 (isASCII) 285 ? "\xf2\xbf\xbf\xbe" 286 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), 287 0xBFFFE, 288 ], 289 [ "non-character code point U+BFFFF", 290 (isASCII) 291 ? "\xf2\xbf\xbf\xbf" 292 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), 293 0xBFFFF, 294 ], 295 [ "non-character code point U+CFFFE", 296 (isASCII) 297 ? "\xf3\x8f\xbf\xbe" 298 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), 299 0xCFFFE, 300 ], 301 [ "non-character code point U+CFFFF", 302 (isASCII) 303 ? "\xf3\x8f\xbf\xbf" 304 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), 305 0xCFFFF, 306 ], 307 [ "non-character code point U+DFFFE", 308 (isASCII) 309 ? "\xf3\x9f\xbf\xbe" 310 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), 311 0xDFFFE, 312 ], 313 [ "non-character code point U+DFFFF", 314 (isASCII) 315 ? "\xf3\x9f\xbf\xbf" 316 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), 317 0xDFFFF, 318 ], 319 [ "non-character code point U+EFFFE", 320 (isASCII) 321 ? "\xf3\xaf\xbf\xbe" 322 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), 323 0xEFFFE, 324 ], 325 [ "non-character code point U+EFFFF", 326 (isASCII) 327 ? "\xf3\xaf\xbf\xbf" 328 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), 329 0xEFFFF, 330 ], 331 [ "non-character code point U+FFFFE", 332 (isASCII) 333 ? "\xf3\xbf\xbf\xbe" 334 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), 335 0xFFFFE, 336 ], 337 [ "non-character code point U+FFFFF", 338 (isASCII) 339 ? "\xf3\xbf\xbf\xbf" 340 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), 341 0xFFFFF, 342 ], 343 [ "non-character code point U+10FFFE", 344 (isASCII) 345 ? "\xf4\x8f\xbf\xbe" 346 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), 347 0x10FFFE, 348 ], 349 [ "non-character code point U+10FFFF", 350 (isASCII) 351 ? "\xf4\x8f\xbf\xbf" 352 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), 353 0x10FFFF, 354 ], 355 [ "first non_unicode", 356 (isASCII) 357 ? "\xf4\x90\x80\x80" 358 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), 359 0x110000, 360 2, 361 ], 362 [ "non_unicode whose first byte tells that", 363 (isASCII) 364 ? "\xf5\x80\x80\x80" 365 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), 366 (isASCII) ? 0x140000 : 0x200000, 367 1, 368 ], 369 [ "overlong malformation, lowest 5-byte", 370 (isASCII) 371 ? "\xf8\x80\x80\x80\x80" 372 : I8_to_native("\xf8\xa0\xa0\xa0\xa0"), 373 0, # NUL 374 ], 375 [ "overlong malformation, highest 5-byte", 376 (isASCII) 377 ? "\xf8\x87\xbf\xbf\xbf" 378 : I8_to_native("\xf8\xa7\xbf\xbf\xbf"), 379 (isASCII) ? 0x1FFFFF : 0x3FFFF, 380 ], 381 [ "overlong malformation, lowest 6-byte", 382 (isASCII) 383 ? "\xfc\x80\x80\x80\x80\x80" 384 : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"), 385 0, # NUL 386 ], 387 [ "overlong malformation, highest 6-byte", 388 (isASCII) 389 ? "\xfc\x83\xbf\xbf\xbf\xbf" 390 : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"), 391 (isASCII) ? 0x3FFFFFF : 0x3FFFFF, 392 ], 393 [ "overlong malformation, lowest 7-byte", 394 (isASCII) 395 ? "\xfe\x80\x80\x80\x80\x80\x80" 396 : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"), 397 0, # NUL 398 ], 399 [ "overlong malformation, highest 7-byte", 400 (isASCII) 401 ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf" 402 : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"), 403 (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF, 404 ], 405 [ "highest 31 bit code point", 406 (isASCII) 407 ? "\xfd\xbf\xbf\xbf\xbf\xbf" 408 : I8_to_native( 409 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"), 410 0x7FFFFFFF, 411 1, 412 ], 413 [ "lowest 32 bit code point", 414 (isASCII) 415 ? "\xfe\x82\x80\x80\x80\x80\x80" 416 : I8_to_native( 417 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), 418 ($::is64bit) ? 0x80000000 : -1, # Overflows on 32-bit systems 419 1, 420 ], 421 # Used when UV_MAX is allowed as a code point 422 #[ "highest 32 bit code point", 423 # (isASCII) 424 # ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" 425 # : I8_to_native( 426 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), 427 # 0xFFFFFFFF, 428 #], 429 #[ "Lowest 33 bit code point", 430 # (isASCII) 431 # ? "\xfe\x84\x80\x80\x80\x80\x80" 432 # : I8_to_native( 433 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"), 434 # ($::is64bit) ? 0x100000000 : 0x0, # Overflows on 32-bit systems 435 #], 436 ); 437 438 if (! $::is64bit) { 439 if (isASCII) { 440 push @tests, 441 [ "overlong malformation, but naively looks like overflow", 442 "\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf", 443 0x7FFFFFFF, 444 ], 445 # Used when above IV_MAX are allowed. 446 #[ "overlong malformation, but naively looks like overflow", 447 # "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf", 448 # 0xFFFFFFFF, 449 #], 450 [ "overflow that old algorithm failed to detect", 451 "\xfe\x86\x80\x80\x80\x80\x80", 452 -1, 453 ]; 454 } 455 } 456 457 push @tests, 458 [ "overlong malformation, lowest max-byte", 459 (isASCII) 460 ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" 461 : I8_to_native( 462 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 463 0, # NUL 464 ], 465 [ "overlong malformation, highest max-byte", 466 (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC 467 ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf" 468 : I8_to_native( 469 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"), 470 (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF, 471 ]; 472 473 if (isASCII) { 474 push @tests, 475 [ "Lowest code point requiring 13 bytes to represent", # 2**36 476 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", 477 ($::is64bit) ? 0x1000000000 : -1, # overflows on 32bit 478 ], 479 }; 480 481 if ($::is64bit) { 482 push @tests, 483 [ "highest 63 bit code point", 484 (isASCII) 485 ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" 486 : I8_to_native( 487 "\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"), 488 0x7FFFFFFFFFFFFFFF, 489 ], 490 [ "first 64 bit code point", 491 (isASCII) 492 ? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" 493 : I8_to_native( 494 "\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 495 -1, 496 ]; 497 # Used when UV_MAX is allowed as a code point 498 #[ "highest 64 bit code point", 499 # (isASCII) 500 # ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" 501 # : I8_to_native( 502 # "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"), 503 # 0xFFFFFFFFFFFFFFFF, 504 # (isASCII) ? 1 : 2, 505 #], 506 #[ "first 65 bit code point", 507 # (isASCII) 508 # ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" 509 # : I8_to_native( 510 # "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 511 # 0, 512 #]; 513 if (isASCII) { 514 push @tests, 515 [ "overflow that old algorithm failed to detect", 516 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", 517 -1, 518 ]; 519 } 520 else { 521 push @tests, # These could falsely show wrongly in a naive 522 # implementation 523 [ "requires at least 32 bits", 524 I8_to_native( 525 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 526 0x800000000, 527 ], 528 [ "requires at least 32 bits", 529 I8_to_native( 530 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 531 0x10000000000, 532 ], 533 [ "requires at least 32 bits", 534 I8_to_native( 535 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 536 0x200000000000, 537 ], 538 [ "requires at least 32 bits", 539 I8_to_native( 540 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 541 0x4000000000000, 542 ], 543 [ "requires at least 32 bits", 544 I8_to_native( 545 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 546 0x80000000000000, 547 ], 548 [ "requires at least 32 bits", 549 I8_to_native( 550 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), 551 0x1000000000000000, 552 ]; 553 } 554 } 555} 556 557sub flags_to_text($$) 558{ 559 my ($flags, $flags_to_text_ref) = @_; 560 561 # Returns a string containing a mnemonic representation of the bits that 562 # are set in the $flags. These are assumed to be flag bits. The return 563 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an 564 # array that gives the textual representation of all the possible flags. 565 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If 566 # no bits at all are set the string "0" is returned; 567 568 my @flag_text; 569 my $shift = 0; 570 571 return "0" if $flags == 0; 572 573 while ($flags) { 574 #diag sprintf "%x", $flags; 575 if ($flags & 1) { 576 push @flag_text, $flags_to_text_ref->[$shift]; 577 } 578 $shift++; 579 $flags >>= 1; 580 } 581 582 return join "|", @flag_text; 583} 584 585# Possible flag returns from utf8n_to_uvchr_error(). These should have G_, 586# instead of A_, D_, but the prefixes will be used in a later commit, so 587# minimize churn by having them here. 588my @utf8n_flags_to_text = ( qw( 589 A_EMPTY 590 A_CONTINUATION 591 A_NON_CONTINUATION 592 A_SHORT 593 A_LONG 594 A_LONG_AND_ITS_VALUE 595 PLACEHOLDER 596 A_OVERFLOW 597 D_SURROGATE 598 W_SURROGATE 599 D_NONCHAR 600 W_NONCHAR 601 D_SUPER 602 W_SUPER 603 D_PERL_EXTENDED 604 W_PERL_EXTENDED 605 CHECK_ONLY 606 NO_CONFIDENCE_IN_CURLEN_ 607 ) ); 608 609sub utf8n_display_call($) 610{ 611 # Converts an eval string that calls test_utf8n_to_uvchr into a more human 612 # readable form, and returns it. Doesn't work if the byte string contains 613 # an apostrophe. The return will look something like: 614 # test_utf8n_to_uvchr_error('$bytes', $length, $flags) 615 #diag $_[0]; 616 617 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x; 618 my $text1 = $1; # Everything before the byte string 619 my $bytes = $2; 620 my $text2 = $3; # Includes the length 621 my $flags = $4; 622 623 return $text1 624 . display_bytes($bytes) 625 . $text2 626 . flags_to_text($flags, \@utf8n_flags_to_text) 627 . ')'; 628} 629 630my @uvchr_flags_to_text = ( qw( 631 W_SURROGATE 632 W_NONCHAR 633 W_SUPER 634 W_PERL_EXTENDED 635 D_SURROGATE 636 D_NONCHAR 637 D_SUPER 638 D_PERL_EXTENDED 639) ); 640 641sub uvchr_display_call($) 642{ 643 # Converts an eval string that calls test_uvchr_to_utf8 into a more human 644 # readable form, and returns it. The return will look something like: 645 # test_uvchr_to_utf8n_flags($uv, $flags) 646 #diag $_[0]; 647 648 649 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x; 650 my $text = $1; 651 my $cp = sprintf "%X", $2; 652 my $flags = $3; 653 654 return "${text}0x$cp, " . flags_to_text($flags, \@uvchr_flags_to_text) . ')'; 655} 656 657sub do_warnings_test(@) 658{ 659 my @expected_warnings = @_; 660 661 # Compares the input expected warnings array with @warnings_gotten, 662 # generating a pass for each found, removing it from @warnings_gotten. 663 # Any discrepancies generate test failures. Returns TRUE if no 664 # discrepcancies; otherwise FALSE. 665 666 my $succeeded = 1; 667 668 if (@expected_warnings == 0) { 669 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) { 670 output_warnings(@warnings_gotten); 671 $succeeded = 0; 672 } 673 return $succeeded; 674 } 675 676 # Check that we got all the expected warnings, 677 # removing each one found 678 WARNING: 679 foreach my $expected (@expected_warnings) { 680 foreach (my $i = 0; $i < @warnings_gotten; $i++) { 681 if ($warnings_gotten[$i] =~ $expected) { 682 pass(" Expected and got warning: " 683 . " $warnings_gotten[$i]"); 684 splice @warnings_gotten, $i, 1; 685 next WARNING; 686 } 687 } 688 fail(" Expected a warning that matches " 689 . $expected . " but didn't get it"); 690 $succeeded = 0; 691 } 692 693 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) { 694 output_warnings(@warnings_gotten); 695 $succeeded = 0; 696 } 697 698 return $succeeded; 699} 700 701my $min_cont = $::lowest_continuation; 702my $continuation_shift = (isASCII) ? 6 : 5; 703my $continuation_mask = (1 << $continuation_shift) - 1; 704 705sub isUTF8_CHAR($$) { # Uses first principals to determine if this I8 input 706 # is legal. (Doesn't work if overflows) 707 my ($native, $length) = @_; 708 my $i8 = native_to_I8($native); 709 710 # Uses first principals to calculate if $i8 is legal 711 712 return 0 if $length <= 0; 713 714 my $first = ord substr($i8, 0, 1); 715 716 # Invariant 717 return 1 if $length == 1 && $first < $min_cont; 718 719 return 0 if $first < 0xC0; # Starts with continuation 720 721 # Calculate the number of leading 1 bits 722 my $utf8skip = 0; 723 my $bits = $first; 724 do { 725 $utf8skip++; 726 $bits = ($bits << 1) & 0xFF; 727 } while ($bits & 0x80); 728 729 return 0 if $utf8skip != $length; 730 731 # Accumulate the $code point. The remaining bits in the start byte count 732 # towards it 733 my $cp = $bits >> $utf8skip; 734 735 for my $i (1 .. $length - 1) { 736 my $ord = ord substr($i8, $i, 1); 737 738 # Wrong if not a continuation 739 return 0 if $ord < $min_cont || $ord >= 0xC0; 740 741 $cp = ($cp << $continuation_shift) 742 | ($ord & $continuation_mask); 743 } 744 745 # If the calculated value can be expressed in fewer bytes than were passed 746 # in, is an illegal overlong. XXX if 'chr' is not working properly, this 747 # may not be right 748 my $chr = uni_to_native(chr $cp); 749 utf8::upgrade($chr); 750 751 use bytes; 752 return 0 if length $chr < $length; 753 754 # Also, its possible on EBCDIC platforms that have more illegal start 755 # bytes than ASCII ones (like C3, C4) for something to have the same 756 # length but still be overlong. We make sure the first byte isn't smaller 757 # than the first byte of the real representation. 758 return 0 if substr($native, 0, 1) lt substr($chr, 0, 1); 759 760 return 1; 761} 762 763sub start_mark($) { 764 my $len = shift; 765 return 0xFF if $len > 7; 766 return (0xFF & (0xFE << (7 - $len))); 767} 768 769sub start_mask($) { 770 my $len = shift; 771 return 0 if $len > 7; 772 return 0x1F >> ($len - 2); 773} 774 775# This test is split into this number of files. 776my $num_test_files = $ENV{TEST_JOBS} || 1; 777$num_test_files = 10 if $num_test_files > 10; 778 779# We only really need to test utf8n_to_uvchr_msgs() once with this flag. 780my $tested_CHECK_ONLY = 0; 781 782my $test_count = -1; 783 784# By setting this environment variable to this particular value, we test 785# essentially all combinations of potential UTF-8, so that can get a 786# comprehensive test of the decoding routine. This test assumes the routine 787# that does the translation from code point to UTF-8 is working. An assert 788# can be used in the routine to make sure that the dfa is working precisely 789# correctly, and any flaws in it aren't being masked by the remainder of the 790# function. 791if ($::TEST_CHUNK == 0 792&& $ENV{PERL_DEBUG_FULL_TEST} 793&& $ENV{PERL_DEBUG_FULL_TEST} == 97) 794{ 795 # We construct UTF-8 (I8 on EBCDIC platforms converted later to native) 796 797 my $min_cont_mask = $min_cont | 0xF; 798 my @bytes = ( 0, # Placeholder to signify to use an empty string "" 799 0x41, # We assume that all the invariant characters are 800 # properly in the same class, so this is an exemplar 801 # character 802 $min_cont .. 0xFF # But test every non-invariant individually 803 ); 804 my $mark = $min_cont; 805 my $mask = (1 << $continuation_shift) - 1; 806 for my $byte1 (@bytes) { 807 for my $byte2 (@bytes) { 808 last if $byte2 && ! $byte1; # Don't test empty preceding byte 809 810 last if $byte2 && $byte1 < 0xC0; # No need to test more than a 811 # single byte unless start byte 812 # indicates those. 813 814 for my $byte3 (@bytes) { 815 last if $byte3 && ! $byte2; 816 last if $byte3 && $byte1 < 0xE0; # Only test 3 bytes for 817 # 3-byte start byte 818 819 # If the preceding byte is a start byte, it should fail, and 820 # there is no need to test illegal bytes that follow. 821 # Instead, limit ourselves to just a few legal bytes that 822 # could follow. This cuts down tremendously on the number of 823 # tests executed. 824 next if $byte2 >= 0xC0 825 && $byte3 >= $min_cont 826 && ($byte3 & $min_cont_mask) != $min_cont; 827 828 for my $byte4 (@bytes) { 829 last if $byte4 && ! $byte3; 830 last if $byte4 && $byte1 < 0xF0; # Only test 4 bytes for 831 # 4 byte strings 832 833 # Like for byte 3, we limit things that come after a 834 # mispositioned start-byte to just a few things that 835 # otherwise would be legal 836 next if ($byte2 >= 0xC0 || $byte3 >= 0xC0) 837 && $byte4 >= $min_cont 838 && ($byte4 & $min_cont_mask) != $min_cont; 839 840 for my $byte5 (@bytes) { 841 last if $byte5 && ! $byte4; 842 last if $byte5 && $byte1 < 0xF8; # Only test 5 bytes for 843 # 5 byte strings 844 845 # Like for byte 4, we limit things that come after a 846 # mispositioned start-byte to just a few things that 847 # otherwise would be legal 848 next if ( $byte2 >= 0xC0 849 || $byte3 >= 0xC0 850 || $byte4 >= 0xC0) 851 && $byte4 >= $min_cont 852 && ($byte4 & $min_cont_mask) != $min_cont; 853 854 my $string = ""; 855 $string .= chr $byte1 if $byte1; 856 $string .= chr $byte2 if $byte2; 857 $string .= chr $byte3 if $byte3; 858 $string .= chr $byte4 if $byte4; 859 $string .= chr $byte5 if $byte5; 860 861 my $length = length $string; 862 next unless $length; 863 last if $byte1 >= ((isASCII) ? 0xF6 : 0xFA); 864 865 my $native = I8_to_native($string); 866 my $is_valid = isUTF8_CHAR($native, $length); 867 my $got_valid = test_isUTF8_CHAR($native, $length); 868 my $got_strict 869 = test_isSTRICT_UTF8_CHAR($native, $length); 870 my $got_C9 871 = test_isC9_STRICT_UTF8_CHAR($native, $length); 872 my $ret = test_utf8n_to_uvchr_msgs($native, $length, 873 $::UTF8_WARN_ILLEGAL_INTERCHANGE); 874 my $is_strict = $is_valid; 875 my $is_C9 = $is_valid; 876 877 if ($is_valid) { 878 879 # Here, is legal UTF-8. Verify that it returned 880 # the correct code point, and if so, that it 881 # correctly classifies the result. 882 my $cp = $ret->[0]; 883 884 my $should_be_string; 885 if ($length == 1) { 886 $should_be_string = native_to_I8(chr $cp); 887 } 888 else { 889 890 # Starting with the code point, use first 891 # principals to find the equivalent I8 string 892 my @bytes; 893 my $uv = ord native_to_uni(chr $cp); 894 for (my $i = $length - 1; $i > 0; $i--) { 895 $bytes[$i] = chr (($uv & $mask) | $mark); 896 $uv >>= $continuation_shift; 897 } 898 $bytes[0] = chr ($uv & start_mask($length) 899 | start_mark($length)); 900 $should_be_string = join "", @bytes; 901 } 902 903 # If the original string and the inverse are the 904 # same, it worked. 905 my $test_name = "utf8n_to_uvchr_msgs(" 906 . display_bytes($native) 907 . ") yields " 908 . sprintf ("0x%x", $cp) 909 . "; does its I8 eq original"; 910 if (is($should_be_string, $string, $test_name)) { 911 my $is_surrogate = $cp >= 0xD800 912 && $cp <= 0xDFFF; 913 my $got_surrogate 914 = ($ret->[2] & $::UTF8_GOT_SURROGATE) != 0; 915 $is_strict = 0 if $is_surrogate; 916 $is_C9 = 0 if $is_surrogate; 917 918 my $is_super = $cp > 0x10FFFF; 919 my $got_super 920 = ($ret->[2] & $::UTF8_GOT_SUPER) != 0; 921 $is_strict = 0 if $is_super; 922 $is_C9 = 0 if $is_super; 923 924 my $is_nonchar = ! $is_super 925 && ( ($cp & 0xFFFE) == 0xFFFE 926 || ($cp >= 0xFDD0 && $cp <= 0xFDEF)); 927 my $got_nonchar 928 = ($ret->[2] & $::UTF8_GOT_NONCHAR) != 0; 929 $is_strict = 0 if $is_nonchar; 930 931 is($got_surrogate, $is_surrogate, 932 " And correctly flagged it as" 933 . ((! $is_surrogate) ? " not" : "") 934 . " being a surrogate"); 935 is($got_super, $is_super, 936 " And correctly flagged it as" 937 . ((! $is_super) ? " not" : "") 938 . " being above Unicode"); 939 is($got_nonchar, $is_nonchar, 940 " And correctly flagged it as" 941 . ((! $is_nonchar) ? " not" : "") 942 . " being a non-char"); 943 } 944 945 # This is how we exit the loop normally if things 946 # are working. The fail-safe code above is used 947 # when they aren't. 948 goto done if $cp > 0x140001; 949 } 950 else { 951 is($ret->[0], 0, "utf8n_to_uvchr_msgs(" 952 . display_bytes($native) 953 . ") correctly returns error"); 954 if (! ($ret->[2] & ($::UTF8_GOT_SHORT 955 |$::UTF8_GOT_NON_CONTINUATION 956 |$::UTF8_GOT_LONG))) 957 { 958 is($ret->[2] & ( $::UTF8_GOT_NONCHAR 959 |$::UTF8_GOT_SURROGATE 960 |$::UTF8_GOT_SUPER), 0, 961 " And isn't a surrogate, non-char, nor" 962 . " above Unicode"); 963 } 964 } 965 966 is($got_valid == 0, $is_valid == 0, 967 " And isUTF8_CHAR() correctly returns " 968 . (($got_valid == 0) ? "0" : "non-zero")); 969 is($got_strict == 0, $is_strict == 0, 970 " And isSTRICT_UTF8_CHAR() correctly returns " 971 . (($got_strict == 0) ? "0" : "non-zero")); 972 is($got_C9 == 0, $is_C9 == 0, 973 " And isC9_UTF8_CHAR() correctly returns " 974 . (($got_C9 == 0) ? "0" : "non-zero")); 975 } 976 } 977 } 978 } 979 } 980 done: 981} 982 983foreach my $test (@tests) { 984 $test_count++; 985 next if $test_count % $num_test_files != $::TEST_CHUNK; 986 987 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test; 988 989 my $length = length $bytes; 990 my $initially_overlong = $testname =~ /overlong/; 991 my $initially_orphan = $testname =~ /orphan/; 992 my $will_overflow = $allowed_uv < 0; 993 994 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv); 995 my $display_bytes = display_bytes($bytes); 996 997 my $controlling_warning_category; 998 my $utf8n_flag_to_warn; 999 my $utf8n_flag_to_disallow; 1000 my $uvchr_flag_to_warn; 1001 my $uvchr_flag_to_disallow; 1002 1003 # We want to test that the independent flags are actually independent. 1004 # For example, that a surrogate doesn't trigger a non-character warning, 1005 # and conversely, turning off an above-Unicode flag doesn't suppress a 1006 # surrogate warning. Earlier versions of this file used nested loops to 1007 # test all possible combinations. But that creates lots of tests, making 1008 # this run too long. What is now done instead is to use the complement of 1009 # the category we are testing to greatly reduce the combinatorial 1010 # explosion. For example, if we have a surrogate and we aren't expecting 1011 # a warning about it, we set all the flags for non-surrogates to raise 1012 # warnings. If one shows up, it indicates the flags aren't independent. 1013 my $utf8n_flag_to_warn_complement; 1014 my $utf8n_flag_to_disallow_complement; 1015 my $uvchr_flag_to_warn_complement; 1016 my $uvchr_flag_to_disallow_complement; 1017 1018 # Many of the code points being tested are middling in that if code point 1019 # edge cases work, these are very likely to as well. Because this test 1020 # file takes a while to execute, we skip testing the edge effects of code 1021 # points deemed middling, while testing their basics and continuing to 1022 # fully test the non-middling code points. 1023 my $skip_most_tests = 0; 1024 1025 my $cp_message_qr; # Pattern that matches the message raised when 1026 # that message contains the problematic code 1027 # point. The message is the same (currently) both 1028 # when going from/to utf8. 1029 my $non_cp_trailing_text; # The suffix text when the message doesn't 1030 # contain a code point. (This is a result of 1031 # some sort of malformation that means we 1032 # can't get an exact code poin 1033 my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E 1034 \Q requires a Perl extension, and so is not\E 1035 \Q portable\E/x; 1036 my $extended_non_cp_trailing_text 1037 = "is a Perl extension, and so is not portable"; 1038 1039 # What bytes should have been used to specify a code point that has been 1040 # specified as an overlong. 1041 my $correct_bytes_for_overlong; 1042 1043 # Is this test malformed from the beginning? If so, we know to generally 1044 # expect that the tests will show it isn't valid. 1045 my $initially_malformed = 0; 1046 1047 if ($initially_overlong || $initially_orphan) { 1048 $non_cp_trailing_text = "if you see this, there is an error"; 1049 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; 1050 $initially_malformed = 1; 1051 $utf8n_flag_to_warn = 0; 1052 $utf8n_flag_to_disallow = 0; 1053 1054 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE; 1055 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE; 1056 if (! $will_overflow && $allowed_uv <= 0x10FFFF) { 1057 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER; 1058 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER; 1059 if (($allowed_uv & 0xFFFF) != 0xFFFF) { 1060 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR; 1061 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR; 1062 } 1063 } 1064 if (! is_extended_utf8($bytes)) { 1065 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; 1066 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED; 1067 } 1068 1069 $controlling_warning_category = 'utf8'; 1070 1071 if ($initially_overlong) { 1072 if (! defined $needed_to_discern_len) { 1073 $needed_to_discern_len = overlong_discern_len($bytes); 1074 } 1075 $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv); 1076 } 1077 } 1078 elsif($will_overflow || $allowed_uv > 0x10FFFF) { 1079 1080 # Set the SUPER flags; later, we test for PERL_EXTENDED as well. 1081 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER; 1082 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER; 1083 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER; 1084 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;; 1085 1086 # Below, we add the flags for non-perl_extended to the code points 1087 # that don't fit that category. Special tests are done for this 1088 # category in the inner loop. 1089 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR 1090 |$::UTF8_WARN_SURROGATE; 1091 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR 1092 |$::UTF8_DISALLOW_SURROGATE; 1093 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR 1094 |$::UNICODE_WARN_SURROGATE; 1095 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR 1096 |$::UNICODE_DISALLOW_SURROGATE; 1097 $controlling_warning_category = 'non_unicode'; 1098 1099 if ($will_overflow) { # This is realy a malformation 1100 $non_cp_trailing_text = "if you see this, there is an error"; 1101 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; 1102 $initially_malformed = 1; 1103 if (! defined $needed_to_discern_len) { 1104 $needed_to_discern_len = overflow_discern_len($length); 1105 } 1106 } 1107 elsif (requires_extended_utf8($allowed_uv)) { 1108 $cp_message_qr = $extended_cp_message_qr; 1109 $non_cp_trailing_text = $extended_non_cp_trailing_text; 1110 $needed_to_discern_len = 1 unless defined $needed_to_discern_len; 1111 } 1112 else { 1113 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E 1114 \Q may not be portable\E/x; 1115 $non_cp_trailing_text = "is for a non-Unicode code point, may not" 1116 . " be portable"; 1117 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED; 1118 $utf8n_flag_to_disallow_complement 1119 |= $::UTF8_DISALLOW_PERL_EXTENDED; 1120 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED; 1121 $uvchr_flag_to_disallow_complement 1122 |= $::UNICODE_DISALLOW_PERL_EXTENDED; 1123 } 1124 } 1125 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) { 1126 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/; 1127 $non_cp_trailing_text = "is for a surrogate"; 1128 $needed_to_discern_len = 2 unless defined $needed_to_discern_len; 1129 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF; 1130 1131 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE; 1132 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE; 1133 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE; 1134 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;; 1135 1136 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR 1137 |$::UTF8_WARN_SUPER 1138 |$::UTF8_WARN_PERL_EXTENDED; 1139 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR 1140 |$::UTF8_DISALLOW_SUPER 1141 |$::UTF8_DISALLOW_PERL_EXTENDED; 1142 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR 1143 |$::UNICODE_WARN_SUPER 1144 |$::UNICODE_WARN_PERL_EXTENDED; 1145 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR 1146 |$::UNICODE_DISALLOW_SUPER 1147 |$::UNICODE_DISALLOW_PERL_EXTENDED; 1148 $controlling_warning_category = 'surrogate'; 1149 } 1150 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF) 1151 || ($allowed_uv & 0xFFFE) == 0xFFFE) 1152 { 1153 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E 1154 \Q is not recommended for open interchange\E/x; 1155 $non_cp_trailing_text = "if you see this, there is an error"; 1156 $needed_to_discern_len = $length unless defined $needed_to_discern_len; 1157 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF) 1158 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE)) 1159 { 1160 $skip_most_tests = 1; 1161 } 1162 1163 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR; 1164 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR; 1165 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR; 1166 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;; 1167 1168 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE 1169 |$::UTF8_WARN_SUPER 1170 |$::UTF8_WARN_PERL_EXTENDED; 1171 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE 1172 |$::UTF8_DISALLOW_SUPER 1173 |$::UTF8_DISALLOW_PERL_EXTENDED; 1174 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE 1175 |$::UNICODE_WARN_SUPER 1176 |$::UNICODE_WARN_PERL_EXTENDED; 1177 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE 1178 |$::UNICODE_DISALLOW_SUPER 1179 |$::UNICODE_DISALLOW_PERL_EXTENDED; 1180 1181 $controlling_warning_category = 'nonchar'; 1182 } 1183 else { 1184 die "Can't figure out what type of warning to test for $testname" 1185 } 1186 1187 die 'Didn\'t set $needed_to_discern_len for ' . $testname 1188 unless defined $needed_to_discern_len; 1189 1190 # We try various combinations of malformations that can occur 1191 foreach my $short (0, 1) { 1192 next if $skip_most_tests && $short; 1193 foreach my $unexpected_noncont (0, 1) { 1194 next if $skip_most_tests && $unexpected_noncont; 1195 foreach my $overlong (0, 1) { 1196 next if $overlong && $skip_most_tests; 1197 next if $initially_overlong && ! $overlong; 1198 1199 # If we're creating an overlong, it can't be longer than the 1200 # maximum length, so skip if we're already at that length. 1201 next if (! $initially_overlong && $overlong) 1202 && $length >= $::max_bytes; 1203 1204 my $this_cp_message_qr = $cp_message_qr; 1205 my $this_non_cp_trailing_text = $non_cp_trailing_text; 1206 1207 foreach my $malformed_allow_type (0..2) { 1208 # 0 don't allow this malformation; ignored if no malformation 1209 # 1 allow, with REPLACEMENT CHARACTER returned 1210 # 2 allow, with intended code point returned. All malformations 1211 # other than overlong can't determine the intended code point, 1212 # so this isn't valid for them. 1213 next if $malformed_allow_type == 2 1214 && ($will_overflow || $short || $unexpected_noncont); 1215 next if $skip_most_tests && $malformed_allow_type; 1216 1217 # Here we are in the innermost loop for malformations. So we 1218 # know which ones are in effect. Can now change the input to be 1219 # appropriately malformed. We also can set up certain other 1220 # things now, like whether we expect a return flag from this 1221 # malformation, and which flag. 1222 1223 my $this_bytes = $bytes; 1224 my $this_length = $length; 1225 my $this_expected_len = $length; 1226 my $this_needed_to_discern_len = $needed_to_discern_len; 1227 1228 my @malformation_names; 1229 my @expected_malformation_warnings; 1230 my @expected_malformation_return_flags; 1231 1232 # Contains the flags for any allowed malformations. Currently no 1233 # combinations of on/off are tested for. It's either all are 1234 # allowed, or none are. 1235 my $allow_flags = 0; 1236 my $overlong_is_in_perl_extended_utf8 = 0; 1237 my $dont_use_overlong_cp = 0; 1238 1239 if ($initially_orphan) { 1240 next if $overlong || $short || $unexpected_noncont; 1241 } 1242 1243 if ($overlong) { 1244 if (! $initially_overlong) { 1245 my $new_expected_len; 1246 1247 # To force this malformation, we convert the original start 1248 # byte into a continuation byte with the same data bits as 1249 # originally. ... 1250 my $start_byte = substr($this_bytes, 0, 1); 1251 my $converted_to_continuation_byte 1252 = start_byte_to_cont($start_byte); 1253 1254 # ... Then we prepend it with a known overlong sequence. 1255 # This should evaluate to the exact same code point as the 1256 # original. We try to avoid an overlong using Perl 1257 # extended UTF-8. The code points are the highest 1258 # representable as overlongs on the respective platform 1259 # without using extended UTF-8. 1260 if (native_to_I8($start_byte) lt "\xFC") { 1261 $start_byte = I8_to_native("\xFC"); 1262 $new_expected_len = 6; 1263 } 1264 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") { 1265 1266 # FE is not extended UTF-8 on EBCDIC 1267 $start_byte = I8_to_native("\xFE"); 1268 $new_expected_len = 7; 1269 } 1270 else { # Must use extended UTF-8. On ASCII platforms, we 1271 # could express some overlongs here starting with 1272 # \xFE, but there's no real reason to do so. 1273 $overlong_is_in_perl_extended_utf8 = 1; 1274 $start_byte = I8_to_native("\xFF"); 1275 $new_expected_len = $::max_bytes; 1276 $this_cp_message_qr = $extended_cp_message_qr; 1277 1278 # The warning that gets raised doesn't include the 1279 # code point in the message if the code point can be 1280 # expressed without using extended UTF-8, but the 1281 # particular overlong sequence used is in extended 1282 # UTF-8. To do otherwise would be confusing to the 1283 # user, as it would claim the code point requires 1284 # extended, when it doesn't. 1285 $dont_use_overlong_cp = 1 1286 unless requires_extended_utf8($allowed_uv); 1287 $this_non_cp_trailing_text 1288 = $extended_non_cp_trailing_text; 1289 } 1290 1291 # Splice in the revise continuation byte, preceded by the 1292 # start byte and the proper number of the lowest 1293 # continuation bytes. 1294 $this_bytes = $start_byte 1295 . ($native_lowest_continuation_chr 1296 x ( $new_expected_len 1297 - 1 1298 - length($this_bytes))) 1299 . $converted_to_continuation_byte 1300 . substr($this_bytes, 1); 1301 $this_length = length($this_bytes); 1302 $this_needed_to_discern_len = $new_expected_len 1303 - ( $this_expected_len 1304 - $this_needed_to_discern_len); 1305 $this_expected_len = $new_expected_len; 1306 } 1307 } 1308 1309 if ($short) { 1310 1311 # To force this malformation, just tell the test to not look 1312 # as far as it should into the input. 1313 $this_length--; 1314 $this_expected_len--; 1315 1316 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type; 1317 } 1318 1319 if ($unexpected_noncont) { 1320 1321 # To force this malformation, change the final continuation 1322 # byte into a start byte. 1323 my $pos = ($short) ? -2 : -1; 1324 substr($this_bytes, $pos, 1) = $known_start_byte; 1325 $this_expected_len--; 1326 } 1327 1328 # The whole point of a test that is malformed from the beginning 1329 # is to test for that malformation. If we've modified things so 1330 # much that we don't have enough information to detect that 1331 # malformation, there's no point in testing. 1332 next if $initially_malformed 1333 && $this_expected_len < $this_needed_to_discern_len; 1334 1335 # Here, we've transformed the input with all of the desired 1336 # non-overflow malformations. We are now in a position to 1337 # construct any potential warnings for those malformations. But 1338 # it's a pain to get the detailed messages exactly right, so for 1339 # now XXX, only do so for those that return an explicit code 1340 # point. 1341 1342 if ($initially_orphan) { 1343 push @malformation_names, "orphan continuation"; 1344 push @expected_malformation_return_flags, 1345 $::UTF8_GOT_CONTINUATION; 1346 $allow_flags |= $::UTF8_ALLOW_CONTINUATION 1347 if $malformed_allow_type; 1348 push @expected_malformation_warnings, qr/unexpected continuation/; 1349 } 1350 1351 if ($overlong) { 1352 push @malformation_names, 'overlong'; 1353 push @expected_malformation_return_flags, $::UTF8_GOT_LONG; 1354 1355 # If one of the other malformation types is also in effect, we 1356 # don't know what the intended code point was. 1357 if ($short || $unexpected_noncont || $will_overflow) { 1358 push @expected_malformation_warnings, qr/overlong/; 1359 } 1360 else { 1361 my $wrong_bytes = display_bytes_no_quotes( 1362 substr($this_bytes, 0, $this_length)); 1363 if (! defined $correct_bytes_for_overlong) { 1364 $correct_bytes_for_overlong 1365 = display_bytes_no_quotes($bytes); 1366 } 1367 my $prefix = ( $allowed_uv > 0x10FFFF 1368 || ! isASCII && $allowed_uv < 256) 1369 ? "0x" 1370 : "U+"; 1371 push @expected_malformation_warnings, 1372 qr/\QMalformed UTF-8 character: $wrong_bytes\E 1373 \Q (overlong; instead use\E 1374 \Q $correct_bytes_for_overlong to\E 1375 \Q represent $prefix$uv_string)/x; 1376 } 1377 1378 if ($malformed_allow_type == 2) { 1379 $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE; 1380 } 1381 elsif ($malformed_allow_type) { 1382 $allow_flags |= $::UTF8_ALLOW_LONG; 1383 } 1384 } 1385 if ($short) { 1386 push @malformation_names, 'short'; 1387 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT; 1388 push @expected_malformation_warnings, qr/too short/; 1389 } 1390 if ($unexpected_noncont) { 1391 push @malformation_names, 'unexpected non-continuation'; 1392 push @expected_malformation_return_flags, 1393 $::UTF8_GOT_NON_CONTINUATION; 1394 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION 1395 if $malformed_allow_type; 1396 push @expected_malformation_warnings, 1397 qr/unexpected non-continuation byte/; 1398 } 1399 1400 # The overflow malformation is done differently than other 1401 # malformations. It comes from manually typed tests in the test 1402 # array. We now make it be treated like one of the other 1403 # malformations. But some has to be deferred until the inner loop 1404 my $overflow_msg_pattern; 1405 if ($will_overflow) { 1406 push @malformation_names, 'overflow'; 1407 1408 $overflow_msg_pattern = display_bytes_no_quotes( 1409 substr($this_bytes, 0, $this_expected_len)); 1410 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E 1411 \Q $overflow_msg_pattern\E 1412 \Q (overflows)\E/x; 1413 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW; 1414 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type; 1415 } 1416 1417 # And we can create the malformation-related text for the test 1418 # names we eventually will generate. 1419 my $malformations_name = ""; 1420 if (@malformation_names) { 1421 $malformations_name .= "dis" unless $malformed_allow_type; 1422 $malformations_name .= "allowed "; 1423 $malformations_name .= "malformation"; 1424 $malformations_name .= "s" if @malformation_names > 1; 1425 $malformations_name .= ": "; 1426 $malformations_name .= join "/", @malformation_names; 1427 $malformations_name = " ($malformations_name)"; 1428 } 1429 1430 # Done setting up the malformation related stuff 1431 1432 { # First test the isFOO calls 1433 use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings 1434 undef @warnings_gotten; 1435 1436 my $ret = test_isUTF8_CHAR($this_bytes, $this_length); 1437 my $ret_flags 1438 = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0); 1439 if ($malformations_name) { 1440 is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0"); 1441 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0"); 1442 } 1443 else { 1444 is($ret, $this_length, "For $testname: isUTF8_CHAR() returns" 1445 . " expected length: $this_length"); 1446 is($ret_flags, $this_length, 1447 " And isUTF8_CHAR_flags(...,0) returns expected" 1448 . " length: $this_length"); 1449 } 1450 is(scalar @warnings_gotten, 0, 1451 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags" 1452 . " generated any warnings") 1453 or output_warnings(@warnings_gotten); 1454 1455 undef @warnings_gotten; 1456 $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length); 1457 if ($malformations_name) { 1458 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0"); 1459 } 1460 else { 1461 my $expected_ret 1462 = ( $testname =~ /surrogate|non-character/ 1463 || $allowed_uv > 0x10FFFF) 1464 ? 0 1465 : $this_length; 1466 is($ret, $expected_ret, 1467 " And isSTRICT_UTF8_CHAR() returns expected" 1468 . " length: $expected_ret"); 1469 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, 1470 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); 1471 is($ret, $expected_ret, 1472 " And isUTF8_CHAR_flags('" 1473 . "DISALLOW_ILLEGAL_INTERCHANGE') acts like" 1474 . " isSTRICT_UTF8_CHAR"); 1475 } 1476 is(scalar @warnings_gotten, 0, 1477 " And neither isSTRICT_UTF8_CHAR() nor" 1478 . " isUTF8_CHAR_flags generated any warnings") 1479 or output_warnings(@warnings_gotten); 1480 1481 undef @warnings_gotten; 1482 $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length); 1483 if ($malformations_name) { 1484 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0"); 1485 } 1486 else { 1487 my $expected_ret = ( $testname =~ /surrogate/ 1488 || $allowed_uv > 0x10FFFF) 1489 ? 0 1490 : $this_expected_len; 1491 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()" 1492 . " returns expected length:" 1493 . " $expected_ret"); 1494 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, 1495 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); 1496 is($ret, $expected_ret, 1497 " And isUTF8_CHAR_flags('" 1498 . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like" 1499 . " isC9_STRICT_UTF8_CHAR"); 1500 } 1501 is(scalar @warnings_gotten, 0, 1502 " And neither isC9_STRICT_UTF8_CHAR() nor" 1503 . " isUTF8_CHAR_flags generated any warnings") 1504 or output_warnings(@warnings_gotten); 1505 1506 foreach my $disallow_type (0..2) { 1507 # 0 is don't disallow this type of code point 1508 # 1 is do disallow 1509 # 2 is do disallow, but only code points requiring 1510 # perl-extended-UTF8 1511 1512 my $disallow_flags; 1513 my $expected_ret; 1514 1515 if ($malformations_name) { 1516 1517 # Malformations are by default disallowed, so testing 1518 # with $disallow_type equal to 0 is sufficicient. 1519 next if $disallow_type; 1520 1521 $disallow_flags = 0; 1522 $expected_ret = 0; 1523 } 1524 elsif ($disallow_type == 1) { 1525 $disallow_flags = $utf8n_flag_to_disallow; 1526 $expected_ret = 0; 1527 } 1528 elsif ($disallow_type == 2) { 1529 next if ! requires_extended_utf8($allowed_uv); 1530 $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED; 1531 $expected_ret = 0; 1532 } 1533 else { # type is 0 1534 $disallow_flags = $utf8n_flag_to_disallow_complement; 1535 $expected_ret = $this_length; 1536 } 1537 1538 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length, 1539 $disallow_flags); 1540 is($ret, $expected_ret, 1541 " And isUTF8_CHAR_flags($display_bytes," 1542 . " $disallow_flags) returns $expected_ret") 1543 or diag "The flags mean " 1544 . flags_to_text($disallow_flags, 1545 \@utf8n_flags_to_text); 1546 is(scalar @warnings_gotten, 0, 1547 " And isUTF8_CHAR_flags(...) generated" 1548 . " no warnings") 1549 or output_warnings(@warnings_gotten); 1550 1551 # Test partial character handling, for each byte not a 1552 # full character 1553 my $did_test_partial = 0; 1554 for (my $j = 1; $j < $this_length - 1; $j++) { 1555 $did_test_partial = 1; 1556 my $partial = substr($this_bytes, 0, $j); 1557 my $ret_should_be; 1558 my $comment; 1559 if ($disallow_type || $malformations_name) { 1560 $ret_should_be = 0; 1561 $comment = "disallowed"; 1562 1563 # The number of bytes required to tell if a 1564 # sequence has something wrong is the smallest of 1565 # all the things wrong with it. We start with the 1566 # number for this type of code point, if that is 1567 # disallowed; or the whole length if not. The 1568 # latter is what a couple of the malformations 1569 # require. 1570 my $needed_to_tell = ($disallow_type) 1571 ? $this_needed_to_discern_len 1572 : $this_expected_len; 1573 1574 # Then we see if the malformations that are 1575 # detectable early in the string are present. 1576 if ($overlong) { 1577 my $dl = overlong_discern_len($this_bytes); 1578 $needed_to_tell = $dl if $dl < $needed_to_tell; 1579 } 1580 if ($will_overflow) { 1581 my $dl = overflow_discern_len($length); 1582 $needed_to_tell = $dl if $dl < $needed_to_tell; 1583 } 1584 1585 if ($j < $needed_to_tell) { 1586 $ret_should_be = 1; 1587 $comment .= ", but need $needed_to_tell" 1588 . " bytes to discern:"; 1589 } 1590 } 1591 else { 1592 $ret_should_be = 1; 1593 $comment = "allowed"; 1594 } 1595 1596 undef @warnings_gotten; 1597 1598 $ret = test_is_utf8_valid_partial_char_flags($partial, 1599 $j, $disallow_flags); 1600 is($ret, $ret_should_be, 1601 " And is_utf8_valid_partial_char_flags(" 1602 . display_bytes($partial) 1603 . ", $disallow_flags), $comment: returns" 1604 . " $ret_should_be") 1605 or diag "The flags mean " 1606 . flags_to_text($disallow_flags, \@utf8n_flags_to_text); 1607 } 1608 1609 if ($did_test_partial) { 1610 is(scalar @warnings_gotten, 0, 1611 " And is_utf8_valid_partial_char_flags()" 1612 . " generated no warnings for any of the lengths") 1613 or output_warnings(@warnings_gotten); 1614 } 1615 } 1616 } 1617 1618 # Now test the to/from UTF-8 calls. There are several orthogonal 1619 # variables involved. We test most possible combinations 1620 1621 foreach my $do_disallow (0, 1) { 1622 if ($do_disallow) { 1623 next if $initially_overlong || $initially_orphan; 1624 } 1625 else { 1626 next if $skip_most_tests; 1627 } 1628 1629 # This tests four functions: utf8n_to_uvchr_error, 1630 # utf8n_to_uvchr_msgs, uvchr_to_utf8_flags, and 1631 # uvchr_to_utf8_msgs. The first two are variants of each other, 1632 # and the final two also form a pair. We use a loop 'which_func' 1633 # to determine which of each pair is being tested. The main loop 1634 # tests either the first and third, or the 2nd and fourth. 1635 # which_func is sets whether we are expecting warnings or not in 1636 # certain places. The _msgs() version of the functions expects 1637 # warnings even if lexical ones are turned off, so by making its 1638 # which_func == 1, we can say we want warnings; whereas the other 1639 # one with the value 0, doesn't get them. 1640 for my $which_func (0, 1) { 1641 my $utf8_func = ($which_func) 1642 ? 'utf8n_to_uvchr_msgs' 1643 : 'utf8n_to_uvchr_error'; 1644 1645 # We classify the warnings into certain "interesting" types, 1646 # described later 1647 foreach my $warning_type (0..5) { 1648 next if $skip_most_tests && $warning_type != 1; 1649 foreach my $use_warn_flag (0, 1) { 1650 if ($use_warn_flag) { 1651 next if $initially_overlong || $initially_orphan; 1652 1653 # Since foo_msgs() expects warnings even when lexical 1654 # ones are turned off, we can skip testing it when 1655 # they are turned on, with little likelihood of 1656 # missing an error case. 1657 next if $which_func; 1658 } 1659 else { 1660 next if $skip_most_tests; 1661 } 1662 1663 # Finally, here is the inner loop 1664 1665 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn; 1666 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow; 1667 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn; 1668 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow; 1669 1670 my $eval_warn; 1671 my $expect_regular_warnings; 1672 my $expect_warnings_for_malformed; 1673 my $expect_warnings_for_overflow; 1674 1675 if ($warning_type == 0) { 1676 $eval_warn = "use warnings"; 1677 $expect_regular_warnings = $use_warn_flag; 1678 1679 # We ordinarily expect overflow warnings here. But it 1680 # is somewhat more complicated, and the final 1681 # determination is deferred to one place in the file 1682 # where we handle overflow. 1683 $expect_warnings_for_overflow = 1; 1684 1685 # We would ordinarily expect malformed warnings in 1686 # this case, but not if malformations are allowed. 1687 $expect_warnings_for_malformed 1688 = $malformed_allow_type == 0; 1689 } 1690 elsif ($warning_type == 1) { 1691 $eval_warn = "no warnings"; 1692 $expect_regular_warnings = $which_func; 1693 $expect_warnings_for_overflow = $which_func; 1694 $expect_warnings_for_malformed = $which_func; 1695 } 1696 elsif ($warning_type == 2) { 1697 $eval_warn = "no warnings; use warnings 'utf8'"; 1698 $expect_regular_warnings = $use_warn_flag; 1699 $expect_warnings_for_overflow = 1; 1700 $expect_warnings_for_malformed 1701 = $malformed_allow_type == 0; 1702 } 1703 elsif ($warning_type == 3) { 1704 $eval_warn = "no warnings; use warnings" 1705 . " '$controlling_warning_category'"; 1706 $expect_regular_warnings = $use_warn_flag; 1707 $expect_warnings_for_overflow 1708 = $controlling_warning_category eq 'non_unicode'; 1709 $expect_warnings_for_malformed = $which_func; 1710 } 1711 elsif ($warning_type =~ /^[45]$/) { 1712 # Like type 3, but uses the PERL_EXTENDED flags, and 5 1713 # uses PORTABLE warnings; 1714 # The complement flags were set up so that the 1715 # PERL_EXTENDED flags have been tested that they don't 1716 # trigger wrongly for too small code points. And the 1717 # flags have been set up so that those small code 1718 # points are tested for being above Unicode. What's 1719 # left to test is that the large code points do 1720 # trigger the PERL_EXTENDED flags. 1721 next if ! requires_extended_utf8($allowed_uv); 1722 next if $controlling_warning_category ne 'non_unicode'; 1723 $eval_warn = "no warnings;"; 1724 if ($warning_type == 4) { 1725 $eval_warn .= " use warnings 'non_unicode'"; 1726 } 1727 else { 1728 $eval_warn .= " use warnings 'portable'"; 1729 } 1730 $expect_regular_warnings = 1; 1731 $expect_warnings_for_overflow = 1; 1732 $expect_warnings_for_malformed = 0; 1733 $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED; 1734 $this_utf8n_flag_to_disallow 1735 = $::UTF8_DISALLOW_PERL_EXTENDED; 1736 $this_uvchr_flag_to_warn 1737 = $::UNICODE_WARN_PERL_EXTENDED; 1738 $this_uvchr_flag_to_disallow 1739 = $::UNICODE_DISALLOW_PERL_EXTENDED; 1740 } 1741 else { 1742 die "Unexpected warning type '$warning_type'"; 1743 } 1744 1745 # We only need to test the case where all warnings are 1746 # enabled (type 0) to see if turning off the warning flag 1747 # causes things to not be output. If those pass, then 1748 # turning on some sub-category of warnings, or turning off 1749 # warnings altogether are extremely likely to not output 1750 # warnings either, given how the warnings subsystem is 1751 # supposed to work, and this file assumes it does work. 1752 next if $warning_type != 0 && ! $use_warn_flag; 1753 1754 # The convention is that the 'got' flag is the same value 1755 # as the disallow one. If this were violated, the tests 1756 # here should start failing. 1757 my $return_flag = $this_utf8n_flag_to_disallow; 1758 1759 # If we aren't expecting warnings/disallow for this, turn 1760 # on all the other flags. That makes sure that they all 1761 # are independent of this flag, and so we don't need to 1762 # test them individually. 1763 my $this_warning_flags 1764 = ($use_warn_flag) 1765 ? $this_utf8n_flag_to_warn 1766 : ($overlong_is_in_perl_extended_utf8 1767 ? ($utf8n_flag_to_warn_complement 1768 & ~$::UTF8_WARN_PERL_EXTENDED) 1769 : $utf8n_flag_to_warn_complement); 1770 my $this_disallow_flags 1771 = ($do_disallow) 1772 ? $this_utf8n_flag_to_disallow 1773 : ($overlong_is_in_perl_extended_utf8 1774 ? ($utf8n_flag_to_disallow_complement 1775 & ~$::UTF8_DISALLOW_PERL_EXTENDED) 1776 : $utf8n_flag_to_disallow_complement); 1777 my $expected_uv = $allowed_uv; 1778 my $this_uv_string = $uv_string; 1779 1780 my @expected_return_flags 1781 = @expected_malformation_return_flags; 1782 my @expected_warnings; 1783 push @expected_warnings, @expected_malformation_warnings 1784 if $expect_warnings_for_malformed; 1785 1786 # The overflow malformation is done differently than other 1787 # malformations. It comes from manually typed tests in 1788 # the test array, but it also is above Unicode and uses 1789 # Perl extended UTF-8, so affects some of the flags being 1790 # tested. We now make it be treated like one of the other 1791 # generated malformations. 1792 if ($will_overflow) { 1793 1794 # An overflow is (way) above Unicode, and overrides 1795 # everything else. 1796 $expect_regular_warnings = 0; 1797 1798 # Earlier, we tentatively calculated whether this 1799 # should emit a message or not. It's tentative 1800 # because, even if we ordinarily would output it, we 1801 # don't if malformations are allowed -- except an 1802 # overflow is also a SUPER and PERL_EXTENDED, and if 1803 # warnings for those are enabled, the overflow 1804 # warning does get raised. 1805 if ( $expect_warnings_for_overflow 1806 && ( $malformed_allow_type == 0 1807 || ( $this_warning_flags 1808 & ($::UTF8_WARN_SUPER 1809 |$::UTF8_WARN_PERL_EXTENDED)))) 1810 { 1811 push @expected_warnings, $overflow_msg_pattern; 1812 } 1813 } 1814 1815 # It may be that the malformations have shortened the 1816 # amount of input we look at so much that we can't tell 1817 # what the category the code point was in. Otherwise, set 1818 # up the expected return flags based on the warnings and 1819 # disallowments. 1820 if ($this_expected_len < $this_needed_to_discern_len) { 1821 $expect_regular_warnings = 0; 1822 } 1823 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn) 1824 || ( $this_disallow_flags 1825 & $this_utf8n_flag_to_disallow)) 1826 { 1827 push @expected_return_flags, $return_flag; 1828 } 1829 1830 # Finish setting up the expected warning. 1831 if ($expect_regular_warnings) { 1832 1833 # So far the array contains warnings generated by 1834 # malformations. Add the expected regular one. 1835 unshift @expected_warnings, $this_cp_message_qr; 1836 1837 # But it may need to be modified, because either of 1838 # these malformations means we can't determine the 1839 # expected code point. 1840 if ( $short || $unexpected_noncont 1841 || $dont_use_overlong_cp) 1842 { 1843 my $first_byte = substr($this_bytes, 0, 1); 1844 $expected_warnings[0] = display_bytes( 1845 substr($this_bytes, 0, $this_expected_len)); 1846 $expected_warnings[0] 1847 = qr/[Aa]\Qny UTF-8 sequence that starts with\E 1848 \Q $expected_warnings[0]\E 1849 \Q $this_non_cp_trailing_text\E/x; 1850 } 1851 } 1852 1853 # Is effectively disallowed if we've set up a malformation 1854 # (unless malformations are allowed), even if the flag 1855 # indicates it is allowed. Fix up test name to indicate 1856 # this as well 1857 my $disallowed = 0; 1858 if ( $this_disallow_flags & $this_utf8n_flag_to_disallow 1859 && $this_expected_len >= $this_needed_to_discern_len) 1860 { 1861 $disallowed = 1; 1862 } 1863 if ($malformations_name) { 1864 if ($malformed_allow_type == 0) { 1865 $disallowed = 1; 1866 } 1867 elsif ($malformed_allow_type == 1) { 1868 1869 # Even if allowed, the malformation returns the 1870 # REPLACEMENT CHARACTER. 1871 $expected_uv = 0xFFFD; 1872 $this_uv_string = "0xFFFD" 1873 } 1874 } 1875 1876 my $this_name = "$utf8_func() $testname: "; 1877 my @scratch_expected_return_flags = @expected_return_flags; 1878 if (! $initially_malformed) { 1879 $this_name .= ($disallowed) 1880 ? 'disallowed, ' 1881 : 'allowed, '; 1882 } 1883 $this_name .= "$eval_warn"; 1884 $this_name .= ", " . (( $this_warning_flags 1885 & $this_utf8n_flag_to_warn) 1886 ? 'with flag for raising warnings' 1887 : 'no flag for raising warnings'); 1888 $this_name .= $malformations_name; 1889 1890 # Do the actual test using an eval 1891 undef @warnings_gotten; 1892 my $ret_ref; 1893 my $this_flags 1894 = $allow_flags|$this_warning_flags|$this_disallow_flags; 1895 my $eval_text = "$eval_warn; \$ret_ref" 1896 . " = test_$utf8_func(" 1897 . "'$this_bytes', $this_length, $this_flags)"; 1898 eval "$eval_text"; 1899 if (! ok ($@ eq "", "$this_name: eval succeeded")) 1900 { 1901 diag "\$@='$@'; call was: " 1902 . utf8n_display_call($eval_text); 1903 next; 1904 } 1905 1906 if ($disallowed) { 1907 is($ret_ref->[0], 0, " And returns 0") 1908 or diag "Call was: " . utf8n_display_call($eval_text); 1909 } 1910 else { 1911 is($ret_ref->[0], $expected_uv, 1912 " And returns expected uv: " 1913 . $this_uv_string) 1914 or diag "Call was: " . utf8n_display_call($eval_text); 1915 } 1916 is($ret_ref->[1], $this_expected_len, 1917 " And returns expected length:" 1918 . " $this_expected_len") 1919 or diag "Call was: " . utf8n_display_call($eval_text); 1920 1921 my $returned_flags = $ret_ref->[2]; 1922 1923 for (my $i = @scratch_expected_return_flags - 1; 1924 $i >= 0; 1925 $i--) 1926 { 1927 if ($scratch_expected_return_flags[$i] & $returned_flags) 1928 { 1929 if ($scratch_expected_return_flags[$i] 1930 == $::UTF8_GOT_PERL_EXTENDED) 1931 { 1932 pass(" Expected and got return flag for" 1933 . " PERL_EXTENDED"); 1934 } 1935 # The first entries in this are 1936 # malformations 1937 elsif ($i > @malformation_names - 1) { 1938 pass(" Expected and got return flag" 1939 . " for " . $controlling_warning_category); 1940 } 1941 else { 1942 pass(" Expected and got return flag for " 1943 . $malformation_names[$i] 1944 . " malformation"); 1945 } 1946 $returned_flags 1947 &= ~$scratch_expected_return_flags[$i]; 1948 splice @scratch_expected_return_flags, $i, 1; 1949 } 1950 } 1951 1952 if (! is($returned_flags, 0, 1953 " Got no unexpected return flags")) 1954 { 1955 diag "The unexpected flags gotten were: " 1956 . (flags_to_text($returned_flags, 1957 \@utf8n_flags_to_text) 1958 # We strip off any prefixes from the flag 1959 # names 1960 =~ s/ \b [A-Z] _ //xgr); 1961 diag "Call was: " . utf8n_display_call($eval_text); 1962 } 1963 1964 if (! is (scalar @scratch_expected_return_flags, 0, 1965 " Got all expected return flags")) 1966 { 1967 diag "The expected flags not gotten were: " 1968 . (flags_to_text(eval join("|", 1969 @scratch_expected_return_flags), 1970 \@utf8n_flags_to_text) 1971 # We strip off any prefixes from the flag 1972 # names 1973 =~ s/ \b [A-Z] _ //xgr); 1974 diag "Call was: " . utf8n_display_call($eval_text); 1975 } 1976 1977 if ($which_func) { 1978 my @returned_warnings; 1979 for my $element_ref (@{$ret_ref->[3]}) { 1980 push @returned_warnings, $element_ref->{'text'}; 1981 my $text = $element_ref->{'text'}; 1982 my $flag = $element_ref->{'flag_bit'}; 1983 my $category = $element_ref->{'warning_category'}; 1984 1985 if (! ok(($flag & ($flag-1)) == 0, 1986 "flag for returned msg is a single bit")) 1987 { 1988 diag sprintf("flags are %x; msg=%s", $flag, $text); 1989 } 1990 else { 1991 if (grep { $_ == $flag } @expected_return_flags) { 1992 pass("flag for returned msg is expected"); 1993 } 1994 else { 1995 fail("flag (" 1996 . flags_to_text($flag, \@utf8n_flags_to_text) 1997 . ") for returned msg is expected"); 1998 } 1999 } 2000 2001 # In perl space, don't know the category numbers 2002 isnt($category, 0, 2003 "returned category for msg isn't 0"); 2004 } 2005 2006 ok(@warnings_gotten == 0, "$utf8_func raised no warnings;" 2007 . " the next tests are for ones in the returned" 2008 . " variable") 2009 or diag join "\n", "The unexpected warnings were:", 2010 @warnings_gotten; 2011 @warnings_gotten = @returned_warnings; 2012 } 2013 2014 do_warnings_test(@expected_warnings) 2015 or diag "Call was: " . utf8n_display_call($eval_text); 2016 undef @warnings_gotten; 2017 2018 # Check CHECK_ONLY results when the input is 2019 # disallowed. Do this when actually disallowed, 2020 # not just when the $this_disallow_flags is set. We only 2021 # test once utf8n_to_uvchr_msgs() with this. 2022 if ( $disallowed 2023 && ($which_func == 0 || ! $tested_CHECK_ONLY)) 2024 { 2025 $tested_CHECK_ONLY = 1; 2026 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY; 2027 my $eval_text = "use warnings; \$ret_ref =" 2028 . " test_$utf8_func('" 2029 . "$this_bytes', $this_length," 2030 . " $this_flags)"; 2031 eval $eval_text; 2032 if (! ok ($@ eq "", 2033 " And eval succeeded with CHECK_ONLY")) 2034 { 2035 diag "\$@='$@'; Call was: " 2036 . utf8n_display_call($eval_text); 2037 next; 2038 } 2039 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0") 2040 or diag "Call was: " . utf8n_display_call($eval_text); 2041 is($ret_ref->[1], -1, 2042 " CHECK_ONLY: returns -1 for length") 2043 or diag "Call was: " . utf8n_display_call($eval_text); 2044 if (! is(scalar @warnings_gotten, 0, 2045 " CHECK_ONLY: no warnings generated")) 2046 { 2047 diag "Call was: " . utf8n_display_call($eval_text); 2048 output_warnings(@warnings_gotten); 2049 } 2050 } 2051 2052 # Now repeat some of the above, but for 2053 # uvchr_to_utf8_flags(). Since this comes from an 2054 # existing code point, it hasn't overflowed, and isn't 2055 # malformed. 2056 next if @malformation_names; 2057 2058 my $uvchr_func = ($which_func) 2059 ? 'uvchr_to_utf8_flags_msgs' 2060 : 'uvchr_to_utf8_flags'; 2061 2062 $this_warning_flags = ($use_warn_flag) 2063 ? $this_uvchr_flag_to_warn 2064 : 0; 2065 $this_disallow_flags = ($do_disallow) 2066 ? $this_uvchr_flag_to_disallow 2067 : 0; 2068 2069 $disallowed = $this_disallow_flags 2070 & $this_uvchr_flag_to_disallow; 2071 $this_name .= ", " . (( $this_warning_flags 2072 & $this_utf8n_flag_to_warn) 2073 ? 'with flag for raising warnings' 2074 : 'no flag for raising warnings'); 2075 2076 $this_name = "$uvchr_func() $testname: " 2077 . (($disallowed) 2078 ? 'disallowed' 2079 : 'allowed'); 2080 $this_name .= ", $eval_warn"; 2081 $this_name .= ", " . (( $this_warning_flags 2082 & $this_uvchr_flag_to_warn) 2083 ? 'with warning flag' 2084 : 'no warning flag'); 2085 2086 undef @warnings_gotten; 2087 my $ret; 2088 $this_flags = $this_warning_flags|$this_disallow_flags; 2089 $eval_text = "$eval_warn; \$ret =" 2090 . " test_$uvchr_func(" 2091 . "$allowed_uv, $this_flags)"; 2092 eval "$eval_text"; 2093 if (! ok ($@ eq "", "$this_name: eval succeeded")) 2094 { 2095 diag "\$@='$@'; call was: " 2096 . uvchr_display_call($eval_text); 2097 next; 2098 } 2099 2100 if ($which_func) { 2101 if (defined $ret->[1]) { 2102 my @returned_warnings; 2103 push @returned_warnings, $ret->[1]{'text'}; 2104 my $text = $ret->[1]{'text'}; 2105 my $flag = $ret->[1]{'flag_bit'}; 2106 my $category = $ret->[1]{'warning_category'}; 2107 2108 if (! ok(($flag & ($flag-1)) == 0, 2109 "flag for returned msg is a single bit")) 2110 { 2111 diag sprintf("flags are %x; msg=%s", $flag, $text); 2112 } 2113 else { 2114 if ($flag & $this_uvchr_flag_to_disallow) { 2115 pass("flag for returned msg is expected"); 2116 } 2117 else { 2118 fail("flag (" 2119 . flags_to_text($flag, \@utf8n_flags_to_text) 2120 . ") for returned msg is expected"); 2121 } 2122 } 2123 2124 # In perl space, don't know the category numbers 2125 isnt($category, 0, 2126 "returned category for msg isn't 0"); 2127 2128 ok(@warnings_gotten == 0, "$uvchr_func raised no warnings;" 2129 . " the next tests are for ones in the returned" 2130 . " variable") 2131 or diag join "\n", "The unexpected warnings were:", 2132 @warnings_gotten; 2133 @warnings_gotten = @returned_warnings; 2134 } 2135 2136 $ret = $ret->[0]; 2137 } 2138 2139 if ($disallowed) { 2140 is($ret, undef, " And returns undef") 2141 or diag "Call was: " . uvchr_display_call($eval_text); 2142 } 2143 else { 2144 is($ret, $this_bytes, " And returns expected string") 2145 or diag "Call was: " . uvchr_display_call($eval_text); 2146 } 2147 2148 do_warnings_test(@expected_warnings) 2149 or diag "Call was: " . uvchr_display_call($eval_text); 2150 } 2151 } 2152 } 2153 } 2154 } 2155 } 2156 } 2157 } 2158} 2159 2160done_testing; 2161