1# Grind out a lot of combinatoric tests for folding. 2# It uses various charset modifiers, passed in via $::TEST_CHUNK. The caller 3# will also have set the locale to use if /l is the modifier. 4# L is a pseudo-modifier that indicates to use the modifier /l instead, and 5# the locale set by the caller is known to be UTF-8, 6# T is a pseudo-modifier that indicates to use the pseudo modifier /L 7# instead, and the locale set by the caller is known to be Turkic UTF-8, 8 9binmode STDOUT, ":utf8"; 10 11BEGIN { 12 chdir 't' if -d 't'; 13 require './test.pl'; 14 set_up_inc('../lib'); 15 require Config; import Config; 16 skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX"); 17 if ($^O eq 'dec_osf') { 18 skip_all("$^O cannot handle this test"); 19 } 20 my $time_out_factor = $ENV{PERL_TEST_TIME_OUT_FACTOR} || 1; 21 $time_out_factor = 1 if $time_out_factor < 1; 22 23 watchdog(5 * 60 * $time_out_factor); 24 require './loc_tools.pl'; 25} 26 27use charnames ":full"; 28 29my $DEBUG = 0; # Outputs extra information for debugging this .t 30 31use strict; 32use warnings; 33no warnings 'locale'; # Plenty of these would otherwise get generated 34use Encode; 35use POSIX; 36 37my $charset = $::TEST_CHUNK; 38my $use_turkic_rules = 0; 39 40if ($charset eq 'T') { 41 $charset = 'L'; 42 $use_turkic_rules = 1; 43} 44 45# Special-cased characters in the .c's that we want to make sure get tested. 46my %be_sure_to_test = ( 47 chr utf8::unicode_to_native(0xDF) => 1, # LATIN_SMALL_LETTER_SHARP_S 48 "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S 49 "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 50 "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 51 "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 52 "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 53 "I" => 1, 54); 55 56# Tests both unicode and not, so make sure not implicitly testing unicode 57no feature 'unicode_strings'; 58 59# Case-insensitive matching is a large and complicated issue. Perl does not 60# implement it fully, properly. For example, it doesn't include normalization 61# as part of the equation. To test every conceivable combination is clearly 62# impossible; these tests are mostly drawn from visual inspection of the code 63# and experience, trying to exercise all areas. 64 65# There are three basic ranges of characters that Perl may treat differently: 66# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are 67# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants 68# are all controls that fold to themselves. 69my $ASCII = 1; 70 71# 2) Other characters that fit into a byte but are different in utf8 than not; 72# here referred to, taking some liberties, as Latin1. 73my $Latin1 = 2; 74 75# 3) Characters that won't fit in a byte; here referred to as Unicode 76my $Unicode = 3; 77 78# Within these basic groups are equivalence classes that testing any character 79# in is likely to lead to the same results as any other character. This is 80# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is 81# set. 82my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; 83 84# Additionally parts of this test run a lot of subtests, outputting the 85# resulting TAP can be expensive so the tests are summarised internally. The 86# PERL_DEBUG_FULL_TEST environment variable can be set to produce the full 87# output for debugging purposes. 88 89sub range_type { 90 my $ord = ord shift; 91 92 return $ASCII if utf8::native_to_unicode($ord) < 128; 93 return $Latin1 if $ord < 256; 94 return $Unicode; 95} 96 97sub numerically { 98 return $a <=> $b 99} 100 101my $list_all_tests = $ENV{PERL_DEBUG_FULL_TEST} || $DEBUG; 102$| = 1 if $list_all_tests; 103 104# Significant time is saved by not outputting each test but grouping the 105# output into subtests 106my $okays; # Number of ok's in current subtest 107my $this_iteration; # Number of possible tests in current subtest 108my $count = 0; # Number of subtests = number of total tests 109 110sub run_test($$$$) { 111 my ($test, $todo, $do_we_output_locale_name, $debug) = @_; 112 113 $debug = "" unless $DEBUG; 114 my $res = eval $test; 115 116 if ($do_we_output_locale_name) { 117 $do_we_output_locale_name = 'setlocale(LC_CTYPE, "' 118 . POSIX::setlocale(&POSIX::LC_CTYPE) 119 . '"); '; 120 } 121 if (!$res || $list_all_tests) { 122 # Failed or debug; output the result 123 $count++; 124 ok($res, "$do_we_output_locale_name$test; $debug"); 125 } else { 126 # Just count the test as passed 127 $okays++; 128 } 129 $this_iteration++; 130} 131 132my %has_test_by_participants; # Makes sure has tests for each range and each 133 # number of characters that fold to the same 134 # thing 135my %has_test_by_byte_count; # Makes sure has tests for each combination of 136 # n bytes folds to m bytes 137 138my %tests; # The set of tests we expect to pass. 139# Each key is a code point that folds to something else. 140# Each value is a list of things that the key folds to. If the 'thing' is a 141# single code point, it is that ordinal. If it is a multi-char fold, it is an 142# ordered list of the code points in that fold. Here's an example for 'S': 143# '83' => [ 115, 383 ] 144# 145# And one for a multi-char fold: \xDF 146# 223 => [ 147# [ # 'ss' 148# 83, 149# 83 150# ], 151# [ # 'SS' 152# 115, 153# 115 154# ], 155# [ # LATIN SMALL LETTER LONG S 156# 383, 157# 383 158# ], 159# 7838 # LATIN_CAPITAL_LETTER_SHARP_S 160# ], 161 162my %neg_tests; # Same format, but we expect these tests to fail 163 164my %folds; # keys are code points that fold; values are either 0 or 1 which 165 # in turn are keys with their values each a list of code points the 166 # code point key folds to. The folds under 1 are the ones that are 167 # valid in this run; the ones under 0 are ones valid under other 168 # circumstances. 169 170my %inverse_folds; # keys are strings of the folded-to; then come a layer of 171 # 0 or 1, like %folds. The lowest values are lists of 172 # characters that fold to them 173 174# Here's a portion of an %inverse_folds in a run where Turkic folds are not 175# legal, so \x{130} doesn't fold to 'i' in this run. 176# 'h' => { 177# '1' => [ 178# 'H' 179# ] 180# }, 181# "h\x{331}" => { 182# '1' => [ 183# "\x{1e96}" 184# ] 185# }, 186# 'i' => { 187# '0' => [ 188# "\x{130}" 189# ], 190# '1' => [ 191# 'I' 192# ] 193# }, 194# "i\x{307}" => { 195# '1' => [ 196# "\x{130}" 197# ] 198# }, 199# 'j' => { 200# '1' => [ 201# 'J' 202# ] 203# }, 204 205sub add_test($$@) { 206 my ($tests_ref, $to, @from) = @_; 207 208 # Called to cause the input to be tested by adding to $%tests_ref. @from 209 # is the list of characters that fold to the string $to. @from should be 210 # sorted so the lowest code point is first.... 211 # The input is in string form; %tests uses code points, so have to 212 # convert. 213 214 my $to_chars = length $to; 215 my @test_to; # List of tests for $to 216 217 if ($to_chars == 1) { 218 @test_to = ord $to; 219 } 220 else { 221 push @test_to, [ map { ord $_ } split "", $to ]; 222 223 # For multi-char folds, we also test that things that can fold to each 224 # individual character in the fold also work. If we were testing 225 # comprehensively, we would try every combination of upper and lower 226 # case in the fold, but it will have to suffice to avoid running 227 # forever to make sure that each thing that folds to these is tested 228 # at least once. Because of complement matching ([^...]), we need to 229 # do both the folded, and the folded-from. 230 # We first look at each character in the multi-char fold, and save how 231 # many characters fold to it; and also the maximum number of such 232 # folds 233 my @folds_to_count; # 0th char in fold is index 0 ... 234 my $max_folds_to = 0; 235 236 for (my $i = 0; $i < $to_chars; $i++) { 237 my $to_char = substr($to, $i, 1); 238 if (exists $inverse_folds{$to_char}{1}) { 239 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}{1}}; 240 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i]; 241 } 242 else { 243 $folds_to_count[$i] = 0; 244 } 245 } 246 247 # We will need to generate as many tests as the maximum number of 248 # folds, so that each fold will have at least one test. 249 # For example, consider character X which folds to the three character 250 # string 'xyz'. If 2 things fold to x (X and x), 4 to y (Y, Y' 251 # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4 252 # tests will be generated: 253 # xyz 254 # XYz 255 # xY'z 256 # xY''z 257 for (my $i = 0; $i < $max_folds_to; $i++) { 258 my @this_test_to; # Assemble a single test 259 260 # For each character in the multi-char fold ... 261 for (my $j = 0; $j < $to_chars; $j++) { 262 my $this_char = substr($to, $j, 1); 263 264 # Use its corresponding inverse fold, if available. 265 if ( $i < $folds_to_count[$j] 266 && exists $inverse_folds{$this_char}{1}) 267 { 268 push @this_test_to, ord $inverse_folds{$this_char}{1}[$i]; 269 } 270 else { # Or else itself. 271 push @this_test_to, ord $this_char; 272 } 273 } 274 275 # Add this test to the list 276 push @test_to, [ @this_test_to ]; 277 } 278 279 # Here, have assembled all the tests for the multi-char fold. Sort so 280 # lowest code points are first for consistency and aesthetics in 281 # output. We know there are at least two characters in the fold, but 282 # I haven't bothered to worry about sorting on an optional third 283 # character if the first two are identical. 284 @test_to = sort { ($a->[0] == $b->[0]) 285 ? $a->[1] <=> $b->[1] 286 : $a->[0] <=> $b->[0] 287 } @test_to; 288 } 289 290 291 # This test is from n bytes to m bytes. Record that so won't try to add 292 # another test that does the same. 293 use bytes; 294 my $to_bytes = length $to; 295 foreach my $from_map (@from) { 296 $has_test_by_byte_count{length $from_map}{$to_bytes} = $to; 297 } 298 no bytes; 299 300 my $ord_smallest_from = ord shift @from; 301 if (exists $tests_ref->{$ord_smallest_from}) { 302 die "There are already tests for $ord_smallest_from" 303 }; 304 305 # Add in the fold tests, 306 push @{$tests_ref->{$ord_smallest_from}}, @test_to; 307 308 # Then any remaining froms in the equivalence class. 309 push @{$tests_ref->{$ord_smallest_from}}, map { ord $_ } @from; 310} 311 312# Get the Unicode rules and construct inverse mappings from them 313 314use Unicode::UCD; 315my $file="../lib/unicore/CaseFolding.txt"; 316 317# Use the Unicode data file if we are on an ASCII platform (which its data is 318# for), and it is in the modern format (starting in Unicode 3.1.0) and it is 319# available. This avoids being affected by potential bugs introduced by other 320# layers of Perl 321if ($::IS_ASCII 322 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 323 && open my $fh, "<", $file) 324{ 325 # We process the file in reverse order because its easier to see the T 326 # entry first and then know that the next line we process is the 327 # corresponding one for non-T. 328 my @rules = <$fh>; 329 my $prev_was_turkic = 0; 330 while (defined ($_ = pop @rules)) { 331 chomp; 332 333 # Lines look like (though without the initial '#') 334 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 335 336 # Get rid of comments, ignore blank or comment-only lines 337 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx; 338 next unless length $line; 339 my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line; 340 341 next if $fold_type eq 'S'; # If Unicode's tables are correct, the F 342 # should be a superset of S 343 next if $fold_type eq 'I'; # Perl doesn't do old Turkish folding 344 345 my $test_type; 346 if ($fold_type eq 'T') { 347 $test_type = 0 + $use_turkic_rules; 348 $prev_was_turkic = 1; 349 } 350 elsif ($prev_was_turkic) { 351 $test_type = 0 + ! $use_turkic_rules; 352 $prev_was_turkic = 0; 353 } 354 else { 355 $test_type = 1; 356 $prev_was_turkic = 0; 357 } 358 359 my $from = hex $hex_from; 360 my @to = map { hex $_ } @hex_folded; 361 push @{$folds{$from}{$test_type}}, @to; 362 363 my $folded_str = pack ("U0U*", @to); 364 push @{$inverse_folds{$folded_str}{$test_type}}, chr $from; 365 } 366} 367else { # Here, can't use the .txt file: read the Unicode rules file and 368 # construct inverse mappings from it 369 370 skip_all "Don't know how to generate turkic rules on this platform" 371 if $use_turkic_rules; 372 my ($invlist_ref, $invmap_ref, undef, $default) 373 = Unicode::UCD::prop_invmap('Case_Folding'); 374 for my $i (0 .. @$invlist_ref - 1 - 1) { 375 next if $invmap_ref->[$i] == $default; 376 377 # Make into an array if not so already, so can treat uniformly below 378 $invmap_ref->[$i] = [ $invmap_ref->[$i] ] if ! ref $invmap_ref->[$i]; 379 380 # Each subsequent element of the range requires adjustment of +1 from 381 # the previous element 382 my $adjust = -1; 383 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { 384 $adjust++; 385 my @to = map { $_ + $adjust } @{$invmap_ref->[$i]}; 386 push @{$folds{$j}{1}}, @to; 387 my $folded_str = join "", map { chr } @to; 388 utf8::upgrade($folded_str); 389 #note (sprintf "%d: %04X: %s", __LINE__, $j, join " ", 390 # map { sprintf "%04X", $_ + $adjust } @{$invmap_ref->[$i]}); 391 push @{$inverse_folds{$folded_str}{1}}, chr $j; 392 } 393 } 394} 395 396# Analyze the data and generate tests to get adequate test coverage. We sort 397# things so that smallest code points are done first. 398foreach my $to (sort { $a cmp $b } keys %inverse_folds) 399{ 400TO: 401 foreach my $tests_ref (\%tests, \%neg_tests) { 402 my $test_type = ($tests_ref == \%tests) ? 1 : 0; 403 404 next unless exists $inverse_folds{$to}{$test_type}; 405 406 # Within each fold, sort so that the smallest code points are done first 407 @{$inverse_folds{$to}{$test_type}} = sort { $a cmp $b } @{$inverse_folds{$to}{$test_type}}; 408 my @from = @{$inverse_folds{$to}{$test_type}}; 409 410 # Just add it to the tests if doing complete coverage 411 if (! $skip_apparently_redundant) { 412 add_test($tests_ref, $to, @from); 413 next TO; 414 } 415 416 my $to_chars = length $to; 417 my $to_range_type = range_type(substr($to, 0, 1)); 418 419 # If this is required to be tested, do so. We check for these first, as 420 # they will take up slots of byte-to-byte combinations that we otherwise 421 # would have to have other tests to get. 422 foreach my $from_map (@from) { 423 if (exists $be_sure_to_test{$from_map}) { 424 add_test($tests_ref, $to, @from); 425 next TO; 426 } 427 } 428 429 # If the fold contains heterogeneous range types, is suspect and should be 430 # tested. 431 if ($to_chars > 1) { 432 foreach my $char (split "", $to) { 433 if (range_type($char) != $to_range_type) { 434 add_test($tests_ref, $to, @from); 435 next TO; 436 } 437 } 438 } 439 440 # If the mapping crosses range types, is suspect and should be tested 441 foreach my $from_map (@from) { 442 if (range_type($from_map) != $to_range_type) { 443 add_test($tests_ref, $to, @from); 444 next TO; 445 } 446 } 447 448 # Here, all components of the mapping are in the same range type. For 449 # single character folds, we test one case in each range type that has 2 450 # particpants, 3 particpants, etc. 451 if ($to_chars == 1) { 452 if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) { 453 add_test($tests_ref, $to, @from); 454 $has_test_by_participants{scalar @from}{$to_range_type} = $to; 455 next TO; 456 } 457 } 458 459 # We also test all combinations of mappings from m to n bytes. This is 460 # because the regex optimizer cares. (Don't bother worrying about that 461 # Latin1 chars will occupy a different number of bytes under utf8, as 462 # there are plenty of other cases that catch these byte numbers.) 463 use bytes; 464 my $to_bytes = length $to; 465 foreach my $from_map (@from) { 466 if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) { 467 add_test($tests_ref, $to, @from); 468 next TO; 469 } 470 } 471 } 472} 473 474# For each range type, test additionally a character that folds to itself 475add_test(\%tests, ":", ":"); 476add_test(\%tests, chr utf8::unicode_to_native(0xF7), chr utf8::unicode_to_native(0xF7)); 477add_test(\%tests, chr 0x2C7, chr 0x2C7); 478 479# To cut down on the number of tests 480my $has_tested_aa_above_latin1; 481my $has_tested_latin1_aa; 482my $has_tested_ascii_aa; 483my $has_tested_l_above_latin1; 484my $has_tested_above_latin1_l; 485my $has_tested_ascii_l; 486my $has_tested_above_latin1_d; 487my $has_tested_ascii_d; 488my $has_tested_non_latin1_d; 489my $has_tested_above_latin1_a; 490my $has_tested_ascii_a; 491my $has_tested_non_latin1_a; 492 493# For use by pairs() in generating combinations 494sub prefix { 495 my $p = shift; 496 map [ $p, $_ ], @_ 497} 498 499# Returns all ordered combinations of pairs of elements from the input array. 500# It doesn't return pairs like (a, a), (b, b). Change the slice to an array 501# to do that. This was just to have fewer tests. 502sub pairs (@) { 503 #print STDERR __LINE__, ": ", join(" XXX ", map { sprintf "%04X", $_ } @_), "\n"; 504 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ 505} 506 507# Finally ready to do the tests 508foreach my $tests_ref (\%neg_tests, \%tests) { 509foreach my $test (sort { numerically } keys %{$tests_ref}) { 510 511 my $previous_target; 512 my $previous_pattern; 513 my @pairs = pairs(sort numerically $test, @{$tests_ref->{$test}}); 514 515 # Each fold can be viewed as a closure of all the characters that 516 # participate in it. Look at each possible pairing from a closure, with the 517 # first member of the pair the target string to match against, and the 518 # second member forming the pattern. Thus each fold member gets tested as 519 # the string, and the pattern with every other member in the opposite role. 520 while (my $pair = shift @pairs) { 521 my ($target, $pattern) = @$pair; 522 523 # When testing a char that doesn't fold, we can get the same 524 # permutation twice; so skip all but the first. 525 next if $previous_target 526 && $previous_target == $target 527 && $previous_pattern == $pattern; 528 ($previous_target, $previous_pattern) = ($target, $pattern); 529 530 # Each side may be either a single char or a string. Extract each into an 531 # array (perhaps of length 1) 532 my @target, my @pattern; 533 @target = (ref $target) ? @$target : $target; 534 @pattern = (ref $pattern) ? @$pattern : $pattern; 535 536 # We are testing just folds to/from a single character. If our pairs 537 # happens to generate multi/multi, skip. 538 next if @target > 1 && @pattern > 1; 539 540 # Get in hex form. 541 my @x_target = map { sprintf "\\x{%04X}", $_ } @target; 542 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern; 543 544 my $target_above_latin1 = grep { $_ > 255 } @target; 545 my $pattern_above_latin1 = grep { $_ > 255 } @pattern; 546 my $target_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @target; 547 my $pattern_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @pattern; 548 my $target_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @target; 549 my $pattern_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @pattern; 550 my $target_has_latin1 = grep { $_ < 256 } @target; 551 my $target_has_upper_latin1 552 = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @target; 553 my $pattern_has_upper_latin1 554 = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @pattern; 555 my $pattern_has_latin1 = grep { $_ < 256 } @pattern; 556 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; 557 558 # We don't test multi-char folding into other multi-chars. We are testing 559 # a code point that folds to or from other characters. Find the single 560 # code point for diagnostic purposes. (If both are single, choose the 561 # target string) 562 my $ord = @target == 1 ? $target[0] : $pattern[0]; 563 my $progress = sprintf "%04X: \"%s\" and /%s/", 564 $test, 565 join("", @x_target), 566 join("", @x_pattern); 567 #note $progress; 568 569 # Now grind out tests, using various combinations. 570 { 571 my $charset_mod = lc $charset; 572 my $current_locale = setlocale(&POSIX::LC_CTYPE); 573 $current_locale = 'C locale' if $current_locale eq 'C'; 574 $okays = 0; 575 $this_iteration = 0; 576 577 # To cut down somewhat on the enormous quantity of tests this currently 578 # runs, skip some for some of the character sets whose results aren't 579 # likely to differ from others. But run all tests on the code points 580 # that don't fold, plus one other set in each range group. 581 if (! $is_self) { 582 583 # /aa should only affect things with folds in the ASCII range. But, try 584 # it on one set in the other ranges just to make sure it doesn't break 585 # them. 586 if ($charset eq 'aa') { 587 588 # It may be that this $pair of code points to test are both 589 # non-ascii, but if either of them actually fold to ascii, that is 590 # suspect and should be tested. So for /aa, use whether their folds 591 # are ascii or not 592 my $target_has_ascii = $target_has_ascii; 593 my $pattern_has_ascii = $pattern_has_ascii; 594 if (! $target_has_ascii) { 595 foreach my $cp (@target) { 596 if (exists $folds{$cp}{1} 597 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}{1}} ) 598 { 599 $target_has_ascii = 1; 600 last; 601 } 602 } 603 } 604 if (! $pattern_has_ascii) { 605 foreach my $cp (@pattern) { 606 if (exists $folds{$cp}{1} 607 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}}{1} ) 608 { 609 $pattern_has_ascii = 1; 610 last; 611 } 612 } 613 } 614 615 if (! $target_has_ascii && ! $pattern_has_ascii) { 616 if ($target_above_latin1 || $pattern_above_latin1) { 617 next if defined $has_tested_aa_above_latin1 618 && $has_tested_aa_above_latin1 != $test; 619 $has_tested_aa_above_latin1 = $test; 620 } 621 next if defined $has_tested_latin1_aa 622 && $has_tested_latin1_aa != $test; 623 $has_tested_latin1_aa = $test; 624 } 625 elsif ($target_only_ascii && $pattern_only_ascii) { 626 627 # And, except for one set just to make sure, skip tests 628 # where both elements in the pair are ASCII. If one works for 629 # aa, the others are likely too. This skips tests where the 630 # fold is from non-ASCII to ASCII, but this part of the test 631 # is just about the ASCII components. 632 next if defined $has_tested_ascii_l 633 && $has_tested_ascii_l != $test; 634 $has_tested_ascii_l = $test; 635 } 636 } 637 elsif ($charset eq 'l') { 638 639 # For l, don't need to test beyond one set those things that are 640 # all above latin1, because unlikely to have different successes 641 # than /u. But, for the same reason as described in the /aa above, 642 # it is suspect and should be tested, if either of the folds are to 643 # latin1. 644 my $target_has_latin1 = $target_has_latin1; 645 my $pattern_has_latin1 = $pattern_has_latin1; 646 if (! $target_has_latin1) { 647 foreach my $cp (@target) { 648 if (exists $folds{$cp}{1} 649 && grep { $_ < 256 } @{$folds{$cp}{1}} ) 650 { 651 $target_has_latin1 = 1; 652 last; 653 } 654 } 655 } 656 if (! $pattern_has_latin1) { 657 foreach my $cp (@pattern) { 658 if (exists $folds{$cp}{1} 659 && grep { $_ < 256 } @{$folds{$cp}{1}} ) 660 { 661 $pattern_has_latin1 = 1; 662 last; 663 } 664 } 665 } 666 if (! $target_has_latin1 && ! $pattern_has_latin1) { 667 next if defined $has_tested_above_latin1_l 668 && $has_tested_above_latin1_l != $test; 669 $has_tested_above_latin1_l = $test; 670 } 671 elsif ($target_only_ascii && $pattern_only_ascii) { 672 673 # And, except for one set just to make sure, skip tests 674 # where both elements in the pair are ASCII. This is 675 # essentially the same reasoning as above for /aa. 676 next if defined $has_tested_ascii_l 677 && $has_tested_ascii_l != $test; 678 $has_tested_ascii_l = $test; 679 } 680 } 681 elsif ($charset eq 'd') { 682 # Similarly for d. Beyond one test (besides self) each, we don't 683 # test pairs that are both ascii; or both above latin1, or are 684 # combinations of ascii and above latin1. 685 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { 686 if ($target_has_ascii && $pattern_has_ascii) { 687 next if defined $has_tested_ascii_d 688 && $has_tested_ascii_d != $test; 689 $has_tested_ascii_d = $test 690 } 691 elsif (! $target_has_latin1 && ! $pattern_has_latin1) { 692 next if defined $has_tested_above_latin1_d 693 && $has_tested_above_latin1_d != $test; 694 $has_tested_above_latin1_d = $test; 695 } 696 else { 697 next if defined $has_tested_non_latin1_d 698 && $has_tested_non_latin1_d != $test; 699 $has_tested_non_latin1_d = $test; 700 } 701 } 702 } 703 elsif ($charset eq 'a') { 704 # Similarly for a. This should match identically to /u, so wasn't 705 # tested at all until a bug was found that was thereby missed. 706 # As a compromise, beyond one test (besides self) each, we don't 707 # test pairs that are both ascii; or both above latin1, or are 708 # combinations of ascii and above latin1. 709 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { 710 if ($target_has_ascii && $pattern_has_ascii) { 711 next if defined $has_tested_ascii_a 712 && $has_tested_ascii_a != $test; 713 $has_tested_ascii_a = $test 714 } 715 elsif (! $target_has_latin1 && ! $pattern_has_latin1) { 716 next if defined $has_tested_above_latin1_a 717 && $has_tested_above_latin1_a != $test; 718 $has_tested_above_latin1_a = $test; 719 } 720 else { 721 next if defined $has_tested_non_latin1_a 722 && $has_tested_non_latin1_a != $test; 723 $has_tested_non_latin1_a = $test; 724 } 725 } 726 } 727 } 728 729 foreach my $utf8_target (0, 1) { # Both utf8 and not, for 730 # code points < 256 731 my $upgrade_target = ""; 732 733 # These must already be in utf8 because the string to match has 734 # something above latin1. So impossible to test if to not to be in 735 # utf8; and otherwise, no upgrade is needed. 736 next if $target_above_latin1 && ! $utf8_target; 737 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target; 738 739 foreach my $utf8_pattern (0, 1) { 740 next if $pattern_above_latin1 && ! $utf8_pattern; 741 742 # Our testing of 'l' uses the POSIX locale, which is ASCII-only 743 my $uni_semantics = $charset ne 'l' && ( $utf8_target 744 || $charset eq 'u' 745 || $charset eq 'L' 746 || ($charset eq 'd' && $utf8_pattern) 747 || $charset =~ /a/); 748 my $upgrade_pattern = ""; 749 $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern; 750 751 my $lhs = join "", @x_target; 752 my $lhs_str = eval qq{"$lhs"}; fail($@) if $@; 753 my @rhs = @x_pattern; 754 my $rhs = join "", @rhs; 755 756 # Unicode created a folding rule that partially emulates what 757 # happens in a Turkish locale, by using combining characters. The 758 # result is close enough to what really should happen, that it can 759 # pass many of the tests, but not all. So, if we have a rule that 760 # is expecting failure, it may pass instead. The code in the block 761 # below is good enough for skipping the tests, and khw tried to make 762 # it general, but should the rules be revised (unlikely at this 763 # point), this might need to be tweaked. 764 if ($tests_ref == \%neg_tests) { 765 my ($shorter_ref, $longer_ref); 766 767 # Convert the $rhs to a string, like we already did for the lhs 768 my $rhs_str = eval qq{"$rhs"}; fail($@) if $@; 769 770 # If the lengths of the two sides are equal, we don't want to do 771 # this; this is only to bypass the combining characters affecting 772 # things 773 if (length $lhs_str != length $rhs_str) { 774 775 # Find the shorter and longer of the pair 776 if (length $lhs_str < length $rhs_str) { 777 $shorter_ref = \$lhs_str; 778 $longer_ref = \$rhs_str; 779 } 780 else { 781 $shorter_ref = \$rhs_str; 782 $longer_ref = \$lhs_str; 783 } 784 785 # If the shorter string is entirely contained in the longer, we 786 # have generated a test that is likely to succeed, and the 787 # reasons it would fail have nothing to do with folding. But we 788 # are expecting it to fail, and so our test is invalid. Skip 789 # it. 790 next if index($$longer_ref, $$shorter_ref) >= 0; 791 792 793 # The above eliminates about half the failure cases. This gets 794 # the rest. If the shorter string is a single character and has 795 # a fold legal in this run to a character that is in the longer 796 # string, it is also likely to succeed under /i. So again our 797 # computed test is bogus. 798 if ( length $$shorter_ref == 1 799 && exists $folds{ord $$shorter_ref}{1}) 800 { 801 my @folded_to = @{$folds{ord $$shorter_ref}{1}}; 802 next if @folded_to == 1 803 && index($$longer_ref, chr $folded_to[0]) >= 0; 804 } 805 } 806 } 807 808 my $should_fail = (! $uni_semantics && $ord < 256 && ! $is_self && utf8::native_to_unicode($ord) >= 128) 809 || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii) 810 || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1) 811 || $tests_ref == \%neg_tests; 812 813 # Do simple tests of referencing capture buffers, named and 814 # numbered. 815 my $op = '=~'; 816 $op = '!~' if $should_fail; 817 818 my $todo = 0; # No longer any todo's 819 my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; 820 run_test($eval, $todo, ($charset_mod eq 'l'), ""); 821 822 $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; 823 run_test($eval, $todo, ($charset_mod eq 'l'), ""); 824 825 if ($lhs ne $rhs) { 826 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; 827 run_test($eval, "", ($charset_mod eq 'l'), ""); 828 829 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; 830 run_test($eval, "", ($charset_mod eq 'l'), ""); 831 } 832 833 # See if works on what could be a simple trie. 834 my $alternate; 835 { 836 # Keep the alternate | branch the same length as the tested one so 837 # that it's length doesn't influence things 838 my $evaled = eval "\"$rhs\""; # Convert e.g. \x{foo} into its 839 # chr equivalent 840 use bytes; 841 $alternate = 'q' x length $evaled; 842 } 843 $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|$alternate/i$charset_mod;$upgrade_target$upgrade_pattern \$c $op \$p"; 844 run_test($eval, "", ($charset_mod eq 'l'), ""); 845 846 # Check that works when the folded character follows something that 847 # is quantified. This test knows the regex code internals to the 848 # extent that it knows this is a potential problem, and that there 849 # are three different types of quantifiers generated: 1) The thing 850 # being quantified matches a single character; 2) it matches more 851 # than one character, but is fixed width; 3) it can match a variable 852 # number of characters. (It doesn't know that case 3 shouldn't 853 # matter, since it doesn't do anything special for the character 854 # following the quantifier; nor that some of the different 855 # quantifiers execute the same underlying code, as these tests are 856 # quick, and this insulates these tests from changes in the 857 # implementation.) 858 for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') { 859 $eval = "my \$c = \"_$lhs\"; my \$p = qr/(?$charset_mod:.$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; 860 run_test($eval, "", ($charset_mod eq 'l'), ""); 861 $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:..)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; 862 run_test($eval, "", ($charset_mod eq 'l'), ""); 863 $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:.|\\R)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; 864 run_test($eval, "", ($charset_mod eq 'l'), ""); 865 } 866 867 foreach my $bracketed (0, 1) { # Put rhs in [...], or not 868 next if $bracketed && @pattern != 1; # bracketed makes these 869 # or's instead of a sequence 870 foreach my $optimize_bracketed (0, 1) { 871 next if $optimize_bracketed && ! $bracketed; 872 foreach my $inverted (0,1) { 873 next if $inverted && ! $bracketed; # inversion only valid 874 # in [^...] 875 next if $inverted && @target != 1; # [perl #89750] multi-char 876 # not valid in [^...] 877 878 # In some cases, add an extra character that doesn't fold, and 879 # looks ok in the output. 880 my $extra_char = "_"; 881 foreach my $prepend ("", $extra_char) { 882 foreach my $append ("", $extra_char) { 883 884 # Assemble the rhs. Put each character in a separate 885 # bracketed if using charclasses. This creates a stress on 886 # the code to span a match across multiple elements 887 my $rhs = ""; 888 foreach my $rhs_char (@rhs) { 889 $rhs .= '[' if $bracketed; 890 $rhs .= '^' if $inverted; 891 $rhs .= $rhs_char; 892 893 # Add a character to the class, so class doesn't get 894 # optimized out, unless we are testing that optimization 895 $rhs .= '_' if $optimize_bracketed; 896 $rhs .= ']' if $bracketed; 897 } 898 899 # Add one of: no capturing parens 900 # a single set 901 # a nested set 902 # Use quantifiers and extra variable width matches inside 903 # them to keep some optimizations from happening 904 foreach my $parend (0, 1, 2) { 905 my $interior = (! $parend) 906 ? $rhs 907 : ($parend == 1) 908 ? "(${rhs},?)" 909 : "((${rhs})+,?)"; 910 foreach my $quantifier ("", '?', '*', '+', '{1,3}') { 911 912 # Perhaps should be TODOs, as are unimplemented, but 913 # maybe will never be implemented 914 next if @pattern != 1 && $quantifier; 915 916 # A ? or * quantifier normally causes the thing to be 917 # able to match a null string 918 my $quantifier_can_match_null = $quantifier eq '?' 919 || $quantifier eq '*'; 920 921 # But since we only quantify the last character in a 922 # multiple fold, the other characters will have width, 923 # except if we are quantifying the whole rhs 924 my $can_match_null = $quantifier_can_match_null 925 && (@rhs == 1 || $parend); 926 927 foreach my $l_anchor ("", '^') { # '\A' didn't change 928 # result) 929 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't 930 # change result) 931 # The folded part can match the null string if it 932 # isn't required to have width, and there's not 933 # something on one or both sides that force it to. 934 my $both_sides = ($l_anchor && $r_anchor) 935 || ($l_anchor && $append) 936 || ($r_anchor && $prepend) 937 || ($prepend && $append); 938 my $must_match = ! $can_match_null || $both_sides; 939 # for performance, but doing this missed many failures 940 #next unless $must_match; 941 my $quantified = "(?$charset_mod:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; 942 my $op; 943 if ($must_match && $should_fail) { 944 $op = 0; 945 } else { 946 $op = 1; 947 } 948 $op = ! $op if $must_match && $inverted; 949 950 if ($inverted && @target > 1) { 951 # When doing an inverted match against a 952 # multi-char target, and there is not something on 953 # the left to anchor the match, if it shouldn't 954 # succeed, skip, as what will happen (when working 955 # correctly) is that it will match the first 956 # position correctly, and then be inverted to not 957 # match; then it will go to the second position 958 # where it won't match, but get inverted to match, 959 # and hence succeeding. 960 next if ! ($l_anchor || $prepend) && ! $op; 961 962 # Can't ever match for latin1 code points non-uni 963 # semantics that have a inverted multi-char fold 964 # when there is something on both sides and the 965 # quantifier isn't such as to span the required 966 # width, which is 2 or 3. 967 $op = 0 if $ord < 255 968 && ! $uni_semantics 969 && $both_sides 970 && ( ! $quantifier || $quantifier eq '?') 971 && $parend < 2; 972 973 # Similarly can't ever match when inverting a 974 # multi-char fold for /aa and the quantifier 975 # isn't sufficient to allow it to span to both 976 # sides. 977 $op = 0 if $target_has_ascii 978 && $charset eq 'aa' 979 && $both_sides 980 && ( ! $quantifier || $quantifier eq '?') 981 && $parend < 2; 982 983 # Or for /l 984 $op = 0 if $target_has_latin1 && $charset eq 'l' 985 && $both_sides 986 && ( ! $quantifier || $quantifier eq '?') 987 && $parend < 2; 988 } 989 990 991 my $desc = ""; 992 if ($charset_mod eq 'l') { 993 $desc .= 'setlocale(LC_CTYPE, "' 994 . POSIX::setlocale(&POSIX::LC_CTYPE) 995 . '"); ' 996 } 997 $desc .= "my \$c = \"$prepend$lhs$append\"; " 998 . "my \$p = qr/$quantified/i;" 999 . "$upgrade_target$upgrade_pattern " 1000 . "\$c " . ($op ? "=~" : "!~") . " \$p; "; 1001 if ($DEBUG) { 1002 $desc .= ( 1003 "; uni_semantics=$uni_semantics, " 1004 . "should_fail=$should_fail, " 1005 . "bracketed=$bracketed, " 1006 . "prepend=$prepend, " 1007 . "append=$append, " 1008 . "parend=$parend, " 1009 . "quantifier=$quantifier, " 1010 . "l_anchor=$l_anchor, " 1011 . "r_anchor=$r_anchor; " 1012 . "pattern_above_latin1=$pattern_above_latin1; " 1013 . "utf8_pattern=$utf8_pattern" 1014 ); 1015 } 1016 1017 my $c = "$prepend$lhs_str$append"; 1018 my $p = qr/$quantified/i; 1019 utf8::upgrade($c) if length($upgrade_target); 1020 utf8::upgrade($p) if length($upgrade_pattern); 1021 my $res = $op ? ($c =~ $p): ($c !~ $p); 1022 1023 if (!$res || $list_all_tests) { 1024 # Failed or debug; output the result 1025 $count++; 1026 ok($res, "test $count - $desc"); 1027 } else { 1028 # Just count the test as passed 1029 $okays++; 1030 } 1031 $this_iteration++; 1032 } 1033 } 1034 } 1035 } 1036 } 1037 } 1038 } 1039 } 1040 } 1041 } 1042 } 1043 unless($list_all_tests) { 1044 $count++; 1045 is $okays, $this_iteration, "$okays subtests ok for" 1046 . " /$charset_mod" 1047 . (($charset_mod eq 'l') ? " ($current_locale)" : "") 1048 . ', target="' . join("", @x_target) . '",' 1049 . ' pat="' . join("", @x_pattern) . '"'; 1050 } 1051 } 1052 } 1053} 1054} 1055 1056plan($count); 1057 10581 1059