1f3efcd01Safresh1# Grind out a lot of combinatoric tests for folding. 2f3efcd01Safresh1# It uses various charset modifiers, passed in via $::TEST_CHUNK. The caller 3f3efcd01Safresh1# will also have set the locale to use if /l is the modifier. 4f3efcd01Safresh1# L is a pseudo-modifier that indicates to use the modifier /l instead, and 5f3efcd01Safresh1# the locale set by the caller is known to be UTF-8, 6f3efcd01Safresh1# T is a pseudo-modifier that indicates to use the pseudo modifier /L 7f3efcd01Safresh1# instead, and the locale set by the caller is known to be Turkic UTF-8, 8f3efcd01Safresh1 9f3efcd01Safresh1binmode STDOUT, ":utf8"; 10f3efcd01Safresh1 11f3efcd01Safresh1BEGIN { 12f3efcd01Safresh1 chdir 't' if -d 't'; 13f3efcd01Safresh1 require './test.pl'; 14f3efcd01Safresh1 set_up_inc('../lib'); 15*5486feefSafresh1 require Config; Config->import; 16f3efcd01Safresh1 skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX"); 17f3efcd01Safresh1 if ($^O eq 'dec_osf') { 18f3efcd01Safresh1 skip_all("$^O cannot handle this test"); 19f3efcd01Safresh1 } 20f3efcd01Safresh1 21f2a19305Safresh1 watchdog(5 * 60); 22f3efcd01Safresh1 require './loc_tools.pl'; 23f3efcd01Safresh1} 24f3efcd01Safresh1 25f3efcd01Safresh1use charnames ":full"; 26f3efcd01Safresh1 27f3efcd01Safresh1my $DEBUG = 0; # Outputs extra information for debugging this .t 28f3efcd01Safresh1 29f3efcd01Safresh1use strict; 30f3efcd01Safresh1use warnings; 31f3efcd01Safresh1no warnings 'locale'; # Plenty of these would otherwise get generated 32f3efcd01Safresh1use Encode; 33f3efcd01Safresh1use POSIX; 34f3efcd01Safresh1 35f3efcd01Safresh1my $charset = $::TEST_CHUNK; 36f3efcd01Safresh1my $use_turkic_rules = 0; 37f3efcd01Safresh1 38f3efcd01Safresh1if ($charset eq 'T') { 39f3efcd01Safresh1 $charset = 'L'; 40f3efcd01Safresh1 $use_turkic_rules = 1; 41f3efcd01Safresh1} 42f3efcd01Safresh1 43f2a19305Safresh1my $has_LC_CTYPE = is_category_valid('LC_CTYPE'); 44f2a19305Safresh1 45f3efcd01Safresh1# Special-cased characters in the .c's that we want to make sure get tested. 46f3efcd01Safresh1my %be_sure_to_test = ( 47f3efcd01Safresh1 chr utf8::unicode_to_native(0xDF) => 1, # LATIN_SMALL_LETTER_SHARP_S 48256a93a4Safresh1 49256a93a4Safresh1 # This is included because the uppercase occupies more bytes, but the 50256a93a4Safresh1 # first two bytes of their representations differ only in one bit, 51256a93a4Safresh1 # that could lead the code looking for shortcuts astray; you can't do 52256a93a4Safresh1 # certain shortcuts if the lengths differ 53256a93a4Safresh1 "\x{29E}" => 1, # LATIN SMALL LETTER TURNED K 54256a93a4Safresh1 55f3efcd01Safresh1 "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 56f3efcd01Safresh1 "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 57256a93a4Safresh1 58256a93a4Safresh1 # This is included because the uppercase and lowercase differ by only 59256a93a4Safresh1 # a single bit and it is in the first of the two byte representations. 60256a93a4Safresh1 # This showed that a previous way was erroneous of calculating if 61256a93a4Safresh1 # initial substrings were closely-related bit-wise. 62256a93a4Safresh1 "\x{3CC}" => 1, # GREEK SMALL LETTER OMICRON WITH TONOS 63256a93a4Safresh1 64256a93a4Safresh1 "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S 65f3efcd01Safresh1 "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 66f3efcd01Safresh1 "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 67256a93a4Safresh1 68256a93a4Safresh1 # These are included because they are adjacent and fold to the same 69256a93a4Safresh1 # result, U+01C6. This has tripped up the code in the past that 70256a93a4Safresh1 # wrongly thought that sequential code points must fold to sequential 71256a93a4Safresh1 # code points 72256a93a4Safresh1 "\x{01C4}" => 1, # LATIN CAPITAL LETTER DZ WITH CARON 73256a93a4Safresh1 "\x{01C5}" => 1, # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 74256a93a4Safresh1 75f3efcd01Safresh1 "I" => 1, 76f3efcd01Safresh1); 77f3efcd01Safresh1 78f3efcd01Safresh1# Tests both unicode and not, so make sure not implicitly testing unicode 79f3efcd01Safresh1no feature 'unicode_strings'; 80f3efcd01Safresh1 81f3efcd01Safresh1# Case-insensitive matching is a large and complicated issue. Perl does not 82f3efcd01Safresh1# implement it fully, properly. For example, it doesn't include normalization 83f3efcd01Safresh1# as part of the equation. To test every conceivable combination is clearly 84f3efcd01Safresh1# impossible; these tests are mostly drawn from visual inspection of the code 85f3efcd01Safresh1# and experience, trying to exercise all areas. 86f3efcd01Safresh1 87f3efcd01Safresh1# There are three basic ranges of characters that Perl may treat differently: 88f3efcd01Safresh1# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are 89f3efcd01Safresh1# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants 90f3efcd01Safresh1# are all controls that fold to themselves. 91f3efcd01Safresh1my $ASCII = 1; 92f3efcd01Safresh1 93f3efcd01Safresh1# 2) Other characters that fit into a byte but are different in utf8 than not; 94f3efcd01Safresh1# here referred to, taking some liberties, as Latin1. 95f3efcd01Safresh1my $Latin1 = 2; 96f3efcd01Safresh1 97f3efcd01Safresh1# 3) Characters that won't fit in a byte; here referred to as Unicode 98f3efcd01Safresh1my $Unicode = 3; 99f3efcd01Safresh1 100f3efcd01Safresh1# Within these basic groups are equivalence classes that testing any character 101f3efcd01Safresh1# in is likely to lead to the same results as any other character. This is 102f3efcd01Safresh1# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is 103f3efcd01Safresh1# set. 104f3efcd01Safresh1my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; 105f3efcd01Safresh1 106f3efcd01Safresh1# Additionally parts of this test run a lot of subtests, outputting the 107f3efcd01Safresh1# resulting TAP can be expensive so the tests are summarised internally. The 108f3efcd01Safresh1# PERL_DEBUG_FULL_TEST environment variable can be set to produce the full 109f3efcd01Safresh1# output for debugging purposes. 110f3efcd01Safresh1 111f3efcd01Safresh1sub range_type { 112f3efcd01Safresh1 my $ord = ord shift; 113f3efcd01Safresh1 114f3efcd01Safresh1 return $ASCII if utf8::native_to_unicode($ord) < 128; 115f3efcd01Safresh1 return $Latin1 if $ord < 256; 116f3efcd01Safresh1 return $Unicode; 117f3efcd01Safresh1} 118f3efcd01Safresh1 119f3efcd01Safresh1sub numerically { 120f3efcd01Safresh1 return $a <=> $b 121f3efcd01Safresh1} 122f3efcd01Safresh1 123f3efcd01Safresh1my $list_all_tests = $ENV{PERL_DEBUG_FULL_TEST} || $DEBUG; 124f3efcd01Safresh1$| = 1 if $list_all_tests; 125f3efcd01Safresh1 126f3efcd01Safresh1# Significant time is saved by not outputting each test but grouping the 127f3efcd01Safresh1# output into subtests 128f3efcd01Safresh1my $okays; # Number of ok's in current subtest 129f3efcd01Safresh1my $this_iteration; # Number of possible tests in current subtest 130f3efcd01Safresh1my $count = 0; # Number of subtests = number of total tests 131f3efcd01Safresh1 132f3efcd01Safresh1sub run_test($$$$) { 133f3efcd01Safresh1 my ($test, $todo, $do_we_output_locale_name, $debug) = @_; 134f3efcd01Safresh1 135f3efcd01Safresh1 $debug = "" unless $DEBUG; 136f3efcd01Safresh1 my $res = eval $test; 137f3efcd01Safresh1 138f3efcd01Safresh1 if ($do_we_output_locale_name) { 139f3efcd01Safresh1 $do_we_output_locale_name = 'setlocale(LC_CTYPE, "' 140f3efcd01Safresh1 . POSIX::setlocale(&POSIX::LC_CTYPE) 141f3efcd01Safresh1 . '"); '; 142f3efcd01Safresh1 } 143f3efcd01Safresh1 if (!$res || $list_all_tests) { 144f3efcd01Safresh1 # Failed or debug; output the result 145f3efcd01Safresh1 $count++; 146f3efcd01Safresh1 ok($res, "$do_we_output_locale_name$test; $debug"); 147f3efcd01Safresh1 } else { 148f3efcd01Safresh1 # Just count the test as passed 149f3efcd01Safresh1 $okays++; 150f3efcd01Safresh1 } 151f3efcd01Safresh1 $this_iteration++; 152f3efcd01Safresh1} 153f3efcd01Safresh1 154f3efcd01Safresh1my %has_test_by_participants; # Makes sure has tests for each range and each 155f3efcd01Safresh1 # number of characters that fold to the same 156f3efcd01Safresh1 # thing 157f3efcd01Safresh1my %has_test_by_byte_count; # Makes sure has tests for each combination of 158f3efcd01Safresh1 # n bytes folds to m bytes 159f3efcd01Safresh1 160f3efcd01Safresh1my %tests; # The set of tests we expect to pass. 161f3efcd01Safresh1# Each key is a code point that folds to something else. 162f3efcd01Safresh1# Each value is a list of things that the key folds to. If the 'thing' is a 163f3efcd01Safresh1# single code point, it is that ordinal. If it is a multi-char fold, it is an 164f3efcd01Safresh1# ordered list of the code points in that fold. Here's an example for 'S': 165f3efcd01Safresh1# '83' => [ 115, 383 ] 166f3efcd01Safresh1# 167f3efcd01Safresh1# And one for a multi-char fold: \xDF 168f3efcd01Safresh1# 223 => [ 169f3efcd01Safresh1# [ # 'ss' 170f3efcd01Safresh1# 83, 171f3efcd01Safresh1# 83 172f3efcd01Safresh1# ], 173f3efcd01Safresh1# [ # 'SS' 174f3efcd01Safresh1# 115, 175f3efcd01Safresh1# 115 176f3efcd01Safresh1# ], 177f3efcd01Safresh1# [ # LATIN SMALL LETTER LONG S 178f3efcd01Safresh1# 383, 179f3efcd01Safresh1# 383 180f3efcd01Safresh1# ], 181f3efcd01Safresh1# 7838 # LATIN_CAPITAL_LETTER_SHARP_S 182f3efcd01Safresh1# ], 183f3efcd01Safresh1 184f3efcd01Safresh1my %neg_tests; # Same format, but we expect these tests to fail 185f3efcd01Safresh1 186f3efcd01Safresh1my %folds; # keys are code points that fold; values are either 0 or 1 which 187f3efcd01Safresh1 # in turn are keys with their values each a list of code points the 188f3efcd01Safresh1 # code point key folds to. The folds under 1 are the ones that are 189f3efcd01Safresh1 # valid in this run; the ones under 0 are ones valid under other 190f3efcd01Safresh1 # circumstances. 191f3efcd01Safresh1 192f3efcd01Safresh1my %inverse_folds; # keys are strings of the folded-to; then come a layer of 193f3efcd01Safresh1 # 0 or 1, like %folds. The lowest values are lists of 194f3efcd01Safresh1 # characters that fold to them 195f3efcd01Safresh1 196f3efcd01Safresh1# Here's a portion of an %inverse_folds in a run where Turkic folds are not 197f3efcd01Safresh1# legal, so \x{130} doesn't fold to 'i' in this run. 198f3efcd01Safresh1# 'h' => { 199f3efcd01Safresh1# '1' => [ 200f3efcd01Safresh1# 'H' 201f3efcd01Safresh1# ] 202f3efcd01Safresh1# }, 203f3efcd01Safresh1# "h\x{331}" => { 204f3efcd01Safresh1# '1' => [ 205f3efcd01Safresh1# "\x{1e96}" 206f3efcd01Safresh1# ] 207f3efcd01Safresh1# }, 208f3efcd01Safresh1# 'i' => { 209f3efcd01Safresh1# '0' => [ 210f3efcd01Safresh1# "\x{130}" 211f3efcd01Safresh1# ], 212f3efcd01Safresh1# '1' => [ 213f3efcd01Safresh1# 'I' 214f3efcd01Safresh1# ] 215f3efcd01Safresh1# }, 216f3efcd01Safresh1# "i\x{307}" => { 217f3efcd01Safresh1# '1' => [ 218f3efcd01Safresh1# "\x{130}" 219f3efcd01Safresh1# ] 220f3efcd01Safresh1# }, 221f3efcd01Safresh1# 'j' => { 222f3efcd01Safresh1# '1' => [ 223f3efcd01Safresh1# 'J' 224f3efcd01Safresh1# ] 225f3efcd01Safresh1# }, 226f3efcd01Safresh1 227f3efcd01Safresh1sub add_test($$@) { 228f3efcd01Safresh1 my ($tests_ref, $to, @from) = @_; 229f3efcd01Safresh1 230f3efcd01Safresh1 # Called to cause the input to be tested by adding to $%tests_ref. @from 231f3efcd01Safresh1 # is the list of characters that fold to the string $to. @from should be 232f3efcd01Safresh1 # sorted so the lowest code point is first.... 233f3efcd01Safresh1 # The input is in string form; %tests uses code points, so have to 234f3efcd01Safresh1 # convert. 235f3efcd01Safresh1 236f3efcd01Safresh1 my $to_chars = length $to; 237f3efcd01Safresh1 my @test_to; # List of tests for $to 238f3efcd01Safresh1 239f3efcd01Safresh1 if ($to_chars == 1) { 240f3efcd01Safresh1 @test_to = ord $to; 241f3efcd01Safresh1 } 242f3efcd01Safresh1 else { 243f3efcd01Safresh1 push @test_to, [ map { ord $_ } split "", $to ]; 244f3efcd01Safresh1 245f3efcd01Safresh1 # For multi-char folds, we also test that things that can fold to each 246f3efcd01Safresh1 # individual character in the fold also work. If we were testing 247f3efcd01Safresh1 # comprehensively, we would try every combination of upper and lower 248f3efcd01Safresh1 # case in the fold, but it will have to suffice to avoid running 249f3efcd01Safresh1 # forever to make sure that each thing that folds to these is tested 250f3efcd01Safresh1 # at least once. Because of complement matching ([^...]), we need to 251f3efcd01Safresh1 # do both the folded, and the folded-from. 252f3efcd01Safresh1 # We first look at each character in the multi-char fold, and save how 253f3efcd01Safresh1 # many characters fold to it; and also the maximum number of such 254f3efcd01Safresh1 # folds 255f3efcd01Safresh1 my @folds_to_count; # 0th char in fold is index 0 ... 256f3efcd01Safresh1 my $max_folds_to = 0; 257f3efcd01Safresh1 258f3efcd01Safresh1 for (my $i = 0; $i < $to_chars; $i++) { 259f3efcd01Safresh1 my $to_char = substr($to, $i, 1); 260f3efcd01Safresh1 if (exists $inverse_folds{$to_char}{1}) { 261f3efcd01Safresh1 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}{1}}; 262f3efcd01Safresh1 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i]; 263f3efcd01Safresh1 } 264f3efcd01Safresh1 else { 265f3efcd01Safresh1 $folds_to_count[$i] = 0; 266f3efcd01Safresh1 } 267f3efcd01Safresh1 } 268f3efcd01Safresh1 269f3efcd01Safresh1 # We will need to generate as many tests as the maximum number of 270f3efcd01Safresh1 # folds, so that each fold will have at least one test. 271f3efcd01Safresh1 # For example, consider character X which folds to the three character 272f3efcd01Safresh1 # string 'xyz'. If 2 things fold to x (X and x), 4 to y (Y, Y' 273f3efcd01Safresh1 # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4 274f3efcd01Safresh1 # tests will be generated: 275f3efcd01Safresh1 # xyz 276f3efcd01Safresh1 # XYz 277f3efcd01Safresh1 # xY'z 278f3efcd01Safresh1 # xY''z 279f3efcd01Safresh1 for (my $i = 0; $i < $max_folds_to; $i++) { 280f3efcd01Safresh1 my @this_test_to; # Assemble a single test 281f3efcd01Safresh1 282f3efcd01Safresh1 # For each character in the multi-char fold ... 283f3efcd01Safresh1 for (my $j = 0; $j < $to_chars; $j++) { 284f3efcd01Safresh1 my $this_char = substr($to, $j, 1); 285f3efcd01Safresh1 286f3efcd01Safresh1 # Use its corresponding inverse fold, if available. 287f3efcd01Safresh1 if ( $i < $folds_to_count[$j] 288f3efcd01Safresh1 && exists $inverse_folds{$this_char}{1}) 289f3efcd01Safresh1 { 290f3efcd01Safresh1 push @this_test_to, ord $inverse_folds{$this_char}{1}[$i]; 291f3efcd01Safresh1 } 292f3efcd01Safresh1 else { # Or else itself. 293f3efcd01Safresh1 push @this_test_to, ord $this_char; 294f3efcd01Safresh1 } 295f3efcd01Safresh1 } 296f3efcd01Safresh1 297f3efcd01Safresh1 # Add this test to the list 298f3efcd01Safresh1 push @test_to, [ @this_test_to ]; 299f3efcd01Safresh1 } 300f3efcd01Safresh1 301f3efcd01Safresh1 # Here, have assembled all the tests for the multi-char fold. Sort so 302f3efcd01Safresh1 # lowest code points are first for consistency and aesthetics in 303f3efcd01Safresh1 # output. We know there are at least two characters in the fold, but 304f3efcd01Safresh1 # I haven't bothered to worry about sorting on an optional third 305f3efcd01Safresh1 # character if the first two are identical. 306f3efcd01Safresh1 @test_to = sort { ($a->[0] == $b->[0]) 307f3efcd01Safresh1 ? $a->[1] <=> $b->[1] 308f3efcd01Safresh1 : $a->[0] <=> $b->[0] 309f3efcd01Safresh1 } @test_to; 310f3efcd01Safresh1 } 311f3efcd01Safresh1 312f3efcd01Safresh1 313f3efcd01Safresh1 # This test is from n bytes to m bytes. Record that so won't try to add 314f3efcd01Safresh1 # another test that does the same. 315f3efcd01Safresh1 use bytes; 316f3efcd01Safresh1 my $to_bytes = length $to; 317f3efcd01Safresh1 foreach my $from_map (@from) { 318f3efcd01Safresh1 $has_test_by_byte_count{length $from_map}{$to_bytes} = $to; 319f3efcd01Safresh1 } 320f3efcd01Safresh1 no bytes; 321f3efcd01Safresh1 322f3efcd01Safresh1 my $ord_smallest_from = ord shift @from; 323f3efcd01Safresh1 if (exists $tests_ref->{$ord_smallest_from}) { 324f3efcd01Safresh1 die "There are already tests for $ord_smallest_from" 325f3efcd01Safresh1 }; 326f3efcd01Safresh1 327f3efcd01Safresh1 # Add in the fold tests, 328f3efcd01Safresh1 push @{$tests_ref->{$ord_smallest_from}}, @test_to; 329f3efcd01Safresh1 330f3efcd01Safresh1 # Then any remaining froms in the equivalence class. 331f3efcd01Safresh1 push @{$tests_ref->{$ord_smallest_from}}, map { ord $_ } @from; 332f3efcd01Safresh1} 333f3efcd01Safresh1 334f3efcd01Safresh1# Get the Unicode rules and construct inverse mappings from them 335f3efcd01Safresh1 336f3efcd01Safresh1use Unicode::UCD; 337f3efcd01Safresh1my $file="../lib/unicore/CaseFolding.txt"; 338f3efcd01Safresh1 339f3efcd01Safresh1# Use the Unicode data file if we are on an ASCII platform (which its data is 340f3efcd01Safresh1# for), and it is in the modern format (starting in Unicode 3.1.0) and it is 341f3efcd01Safresh1# available. This avoids being affected by potential bugs introduced by other 342f3efcd01Safresh1# layers of Perl 343f3efcd01Safresh1if ($::IS_ASCII 344f3efcd01Safresh1 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 345f3efcd01Safresh1 && open my $fh, "<", $file) 346f3efcd01Safresh1{ 347f3efcd01Safresh1 # We process the file in reverse order because its easier to see the T 348f3efcd01Safresh1 # entry first and then know that the next line we process is the 349f3efcd01Safresh1 # corresponding one for non-T. 350f3efcd01Safresh1 my @rules = <$fh>; 351f3efcd01Safresh1 my $prev_was_turkic = 0; 352f3efcd01Safresh1 while (defined ($_ = pop @rules)) { 353f3efcd01Safresh1 chomp; 354f3efcd01Safresh1 355f3efcd01Safresh1 # Lines look like (though without the initial '#') 356f3efcd01Safresh1 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 357f3efcd01Safresh1 358f3efcd01Safresh1 # Get rid of comments, ignore blank or comment-only lines 359f3efcd01Safresh1 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx; 360f3efcd01Safresh1 next unless length $line; 361f3efcd01Safresh1 my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line; 362f3efcd01Safresh1 363f3efcd01Safresh1 next if $fold_type eq 'S'; # If Unicode's tables are correct, the F 364f3efcd01Safresh1 # should be a superset of S 365f3efcd01Safresh1 next if $fold_type eq 'I'; # Perl doesn't do old Turkish folding 366f3efcd01Safresh1 367f3efcd01Safresh1 my $test_type; 368f3efcd01Safresh1 if ($fold_type eq 'T') { 369f3efcd01Safresh1 $test_type = 0 + $use_turkic_rules; 370f3efcd01Safresh1 $prev_was_turkic = 1; 371f3efcd01Safresh1 } 372f3efcd01Safresh1 elsif ($prev_was_turkic) { 373f3efcd01Safresh1 $test_type = 0 + ! $use_turkic_rules; 374f3efcd01Safresh1 $prev_was_turkic = 0; 375f3efcd01Safresh1 } 376f3efcd01Safresh1 else { 377f3efcd01Safresh1 $test_type = 1; 378f3efcd01Safresh1 $prev_was_turkic = 0; 379f3efcd01Safresh1 } 380f3efcd01Safresh1 381f3efcd01Safresh1 my $from = hex $hex_from; 382f3efcd01Safresh1 my @to = map { hex $_ } @hex_folded; 383f3efcd01Safresh1 push @{$folds{$from}{$test_type}}, @to; 384f3efcd01Safresh1 385f3efcd01Safresh1 my $folded_str = pack ("U0U*", @to); 386f3efcd01Safresh1 push @{$inverse_folds{$folded_str}{$test_type}}, chr $from; 387f3efcd01Safresh1 } 388f3efcd01Safresh1} 389f3efcd01Safresh1else { # Here, can't use the .txt file: read the Unicode rules file and 390f3efcd01Safresh1 # construct inverse mappings from it 391f3efcd01Safresh1 392f3efcd01Safresh1 skip_all "Don't know how to generate turkic rules on this platform" 393f3efcd01Safresh1 if $use_turkic_rules; 394f3efcd01Safresh1 my ($invlist_ref, $invmap_ref, undef, $default) 395f3efcd01Safresh1 = Unicode::UCD::prop_invmap('Case_Folding'); 396f3efcd01Safresh1 for my $i (0 .. @$invlist_ref - 1 - 1) { 397f3efcd01Safresh1 next if $invmap_ref->[$i] == $default; 398f3efcd01Safresh1 399f3efcd01Safresh1 # Make into an array if not so already, so can treat uniformly below 400f3efcd01Safresh1 $invmap_ref->[$i] = [ $invmap_ref->[$i] ] if ! ref $invmap_ref->[$i]; 401f3efcd01Safresh1 402f3efcd01Safresh1 # Each subsequent element of the range requires adjustment of +1 from 403f3efcd01Safresh1 # the previous element 404f3efcd01Safresh1 my $adjust = -1; 405f3efcd01Safresh1 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { 406f3efcd01Safresh1 $adjust++; 407f3efcd01Safresh1 my @to = map { $_ + $adjust } @{$invmap_ref->[$i]}; 408f3efcd01Safresh1 push @{$folds{$j}{1}}, @to; 409f3efcd01Safresh1 my $folded_str = join "", map { chr } @to; 410f3efcd01Safresh1 utf8::upgrade($folded_str); 411f3efcd01Safresh1 #note (sprintf "%d: %04X: %s", __LINE__, $j, join " ", 412f3efcd01Safresh1 # map { sprintf "%04X", $_ + $adjust } @{$invmap_ref->[$i]}); 413f3efcd01Safresh1 push @{$inverse_folds{$folded_str}{1}}, chr $j; 414f3efcd01Safresh1 } 415f3efcd01Safresh1 } 416f3efcd01Safresh1} 417f3efcd01Safresh1 418f3efcd01Safresh1# Analyze the data and generate tests to get adequate test coverage. We sort 419f3efcd01Safresh1# things so that smallest code points are done first. 420f3efcd01Safresh1foreach my $to (sort { $a cmp $b } keys %inverse_folds) 421f3efcd01Safresh1{ 422f3efcd01Safresh1TO: 423f3efcd01Safresh1 foreach my $tests_ref (\%tests, \%neg_tests) { 424f3efcd01Safresh1 my $test_type = ($tests_ref == \%tests) ? 1 : 0; 425f3efcd01Safresh1 426f3efcd01Safresh1 next unless exists $inverse_folds{$to}{$test_type}; 427f3efcd01Safresh1 428f3efcd01Safresh1 # Within each fold, sort so that the smallest code points are done first 429f3efcd01Safresh1 @{$inverse_folds{$to}{$test_type}} = sort { $a cmp $b } @{$inverse_folds{$to}{$test_type}}; 430f3efcd01Safresh1 my @from = @{$inverse_folds{$to}{$test_type}}; 431f3efcd01Safresh1 432f3efcd01Safresh1 # Just add it to the tests if doing complete coverage 433f3efcd01Safresh1 if (! $skip_apparently_redundant) { 434f3efcd01Safresh1 add_test($tests_ref, $to, @from); 435f3efcd01Safresh1 next TO; 436f3efcd01Safresh1 } 437f3efcd01Safresh1 438f3efcd01Safresh1 my $to_chars = length $to; 439f3efcd01Safresh1 my $to_range_type = range_type(substr($to, 0, 1)); 440f3efcd01Safresh1 441f3efcd01Safresh1 # If this is required to be tested, do so. We check for these first, as 442f3efcd01Safresh1 # they will take up slots of byte-to-byte combinations that we otherwise 443f3efcd01Safresh1 # would have to have other tests to get. 444f3efcd01Safresh1 foreach my $from_map (@from) { 445f3efcd01Safresh1 if (exists $be_sure_to_test{$from_map}) { 446f3efcd01Safresh1 add_test($tests_ref, $to, @from); 447f3efcd01Safresh1 next TO; 448f3efcd01Safresh1 } 449f3efcd01Safresh1 } 450f3efcd01Safresh1 451f3efcd01Safresh1 # If the fold contains heterogeneous range types, is suspect and should be 452f3efcd01Safresh1 # tested. 453f3efcd01Safresh1 if ($to_chars > 1) { 454f3efcd01Safresh1 foreach my $char (split "", $to) { 455f3efcd01Safresh1 if (range_type($char) != $to_range_type) { 456f3efcd01Safresh1 add_test($tests_ref, $to, @from); 457f3efcd01Safresh1 next TO; 458f3efcd01Safresh1 } 459f3efcd01Safresh1 } 460f3efcd01Safresh1 } 461f3efcd01Safresh1 462f3efcd01Safresh1 # If the mapping crosses range types, is suspect and should be tested 463f3efcd01Safresh1 foreach my $from_map (@from) { 464f3efcd01Safresh1 if (range_type($from_map) != $to_range_type) { 465f3efcd01Safresh1 add_test($tests_ref, $to, @from); 466f3efcd01Safresh1 next TO; 467f3efcd01Safresh1 } 468f3efcd01Safresh1 } 469f3efcd01Safresh1 470f3efcd01Safresh1 # Here, all components of the mapping are in the same range type. For 471f3efcd01Safresh1 # single character folds, we test one case in each range type that has 2 472f3efcd01Safresh1 # particpants, 3 particpants, etc. 473f3efcd01Safresh1 if ($to_chars == 1) { 474f3efcd01Safresh1 if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) { 475f3efcd01Safresh1 add_test($tests_ref, $to, @from); 476f3efcd01Safresh1 $has_test_by_participants{scalar @from}{$to_range_type} = $to; 477f3efcd01Safresh1 next TO; 478f3efcd01Safresh1 } 479f3efcd01Safresh1 } 480f3efcd01Safresh1 481f3efcd01Safresh1 # We also test all combinations of mappings from m to n bytes. This is 482f3efcd01Safresh1 # because the regex optimizer cares. (Don't bother worrying about that 483f3efcd01Safresh1 # Latin1 chars will occupy a different number of bytes under utf8, as 484f3efcd01Safresh1 # there are plenty of other cases that catch these byte numbers.) 485f3efcd01Safresh1 use bytes; 486f3efcd01Safresh1 my $to_bytes = length $to; 487f3efcd01Safresh1 foreach my $from_map (@from) { 488f3efcd01Safresh1 if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) { 489f3efcd01Safresh1 add_test($tests_ref, $to, @from); 490f3efcd01Safresh1 next TO; 491f3efcd01Safresh1 } 492f3efcd01Safresh1 } 493f3efcd01Safresh1 } 494f3efcd01Safresh1} 495f3efcd01Safresh1 496f3efcd01Safresh1# For each range type, test additionally a character that folds to itself 497f3efcd01Safresh1add_test(\%tests, ":", ":"); 498f3efcd01Safresh1add_test(\%tests, chr utf8::unicode_to_native(0xF7), chr utf8::unicode_to_native(0xF7)); 499f3efcd01Safresh1add_test(\%tests, chr 0x2C7, chr 0x2C7); 500f3efcd01Safresh1 501f3efcd01Safresh1# To cut down on the number of tests 502f3efcd01Safresh1my $has_tested_aa_above_latin1; 503f3efcd01Safresh1my $has_tested_latin1_aa; 504f3efcd01Safresh1my $has_tested_ascii_aa; 505f3efcd01Safresh1my $has_tested_l_above_latin1; 506f3efcd01Safresh1my $has_tested_above_latin1_l; 507f3efcd01Safresh1my $has_tested_ascii_l; 508f3efcd01Safresh1my $has_tested_above_latin1_d; 509f3efcd01Safresh1my $has_tested_ascii_d; 510f3efcd01Safresh1my $has_tested_non_latin1_d; 511f3efcd01Safresh1my $has_tested_above_latin1_a; 512f3efcd01Safresh1my $has_tested_ascii_a; 513f3efcd01Safresh1my $has_tested_non_latin1_a; 514f3efcd01Safresh1 515f3efcd01Safresh1# For use by pairs() in generating combinations 516f3efcd01Safresh1sub prefix { 517f3efcd01Safresh1 my $p = shift; 518f3efcd01Safresh1 map [ $p, $_ ], @_ 519f3efcd01Safresh1} 520f3efcd01Safresh1 521f3efcd01Safresh1# Returns all ordered combinations of pairs of elements from the input array. 522f3efcd01Safresh1# It doesn't return pairs like (a, a), (b, b). Change the slice to an array 523f3efcd01Safresh1# to do that. This was just to have fewer tests. 524f3efcd01Safresh1sub pairs (@) { 525f3efcd01Safresh1 #print STDERR __LINE__, ": ", join(" XXX ", map { sprintf "%04X", $_ } @_), "\n"; 526f3efcd01Safresh1 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ 527f3efcd01Safresh1} 528f3efcd01Safresh1 529f3efcd01Safresh1# Finally ready to do the tests 530f3efcd01Safresh1foreach my $tests_ref (\%neg_tests, \%tests) { 531f3efcd01Safresh1foreach my $test (sort { numerically } keys %{$tests_ref}) { 532f3efcd01Safresh1 533f3efcd01Safresh1 my $previous_target; 534f3efcd01Safresh1 my $previous_pattern; 535f3efcd01Safresh1 my @pairs = pairs(sort numerically $test, @{$tests_ref->{$test}}); 536f3efcd01Safresh1 537f3efcd01Safresh1 # Each fold can be viewed as a closure of all the characters that 538f3efcd01Safresh1 # participate in it. Look at each possible pairing from a closure, with the 539f3efcd01Safresh1 # first member of the pair the target string to match against, and the 540f3efcd01Safresh1 # second member forming the pattern. Thus each fold member gets tested as 541f3efcd01Safresh1 # the string, and the pattern with every other member in the opposite role. 542f3efcd01Safresh1 while (my $pair = shift @pairs) { 543f3efcd01Safresh1 my ($target, $pattern) = @$pair; 544f3efcd01Safresh1 545f3efcd01Safresh1 # When testing a char that doesn't fold, we can get the same 546f3efcd01Safresh1 # permutation twice; so skip all but the first. 547f3efcd01Safresh1 next if $previous_target 548f3efcd01Safresh1 && $previous_target == $target 549f3efcd01Safresh1 && $previous_pattern == $pattern; 550f3efcd01Safresh1 ($previous_target, $previous_pattern) = ($target, $pattern); 551f3efcd01Safresh1 552f3efcd01Safresh1 # Each side may be either a single char or a string. Extract each into an 553f3efcd01Safresh1 # array (perhaps of length 1) 554f3efcd01Safresh1 my @target, my @pattern; 555f3efcd01Safresh1 @target = (ref $target) ? @$target : $target; 556f3efcd01Safresh1 @pattern = (ref $pattern) ? @$pattern : $pattern; 557f3efcd01Safresh1 558f3efcd01Safresh1 # We are testing just folds to/from a single character. If our pairs 559f3efcd01Safresh1 # happens to generate multi/multi, skip. 560f3efcd01Safresh1 next if @target > 1 && @pattern > 1; 561f3efcd01Safresh1 562f3efcd01Safresh1 # Get in hex form. 563f3efcd01Safresh1 my @x_target = map { sprintf "\\x{%04X}", $_ } @target; 564f3efcd01Safresh1 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern; 565f3efcd01Safresh1 566f3efcd01Safresh1 my $target_above_latin1 = grep { $_ > 255 } @target; 567f3efcd01Safresh1 my $pattern_above_latin1 = grep { $_ > 255 } @pattern; 568f3efcd01Safresh1 my $target_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @target; 569f3efcd01Safresh1 my $pattern_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @pattern; 570f3efcd01Safresh1 my $target_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @target; 571f3efcd01Safresh1 my $pattern_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @pattern; 572f3efcd01Safresh1 my $target_has_latin1 = grep { $_ < 256 } @target; 573f3efcd01Safresh1 my $target_has_upper_latin1 574f3efcd01Safresh1 = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @target; 575f3efcd01Safresh1 my $pattern_has_upper_latin1 576f3efcd01Safresh1 = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @pattern; 577f3efcd01Safresh1 my $pattern_has_latin1 = grep { $_ < 256 } @pattern; 578f3efcd01Safresh1 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; 579f3efcd01Safresh1 580f3efcd01Safresh1 # We don't test multi-char folding into other multi-chars. We are testing 581f3efcd01Safresh1 # a code point that folds to or from other characters. Find the single 582f3efcd01Safresh1 # code point for diagnostic purposes. (If both are single, choose the 583f3efcd01Safresh1 # target string) 584f3efcd01Safresh1 my $ord = @target == 1 ? $target[0] : $pattern[0]; 585f3efcd01Safresh1 my $progress = sprintf "%04X: \"%s\" and /%s/", 586f3efcd01Safresh1 $test, 587f3efcd01Safresh1 join("", @x_target), 588f3efcd01Safresh1 join("", @x_pattern); 589f3efcd01Safresh1 #note $progress; 590f3efcd01Safresh1 591f3efcd01Safresh1 # Now grind out tests, using various combinations. 592f3efcd01Safresh1 { 593f3efcd01Safresh1 my $charset_mod = lc $charset; 594f2a19305Safresh1 my $current_locale = ($has_LC_CTYPE) 595f2a19305Safresh1 ? setlocale(&POSIX::LC_CTYPE) 596f2a19305Safresh1 : 'C'; 597f3efcd01Safresh1 $current_locale = 'C locale' if $current_locale eq 'C'; 598f3efcd01Safresh1 $okays = 0; 599f3efcd01Safresh1 $this_iteration = 0; 600f3efcd01Safresh1 601f3efcd01Safresh1 # To cut down somewhat on the enormous quantity of tests this currently 602f3efcd01Safresh1 # runs, skip some for some of the character sets whose results aren't 603f3efcd01Safresh1 # likely to differ from others. But run all tests on the code points 604f3efcd01Safresh1 # that don't fold, plus one other set in each range group. 605f3efcd01Safresh1 if (! $is_self) { 606f3efcd01Safresh1 607f3efcd01Safresh1 # /aa should only affect things with folds in the ASCII range. But, try 608f3efcd01Safresh1 # it on one set in the other ranges just to make sure it doesn't break 609f3efcd01Safresh1 # them. 610f3efcd01Safresh1 if ($charset eq 'aa') { 611f3efcd01Safresh1 612f3efcd01Safresh1 # It may be that this $pair of code points to test are both 613f3efcd01Safresh1 # non-ascii, but if either of them actually fold to ascii, that is 614f3efcd01Safresh1 # suspect and should be tested. So for /aa, use whether their folds 615f3efcd01Safresh1 # are ascii or not 616f3efcd01Safresh1 my $target_has_ascii = $target_has_ascii; 617f3efcd01Safresh1 my $pattern_has_ascii = $pattern_has_ascii; 618f3efcd01Safresh1 if (! $target_has_ascii) { 619f3efcd01Safresh1 foreach my $cp (@target) { 620f3efcd01Safresh1 if (exists $folds{$cp}{1} 621f3efcd01Safresh1 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}{1}} ) 622f3efcd01Safresh1 { 623f3efcd01Safresh1 $target_has_ascii = 1; 624f3efcd01Safresh1 last; 625f3efcd01Safresh1 } 626f3efcd01Safresh1 } 627f3efcd01Safresh1 } 628f3efcd01Safresh1 if (! $pattern_has_ascii) { 629f3efcd01Safresh1 foreach my $cp (@pattern) { 630f3efcd01Safresh1 if (exists $folds{$cp}{1} 631f3efcd01Safresh1 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}}{1} ) 632f3efcd01Safresh1 { 633f3efcd01Safresh1 $pattern_has_ascii = 1; 634f3efcd01Safresh1 last; 635f3efcd01Safresh1 } 636f3efcd01Safresh1 } 637f3efcd01Safresh1 } 638f3efcd01Safresh1 639f3efcd01Safresh1 if (! $target_has_ascii && ! $pattern_has_ascii) { 640f3efcd01Safresh1 if ($target_above_latin1 || $pattern_above_latin1) { 641f3efcd01Safresh1 next if defined $has_tested_aa_above_latin1 642f3efcd01Safresh1 && $has_tested_aa_above_latin1 != $test; 643f3efcd01Safresh1 $has_tested_aa_above_latin1 = $test; 644f3efcd01Safresh1 } 645f3efcd01Safresh1 next if defined $has_tested_latin1_aa 646f3efcd01Safresh1 && $has_tested_latin1_aa != $test; 647f3efcd01Safresh1 $has_tested_latin1_aa = $test; 648f3efcd01Safresh1 } 649f3efcd01Safresh1 elsif ($target_only_ascii && $pattern_only_ascii) { 650f3efcd01Safresh1 651f3efcd01Safresh1 # And, except for one set just to make sure, skip tests 652f3efcd01Safresh1 # where both elements in the pair are ASCII. If one works for 653f3efcd01Safresh1 # aa, the others are likely too. This skips tests where the 654f3efcd01Safresh1 # fold is from non-ASCII to ASCII, but this part of the test 655f3efcd01Safresh1 # is just about the ASCII components. 656f3efcd01Safresh1 next if defined $has_tested_ascii_l 657f3efcd01Safresh1 && $has_tested_ascii_l != $test; 658f3efcd01Safresh1 $has_tested_ascii_l = $test; 659f3efcd01Safresh1 } 660f3efcd01Safresh1 } 661f3efcd01Safresh1 elsif ($charset eq 'l') { 662f3efcd01Safresh1 663f3efcd01Safresh1 # For l, don't need to test beyond one set those things that are 664f3efcd01Safresh1 # all above latin1, because unlikely to have different successes 665f3efcd01Safresh1 # than /u. But, for the same reason as described in the /aa above, 666f3efcd01Safresh1 # it is suspect and should be tested, if either of the folds are to 667f3efcd01Safresh1 # latin1. 668f3efcd01Safresh1 my $target_has_latin1 = $target_has_latin1; 669f3efcd01Safresh1 my $pattern_has_latin1 = $pattern_has_latin1; 670f3efcd01Safresh1 if (! $target_has_latin1) { 671f3efcd01Safresh1 foreach my $cp (@target) { 672f3efcd01Safresh1 if (exists $folds{$cp}{1} 673f3efcd01Safresh1 && grep { $_ < 256 } @{$folds{$cp}{1}} ) 674f3efcd01Safresh1 { 675f3efcd01Safresh1 $target_has_latin1 = 1; 676f3efcd01Safresh1 last; 677f3efcd01Safresh1 } 678f3efcd01Safresh1 } 679f3efcd01Safresh1 } 680f3efcd01Safresh1 if (! $pattern_has_latin1) { 681f3efcd01Safresh1 foreach my $cp (@pattern) { 682f3efcd01Safresh1 if (exists $folds{$cp}{1} 683f3efcd01Safresh1 && grep { $_ < 256 } @{$folds{$cp}{1}} ) 684f3efcd01Safresh1 { 685f3efcd01Safresh1 $pattern_has_latin1 = 1; 686f3efcd01Safresh1 last; 687f3efcd01Safresh1 } 688f3efcd01Safresh1 } 689f3efcd01Safresh1 } 690f3efcd01Safresh1 if (! $target_has_latin1 && ! $pattern_has_latin1) { 691f3efcd01Safresh1 next if defined $has_tested_above_latin1_l 692f3efcd01Safresh1 && $has_tested_above_latin1_l != $test; 693f3efcd01Safresh1 $has_tested_above_latin1_l = $test; 694f3efcd01Safresh1 } 695f3efcd01Safresh1 elsif ($target_only_ascii && $pattern_only_ascii) { 696f3efcd01Safresh1 697f3efcd01Safresh1 # And, except for one set just to make sure, skip tests 698f3efcd01Safresh1 # where both elements in the pair are ASCII. This is 699f3efcd01Safresh1 # essentially the same reasoning as above for /aa. 700f3efcd01Safresh1 next if defined $has_tested_ascii_l 701f3efcd01Safresh1 && $has_tested_ascii_l != $test; 702f3efcd01Safresh1 $has_tested_ascii_l = $test; 703f3efcd01Safresh1 } 704f3efcd01Safresh1 } 705f3efcd01Safresh1 elsif ($charset eq 'd') { 706f3efcd01Safresh1 # Similarly for d. Beyond one test (besides self) each, we don't 707f3efcd01Safresh1 # test pairs that are both ascii; or both above latin1, or are 708f3efcd01Safresh1 # combinations of ascii and above latin1. 709f3efcd01Safresh1 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { 710f3efcd01Safresh1 if ($target_has_ascii && $pattern_has_ascii) { 711f3efcd01Safresh1 next if defined $has_tested_ascii_d 712f3efcd01Safresh1 && $has_tested_ascii_d != $test; 713f3efcd01Safresh1 $has_tested_ascii_d = $test 714f3efcd01Safresh1 } 715f3efcd01Safresh1 elsif (! $target_has_latin1 && ! $pattern_has_latin1) { 716f3efcd01Safresh1 next if defined $has_tested_above_latin1_d 717f3efcd01Safresh1 && $has_tested_above_latin1_d != $test; 718f3efcd01Safresh1 $has_tested_above_latin1_d = $test; 719f3efcd01Safresh1 } 720f3efcd01Safresh1 else { 721f3efcd01Safresh1 next if defined $has_tested_non_latin1_d 722f3efcd01Safresh1 && $has_tested_non_latin1_d != $test; 723f3efcd01Safresh1 $has_tested_non_latin1_d = $test; 724f3efcd01Safresh1 } 725f3efcd01Safresh1 } 726f3efcd01Safresh1 } 727f3efcd01Safresh1 elsif ($charset eq 'a') { 728f3efcd01Safresh1 # Similarly for a. This should match identically to /u, so wasn't 729f3efcd01Safresh1 # tested at all until a bug was found that was thereby missed. 730f3efcd01Safresh1 # As a compromise, beyond one test (besides self) each, we don't 731f3efcd01Safresh1 # test pairs that are both ascii; or both above latin1, or are 732f3efcd01Safresh1 # combinations of ascii and above latin1. 733f3efcd01Safresh1 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { 734f3efcd01Safresh1 if ($target_has_ascii && $pattern_has_ascii) { 735f3efcd01Safresh1 next if defined $has_tested_ascii_a 736f3efcd01Safresh1 && $has_tested_ascii_a != $test; 737f3efcd01Safresh1 $has_tested_ascii_a = $test 738f3efcd01Safresh1 } 739f3efcd01Safresh1 elsif (! $target_has_latin1 && ! $pattern_has_latin1) { 740f3efcd01Safresh1 next if defined $has_tested_above_latin1_a 741f3efcd01Safresh1 && $has_tested_above_latin1_a != $test; 742f3efcd01Safresh1 $has_tested_above_latin1_a = $test; 743f3efcd01Safresh1 } 744f3efcd01Safresh1 else { 745f3efcd01Safresh1 next if defined $has_tested_non_latin1_a 746f3efcd01Safresh1 && $has_tested_non_latin1_a != $test; 747f3efcd01Safresh1 $has_tested_non_latin1_a = $test; 748f3efcd01Safresh1 } 749f3efcd01Safresh1 } 750f3efcd01Safresh1 } 751f3efcd01Safresh1 } 752f3efcd01Safresh1 753f3efcd01Safresh1 foreach my $utf8_target (0, 1) { # Both utf8 and not, for 754f3efcd01Safresh1 # code points < 256 755f3efcd01Safresh1 my $upgrade_target = ""; 756f3efcd01Safresh1 757f3efcd01Safresh1 # These must already be in utf8 because the string to match has 758f3efcd01Safresh1 # something above latin1. So impossible to test if to not to be in 759f3efcd01Safresh1 # utf8; and otherwise, no upgrade is needed. 760f3efcd01Safresh1 next if $target_above_latin1 && ! $utf8_target; 761f3efcd01Safresh1 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target; 762f3efcd01Safresh1 763f3efcd01Safresh1 foreach my $utf8_pattern (0, 1) { 764f3efcd01Safresh1 next if $pattern_above_latin1 && ! $utf8_pattern; 765f3efcd01Safresh1 766f3efcd01Safresh1 # Our testing of 'l' uses the POSIX locale, which is ASCII-only 767f3efcd01Safresh1 my $uni_semantics = $charset ne 'l' && ( $utf8_target 768f3efcd01Safresh1 || $charset eq 'u' 769f3efcd01Safresh1 || $charset eq 'L' 770f3efcd01Safresh1 || ($charset eq 'd' && $utf8_pattern) 771f3efcd01Safresh1 || $charset =~ /a/); 772f3efcd01Safresh1 my $upgrade_pattern = ""; 773f2a19305Safresh1 $upgrade_pattern = ' utf8::upgrade($rhs);' 774f2a19305Safresh1 if ! $pattern_above_latin1 && $utf8_pattern; 775f3efcd01Safresh1 776f3efcd01Safresh1 my $lhs = join "", @x_target; 777f3efcd01Safresh1 my $lhs_str = eval qq{"$lhs"}; fail($@) if $@; 778f3efcd01Safresh1 my @rhs = @x_pattern; 779f3efcd01Safresh1 my $rhs = join "", @rhs; 780f3efcd01Safresh1 781f3efcd01Safresh1 # Unicode created a folding rule that partially emulates what 782f3efcd01Safresh1 # happens in a Turkish locale, by using combining characters. The 783f3efcd01Safresh1 # result is close enough to what really should happen, that it can 784f3efcd01Safresh1 # pass many of the tests, but not all. So, if we have a rule that 785f3efcd01Safresh1 # is expecting failure, it may pass instead. The code in the block 786f3efcd01Safresh1 # below is good enough for skipping the tests, and khw tried to make 787f3efcd01Safresh1 # it general, but should the rules be revised (unlikely at this 788f3efcd01Safresh1 # point), this might need to be tweaked. 789f3efcd01Safresh1 if ($tests_ref == \%neg_tests) { 790f3efcd01Safresh1 my ($shorter_ref, $longer_ref); 791f3efcd01Safresh1 792f3efcd01Safresh1 # Convert the $rhs to a string, like we already did for the lhs 793f3efcd01Safresh1 my $rhs_str = eval qq{"$rhs"}; fail($@) if $@; 794f3efcd01Safresh1 795f3efcd01Safresh1 # If the lengths of the two sides are equal, we don't want to do 796f3efcd01Safresh1 # this; this is only to bypass the combining characters affecting 797f3efcd01Safresh1 # things 798f3efcd01Safresh1 if (length $lhs_str != length $rhs_str) { 799f3efcd01Safresh1 800f3efcd01Safresh1 # Find the shorter and longer of the pair 801f3efcd01Safresh1 if (length $lhs_str < length $rhs_str) { 802f3efcd01Safresh1 $shorter_ref = \$lhs_str; 803f3efcd01Safresh1 $longer_ref = \$rhs_str; 804f3efcd01Safresh1 } 805f3efcd01Safresh1 else { 806f3efcd01Safresh1 $shorter_ref = \$rhs_str; 807f3efcd01Safresh1 $longer_ref = \$lhs_str; 808f3efcd01Safresh1 } 809f3efcd01Safresh1 810f3efcd01Safresh1 # If the shorter string is entirely contained in the longer, we 811f3efcd01Safresh1 # have generated a test that is likely to succeed, and the 812f3efcd01Safresh1 # reasons it would fail have nothing to do with folding. But we 813f3efcd01Safresh1 # are expecting it to fail, and so our test is invalid. Skip 814f3efcd01Safresh1 # it. 815f3efcd01Safresh1 next if index($$longer_ref, $$shorter_ref) >= 0; 816f3efcd01Safresh1 817f3efcd01Safresh1 818f3efcd01Safresh1 # The above eliminates about half the failure cases. This gets 819f3efcd01Safresh1 # the rest. If the shorter string is a single character and has 820f3efcd01Safresh1 # a fold legal in this run to a character that is in the longer 821f3efcd01Safresh1 # string, it is also likely to succeed under /i. So again our 822f3efcd01Safresh1 # computed test is bogus. 823f3efcd01Safresh1 if ( length $$shorter_ref == 1 824f3efcd01Safresh1 && exists $folds{ord $$shorter_ref}{1}) 825f3efcd01Safresh1 { 826f3efcd01Safresh1 my @folded_to = @{$folds{ord $$shorter_ref}{1}}; 827f3efcd01Safresh1 next if @folded_to == 1 828f3efcd01Safresh1 && index($$longer_ref, chr $folded_to[0]) >= 0; 829f3efcd01Safresh1 } 830f3efcd01Safresh1 } 831f3efcd01Safresh1 } 832f3efcd01Safresh1 833f3efcd01Safresh1 my $should_fail = (! $uni_semantics && $ord < 256 && ! $is_self && utf8::native_to_unicode($ord) >= 128) 834f3efcd01Safresh1 || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii) 835f3efcd01Safresh1 || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1) 836f3efcd01Safresh1 || $tests_ref == \%neg_tests; 837f3efcd01Safresh1 838f3efcd01Safresh1 # Do simple tests of referencing capture buffers, named and 839f3efcd01Safresh1 # numbered. 840f3efcd01Safresh1 my $op = '=~'; 841f3efcd01Safresh1 $op = '!~' if $should_fail; 842f3efcd01Safresh1 843f3efcd01Safresh1 my $todo = 0; # No longer any todo's 844f2a19305Safresh1 my $eval = "my \$c = \"$lhs$rhs\"; my \$rhs = \"$rhs\"; " 845f2a19305Safresh1 . $upgrade_pattern 846f2a19305Safresh1 . " my \$p = qr/(?$charset_mod:^(\$rhs)\\1\$)/i;" 847f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 848f3efcd01Safresh1 run_test($eval, $todo, ($charset_mod eq 'l'), ""); 849f3efcd01Safresh1 850f2a19305Safresh1 $eval = "my \$c = \"$lhs$rhs\"; my \$rhs = \"$rhs\"; " 851f2a19305Safresh1 . $upgrade_pattern 852f2a19305Safresh1 . " my \$p = qr/(?$charset_mod:^(?<grind>\$rhs)\\k<grind>\$)/i;" 853f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 854f3efcd01Safresh1 run_test($eval, $todo, ($charset_mod eq 'l'), ""); 855f3efcd01Safresh1 856f3efcd01Safresh1 if ($lhs ne $rhs) { 857f2a19305Safresh1 $eval = "my \$c = \"$rhs$lhs\"; my \$rhs = \"$rhs\"; " 858f2a19305Safresh1 . $upgrade_pattern 859f2a19305Safresh1 . " my \$p = qr/(?$charset_mod:^(\$rhs)\\1\$)/i;" 860f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 861f3efcd01Safresh1 run_test($eval, "", ($charset_mod eq 'l'), ""); 862f3efcd01Safresh1 863f2a19305Safresh1 $eval = "my \$c = \"$rhs$lhs\"; my \$rhs = \"$rhs\"; " 864f2a19305Safresh1 . $upgrade_pattern 865f2a19305Safresh1 . " my \$p = qr/(?$charset_mod:^(?<grind>\$rhs)\\k<grind>\$)/i;" 866f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 867f3efcd01Safresh1 run_test($eval, "", ($charset_mod eq 'l'), ""); 868f3efcd01Safresh1 } 869f3efcd01Safresh1 870f3efcd01Safresh1 # See if works on what could be a simple trie. 871f3efcd01Safresh1 my $alternate; 872f3efcd01Safresh1 { 873f3efcd01Safresh1 # Keep the alternate | branch the same length as the tested one so 874f3efcd01Safresh1 # that it's length doesn't influence things 875f3efcd01Safresh1 my $evaled = eval "\"$rhs\""; # Convert e.g. \x{foo} into its 876f3efcd01Safresh1 # chr equivalent 877f3efcd01Safresh1 use bytes; 878f3efcd01Safresh1 $alternate = 'q' x length $evaled; 879f3efcd01Safresh1 } 880f2a19305Safresh1 $eval = "my \$c = \"$lhs\"; my \$rhs = \"$rhs\"; " 881f2a19305Safresh1 . $upgrade_pattern 882f2a19305Safresh1 . " my \$p = qr/\$rhs|$alternate/i$charset_mod;" 883f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 884f3efcd01Safresh1 run_test($eval, "", ($charset_mod eq 'l'), ""); 885f3efcd01Safresh1 886f3efcd01Safresh1 # Check that works when the folded character follows something that 887f3efcd01Safresh1 # is quantified. This test knows the regex code internals to the 888f3efcd01Safresh1 # extent that it knows this is a potential problem, and that there 889f3efcd01Safresh1 # are three different types of quantifiers generated: 1) The thing 890f3efcd01Safresh1 # being quantified matches a single character; 2) it matches more 891f3efcd01Safresh1 # than one character, but is fixed width; 3) it can match a variable 892f3efcd01Safresh1 # number of characters. (It doesn't know that case 3 shouldn't 893f3efcd01Safresh1 # matter, since it doesn't do anything special for the character 894f3efcd01Safresh1 # following the quantifier; nor that some of the different 895f3efcd01Safresh1 # quantifiers execute the same underlying code, as these tests are 896f3efcd01Safresh1 # quick, and this insulates these tests from changes in the 897f3efcd01Safresh1 # implementation.) 898f3efcd01Safresh1 for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') { 899f2a19305Safresh1 $eval = "my \$c = \"_$lhs\"; my \$rhs = \"$rhs\"; $upgrade_pattern " 900f2a19305Safresh1 . "my \$p = qr/(?$charset_mod:.$quantifier\$rhs)/i;" 901f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 902f3efcd01Safresh1 run_test($eval, "", ($charset_mod eq 'l'), ""); 903f2a19305Safresh1 $eval = "my \$c = \"__$lhs\"; my \$rhs = \"$rhs\"; $upgrade_pattern " 904f2a19305Safresh1 . "my \$p = qr/(?$charset_mod:(?:..)$quantifier\$rhs)/i;" 905f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 906f3efcd01Safresh1 run_test($eval, "", ($charset_mod eq 'l'), ""); 907f2a19305Safresh1 $eval = "my \$c = \"__$lhs\"; my \$rhs = \"$rhs\"; $upgrade_pattern " 908f2a19305Safresh1 . "my \$p = qr/(?$charset_mod:(?:.|\\R)$quantifier\$rhs)/i;" 909f2a19305Safresh1 . "$upgrade_target \$c $op \$p"; 910f3efcd01Safresh1 run_test($eval, "", ($charset_mod eq 'l'), ""); 911f3efcd01Safresh1 } 912f3efcd01Safresh1 913f3efcd01Safresh1 foreach my $bracketed (0, 1) { # Put rhs in [...], or not 914f3efcd01Safresh1 next if $bracketed && @pattern != 1; # bracketed makes these 915f3efcd01Safresh1 # or's instead of a sequence 916f3efcd01Safresh1 foreach my $optimize_bracketed (0, 1) { 917f3efcd01Safresh1 next if $optimize_bracketed && ! $bracketed; 918f3efcd01Safresh1 foreach my $inverted (0,1) { 919f3efcd01Safresh1 next if $inverted && ! $bracketed; # inversion only valid 920f3efcd01Safresh1 # in [^...] 921f3efcd01Safresh1 next if $inverted && @target != 1; # [perl #89750] multi-char 922f3efcd01Safresh1 # not valid in [^...] 923f3efcd01Safresh1 924f3efcd01Safresh1 # In some cases, add an extra character that doesn't fold, and 925f3efcd01Safresh1 # looks ok in the output. 926f3efcd01Safresh1 my $extra_char = "_"; 927f3efcd01Safresh1 foreach my $prepend ("", $extra_char) { 928f3efcd01Safresh1 foreach my $append ("", $extra_char) { 929f3efcd01Safresh1 930f3efcd01Safresh1 # Assemble the rhs. Put each character in a separate 931f3efcd01Safresh1 # bracketed if using charclasses. This creates a stress on 932f3efcd01Safresh1 # the code to span a match across multiple elements 933f3efcd01Safresh1 my $rhs = ""; 934f3efcd01Safresh1 foreach my $rhs_char (@rhs) { 935f3efcd01Safresh1 $rhs .= '[' if $bracketed; 936f3efcd01Safresh1 $rhs .= '^' if $inverted; 937f3efcd01Safresh1 $rhs .= $rhs_char; 938f3efcd01Safresh1 939f3efcd01Safresh1 # Add a character to the class, so class doesn't get 940f3efcd01Safresh1 # optimized out, unless we are testing that optimization 941f3efcd01Safresh1 $rhs .= '_' if $optimize_bracketed; 942f3efcd01Safresh1 $rhs .= ']' if $bracketed; 943f3efcd01Safresh1 } 944f3efcd01Safresh1 945f3efcd01Safresh1 # Add one of: no capturing parens 946f3efcd01Safresh1 # a single set 947f3efcd01Safresh1 # a nested set 948f3efcd01Safresh1 # Use quantifiers and extra variable width matches inside 949f3efcd01Safresh1 # them to keep some optimizations from happening 950f3efcd01Safresh1 foreach my $parend (0, 1, 2) { 951f3efcd01Safresh1 my $interior = (! $parend) 952f3efcd01Safresh1 ? $rhs 953f3efcd01Safresh1 : ($parend == 1) 954f3efcd01Safresh1 ? "(${rhs},?)" 955f3efcd01Safresh1 : "((${rhs})+,?)"; 956f3efcd01Safresh1 foreach my $quantifier ("", '?', '*', '+', '{1,3}') { 957f3efcd01Safresh1 958f3efcd01Safresh1 # Perhaps should be TODOs, as are unimplemented, but 959f3efcd01Safresh1 # maybe will never be implemented 960f3efcd01Safresh1 next if @pattern != 1 && $quantifier; 961f3efcd01Safresh1 962f3efcd01Safresh1 # A ? or * quantifier normally causes the thing to be 963f3efcd01Safresh1 # able to match a null string 964f3efcd01Safresh1 my $quantifier_can_match_null = $quantifier eq '?' 965f3efcd01Safresh1 || $quantifier eq '*'; 966f3efcd01Safresh1 967f3efcd01Safresh1 # But since we only quantify the last character in a 968f3efcd01Safresh1 # multiple fold, the other characters will have width, 969f3efcd01Safresh1 # except if we are quantifying the whole rhs 970f3efcd01Safresh1 my $can_match_null = $quantifier_can_match_null 971f3efcd01Safresh1 && (@rhs == 1 || $parend); 972f3efcd01Safresh1 973f3efcd01Safresh1 foreach my $l_anchor ("", '^') { # '\A' didn't change 974f3efcd01Safresh1 # result) 975f3efcd01Safresh1 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't 976f3efcd01Safresh1 # change result) 977f3efcd01Safresh1 # The folded part can match the null string if it 978f3efcd01Safresh1 # isn't required to have width, and there's not 979f3efcd01Safresh1 # something on one or both sides that force it to. 980f3efcd01Safresh1 my $both_sides = ($l_anchor && $r_anchor) 981f3efcd01Safresh1 || ($l_anchor && $append) 982f3efcd01Safresh1 || ($r_anchor && $prepend) 983f3efcd01Safresh1 || ($prepend && $append); 984f3efcd01Safresh1 my $must_match = ! $can_match_null || $both_sides; 985f3efcd01Safresh1 # for performance, but doing this missed many failures 986f3efcd01Safresh1 #next unless $must_match; 987f2a19305Safresh1 my $quantified = "(?$charset_mod:$l_anchor$prepend" 988f2a19305Safresh1 . "$interior${quantifier}$append$r_anchor)"; 989f3efcd01Safresh1 my $op; 990f3efcd01Safresh1 if ($must_match && $should_fail) { 991f3efcd01Safresh1 $op = 0; 992f3efcd01Safresh1 } else { 993f3efcd01Safresh1 $op = 1; 994f3efcd01Safresh1 } 995f3efcd01Safresh1 $op = ! $op if $must_match && $inverted; 996f3efcd01Safresh1 997f3efcd01Safresh1 if ($inverted && @target > 1) { 998f3efcd01Safresh1 # When doing an inverted match against a 999f3efcd01Safresh1 # multi-char target, and there is not something on 1000f3efcd01Safresh1 # the left to anchor the match, if it shouldn't 1001f3efcd01Safresh1 # succeed, skip, as what will happen (when working 1002f3efcd01Safresh1 # correctly) is that it will match the first 1003f3efcd01Safresh1 # position correctly, and then be inverted to not 1004f3efcd01Safresh1 # match; then it will go to the second position 1005f3efcd01Safresh1 # where it won't match, but get inverted to match, 1006f3efcd01Safresh1 # and hence succeeding. 1007f3efcd01Safresh1 next if ! ($l_anchor || $prepend) && ! $op; 1008f3efcd01Safresh1 1009f3efcd01Safresh1 # Can't ever match for latin1 code points non-uni 1010f3efcd01Safresh1 # semantics that have a inverted multi-char fold 1011f3efcd01Safresh1 # when there is something on both sides and the 1012f3efcd01Safresh1 # quantifier isn't such as to span the required 1013f3efcd01Safresh1 # width, which is 2 or 3. 1014f3efcd01Safresh1 $op = 0 if $ord < 255 1015f3efcd01Safresh1 && ! $uni_semantics 1016f3efcd01Safresh1 && $both_sides 1017f3efcd01Safresh1 && ( ! $quantifier || $quantifier eq '?') 1018f3efcd01Safresh1 && $parend < 2; 1019f3efcd01Safresh1 1020f3efcd01Safresh1 # Similarly can't ever match when inverting a 1021f3efcd01Safresh1 # multi-char fold for /aa and the quantifier 1022f3efcd01Safresh1 # isn't sufficient to allow it to span to both 1023f3efcd01Safresh1 # sides. 1024f3efcd01Safresh1 $op = 0 if $target_has_ascii 1025f3efcd01Safresh1 && $charset eq 'aa' 1026f3efcd01Safresh1 && $both_sides 1027f3efcd01Safresh1 && ( ! $quantifier || $quantifier eq '?') 1028f3efcd01Safresh1 && $parend < 2; 1029f3efcd01Safresh1 1030f3efcd01Safresh1 # Or for /l 1031f3efcd01Safresh1 $op = 0 if $target_has_latin1 && $charset eq 'l' 1032f3efcd01Safresh1 && $both_sides 1033f3efcd01Safresh1 && ( ! $quantifier || $quantifier eq '?') 1034f3efcd01Safresh1 && $parend < 2; 1035f3efcd01Safresh1 } 1036f3efcd01Safresh1 1037f3efcd01Safresh1 1038f3efcd01Safresh1 my $desc = ""; 1039f3efcd01Safresh1 if ($charset_mod eq 'l') { 1040f3efcd01Safresh1 $desc .= 'setlocale(LC_CTYPE, "' 1041f3efcd01Safresh1 . POSIX::setlocale(&POSIX::LC_CTYPE) 1042f3efcd01Safresh1 . '"); ' 1043f3efcd01Safresh1 } 1044f3efcd01Safresh1 $desc .= "my \$c = \"$prepend$lhs$append\"; " 1045f2a19305Safresh1 . "my \$rhs = \"\"; $upgrade_pattern" 1046f2a19305Safresh1 . "my \$p = qr/$quantified\$rhs/i;" 1047f2a19305Safresh1 . "$upgrade_target " 1048f3efcd01Safresh1 . "\$c " . ($op ? "=~" : "!~") . " \$p; "; 1049f3efcd01Safresh1 if ($DEBUG) { 1050f3efcd01Safresh1 $desc .= ( 1051f3efcd01Safresh1 "; uni_semantics=$uni_semantics, " 1052f3efcd01Safresh1 . "should_fail=$should_fail, " 1053f3efcd01Safresh1 . "bracketed=$bracketed, " 1054f3efcd01Safresh1 . "prepend=$prepend, " 1055f3efcd01Safresh1 . "append=$append, " 1056f3efcd01Safresh1 . "parend=$parend, " 1057f3efcd01Safresh1 . "quantifier=$quantifier, " 1058f3efcd01Safresh1 . "l_anchor=$l_anchor, " 1059f3efcd01Safresh1 . "r_anchor=$r_anchor; " 1060f3efcd01Safresh1 . "pattern_above_latin1=$pattern_above_latin1; " 1061f3efcd01Safresh1 . "utf8_pattern=$utf8_pattern" 1062f3efcd01Safresh1 ); 1063f3efcd01Safresh1 } 1064f3efcd01Safresh1 1065f3efcd01Safresh1 my $c = "$prepend$lhs_str$append"; 1066f2a19305Safresh1 my $p = "$quantified"; # string copy deliberate 1067f3efcd01Safresh1 utf8::upgrade($c) if length($upgrade_target); 1068f3efcd01Safresh1 utf8::upgrade($p) if length($upgrade_pattern); 1069f2a19305Safresh1 $p = qr/$p/i; 1070f3efcd01Safresh1 my $res = $op ? ($c =~ $p): ($c !~ $p); 1071f3efcd01Safresh1 1072f3efcd01Safresh1 if (!$res || $list_all_tests) { 1073f3efcd01Safresh1 # Failed or debug; output the result 1074f3efcd01Safresh1 $count++; 1075f3efcd01Safresh1 ok($res, "test $count - $desc"); 1076f3efcd01Safresh1 } else { 1077f3efcd01Safresh1 # Just count the test as passed 1078f3efcd01Safresh1 $okays++; 1079f3efcd01Safresh1 } 1080f3efcd01Safresh1 $this_iteration++; 1081f3efcd01Safresh1 } 1082f3efcd01Safresh1 } 1083f3efcd01Safresh1 } 1084f3efcd01Safresh1 } 1085f3efcd01Safresh1 } 1086f3efcd01Safresh1 } 1087f3efcd01Safresh1 } 1088f3efcd01Safresh1 } 1089f3efcd01Safresh1 } 1090f3efcd01Safresh1 } 1091f3efcd01Safresh1 } 1092f3efcd01Safresh1 unless($list_all_tests) { 1093f3efcd01Safresh1 $count++; 1094f3efcd01Safresh1 is $okays, $this_iteration, "$okays subtests ok for" 1095f3efcd01Safresh1 . " /$charset_mod" 1096f3efcd01Safresh1 . (($charset_mod eq 'l') ? " ($current_locale)" : "") 1097f3efcd01Safresh1 . ', target="' . join("", @x_target) . '",' 1098f3efcd01Safresh1 . ' pat="' . join("", @x_pattern) . '"'; 1099f3efcd01Safresh1 } 1100f3efcd01Safresh1 } 1101f3efcd01Safresh1 } 1102f3efcd01Safresh1} 1103f3efcd01Safresh1} 1104f3efcd01Safresh1 1105f3efcd01Safresh1plan($count); 1106f3efcd01Safresh1 1107f3efcd01Safresh11 1108