xref: /openbsd-src/gnu/usr.bin/perl/t/re/fold_grind.pl (revision 56d68f1e19ff848c889ecfa71d3a06340ff64892)
1# Grind out a lot of combinatoric tests for folding.
2# It uses various charset modifiers, passed in via $::TEST_CHUNK.  The caller
3# will also have set the locale to use if /l is the modifier.
4#   L is a pseudo-modifier that indicates to use the modifier /l instead, and
5#     the locale set by the caller is known to be UTF-8,
6#   T is a pseudo-modifier that indicates to use the pseudo modifier /L
7#     instead, and the locale set by the caller is known to be Turkic UTF-8,
8
9binmode STDOUT, ":utf8";
10
11BEGIN {
12    chdir 't' if -d 't';
13    require './test.pl';
14    set_up_inc('../lib');
15    require Config; import Config;
16    skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
17    if ($^O eq 'dec_osf') {
18      skip_all("$^O cannot handle this test");
19    }
20    my $time_out_factor = $ENV{PERL_TEST_TIME_OUT_FACTOR} || 1;
21    $time_out_factor = 1 if $time_out_factor < 1;
22
23    watchdog(5 * 60 * $time_out_factor);
24    require './loc_tools.pl';
25}
26
27use charnames ":full";
28
29my $DEBUG = 0;  # Outputs extra information for debugging this .t
30
31use strict;
32use warnings;
33no warnings 'locale';   # Plenty of these would otherwise get generated
34use Encode;
35use POSIX;
36
37my $charset = $::TEST_CHUNK;
38my $use_turkic_rules = 0;
39
40if ($charset eq 'T') {
41    $charset = 'L';
42    $use_turkic_rules = 1;
43}
44
45# Special-cased characters in the .c's that we want to make sure get tested.
46my %be_sure_to_test = (
47        chr utf8::unicode_to_native(0xDF) => 1, # LATIN_SMALL_LETTER_SHARP_S
48        "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S
49        "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
50        "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
51        "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
52        "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
53        "I" => 1,
54);
55
56# Tests both unicode and not, so make sure not implicitly testing unicode
57no feature 'unicode_strings';
58
59# Case-insensitive matching is a large and complicated issue.  Perl does not
60# implement it fully, properly.  For example, it doesn't include normalization
61# as part of the equation.  To test every conceivable combination is clearly
62# impossible; these tests are mostly drawn from visual inspection of the code
63# and experience, trying to exercise all areas.
64
65# There are three basic ranges of characters that Perl may treat differently:
66# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
67#    referred to here as ASCII.  On EBCDIC machines, the non-ASCII invariants
68#    are all controls that fold to themselves.
69my $ASCII = 1;
70
71# 2) Other characters that fit into a byte but are different in utf8 than not;
72#    here referred to, taking some liberties, as Latin1.
73my $Latin1 = 2;
74
75# 3) Characters that won't fit in a byte; here referred to as Unicode
76my $Unicode = 3;
77
78# Within these basic groups are equivalence classes that testing any character
79# in is likely to lead to the same results as any other character.  This is
80# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
81# set.
82my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
83
84# Additionally parts of this test run a lot of subtests, outputting the
85# resulting TAP can be expensive so the tests are summarised internally. The
86# PERL_DEBUG_FULL_TEST environment variable can be set to produce the full
87# output for debugging purposes.
88
89sub range_type {
90    my $ord = ord shift;
91
92    return $ASCII if utf8::native_to_unicode($ord) < 128;
93    return $Latin1 if $ord < 256;
94    return $Unicode;
95}
96
97sub numerically {
98    return $a <=> $b
99}
100
101my $list_all_tests = $ENV{PERL_DEBUG_FULL_TEST} || $DEBUG;
102$| = 1 if $list_all_tests;
103
104# Significant time is saved by not outputting each test but grouping the
105# output into subtests
106my $okays;          # Number of ok's in current subtest
107my $this_iteration; # Number of possible tests in current subtest
108my $count = 0;      # Number of subtests = number of total tests
109
110sub run_test($$$$) {
111    my ($test, $todo, $do_we_output_locale_name, $debug) = @_;
112
113    $debug = "" unless $DEBUG;
114    my $res = eval $test;
115
116    if ($do_we_output_locale_name) {
117        $do_we_output_locale_name = 'setlocale(LC_CTYPE, "'
118                         .  POSIX::setlocale(&POSIX::LC_CTYPE)
119                         . '"); ';
120    }
121    if (!$res || $list_all_tests) {
122      # Failed or debug; output the result
123      $count++;
124      ok($res, "$do_we_output_locale_name$test; $debug");
125    } else {
126      # Just count the test as passed
127      $okays++;
128    }
129    $this_iteration++;
130}
131
132my %has_test_by_participants;   # Makes sure has tests for each range and each
133                                # number of characters that fold to the same
134                                # thing
135my %has_test_by_byte_count; # Makes sure has tests for each combination of
136                            # n bytes folds to m bytes
137
138my %tests; # The set of tests we expect to pass.
139# Each key is a code point that folds to something else.
140# Each value is a list of things that the key folds to.  If the 'thing' is a
141# single code point, it is that ordinal.  If it is a multi-char fold, it is an
142# ordered list of the code points in that fold.  Here's an example for 'S':
143#  '83' => [ 115, 383 ]
144#
145# And one for a multi-char fold: \xDF
146#  223 => [
147#            [  # 'ss'
148#                83,
149#                83
150#            ],
151#            [  # 'SS'
152#                115,
153#                115
154#            ],
155#            [  # LATIN SMALL LETTER LONG S
156#                383,
157#                383
158#            ],
159#          7838 # LATIN_CAPITAL_LETTER_SHARP_S
160#        ],
161
162my %neg_tests;  # Same format, but we expect these tests to fail
163
164my %folds; # keys are code points that fold; values are either 0 or 1 which
165           # in turn are keys with their values each a list of code points the
166           # code point key folds to.  The folds under 1 are the ones that are
167           # valid in this run; the ones under 0 are ones valid under other
168           # circumstances.
169
170my %inverse_folds;  # keys are strings of the folded-to; then come a layer of
171                    # 0 or 1, like %folds.  The lowest values are lists of
172                    # characters that fold to them
173
174# Here's a portion of an %inverse_folds in a run where Turkic folds are not
175# legal, so \x{130} doesn't fold to 'i' in this run.
176#         'h' => {
177#                  '1' => [
178#                           'H'
179#                         ]
180#                },
181#         "h\x{331}" => {
182#                         '1' => [
183#                                  "\x{1e96}"
184#                                ]
185#                       },
186#         'i' => {
187#                  '0' => [
188#                           "\x{130}"
189#                         ],
190#                  '1' => [
191#                           'I'
192#                         ]
193#                },
194#         "i\x{307}" => {
195#                         '1' => [
196#                                  "\x{130}"
197#                                ]
198#                       },
199#         'j' => {
200#                  '1' => [
201#                           'J'
202#                         ]
203#                },
204
205sub add_test($$@) {
206    my ($tests_ref, $to, @from) = @_;
207
208    # Called to cause the input to be tested by adding to $%tests_ref.  @from
209    # is the list of characters that fold to the string $to.  @from should be
210    # sorted so the lowest code point is first....
211    # The input is in string form; %tests uses code points, so have to
212    # convert.
213
214    my $to_chars = length $to;
215    my @test_to;        # List of tests for $to
216
217    if ($to_chars == 1) {
218        @test_to = ord $to;
219    }
220    else {
221        push @test_to, [ map { ord $_ } split "", $to ];
222
223        # For multi-char folds, we also test that things that can fold to each
224        # individual character in the fold also work.  If we were testing
225        # comprehensively, we would try every combination of upper and lower
226        # case in the fold, but it will have to suffice to avoid running
227        # forever to make sure that each thing that folds to these is tested
228        # at least once.  Because of complement matching ([^...]), we need to
229        # do both the folded, and the folded-from.
230        # We first look at each character in the multi-char fold, and save how
231        # many characters fold to it; and also the maximum number of such
232        # folds
233        my @folds_to_count;     # 0th char in fold is index 0 ...
234        my $max_folds_to = 0;
235
236        for (my $i = 0; $i < $to_chars; $i++) {
237            my $to_char = substr($to, $i, 1);
238            if (exists $inverse_folds{$to_char}{1}) {
239                $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}{1}};
240                $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i];
241            }
242            else {
243                $folds_to_count[$i] = 0;
244            }
245        }
246
247        # We will need to generate as many tests as the maximum number of
248        # folds, so that each fold will have at least one test.
249        # For example, consider character X which folds to the three character
250        # string 'xyz'.  If 2 things fold to x (X and x), 4 to y (Y, Y'
251        # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4
252        # tests will be generated:
253        #   xyz
254        #   XYz
255        #   xY'z
256        #   xY''z
257        for (my $i = 0; $i < $max_folds_to; $i++) {
258            my @this_test_to;   # Assemble a single test
259
260            # For each character in the multi-char fold ...
261            for (my $j = 0; $j < $to_chars; $j++) {
262                my $this_char = substr($to, $j, 1);
263
264                # Use its corresponding inverse fold, if available.
265                if (   $i < $folds_to_count[$j]
266                    && exists $inverse_folds{$this_char}{1})
267                  {
268                    push @this_test_to, ord $inverse_folds{$this_char}{1}[$i];
269                }
270                else {  # Or else itself.
271                    push @this_test_to, ord $this_char;
272                }
273            }
274
275            # Add this test to the list
276            push @test_to, [ @this_test_to ];
277        }
278
279        # Here, have assembled all the tests for the multi-char fold.  Sort so
280        # lowest code points are first for consistency and aesthetics in
281        # output.  We know there are at least two characters in the fold, but
282        # I haven't bothered to worry about sorting on an optional third
283        # character if the first two are identical.
284        @test_to = sort { ($a->[0] == $b->[0])
285                           ? $a->[1] <=> $b->[1]
286                           : $a->[0] <=> $b->[0]
287                        } @test_to;
288    }
289
290
291    # This test is from n bytes to m bytes.  Record that so won't try to add
292    # another test that does the same.
293    use bytes;
294    my $to_bytes = length $to;
295    foreach my $from_map (@from) {
296        $has_test_by_byte_count{length $from_map}{$to_bytes} = $to;
297    }
298    no bytes;
299
300    my $ord_smallest_from = ord shift @from;
301    if (exists $tests_ref->{$ord_smallest_from}) {
302        die "There are already tests for $ord_smallest_from"
303    };
304
305    # Add in the fold tests,
306    push @{$tests_ref->{$ord_smallest_from}}, @test_to;
307
308    # Then any remaining froms in the equivalence class.
309    push @{$tests_ref->{$ord_smallest_from}}, map { ord $_ } @from;
310}
311
312# Get the Unicode rules and construct inverse mappings from them
313
314use Unicode::UCD;
315my $file="../lib/unicore/CaseFolding.txt";
316
317# Use the Unicode data file if we are on an ASCII platform (which its data is
318# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
319# available.  This avoids being affected by potential bugs introduced by other
320# layers of Perl
321if ($::IS_ASCII
322    && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
323    && open my $fh, "<", $file)
324{
325    # We process the file in reverse order because its easier to see the T
326    # entry first and then know that the next line we process is the
327    # corresponding one for non-T.
328    my @rules = <$fh>;
329    my $prev_was_turkic = 0;
330    while (defined ($_ = pop @rules)) {
331        chomp;
332
333        # Lines look like (though without the initial '#')
334        #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
335
336        # Get rid of comments, ignore blank or comment-only lines
337        my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
338        next unless length $line;
339        my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line;
340
341        next if $fold_type eq 'S';  # If Unicode's tables are correct, the F
342                                    # should be a superset of S
343        next if $fold_type eq 'I';  # Perl doesn't do old Turkish folding
344
345        my $test_type;
346        if ($fold_type eq 'T') {
347            $test_type = 0 + $use_turkic_rules;
348            $prev_was_turkic = 1;
349        }
350        elsif ($prev_was_turkic) {
351            $test_type = 0 + ! $use_turkic_rules;
352            $prev_was_turkic = 0;
353        }
354        else {
355            $test_type = 1;
356            $prev_was_turkic = 0;
357        }
358
359        my $from = hex $hex_from;
360        my @to = map { hex $_ } @hex_folded;
361        push @{$folds{$from}{$test_type}}, @to;
362
363        my $folded_str = pack ("U0U*", @to);
364        push @{$inverse_folds{$folded_str}{$test_type}}, chr $from;
365    }
366}
367else {  # Here, can't use the .txt file: read the Unicode rules file and
368        # construct inverse mappings from it
369
370    skip_all "Don't know how to generate turkic rules on this platform"
371                                                            if $use_turkic_rules;
372    my ($invlist_ref, $invmap_ref, undef, $default)
373                                    = Unicode::UCD::prop_invmap('Case_Folding');
374    for my $i (0 .. @$invlist_ref - 1 - 1) {
375        next if $invmap_ref->[$i] == $default;
376
377        # Make into an array if not so already, so can treat uniformly below
378        $invmap_ref->[$i] = [ $invmap_ref->[$i] ] if ! ref $invmap_ref->[$i];
379
380        # Each subsequent element of the range requires adjustment of +1 from
381        # the previous element
382        my $adjust = -1;
383        for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
384            $adjust++;
385            my @to = map { $_ + $adjust } @{$invmap_ref->[$i]};
386            push @{$folds{$j}{1}}, @to;
387            my $folded_str = join "", map { chr } @to;
388            utf8::upgrade($folded_str);
389            #note (sprintf "%d: %04X: %s", __LINE__, $j, join " ",
390            #    map { sprintf "%04X", $_  + $adjust } @{$invmap_ref->[$i]});
391            push @{$inverse_folds{$folded_str}{1}}, chr $j;
392        }
393    }
394}
395
396# Analyze the data and generate tests to get adequate test coverage.  We sort
397# things so that smallest code points are done first.
398foreach my $to (sort { $a cmp $b } keys %inverse_folds)
399{
400TO:
401  foreach my $tests_ref (\%tests, \%neg_tests) {
402    my $test_type = ($tests_ref == \%tests) ? 1 : 0;
403
404    next unless exists $inverse_folds{$to}{$test_type};
405
406    # Within each fold, sort so that the smallest code points are done first
407    @{$inverse_folds{$to}{$test_type}} = sort { $a cmp $b } @{$inverse_folds{$to}{$test_type}};
408    my @from = @{$inverse_folds{$to}{$test_type}};
409
410    # Just add it to the tests if doing complete coverage
411    if (! $skip_apparently_redundant) {
412        add_test($tests_ref, $to, @from);
413        next TO;
414    }
415
416    my $to_chars = length $to;
417    my $to_range_type = range_type(substr($to, 0, 1));
418
419    # If this is required to be tested, do so.  We check for these first, as
420    # they will take up slots of byte-to-byte combinations that we otherwise
421    # would have to have other tests to get.
422    foreach my $from_map (@from) {
423        if (exists $be_sure_to_test{$from_map}) {
424            add_test($tests_ref, $to, @from);
425            next TO;
426        }
427    }
428
429    # If the fold contains heterogeneous range types, is suspect and should be
430    # tested.
431    if ($to_chars > 1) {
432        foreach my $char (split "", $to) {
433            if (range_type($char) != $to_range_type) {
434                add_test($tests_ref, $to, @from);
435                next TO;
436            }
437        }
438    }
439
440    # If the mapping crosses range types, is suspect and should be tested
441    foreach my $from_map (@from) {
442        if (range_type($from_map) != $to_range_type) {
443            add_test($tests_ref, $to, @from);
444            next TO;
445        }
446    }
447
448    # Here, all components of the mapping are in the same range type.  For
449    # single character folds, we test one case in each range type that has 2
450    # particpants, 3 particpants, etc.
451    if ($to_chars == 1) {
452        if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) {
453            add_test($tests_ref, $to, @from);
454            $has_test_by_participants{scalar @from}{$to_range_type} = $to;
455            next TO;
456        }
457    }
458
459    # We also test all combinations of mappings from m to n bytes.  This is
460    # because the regex optimizer cares.  (Don't bother worrying about that
461    # Latin1 chars will occupy a different number of bytes under utf8, as
462    # there are plenty of other cases that catch these byte numbers.)
463    use bytes;
464    my $to_bytes = length $to;
465    foreach my $from_map (@from) {
466        if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) {
467            add_test($tests_ref, $to, @from);
468            next TO;
469        }
470    }
471  }
472}
473
474# For each range type, test additionally a character that folds to itself
475add_test(\%tests, ":", ":");
476add_test(\%tests, chr utf8::unicode_to_native(0xF7), chr utf8::unicode_to_native(0xF7));
477add_test(\%tests, chr 0x2C7, chr 0x2C7);
478
479# To cut down on the number of tests
480my $has_tested_aa_above_latin1;
481my $has_tested_latin1_aa;
482my $has_tested_ascii_aa;
483my $has_tested_l_above_latin1;
484my $has_tested_above_latin1_l;
485my $has_tested_ascii_l;
486my $has_tested_above_latin1_d;
487my $has_tested_ascii_d;
488my $has_tested_non_latin1_d;
489my $has_tested_above_latin1_a;
490my $has_tested_ascii_a;
491my $has_tested_non_latin1_a;
492
493# For use by pairs() in generating combinations
494sub prefix {
495    my $p = shift;
496    map [ $p, $_ ], @_
497}
498
499# Returns all ordered combinations of pairs of elements from the input array.
500# It doesn't return pairs like (a, a), (b, b).  Change the slice to an array
501# to do that.  This was just to have fewer tests.
502sub pairs (@) {
503    #print STDERR __LINE__, ": ", join(" XXX ", map { sprintf "%04X", $_ } @_), "\n";
504    map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
505}
506
507# Finally ready to do the tests
508foreach my $tests_ref (\%neg_tests, \%tests) {
509foreach my $test (sort { numerically } keys %{$tests_ref}) {
510
511  my $previous_target;
512  my $previous_pattern;
513  my @pairs = pairs(sort numerically $test, @{$tests_ref->{$test}});
514
515  # Each fold can be viewed as a closure of all the characters that
516  # participate in it.  Look at each possible pairing from a closure, with the
517  # first member of the pair the target string to match against, and the
518  # second member forming the pattern.  Thus each fold member gets tested as
519  # the string, and the pattern with every other member in the opposite role.
520  while (my $pair = shift @pairs) {
521    my ($target, $pattern) = @$pair;
522
523    # When testing a char that doesn't fold, we can get the same
524    # permutation twice; so skip all but the first.
525    next if $previous_target
526            && $previous_target == $target
527            && $previous_pattern == $pattern;
528    ($previous_target, $previous_pattern) = ($target, $pattern);
529
530    # Each side may be either a single char or a string.  Extract each into an
531    # array (perhaps of length 1)
532    my @target, my @pattern;
533    @target = (ref $target) ? @$target : $target;
534    @pattern = (ref $pattern) ? @$pattern : $pattern;
535
536    # We are testing just folds to/from a single character.  If our pairs
537    # happens to generate multi/multi, skip.
538    next if @target > 1 && @pattern > 1;
539
540    # Get in hex form.
541    my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
542    my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
543
544    my $target_above_latin1 = grep { $_ > 255 } @target;
545    my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
546    my $target_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @target;
547    my $pattern_has_ascii = grep { utf8::native_to_unicode($_) < 128 } @pattern;
548    my $target_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @target;
549    my $pattern_only_ascii = ! grep { utf8::native_to_unicode($_) > 127 } @pattern;
550    my $target_has_latin1 = grep { $_ < 256 } @target;
551    my $target_has_upper_latin1
552                = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @target;
553    my $pattern_has_upper_latin1
554                = grep { $_ < 256 && utf8::native_to_unicode($_) > 127 } @pattern;
555    my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
556    my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
557
558    # We don't test multi-char folding into other multi-chars.  We are testing
559    # a code point that folds to or from other characters.  Find the single
560    # code point for diagnostic purposes.  (If both are single, choose the
561    # target string)
562    my $ord = @target == 1 ? $target[0] : $pattern[0];
563    my $progress = sprintf "%04X: \"%s\" and /%s/",
564                            $test,
565                            join("", @x_target),
566                            join("", @x_pattern);
567    #note $progress;
568
569    # Now grind out tests, using various combinations.
570    {
571      my $charset_mod = lc $charset;
572      my $current_locale = setlocale(&POSIX::LC_CTYPE);
573      $current_locale = 'C locale' if $current_locale eq 'C';
574      $okays = 0;
575      $this_iteration = 0;
576
577      # To cut down somewhat on the enormous quantity of tests this currently
578      # runs, skip some for some of the character sets whose results aren't
579      # likely to differ from others.  But run all tests on the code points
580      # that don't fold, plus one other set in each range group.
581      if (! $is_self) {
582
583        # /aa should only affect things with folds in the ASCII range.  But, try
584        # it on one set in the other ranges just to make sure it doesn't break
585        # them.
586        if ($charset eq 'aa') {
587
588          # It may be that this $pair of code points to test are both
589          # non-ascii, but if either of them actually fold to ascii, that is
590          # suspect and should be tested.  So for /aa, use whether their folds
591          # are ascii or not
592          my $target_has_ascii = $target_has_ascii;
593          my $pattern_has_ascii = $pattern_has_ascii;
594          if (! $target_has_ascii) {
595            foreach my $cp (@target) {
596              if (exists $folds{$cp}{1}
597                  && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}{1}} )
598              {
599                  $target_has_ascii = 1;
600                  last;
601              }
602            }
603          }
604          if (! $pattern_has_ascii) {
605            foreach my $cp (@pattern) {
606              if (exists $folds{$cp}{1}
607                  && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}}{1} )
608              {
609                  $pattern_has_ascii = 1;
610                  last;
611              }
612            }
613          }
614
615          if (! $target_has_ascii && ! $pattern_has_ascii) {
616            if ($target_above_latin1 || $pattern_above_latin1) {
617              next if defined $has_tested_aa_above_latin1
618                      && $has_tested_aa_above_latin1 != $test;
619              $has_tested_aa_above_latin1 = $test;
620            }
621            next if defined $has_tested_latin1_aa
622                    && $has_tested_latin1_aa != $test;
623            $has_tested_latin1_aa = $test;
624          }
625          elsif ($target_only_ascii && $pattern_only_ascii) {
626
627              # And, except for one set just to make sure, skip tests
628              # where both elements in the pair are ASCII.  If one works for
629              # aa, the others are likely too.  This skips tests where the
630              # fold is from non-ASCII to ASCII, but this part of the test
631              # is just about the ASCII components.
632              next if defined $has_tested_ascii_l
633                      && $has_tested_ascii_l != $test;
634              $has_tested_ascii_l = $test;
635          }
636        }
637        elsif ($charset eq 'l') {
638
639          # For l, don't need to test beyond one set those things that are
640          # all above latin1, because unlikely to have different successes
641          # than /u.  But, for the same reason as described in the /aa above,
642          # it is suspect and should be tested, if either of the folds are to
643          # latin1.
644          my $target_has_latin1 = $target_has_latin1;
645          my $pattern_has_latin1 = $pattern_has_latin1;
646          if (! $target_has_latin1) {
647            foreach my $cp (@target) {
648              if (exists $folds{$cp}{1}
649                  && grep { $_ < 256 } @{$folds{$cp}{1}} )
650              {
651                $target_has_latin1 = 1;
652                last;
653              }
654            }
655          }
656          if (! $pattern_has_latin1) {
657            foreach my $cp (@pattern) {
658              if (exists $folds{$cp}{1}
659                  && grep { $_ < 256 } @{$folds{$cp}{1}} )
660              {
661                $pattern_has_latin1 = 1;
662                last;
663              }
664            }
665          }
666          if (! $target_has_latin1 && ! $pattern_has_latin1) {
667            next if defined $has_tested_above_latin1_l
668                    && $has_tested_above_latin1_l != $test;
669            $has_tested_above_latin1_l = $test;
670          }
671          elsif ($target_only_ascii && $pattern_only_ascii) {
672
673              # And, except for one set just to make sure, skip tests
674              # where both elements in the pair are ASCII.  This is
675              # essentially the same reasoning as above for /aa.
676              next if defined $has_tested_ascii_l
677                      && $has_tested_ascii_l != $test;
678              $has_tested_ascii_l = $test;
679          }
680        }
681        elsif ($charset eq 'd') {
682          # Similarly for d.  Beyond one test (besides self) each, we  don't
683          # test pairs that are both ascii; or both above latin1, or are
684          # combinations of ascii and above latin1.
685          if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
686            if ($target_has_ascii && $pattern_has_ascii) {
687              next if defined $has_tested_ascii_d
688                      && $has_tested_ascii_d != $test;
689              $has_tested_ascii_d = $test
690            }
691            elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
692              next if defined $has_tested_above_latin1_d
693                      && $has_tested_above_latin1_d != $test;
694              $has_tested_above_latin1_d = $test;
695            }
696            else {
697              next if defined $has_tested_non_latin1_d
698                      && $has_tested_non_latin1_d != $test;
699              $has_tested_non_latin1_d = $test;
700            }
701          }
702        }
703        elsif ($charset eq 'a') {
704          # Similarly for a.  This should match identically to /u, so wasn't
705          # tested at all until a bug was found that was thereby missed.
706          # As a compromise, beyond one test (besides self) each, we  don't
707          # test pairs that are both ascii; or both above latin1, or are
708          # combinations of ascii and above latin1.
709          if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
710            if ($target_has_ascii && $pattern_has_ascii) {
711              next if defined $has_tested_ascii_a
712                      && $has_tested_ascii_a != $test;
713              $has_tested_ascii_a = $test
714            }
715            elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
716              next if defined $has_tested_above_latin1_a
717                      && $has_tested_above_latin1_a != $test;
718              $has_tested_above_latin1_a = $test;
719            }
720            else {
721              next if defined $has_tested_non_latin1_a
722                      && $has_tested_non_latin1_a != $test;
723              $has_tested_non_latin1_a = $test;
724            }
725          }
726        }
727      }
728
729      foreach my $utf8_target (0, 1) {    # Both utf8 and not, for
730                                          # code points < 256
731        my $upgrade_target = "";
732
733        # These must already be in utf8 because the string to match has
734        # something above latin1.  So impossible to test if to not to be in
735        # utf8; and otherwise, no upgrade is needed.
736        next if $target_above_latin1 && ! $utf8_target;
737        $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
738
739        foreach my $utf8_pattern (0, 1) {
740          next if $pattern_above_latin1 && ! $utf8_pattern;
741
742          # Our testing of 'l' uses the POSIX locale, which is ASCII-only
743          my $uni_semantics = $charset ne 'l' && (    $utf8_target
744                                                  ||  $charset eq 'u'
745                                                  ||  $charset eq 'L'
746                                                  || ($charset eq 'd' && $utf8_pattern)
747                                                  ||  $charset =~ /a/);
748          my $upgrade_pattern = "";
749          $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
750
751          my $lhs = join "", @x_target;
752          my $lhs_str = eval qq{"$lhs"}; fail($@) if $@;
753          my @rhs = @x_pattern;
754          my $rhs = join "", @rhs;
755
756          # Unicode created a folding rule that partially emulates what
757          # happens in a Turkish locale, by using combining characters.  The
758          # result is close enough to what really should happen, that it can
759          # pass many of the tests, but not all.  So, if we have a rule that
760          # is expecting failure, it may pass instead.  The code in the block
761          # below is good enough for skipping the tests, and khw tried to make
762          # it general, but should the rules be revised (unlikely at this
763          # point), this might need to be tweaked.
764          if ($tests_ref == \%neg_tests) {
765            my ($shorter_ref, $longer_ref);
766
767            # Convert the $rhs to a string, like we already did for the lhs
768            my $rhs_str = eval qq{"$rhs"}; fail($@) if $@;
769
770            # If the lengths of the two sides are equal, we don't want to do
771            # this; this is only to bypass the combining characters affecting
772            # things
773            if (length $lhs_str != length $rhs_str) {
774
775              # Find the shorter and longer of the pair
776              if (length $lhs_str < length $rhs_str) {
777                  $shorter_ref = \$lhs_str;
778                  $longer_ref = \$rhs_str;
779              }
780              else {
781                  $shorter_ref = \$rhs_str;
782                  $longer_ref = \$lhs_str;
783              }
784
785              # If the shorter string is entirely contained in the longer, we
786              # have generated a test that is likely to succeed, and the
787              # reasons it would fail have nothing to do with folding.  But we
788              # are expecting it to fail, and so our test is invalid.  Skip
789              # it.
790              next if index($$longer_ref, $$shorter_ref) >= 0;
791
792
793              # The above eliminates about half the failure cases.  This gets
794              # the rest.  If the shorter string is a single character and has
795              # a fold legal in this run to a character that is in the longer
796              # string, it is also likely to succeed under /i.  So again our
797              # computed test is bogus.
798              if (   length $$shorter_ref == 1
799                  && exists $folds{ord $$shorter_ref}{1})
800              {
801                my @folded_to = @{$folds{ord $$shorter_ref}{1}};
802                next if   @folded_to == 1
803                       && index($$longer_ref, chr $folded_to[0]) >= 0;
804              }
805            }
806          }
807
808          my $should_fail = (! $uni_semantics && $ord < 256 && ! $is_self && utf8::native_to_unicode($ord) >= 128)
809                            || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
810                            || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1)
811                            || $tests_ref == \%neg_tests;
812
813          # Do simple tests of referencing capture buffers, named and
814          # numbered.
815          my $op = '=~';
816          $op = '!~' if $should_fail;
817
818          my $todo = 0;  # No longer any todo's
819          my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
820          run_test($eval, $todo, ($charset_mod eq 'l'), "");
821
822          $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
823          run_test($eval, $todo, ($charset_mod eq 'l'), "");
824
825          if ($lhs ne $rhs) {
826            $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
827            run_test($eval, "", ($charset_mod eq 'l'), "");
828
829            $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
830            run_test($eval, "", ($charset_mod eq 'l'), "");
831          }
832
833          # See if works on what could be a simple trie.
834          my $alternate;
835          {
836            # Keep the alternate | branch the same length as the tested one so
837            # that it's length doesn't influence things
838            my $evaled = eval "\"$rhs\"";   # Convert e.g. \x{foo} into its
839                                            # chr equivalent
840            use bytes;
841            $alternate = 'q' x length $evaled;
842          }
843          $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|$alternate/i$charset_mod;$upgrade_target$upgrade_pattern \$c $op \$p";
844          run_test($eval, "", ($charset_mod eq 'l'), "");
845
846          # Check that works when the folded character follows something that
847          # is quantified.  This test knows the regex code internals to the
848          # extent that it knows this is a potential problem, and that there
849          # are three different types of quantifiers generated: 1) The thing
850          # being quantified matches a single character; 2) it matches more
851          # than one character, but is fixed width; 3) it can match a variable
852          # number of characters.  (It doesn't know that case 3 shouldn't
853          # matter, since it doesn't do anything special for the character
854          # following the quantifier; nor that some of the different
855          # quantifiers execute the same underlying code, as these tests are
856          # quick, and this insulates these tests from changes in the
857          # implementation.)
858          for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') {
859            $eval = "my \$c = \"_$lhs\"; my \$p = qr/(?$charset_mod:.$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
860            run_test($eval, "", ($charset_mod eq 'l'), "");
861            $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:..)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
862            run_test($eval, "", ($charset_mod eq 'l'), "");
863            $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:.|\\R)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
864            run_test($eval, "", ($charset_mod eq 'l'), "");
865          }
866
867          foreach my $bracketed (0, 1) {   # Put rhs in [...], or not
868            next if $bracketed && @pattern != 1;    # bracketed makes these
869                                                    # or's instead of a sequence
870            foreach my $optimize_bracketed (0, 1) {
871              next if $optimize_bracketed && ! $bracketed;
872              foreach my $inverted (0,1) {
873                  next if $inverted && ! $bracketed;  # inversion only valid
874                                                      # in [^...]
875                  next if $inverted && @target != 1;  # [perl #89750] multi-char
876                                                      # not valid in [^...]
877
878                # In some cases, add an extra character that doesn't fold, and
879                # looks ok in the output.
880                my $extra_char = "_";
881                foreach my $prepend ("", $extra_char) {
882                  foreach my $append ("", $extra_char) {
883
884                    # Assemble the rhs.  Put each character in a separate
885                    # bracketed if using charclasses.  This creates a stress on
886                    # the code to span a match across multiple elements
887                    my $rhs = "";
888                    foreach my $rhs_char (@rhs) {
889                        $rhs .= '[' if $bracketed;
890                        $rhs .= '^' if $inverted;
891                        $rhs .=  $rhs_char;
892
893                        # Add a character to the class, so class doesn't get
894                        # optimized out, unless we are testing that optimization
895                        $rhs .= '_' if $optimize_bracketed;
896                        $rhs .= ']' if $bracketed;
897                    }
898
899                    # Add one of: no capturing parens
900                    #             a single set
901                    #             a nested set
902                    # Use quantifiers and extra variable width matches inside
903                    # them to keep some optimizations from happening
904                    foreach my $parend (0, 1, 2) {
905                      my $interior = (! $parend)
906                                      ? $rhs
907                                      : ($parend == 1)
908                                          ? "(${rhs},?)"
909                                          : "((${rhs})+,?)";
910                      foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
911
912                        # Perhaps should be TODOs, as are unimplemented, but
913                        # maybe will never be implemented
914                        next if @pattern != 1 && $quantifier;
915
916                        # A ? or * quantifier normally causes the thing to be
917                        # able to match a null string
918                        my $quantifier_can_match_null = $quantifier eq '?'
919                                                     || $quantifier eq '*';
920
921                        # But since we only quantify the last character in a
922                        # multiple fold, the other characters will have width,
923                        # except if we are quantifying the whole rhs
924                        my $can_match_null = $quantifier_can_match_null
925                                             && (@rhs == 1 || $parend);
926
927                        foreach my $l_anchor ("", '^') { # '\A' didn't change
928                                                         # result)
929                          foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't
930                                                           # change result)
931                            # The folded part can match the null string if it
932                            # isn't required to have width, and there's not
933                            # something on one or both sides that force it to.
934                            my $both_sides = ($l_anchor && $r_anchor)
935                                              || ($l_anchor && $append)
936                                              || ($r_anchor && $prepend)
937                                              || ($prepend && $append);
938                            my $must_match = ! $can_match_null || $both_sides;
939                            # for performance, but doing this missed many failures
940                            #next unless $must_match;
941                            my $quantified = "(?$charset_mod:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
942                            my $op;
943                            if ($must_match && $should_fail)  {
944                                $op = 0;
945                            } else {
946                                $op = 1;
947                            }
948                            $op = ! $op if $must_match && $inverted;
949
950                            if ($inverted && @target > 1) {
951                              # When doing an inverted match against a
952                              # multi-char target, and there is not something on
953                              # the left to anchor the match, if it shouldn't
954                              # succeed, skip, as what will happen (when working
955                              # correctly) is that it will match the first
956                              # position correctly, and then be inverted to not
957                              # match; then it will go to the second position
958                              # where it won't match, but get inverted to match,
959                              # and hence succeeding.
960                              next if ! ($l_anchor || $prepend) && ! $op;
961
962                              # Can't ever match for latin1 code points non-uni
963                              # semantics that have a inverted multi-char fold
964                              # when there is something on both sides and the
965                              # quantifier isn't such as to span the required
966                              # width, which is 2 or 3.
967                              $op = 0 if $ord < 255
968                                        && ! $uni_semantics
969                                        && $both_sides
970                                        && ( ! $quantifier || $quantifier eq '?')
971                                        && $parend < 2;
972
973                              # Similarly can't ever match when inverting a
974                              # multi-char fold for /aa and the quantifier
975                              # isn't sufficient to allow it to span to both
976                              # sides.
977                              $op = 0 if $target_has_ascii
978                                         && $charset eq 'aa'
979                                         && $both_sides
980                                         && ( ! $quantifier || $quantifier eq '?')
981                                         && $parend < 2;
982
983                              # Or for /l
984                              $op = 0 if $target_has_latin1 && $charset eq 'l'
985                                      && $both_sides
986                                      && ( ! $quantifier || $quantifier eq '?')
987                                      && $parend < 2;
988                            }
989
990
991                            my $desc = "";
992                            if ($charset_mod eq 'l') {
993                                $desc .= 'setlocale(LC_CTYPE, "'
994                                        . POSIX::setlocale(&POSIX::LC_CTYPE)
995                                        . '"); '
996                            }
997                            $desc .= "my \$c = \"$prepend$lhs$append\"; "
998                                    . "my \$p = qr/$quantified/i;"
999                                    . "$upgrade_target$upgrade_pattern "
1000                                    . "\$c " . ($op ? "=~" : "!~") . " \$p; ";
1001                            if ($DEBUG) {
1002                              $desc .= (
1003                              "; uni_semantics=$uni_semantics, "
1004                              . "should_fail=$should_fail, "
1005                              . "bracketed=$bracketed, "
1006                              . "prepend=$prepend, "
1007                              . "append=$append, "
1008                              . "parend=$parend, "
1009                              . "quantifier=$quantifier, "
1010                              . "l_anchor=$l_anchor, "
1011                              . "r_anchor=$r_anchor; "
1012                              . "pattern_above_latin1=$pattern_above_latin1; "
1013                              . "utf8_pattern=$utf8_pattern"
1014                              );
1015                            }
1016
1017                            my $c = "$prepend$lhs_str$append";
1018                            my $p = qr/$quantified/i;
1019                            utf8::upgrade($c) if length($upgrade_target);
1020                            utf8::upgrade($p) if length($upgrade_pattern);
1021                            my $res = $op ? ($c =~ $p): ($c !~ $p);
1022
1023                            if (!$res || $list_all_tests) {
1024                              # Failed or debug; output the result
1025                              $count++;
1026                              ok($res, "test $count - $desc");
1027                            } else {
1028                              # Just count the test as passed
1029                              $okays++;
1030                            }
1031                            $this_iteration++;
1032                          }
1033                        }
1034                      }
1035                    }
1036                  }
1037                }
1038              }
1039            }
1040          }
1041        }
1042      }
1043      unless($list_all_tests) {
1044        $count++;
1045        is $okays, $this_iteration, "$okays subtests ok for"
1046          . " /$charset_mod"
1047          . (($charset_mod eq 'l') ? " ($current_locale)" : "")
1048          . ', target="' . join("", @x_target) . '",'
1049          . ' pat="' . join("", @x_pattern) . '"';
1050      }
1051    }
1052  }
1053}
1054}
1055
1056plan($count);
1057
10581
1059