xref: /openbsd-src/gnu/usr.bin/perl/t/re/charset.t (revision 3d61058aa5c692477b6d18acfbbdb653a9930ff9)
1# Test the /a, /d, etc regex modifiers
2# For comprehensive tests, set $ENV{PERL_DEBUG_FULL_TEST} to some true value
3
4BEGIN {
5    chdir 't' if -d 't';
6    require './test.pl';
7    set_up_inc('../lib', '../dist/if');
8    require './loc_tools.pl';
9}
10
11use strict;
12use warnings;
13no warnings 'locale';   # Some /l tests use above-latin1 chars to make sure
14                        # they work, even though they warn.
15use Config;
16
17plan('no_plan');
18
19# Each case is a valid element of its hash key.  Choose, where available, at
20# least one of each type: ASCII-range, non-ASCII range Latin-1, and above
21# Latin1 range code point.
22my %testcases = (
23    '\w' => [ 0x16B ],
24    '\d' => [ ord("0"), ord("1"), ord("2"), ord("3"), ord("4"), ord("5"),
25              ord("6"), ord("7"), ord("8"), ord("9"),
26              0x0662,
27            ],
28    '[:blank:]' => [ ord("\t"), ord(" "),
29                     0x1680
30                   ],
31    '\s' => [ ord("\t"), ord("\n"), ord("\cK"), ord("\f"), ord("\r"),
32              ord(" "),
33              utf8::unicode_to_native(0x85),
34              utf8::unicode_to_native(0xA0),
35              0x2029,
36            ],
37    '[:graph:]' => [ 0x02C7 ],
38    '[:lower:]' => [ ord("a"), ord("b"), ord("c"), ord("d"), ord("e"),
39                     ord("f"), ord("g"), ord("h"), ord("i"), ord("j"),
40                     ord("k"), ord("l"), ord("m"), ord("n"), ord("o"),
41                     ord("p"), ord("q"), ord("r"), ord("s"), ord("t"),
42                     ord("u"), ord("v"), ord("w"), ord("x"), ord("y"),
43                     ord("z"),
44                     0x0127 ],
45    '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='),
46                     ord('>'), ord('|'), ord('-'), ord(','), ord(';'),
47                     ord(':'), ord('!'), ord('?'), ord('/'), ord('.'),
48                     ord('"'), ord('('), ord(')'), ord('['), ord(']'),
49                     ord('{'), ord('}'), ord('@'), ord('$'), ord('*'),
50                     ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'),
51                     ord("'"),
52                     0x055C
53                   ],
54    '[:upper:]' => [ ord("A"), ord("B"), ord("C"), ord("D"), ord("E"),
55                     ord("F"), ord("G"), ord("H"), ord("I"), ord("J"),
56                     ord("K"), ord("L"), ord("M"), ord("N"), ord("O"),
57                     ord("P"), ord("Q"), ord("R"), ord("S"), ord("T"),
58                     ord("U"), ord("V"), ord("W"), ord("X"), ord("Y"),
59                     ord("Z"),
60                     0x0126
61                   ],
62    '[:xdigit:]' => [ ord("0"), ord("1"), ord("2"), ord("3"), ord("4"),
63                      ord("5"), ord("6"), ord("7"), ord("8"), ord("9"),
64                      ord("A"), ord("B"), ord("C"), ord("D"), ord("E"),
65                      ord("F"), ord("a"), ord("b"), ord("c"), ord("d"),
66                      ord("e"), ord("f"),
67                      0xFF15,
68                    ],
69);
70
71if ($ENV{PERL_DEBUG_FULL_TEST}) {
72    push @{$testcases{'[:cntrl:]'}}, utf8::unicode_to_native($_)
73                                               for (0x00 .. 0x1F, 0x7F .. 0x9F);
74    push @{$testcases{'[:blank:]'}}, utf8::unicode_to_native(0xA0);
75    push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xA1);
76    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native($_)
77                                                            for (0xA2 .. 0xA6);
78    push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xA7);
79    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xA8);
80    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xA9);
81    push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native(0xAA);
82    push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xAB);
83    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native($_)
84                                                            for (0xAC .. 0xB4);
85    push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native(0xB5);
86    push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xB6);
87    push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xB7);
88    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xB8);
89    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xB9);
90    push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native(0xBA);
91    push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xBB);
92    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xBC);
93    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xBD);
94    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xBE);
95    push @{$testcases{'[:punct:]'}}, utf8::unicode_to_native(0xBF);
96    push @{$testcases{'[:upper:]'}}, utf8::unicode_to_native($_)
97                                                            for (0xC0 .. 0xD6);
98    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xD7);
99    push @{$testcases{'[:upper:]'}}, utf8::unicode_to_native($_)
100                                                            for (0xD8 .. 0xDE);
101    push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native($_)
102                                                            for (0xDF .. 0xF6);
103    push @{$testcases{'[:graph:]'}}, utf8::unicode_to_native(0xF7);
104    push @{$testcases{'[:lower:]'}}, utf8::unicode_to_native($_)
105                                                            for (0xF8 .. 0xF8);
106
107    push @{$testcases{'[:alpha:]'}}, @{$testcases{'[:lower:]'}},
108                                 @{$testcases{'[:upper:]'}};
109    push @{$testcases{'[:alnum:]'}}, @{$testcases{'[:alpha:]'}},
110                                 @{$testcases{'\d'}};
111    push @{$testcases{'\w'}}, @{$testcases{'[:alnum:]'}}, ord("_");
112    push @{$testcases{'[:print:]'}}, @{$testcases{'[:graph:]'}},
113                                ord(" "),
114                                utf8::unicode_to_native(0xA0);
115}
116
117@{$testcases{'[:digit:]'}} = @{$testcases{'\d'}};
118@{$testcases{'[:space:]'}} = @{$testcases{'\s'}};
119@{$testcases{'[:word:]'}}  = @{$testcases{'\w'}};
120
121#use Data::Dumper;
122#$Data::Dumper::Sortkeys = 1;
123#print STDERR Dumper \%testcases;
124
125my $utf8_locale;
126
127my @charsets = qw(a d u aa);
128my $locales_ok = locales_enabled([ 'LC_CTYPE', 'LC_ALL' ]);
129if (! is_miniperl() && $locales_ok) {
130    require POSIX;
131    my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
132    if ($current_locale eq 'C') {
133
134        # test for d_setlocale is repeated here because this one is compile
135        # time, and the one above is run time
136        use if $Config{d_setlocale}, 'locale';
137
138        # Some implementations don't have the 128-255 range characters all
139        # mean nothing under the C locale (an example being VMS).  This is
140        # legal, but since we don't know what the right answers should be,
141        # skip the locale tests in that situation.
142        for my $i (128 .. 255) {
143            goto skip_adding_C_locale
144                              if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/;
145        }
146        push @charsets, 'l';
147
148    skip_adding_C_locale:
149
150        # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
151        $utf8_locale = find_utf8_ctype_locale();
152        push @charsets, 'L' if defined $utf8_locale;
153    }
154}
155
156# For each possible character set...
157foreach my $charset (@charsets) {
158    my $locale;
159    my $charset_mod = lc $charset;
160    my $charset_display;
161    if ($charset_mod eq 'l') {
162        $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
163                                            ? "C"
164                                            : $utf8_locale
165                           );
166        die "Couldn't change to locale " . (($charset eq 'l') ? "C" : $utf8_locale) unless $locale;
167        $charset_display = $charset_mod . " ($locale)";
168    }
169    else {
170        $charset_display = $charset_mod;
171    }
172
173    # And in utf8 or not
174    foreach my $upgrade ("", 'utf8::upgrade($a); ') {
175
176        # reverse gets the, \w, \s, \d first.
177        for my $class (reverse sort keys %testcases) {
178
179            # The complement of \w is \W; of [:posix:] is [:^posix:]
180            my $complement = $class;
181            if ($complement !~ s/ ( \[: ) /$1^/x) {
182                $complement = uc($class);
183            }
184
185            # For each test case
186            foreach my $ord (@{$testcases{$class}}) {
187                my $char = chr($ord);
188                $char = ($char eq '$') ? '\$' : display($char);
189
190                # > 255 already implies upgraded.  Skip the ones that don't
191                # have an explicit upgrade.  This shows more clearly in the
192                # output which tests are in utf8, or not.
193                next if $ord > 255 && ! $upgrade;
194
195                my $reason = "";    # Explanation output with each test
196                my $neg_reason = "";
197                my $match = 1;      # Calculated whether test regex should
198                                    # match or not
199
200                # Everything always matches in ASCII, or under /u, or under /l
201                # with a UTF-8 locale
202                if (utf8::native_to_unicode($ord) < 128
203                    || $charset eq 'u'
204                    || $charset eq 'L')
205                {
206                    $reason = "\"$char\" is a $class under /$charset_display";
207                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
208                }
209                elsif ($charset eq "a" || $charset eq "aa") {
210                    $match = 0;
211                    $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
212                    $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
213                }
214                elsif ($ord > 255) {
215                    $reason = "\"$char\" is a $class under /$charset_display";
216                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
217                }
218                elsif ($charset eq 'l') {
219
220                    # We are using the C locale, which is essentially ASCII,
221                    # but under utf8, the above-latin1 chars are treated as
222                    # Unicode)
223                    $reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
224                    $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
225                    $match = 0;
226                }
227                elsif ($upgrade) {
228                    $reason = "\"$char\" is a $class in utf8 under /$charset_display";
229                    $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
230                }
231                else {
232                    $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
233                    $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
234                    $match = 0;
235                }
236                $reason = "; $reason" if $reason;
237                $neg_reason = "; $neg_reason" if $neg_reason;
238
239                my $op;
240                my $neg_op;
241                if ($match) {
242                    $op = '=~';
243                    $neg_op = '!~';
244                }
245                else {
246                    $op = '!~';
247                    $neg_op = '=~';
248                }
249
250                # In [...] or not
251                foreach my $bracketed (0, 1) {
252                    my $lb = "";
253                    my $rb = "";
254                    if ($bracketed) {
255
256                        # Adds an extra char to the character class to make sure
257                        # that the class doesn't get optimized away.  (Make
258                        # sure to not use the character being tested.)
259                        my $extra = ($char eq "_") ? ":" : "_";
260                        $lb = ($bracketed) ? "[$extra" : "";
261                        $rb = ($bracketed) ? ']' : "";
262                    }
263                    else {  # [:posix:] must be inside outer [ ]
264                        next if $class =~ /\[/;
265                    }
266
267                    my $length = 10;    # For regexec.c regrepeat() cases by
268                                        # matching more than one item
269                    # Test both class and its complement, and with one or more
270                    # than one item to match.
271                    foreach my $eval (
272                        qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
273                        qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
274                    ) {
275                        ok (eval $eval, $eval . $reason);
276                    }
277                    foreach my $eval (
278                        qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
279                        qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
280                    ) {
281                        ok (eval $eval, $eval . $neg_reason);
282                    }
283                }
284
285                next if $class ne '\w';
286
287                # Test \b, \B at beginning and end of string
288                foreach my $eval (
289                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
290                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
291                ) {
292                    ok (eval $eval, $eval . $reason);
293                }
294                foreach my $eval (
295                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
296                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
297                ) {
298                    ok (eval $eval, $eval . $neg_reason);
299                }
300
301                # Test \b, \B adjacent to a non-word char, both before it and
302                # after.  We test with ASCII, Latin1 and Unicode non-word chars
303                foreach my $space_ord (@{$testcases{'\s'}}) {
304
305                    # This is an anomalous character, so skip.
306                    next if $space_ord == ord("\n");
307
308                    # Useless to try to test non-utf8 when the ord itself
309                    # forces utf8
310                    next if $space_ord > 255 && ! $upgrade;
311
312                    my $space = display(chr $space_ord);
313
314                    foreach my $eval (
315                        qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
316                        qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
317                    ) {
318                        ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
319                    }
320                    foreach my $eval (
321                        qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
322                        qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
323                    ) {
324                        ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
325                    }
326                }
327
328                # Test \b, \B in the middle of two nominally word chars, but
329                # one or both may be considered non-word depending on range
330                # and charset.
331                foreach my $other_ord (@{$testcases{'\w'}}) {
332                    next if $other_ord > 255 && ! $upgrade;
333                    my $other = display(chr $other_ord);
334
335                    # Determine if the other char is a word char in current
336                    # circumstances
337                    my $other_is_word = 1;
338                    my $other_reason = "\"$other\" is a $class under /$charset_display";
339                    my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
340                    if (utf8::native_to_unicode($other_ord) > 127
341                        && $charset ne 'u' && $charset ne 'L'
342                        && (($charset eq "a" || $charset eq "aa")
343                            || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
344                    {
345                        $other_is_word = 0;
346                        $other_reason = "\"$other\" is not a $class under /$charset_display";
347                        $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
348                    }
349                    my $both_reason = $reason;
350                    $both_reason .= "; $other_reason" if $other_ord != $ord;
351                    my $both_neg_reason = $neg_reason;
352                    $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
353
354                    # If both are the same wordness, then \b will fail; \B
355                    # succeed
356                    if ($match == $other_is_word) {
357                        $op = '!~';
358                        $neg_op = '=~';
359                    }
360                    else {
361                        $op = '=~';
362                        $neg_op = '!~';
363                    }
364
365                    foreach my $eval (
366                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
367                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
368                    ) {
369                        ok (eval $eval, $eval . $both_reason);
370                    }
371                    foreach my $eval (
372                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
373                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
374                    ) {
375                        ok (eval $eval, $eval . $both_neg_reason);
376                    }
377
378                    next if $other_ord == $ord;
379
380                    # These start with the \b or \B.  They are included, based
381                    # on source code analysis, to force the testing of the FBC
382                    # (find_by_class) portions of regexec.c.
383                    foreach my $eval (
384                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
385                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
386                    ) {
387                        ok (eval $eval, $eval . $both_reason);
388                    }
389                    foreach my $eval (
390                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
391                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
392                    ) {
393                        ok (eval $eval, $eval . $both_neg_reason);
394                    }
395                }
396            } # End of each test case in a class
397        } # End of \w, \s, ...
398    } # End of utf8 upgraded or not
399}
400
401plan(curr_test() - 1);
402