xref: /openbsd-src/gnu/usr.bin/perl/t/re/charset.t (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1# Test the /a, /d, etc regex modifiers
2
3BEGIN {
4    chdir 't' if -d 't';
5    @INC = '../lib';
6    require './test.pl';
7    require './loc_tools.pl';
8}
9
10use strict;
11use warnings;
12use Config;
13
14plan('no_plan');
15
16# Each case is a valid element of its hash key.  Choose, where available, an
17# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
18my %testcases = (
19    '\w' => [ ord("A"), 0xE2, 0x16B ],   # Below expects these to all be alpha
20    '\d' => [ ord("0"), 0x0662 ],
21    '\s' => [ ord("\t"), 0xA0, 0x1680 ],  # Below expects these to be [:blank:]
22    '[:cntrl:]' => [ 0x00, 0x88 ],
23    '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be
24                                               # [:print:]
25    '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ],
26    '[:punct:]' => [ ord("!"), 0xBF, 0x055C ],
27    '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ],
28    '[:xdigit:]' => [ ord("4"), 0xFF15 ],
29);
30
31$testcases{'[:digit:]'} = $testcases{'\d'};
32$testcases{'[:alnum:]'} = $testcases{'\w'};
33$testcases{'[:alpha:]'} = $testcases{'\w'};
34$testcases{'[:blank:]'} = $testcases{'\s'};
35$testcases{'[:print:]'} = $testcases{'[:graph:]'};
36$testcases{'[:space:]'} = $testcases{'\s'};
37$testcases{'[:word:]'} = $testcases{'\w'};
38
39my $utf8_locale;
40
41my @charsets = qw(a d u aa);
42if (! is_miniperl() && $Config{d_setlocale}) {
43    require POSIX;
44    my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
45    if ($current_locale eq 'C') {
46
47        # test for d_setlocale is repeated here because this one is compile
48        # time, and the one above is run time
49        use if $Config{d_setlocale}, 'locale';
50
51        # Some implementations don't have the 128-255 range characters all
52        # mean nothing under the C locale (an example being VMS).  This is
53        # legal, but since we don't know what the right answers should be,
54        # skip the locale tests in that situation.
55        for my $i (128 .. 255) {
56            goto skip_adding_C_locale if chr($i) =~ /[[:print:]]/;
57        }
58        push @charsets, 'l';
59
60    skip_adding_C_locale:
61
62        # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
63        $utf8_locale = find_utf8_ctype_locale();
64        push @charsets, 'L' if defined $utf8_locale;
65    }
66}
67
68# For each possible character set...
69foreach my $charset (@charsets) {
70    my $locale;
71    my $charset_mod = lc $charset;
72    my $charset_display;
73    if ($charset_mod eq 'l') {
74        $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
75                                            ? "C"
76                                            : $utf8_locale
77                           );
78        die "Couldn't change locale" unless $locale;
79        $charset_display = $charset_mod . " ($locale)";
80    }
81    else {
82        $charset_display = $charset_mod;
83    }
84
85    # And in utf8 or not
86    foreach my $upgrade ("", 'utf8::upgrade($a); ') {
87
88        # reverse gets the, \w, \s, \d first.
89        for my $class (reverse sort keys %testcases) {
90
91            # The complement of \w is \W; of [:posix:] is [:^posix:]
92            my $complement = $class;
93            if ($complement !~ s/ ( \[: ) /$1^/x) {
94                $complement = uc($class);
95            }
96
97            # For each test case
98            foreach my $ord (@{$testcases{$class}}) {
99                my $char = display(chr($ord));
100
101                # > 255 already implies upgraded.  Skip the ones that don't
102                # have an explicit upgrade.  This shows more clearly in the
103                # output which tests are in utf8, or not.
104                next if $ord > 255 && ! $upgrade;
105
106                my $reason = "";    # Explanation output with each test
107                my $neg_reason = "";
108                my $match = 1;      # Calculated whether test regex should
109                                    # match or not
110
111                # Everything always matches in ASCII, or under /u
112                if ($ord < 128 || $charset eq 'u' || $charset eq 'L') {
113                    $reason = "\"$char\" is a $class under /$charset_display";
114                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
115                }
116                elsif ($charset eq "a" || $charset eq "aa") {
117                    $match = 0;
118                    $reason = "\"$char\" is non-ASCII, which can't be a $class under /a";
119                    $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /a";
120                }
121                elsif ($ord > 255) {
122                    $reason = "\"$char\" is a $class under /$charset_display";
123                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
124                }
125                elsif ($charset eq 'l') {
126
127                    # We are using the C locale, which is essentially ASCII,
128                    # but under utf8, the above-latin1 chars are treated as
129                    # Unicode)
130                    $reason = "\"$char\" is not a $class in the C locale under /l";
131                    $neg_reason = "\"$char\" is a $complement in the C locale under /l";
132                    $match = 0;
133                }
134                elsif ($upgrade) {
135                    $reason = "\"$char\" is a $class in utf8 under /d";
136                    $neg_reason = "\"$char\" is not a $complement in utf8 under /d";
137                }
138                else {
139                    $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /d";
140                    $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /d (unless in utf8)";
141                    $match = 0;
142                }
143                $reason = "; $reason" if $reason;
144                $neg_reason = "; $neg_reason" if $neg_reason;
145
146                my $op;
147                my $neg_op;
148                if ($match) {
149                    $op = '=~';
150                    $neg_op = '!~';
151                }
152                else {
153                    $op = '!~';
154                    $neg_op = '=~';
155                }
156
157                # In [...] or not
158                foreach my $bracketed (0, 1) {
159                    my $lb = "";
160                    my $rb = "";
161                    if ($bracketed) {
162
163                        # Adds an extra char to the character class to make sure
164                        # that the class doesn't get optimized away.
165                        $lb = ($bracketed) ? '[_' : "";
166                        $rb = ($bracketed) ? ']' : "";
167                    }
168                    else {  # [:posix:] must be inside outer [ ]
169                        next if $class =~ /\[/;
170                    }
171
172                    my $length = 10;    # For regexec.c regrepeat() cases by
173                                        # matching more than one item
174                    # Test both class and its complement, and with one or more
175                    # than one item to match.
176                    foreach my $eval (
177                        qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
178                        qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
179                    ) {
180                        ok (eval $eval, $eval . $reason);
181                    }
182                    foreach my $eval (
183                        qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
184                        qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
185                    ) {
186                        ok (eval $eval, $eval . $neg_reason);
187                    }
188                }
189
190                next if $class ne '\w';
191
192                # Test \b, \B at beginning and end of string
193                foreach my $eval (
194                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
195                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
196                ) {
197                    ok (eval $eval, $eval . $reason);
198                }
199                foreach my $eval (
200                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
201                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
202                ) {
203                    ok (eval $eval, $eval . $neg_reason);
204                }
205
206                # Test \b, \B adjacent to a non-word char, both before it and
207                # after.  We test with ASCII, Latin1 and Unicode non-word chars
208                foreach my $space_ord (@{$testcases{'\s'}}) {
209
210                    # Useless to try to test non-utf8 when the ord itself
211                    # forces utf8
212                    next if $space_ord > 255 && ! $upgrade;
213
214                    my $space = display(chr $space_ord);
215
216                    foreach my $eval (
217                        qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
218                        qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
219                    ) {
220                        ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
221                    }
222                    foreach my $eval (
223                        qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
224                        qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
225                    ) {
226                        ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
227                    }
228                }
229
230                # Test \b, \B in the middle of two nominally word chars, but
231                # one or both may be considered non-word depending on range
232                # and charset.
233                foreach my $other_ord (@{$testcases{'\w'}}) {
234                    next if $other_ord > 255 && ! $upgrade;
235                    my $other = display(chr $other_ord);
236
237                    # Determine if the other char is a word char in current
238                    # circumstances
239                    my $other_is_word = 1;
240                    my $other_reason = "\"$other\" is a $class under /$charset_display";
241                    my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
242                    if ($other_ord > 127
243                        && $charset ne 'u' && $charset ne 'L'
244                        && (($charset eq "a" || $charset eq "aa")
245                            || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
246                    {
247                        $other_is_word = 0;
248                        $other_reason = "\"$other\" is not a $class under /$charset_display";
249                        $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
250                    }
251                    my $both_reason = $reason;
252                    $both_reason .= "; $other_reason" if $other_ord != $ord;
253                    my $both_neg_reason = $neg_reason;
254                    $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
255
256                    # If both are the same wordness, then \b will fail; \B
257                    # succeed
258                    if ($match == $other_is_word) {
259                        $op = '!~';
260                        $neg_op = '=~';
261                    }
262                    else {
263                        $op = '=~';
264                        $neg_op = '!~';
265                    }
266
267                    foreach my $eval (
268                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
269                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
270                    ) {
271                        ok (eval $eval, $eval . $both_reason);
272                    }
273                    foreach my $eval (
274                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
275                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
276                    ) {
277                        ok (eval $eval, $eval . $both_neg_reason);
278                    }
279
280                    next if $other_ord == $ord;
281
282                    # These start with the \b or \B.  They are included, based
283                    # on source code analysis, to force the testing of the FBC
284                    # (find_by_class) portions of regexec.c.
285                    foreach my $eval (
286                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
287                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
288                    ) {
289                        ok (eval $eval, $eval . $both_reason);
290                    }
291                    foreach my $eval (
292                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
293                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
294                    ) {
295                        ok (eval $eval, $eval . $both_neg_reason);
296                    }
297                }
298            } # End of each test case in a class
299        } # End of \w, \s, ...
300    } # End of utf8 upgraded or not
301}
302
303plan(curr_test() - 1);
304