xref: /openbsd-src/gnu/usr.bin/perl/t/re/regexp.t (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1#!./perl
2
3# The tests are in a separate file 't/re/re_tests'.
4# Each line in that file is a separate test.
5# There are five columns, separated by tabs.
6#
7# Column 1 contains the pattern, optionally enclosed in C<''>.
8# Modifiers can be put after the closing C<'>.
9#
10# Column 2 contains the string to be matched.
11#
12# Column 3 contains the expected result:
13# 	y	expect a match
14# 	n	expect no match
15# 	c	expect an error
16#	T	the test is a TODO (can be combined with y/n/c)
17#	M	skip test on miniperl (combine with y/n/c/T)
18#	B	test exposes a known bug in Perl, should be skipped
19#	b	test exposes a known bug in Perl, should be skipped if noamp
20#	t	test exposes a bug with threading, TODO if qr_embed_thr
21#       s       test should only be run for regex_sets_compat.t
22#       S       test should not be run for regex_sets_compat.t
23#
24# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
25#
26# Column 4 contains a string, usually C<$&>.
27#
28# Column 5 contains the expected result of double-quote
29# interpolating that string after the match, or start of error message.
30#
31# Column 6, if present, contains a reason why the test is skipped.
32# This is printed with "skipped", for harness to pick up.
33#
34# \n in the tests are interpolated, as are variables of the form ${\w+}.
35#
36# Blanks lines are treated as PASSING tests to keep the line numbers
37# linked to the test number.
38#
39# If you want to add a regular expression test that can't be expressed
40# in this format, don't add it here: put it in re/pat.t instead.
41#
42# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
43# This means this file cannot be used for testing anything that the lexer
44# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
45#
46# Note that columns 2,3 and 5 are all enclosed in double quotes and then
47# evalled; so something like a\"\x{100}$1 has length 3+length($1).
48
49my ($file, $iters);
50BEGIN {
51    $iters = shift || 1;	# Poor man performance suite, 10000 is OK.
52
53    # Do this open before any chdir
54    $file = shift;
55    if (defined $file) {
56	open TESTS, $file or die "Can't open $file";
57    }
58
59    chdir 't' if -d 't';
60    @INC = '../lib';
61
62}
63
64sub _comment {
65    return map { /^#/ ? "$_\n" : "# $_\n" }
66           map { split /\n/ } @_;
67}
68
69use strict;
70use warnings FATAL=>"all";
71use vars qw($bang $ffff $nulnul); # used by the tests
72use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers
73
74
75
76if (!defined $file) {
77    open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
78}
79
80my @tests = <TESTS>;
81
82close TESTS;
83
84$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
85$ffff  = chr(0xff) x 2;
86$nulnul = "\0" x 2;
87my $OP = $qr ? 'qr' : 'm';
88
89$| = 1;
90printf "1..%d\n# $iters iterations\n", scalar @tests;
91
92my $test;
93TEST:
94foreach (@tests) {
95    $test++;
96    if (!/\S/ || /^\s*#/ || /^__END__$/) {
97        print "ok $test # (Blank line or comment)\n";
98        if (/#/) { print $_ };
99        next;
100    }
101    chomp;
102    s/\\n/\n/g unless $regex_sets;
103    my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6);
104    $reason = '' unless defined $reason;
105    my $input = join(':',$pat,$subject,$result,$repl,$expect);
106    # the double '' below keeps simple syntax highlighters from going crazy
107    $pat = "'$pat'" unless $pat =~ /^[:''\/]/;
108    $pat =~ s/(\$\{\w+\})/$1/eeg;
109    $pat =~ s/\\n/\n/g unless $regex_sets;
110    $subject = eval qq("$subject"); die $@ if $@;
111    $expect  = eval qq("$expect"); die $@ if $@;
112    $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
113    my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
114    my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
115    ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
116    if ($result =~ s/ ( [Ss] ) //x) {
117        if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
118            $skip++;
119            $reason = "Test not valid for $0";
120        }
121    }
122    $reason = 'skipping $&' if $reason eq  '' && $skip_amp;
123    $result =~ s/B//i unless $skip;
124    my $todo= $result =~ s/T// ? " # TODO" : "";
125    if (! $skip && $regex_sets) {
126
127        # If testing regex sets, change the [bracketed] classes into
128        # (?[bracketed]).
129
130        if ($pat !~ / \[ /x) {
131
132            $skip++;
133            $reason = "Pattern doesn't contain [brackets]";
134        }
135        else { # Use non-regex features of Perl to accomplish this.
136            my $modified = "";
137            my $in_brackets = 0;
138
139            # Go through the pattern character-by-character.  We also add
140            # blanks around each token to test the /x parts of (?[ ])
141            my $pat_len = length($pat);
142      CHAR: for (my $i = 0; $i < $pat_len; $i++) {
143                my $curchar = substr($pat, $i, 1);
144                if ($curchar eq '\\') {
145                    $modified .= " " if $in_brackets;
146                    $modified .= $curchar;
147                    $i++;
148
149                    # Get the character the backslash is escaping
150                    $curchar = substr($pat, $i, 1);
151                    $modified .= $curchar;
152
153                    # If the character following that is a '{}', treat the
154                    # entire amount as a single token
155                    if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
156                        my $j = index($pat, '}', $i+2);
157                        if ($j < 0) {
158                            last unless $in_brackets;
159                            if ($result eq 'c') {
160                                $skip++;
161                                $reason = "Can't handle compilation errors with unmatched '{'";
162                            }
163                            else {
164                                print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
165                                next TEST;
166                            }
167                        }
168                        $modified .= substr($pat, $i+1, $j - $i);
169                        $i = $j;
170                    }
171                    elsif ($curchar eq 'x') {
172
173                        # \x without brackets is supposed to be followed by 2
174                        # hex digits.  Take up to 2, and then add a blank
175                        # after the last one.  This avoids getting errors from
176                        # (?[ ]) for run-ons, like \xabc
177                        my $j = $i + 1;
178                        for (; $j < $i + 3 && $j < $pat_len; $j++) {
179                            my $curord = ord(substr($pat, $j, 1));
180                            if (!(($curord >= ord("A") && $curord <= ord("F"))
181                                 || ($curord >= ord("a") && $curord <= ord("f"))
182                                 || ($curord >= ord("0") && $curord <= ord("9"))))
183                            {
184                                $j++;
185                                last;
186                            }
187                        }
188                        $j--;
189                        $modified .= substr($pat, $i + 1, $j - $i) . " ";
190                        $i = $j;
191                    }
192                    elsif (ord($curchar) >= ord('0')
193                           && (ord($curchar) <= ord('7')))
194                    {
195                        # Similarly, octal constants have up to 3 digits.
196                        my $j = $i + 1;
197                        for (; $j < $i + 3 && $j < $pat_len; $j++) {
198                            my $curord = ord(substr($pat, $j, 1));
199                            if (! ($curord >= ord("0") &&  $curord <= ord("7"))) {
200                                $j++;
201                                last;
202                            }
203                        }
204                        $j--;
205                        $modified .= substr($pat, $i + 1, $j - $i);
206                        $i = $j;
207                    }
208
209                    next;
210                } # End of processing a backslash sequence
211
212                if (! $in_brackets  # Skip (?{ })
213                    && $curchar eq '('
214                    && $i < $pat_len - 2
215                    && substr($pat, $i+1, 1) eq '?'
216                    && substr($pat, $i+2, 1) eq '{')
217                {
218                    $skip++;
219                    $reason = "Pattern contains '(?{'";
220                    last;
221                }
222
223                # Closing ']'
224                if ($curchar eq ']' && $in_brackets) {
225                    $modified .= " ] ])";
226                    $in_brackets = 0;
227                    next;
228                }
229
230                # A regular character.
231                if ($curchar ne '[') {
232                    if (! $in_brackets) {
233                        $modified .= $curchar;
234                    }
235                    else {
236                        $modified .= " $curchar ";
237                    }
238                    next;
239                }
240
241                # Here is a '['; If not in a bracketed class, treat as the
242                # beginning of one.
243                if (! $in_brackets) {
244                    $in_brackets = 1;
245                    $modified .= "(?[ [ ";
246
247                    # An immediately following ']' or '^]' is not the ending
248                    # of the class, but is to be treated literally.
249                    if ($i < $pat_len - 1
250                        && substr($pat, $i+1, 1) eq ']')
251                    {
252                        $i ++;
253                        $modified .= " ] ";
254                    }
255                    elsif ($i < $pat_len - 2
256                            && substr($pat, $i+1, 1) eq '^'
257                            && substr($pat, $i+2, 1) eq ']')
258                    {
259                        $i += 2;
260                        $modified .= " ^ ] ";
261                    }
262                    next;
263                }
264
265                # Here is a plain '[' within [ ].  Could mean wants to
266                # match a '[', or it could be a posix class that has a
267                # corresponding ']'.  Absorb either
268
269                $modified .= ' [';
270                last if $i >= $pat_len - 1;
271
272                $i++;
273                $curchar = substr($pat, $i, 1);
274                if ($curchar =~ /[:=.]/) {
275                    for (my $j = $i + 1; $j < $pat_len; $j++) {
276                        next unless substr($pat, $j, 1) eq ']';
277                        last if $j - $i < 2;
278                        if (substr($pat, $j - 1, 1) eq $curchar) {
279                            # Here, is a posix class
280                            $modified .= substr($pat, $i, $j - $i + 1) . " ";
281                            $i = $j;
282                            next CHAR;
283                        }
284                    }
285                }
286
287                # Here wasn't a posix class, just process normally
288                $modified .= " $curchar ";
289            }
290
291            if ($in_brackets && ! $skip) {
292                if ($result eq 'c') {
293                    $skip++;
294                    $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
295                }
296                else {
297                    print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
298                    next TEST;
299                }
300            }
301
302            # Use our modified pattern instead of the original
303            $pat = $modified;
304        }
305    }
306
307    for my $study ('', 'study $subject', 'utf8::upgrade($subject)',
308		   'utf8::upgrade($subject); study $subject') {
309	# Need to make a copy, else the utf8::upgrade of an already studied
310	# scalar confuses things.
311	my $subject = $subject;
312	my $c = $iters;
313	my ($code, $match, $got);
314        if ($repl eq 'pos') {
315            $code= <<EOFCODE;
316                $study;
317                pos(\$subject)=0;
318                \$match = ( \$subject =~ m${pat}g );
319                \$got = pos(\$subject);
320EOFCODE
321        }
322        elsif ($qr_embed) {
323            $code= <<EOFCODE;
324                my \$RE = qr$pat;
325                $study;
326                \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
327                \$got = "$repl";
328EOFCODE
329        }
330        elsif ($qr_embed_thr) {
331            $code= <<EOFCODE;
332		# Can't run the match in a subthread, but can do this and
333	 	# clone the pattern the other way.
334                my \$RE = threads->new(sub {qr$pat})->join();
335                $study;
336                \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
337                \$got = "$repl";
338EOFCODE
339        }
340        else {
341            $code= <<EOFCODE;
342                $study;
343                \$match = (\$subject =~ $OP$pat) while \$c--;
344                \$got = "$repl";
345EOFCODE
346        }
347        $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
348        #$code.=qq[\n\$expect="$expect";\n];
349        #use Devel::Peek;
350        #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
351	{
352	    # Probably we should annotate specific tests with which warnings
353	    # categories they're known to trigger, and hence should be
354	    # disabled just for that test
355	    no warnings qw(uninitialized regexp);
356	    eval $code;
357	}
358	chomp( my $err = $@ );
359	if ( $skip ) {
360	    print "ok $test # skipped", length($reason) ? ".  $reason" : '', "\n";
361	    next TEST;
362	}
363	elsif ($result eq 'c') {
364	    if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST }
365	    last;  # no need to study a syntax error
366	}
367	elsif ( $todo_qr ) {
368	    print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n";
369	    next TEST;
370	}
371	elsif ($@) {
372	    print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
373	}
374	elsif ($result =~ /^n/) {
375	    if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST }
376	}
377	else {
378	    if (!$match || $got ne $expect) {
379	        eval { require Data::Dumper };
380                no warnings "utf8"; # But handle should be utf8
381		if ($@ || !defined &DynaLoader::boot_DynaLoader) {
382		    # Data::Dumper will load on miniperl, but fail when used in
383		    # anger as it tries to load B. I'd prefer to keep the
384		    # regular calls below outside of an eval so that real
385		    # (unknown) failures get spotted, not ignored.
386		    print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
387		}
388		else { # better diagnostics
389		    my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
390		    my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
391		    print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n");
392		}
393		next TEST;
394	    }
395	}
396    }
397    print "ok $test$todo\n";
398}
399
4001;
401